Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 1462
1 file changed, 963 insertions, 499 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index b17771dd5ce7..6904d794d60a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -134,48 +134,6 @@ struct amdgpu_prt_cb { }; /** - * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm - * - * @base: base structure for tracking BO usage in a VM - * @vm: vm to which bo is to be added - * @bo: amdgpu buffer object - * - * Initialize a bo_va_base structure and add it to the appropriate lists - * - */ -static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, - struct amdgpu_vm *vm, - struct amdgpu_bo *bo) -{ - base->vm = vm; - base->bo = bo; - INIT_LIST_HEAD(&base->bo_list); - INIT_LIST_HEAD(&base->vm_status); - - if (!bo) - return; - list_add_tail(&base->bo_list, &bo->va); - - if (bo->tbo.type == ttm_bo_type_kernel) - list_move(&base->vm_status, &vm->relocated); - - if (bo->tbo.resv != vm->root.base.bo->tbo.resv) - return; - - if (bo->preferred_domains & - amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type)) - return; - - /* - * we checked all the prerequisites, but it looks like this per vm bo - * is currently evicted. add the bo to the evicted list to make sure it - * is validated on next vm use to avoid fault. - * */ - list_move_tail(&base->vm_status, &vm->evicted); - base->moved = true; -} - -/** * amdgpu_vm_level_shift - return the addr shift for each level * * @adev: amdgpu_device pointer @@ -233,6 +191,26 @@ static unsigned amdgpu_vm_num_entries(struct amdgpu_device *adev, } /** + * amdgpu_vm_entries_mask - the mask to get the entry number of a PD/PT + * + * @adev: amdgpu_device pointer + * @level: VMPT level + * + * Returns: + * The mask to extract the entry number of a PD/PT from an address. + */ +static uint32_t amdgpu_vm_entries_mask(struct amdgpu_device *adev, + unsigned int level) +{ + if (level <= adev->vm_manager.root_level) + return 0xffffffff; + else if (level != AMDGPU_VM_PTB) + return 0x1ff; + else + return AMDGPU_VM_PTE_COUNT(adev) - 1; +} + +/** * amdgpu_vm_bo_size - returns the size of the BOs in bytes * * @adev: amdgpu_device pointer @@ -247,6 +225,382 @@ static unsigned amdgpu_vm_bo_size(struct amdgpu_device *adev, unsigned level) } /** + * amdgpu_vm_bo_evicted - vm_bo is evicted + * + * @vm_bo: vm_bo which is evicted + * + * State for PDs/PTs and per VM BOs which are not at the location they should + * be. + */ +static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) +{ + struct amdgpu_vm *vm = vm_bo->vm; + struct amdgpu_bo *bo = vm_bo->bo; + + vm_bo->moved = true; + if (bo->tbo.type == ttm_bo_type_kernel) + list_move(&vm_bo->vm_status, &vm->evicted); + else + list_move_tail(&vm_bo->vm_status, &vm->evicted); +} + +/** + * amdgpu_vm_bo_relocated - vm_bo is reloacted + * + * @vm_bo: vm_bo which is relocated + * + * State for PDs/PTs which needs to update their parent PD. + */ +static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo) +{ + list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); +} + +/** + * amdgpu_vm_bo_moved - vm_bo is moved + * + * @vm_bo: vm_bo which is moved + * + * State for per VM BOs which are moved, but that change is not yet reflected + * in the page tables. 
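/*
 * Illustrative sketch, not part of this patch: how the new
 * amdgpu_vm_entries_mask() pairs with amdgpu_vm_level_shift() to extract the
 * entry index of an address inside a PD/PT, the same arithmetic the cursor
 * walk below uses.  The helper name and the concrete numbers are assumptions;
 * the real shift/mask values depend on the ASIC and the configured block size.
 */
static unsigned example_entry_index(struct amdgpu_device *adev,
                                    unsigned level, uint64_t pfn)
{
        unsigned shift = amdgpu_vm_level_shift(adev, level);
        uint32_t mask = amdgpu_vm_entries_mask(adev, level);

        /* e.g. at the PTB level on a GMC9 part shift is 0 and mask is
         * AMDGPU_VM_PTE_COUNT(adev) - 1, so this is simply pfn % 512 */
        return (pfn >> shift) & mask;
}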
+ */ +static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo) +{ + list_move(&vm_bo->vm_status, &vm_bo->vm->moved); +} + +/** + * amdgpu_vm_bo_idle - vm_bo is idle + * + * @vm_bo: vm_bo which is now idle + * + * State for PDs/PTs and per VM BOs which have gone through the state machine + * and are now idle. + */ +static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo) +{ + list_move(&vm_bo->vm_status, &vm_bo->vm->idle); + vm_bo->moved = false; +} + +/** + * amdgpu_vm_bo_invalidated - vm_bo is invalidated + * + * @vm_bo: vm_bo which is now invalidated + * + * State for normal BOs which are invalidated and that change not yet reflected + * in the PTs. + */ +static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) +{ + spin_lock(&vm_bo->vm->invalidated_lock); + list_move(&vm_bo->vm_status, &vm_bo->vm->invalidated); + spin_unlock(&vm_bo->vm->invalidated_lock); +} + +/** + * amdgpu_vm_bo_done - vm_bo is done + * + * @vm_bo: vm_bo which is now done + * + * State for normal BOs which are invalidated and that change has been updated + * in the PTs. + */ +static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo) +{ + spin_lock(&vm_bo->vm->invalidated_lock); + list_del_init(&vm_bo->vm_status); + spin_unlock(&vm_bo->vm->invalidated_lock); +} + +/** + * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm + * + * @base: base structure for tracking BO usage in a VM + * @vm: vm to which bo is to be added + * @bo: amdgpu buffer object + * + * Initialize a bo_va_base structure and add it to the appropriate lists + * + */ +static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, + struct amdgpu_vm *vm, + struct amdgpu_bo *bo) +{ + base->vm = vm; + base->bo = bo; + base->next = NULL; + INIT_LIST_HEAD(&base->vm_status); + + if (!bo) + return; + base->next = bo->vm_bo; + bo->vm_bo = base; + + if (bo->tbo.resv != vm->root.base.bo->tbo.resv) + return; + + vm->bulk_moveable = false; + if (bo->tbo.type == ttm_bo_type_kernel) + amdgpu_vm_bo_relocated(base); + else + amdgpu_vm_bo_idle(base); + + if (bo->preferred_domains & + amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type)) + return; + + /* + * we checked all the prerequisites, but it looks like this per vm bo + * is currently evicted. add the bo to the evicted list to make sure it + * is validated on next vm use to avoid fault. + * */ + amdgpu_vm_bo_evicted(base); +} + +/** + * amdgpu_vm_pt_parent - get the parent page directory + * + * @pt: child page table + * + * Helper to get the parent entry for the child page table. NULL if we are at + * the root page directory. + */ +static struct amdgpu_vm_pt *amdgpu_vm_pt_parent(struct amdgpu_vm_pt *pt) +{ + struct amdgpu_bo *parent = pt->base.bo->parent; + + if (!parent) + return NULL; + + return container_of(parent->vm_bo, struct amdgpu_vm_pt, base); +} + +/** + * amdgpu_vm_pt_cursor - state for for_each_amdgpu_vm_pt + */ +struct amdgpu_vm_pt_cursor { + uint64_t pfn; + struct amdgpu_vm_pt *parent; + struct amdgpu_vm_pt *entry; + unsigned level; +}; + +/** + * amdgpu_vm_pt_start - start PD/PT walk + * + * @adev: amdgpu_device pointer + * @vm: amdgpu_vm structure + * @start: start address of the walk + * @cursor: state to initialize + * + * Initialize a amdgpu_vm_pt_cursor to start a walk. 
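/*
 * Illustrative sketch, not part of this patch: bo->va (a list_head) is
 * replaced by a singly linked chain hanging off bo->vm_bo, which
 * amdgpu_vm_bo_base_init() above now prepends to.  Finding the bo_va of a BO
 * in a given VM is then a plain pointer walk, the same pattern
 * amdgpu_vm_bo_find() uses further down in this change; the helper name here
 * is hypothetical.
 */
static struct amdgpu_bo_va *example_find_bo_va(struct amdgpu_bo *bo,
                                               struct amdgpu_vm *vm)
{
        struct amdgpu_vm_bo_base *base;

        for (base = bo->vm_bo; base; base = base->next)
                if (base->vm == vm)
                        return container_of(base, struct amdgpu_bo_va, base);

        return NULL;
}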
+ */ +static void amdgpu_vm_pt_start(struct amdgpu_device *adev, + struct amdgpu_vm *vm, uint64_t start, + struct amdgpu_vm_pt_cursor *cursor) +{ + cursor->pfn = start; + cursor->parent = NULL; + cursor->entry = &vm->root; + cursor->level = adev->vm_manager.root_level; +} + +/** + * amdgpu_vm_pt_descendant - go to child node + * + * @adev: amdgpu_device pointer + * @cursor: current state + * + * Walk to the child node of the current node. + * Returns: + * True if the walk was possible, false otherwise. + */ +static bool amdgpu_vm_pt_descendant(struct amdgpu_device *adev, + struct amdgpu_vm_pt_cursor *cursor) +{ + unsigned mask, shift, idx; + + if (!cursor->entry->entries) + return false; + + BUG_ON(!cursor->entry->base.bo); + mask = amdgpu_vm_entries_mask(adev, cursor->level); + shift = amdgpu_vm_level_shift(adev, cursor->level); + + ++cursor->level; + idx = (cursor->pfn >> shift) & mask; + cursor->parent = cursor->entry; + cursor->entry = &cursor->entry->entries[idx]; + return true; +} + +/** + * amdgpu_vm_pt_sibling - go to sibling node + * + * @adev: amdgpu_device pointer + * @cursor: current state + * + * Walk to the sibling node of the current node. + * Returns: + * True if the walk was possible, false otherwise. + */ +static bool amdgpu_vm_pt_sibling(struct amdgpu_device *adev, + struct amdgpu_vm_pt_cursor *cursor) +{ + unsigned shift, num_entries; + + /* Root doesn't have a sibling */ + if (!cursor->parent) + return false; + + /* Go to our parents and see if we got a sibling */ + shift = amdgpu_vm_level_shift(adev, cursor->level - 1); + num_entries = amdgpu_vm_num_entries(adev, cursor->level - 1); + + if (cursor->entry == &cursor->parent->entries[num_entries - 1]) + return false; + + cursor->pfn += 1ULL << shift; + cursor->pfn &= ~((1ULL << shift) - 1); + ++cursor->entry; + return true; +} + +/** + * amdgpu_vm_pt_ancestor - go to parent node + * + * @cursor: current state + * + * Walk to the parent node of the current node. + * Returns: + * True if the walk was possible, false otherwise. + */ +static bool amdgpu_vm_pt_ancestor(struct amdgpu_vm_pt_cursor *cursor) +{ + if (!cursor->parent) + return false; + + --cursor->level; + cursor->entry = cursor->parent; + cursor->parent = amdgpu_vm_pt_parent(cursor->parent); + return true; +} + +/** + * amdgpu_vm_pt_next - get next PD/PT in hieratchy + * + * @adev: amdgpu_device pointer + * @cursor: current state + * + * Walk the PD/PT tree to the next node. + */ +static void amdgpu_vm_pt_next(struct amdgpu_device *adev, + struct amdgpu_vm_pt_cursor *cursor) +{ + /* First try a newborn child */ + if (amdgpu_vm_pt_descendant(adev, cursor)) + return; + + /* If that didn't worked try to find a sibling */ + while (!amdgpu_vm_pt_sibling(adev, cursor)) { + /* No sibling, go to our parents and grandparents */ + if (!amdgpu_vm_pt_ancestor(cursor)) { + cursor->pfn = ~0ll; + return; + } + } +} + +/** + * amdgpu_vm_pt_first_leaf - get first leaf PD/PT + * + * @adev: amdgpu_device pointer + * @vm: amdgpu_vm structure + * @start: start addr of the walk + * @cursor: state to initialize + * + * Start a walk and go directly to the leaf node. + */ +static void amdgpu_vm_pt_first_leaf(struct amdgpu_device *adev, + struct amdgpu_vm *vm, uint64_t start, + struct amdgpu_vm_pt_cursor *cursor) +{ + amdgpu_vm_pt_start(adev, vm, start, cursor); + while (amdgpu_vm_pt_descendant(adev, cursor)); +} + +/** + * amdgpu_vm_pt_next_leaf - get next leaf PD/PT + * + * @adev: amdgpu_device pointer + * @cursor: current state + * + * Walk the PD/PT tree to the next leaf node. 
+ */ +static void amdgpu_vm_pt_next_leaf(struct amdgpu_device *adev, + struct amdgpu_vm_pt_cursor *cursor) +{ + amdgpu_vm_pt_next(adev, cursor); + while (amdgpu_vm_pt_descendant(adev, cursor)); +} + +/** + * for_each_amdgpu_vm_pt_leaf - walk over all leaf PDs/PTs in the hierarchy + */ +#define for_each_amdgpu_vm_pt_leaf(adev, vm, start, end, cursor) \ + for (amdgpu_vm_pt_first_leaf((adev), (vm), (start), &(cursor)); \ + (cursor).pfn <= end; amdgpu_vm_pt_next_leaf((adev), &(cursor))) + +/** + * amdgpu_vm_pt_first_dfs - start a deep first search + * + * @adev: amdgpu_device structure + * @vm: amdgpu_vm structure + * @cursor: state to initialize + * + * Starts a deep first traversal of the PD/PT tree. + */ +static void amdgpu_vm_pt_first_dfs(struct amdgpu_device *adev, + struct amdgpu_vm *vm, + struct amdgpu_vm_pt_cursor *cursor) +{ + amdgpu_vm_pt_start(adev, vm, 0, cursor); + while (amdgpu_vm_pt_descendant(adev, cursor)); +} + +/** + * amdgpu_vm_pt_next_dfs - get the next node for a deep first search + * + * @adev: amdgpu_device structure + * @cursor: current state + * + * Move the cursor to the next node in a deep first search. + */ +static void amdgpu_vm_pt_next_dfs(struct amdgpu_device *adev, + struct amdgpu_vm_pt_cursor *cursor) +{ + if (!cursor->entry) + return; + + if (!cursor->parent) + cursor->entry = NULL; + else if (amdgpu_vm_pt_sibling(adev, cursor)) + while (amdgpu_vm_pt_descendant(adev, cursor)); + else + amdgpu_vm_pt_ancestor(cursor); +} + +/** + * for_each_amdgpu_vm_pt_dfs_safe - safe deep first search of all PDs/PTs + */ +#define for_each_amdgpu_vm_pt_dfs_safe(adev, vm, cursor, entry) \ + for (amdgpu_vm_pt_first_dfs((adev), (vm), &(cursor)), \ + (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), &(cursor));\ + (entry); (entry) = (cursor).entry, \ + amdgpu_vm_pt_next_dfs((adev), &(cursor))) + +/** * amdgpu_vm_get_pd_bo - add the VM PD to a validation list * * @vm: vm providing the BOs @@ -260,15 +614,55 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm, struct list_head *validated, struct amdgpu_bo_list_entry *entry) { - entry->robj = vm->root.base.bo; entry->priority = 0; - entry->tv.bo = &entry->robj->tbo; + entry->tv.bo = &vm->root.base.bo->tbo; entry->tv.shared = true; entry->user_pages = NULL; list_add(&entry->tv.head, validated); } /** + * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU + * + * @adev: amdgpu device pointer + * @vm: vm providing the BOs + * + * Move all BOs to the end of LRU and remember their positions to put them + * together. 
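/*
 * Illustrative sketch, not part of this patch: the leaf iterator visits the
 * deepest PD/PT entry covering each part of a pfn range, which is how
 * amdgpu_vm_alloc_pts() below drives its allocations.  A hypothetical helper
 * counting how many of those entries already have a backing BO would look
 * like this.
 */
static unsigned example_count_leaf_pts(struct amdgpu_device *adev,
                                       struct amdgpu_vm *vm,
                                       uint64_t start, uint64_t end)
{
        struct amdgpu_vm_pt_cursor cursor;
        unsigned count = 0;

        for_each_amdgpu_vm_pt_leaf(adev, vm, start, end, cursor)
                if (cursor.entry->base.bo)
                        ++count;

        return count;
}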
+ */ +void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, + struct amdgpu_vm *vm) +{ + struct ttm_bo_global *glob = adev->mman.bdev.glob; + struct amdgpu_vm_bo_base *bo_base; + + if (vm->bulk_moveable) { + spin_lock(&glob->lru_lock); + ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move); + spin_unlock(&glob->lru_lock); + return; + } + + memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move)); + + spin_lock(&glob->lru_lock); + list_for_each_entry(bo_base, &vm->idle, vm_status) { + struct amdgpu_bo *bo = bo_base->bo; + + if (!bo->parent) + continue; + + ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move); + if (bo->shadow) + ttm_bo_move_to_lru_tail(&bo->shadow->tbo, + &vm->lru_bulk_move); + } + spin_unlock(&glob->lru_lock); + + vm->bulk_moveable = true; +} + +/** * amdgpu_vm_validate_pt_bos - validate the page table BOs * * @adev: amdgpu device pointer @@ -285,47 +679,36 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, int (*validate)(void *p, struct amdgpu_bo *bo), void *param) { - struct ttm_bo_global *glob = adev->mman.bdev.glob; struct amdgpu_vm_bo_base *bo_base, *tmp; int r = 0; + vm->bulk_moveable &= list_empty(&vm->evicted); + list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) { struct amdgpu_bo *bo = bo_base->bo; - if (bo->parent) { - r = validate(param, bo); - if (r) - break; - - spin_lock(&glob->lru_lock); - ttm_bo_move_to_lru_tail(&bo->tbo); - if (bo->shadow) - ttm_bo_move_to_lru_tail(&bo->shadow->tbo); - spin_unlock(&glob->lru_lock); - } + r = validate(param, bo); + if (r) + break; if (bo->tbo.type != ttm_bo_type_kernel) { - spin_lock(&vm->moved_lock); - list_move(&bo_base->vm_status, &vm->moved); - spin_unlock(&vm->moved_lock); + amdgpu_vm_bo_moved(bo_base); } else { - list_move(&bo_base->vm_status, &vm->relocated); + if (vm->use_cpu_for_update) + r = amdgpu_bo_kmap(bo, NULL); + else + r = amdgpu_ttm_alloc_gart(&bo->tbo); + if (r) + break; + if (bo->shadow) { + r = amdgpu_ttm_alloc_gart(&bo->shadow->tbo); + if (r) + break; + } + amdgpu_vm_bo_relocated(bo_base); } } - spin_lock(&glob->lru_lock); - list_for_each_entry(bo_base, &vm->idle, vm_status) { - struct amdgpu_bo *bo = bo_base->bo; - - if (!bo->parent) - continue; - - ttm_bo_move_to_lru_tail(&bo->tbo); - if (bo->shadow) - ttm_bo_move_to_lru_tail(&bo->shadow->tbo); - } - spin_unlock(&glob->lru_lock); - return r; } @@ -376,7 +759,7 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device *adev, if (level == adev->vm_manager.root_level) { ats_entries = amdgpu_vm_level_shift(adev, level); ats_entries += AMDGPU_GPU_PAGE_SHIFT; - ats_entries = AMDGPU_VA_HOLE_START >> ats_entries; + ats_entries = AMDGPU_GMC_HOLE_START >> ats_entries; ats_entries = min(ats_entries, entries); entries -= ats_entries; } else { @@ -397,6 +780,10 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device *adev, if (r) goto error; + r = amdgpu_ttm_alloc_gart(&bo->tbo); + if (r) + return r; + r = amdgpu_job_alloc_with_ib(adev, 64, &job); if (r) goto error; @@ -448,117 +835,33 @@ error: } /** - * amdgpu_vm_alloc_levels - allocate the PD/PT levels + * amdgpu_vm_bo_param - fill in parameters for PD/PT allocation * * @adev: amdgpu_device pointer - * @vm: requested vm - * @parent: parent PT - * @saddr: start of the address range - * @eaddr: end of the address range - * @level: VMPT level - * @ats: indicate ATS support from PTE - * - * Make sure the page directories and page tables are allocated - * - * Returns: - * 0 on success, errno otherwise. 
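/*
 * Illustrative sketch, not part of this patch: amdgpu_vm_validate_pt_bos()
 * below drops vm->bulk_moveable when BOs had to be moved back, and a caller
 * is expected to bump the idle BOs to the LRU tail in one bulk operation
 * afterwards.  The helper name and the exact call site (likely the command
 * submission path) are assumptions.
 */
static int example_validate_and_requeue(struct amdgpu_device *adev,
                                        struct amdgpu_vm *vm,
                                        int (*validate)(void *p, struct amdgpu_bo *bo),
                                        void *param)
{
        int r = amdgpu_vm_validate_pt_bos(adev, vm, validate, param);

        if (r)
                return r;

        amdgpu_vm_move_to_lru_tail(adev, vm);
        return 0;
}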
+ * @vm: requesting vm + * @bp: resulting BO allocation parameters */ -static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev, - struct amdgpu_vm *vm, - struct amdgpu_vm_pt *parent, - uint64_t saddr, uint64_t eaddr, - unsigned level, bool ats) +static void amdgpu_vm_bo_param(struct amdgpu_device *adev, struct amdgpu_vm *vm, + int level, struct amdgpu_bo_param *bp) { - unsigned shift = amdgpu_vm_level_shift(adev, level); - unsigned pt_idx, from, to; - u64 flags; - int r; - - if (!parent->entries) { - unsigned num_entries = amdgpu_vm_num_entries(adev, level); - - parent->entries = kvmalloc_array(num_entries, - sizeof(struct amdgpu_vm_pt), - GFP_KERNEL | __GFP_ZERO); - if (!parent->entries) - return -ENOMEM; - memset(parent->entries, 0 , sizeof(struct amdgpu_vm_pt)); - } - - from = saddr >> shift; - to = eaddr >> shift; - if (from >= amdgpu_vm_num_entries(adev, level) || - to >= amdgpu_vm_num_entries(adev, level)) - return -EINVAL; - - ++level; - saddr = saddr & ((1 << shift) - 1); - eaddr = eaddr & ((1 << shift) - 1); - - flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; - if (vm->root.base.bo->shadow) - flags |= AMDGPU_GEM_CREATE_SHADOW; + memset(bp, 0, sizeof(*bp)); + + bp->size = amdgpu_vm_bo_size(adev, level); + bp->byte_align = AMDGPU_GPU_PAGE_SIZE; + bp->domain = AMDGPU_GEM_DOMAIN_VRAM; + if (bp->size <= PAGE_SIZE && adev->asic_type >= CHIP_VEGA10 && + adev->flags & AMD_IS_APU) + bp->domain |= AMDGPU_GEM_DOMAIN_GTT; + bp->domain = amdgpu_bo_get_preferred_pin_domain(adev, bp->domain); + bp->flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | + AMDGPU_GEM_CREATE_CPU_GTT_USWC; if (vm->use_cpu_for_update) - flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; - else - flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS; - - /* walk over the address space and allocate the page tables */ - for (pt_idx = from; pt_idx <= to; ++pt_idx) { - struct reservation_object *resv = vm->root.base.bo->tbo.resv; - struct amdgpu_vm_pt *entry = &parent->entries[pt_idx]; - struct amdgpu_bo *pt; - - if (!entry->base.bo) { - struct amdgpu_bo_param bp; - - memset(&bp, 0, sizeof(bp)); - bp.size = amdgpu_vm_bo_size(adev, level); - bp.byte_align = AMDGPU_GPU_PAGE_SIZE; - bp.domain = AMDGPU_GEM_DOMAIN_VRAM; - bp.flags = flags; - bp.type = ttm_bo_type_kernel; - bp.resv = resv; - r = amdgpu_bo_create(adev, &bp, &pt); - if (r) - return r; - - r = amdgpu_vm_clear_bo(adev, vm, pt, level, ats); - if (r) { - amdgpu_bo_unref(&pt->shadow); - amdgpu_bo_unref(&pt); - return r; - } - - if (vm->use_cpu_for_update) { - r = amdgpu_bo_kmap(pt, NULL); - if (r) { - amdgpu_bo_unref(&pt->shadow); - amdgpu_bo_unref(&pt); - return r; - } - } - - /* Keep a reference to the root directory to avoid - * freeing them up in the wrong order. - */ - pt->parent = amdgpu_bo_ref(parent->base.bo); - - amdgpu_vm_bo_base_init(&entry->base, vm, pt); - } - - if (level < AMDGPU_VM_PTB) { - uint64_t sub_saddr = (pt_idx == from) ? saddr : 0; - uint64_t sub_eaddr = (pt_idx == to) ? eaddr : - ((1 << shift) - 1); - r = amdgpu_vm_alloc_levels(adev, vm, entry, sub_saddr, - sub_eaddr, level, ats); - if (r) - return r; - } - } - - return 0; + bp->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; + else if (!vm->root.base.bo || vm->root.base.bo->shadow) + bp->flags |= AMDGPU_GEM_CREATE_SHADOW; + bp->type = ttm_bo_type_kernel; + if (vm->root.base.bo) + bp->resv = vm->root.base.bo->tbo.resv; } /** @@ -569,7 +872,7 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev, * @saddr: Start address which needs to be allocated * @size: Size from start address we need. 
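/*
 * Illustrative sketch, not part of this patch: amdgpu_vm_bo_param() now
 * centralizes the allocation parameters for PD/PT BOs, so creating one
 * collapses to filling the params and calling amdgpu_bo_create(), as both
 * amdgpu_vm_alloc_pts() and amdgpu_vm_init() do below.  The helper name is
 * hypothetical.
 */
static int example_create_pt(struct amdgpu_device *adev, struct amdgpu_vm *vm,
                             int level, struct amdgpu_bo **pt)
{
        struct amdgpu_bo_param bp;

        amdgpu_vm_bo_param(adev, vm, level, &bp);
        return amdgpu_bo_create(adev, &bp, pt);
}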
* - * Make sure the page tables are allocated. + * Make sure the page directories and page tables are allocated * * Returns: * 0 on success, errno otherwise. @@ -578,8 +881,11 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev, struct amdgpu_vm *vm, uint64_t saddr, uint64_t size) { - uint64_t eaddr; + struct amdgpu_vm_pt_cursor cursor; + struct amdgpu_bo *pt; bool ats = false; + uint64_t eaddr; + int r; /* validate the parameters */ if (saddr & AMDGPU_GPU_PAGE_MASK || size & AMDGPU_GPU_PAGE_MASK) @@ -588,7 +894,7 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev, eaddr = saddr + size - 1; if (vm->pte_support_ats) - ats = saddr < AMDGPU_VA_HOLE_START; + ats = saddr < AMDGPU_GMC_HOLE_START; saddr /= AMDGPU_GPU_PAGE_SIZE; eaddr /= AMDGPU_GPU_PAGE_SIZE; @@ -599,8 +905,84 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev, return -EINVAL; } - return amdgpu_vm_alloc_levels(adev, vm, &vm->root, saddr, eaddr, - adev->vm_manager.root_level, ats); + for_each_amdgpu_vm_pt_leaf(adev, vm, saddr, eaddr, cursor) { + struct amdgpu_vm_pt *entry = cursor.entry; + struct amdgpu_bo_param bp; + + if (cursor.level < AMDGPU_VM_PTB) { + unsigned num_entries; + + num_entries = amdgpu_vm_num_entries(adev, cursor.level); + entry->entries = kvmalloc_array(num_entries, + sizeof(*entry->entries), + GFP_KERNEL | + __GFP_ZERO); + if (!entry->entries) + return -ENOMEM; + } + + + if (entry->base.bo) + continue; + + amdgpu_vm_bo_param(adev, vm, cursor.level, &bp); + + r = amdgpu_bo_create(adev, &bp, &pt); + if (r) + return r; + + r = amdgpu_vm_clear_bo(adev, vm, pt, cursor.level, ats); + if (r) + goto error_free_pt; + + if (vm->use_cpu_for_update) { + r = amdgpu_bo_kmap(pt, NULL); + if (r) + goto error_free_pt; + } + + /* Keep a reference to the root directory to avoid + * freeing them up in the wrong order. + */ + pt->parent = amdgpu_bo_ref(cursor.parent->base.bo); + + amdgpu_vm_bo_base_init(&entry->base, vm, pt); + } + + return 0; + +error_free_pt: + amdgpu_bo_unref(&pt->shadow); + amdgpu_bo_unref(&pt); + return r; +} + +/** + * amdgpu_vm_free_pts - free PD/PT levels + * + * @adev: amdgpu device structure + * @vm: amdgpu vm structure + * + * Free the page directory or page table level and all sub levels. 
+ */ +static void amdgpu_vm_free_pts(struct amdgpu_device *adev, + struct amdgpu_vm *vm) +{ + struct amdgpu_vm_pt_cursor cursor; + struct amdgpu_vm_pt *entry; + + for_each_amdgpu_vm_pt_dfs_safe(adev, vm, cursor, entry) { + + if (entry->base.bo) { + entry->base.bo->vm_bo = NULL; + list_del(&entry->base.vm_status); + amdgpu_bo_unref(&entry->base.bo->shadow); + amdgpu_bo_unref(&entry->base.bo); + } + kvfree(entry->entries); + } + + BUG_ON(vm->root.base.bo); } /** @@ -714,7 +1096,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_ } gds_switch_needed &= !!ring->funcs->emit_gds_switch; - vm_flush_needed &= !!ring->funcs->emit_vm_flush; + vm_flush_needed &= !!ring->funcs->emit_vm_flush && + job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET; pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping && ring->funcs->emit_wreg; @@ -799,12 +1182,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_ struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm, struct amdgpu_bo *bo) { - struct amdgpu_bo_va *bo_va; + struct amdgpu_vm_bo_base *base; - list_for_each_entry(bo_va, &bo->va, base.bo_list) { - if (bo_va->base.vm == vm) { - return bo_va; - } + for (base = bo->vm_bo; base; base = base->next) { + if (base->vm != vm) + continue; + + return container_of(base, struct amdgpu_bo_va, base); } return NULL; } @@ -957,6 +1341,22 @@ static int amdgpu_vm_wait_pd(struct amdgpu_device *adev, struct amdgpu_vm *vm, return r; } +/** + * amdgpu_vm_update_func - helper to call update function + * + * Calls the update function for both the given BO as well as its shadow. + */ +static void amdgpu_vm_update_func(struct amdgpu_pte_update_params *params, + struct amdgpu_bo *bo, + uint64_t pe, uint64_t addr, + unsigned count, uint32_t incr, + uint64_t flags) +{ + if (bo->shadow) + params->func(params, bo->shadow, pe, addr, count, incr, flags); + params->func(params, bo, pe, addr, count, incr, flags); +} + /* * amdgpu_vm_update_pde - update a single level in the hierarchy * @@ -984,47 +1384,28 @@ static void amdgpu_vm_update_pde(struct amdgpu_pte_update_params *params, pbo = pbo->parent; level += params->adev->vm_manager.root_level; - pt = amdgpu_bo_gpu_offset(entry->base.bo); - flags = AMDGPU_PTE_VALID; - amdgpu_gmc_get_vm_pde(params->adev, level, &pt, &flags); + amdgpu_gmc_get_pde_for_bo(entry->base.bo, level, &pt, &flags); pde = (entry - parent->entries) * 8; - if (bo->shadow) - params->func(params, bo->shadow, pde, pt, 1, 0, flags); - params->func(params, bo, pde, pt, 1, 0, flags); + amdgpu_vm_update_func(params, bo, pde, pt, 1, 0, flags); } /* - * amdgpu_vm_invalidate_level - mark all PD levels as invalid + * amdgpu_vm_invalidate_pds - mark all PDs as invalid * * @adev: amdgpu_device pointer * @vm: related vm - * @parent: parent PD - * @level: VMPT level * * Mark all PD level as invalid after an error. */ -static void amdgpu_vm_invalidate_level(struct amdgpu_device *adev, - struct amdgpu_vm *vm, - struct amdgpu_vm_pt *parent, - unsigned level) +static void amdgpu_vm_invalidate_pds(struct amdgpu_device *adev, + struct amdgpu_vm *vm) { - unsigned pt_idx, num_entries; - - /* - * Recurse into the subdirectories. This recursion is harmless because - * we only have a maximum of 5 layers. 
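/*
 * Note, not part of this patch: for_each_amdgpu_vm_pt_dfs_safe() yields
 * entries in post order and advances the cursor before the loop body runs,
 * so every child is visited before its parent and the body may free the
 * current entry's BO and entries[] array, which is exactly what
 * amdgpu_vm_free_pts() above relies on.
 */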
- */ - num_entries = amdgpu_vm_num_entries(adev, level); - for (pt_idx = 0; pt_idx < num_entries; ++pt_idx) { - struct amdgpu_vm_pt *entry = &parent->entries[pt_idx]; - - if (!entry->base.bo) - continue; + struct amdgpu_vm_pt_cursor cursor; + struct amdgpu_vm_pt *entry; - if (!entry->base.moved) - list_move(&entry->base.vm_status, &vm->relocated); - amdgpu_vm_invalidate_level(adev, vm, entry, level + 1); - } + for_each_amdgpu_vm_pt_dfs_safe(adev, vm, cursor, entry) + if (entry->base.bo && !entry->base.moved) + amdgpu_vm_bo_relocated(&entry->base); } /* @@ -1054,14 +1435,6 @@ restart: params.adev = adev; if (vm->use_cpu_for_update) { - struct amdgpu_vm_bo_base *bo_base; - - list_for_each_entry(bo_base, &vm->relocated, vm_status) { - r = amdgpu_bo_kmap(bo_base->bo, NULL); - if (unlikely(r)) - return r; - } - r = amdgpu_vm_wait_pd(adev, vm, AMDGPU_FENCE_OWNER_VM); if (unlikely(r)) return r; @@ -1078,25 +1451,16 @@ restart: } while (!list_empty(&vm->relocated)) { - struct amdgpu_vm_bo_base *bo_base, *parent; struct amdgpu_vm_pt *pt, *entry; - struct amdgpu_bo *bo; - bo_base = list_first_entry(&vm->relocated, - struct amdgpu_vm_bo_base, - vm_status); - bo_base->moved = false; - list_del_init(&bo_base->vm_status); + entry = list_first_entry(&vm->relocated, struct amdgpu_vm_pt, + base.vm_status); + amdgpu_vm_bo_idle(&entry->base); - bo = bo_base->bo->parent; - if (!bo) + pt = amdgpu_vm_pt_parent(entry); + if (!pt) continue; - parent = list_first_entry(&bo->va, struct amdgpu_vm_bo_base, - bo_list); - pt = container_of(parent, struct amdgpu_vm_pt, base); - entry = container_of(bo_base, struct amdgpu_vm_pt, base); - amdgpu_vm_update_pde(¶ms, vm, pt, entry); if (!vm->use_cpu_for_update && @@ -1138,85 +1502,90 @@ restart: return 0; error: - amdgpu_vm_invalidate_level(adev, vm, &vm->root, - adev->vm_manager.root_level); + amdgpu_vm_invalidate_pds(adev, vm); amdgpu_job_free(job); return r; } /** - * amdgpu_vm_find_entry - find the entry for an address + * amdgpu_vm_update_huge - figure out parameters for PTE updates * - * @p: see amdgpu_pte_update_params definition - * @addr: virtual address in question - * @entry: resulting entry or NULL - * @parent: parent entry - * - * Find the vm_pt entry and it's parent for the given address. + * Make sure to set the right flags for the PTEs at the desired level. 
*/ -void amdgpu_vm_get_entry(struct amdgpu_pte_update_params *p, uint64_t addr, - struct amdgpu_vm_pt **entry, - struct amdgpu_vm_pt **parent) -{ - unsigned level = p->adev->vm_manager.root_level; - - *parent = NULL; - *entry = &p->vm->root; - while ((*entry)->entries) { - unsigned shift = amdgpu_vm_level_shift(p->adev, level++); +static void amdgpu_vm_update_huge(struct amdgpu_pte_update_params *params, + struct amdgpu_bo *bo, unsigned level, + uint64_t pe, uint64_t addr, + unsigned count, uint32_t incr, + uint64_t flags) - *parent = *entry; - *entry = &(*entry)->entries[addr >> shift]; - addr &= (1ULL << shift) - 1; +{ + if (level != AMDGPU_VM_PTB) { + flags |= AMDGPU_PDE_PTE; + amdgpu_gmc_get_vm_pde(params->adev, level, &addr, &flags); } - if (level != AMDGPU_VM_PTB) - *entry = NULL; + amdgpu_vm_update_func(params, bo, pe, addr, count, incr, flags); } /** - * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages + * amdgpu_vm_fragment - get fragment for PTEs * - * @p: see amdgpu_pte_update_params definition - * @entry: vm_pt entry to check - * @parent: parent entry - * @nptes: number of PTEs updated with this operation - * @dst: destination address where the PTEs should point to - * @flags: access flags fro the PTEs + * @params: see amdgpu_pte_update_params definition + * @start: first PTE to handle + * @end: last PTE to handle + * @flags: hw mapping flags + * @frag: resulting fragment size + * @frag_end: end of this fragment * - * Check if we can update the PD with a huge page. + * Returns the first possible fragment for the start and end address. */ -static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p, - struct amdgpu_vm_pt *entry, - struct amdgpu_vm_pt *parent, - unsigned nptes, uint64_t dst, - uint64_t flags) +static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params, + uint64_t start, uint64_t end, uint64_t flags, + unsigned int *frag, uint64_t *frag_end) { - uint64_t pde; + /** + * The MC L1 TLB supports variable sized pages, based on a fragment + * field in the PTE. When this field is set to a non-zero value, page + * granularity is increased from 4KB to (1 << (12 + frag)). The PTE + * flags are considered valid for all PTEs within the fragment range + * and corresponding mappings are assumed to be physically contiguous. + * + * The L1 TLB can store a single PTE for the whole fragment, + * significantly increasing the space available for translation + * caching. This leads to large improvements in throughput when the + * TLB is under pressure. + * + * The L2 TLB distributes small and large fragments into two + * asymmetric partitions. The large fragment cache is significantly + * larger. Thus, we try to use large fragments wherever possible. + * Userspace can support this by aligning virtual base address and + * allocation size to the fragment size. + * + * Starting with Vega10 the fragment size only controls the L1. The L2 + * is now directly feed with small/huge/giant pages from the walker. + */ + unsigned max_frag; - /* In the case of a mixed PT the PDE must point to it*/ - if (p->adev->asic_type >= CHIP_VEGA10 && !p->src && - nptes == AMDGPU_VM_PTE_COUNT(p->adev)) { - /* Set the huge page flag to stop scanning at this PDE */ - flags |= AMDGPU_PDE_PTE; - } + if (params->adev->asic_type < CHIP_VEGA10) + max_frag = params->adev->vm_manager.fragment_size; + else + max_frag = 31; - if (!(flags & AMDGPU_PDE_PTE)) { - if (entry->huge) { - /* Add the entry to the relocated list to update it. 
*/ - entry->huge = false; - list_move(&entry->base.vm_status, &p->vm->relocated); - } + /* system pages are non continuously */ + if (params->src) { + *frag = 0; + *frag_end = end; return; } - entry->huge = true; - amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags); - - pde = (entry - parent->entries) * 8; - if (parent->base.bo->shadow) - p->func(p, parent->base.bo->shadow, pde, dst, 1, 0, flags); - p->func(p, parent->base.bo, pde, dst, 1, 0, flags); + /* This intentionally wraps around if no bit is set */ + *frag = min((unsigned)ffs(start) - 1, (unsigned)fls64(end - start) - 1); + if (*frag >= max_frag) { + *frag = max_frag; + *frag_end = end & ~((1ULL << max_frag) - 1); + } else { + *frag_end = start + (1 << *frag); + } } /** @@ -1234,112 +1603,105 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p, * 0 for success, -EINVAL for failure. */ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params, - uint64_t start, uint64_t end, - uint64_t dst, uint64_t flags) + uint64_t start, uint64_t end, + uint64_t dst, uint64_t flags) { struct amdgpu_device *adev = params->adev; - const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1; + struct amdgpu_vm_pt_cursor cursor; + uint64_t frag_start = start, frag_end; + unsigned int frag; - uint64_t addr, pe_start; - struct amdgpu_bo *pt; - unsigned nptes; + /* figure out the initial fragment */ + amdgpu_vm_fragment(params, frag_start, end, flags, &frag, &frag_end); - /* walk over the address space and update the page tables */ - for (addr = start; addr < end; addr += nptes, - dst += nptes * AMDGPU_GPU_PAGE_SIZE) { - struct amdgpu_vm_pt *entry, *parent; + /* walk over the address space and update the PTs */ + amdgpu_vm_pt_start(adev, params->vm, start, &cursor); + while (cursor.pfn < end) { + struct amdgpu_bo *pt = cursor.entry->base.bo; + unsigned shift, parent_shift, mask; + uint64_t incr, entry_end, pe_start; - amdgpu_vm_get_entry(params, addr, &entry, &parent); - if (!entry) + if (!pt) return -ENOENT; - if ((addr & ~mask) == (end & ~mask)) - nptes = end - addr; - else - nptes = AMDGPU_VM_PTE_COUNT(adev) - (addr & mask); - - amdgpu_vm_handle_huge_pages(params, entry, parent, - nptes, dst, flags); - /* We don't need to update PTEs for huge pages */ - if (entry->huge) + /* The root level can't be a huge page */ + if (cursor.level == adev->vm_manager.root_level) { + if (!amdgpu_vm_pt_descendant(adev, &cursor)) + return -ENOENT; continue; + } - pt = entry->base.bo; - pe_start = (addr & mask) * 8; - if (pt->shadow) - params->func(params, pt->shadow, pe_start, dst, nptes, - AMDGPU_GPU_PAGE_SIZE, flags); - params->func(params, pt, pe_start, dst, nptes, - AMDGPU_GPU_PAGE_SIZE, flags); - } - - return 0; -} + /* First check if the entry is already handled */ + if (cursor.pfn < frag_start) { + cursor.entry->huge = true; + amdgpu_vm_pt_next(adev, &cursor); + continue; + } -/* - * amdgpu_vm_frag_ptes - add fragment information to PTEs - * - * @params: see amdgpu_pte_update_params definition - * @vm: requested vm - * @start: first PTE to handle - * @end: last PTE to handle - * @dst: addr those PTEs should point to - * @flags: hw mapping flags - * - * Returns: - * 0 for success, -EINVAL for failure. - */ -static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params, - uint64_t start, uint64_t end, - uint64_t dst, uint64_t flags) -{ - /** - * The MC L1 TLB supports variable sized pages, based on a fragment - * field in the PTE. 
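/*
 * Worked example, not part of this patch, assuming a GMC9 part where
 * max_frag is 31: for start = 0x200 and end = 0x260 (GPU pfns),
 * ffs(0x200) - 1 = 9 and fls64(0x60) - 1 = 6, so amdgpu_vm_fragment()
 * returns frag = 6 and frag_end = 0x200 + (1 << 6) = 0x240.  The first 64
 * PTEs are therefore written with AMDGPU_PTE_FRAG(6), and the walk then
 * recomputes the fragment for the remaining 0x240..0x260 range.
 */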
When this field is set to a non-zero value, page - * granularity is increased from 4KB to (1 << (12 + frag)). The PTE - * flags are considered valid for all PTEs within the fragment range - * and corresponding mappings are assumed to be physically contiguous. - * - * The L1 TLB can store a single PTE for the whole fragment, - * significantly increasing the space available for translation - * caching. This leads to large improvements in throughput when the - * TLB is under pressure. - * - * The L2 TLB distributes small and large fragments into two - * asymmetric partitions. The large fragment cache is significantly - * larger. Thus, we try to use large fragments wherever possible. - * Userspace can support this by aligning virtual base address and - * allocation size to the fragment size. - */ - unsigned max_frag = params->adev->vm_manager.fragment_size; - int r; + /* If it isn't already handled it can't be a huge page */ + if (cursor.entry->huge) { + /* Add the entry to the relocated list to update it. */ + cursor.entry->huge = false; + amdgpu_vm_bo_relocated(&cursor.entry->base); + } - /* system pages are non continuously */ - if (params->src || !(flags & AMDGPU_PTE_VALID)) - return amdgpu_vm_update_ptes(params, start, end, dst, flags); - - while (start != end) { - uint64_t frag_flags, frag_end; - unsigned frag; - - /* This intentionally wraps around if no bit is set */ - frag = min((unsigned)ffs(start) - 1, - (unsigned)fls64(end - start) - 1); - if (frag >= max_frag) { - frag_flags = AMDGPU_PTE_FRAG(max_frag); - frag_end = end & ~((1ULL << max_frag) - 1); - } else { - frag_flags = AMDGPU_PTE_FRAG(frag); - frag_end = start + (1 << frag); + shift = amdgpu_vm_level_shift(adev, cursor.level); + parent_shift = amdgpu_vm_level_shift(adev, cursor.level - 1); + if (adev->asic_type < CHIP_VEGA10) { + /* No huge page support before GMC v9 */ + if (cursor.level != AMDGPU_VM_PTB) { + if (!amdgpu_vm_pt_descendant(adev, &cursor)) + return -ENOENT; + continue; + } + } else if (frag < shift) { + /* We can't use this level when the fragment size is + * smaller than the address shift. Go to the next + * child entry and try again. + */ + if (!amdgpu_vm_pt_descendant(adev, &cursor)) + return -ENOENT; + continue; + } else if (frag >= parent_shift) { + /* If the fragment size is even larger than the parent + * shift we should go up one level and check it again. 
+ */ + if (!amdgpu_vm_pt_ancestor(&cursor)) + return -ENOENT; + continue; } - r = amdgpu_vm_update_ptes(params, start, frag_end, dst, - flags | frag_flags); - if (r) - return r; + /* Looks good so far, calculate parameters for the update */ + incr = AMDGPU_GPU_PAGE_SIZE << shift; + mask = amdgpu_vm_entries_mask(adev, cursor.level); + pe_start = ((cursor.pfn >> shift) & mask) * 8; + entry_end = (mask + 1) << shift; + entry_end += cursor.pfn & ~(entry_end - 1); + entry_end = min(entry_end, end); + + do { + uint64_t upd_end = min(entry_end, frag_end); + unsigned nptes = (upd_end - frag_start) >> shift; + + amdgpu_vm_update_huge(params, pt, cursor.level, + pe_start, dst, nptes, incr, + flags | AMDGPU_PTE_FRAG(frag)); + + pe_start += nptes * 8; + dst += nptes * AMDGPU_GPU_PAGE_SIZE << shift; + + frag_start = upd_end; + if (frag_start >= frag_end) { + /* figure out the next fragment */ + amdgpu_vm_fragment(params, frag_start, end, + flags, &frag, &frag_end); + if (frag < shift) + break; + } + } while (frag_start < entry_end); - dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE; - start = frag_end; + if (frag >= shift) + amdgpu_vm_pt_next(adev, &cursor); } return 0; @@ -1401,8 +1763,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, params.func = amdgpu_vm_cpu_set_ptes; params.pages_addr = pages_addr; - return amdgpu_vm_frag_ptes(¶ms, start, last + 1, - addr, flags); + return amdgpu_vm_update_ptes(¶ms, start, last + 1, + addr, flags); } ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched); @@ -1481,7 +1843,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, if (r) goto error_free; - r = amdgpu_vm_frag_ptes(¶ms, start, last + 1, addr, flags); + r = amdgpu_vm_update_ptes(¶ms, start, last + 1, addr, flags); if (r) goto error_free; @@ -1696,10 +2058,6 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, amdgpu_asic_flush_hdp(adev, NULL); } - spin_lock(&vm->moved_lock); - list_del_init(&bo_va->base.vm_status); - spin_unlock(&vm->moved_lock); - /* If the BO is not in its preferred location add it back to * the evicted list so that it gets validated again on the * next command submission. 
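/*
 * Illustrative note, not part of this patch, assuming a GMC9 layout with a
 * 9-bit block size: at the PTB level the shift is 0, so incr is
 * AMDGPU_GPU_PAGE_SIZE and each step of the update loop writes one PTE per
 * 4KiB page.  When the walk instead stays one level up at PDB0 because
 * frag >= 9, the shift is 9, incr becomes 4KiB << 9 = 2MiB and
 * amdgpu_vm_update_huge() sets AMDGPU_PDE_PTE so a single entry maps a whole
 * 2MiB huge page.
 */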
@@ -1708,9 +2066,11 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, uint32_t mem_type = bo->tbo.mem.mem_type; if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(mem_type))) - list_add_tail(&bo_va->base.vm_status, &vm->evicted); + amdgpu_vm_bo_evicted(&bo_va->base); else - list_add(&bo_va->base.vm_status, &vm->idle); + amdgpu_vm_bo_idle(&bo_va->base); + } else { + amdgpu_vm_bo_done(&bo_va->base); } list_splice_init(&bo_va->invalids, &bo_va->valids); @@ -1895,7 +2255,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, struct amdgpu_bo_va_mapping, list); list_del(&mapping->list); - if (vm->pte_support_ats && mapping->start < AMDGPU_VA_HOLE_START) + if (vm->pte_support_ats && + mapping->start < AMDGPU_GMC_HOLE_START) init_pte_value = AMDGPU_PTE_DEFAULT_ATC; r = amdgpu_vm_bo_update_mapping(adev, NULL, NULL, vm, @@ -1936,40 +2297,40 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, struct amdgpu_vm *vm) { struct amdgpu_bo_va *bo_va, *tmp; - struct list_head moved; + struct reservation_object *resv; bool clear; int r; - INIT_LIST_HEAD(&moved); - spin_lock(&vm->moved_lock); - list_splice_init(&vm->moved, &moved); - spin_unlock(&vm->moved_lock); + list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status) { + /* Per VM BOs never need to bo cleared in the page tables */ + r = amdgpu_vm_bo_update(adev, bo_va, false); + if (r) + return r; + } - list_for_each_entry_safe(bo_va, tmp, &moved, base.vm_status) { - struct reservation_object *resv = bo_va->base.bo->tbo.resv; + spin_lock(&vm->invalidated_lock); + while (!list_empty(&vm->invalidated)) { + bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, + base.vm_status); + resv = bo_va->base.bo->tbo.resv; + spin_unlock(&vm->invalidated_lock); - /* Per VM BOs never need to bo cleared in the page tables */ - if (resv == vm->root.base.bo->tbo.resv) - clear = false; /* Try to reserve the BO to avoid clearing its ptes */ - else if (!amdgpu_vm_debug && reservation_object_trylock(resv)) + if (!amdgpu_vm_debug && reservation_object_trylock(resv)) clear = false; /* Somebody else is using the BO right now */ else clear = true; r = amdgpu_vm_bo_update(adev, bo_va, clear); - if (r) { - spin_lock(&vm->moved_lock); - list_splice(&moved, &vm->moved); - spin_unlock(&vm->moved_lock); + if (r) return r; - } - if (!clear && resv != vm->root.base.bo->tbo.resv) + if (!clear) reservation_object_unlock(resv); - + spin_lock(&vm->invalidated_lock); } + spin_unlock(&vm->invalidated_lock); return 0; } @@ -2034,9 +2395,7 @@ static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev, if (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv && !bo_va->base.moved) { - spin_lock(&vm->moved_lock); list_move(&bo_va->base.vm_status, &vm->moved); - spin_unlock(&vm->moved_lock); } trace_amdgpu_vm_bo_map(bo_va, mapping); } @@ -2388,13 +2747,27 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va) { struct amdgpu_bo_va_mapping *mapping, *next; + struct amdgpu_bo *bo = bo_va->base.bo; struct amdgpu_vm *vm = bo_va->base.vm; + struct amdgpu_vm_bo_base **base; + + if (bo) { + if (bo->tbo.resv == vm->root.base.bo->tbo.resv) + vm->bulk_moveable = false; - list_del(&bo_va->base.bo_list); + for (base = &bo_va->base.bo->vm_bo; *base; + base = &(*base)->next) { + if (*base != &bo_va->base) + continue; + + *base = bo_va->base.next; + break; + } + } - spin_lock(&vm->moved_lock); + spin_lock(&vm->invalidated_lock); list_del(&bo_va->base.vm_status); - spin_unlock(&vm->moved_lock); + spin_unlock(&vm->invalidated_lock); 
list_for_each_entry_safe(mapping, next, &bo_va->valids, list) { list_del(&mapping->list); @@ -2432,30 +2805,24 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, if (bo->parent && bo->parent->shadow == bo) bo = bo->parent; - list_for_each_entry(bo_base, &bo->va, bo_list) { + for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { struct amdgpu_vm *vm = bo_base->vm; - bool was_moved = bo_base->moved; - bo_base->moved = true; if (evicted && bo->tbo.resv == vm->root.base.bo->tbo.resv) { - if (bo->tbo.type == ttm_bo_type_kernel) - list_move(&bo_base->vm_status, &vm->evicted); - else - list_move_tail(&bo_base->vm_status, - &vm->evicted); + amdgpu_vm_bo_evicted(bo_base); continue; } - if (was_moved) + if (bo_base->moved) continue; + bo_base->moved = true; - if (bo->tbo.type == ttm_bo_type_kernel) { - list_move(&bo_base->vm_status, &vm->relocated); - } else { - spin_lock(&bo_base->vm->moved_lock); - list_move(&bo_base->vm_status, &vm->moved); - spin_unlock(&bo_base->vm->moved_lock); - } + if (bo->tbo.type == ttm_bo_type_kernel) + amdgpu_vm_bo_relocated(bo_base); + else if (bo->tbo.resv == vm->root.base.bo->tbo.resv) + amdgpu_vm_bo_moved(bo_base); + else + amdgpu_vm_bo_invalidated(bo_base); } } @@ -2574,6 +2941,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size, adev->vm_manager.fragment_size); } +static struct amdgpu_retryfault_hashtable *init_fault_hash(void) +{ + struct amdgpu_retryfault_hashtable *fault_hash; + + fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL); + if (!fault_hash) + return fault_hash; + + INIT_CHASH_TABLE(fault_hash->hash, + AMDGPU_PAGEFAULT_HASH_BITS, 8, 0); + spin_lock_init(&fault_hash->lock); + fault_hash->count = 0; + + return fault_hash; +} + /** * amdgpu_vm_init - initialize a vm instance * @@ -2592,13 +2975,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, { struct amdgpu_bo_param bp; struct amdgpu_bo *root; - const unsigned align = min(AMDGPU_VM_PTB_ALIGN_SIZE, - AMDGPU_VM_PTE_COUNT(adev) * 8); - unsigned ring_instance; - struct amdgpu_ring *ring; - struct drm_sched_rq *rq; - unsigned long size; - uint64_t flags; int r, i; vm->va = RB_ROOT_CACHED; @@ -2606,18 +2982,15 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->reserved_vmid[i] = NULL; INIT_LIST_HEAD(&vm->evicted); INIT_LIST_HEAD(&vm->relocated); - spin_lock_init(&vm->moved_lock); INIT_LIST_HEAD(&vm->moved); INIT_LIST_HEAD(&vm->idle); + INIT_LIST_HEAD(&vm->invalidated); + spin_lock_init(&vm->invalidated_lock); INIT_LIST_HEAD(&vm->freed); /* create scheduler entity for page table updates */ - - ring_instance = atomic_inc_return(&adev->vm_manager.vm_pte_next_ring); - ring_instance %= adev->vm_manager.vm_pte_num_rings; - ring = adev->vm_manager.vm_pte_rings[ring_instance]; - rq = &ring->sched.sched_rq[DRM_SCHED_PRIORITY_KERNEL]; - r = drm_sched_entity_init(&vm->entity, &rq, 1, NULL); + r = drm_sched_entity_init(&vm->entity, adev->vm_manager.vm_pte_rqs, + adev->vm_manager.vm_pte_num_rqs, NULL); if (r) return r; @@ -2639,20 +3012,9 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, "CPU update of VM recommended only for large BAR system\n"); vm->last_update = NULL; - flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; - if (vm->use_cpu_for_update) - flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; - else if (vm_context != AMDGPU_VM_CONTEXT_COMPUTE) - flags |= AMDGPU_GEM_CREATE_SHADOW; - - size = amdgpu_vm_bo_size(adev, adev->vm_manager.root_level); - memset(&bp, 0, sizeof(bp)); - bp.size = size; - 
bp.byte_align = align; - bp.domain = AMDGPU_GEM_DOMAIN_VRAM; - bp.flags = flags; - bp.type = ttm_bo_type_kernel; - bp.resv = NULL; + amdgpu_vm_bo_param(adev, vm, adev->vm_manager.root_level, &bp); + if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) + bp.flags &= ~AMDGPU_GEM_CREATE_SHADOW; r = amdgpu_bo_create(adev, &bp, &root); if (r) goto error_free_sched_entity; @@ -2683,6 +3045,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->pasid = pasid; } + vm->fault_hash = init_fault_hash(); + if (!vm->fault_hash) { + r = -ENOMEM; + goto error_free_root; + } + INIT_KFIFO(vm->faults); vm->fault_credit = 16; @@ -2722,7 +3090,7 @@ error_free_sched_entity: * Returns: * 0 for success, -errno for errors. */ -int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) +int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid) { bool pte_support_ats = (adev->asic_type == CHIP_RAVEN); int r; @@ -2734,7 +3102,20 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) /* Sanity checks */ if (!RB_EMPTY_ROOT(&vm->va.rb_root) || vm->root.entries) { r = -EINVAL; - goto error; + goto unreserve_bo; + } + + if (pasid) { + unsigned long flags; + + spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags); + r = idr_alloc(&adev->vm_manager.pasid_idr, vm, pasid, pasid + 1, + GFP_ATOMIC); + spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags); + + if (r == -ENOSPC) + goto unreserve_bo; + r = 0; } /* Check if PD needs to be reinitialized and do it before @@ -2745,7 +3126,7 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) adev->vm_manager.root_level, pte_support_ats); if (r) - goto error; + goto free_idr; } /* Update VM state */ @@ -2764,45 +3145,52 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) idr_remove(&adev->vm_manager.pasid_idr, vm->pasid); spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags); + /* Free the original amdgpu allocated pasid + * Will be replaced with kfd allocated pasid + */ + amdgpu_pasid_free(vm->pasid); vm->pasid = 0; } /* Free the shadow bo for compute VM */ amdgpu_bo_unref(&vm->root.base.bo->shadow); -error: + if (pasid) + vm->pasid = pasid; + + goto unreserve_bo; + +free_idr: + if (pasid) { + unsigned long flags; + + spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags); + idr_remove(&adev->vm_manager.pasid_idr, pasid); + spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags); + } +unreserve_bo: amdgpu_bo_unreserve(vm->root.base.bo); return r; } /** - * amdgpu_vm_free_levels - free PD/PT levels - * - * @adev: amdgpu device structure - * @parent: PD/PT starting level to free - * @level: level of parent structure + * amdgpu_vm_release_compute - release a compute vm + * @adev: amdgpu_device pointer + * @vm: a vm turned into compute vm by calling amdgpu_vm_make_compute * - * Free the page directory or page table level and all sub levels. + * This is a correspondant of amdgpu_vm_make_compute. It decouples compute + * pasid from vm. Compute should stop use of vm after this call. 
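/*
 * Illustrative sketch, not part of this patch: with the new pasid parameter a
 * KFD process hands its own pasid to amdgpu_vm_make_compute() and detaches it
 * again with amdgpu_vm_release_compute() before the VM is torn down by the
 * graphics side.  The helper name and the exact caller are assumptions.
 */
static int example_bind_vm_to_kfd(struct amdgpu_device *adev,
                                  struct amdgpu_vm *vm, unsigned int kfd_pasid)
{
        int r;

        /* reuse an existing GFX VM for compute and install the KFD pasid */
        r = amdgpu_vm_make_compute(adev, vm, kfd_pasid);
        if (r)
                return r;

        /* ... compute work ... */

        /* drop the compute pasid again; amdgpu_vm_fini() still frees the VM */
        amdgpu_vm_release_compute(adev, vm);
        return 0;
}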
*/ -static void amdgpu_vm_free_levels(struct amdgpu_device *adev, - struct amdgpu_vm_pt *parent, - unsigned level) +void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) { - unsigned i, num_entries = amdgpu_vm_num_entries(adev, level); + if (vm->pasid) { + unsigned long flags; - if (parent->base.bo) { - list_del(&parent->base.bo_list); - list_del(&parent->base.vm_status); - amdgpu_bo_unref(&parent->base.bo->shadow); - amdgpu_bo_unref(&parent->base.bo); + spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags); + idr_remove(&adev->vm_manager.pasid_idr, vm->pasid); + spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags); } - - if (parent->entries) - for (i = 0; i < num_entries; i++) - amdgpu_vm_free_levels(adev, &parent->entries[i], - level + 1); - - kvfree(parent->entries); + vm->pasid = 0; } /** @@ -2826,7 +3214,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) /* Clear pending page faults from IH when the VM is destroyed */ while (kfifo_get(&vm->faults, &fault)) - amdgpu_ih_clear_fault(adev, fault); + amdgpu_vm_clear_fault(vm->fault_hash, fault); if (vm->pasid) { unsigned long flags; @@ -2836,6 +3224,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags); } + kfree(vm->fault_hash); + vm->fault_hash = NULL; + drm_sched_entity_destroy(&vm->entity); if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { @@ -2862,8 +3253,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) if (r) { dev_err(adev->dev, "Leaking page tables because BO reservation failed\n"); } else { - amdgpu_vm_free_levels(adev, &vm->root, - adev->vm_manager.root_level); + amdgpu_vm_free_pts(adev, vm); amdgpu_bo_unreserve(root); } amdgpu_bo_unref(&root); @@ -2926,7 +3316,6 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) for (i = 0; i < AMDGPU_MAX_RINGS; ++i) adev->vm_manager.seqno[i] = 0; - atomic_set(&adev->vm_manager.vm_pte_next_ring, 0); spin_lock_init(&adev->vm_manager.prt_lock); atomic_set(&adev->vm_manager.num_prt_users, 0); @@ -3002,7 +3391,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) /** * amdgpu_vm_get_task_info - Extracts task info for a PASID. * - * @dev: drm device pointer + * @adev: drm device pointer * @pasid: PASID identifier for VM * @task_info: task_info to fill. */ @@ -3037,3 +3426,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm) } } } + +/** + * amdgpu_vm_add_fault - Add a page fault record to fault hash table + * + * @fault_hash: fault hash table + * @key: 64-bit encoding of PASID and address + * + * This should be called when a retry page fault interrupt is + * received. If this is a new page fault, it will be added to a hash + * table. The return value indicates whether this is a new fault, or + * a fault that was already known and is already being handled. + * + * If there are too many pending page faults, this will fail. Retry + * interrupts should be ignored in this case until there is enough + * free space. + * + * Returns 0 if the fault was added, 1 if the fault was already known, + * -ENOSPC if there are too many pending faults. 
+ */ +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key) +{ + unsigned long flags; + int r = -ENOSPC; + + if (WARN_ON_ONCE(!fault_hash)) + /* Should be allocated in amdgpu_vm_init + */ + return r; + + spin_lock_irqsave(&fault_hash->lock, flags); + + /* Only let the hash table fill up to 50% for best performance */ + if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1))) + goto unlock_out; + + r = chash_table_copy_in(&fault_hash->hash, key, NULL); + if (!r) + fault_hash->count++; + + /* chash_table_copy_in should never fail unless we're losing count */ + WARN_ON_ONCE(r < 0); + +unlock_out: + spin_unlock_irqrestore(&fault_hash->lock, flags); + return r; +} + +/** + * amdgpu_vm_clear_fault - Remove a page fault record + * + * @fault_hash: fault hash table + * @key: 64-bit encoding of PASID and address + * + * This should be called when a page fault has been handled. Any + * future interrupt with this key will be processed as a new + * page fault. + */ +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key) +{ + unsigned long flags; + int r; + + if (!fault_hash) + return; + + spin_lock_irqsave(&fault_hash->lock, flags); + + r = chash_table_remove(&fault_hash->hash, key, NULL); + if (!WARN_ON_ONCE(r < 0)) { + fault_hash->count--; + WARN_ON_ONCE(fault_hash->count < 0); + } + + spin_unlock_irqrestore(&fault_hash->lock, flags); +} |
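/*
 * Illustrative sketch, not part of this patch: the retry-fault filter is
 * meant to be driven from the interrupt handler.  The helper names and the
 * key layout below (pasid in the high bits, page address in the low bits)
 * are assumptions; the real encoding is up to the IH code.
 */
static bool example_filter_retry_fault(struct amdgpu_vm *vm, u16 pasid,
                                       u64 fault_addr)
{
        u64 key = ((u64)pasid << 48) | (fault_addr >> AMDGPU_GPU_PAGE_SHIFT);

        /* 0: new fault, handle it; 1: already pending; -ENOSPC: drop it */
        return amdgpu_vm_add_fault(vm->fault_hash, key) == 0;
}

static void example_retry_fault_done(struct amdgpu_vm *vm, u64 key)
{
        /* once the fault is resolved, allow it to be reported again */
        amdgpu_vm_clear_fault(vm->fault_hash, key);
}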