59 files changed, 1562 insertions, 661 deletions
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/Kbuild b/drivers/gpu/drm/nouveau/nvkm/engine/gr/Kbuild
index 558c86fd8e82..b5418f05ccd8 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/Kbuild
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/Kbuild
@@ -40,6 +40,7 @@ nvkm-y += nvkm/engine/gr/gp108.o
 nvkm-y += nvkm/engine/gr/gp10b.o
 nvkm-y += nvkm/engine/gr/gv100.o
 nvkm-y += nvkm/engine/gr/tu102.o
+nvkm-y += nvkm/engine/gr/ga102.o
 
 nvkm-y += nvkm/engine/gr/ctxnv40.o
 nvkm-y += nvkm/engine/gr/ctxnv50.o
@@ -63,3 +64,4 @@ nvkm-y += nvkm/engine/gr/ctxgp104.o
 nvkm-y += nvkm/engine/gr/ctxgp107.o
 nvkm-y += nvkm/engine/gr/ctxgv100.o
 nvkm-y += nvkm/engine/gr/ctxtu102.o
+nvkm-y += nvkm/engine/gr/ctxga102.o
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/base.c
index 61759f54406e..71b824e6da9d 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/base.c
@@ -136,6 +136,17 @@ nvkm_gr_oneinit(struct nvkm_engine *engine)
 }
 
 static int
+nvkm_gr_reset(struct nvkm_engine *engine)
+{
+	struct nvkm_gr *gr = nvkm_gr(engine);
+
+	if (gr->func->reset)
+		return gr->func->reset(gr);
+
+	return -ENOSYS;
+}
+
+static int
 nvkm_gr_init(struct nvkm_engine *engine)
 {
 	struct nvkm_gr *gr = nvkm_gr(engine);
@@ -166,6 +177,7 @@ nvkm_gr = {
 	.oneinit = nvkm_gr_oneinit,
 	.init = nvkm_gr_init,
 	.fini = nvkm_gr_fini,
+	.reset = nvkm_gr_reset,
 	.intr = nvkm_gr_intr,
 	.tile = nvkm_gr_tile,
 	.chsw_load = nvkm_gr_chsw_load,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxga102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxga102.c
new file mode 100644
index 000000000000..11461adf5036
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxga102.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2019 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "ctxgf100.h"
+
+static void
+ga102_grctx_generate_sm_id(struct gf100_gr *gr, int gpc, int tpc, int sm)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	tpc = gv100_gr_nonpes_aware_tpc(gr, gpc, tpc);
+
+	nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x608), sm);
+}
+
+static void
+ga102_grctx_generate_unkn(struct gf100_gr *gr)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	nvkm_mask(device, 0x41980c, 0x00000010, 0x00000010);
+	nvkm_mask(device, 0x41be08, 0x00000004, 0x00000004);
+}
+
+static void
+ga102_grctx_generate_r419ea8(struct gf100_gr *gr)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	nvkm_wr32(device, 0x419ea8, nvkm_rd32(device, 0x504728) | 0x08000000);
+}
+
+const struct gf100_grctx_func
+ga102_grctx = {
+	.main = gf100_grctx_generate_main,
+	.unkn = ga102_grctx_generate_unkn,
+	.bundle = gm107_grctx_generate_bundle,
+	.bundle_size = 0x3000,
+	.bundle_min_gpm_fifo_depth = 0x180,
+	.bundle_token_limit = 0x1140,
+	.pagepool = gp100_grctx_generate_pagepool,
+	.pagepool_size = 0x20000,
+	.attrib_cb_size = gp102_grctx_generate_attrib_cb_size,
+	.attrib_cb = gv100_grctx_generate_attrib_cb,
+	.attrib = gv100_grctx_generate_attrib,
+	.attrib_nr_max = 0x800,
+	.attrib_nr = 0x4a1,
+	.alpha_nr_max = 0xc00,
+	.alpha_nr = 0x800,
+	.unknown_size = 0x80000,
+	.unknown = tu102_grctx_generate_unknown,
+	.gfxp_nr = 0xd28,
+	.sm_id = ga102_grctx_generate_sm_id,
+	.skip_pd_num_tpc_per_gpc = true,
+	.rop_mapping = gv100_grctx_generate_rop_mapping,
+	.r406500 = gm200_grctx_generate_r406500,
+	.r400088 = gv100_grctx_generate_r400088,
+	.r419ea8 = ga102_grctx_generate_r419ea8,
+};
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c
index 297915719bf2..cb390e0134a2 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c
@@ -26,6 +26,7 @@
 #include <subdev/fb.h>
 #include <subdev/mc.h>
 #include <subdev/timer.h>
+#include <engine/fifo.h>
 
 /*******************************************************************************
  * PGRAPH context register lists
@@ -990,43 +991,16 @@ gf100_grctx_pack_tpc[] = {
  * PGRAPH context implementation
  ******************************************************************************/
 
-int
-gf100_grctx_mmio_data(struct gf100_grctx *info, u32 size, u32 align, bool priv)
-{
-	if (info->data) {
-		info->buffer[info->buffer_nr] = round_up(info->addr, align);
-		info->addr = info->buffer[info->buffer_nr] + size;
-		info->data->size = size;
-		info->data->align = align;
-		info->data->priv = priv;
-		info->data++;
-		return info->buffer_nr++;
-	}
-	return -1;
-}
-
 void
-gf100_grctx_mmio_item(struct gf100_grctx *info, u32 addr, u32 data,
-		      int shift, int buffer)
+gf100_grctx_patch_wr32(struct gf100_gr_chan *chan, u32 addr, u32 data)
 {
-	struct nvkm_device *device = info->gr->base.engine.subdev.device;
-	if (info->data) {
-		if (shift >= 0) {
-			info->mmio->addr = addr;
-			info->mmio->data = data;
-			info->mmio->shift = shift;
-			info->mmio->buffer = buffer;
-			if (buffer >= 0)
-				data |= info->buffer[buffer] >> shift;
-			info->mmio++;
-		} else
-			return;
-	} else {
-		if (buffer >= 0)
-			return;
+	if (unlikely(!chan->mmio)) {
+		nvkm_wr32(chan->gr->base.engine.subdev.device, addr, data);
+		return;
 	}
 
-	nvkm_wr32(device, addr, data);
+	nvkm_wo32(chan->mmio, chan->mmio_nr++ * 4, addr);
+	nvkm_wo32(chan->mmio, chan->mmio_nr++ * 4, data);
 }
 
 void
@@ -1037,56 +1011,60 @@ gf100_grctx_generate_r419cb8(struct gf100_gr *gr)
 }
 
 void
-gf100_grctx_generate_bundle(struct gf100_grctx *info)
+gf100_grctx_generate_bundle(struct gf100_gr_chan *chan, u64 addr, u32 size)
 {
-	const struct gf100_grctx_func *grctx = info->gr->func->grctx;
-	const int s = 8;
-	const int b = mmio_vram(info, grctx->bundle_size, (1 << s), true);
-	mmio_refn(info, 0x408004, 0x00000000, s, b);
-	mmio_wr32(info, 0x408008, 0x80000000 | (grctx->bundle_size >> s));
-	mmio_refn(info, 0x418808, 0x00000000, s, b);
-	mmio_wr32(info, 0x41880c, 0x80000000 | (grctx->bundle_size >> s));
+	gf100_grctx_patch_wr32(chan, 0x408004, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x408008, 0x80000000 | (size >> 8));
+	gf100_grctx_patch_wr32(chan, 0x418808, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x41880c, 0x80000000 | (size >> 8));
 }
 
 void
-gf100_grctx_generate_pagepool(struct gf100_grctx *info)
+gf100_grctx_generate_pagepool(struct gf100_gr_chan *chan, u64 addr)
 {
-	const struct gf100_grctx_func *grctx = info->gr->func->grctx;
-	const int s = 8;
-	const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), true);
-	mmio_refn(info, 0x40800c, 0x00000000, s, b);
-	mmio_wr32(info, 0x408010, 0x80000000);
-	mmio_refn(info, 0x419004, 0x00000000, s, b);
-	mmio_wr32(info, 0x419008, 0x00000000);
+	gf100_grctx_patch_wr32(chan, 0x40800c, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x408010, 0x80000000);
+	gf100_grctx_patch_wr32(chan, 0x419004, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x419008, 0x00000000);
 }
 
 void
-gf100_grctx_generate_attrib(struct gf100_grctx *info)
+gf100_grctx_generate_attrib(struct gf100_gr_chan *chan)
 {
-	struct gf100_gr *gr = info->gr;
+	struct gf100_gr *gr = chan->gr;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	const u32 attrib = grctx->attrib_nr;
-	const u32   size = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max);
-	const int s = 12;
-	const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), false);
 	int gpc, tpc;
 	u32 bo = 0;
 
-	mmio_refn(info, 0x418810, 0x80000000, s, b);
-	mmio_refn(info, 0x419848, 0x10000000, s, b);
-	mmio_wr32(info, 0x405830, (attrib << 16));
+	gf100_grctx_patch_wr32(chan, 0x405830, (attrib << 16));
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
 		for (tpc = 0; tpc < gr->tpc_nr[gpc]; tpc++) {
 			const u32 o = TPC_UNIT(gpc, tpc, 0x0520);
-			mmio_skip(info, o, (attrib << 16) | ++bo);
-			mmio_wr32(info, o, (attrib << 16) | --bo);
+
+			gf100_grctx_patch_wr32(chan, o, (attrib << 16) | bo);
 			bo += grctx->attrib_nr_max;
 		}
 	}
 }
 
 void
+gf100_grctx_generate_attrib_cb(struct gf100_gr_chan *chan, u64 addr, u32 size)
+{
+	gf100_grctx_patch_wr32(chan, 0x418810, 0x80000000 | addr >> 12);
+	gf100_grctx_patch_wr32(chan, 0x419848, 0x10000000 | addr >> 12);
+}
+
+u32
+gf100_grctx_generate_attrib_cb_size(struct gf100_gr *gr)
+{
+	const struct gf100_grctx_func *grctx = gr->func->grctx;
+
+	return 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max) * gr->tpc_total;
+}
+
+void
 gf100_grctx_generate_unkn(struct gf100_gr *gr)
 {
 }
@@ -1361,8 +1339,9 @@ gf100_grctx_generate_floorsweep(struct gf100_gr *gr)
 }
 
 void
-gf100_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
+gf100_grctx_generate_main(struct gf100_gr_chan *chan)
 {
+	struct gf100_gr *gr = chan->gr;
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	u32 idle_timeout;
@@ -1380,15 +1359,23 @@ gf100_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
 		gf100_gr_mmio(gr, gr->sw_ctx);
 	}
 
+	if (gr->func->init_419bd8)
+		gr->func->init_419bd8(gr);
+	if (grctx->r419ea8)
+		grctx->r419ea8(gr);
+
 	gf100_gr_wait_idle(gr);
 
 	idle_timeout = nvkm_mask(device, 0x404154, 0xffffffff, 0x00000000);
 
-	grctx->pagepool(info);
-	grctx->bundle(info);
-	grctx->attrib(info);
+	grctx->pagepool(chan, chan->pagepool->addr);
+	grctx->bundle(chan, chan->bundle_cb->addr, grctx->bundle_size);
+	grctx->attrib_cb(chan, chan->attrib_cb->addr, grctx->attrib_cb_size(gr));
+	grctx->attrib(chan);
 	if (grctx->patch_ltc)
-		grctx->patch_ltc(info);
+		grctx->patch_ltc(chan);
+	if (grctx->unknown_size)
+		grctx->unknown(chan, chan->unknown->addr, grctx->unknown_size);
 	grctx->unkn(gr);
 
 	gf100_grctx_generate_floorsweep(gr);
@@ -1396,12 +1383,23 @@ gf100_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
 	gf100_gr_wait_idle(gr);
 
 	if (grctx->r400088) grctx->r400088(gr, false);
+
 	if (gr->bundle)
 		gf100_gr_icmd(gr, gr->bundle);
 	else
 		gf100_gr_icmd(gr, grctx->icmd);
-	if (grctx->sw_veid_bundle_init)
+
+	if (gr->bundle_veid)
+		gf100_gr_icmd(gr, gr->bundle_veid);
+	else
 		gf100_gr_icmd(gr, grctx->sw_veid_bundle_init);
+
+	if (gr->bundle64)
+		gf100_gr_icmd(gr, gr->bundle64);
+	else
+	if (grctx->sw_bundle64_init)
+		gf100_gr_icmd(gr, grctx->sw_bundle64_init);
+
 	if (grctx->r400088) grctx->r400088(gr, true);
 
 	nvkm_wr32(device, 0x404154, idle_timeout);
@@ -1428,21 +1426,20 @@ gf100_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
 		grctx->r408840(gr);
 	if (grctx->r419c0c)
 		grctx->r419c0c(gr);
+
+	gf100_gr_wait_idle(gr);
 }
 
 #define CB_RESERVED 0x80000
 
 int
-gf100_grctx_generate(struct gf100_gr *gr)
+gf100_grctx_generate(struct gf100_gr *gr, struct gf100_gr_chan *chan, struct nvkm_gpuobj *inst)
 {
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
 	struct nvkm_device *device = subdev->device;
-	struct nvkm_memory *inst = NULL;
 	struct nvkm_memory *data = NULL;
-	struct nvkm_vmm *vmm = NULL;
 	struct nvkm_vma *ctx = NULL;
-	struct gf100_grctx info;
 	int ret, i;
 	u64 addr;
 
@@ -1457,72 +1454,47 @@ gf100_grctx_generate(struct gf100_gr *gr)
 		grctx->unkn88c(gr, true);
 
 	/* Reset FECS. */
-	nvkm_wr32(device, 0x409614, 0x00000070);
-	nvkm_usec(device, 10, NVKM_DELAY);
-	nvkm_mask(device, 0x409614, 0x00000700, 0x00000700);
-	nvkm_usec(device, 10, NVKM_DELAY);
-	nvkm_rd32(device, 0x409614);
+	gr->func->fecs.reset(gr);
 
 	if (grctx->unkn88c)
 		grctx->unkn88c(gr, false);
 
 	/* NV_PGRAPH_FE_PWR_MODE_AUTO. */
 	nvkm_wr32(device, 0x404170, 0x00000010);
+	nvkm_msec(device, 2000,
+		if (!(nvkm_rd32(device, 0x404170) & 0x00000010))
+			break;
+	);
 
 	/* Init SCC RAM. */
 	nvkm_wr32(device, 0x40802c, 0x00000001);
 
-	/* Allocate memory to for a "channel", which we'll use to generate
-	 * the default context values.
-	 */
-	ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST,
-			      0x1000, 0x1000, true, &inst);
-	if (ret)
-		goto done;
-
-	ret = nvkm_vmm_new(device, 0, 0, NULL, 0, NULL, "grctx", &vmm);
-	if (ret)
-		goto done;
-
-	vmm->debug = subdev->debug;
-
-	ret = nvkm_vmm_join(vmm, inst);
-	if (ret)
-		goto done;
-
+	/* Allocate memory to store context, and dummy global context buffers. */
 	ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST,
 			      CB_RESERVED + gr->size, 0, true, &data);
 	if (ret)
 		goto done;
 
-	ret = nvkm_vmm_get(vmm, 0, nvkm_memory_size(data), &ctx);
+	ret = nvkm_vmm_get(chan->vmm, 0, nvkm_memory_size(data), &ctx);
 	if (ret)
 		goto done;
 
-	ret = nvkm_memory_map(data, 0, vmm, ctx, NULL, 0);
+	ret = nvkm_memory_map(data, 0, chan->vmm, ctx, NULL, 0);
 	if (ret)
 		goto done;
 
-
 	/* Setup context pointer. */
 	nvkm_kmap(inst);
 	nvkm_wo32(inst, 0x0210, lower_32_bits(ctx->addr + CB_RESERVED) | 4);
 	nvkm_wo32(inst, 0x0214, upper_32_bits(ctx->addr + CB_RESERVED));
 	nvkm_done(inst);
 
-	/* Setup default state for mmio list construction. */
-	info.gr = gr;
-	info.data = gr->mmio_data;
-	info.mmio = gr->mmio_list;
-	info.addr = ctx->addr;
-	info.buffer_nr = 0;
-
 	/* Make channel current. */
-	addr = nvkm_memory_addr(inst) >> 12;
+	addr = inst->addr >> 12;
 	if (gr->firmware) {
 		ret = gf100_gr_fecs_bind_pointer(gr, 0x80000000 | addr);
 		if (ret)
-			goto done;
+			goto done_inst;
 
 		nvkm_kmap(data);
 		nvkm_wo32(data, 0x1c, 1);
@@ -1540,19 +1512,27 @@ gf100_grctx_generate(struct gf100_gr *gr)
 		);
 	}
 
-	grctx->main(gr, &info);
+	grctx->main(chan);
 
-	/* Trigger a context unload by unsetting the "next channel valid" bit
-	 * and faking a context switch interrupt.
-	 */
-	nvkm_mask(device, 0x409b04, 0x80000000, 0x00000000);
-	nvkm_wr32(device, 0x409000, 0x00000100);
-	if (nvkm_msec(device, 2000,
-		if (!(nvkm_rd32(device, 0x409b00) & 0x80000000))
-			break;
-	) < 0) {
-		ret = -EBUSY;
-		goto done;
+	if (!gr->firmware) {
+		/* Trigger a context unload by unsetting the "next channel valid" bit
+		 * and faking a context switch interrupt.
+		 */
+		nvkm_mask(device, 0x409b04, 0x80000000, 0x00000000);
+		nvkm_wr32(device, 0x409000, 0x00000100);
+		if (nvkm_msec(device, 2000,
+			if (!(nvkm_rd32(device, 0x409b00) & 0x80000000))
+				break;
+		) < 0) {
+			ret = -EBUSY;
+			goto done_inst;
+		}
+	} else {
+		ret = gf100_gr_fecs_wfi_golden_save(gr, 0x80000000 | addr);
+		if (ret)
+			goto done_inst;
+
+		nvkm_mask(device, 0x409b00, 0x80000000, 0x00000000);
 	}
 
 	gr->data = kmalloc(gr->size, GFP_KERNEL);
@@ -1566,12 +1546,14 @@ gf100_grctx_generate(struct gf100_gr *gr)
 		ret = -ENOMEM;
 	}
 
+done_inst:
+	nvkm_kmap(inst);
+	nvkm_wo32(inst, 0x0210, 0);
+	nvkm_wo32(inst, 0x0214, 0);
+	nvkm_done(inst);
 done:
-	nvkm_vmm_put(vmm, &ctx);
+	nvkm_vmm_put(chan->vmm, &ctx);
 	nvkm_memory_unref(&data);
-	nvkm_vmm_part(vmm, inst);
-	nvkm_vmm_unref(&vmm);
-	nvkm_memory_unref(&inst);
 	return ret;
 }
 
@@ -1590,6 +1572,8 @@ gf100_grctx = {
 	.bundle_size = 0x1800,
 	.pagepool = gf100_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf100_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h
index 32bbddc0993e..00dbeda7e346 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h
@@ -3,27 +3,12 @@
 #define __NVKM_GRCTX_NVC0_H__
 #include "gf100.h"
 
-struct gf100_grctx {
-	struct gf100_gr *gr;
-	struct gf100_gr_data *data;
-	struct gf100_gr_mmio *mmio;
-	int buffer_nr;
-	u64 buffer[4];
-	u64 addr;
-};
-
-int  gf100_grctx_mmio_data(struct gf100_grctx *, u32 size, u32 align, bool priv);
-void gf100_grctx_mmio_item(struct gf100_grctx *, u32 addr, u32 data, int s, int);
-
-#define mmio_vram(a,b,c,d) gf100_grctx_mmio_data((a), (b), (c), (d))
-#define mmio_refn(a,b,c,d,e) gf100_grctx_mmio_item((a), (b), (c), (d), (e))
-#define mmio_skip(a,b,c) mmio_refn((a), (b), (c), -1, -1)
-#define mmio_wr32(a,b,c) mmio_refn((a), (b), (c),  0, -1)
+void gf100_grctx_patch_wr32(struct gf100_gr_chan *, u32 addr, u32 data);
 
 struct gf100_grctx_func {
 	void (*unkn88c)(struct gf100_gr *, bool on);
 	/* main context generation function */
-	void  (*main)(struct gf100_gr *, struct gf100_grctx *);
+	void  (*main)(struct gf100_gr_chan *);
 	/* context-specific modify-on-first-load list generation function */
 	void  (*unkn)(struct gf100_gr *);
 	/* mmio context data */
@@ -37,23 +22,29 @@ struct gf100_grctx_func {
 	const struct gf100_gr_pack *icmd;
 	const struct gf100_gr_pack *mthd;
 	const struct gf100_gr_pack *sw_veid_bundle_init;
+	const struct gf100_gr_pack *sw_bundle64_init;
 	/* bundle circular buffer */
-	void (*bundle)(struct gf100_grctx *);
+	void (*bundle)(struct gf100_gr_chan *, u64 addr, u32 size);
 	u32 bundle_size;
 	u32 bundle_min_gpm_fifo_depth;
 	u32 bundle_token_limit;
 	/* pagepool */
-	void (*pagepool)(struct gf100_grctx *);
+	void (*pagepool)(struct gf100_gr_chan *, u64 addr);
 	u32 pagepool_size;
 	/* attribute(/alpha) circular buffer */
-	void (*attrib)(struct gf100_grctx *);
+	u32 (*attrib_cb_size)(struct gf100_gr *);
+	void (*attrib_cb)(struct gf100_gr_chan *, u64 addr, u32 size);
+	void (*attrib)(struct gf100_gr_chan *);
 	u32 attrib_nr_max;
 	u32 attrib_nr;
 	u32 alpha_nr_max;
 	u32 alpha_nr;
 	u32 gfxp_nr;
+	/* some other context buffer */
+	void (*unknown)(struct gf100_gr_chan *, u64 addr, u32 size);
+	u32 unknown_size;
 	/* other patch buffer stuff */
-	void (*patch_ltc)(struct gf100_grctx *);
+	void (*patch_ltc)(struct gf100_gr_chan *);
 	/* floorsweeping */
 	void (*sm_id)(struct gf100_gr *, int gpc, int tpc, int sm);
 	void (*tpc_nr)(struct gf100_gr *, int gpc);
@@ -78,14 +69,17 @@ struct gf100_grctx_func {
 	void (*r419a3c)(struct gf100_gr *);
 	void (*r408840)(struct gf100_gr *);
 	void (*r419c0c)(struct gf100_gr *);
+	void (*r419ea8)(struct gf100_gr *);
 };
 
 extern const struct gf100_grctx_func gf100_grctx;
-int  gf100_grctx_generate(struct gf100_gr *);
-void gf100_grctx_generate_main(struct gf100_gr *, struct gf100_grctx *);
-void gf100_grctx_generate_bundle(struct gf100_grctx *);
-void gf100_grctx_generate_pagepool(struct gf100_grctx *);
-void gf100_grctx_generate_attrib(struct gf100_grctx *);
+int  gf100_grctx_generate(struct gf100_gr *, struct gf100_gr_chan *, struct nvkm_gpuobj *inst);
+void gf100_grctx_generate_main(struct gf100_gr_chan *);
+void gf100_grctx_generate_pagepool(struct gf100_gr_chan *, u64);
+void gf100_grctx_generate_bundle(struct gf100_gr_chan *, u64, u32);
+u32 gf100_grctx_generate_attrib_cb_size(struct gf100_gr *);
+void gf100_grctx_generate_attrib_cb(struct gf100_gr_chan *, u64, u32);
+void gf100_grctx_generate_attrib(struct gf100_gr_chan *);
 void gf100_grctx_generate_unkn(struct gf100_gr *);
 void gf100_grctx_generate_floorsweep(struct gf100_gr *);
 void gf100_grctx_generate_sm_id(struct gf100_gr *, int, int, int);
@@ -97,14 +91,14 @@ void gf100_grctx_generate_max_ways_evict(struct gf100_gr *);
 void gf100_grctx_generate_r419cb8(struct gf100_gr *);
 
 extern const struct gf100_grctx_func gf108_grctx;
-void gf108_grctx_generate_attrib(struct gf100_grctx *);
+void gf108_grctx_generate_attrib(struct gf100_gr_chan *);
 void gf108_grctx_generate_unkn(struct gf100_gr *);
 
 extern const struct gf100_grctx_func gf104_grctx;
 extern const struct gf100_grctx_func gf110_grctx;
 
 extern const struct gf100_grctx_func gf117_grctx;
-void gf117_grctx_generate_attrib(struct gf100_grctx *);
+void gf117_grctx_generate_attrib(struct gf100_gr_chan *);
 void gf117_grctx_generate_rop_mapping(struct gf100_gr *);
 void gf117_grctx_generate_dist_skip_table(struct gf100_gr *);
 
@@ -115,9 +109,9 @@ void gk104_grctx_generate_alpha_beta_tables(struct gf100_gr *);
 void gk104_grctx_generate_gpc_tpc_nr(struct gf100_gr *);
 
 extern const struct gf100_grctx_func gk20a_grctx;
-void gk104_grctx_generate_bundle(struct gf100_grctx *);
-void gk104_grctx_generate_pagepool(struct gf100_grctx *);
-void gk104_grctx_generate_patch_ltc(struct gf100_grctx *);
+void gk104_grctx_generate_pagepool(struct gf100_gr_chan *, u64);
+void gk104_grctx_generate_bundle(struct gf100_gr_chan *, u64, u32);
+void gk104_grctx_generate_patch_ltc(struct gf100_gr_chan *);
 void gk104_grctx_generate_unkn(struct gf100_gr *);
 void gk104_grctx_generate_r418800(struct gf100_gr *);
 
@@ -128,9 +122,10 @@ extern const struct gf100_grctx_func gk110b_grctx;
 extern const struct gf100_grctx_func gk208_grctx;
 
 extern const struct gf100_grctx_func gm107_grctx;
-void gm107_grctx_generate_bundle(struct gf100_grctx *);
-void gm107_grctx_generate_pagepool(struct gf100_grctx *);
-void gm107_grctx_generate_attrib(struct gf100_grctx *);
+void gm107_grctx_generate_pagepool(struct gf100_gr_chan *, u64);
+void gm107_grctx_generate_bundle(struct gf100_gr_chan *, u64, u32);
+void gm107_grctx_generate_attrib_cb(struct gf100_gr_chan *, u64, u32);
+void gm107_grctx_generate_attrib(struct gf100_gr_chan *);
 void gm107_grctx_generate_sm_id(struct gf100_gr *, int, int, int);
 
 extern const struct gf100_grctx_func gm200_grctx;
@@ -143,11 +138,13 @@ void gm200_grctx_generate_r419a3c(struct gf100_gr *);
 extern const struct gf100_grctx_func gm20b_grctx;
 
 extern const struct gf100_grctx_func gp100_grctx;
-void gp100_grctx_generate_pagepool(struct gf100_grctx *);
+void gp100_grctx_generate_pagepool(struct gf100_gr_chan *, u64);
+void gp100_grctx_generate_attrib_cb(struct gf100_gr_chan *, u64, u32);
 void gp100_grctx_generate_smid_config(struct gf100_gr *);
 
 extern const struct gf100_grctx_func gp102_grctx;
-void gp102_grctx_generate_attrib(struct gf100_grctx *);
+u32 gp102_grctx_generate_attrib_cb_size(struct gf100_gr *);
+void gp102_grctx_generate_attrib(struct gf100_gr_chan *);
 
 extern const struct gf100_grctx_func gp104_grctx;
 
@@ -158,11 +155,15 @@ extern const struct gf100_grctx_func gv100_grctx;
 extern const struct gf100_grctx_func tu102_grctx;
 void gv100_grctx_unkn88c(struct gf100_gr *, bool);
 void gv100_grctx_generate_unkn(struct gf100_gr *);
-extern const struct gf100_gr_init gv100_grctx_init_sw_veid_bundle_init_0[];
-void gv100_grctx_generate_attrib(struct gf100_grctx *);
+void gv100_grctx_generate_attrib_cb(struct gf100_gr_chan *, u64, u32);
+void gv100_grctx_generate_attrib(struct gf100_gr_chan *);
 void gv100_grctx_generate_rop_mapping(struct gf100_gr *);
 void gv100_grctx_generate_r400088(struct gf100_gr *, bool);
 
+void tu102_grctx_generate_unknown(struct gf100_gr_chan *, u64, u32);
+
+extern const struct gf100_grctx_func ga102_grctx;
+
 /* context init value lists */
 
 extern const struct gf100_gr_pack gf100_grctx_pack_icmd[];
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf104.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf104.c
index 7a0564b6e3c7..ba63a3b46518 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf104.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf104.c
@@ -94,6 +94,8 @@ gf104_grctx = {
 	.bundle_size = 0x1800,
 	.pagepool = gf100_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf100_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c
index dda2c32e6232..0bc2eab6ad98 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c
@@ -733,25 +733,20 @@ gf108_grctx_pack_tpc[] = {
  ******************************************************************************/
 
 void
-gf108_grctx_generate_attrib(struct gf100_grctx *info)
+gf108_grctx_generate_attrib(struct gf100_gr_chan *chan)
 {
-	struct gf100_gr *gr = info->gr;
+	struct gf100_gr *gr = chan->gr;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	const u32  alpha = grctx->alpha_nr;
 	const u32   beta = grctx->attrib_nr;
-	const u32   size = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max);
-	const int s = 12;
-	const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), false);
 	const int timeslice_mode = 1;
 	const int max_batches = 0xffff;
 	u32 bo = 0;
 	u32 ao = bo + grctx->attrib_nr_max * gr->tpc_total;
 	int gpc, tpc;
 
-	mmio_refn(info, 0x418810, 0x80000000, s, b);
-	mmio_refn(info, 0x419848, 0x10000000, s, b);
-	mmio_wr32(info, 0x405830, (beta << 16) | alpha);
-	mmio_wr32(info, 0x4064c4, ((alpha / 4) << 16) | max_batches);
+	gf100_grctx_patch_wr32(chan, 0x405830, (beta << 16) | alpha);
+	gf100_grctx_patch_wr32(chan, 0x4064c4, ((alpha / 4) << 16) | max_batches);
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
 		for (tpc = 0; tpc < gr->tpc_nr[gpc]; tpc++) {
@@ -759,10 +754,10 @@ gf108_grctx_generate_attrib(struct gf100_grctx *info)
 			const u32 b =  beta;
 			const u32 t = timeslice_mode;
 			const u32 o = TPC_UNIT(gpc, tpc, 0x500);
-			mmio_skip(info, o + 0x20, (t << 28) | (b << 16) | ++bo);
-			mmio_wr32(info, o + 0x20, (t << 28) | (b << 16) | --bo);
+
+			gf100_grctx_patch_wr32(chan, o + 0x20, (t << 28) | (b << 16) | bo);
 			bo += grctx->attrib_nr_max;
-			mmio_wr32(info, o + 0x44, (a << 16) | ao);
+			gf100_grctx_patch_wr32(chan, o + 0x44, (a << 16) | ao);
 			ao += grctx->alpha_nr_max;
 		}
 	}
@@ -795,6 +790,8 @@ gf108_grctx = {
 	.bundle_size = 0x1800,
 	.pagepool = gf100_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf108_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf110.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf110.c
index f5cca5e6a4f2..64b723b0afb5 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf110.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf110.c
@@ -342,6 +342,8 @@ gf110_grctx = {
 	.bundle_size = 0x1800,
 	.pagepool = gf100_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf100_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c
index 276c282d19aa..e34c5da2a9ff 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c
@@ -241,38 +241,34 @@ gf117_grctx_generate_rop_mapping(struct gf100_gr *gr)
 }
 
 void
-gf117_grctx_generate_attrib(struct gf100_grctx *info)
+gf117_grctx_generate_attrib(struct gf100_gr_chan *chan)
 {
-	struct gf100_gr *gr = info->gr;
+	struct gf100_gr *gr = chan->gr;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	const u32  alpha = grctx->alpha_nr;
 	const u32   beta = grctx->attrib_nr;
-	const u32   size = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max);
-	const int s = 12;
-	const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), false);
 	const int timeslice_mode = 1;
 	const int max_batches = 0xffff;
 	u32 bo = 0;
 	u32 ao = bo + grctx->attrib_nr_max * gr->tpc_total;
 	int gpc, ppc;
 
-	mmio_refn(info, 0x418810, 0x80000000, s, b);
-	mmio_refn(info, 0x419848, 0x10000000, s, b);
-	mmio_wr32(info, 0x405830, (beta << 16) | alpha);
-	mmio_wr32(info, 0x4064c4, ((alpha / 4) << 16) | max_batches);
+	gf100_grctx_patch_wr32(chan, 0x405830, (beta << 16) | alpha);
+	gf100_grctx_patch_wr32(chan, 0x4064c4, ((alpha / 4) << 16) | max_batches);
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
-		for (ppc = 0; ppc < gr->ppc_nr[gpc]; ppc++) {
+		for (ppc = 0; ppc < gr->func->ppc_nr; ppc++) {
 			const u32 a = alpha * gr->ppc_tpc_nr[gpc][ppc];
 			const u32 b =  beta * gr->ppc_tpc_nr[gpc][ppc];
 			const u32 t = timeslice_mode;
 			const u32 o = PPC_UNIT(gpc, ppc, 0);
+
 			if (!(gr->ppc_mask[gpc] & (1 << ppc)))
 				continue;
-			mmio_skip(info, o + 0xc0, (t << 28) | (b << 16) | ++bo);
-			mmio_wr32(info, o + 0xc0, (t << 28) | (b << 16) | --bo);
+
+			gf100_grctx_patch_wr32(chan, o + 0xc0, (t << 28) | (b << 16) | bo);
 			bo += grctx->attrib_nr_max * gr->ppc_tpc_nr[gpc][ppc];
-			mmio_wr32(info, o + 0xe4, (a << 16) | ao);
+			gf100_grctx_patch_wr32(chan, o + 0xe4, (a << 16) | ao);
 			ao += grctx->alpha_nr_max * gr->ppc_tpc_nr[gpc][ppc];
 		}
 	}
@@ -294,6 +290,8 @@ gf117_grctx = {
 	.bundle_size = 0x1800,
 	.pagepool = gf100_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf117_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf119.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf119.c
index 0cfe46366af6..426ad1b8d426 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf119.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf119.c
@@ -510,6 +510,8 @@ gf119_grctx = {
 	.bundle_size = 0x1800,
 	.pagepool = gf100_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf108_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c
index 304e9d268bad..94233d0119df 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c
@@ -861,43 +861,33 @@ gk104_grctx_generate_r418800(struct gf100_gr *gr)
 }
 
 void
-gk104_grctx_generate_patch_ltc(struct gf100_grctx *info)
+gk104_grctx_generate_patch_ltc(struct gf100_gr_chan *chan)
 {
-	struct nvkm_device *device = info->gr->base.engine.subdev.device;
+	struct nvkm_device *device = chan->gr->base.engine.subdev.device;
 	u32 data0 = nvkm_rd32(device, 0x17e91c);
 	u32 data1 = nvkm_rd32(device, 0x17e920);
+
 	/*XXX: Figure out how to modify this correctly! */
-	mmio_wr32(info, 0x17e91c, data0);
-	mmio_wr32(info, 0x17e920, data1);
+	gf100_grctx_patch_wr32(chan, 0x17e91c, data0);
+	gf100_grctx_patch_wr32(chan, 0x17e920, data1);
 }
 
 void
-gk104_grctx_generate_bundle(struct gf100_grctx *info)
+gk104_grctx_generate_bundle(struct gf100_gr_chan *chan, u64 addr, u32 size)
 {
-	const struct gf100_grctx_func *grctx = info->gr->func->grctx;
-	const u32 state_limit = min(grctx->bundle_min_gpm_fifo_depth,
-				    grctx->bundle_size / 0x20);
+	const struct gf100_grctx_func *grctx = chan->gr->func->grctx;
+	const u32 state_limit = min(grctx->bundle_min_gpm_fifo_depth, size / 0x20);
 	const u32 token_limit = grctx->bundle_token_limit;
-	const int s = 8;
-	const int b = mmio_vram(info, grctx->bundle_size, (1 << s), true);
-	mmio_refn(info, 0x408004, 0x00000000, s, b);
-	mmio_wr32(info, 0x408008, 0x80000000 | (grctx->bundle_size >> s));
-	mmio_refn(info, 0x418808, 0x00000000, s, b);
-	mmio_wr32(info, 0x41880c, 0x80000000 | (grctx->bundle_size >> s));
-	mmio_wr32(info, 0x4064c8, (state_limit << 16) | token_limit);
+
+	gf100_grctx_generate_bundle(chan, addr, size);
+	gf100_grctx_patch_wr32(chan, 0x4064c8, (state_limit << 16) | token_limit);
 }
 
 void
-gk104_grctx_generate_pagepool(struct gf100_grctx *info)
+gk104_grctx_generate_pagepool(struct gf100_gr_chan *chan, u64 addr)
 {
-	const struct gf100_grctx_func *grctx = info->gr->func->grctx;
-	const int s = 8;
-	const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), true);
-	mmio_refn(info, 0x40800c, 0x00000000, s, b);
-	mmio_wr32(info, 0x408010, 0x80000000);
-	mmio_refn(info, 0x419004, 0x00000000, s, b);
-	mmio_wr32(info, 0x419008, 0x00000000);
-	mmio_wr32(info, 0x4064cc, 0x80000000);
+	gf100_grctx_generate_pagepool(chan, addr);
+	gf100_grctx_patch_wr32(chan, 0x4064cc, 0x80000000);
 }
 
 void
@@ -991,6 +981,8 @@ gk104_grctx = {
 	.bundle_token_limit = 0x600,
 	.pagepool = gk104_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf117_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110.c
index 86547cfc38dc..4391458e1fb2 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110.c
@@ -838,6 +838,8 @@ gk110_grctx = {
 	.bundle_token_limit = 0x7c0,
 	.pagepool = gk104_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf117_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110b.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110b.c
index ebb947bd1446..7b9a34f9ec3c 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110b.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110b.c
@@ -87,6 +87,8 @@ gk110b_grctx = {
 	.bundle_token_limit = 0x600,
 	.pagepool = gk104_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf117_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk208.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk208.c
index 4d40512b5c99..c78d07a8bb7d 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk208.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk208.c
@@ -553,6 +553,8 @@ gk208_grctx = {
 	.bundle_token_limit = 0x200,
 	.pagepool = gk104_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf117_grctx_generate_attrib,
 	.attrib_nr_max = 0x324,
 	.attrib_nr = 0x218,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk20a.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk20a.c
index c0d36bc601f9..ac5fdcb5cd3f 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk20a.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk20a.c
@@ -25,8 +25,9 @@
 #include <subdev/mc.h>
 
 static void
-gk20a_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
+gk20a_grctx_generate_main(struct gf100_gr_chan *chan)
 {
+	struct gf100_gr *gr = chan->gr;
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	u32 idle_timeout;
@@ -38,7 +39,8 @@ gk20a_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
 
 	idle_timeout = nvkm_mask(device, 0x404154, 0xffffffff, 0x00000000);
 
-	grctx->attrib(info);
+	grctx->attrib_cb(chan, chan->attrib_cb->addr, grctx->attrib_cb_size(gr));
+	grctx->attrib(chan);
 
 	grctx->unkn(gr);
 
@@ -60,8 +62,8 @@ gk20a_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
 	gf100_gr_wait_idle(gr);
 
 	gf100_gr_icmd(gr, gr->bundle);
-	grctx->pagepool(info);
-	grctx->bundle(info);
+	grctx->pagepool(chan, chan->pagepool->addr);
+	grctx->bundle(chan, chan->bundle_cb->addr, grctx->bundle_size);
 }
 
 const struct gf100_grctx_func
@@ -74,6 +76,8 @@ gk20a_grctx = {
 	.bundle_token_limit = 0x100,
 	.pagepool = gk104_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gf100_grctx_generate_attrib_cb,
 	.attrib = gf117_grctx_generate_attrib,
 	.attrib_nr_max = 0x240,
 	.attrib_nr = 0x240,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c
index 0b3964e6b36e..beac66eb2a80 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c
@@ -876,75 +876,70 @@ gm107_grctx_generate_r419e00(struct gf100_gr *gr)
 }
 
 void
-gm107_grctx_generate_bundle(struct gf100_grctx *info)
+gm107_grctx_generate_bundle(struct gf100_gr_chan *chan, u64 addr, u32 size)
 {
-	const struct gf100_grctx_func *grctx = info->gr->func->grctx;
-	const u32 state_limit = min(grctx->bundle_min_gpm_fifo_depth,
-				    grctx->bundle_size / 0x20);
+	const struct gf100_grctx_func *grctx = chan->gr->func->grctx;
+	const u32 state_limit = min(grctx->bundle_min_gpm_fifo_depth, size / 0x20);
 	const u32 token_limit = grctx->bundle_token_limit;
-	const int s = 8;
-	const int b = mmio_vram(info, grctx->bundle_size, (1 << s), true);
-	mmio_refn(info, 0x408004, 0x00000000, s, b);
-	mmio_wr32(info, 0x408008, 0x80000000 | (grctx->bundle_size >> s));
-	mmio_refn(info, 0x418e24, 0x00000000, s, b);
-	mmio_wr32(info, 0x418e28, 0x80000000 | (grctx->bundle_size >> s));
-	mmio_wr32(info, 0x4064c8, (state_limit << 16) | token_limit);
+
+	gf100_grctx_patch_wr32(chan, 0x408004, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x408008, 0x80000000 | (size >> 8));
+	gf100_grctx_patch_wr32(chan, 0x418e24, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x418e28, 0x80000000 | (size >> 8));
+	gf100_grctx_patch_wr32(chan, 0x4064c8, (state_limit << 16) | token_limit);
 }
 
 void
-gm107_grctx_generate_pagepool(struct gf100_grctx *info)
+gm107_grctx_generate_pagepool(struct gf100_gr_chan *chan, u64 addr)
 {
-	const struct gf100_grctx_func *grctx = info->gr->func->grctx;
-	const int s = 8;
-	const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), true);
-	mmio_refn(info, 0x40800c, 0x00000000, s, b);
-	mmio_wr32(info, 0x408010, 0x80000000);
-	mmio_refn(info, 0x419004, 0x00000000, s, b);
-	mmio_wr32(info, 0x419008, 0x00000000);
-	mmio_wr32(info, 0x4064cc, 0x80000000);
-	mmio_wr32(info, 0x418e30, 0x80000000); /* guess at it being related */
+	gk104_grctx_generate_pagepool(chan, addr);
+	gf100_grctx_patch_wr32(chan, 0x418e30, 0x80000000);
 }
 
 void
-gm107_grctx_generate_attrib(struct gf100_grctx *info)
+gm107_grctx_generate_attrib(struct gf100_gr_chan *chan)
 {
-	struct gf100_gr *gr = info->gr;
+	struct gf100_gr *gr = chan->gr;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	const u32  alpha = grctx->alpha_nr;
 	const u32 attrib = grctx->attrib_nr;
-	const u32   size = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max);
-	const int s = 12;
-	const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), false);
 	const int max_batches = 0xffff;
 	u32 bo = 0;
 	u32 ao = bo + grctx->attrib_nr_max * gr->tpc_total;
 	int gpc, ppc, n = 0;
 
-	mmio_refn(info, 0x418810, 0x80000000, s, b);
-	mmio_refn(info, 0x419848, 0x10000000, s, b);
-	mmio_refn(info, 0x419c2c, 0x10000000, s, b);
-	mmio_wr32(info, 0x405830, (attrib << 16) | alpha);
-	mmio_wr32(info, 0x4064c4, ((alpha / 4) << 16) | max_batches);
+	gf100_grctx_patch_wr32(chan, 0x405830, (attrib << 16) | alpha);
+	gf100_grctx_patch_wr32(chan, 0x4064c4, ((alpha / 4) << 16) | max_batches);
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
-		for (ppc = 0; ppc < gr->ppc_nr[gpc]; ppc++, n++) {
+		for (ppc = 0; ppc < gr->func->ppc_nr; ppc++, n++) {
 			const u32 as =  alpha * gr->ppc_tpc_nr[gpc][ppc];
 			const u32 bs = attrib * gr->ppc_tpc_nr[gpc][ppc];
 			const u32 u = 0x418ea0 + (n * 0x04);
 			const u32 o = PPC_UNIT(gpc, ppc, 0);
+
 			if (!(gr->ppc_mask[gpc] & (1 << ppc)))
 				continue;
-			mmio_wr32(info, o + 0xc0, bs);
-			mmio_wr32(info, o + 0xf4, bo);
+
+			gf100_grctx_patch_wr32(chan, o + 0xc0, bs);
+			gf100_grctx_patch_wr32(chan, o + 0xf4, bo);
 			bo += grctx->attrib_nr_max * gr->ppc_tpc_nr[gpc][ppc];
-			mmio_wr32(info, o + 0xe4, as);
-			mmio_wr32(info, o + 0xf8, ao);
+			gf100_grctx_patch_wr32(chan, o + 0xe4, as);
+			gf100_grctx_patch_wr32(chan, o + 0xf8, ao);
 			ao += grctx->alpha_nr_max * gr->ppc_tpc_nr[gpc][ppc];
-			mmio_wr32(info, u, ((bs / 3) << 16) | bs);
+			gf100_grctx_patch_wr32(chan, u, ((bs / 3) << 16) | bs);
 		}
 	}
 }
 
+void
+gm107_grctx_generate_attrib_cb(struct gf100_gr_chan *chan, u64 addr, u32 size)
+{
+	gf100_grctx_generate_attrib_cb(chan, addr, size);
+
+	gf100_grctx_patch_wr32(chan, 0x419c2c, 0x10000000 | addr >> 12);
+}
+
 static void
 gm107_grctx_generate_r406500(struct gf100_gr *gr)
 {
@@ -978,6 +973,8 @@ gm107_grctx = {
 	.bundle_token_limit = 0x2c0,
 	.pagepool = gm107_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gm107_grctx_generate_attrib_cb,
 	.attrib = gm107_grctx_generate_attrib,
 	.attrib_nr_max = 0xff0,
 	.attrib_nr = 0xaa0,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm200.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm200.c
index 013d05a0f0f6..175da8ac656c 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm200.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm200.c
@@ -87,7 +87,7 @@ gm200_grctx_generate_dist_skip_table(struct gf100_gr *gr)
 	int gpc, ppc, i;
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
-		for (ppc = 0; ppc < gr->ppc_nr[gpc]; ppc++) {
+		for (ppc = 0; ppc < gr->func->ppc_nr; ppc++) {
 			u8 ppc_tpcs = gr->ppc_tpc_nr[gpc][ppc];
 			u8 ppc_tpcm = gr->ppc_tpc_mask[gpc][ppc];
 			while (ppc_tpcs-- > gr->ppc_tpc_min)
@@ -111,6 +111,8 @@ gm200_grctx = {
 	.bundle_token_limit = 0x780,
 	.pagepool = gm107_grctx_generate_pagepool,
 	.pagepool_size = 0x20000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gm107_grctx_generate_attrib_cb,
 	.attrib = gm107_grctx_generate_attrib,
 	.attrib_nr_max = 0x600,
 	.attrib_nr = 0x400,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm20b.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm20b.c
index 6b92f8aa18a3..b8edccfada58 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm20b.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm20b.c
@@ -22,8 +22,9 @@
 #include "ctxgf100.h"
 
 static void
-gm20b_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
+gm20b_grctx_generate_main(struct gf100_gr_chan *chan)
 {
+	struct gf100_gr *gr = chan->gr;
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	u32 idle_timeout;
@@ -35,7 +36,8 @@ gm20b_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
 
 	idle_timeout = nvkm_mask(device, 0x404154, 0xffffffff, 0x00000000);
 
-	grctx->attrib(info);
+	grctx->attrib_cb(chan, chan->attrib_cb->addr, grctx->attrib_cb_size(gr));
+	grctx->attrib(chan);
 
 	grctx->unkn(gr);
 
@@ -63,8 +65,8 @@ gm20b_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info)
 	gf100_gr_wait_idle(gr);
 
 	gf100_gr_icmd(gr, gr->bundle);
-	grctx->pagepool(info);
-	grctx->bundle(info);
+	grctx->pagepool(chan, chan->pagepool->addr);
+	grctx->bundle(chan, chan->bundle_cb->addr, grctx->bundle_size);
 }
 
 const struct gf100_grctx_func
@@ -77,6 +79,8 @@ gm20b_grctx = {
 	.bundle_token_limit = 0x1c0,
 	.pagepool = gm107_grctx_generate_pagepool,
 	.pagepool_size = 0x8000,
+	.attrib_cb_size = gf100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gm107_grctx_generate_attrib_cb,
 	.attrib = gm107_grctx_generate_attrib,
 	.attrib_nr_max = 0x600,
 	.attrib_nr = 0x400,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp100.c
index 0b3326262e12..8485aaeae7a9 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp100.c
@@ -30,66 +30,76 @@
  ******************************************************************************/
 
 void
-gp100_grctx_generate_pagepool(struct gf100_grctx *info)
+gp100_grctx_generate_pagepool(struct gf100_gr_chan *chan, u64 addr)
 {
-	const struct gf100_grctx_func *grctx = info->gr->func->grctx;
-	const int s = 8;
-	const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), true);
-	mmio_refn(info, 0x40800c, 0x00000000, s, b);
-	mmio_wr32(info, 0x408010, 0x8007d800);
-	mmio_refn(info, 0x419004, 0x00000000, s, b);
-	mmio_wr32(info, 0x419008, 0x00000000);
+	gf100_grctx_patch_wr32(chan, 0x40800c, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x408010, 0x8007d800);
+	gf100_grctx_patch_wr32(chan, 0x419004, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x419008, 0x00000000);
 }
 
 static void
-gp100_grctx_generate_attrib(struct gf100_grctx *info)
+gp100_grctx_generate_attrib(struct gf100_gr_chan *chan)
 {
-	struct gf100_gr *gr = info->gr;
+	struct gf100_gr *gr = chan->gr;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	const u32  alpha = grctx->alpha_nr;
 	const u32 attrib = grctx->attrib_nr;
-	const int s = 12;
 	const int max_batches = 0xffff;
 	u32 size = grctx->alpha_nr_max * gr->tpc_total;
 	u32 ao = 0;
 	u32 bo = ao + size;
-	int gpc, ppc, b, n = 0;
+	int gpc, ppc, n = 0;
 
-	for (gpc = 0; gpc < gr->gpc_nr; gpc++)
-		size += grctx->attrib_nr_max * gr->ppc_nr[gpc] * gr->ppc_tpc_max;
-	size = ((size * 0x20) + 128) & ~127;
-	b = mmio_vram(info, size, (1 << s), false);
-
-	mmio_refn(info, 0x418810, 0x80000000, s, b);
-	mmio_refn(info, 0x419848, 0x10000000, s, b);
-	mmio_refn(info, 0x419c2c, 0x10000000, s, b);
-	mmio_refn(info, 0x419b00, 0x00000000, s, b);
-	mmio_wr32(info, 0x419b04, 0x80000000 | size >> 7);
-	mmio_wr32(info, 0x405830, attrib);
-	mmio_wr32(info, 0x40585c, alpha);
-	mmio_wr32(info, 0x4064c4, ((alpha / 4) << 16) | max_batches);
+	gf100_grctx_patch_wr32(chan, 0x405830, attrib);
+	gf100_grctx_patch_wr32(chan, 0x40585c, alpha);
+	gf100_grctx_patch_wr32(chan, 0x4064c4, ((alpha / 4) << 16) | max_batches);
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
-		for (ppc = 0; ppc < gr->ppc_nr[gpc]; ppc++, n++) {
+		for (ppc = 0; ppc < gr->func->ppc_nr; ppc++, n++) {
 			const u32 as =  alpha * gr->ppc_tpc_nr[gpc][ppc];
 			const u32 bs = attrib * gr->ppc_tpc_max;
 			const u32 u = 0x418ea0 + (n * 0x04);
 			const u32 o = PPC_UNIT(gpc, ppc, 0);
+
 			if (!(gr->ppc_mask[gpc] & (1 << ppc)))
 				continue;
-			mmio_wr32(info, o + 0xc0, bs);
-			mmio_wr32(info, o + 0xf4, bo);
-			mmio_wr32(info, o + 0xf0, bs);
+
+			gf100_grctx_patch_wr32(chan, o + 0xc0, bs);
+			gf100_grctx_patch_wr32(chan, o + 0xf4, bo);
+			gf100_grctx_patch_wr32(chan, o + 0xf0, bs);
 			bo += grctx->attrib_nr_max * gr->ppc_tpc_max;
-			mmio_wr32(info, o + 0xe4, as);
-			mmio_wr32(info, o + 0xf8, ao);
+			gf100_grctx_patch_wr32(chan, o + 0xe4, as);
+			gf100_grctx_patch_wr32(chan, o + 0xf8, ao);
 			ao += grctx->alpha_nr_max * gr->ppc_tpc_nr[gpc][ppc];
-			mmio_wr32(info, u, bs);
+			gf100_grctx_patch_wr32(chan, u, bs);
 		}
 	}
 
-	mmio_wr32(info, 0x418eec, 0x00000000);
-	mmio_wr32(info, 0x41befc, 0x00000000);
+	gf100_grctx_patch_wr32(chan, 0x418eec, 0x00000000);
+	gf100_grctx_patch_wr32(chan, 0x41befc, 0x00000000);
+}
+
+void
+gp100_grctx_generate_attrib_cb(struct gf100_gr_chan *chan, u64 addr, u32 size)
+{
+	gm107_grctx_generate_attrib_cb(chan, addr, size);
+
+	gf100_grctx_patch_wr32(chan, 0x419b00, 0x00000000 | addr >> 12);
+	gf100_grctx_patch_wr32(chan, 0x419b04, 0x80000000 | size >> 7);
+}
+
+static u32
+gp100_grctx_generate_attrib_cb_size(struct gf100_gr *gr)
+{
+	const struct gf100_grctx_func *grctx = gr->func->grctx;
+	u32 size = grctx->alpha_nr_max * gr->tpc_total;
+	int gpc;
+
+	for (gpc = 0; gpc < gr->gpc_nr; gpc++)
+		size += grctx->attrib_nr_max * gr->func->ppc_nr * gr->ppc_tpc_max;
+
+	return ((size * 0x20) + 128) & ~127;
 }
 
 void
@@ -123,6 +133,8 @@ gp100_grctx = {
 	.bundle_token_limit = 0x1080,
 	.pagepool = gp100_grctx_generate_pagepool,
 	.pagepool_size = 0x20000,
+	.attrib_cb_size = gp100_grctx_generate_attrib_cb_size,
+	.attrib_cb = gp100_grctx_generate_attrib_cb,
 	.attrib = gp100_grctx_generate_attrib,
 	.attrib_nr_max = 0x660,
 	.attrib_nr = 0x440,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp102.c
index daee17bf7d0d..7537979a5492 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp102.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp102.c
@@ -37,58 +37,62 @@ gp102_grctx_generate_r408840(struct gf100_gr *gr)
 }
 
 void
-gp102_grctx_generate_attrib(struct gf100_grctx *info)
+gp102_grctx_generate_attrib(struct gf100_gr_chan *chan)
 {
-	struct gf100_gr *gr = info->gr;
+	struct gf100_gr *gr = chan->gr;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	const u32  alpha = grctx->alpha_nr;
 	const u32 attrib = grctx->attrib_nr;
 	const u32   gfxp = grctx->gfxp_nr;
-	const int s = 12;
 	const int max_batches = 0xffff;
 	u32 size = grctx->alpha_nr_max * gr->tpc_total;
 	u32 ao = 0;
 	u32 bo = ao + size;
-	int gpc, ppc, b, n = 0;
+	int gpc, ppc, n = 0;
 
-	for (gpc = 0; gpc < gr->gpc_nr; gpc++)
-		size += grctx->gfxp_nr * gr->ppc_nr[gpc] * gr->ppc_tpc_max;
-	size = ((size * 0x20) + 128) & ~127;
-	b = mmio_vram(info, size, (1 << s), false);
-
-	mmio_refn(info, 0x418810, 0x80000000, s, b);
-	mmio_refn(info, 0x419848, 0x10000000, s, b);
-	mmio_refn(info, 0x419c2c, 0x10000000, s, b);
-	mmio_refn(info, 0x419b00, 0x00000000, s, b);
-	mmio_wr32(info, 0x419b04, 0x80000000 | size >> 7);
-	mmio_wr32(info, 0x405830, attrib);
-	mmio_wr32(info, 0x40585c, alpha);
-	mmio_wr32(info, 0x4064c4, ((alpha / 4) << 16) | max_batches);
+	gf100_grctx_patch_wr32(chan, 0x405830, attrib);
+	gf100_grctx_patch_wr32(chan, 0x40585c, alpha);
+	gf100_grctx_patch_wr32(chan, 0x4064c4, ((alpha / 4) << 16) | max_batches);
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
-		for (ppc = 0; ppc < gr->ppc_nr[gpc]; ppc++, n++) {
+		for (ppc = 0; ppc < gr->func->ppc_nr; ppc++, n++) {
 			const u32 as =  alpha * gr->ppc_tpc_nr[gpc][ppc];
 			const u32 bs = attrib * gr->ppc_tpc_max;
 			const u32 gs =   gfxp * gr->ppc_tpc_max;
 			const u32 u = 0x418ea0 + (n * 0x04);
 			const u32 o = PPC_UNIT(gpc, ppc, 0);
 			const u32 p = GPC_UNIT(gpc, 0xc44 + (ppc * 4));
+
 			if (!(gr->ppc_mask[gpc] & (1 << ppc)))
 				continue;
-			mmio_wr32(info, o + 0xc0, gs);
-			mmio_wr32(info, p, bs);
-			mmio_wr32(info, o + 0xf4, bo);
-			mmio_wr32(info, o + 0xf0, bs);
+
+			gf100_grctx_patch_wr32(chan, o + 0xc0, gs);
+			gf100_grctx_patch_wr32(chan, p, bs);
+			gf100_grctx_patch_wr32(chan, o + 0xf4, bo);
+			gf100_grctx_patch_wr32(chan, o + 0xf0, bs);
 			bo += gs;
-			mmio_wr32(info, o + 0xe4, as);
-			mmio_wr32(info, o + 0xf8, ao);
+			gf100_grctx_patch_wr32(chan, o + 0xe4, as);
+			gf100_grctx_patch_wr32(chan, o + 0xf8, ao);
 			ao += grctx->alpha_nr_max * gr->ppc_tpc_nr[gpc][ppc];
-			mmio_wr32(info, u, bs);
+			gf100_grctx_patch_wr32(chan, u, bs);
 		}
 	}
 
-	mmio_wr32(info, 0x4181e4, 0x00000100);
-	mmio_wr32(info, 0x41befc, 0x00000100);
+	gf100_grctx_patch_wr32(chan, 0x4181e4, 0x00000100);
+	gf100_grctx_patch_wr32(chan, 0x41befc, 0x00000100);
+}
+
+u32
+gp102_grctx_generate_attrib_cb_size(struct gf100_gr *gr)
+{
+	const struct gf100_grctx_func *grctx = gr->func->grctx;
+	u32 size = grctx->alpha_nr_max * gr->tpc_total;
+	int gpc;
+
+	for (gpc = 0; gpc < gr->gpc_nr; gpc++)
+		size += grctx->gfxp_nr * gr->func->ppc_nr * gr->ppc_tpc_max;
+
+	return ((size * 0x20) + 127) & ~127;
 }
 
 const struct gf100_grctx_func
@@ -101,6 +105,8 @@ gp102_grctx = {
 	.bundle_token_limit = 0x900,
 	.pagepool = gp100_grctx_generate_pagepool,
 	.pagepool_size = 0x20000,
+	.attrib_cb_size = gp102_grctx_generate_attrib_cb_size,
+	.attrib_cb = gp100_grctx_generate_attrib_cb,
 	.attrib = gp102_grctx_generate_attrib,
 	.attrib_nr_max = 0x4b0,
 	.attrib_nr = 0x320,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp104.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp104.c
index 3b85e3d326b2..90b5f793e567 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp104.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp104.c
@@ -31,6 +31,8 @@ gp104_grctx = {
 	.bundle_token_limit = 0x900,
 	.pagepool = gp100_grctx_generate_pagepool,
 	.pagepool_size = 0x20000,
+	.attrib_cb_size = gp102_grctx_generate_attrib_cb_size,
+	.attrib_cb = gp100_grctx_generate_attrib_cb,
 	.attrib = gp102_grctx_generate_attrib,
 	.attrib_nr_max = 0x4b0,
 	.attrib_nr = 0x320,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp107.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp107.c
index 5060c5ee5ce0..d191761a0471 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp107.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp107.c
@@ -39,6 +39,8 @@ gp107_grctx = {
 	.bundle_token_limit = 0x300,
 	.pagepool = gp100_grctx_generate_pagepool,
 	.pagepool_size = 0x20000,
+	.attrib_cb_size = gp102_grctx_generate_attrib_cb_size,
+	.attrib_cb = gp100_grctx_generate_attrib_cb,
 	.attrib = gp102_grctx_generate_attrib,
 	.attrib_nr_max = 0x15de,
 	.attrib_nr = 0x540,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgv100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgv100.c
index 39553d55d3f3..957ea9d6bad4 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgv100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgv100.c
@@ -25,7 +25,7 @@
  * PGRAPH context implementation
  ******************************************************************************/
 
-const struct gf100_gr_init
+static const struct gf100_gr_init
 gv100_grctx_init_sw_veid_bundle_init_0[] = {
 	{ 0x00001000, 64, 0x00100000, 0x00000008 },
 	{ 0x00000941, 64, 0x00100000, 0x00000000 },
@@ -59,67 +59,70 @@ gv100_grctx_pack_sw_veid_bundle_init[] = {
 };
 
 void
-gv100_grctx_generate_attrib(struct gf100_grctx *info)
+gv100_grctx_generate_attrib(struct gf100_gr_chan *chan)
 {
-	struct gf100_gr *gr = info->gr;
+	struct gf100_gr *gr = chan->gr;
 	const struct gf100_grctx_func *grctx = gr->func->grctx;
 	const u32  alpha = grctx->alpha_nr;
 	const u32 attrib = grctx->attrib_nr;
 	const u32   gfxp = grctx->gfxp_nr;
-	const int s = 12;
+	const int max_batches = 0xffff;
 	u32 size = grctx->alpha_nr_max * gr->tpc_total;
 	u32 ao = 0;
 	u32 bo = ao + size;
-	int gpc, ppc, b, n = 0;
+	int gpc, ppc, n = 0;
 
-	for (gpc = 0; gpc < gr->gpc_nr; gpc++)
-		size += grctx->gfxp_nr * gr->ppc_nr[gpc] * gr->ppc_tpc_max;
-	size = ((size * 0x20) + 127) & ~127;
-	b = mmio_vram(info, size, (1 << s), false);
-
-	mmio_refn(info, 0x418810, 0x80000000, s, b);
-	mmio_refn(info, 0x419848, 0x10000000, s, b);
-	mmio_refn(info, 0x419c2c, 0x10000000, s, b);
-	mmio_refn(info, 0x419e00, 0x00000000, s, b);
-	mmio_wr32(info, 0x419e04, 0x80000000 | size >> 7);
-	mmio_wr32(info, 0x405830, attrib);
-	mmio_wr32(info, 0x40585c, alpha);
+	gf100_grctx_patch_wr32(chan, 0x405830, attrib);
+	gf100_grctx_patch_wr32(chan, 0x40585c, alpha);
+	gf100_grctx_patch_wr32(chan, 0x4064c4, ((alpha / 4) << 16) | max_batches);
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
-		for (ppc = 0; ppc < gr->ppc_nr[gpc]; ppc++, n++) {
+		for (ppc = 0; ppc < gr->func->ppc_nr; ppc++, n++) {
 			const u32 as =  alpha * gr->ppc_tpc_nr[gpc][ppc];
 			const u32 bs = attrib * gr->ppc_tpc_max;
 			const u32 gs =   gfxp * gr->ppc_tpc_max;
 			const u32 u = 0x418ea0 + (n * 0x04);
 			const u32 o = PPC_UNIT(gpc, ppc, 0);
+
 			if (!(gr->ppc_mask[gpc] & (1 << ppc)))
 				continue;
-			mmio_wr32(info, o + 0xc0, gs);
-			mmio_wr32(info, o + 0xf4, bo);
-			mmio_wr32(info, o + 0xf0, bs);
+
+			gf100_grctx_patch_wr32(chan, o + 0xc0, gs);
+			gf100_grctx_patch_wr32(chan, o + 0xf4, bo);
+			gf100_grctx_patch_wr32(chan, o + 0xf0, bs);
 			bo += gs;
-			mmio_wr32(info, o + 0xe4, as);
-			mmio_wr32(info, o + 0xf8, ao);
+			gf100_grctx_patch_wr32(chan, o + 0xe4, as);
+			gf100_grctx_patch_wr32(chan, o + 0xf8, ao);
 			ao += grctx->alpha_nr_max * gr->ppc_tpc_nr[gpc][ppc];
-			mmio_wr32(info, u, bs);
+			gf100_grctx_patch_wr32(chan, u, bs);
 		}
 	}
 
-	mmio_wr32(info, 0x4181e4, 0x00000100);
-	mmio_wr32(info, 0x41befc, 0x00000100);
+	gf100_grctx_patch_wr32(chan, 0x4181e4, 0x00000100);
+	gf100_grctx_patch_wr32(chan, 0x41befc, 0x00000100);
+}
+
+void
+gv100_grctx_generate_attrib_cb(struct gf100_gr_chan *chan, u64 addr, u32 size)
+{
+	gm107_grctx_generate_attrib_cb(chan, addr, size);
+
+	gf100_grctx_patch_wr32(chan, 0x419e00, 0x00000000 | addr >> 12);
+	gf100_grctx_patch_wr32(chan, 0x419e04, 0x80000000 | size >> 7);
 }
 
 void
 gv100_grctx_generate_rop_mapping(struct gf100_gr *gr)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
+	const u32 mapregs = DIV_ROUND_UP(gr->func->gpc_nr * gr->func->tpc_nr, 6);
 	u32 data;
 	int i, j;
 
 	/* Pack tile map into register format. */
 	nvkm_wr32(device, 0x418bb8, (gr->tpc_total << 8) |
 				     gr->screen_tile_row_offset);
-	for (i = 0; i < 11; i++) {
+	for (i = 0; i < mapregs; i++) {
 		for (data = 0, j = 0; j < 6; j++)
 			data |= (gr->tile[i * 6 + j] & 0x1f) << (j * 5);
 		nvkm_wr32(device, 0x418b08 + (i * 4), data);
@@ -157,6 +160,9 @@ static void
 gv100_grctx_generate_sm_id(struct gf100_gr *gr, int gpc, int tpc, int sm)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	tpc = gv100_gr_nonpes_aware_tpc(gr, gpc, tpc);
+
 	nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x608), sm);
 	nvkm_wr32(device, GPC_UNIT(gpc, 0x0c10 + tpc * 4), sm);
 	nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x088), sm);
@@ -198,6 +204,8 @@ gv100_grctx = {
 	.bundle_token_limit = 0x1680,
 	.pagepool = gp100_grctx_generate_pagepool,
 	.pagepool_size = 0x20000,
+	.attrib_cb_size = gp102_grctx_generate_attrib_cb_size,
+	.attrib_cb = gv100_grctx_generate_attrib_cb,
 	.attrib = gv100_grctx_generate_attrib,
 	.attrib_nr_max = 0x6c0,
 	.attrib_nr = 0x480,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxtu102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxtu102.c
index 2299ca07d04a..542ab0c78be6 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxtu102.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxtu102.c
@@ -34,6 +34,9 @@ static void
 tu102_grctx_generate_sm_id(struct gf100_gr *gr, int gpc, int tpc, int sm)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	tpc = gv100_gr_nonpes_aware_tpc(gr, gpc, tpc);
+
 	nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x608), sm);
 	nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x088), sm);
 }
@@ -47,42 +50,38 @@ tu102_grctx_init_unknown_bundle_init_0[] = {
 };
 
 static const struct gf100_gr_pack
-tu102_grctx_pack_sw_veid_bundle_init[] = {
-	{ gv100_grctx_init_sw_veid_bundle_init_0 },
-	{ tu102_grctx_init_unknown_bundle_init_0 },
+tu102_grctx_pack_sw_bundle64_init[] = {
+	{ tu102_grctx_init_unknown_bundle_init_0, .type = 64 },
 	{}
 };
 
-static void
-tu102_grctx_generate_attrib(struct gf100_grctx *info)
+void
+tu102_grctx_generate_unknown(struct gf100_gr_chan *chan, u64 addr, u32 size)
 {
-	const u64 size = 0x80000; /*XXX: educated guess */
-	const int s = 8;
-	const int b = mmio_vram(info, size, (1 << s), true);
-
-	gv100_grctx_generate_attrib(info);
-
-	mmio_refn(info, 0x408070, 0x00000000, s, b);
-	mmio_wr32(info, 0x408074, size >> s); /*XXX: guess */
-	mmio_refn(info, 0x419034, 0x00000000, s, b);
-	mmio_wr32(info, 0x408078, 0x00000000);
+	gf100_grctx_patch_wr32(chan, 0x408070, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x408074, size >> 8); /*XXX: guess */
+	gf100_grctx_patch_wr32(chan, 0x419034, addr >> 8);
+	gf100_grctx_patch_wr32(chan, 0x408078, 0x00000000);
 }
 
 const struct gf100_grctx_func
 tu102_grctx = {
-	.unkn88c = gv100_grctx_unkn88c,
 	.main = gf100_grctx_generate_main,
 	.unkn = gv100_grctx_generate_unkn,
-	.sw_veid_bundle_init = tu102_grctx_pack_sw_veid_bundle_init,
+	.sw_bundle64_init = tu102_grctx_pack_sw_bundle64_init,
 	.bundle = gm107_grctx_generate_bundle,
 	.bundle_size = 0x3000,
 	.bundle_min_gpm_fifo_depth = 0x180,
 	.bundle_token_limit = 0xa80,
 	.pagepool = gp100_grctx_generate_pagepool,
 	.pagepool_size = 0x20000,
-	.attrib = tu102_grctx_generate_attrib,
+	.attrib_cb_size = gp102_grctx_generate_attrib_cb_size,
+	.attrib_cb = gv100_grctx_generate_attrib_cb,
+	.attrib = gv100_grctx_generate_attrib,
 	.attrib_nr_max = 0x800,
 	.attrib_nr = 0x700,
+	.unknown_size = 0x80000,
+	.unknown = tu102_grctx_generate_unknown,
 	.alpha_nr_max = 0xc00,
 	.alpha_nr = 0x800,
 	.gfxp_nr = 0xfa8,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ga102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ga102.c
new file mode 100644
index 000000000000..a5b5ac2755a2
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ga102.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright 2019 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "gf100.h"
+#include "ctxgf100.h"
+
+#include <core/firmware.h>
+#include <subdev/acr.h>
+#include <subdev/timer.h>
+#include <subdev/vfn.h>
+
+#include <nvfw/flcn.h>
+
+#include <nvif/class.h>
+
+static void
+ga102_gr_zbc_clear_color(struct gf100_gr *gr, int zbc)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+	u32 invalid[] = { 0, 0, 0, 0 }, *color;
+
+	if (gr->zbc_color[zbc].format)
+		color = gr->zbc_color[zbc].l2;
+	else
+		color = invalid;
+
+	nvkm_mask(device, 0x41bcb4, 0x0000001f, zbc);
+	nvkm_wr32(device, 0x41bcec, color[0]);
+	nvkm_wr32(device, 0x41bcf0, color[1]);
+	nvkm_wr32(device, 0x41bcf4, color[2]);
+	nvkm_wr32(device, 0x41bcf8, color[3]);
+}
+
+static const struct gf100_gr_func_zbc
+ga102_gr_zbc = {
+	.clear_color = ga102_gr_zbc_clear_color,
+	.clear_depth = gp100_gr_zbc_clear_depth,
+	.stencil_get = gp102_gr_zbc_stencil_get,
+	.clear_stencil = gp102_gr_zbc_clear_stencil,
+};
+
+static void
+ga102_gr_gpccs_reset(struct gf100_gr *gr)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	nvkm_wr32(device, 0x41a610, 0x00000000);
+	nvkm_msec(device, 1, NVKM_DELAY);
+	nvkm_wr32(device, 0x41a610, 0x00000001);
+}
+
+static const struct nvkm_acr_lsf_func
+ga102_gr_gpccs_acr = {
+	.flags = NVKM_ACR_LSF_FORCE_PRIV_LOAD,
+	.bl_entry = 0x3400,
+	.bld_size = sizeof(struct flcn_bl_dmem_desc_v2),
+	.bld_write = gp108_gr_acr_bld_write,
+	.bld_patch = gp108_gr_acr_bld_patch,
+};
+
+static void
+ga102_gr_fecs_reset(struct gf100_gr *gr)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	nvkm_wr32(device, 0x409614, 0x00000010);
+	nvkm_wr32(device, 0x41a614, 0x00000020);
+	nvkm_usec(device, 10, NVKM_DELAY);
+	nvkm_wr32(device, 0x409614, 0x00000110);
+	nvkm_wr32(device, 0x41a614, 0x00000a20);
+	nvkm_usec(device, 10, NVKM_DELAY);
+	nvkm_rd32(device, 0x409614);
+	nvkm_rd32(device, 0x41a614);
+}
+
+static const struct nvkm_acr_lsf_func
+ga102_gr_fecs_acr = {
+	.bl_entry = 0x7e00,
+	.bld_size = sizeof(struct flcn_bl_dmem_desc_v2),
+	.bld_write = gp108_gr_acr_bld_write,
+	.bld_patch = gp108_gr_acr_bld_patch,
+};
+
+static void
+ga102_gr_init_rop_exceptions(struct gf100_gr *gr)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	nvkm_wr32(device, 0x41bcbc, 0x40000000);
+	nvkm_wr32(device, 0x41bc38, 0x40000000);
+	nvkm_wr32(device, 0x41ac94, nvkm_rd32(device, 0x502c94));
+}
+
+static void
+ga102_gr_init_40a790(struct gf100_gr *gr)
+{
+	nvkm_wr32(gr->base.engine.subdev.device, 0x40a790, 0xc0000000);
+}
+
+static void
+ga102_gr_init_gpc_mmu(struct gf100_gr *gr)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	nvkm_wr32(device, 0x418880, nvkm_rd32(device, 0x100c80) & 0xf8001fff);
+	nvkm_wr32(device, 0x418894, 0x00000000);
+
+	nvkm_wr32(device, 0x4188b4, nvkm_rd32(device, 0x100cc8));
+	nvkm_wr32(device, 0x4188b8, nvkm_rd32(device, 0x100ccc));
+	nvkm_wr32(device, 0x4188b0, nvkm_rd32(device, 0x100cc4));
+}
+
+static struct nvkm_intr *
+ga102_gr_oneinit_intr(struct gf100_gr *gr, enum nvkm_intr_type *pvector)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	*pvector = nvkm_rd32(device, 0x400154) & 0x00000fff;
+	return &device->vfn->intr;
+}
+
+static const struct gf100_gr_func
+ga102_gr = {
+	.oneinit_intr = ga102_gr_oneinit_intr,
+	.oneinit_tiles = gm200_gr_oneinit_tiles,
+	.oneinit_sm_id = gv100_gr_oneinit_sm_id,
+	.init = gf100_gr_init,
+	.init_419bd8 = gv100_gr_init_419bd8,
+	.init_gpc_mmu = ga102_gr_init_gpc_mmu,
+	.init_vsc_stream_master = gk104_gr_init_vsc_stream_master,
+	.init_zcull = tu102_gr_init_zcull,
+	.init_num_active_ltcs = gf100_gr_init_num_active_ltcs,
+	.init_swdx_pes_mask = gp102_gr_init_swdx_pes_mask,
+	.init_fs = tu102_gr_init_fs,
+	.init_fecs_exceptions = tu102_gr_init_fecs_exceptions,
+	.init_40a790 = ga102_gr_init_40a790,
+	.init_ds_hww_esr_2 = gm200_gr_init_ds_hww_esr_2,
+	.init_sked_hww_esr = gk104_gr_init_sked_hww_esr,
+	.init_ppc_exceptions = gk104_gr_init_ppc_exceptions,
+	.init_504430 = gv100_gr_init_504430,
+	.init_shader_exceptions = gv100_gr_init_shader_exceptions,
+	.init_rop_exceptions = ga102_gr_init_rop_exceptions,
+	.init_4188a4 = gv100_gr_init_4188a4,
+	.trap_mp = gv100_gr_trap_mp,
+	.fecs.reset = ga102_gr_fecs_reset,
+	.gpccs.reset = ga102_gr_gpccs_reset,
+	.rops = gm200_gr_rops,
+	.gpc_nr = 7,
+	.tpc_nr = 6,
+	.ppc_nr = 3,
+	.grctx = &ga102_grctx,
+	.zbc = &ga102_gr_zbc,
+	.sclass = {
+		{ -1, -1, FERMI_TWOD_A },
+		{ -1, -1, KEPLER_INLINE_TO_MEMORY_B },
+		{ -1, -1, AMPERE_B, &gf100_fermi },
+		{ -1, -1, AMPERE_COMPUTE_B },
+		{}
+	}
+};
+
+MODULE_FIRMWARE("nvidia/ga102/gr/fecs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga102/gr/fecs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga102/gr/gpccs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga102/gr/gpccs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga102/gr/NET_img.bin");
+
+MODULE_FIRMWARE("nvidia/ga103/gr/fecs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga103/gr/fecs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga103/gr/gpccs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga103/gr/gpccs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga103/gr/NET_img.bin");
+
+MODULE_FIRMWARE("nvidia/ga104/gr/fecs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga104/gr/fecs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga104/gr/gpccs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga104/gr/gpccs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga104/gr/NET_img.bin");
+
+MODULE_FIRMWARE("nvidia/ga106/gr/fecs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga106/gr/fecs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga106/gr/gpccs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga106/gr/gpccs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga106/gr/NET_img.bin");
+
+MODULE_FIRMWARE("nvidia/ga107/gr/fecs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga107/gr/fecs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga107/gr/gpccs_bl.bin");
+MODULE_FIRMWARE("nvidia/ga107/gr/gpccs_sig.bin");
+MODULE_FIRMWARE("nvidia/ga107/gr/NET_img.bin");
+
+struct netlist_region {
+	u32 region_id;
+	u32 data_size;
+	u32 data_offset;
+};
+
+struct netlist_image_header {
+	u32 version;
+	u32 regions;
+};
+
+struct netlist_image {
+	struct netlist_image_header header;
+	struct netlist_region regions[];
+};
+
+struct netlist_av64 {
+	u32 addr;
+	u32 data_hi;
+	u32 data_lo;
+};
+
+static int
+ga102_gr_av64_to_init(struct nvkm_blob *blob, struct gf100_gr_pack **ppack)
+{
+	struct gf100_gr_init *init;
+	struct gf100_gr_pack *pack;
+	int nent;
+	int i;
+
+	nent = (blob->size / sizeof(struct netlist_av64));
+	pack = vzalloc((sizeof(*pack) * 2) + (sizeof(*init) * (nent + 1)));
+	if (!pack)
+		return -ENOMEM;
+
+	init = (void *)(pack + 2);
+	pack[0].init = init;
+	pack[0].type = 64;
+
+	for (i = 0; i < nent; i++) {
+		struct gf100_gr_init *ent = &init[i];
+		struct netlist_av64 *av = &((struct netlist_av64 *)blob->data)[i];
+
+		ent->addr = av->addr;
+		ent->data = ((u64)av->data_hi << 32) | av->data_lo;
+		ent->count = 1;
+		ent->pitch = 1;
+	}
+
+	*ppack = pack;
+	return 0;
+}
+
+static int
+ga102_gr_load(struct gf100_gr *gr, int ver, const struct gf100_gr_fwif *fwif)
+{
+	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
+	const struct firmware *fw;
+	const struct netlist_image *net;
+	const struct netlist_region *fecs_inst = NULL;
+	const struct netlist_region *fecs_data = NULL;
+	const struct netlist_region *gpccs_inst = NULL;
+	const struct netlist_region *gpccs_data = NULL;
+	int ret, i;
+
+	ret = nvkm_firmware_get(subdev, "gr/NET_img", 0, &fw);
+	if (ret)
+		return ret;
+
+	net = (const void *)fw->data;
+	nvkm_debug(subdev, "netlist version %d, %d regions\n",
+		   net->header.version, net->header.regions);
+
+	for (i = 0; i < net->header.regions; i++) {
+		const struct netlist_region *reg = &net->regions[i];
+		struct nvkm_blob blob = {
+			.data = (void *)fw->data + reg->data_offset,
+			.size = reg->data_size,
+		};
+
+		nvkm_debug(subdev, "\t%2d: %08x %08x\n",
+			   reg->region_id, reg->data_offset, reg->data_size);
+
+		switch (reg->region_id) {
+		case  0: fecs_data = reg; break;
+		case  1: fecs_inst = reg; break;
+		case  2: gpccs_data = reg; break;
+		case  3: gpccs_inst = reg; break;
+		case  4: gk20a_gr_av_to_init(&blob, &gr->bundle); break;
+		case  5: gk20a_gr_aiv_to_init(&blob, &gr->sw_ctx); break;
+		case  7: gk20a_gr_av_to_method(&blob, &gr->method); break;
+		case 28: tu102_gr_av_to_init_veid(&blob, &gr->bundle_veid); break;
+		case 34: ga102_gr_av64_to_init(&blob, &gr->bundle64); break;
+		case 48: gk20a_gr_av_to_init(&blob, &gr->sw_nonctx1); break;
+		case 49: gk20a_gr_av_to_init(&blob, &gr->sw_nonctx2); break;
+		case 50: gk20a_gr_av_to_init(&blob, &gr->sw_nonctx3); break;
+		case 51: gk20a_gr_av_to_init(&blob, &gr->sw_nonctx4); break;
+		default:
+			break;
+		}
+	}
+
+	ret = nvkm_acr_lsfw_load_bl_sig_net(subdev, &gr->fecs.falcon, NVKM_ACR_LSF_FECS,
+					    "gr/fecs_", ver, fwif->fecs,
+					    fw->data + fecs_inst->data_offset,
+						       fecs_inst->data_size,
+					    fw->data + fecs_data->data_offset,
+						       fecs_data->data_size);
+	if (ret)
+		return ret;
+
+	ret = nvkm_acr_lsfw_load_bl_sig_net(subdev, &gr->gpccs.falcon, NVKM_ACR_LSF_GPCCS,
+					    "gr/gpccs_", ver, fwif->gpccs,
+					    fw->data + gpccs_inst->data_offset,
+						       gpccs_inst->data_size,
+					    fw->data + gpccs_data->data_offset,
+						       gpccs_data->data_size);
+	if (ret)
+		return ret;
+
+	gr->firmware = true;
+
+	nvkm_firmware_put(fw);
+	return 0;
+}
+
+static const struct gf100_gr_fwif
+ga102_gr_fwif[] = {
+	{  0, ga102_gr_load, &ga102_gr, &ga102_gr_fecs_acr, &ga102_gr_gpccs_acr },
+	{ -1, gm200_gr_nofw },
+	{}
+};
+
+int
+ga102_gr_new(struct nvkm_device *device, enum nvkm_subdev_type type, int inst, struct nvkm_gr **pgr)
+{
+	return gf100_gr_new_(ga102_gr_fwif, device, type, inst, pgr);
+}
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
index f16eabf4f642..5f20079c3660 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
@@ -67,7 +67,7 @@ gf100_gr_zbc_color_get(struct gf100_gr *gr, int format,
 	struct nvkm_ltc *ltc = gr->base.engine.subdev.device->ltc;
 	int zbc = -ENOSPC, i;
 
-	for (i = ltc->zbc_min; i <= ltc->zbc_max; i++) {
+	for (i = ltc->zbc_color_min; i <= ltc->zbc_color_max; i++) {
 		if (gr->zbc_color[i].format) {
 			if (gr->zbc_color[i].format != format)
 				continue;
@@ -114,7 +114,7 @@ gf100_gr_zbc_depth_get(struct gf100_gr *gr, int format,
 	struct nvkm_ltc *ltc = gr->base.engine.subdev.device->ltc;
 	int zbc = -ENOSPC, i;
 
-	for (i = ltc->zbc_min; i <= ltc->zbc_max; i++) {
+	for (i = ltc->zbc_depth_min; i <= ltc->zbc_depth_max; i++) {
 		if (gr->zbc_depth[i].format) {
 			if (gr->zbc_depth[i].format != format)
 				continue;
@@ -355,15 +355,14 @@ static void *
 gf100_gr_chan_dtor(struct nvkm_object *object)
 {
 	struct gf100_gr_chan *chan = gf100_gr_chan(object);
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(chan->data); i++) {
-		nvkm_vmm_put(chan->vmm, &chan->data[i].vma);
-		nvkm_memory_unref(&chan->data[i].mem);
-	}
 
 	nvkm_vmm_put(chan->vmm, &chan->mmio_vma);
 	nvkm_memory_unref(&chan->mmio);
+
+	nvkm_vmm_put(chan->vmm, &chan->attrib_cb);
+	nvkm_vmm_put(chan->vmm, &chan->unknown);
+	nvkm_vmm_put(chan->vmm, &chan->bundle_cb);
+	nvkm_vmm_put(chan->vmm, &chan->pagepool);
 	nvkm_vmm_unref(&chan->vmm);
 	return chan;
 }
@@ -380,12 +379,10 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		  struct nvkm_object **pobject)
 {
 	struct gf100_gr *gr = gf100_gr(base);
-	struct gf100_gr_data *data = gr->mmio_data;
-	struct gf100_gr_mmio *mmio = gr->mmio_list;
 	struct gf100_gr_chan *chan;
 	struct gf100_vmm_map_v0 args = { .priv = 1 };
 	struct nvkm_device *device = gr->base.engine.subdev.device;
-	int ret, i;
+	int ret;
 
 	if (!(chan = kzalloc(sizeof(*chan), GFP_KERNEL)))
 		return -ENOMEM;
@@ -394,63 +391,91 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 	chan->vmm = nvkm_vmm_ref(fifoch->vmm);
 	*pobject = &chan->object;
 
-	/* allocate memory for a "mmio list" buffer that's used by the HUB
-	 * fuc to modify some per-context register settings on first load
-	 * of the context.
-	 */
-	ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x100,
-			      false, &chan->mmio);
+	/* Map pagepool. */
+	ret = nvkm_vmm_get(chan->vmm, 12, nvkm_memory_size(gr->pagepool), &chan->pagepool);
 	if (ret)
 		return ret;
 
-	ret = nvkm_vmm_get(fifoch->vmm, 12, 0x1000, &chan->mmio_vma);
+	ret = nvkm_memory_map(gr->pagepool, 0, chan->vmm, chan->pagepool, &args, sizeof(args));
 	if (ret)
 		return ret;
 
-	ret = nvkm_memory_map(chan->mmio, 0, fifoch->vmm,
-			      chan->mmio_vma, &args, sizeof(args));
+	/* Map bundle circular buffer. */
+	ret = nvkm_vmm_get(chan->vmm, 12, nvkm_memory_size(gr->bundle_cb), &chan->bundle_cb);
+	if (ret)
+		return ret;
+
+	ret = nvkm_memory_map(gr->bundle_cb, 0, chan->vmm, chan->bundle_cb, &args, sizeof(args));
+	if (ret)
+		return ret;
+
+	/* Map attribute circular buffer. */
+	ret = nvkm_vmm_get(chan->vmm, 12, nvkm_memory_size(gr->attrib_cb), &chan->attrib_cb);
 	if (ret)
 		return ret;
 
-	/* allocate buffers referenced by mmio list */
-	for (i = 0; data->size && i < ARRAY_SIZE(gr->mmio_data); i++) {
-		ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST,
-				      data->size, data->align, false,
-				      &chan->data[i].mem);
+	if (device->card_type < GP100) {
+		ret = nvkm_memory_map(gr->attrib_cb, 0, chan->vmm, chan->attrib_cb, NULL, 0);
 		if (ret)
 			return ret;
-
-		ret = nvkm_vmm_get(fifoch->vmm, 12,
-				   nvkm_memory_size(chan->data[i].mem),
-				   &chan->data[i].vma);
+	} else {
+		ret = nvkm_memory_map(gr->attrib_cb, 0, chan->vmm, chan->attrib_cb,
+				      &args, sizeof(args));;
 		if (ret)
 			return ret;
+	}
 
-		args.priv = data->priv;
+	/* Map some context buffer of unknown purpose. */
+	if (gr->func->grctx->unknown_size) {
+		ret = nvkm_vmm_get(chan->vmm, 12, nvkm_memory_size(gr->unknown), &chan->unknown);
+		if (ret)
+			return ret;
 
-		ret = nvkm_memory_map(chan->data[i].mem, 0, chan->vmm,
-				      chan->data[i].vma, &args, sizeof(args));
+		ret = nvkm_memory_map(gr->unknown, 0, chan->vmm, chan->unknown,
+				      &args, sizeof(args));
 		if (ret)
 			return ret;
+	}
 
-		data++;
+	/* Generate golden context image. */
+	mutex_lock(&gr->fecs.mutex);
+	if (gr->data == NULL) {
+		ret = gf100_grctx_generate(gr, chan, fifoch->inst);
+		if (ret) {
+			nvkm_error(&base->engine.subdev, "failed to construct context\n");
+			return ret;
+		}
 	}
+	mutex_unlock(&gr->fecs.mutex);
 
-	/* finally, fill in the mmio list and point the context at it */
-	nvkm_kmap(chan->mmio);
-	for (i = 0; mmio->addr && i < ARRAY_SIZE(gr->mmio_list); i++) {
-		u32 addr = mmio->addr;
-		u32 data = mmio->data;
+	/* allocate memory for a "mmio list" buffer that's used by the HUB
+	 * fuc to modify some per-context register settings on first load
+	 * of the context.
+	 */
+	ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x100,
+			      false, &chan->mmio);
+	if (ret)
+		return ret;
 
-		if (mmio->buffer >= 0) {
-			u64 info = chan->data[mmio->buffer].vma->addr;
-			data |= info >> mmio->shift;
-		}
+	ret = nvkm_vmm_get(fifoch->vmm, 12, 0x1000, &chan->mmio_vma);
+	if (ret)
+		return ret;
 
-		nvkm_wo32(chan->mmio, chan->mmio_nr++ * 4, addr);
-		nvkm_wo32(chan->mmio, chan->mmio_nr++ * 4, data);
-		mmio++;
-	}
+	ret = nvkm_memory_map(chan->mmio, 0, fifoch->vmm,
+			      chan->mmio_vma, &args, sizeof(args));
+	if (ret)
+		return ret;
+
+	/* finally, fill in the mmio list and point the context at it */
+	nvkm_kmap(chan->mmio);
+	gr->func->grctx->pagepool(chan, chan->pagepool->addr);
+	gr->func->grctx->bundle(chan, chan->bundle_cb->addr, gr->func->grctx->bundle_size);
+	gr->func->grctx->attrib_cb(chan, chan->attrib_cb->addr, gr->func->grctx->attrib_cb_size(gr));
+	gr->func->grctx->attrib(chan);
+	if (gr->func->grctx->patch_ltc)
+		gr->func->grctx->patch_ltc(chan);
+	if (gr->func->grctx->unknown_size)
+		gr->func->grctx->unknown(chan, chan->unknown->addr, gr->func->grctx->unknown_size);
 	nvkm_done(chan->mmio);
 	return 0;
 }
@@ -727,7 +752,7 @@ gf100_gr_fecs_ctrl_ctxsw(struct gf100_gr *gr, u32 mthd)
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 
 	nvkm_wr32(device, 0x409804, 0xffffffff);
-	nvkm_wr32(device, 0x409840, 0xffffffff);
+	nvkm_wr32(device, 0x409800, 0x00000000);
 	nvkm_wr32(device, 0x409500, 0xffffffff);
 	nvkm_wr32(device, 0x409504, mthd);
 	nvkm_msec(device, 2000,
@@ -771,12 +796,45 @@ gf100_gr_fecs_stop_ctxsw(struct nvkm_gr *base)
 	return ret;
 }
 
+static int
+gf100_gr_fecs_halt_pipeline(struct gf100_gr *gr)
+{
+	int ret = 0;
+
+	if (gr->firmware) {
+		mutex_lock(&gr->fecs.mutex);
+		ret = gf100_gr_fecs_ctrl_ctxsw(gr, 0x04);
+		mutex_unlock(&gr->fecs.mutex);
+	}
+
+	return ret;
+}
+
+int
+gf100_gr_fecs_wfi_golden_save(struct gf100_gr *gr, u32 inst)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	nvkm_mask(device, 0x409800, 0x00000003, 0x00000000);
+	nvkm_wr32(device, 0x409500, inst);
+	nvkm_wr32(device, 0x409504, 0x00000009);
+	nvkm_msec(device, 2000,
+		u32 stat = nvkm_rd32(device, 0x409800);
+		if (stat & 0x00000002)
+			return -EIO;
+		if (stat & 0x00000001)
+			return 0;
+	);
+
+	return -ETIMEDOUT;
+}
+
 int
 gf100_gr_fecs_bind_pointer(struct gf100_gr *gr, u32 inst)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 
-	nvkm_wr32(device, 0x409840, 0x00000030);
+	nvkm_mask(device, 0x409800, 0x00000030, 0x00000000);
 	nvkm_wr32(device, 0x409500, inst);
 	nvkm_wr32(device, 0x409504, 0x00000003);
 	nvkm_msec(device, 2000,
@@ -867,7 +925,7 @@ gf100_gr_fecs_discover_pm_image_size(struct gf100_gr *gr, u32 *psize)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 
-	nvkm_wr32(device, 0x409840, 0xffffffff);
+	nvkm_wr32(device, 0x409800, 0x00000000);
 	nvkm_wr32(device, 0x409500, 0x00000000);
 	nvkm_wr32(device, 0x409504, 0x00000025);
 	nvkm_msec(device, 2000,
@@ -883,7 +941,7 @@ gf100_gr_fecs_discover_zcull_image_size(struct gf100_gr *gr, u32 *psize)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 
-	nvkm_wr32(device, 0x409840, 0xffffffff);
+	nvkm_wr32(device, 0x409800, 0x00000000);
 	nvkm_wr32(device, 0x409500, 0x00000000);
 	nvkm_wr32(device, 0x409504, 0x00000016);
 	nvkm_msec(device, 2000,
@@ -899,7 +957,7 @@ gf100_gr_fecs_discover_image_size(struct gf100_gr *gr, u32 *psize)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 
-	nvkm_wr32(device, 0x409840, 0xffffffff);
+	nvkm_wr32(device, 0x409800, 0x00000000);
 	nvkm_wr32(device, 0x409500, 0x00000000);
 	nvkm_wr32(device, 0x409504, 0x00000010);
 	nvkm_msec(device, 2000,
@@ -915,7 +973,7 @@ gf100_gr_fecs_set_watchdog_timeout(struct gf100_gr *gr, u32 timeout)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 
-	nvkm_wr32(device, 0x409840, 0xffffffff);
+	nvkm_wr32(device, 0x409800, 0x00000000);
 	nvkm_wr32(device, 0x409500, timeout);
 	nvkm_wr32(device, 0x409504, 0x00000021);
 }
@@ -955,7 +1013,7 @@ gf100_gr_zbc_init(struct gf100_gr *gr)
 	const u32 f32_1[] = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000,
 			      0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
 	struct nvkm_ltc *ltc = gr->base.engine.subdev.device->ltc;
-	int index, c = ltc->zbc_min, d = ltc->zbc_min, s = ltc->zbc_min;
+	int index, c = ltc->zbc_color_min, d = ltc->zbc_depth_min, s = ltc->zbc_depth_min;
 
 	if (!gr->zbc_color[0].format) {
 		gf100_gr_zbc_color_get(gr, 1,  & zero[0],   &zero[4]); c++;
@@ -971,13 +1029,13 @@ gf100_gr_zbc_init(struct gf100_gr *gr)
 		}
 	}
 
-	for (index = c; index <= ltc->zbc_max; index++)
+	for (index = c; index <= ltc->zbc_color_max; index++)
 		gr->func->zbc->clear_color(gr, index);
-	for (index = d; index <= ltc->zbc_max; index++)
+	for (index = d; index <= ltc->zbc_depth_max; index++)
 		gr->func->zbc->clear_depth(gr, index);
 
 	if (gr->func->zbc->clear_stencil) {
-		for (index = s; index <= ltc->zbc_max; index++)
+		for (index = s; index <= ltc->zbc_depth_max; index++)
 			gr->func->zbc->clear_stencil(gr, index);
 	}
 }
@@ -1003,7 +1061,7 @@ gf100_gr_wait_idle(struct gf100_gr *gr)
 		nvkm_rd32(device, 0x400700);
 
 		gr_enabled = nvkm_rd32(device, 0x200) & 0x1000;
-		ctxsw_active = nvkm_rd32(device, 0x2640) & 0x8000;
+		ctxsw_active = nvkm_fifo_ctxsw_in_progress(&gr->base.engine);
 		gr_busy = nvkm_rd32(device, 0x40060c) & 0x1;
 
 		if (!gr_enabled || (!gr_busy && !ctxsw_active))
@@ -1039,7 +1097,7 @@ gf100_gr_icmd(struct gf100_gr *gr, const struct gf100_gr_pack *p)
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 	const struct gf100_gr_pack *pack;
 	const struct gf100_gr_init *init;
-	u32 data = 0;
+	u64 data = 0;
 
 	nvkm_wr32(device, 0x400208, 0x80000000);
 
@@ -1049,6 +1107,8 @@ gf100_gr_icmd(struct gf100_gr *gr, const struct gf100_gr_pack *p)
 
 		if ((pack == p && init == p->init) || data != init->data) {
 			nvkm_wr32(device, 0x400204, init->data);
+			if (pack->type == 64)
+				nvkm_wr32(device, 0x40020c, upper_32_bits(init->data));
 			data = init->data;
 		}
 
@@ -1542,13 +1602,13 @@ gf100_gr_ctxctl_isr(struct gf100_gr *gr)
 	}
 }
 
-static void
-gf100_gr_intr(struct nvkm_gr *base)
+static irqreturn_t
+gf100_gr_intr(struct nvkm_inth *inth)
 {
-	struct gf100_gr *gr = gf100_gr(base);
+	struct gf100_gr *gr = container_of(inth, typeof(*gr), base.engine.subdev.inth);
 	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
 	struct nvkm_device *device = subdev->device;
-	struct nvkm_fifo_chan *chan;
+	struct nvkm_chan *chan;
 	unsigned long flags;
 	u64 inst = nvkm_rd32(device, 0x409b00) & 0x0fffffff;
 	u32 stat = nvkm_rd32(device, 0x400100);
@@ -1561,10 +1621,10 @@ gf100_gr_intr(struct nvkm_gr *base)
 	const char *name = "unknown";
 	int chid = -1;
 
-	chan = nvkm_fifo_chan_inst(device->fifo, (u64)inst << 12, &flags);
+	chan = nvkm_chan_get_inst(&gr->base.engine, (u64)inst << 12, &flags);
 	if (chan) {
-		name = chan->object.client->name;
-		chid = chan->chid;
+		name = chan->name;
+		chid = chan->id;
 	}
 
 	if (device->card_type < NV_E0 || subc < 4)
@@ -1631,7 +1691,8 @@ gf100_gr_intr(struct nvkm_gr *base)
 	}
 
 	nvkm_wr32(device, 0x400500, 0x00010001);
-	nvkm_fifo_chan_put(device->fifo, flags, &chan);
+	nvkm_chan_put(&chan, flags);
+	return IRQ_HANDLED;
 }
 
 static void
@@ -1721,7 +1782,7 @@ gf100_gr_init_ctxctl_ext(struct gf100_gr *gr)
 	nvkm_mc_unk260(device, 1);
 
 	/* start both of them running */
-	nvkm_wr32(device, 0x409840, 0xffffffff);
+	nvkm_wr32(device, 0x409800, 0x00000000);
 	nvkm_wr32(device, 0x41a10c, 0x00000000);
 	nvkm_wr32(device, 0x40910c, 0x00000000);
 
@@ -1763,15 +1824,6 @@ gf100_gr_init_ctxctl_ext(struct gf100_gr *gr)
 			return ret;
 	}
 
-	/* Generate golden context image. */
-	if (gr->data == NULL) {
-		int ret = gf100_grctx_generate(gr);
-		if (ret) {
-			nvkm_error(subdev, "failed to construct context\n");
-			return ret;
-		}
-	}
-
 	return 0;
 }
 
@@ -1823,14 +1875,6 @@ gf100_gr_init_ctxctl_int(struct gf100_gr *gr)
 	}
 
 	gr->size = nvkm_rd32(device, 0x409804);
-	if (gr->data == NULL) {
-		int ret = gf100_grctx_generate(gr);
-		if (ret) {
-			nvkm_error(subdev, "failed to construct context\n");
-			return ret;
-		}
-	}
-
 	return 0;
 }
 
@@ -1847,10 +1891,11 @@ gf100_gr_init_ctxctl(struct gf100_gr *gr)
 	return ret;
 }
 
-void
+int
 gf100_gr_oneinit_sm_id(struct gf100_gr *gr)
 {
 	int tpc, gpc;
+
 	for (tpc = 0; tpc < gr->tpc_max; tpc++) {
 		for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
 			if (tpc < gr->tpc_nr[gpc]) {
@@ -1860,6 +1905,8 @@ gf100_gr_oneinit_sm_id(struct gf100_gr *gr)
 			}
 		}
 	}
+
+	return 0;
 }
 
 void
@@ -1944,7 +1991,17 @@ gf100_gr_oneinit(struct nvkm_gr *base)
 	struct gf100_gr *gr = gf100_gr(base);
 	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
 	struct nvkm_device *device = subdev->device;
-	int i, j;
+	struct nvkm_intr *intr = &device->mc->intr;
+	enum nvkm_intr_type intr_type = NVKM_INTR_SUBDEV;
+	int ret, i, j;
+
+	if (gr->func->oneinit_intr)
+		intr = gr->func->oneinit_intr(gr, &intr_type);
+
+	ret = nvkm_inth_add(intr, intr_type, NVKM_INTR_PRIO_NORMAL, &gr->base.engine.subdev,
+			    gf100_gr_intr, &gr->base.engine.subdev.inth);
+	if (ret)
+		return ret;
 
 	nvkm_pmu_pgob(device->pmu, false);
 
@@ -1954,12 +2011,14 @@ gf100_gr_oneinit(struct nvkm_gr *base)
 		gr->tpc_nr[i]  = nvkm_rd32(device, GPC_UNIT(i, 0x2608));
 		gr->tpc_max = max(gr->tpc_max, gr->tpc_nr[i]);
 		gr->tpc_total += gr->tpc_nr[i];
-		gr->ppc_nr[i]  = gr->func->ppc_nr;
-		for (j = 0; j < gr->ppc_nr[i]; j++) {
+		for (j = 0; j < gr->func->ppc_nr; j++) {
 			gr->ppc_tpc_mask[i][j] =
 				nvkm_rd32(device, GPC_UNIT(i, 0x0c30 + (j * 4)));
 			if (gr->ppc_tpc_mask[i][j] == 0)
 				continue;
+
+			gr->ppc_nr[i]++;
+
 			gr->ppc_mask[i] |= (1 << j);
 			gr->ppc_tpc_nr[i][j] = hweight8(gr->ppc_tpc_mask[i][j]);
 			if (gr->ppc_tpc_min == 0 ||
@@ -1968,12 +2027,37 @@ gf100_gr_oneinit(struct nvkm_gr *base)
 			if (gr->ppc_tpc_max < gr->ppc_tpc_nr[i][j])
 				gr->ppc_tpc_max = gr->ppc_tpc_nr[i][j];
 		}
+
+		gr->ppc_total += gr->ppc_nr[i];
+	}
+
+	/* Allocate global context buffers. */
+	ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, gr->func->grctx->pagepool_size,
+			      0x100, false, &gr->pagepool);
+	if (ret)
+		return ret;
+
+	ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, gr->func->grctx->bundle_size,
+			      0x100, false, &gr->bundle_cb);
+	if (ret)
+		return ret;
+
+	ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, gr->func->grctx->attrib_cb_size(gr),
+			      0x1000, false, &gr->attrib_cb);
+	if (ret)
+		return ret;
+
+	if (gr->func->grctx->unknown_size) {
+		ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, gr->func->grctx->unknown_size,
+				      0x100, false, &gr->unknown);
+		if (ret)
+			return ret;
 	}
 
 	memset(gr->tile, 0xff, sizeof(gr->tile));
 	gr->func->oneinit_tiles(gr);
-	gr->func->oneinit_sm_id(gr);
-	return 0;
+
+	return gr->func->oneinit_sm_id(gr);
 }
 
 static int
@@ -1983,7 +2067,7 @@ gf100_gr_init_(struct nvkm_gr *base)
 	struct nvkm_subdev *subdev = &base->engine.subdev;
 	struct nvkm_device *device = subdev->device;
 	bool reset = device->chipset == 0x137 || device->chipset == 0x138;
-	u32 ret;
+	int ret;
 
 	/* On certain GP107/GP108 boards, we trigger a weird issue where
 	 * GR will stop responding to PRI accesses after we've asked the
@@ -2019,7 +2103,12 @@ gf100_gr_init_(struct nvkm_gr *base)
 	if (ret)
 		return ret;
 
-	return gr->func->init(gr);
+	ret = gr->func->init(gr);
+	if (ret)
+		return ret;
+
+	nvkm_inth_allow(&subdev->inth);
+	return 0;
 }
 
 static int
@@ -2027,6 +2116,9 @@ gf100_gr_fini(struct nvkm_gr *base, bool suspend)
 {
 	struct gf100_gr *gr = gf100_gr(base);
 	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
+
+	nvkm_inth_block(&subdev->inth);
+
 	nvkm_falcon_put(&gr->gpccs.falcon, subdev);
 	nvkm_falcon_put(&gr->fecs.falcon, subdev);
 	return 0;
@@ -2039,6 +2131,11 @@ gf100_gr_dtor(struct nvkm_gr *base)
 
 	kfree(gr->data);
 
+	nvkm_memory_unref(&gr->unknown);
+	nvkm_memory_unref(&gr->attrib_cb);
+	nvkm_memory_unref(&gr->bundle_cb);
+	nvkm_memory_unref(&gr->pagepool);
+
 	nvkm_falcon_dtor(&gr->gpccs.falcon);
 	nvkm_falcon_dtor(&gr->fecs.falcon);
 
@@ -2047,81 +2144,27 @@ gf100_gr_dtor(struct nvkm_gr *base)
 	nvkm_blob_dtor(&gr->gpccs.inst);
 	nvkm_blob_dtor(&gr->gpccs.data);
 
+	vfree(gr->bundle64);
+	vfree(gr->bundle_veid);
 	vfree(gr->bundle);
 	vfree(gr->method);
 	vfree(gr->sw_ctx);
 	vfree(gr->sw_nonctx);
+	vfree(gr->sw_nonctx1);
+	vfree(gr->sw_nonctx2);
+	vfree(gr->sw_nonctx3);
+	vfree(gr->sw_nonctx4);
 
 	return gr;
 }
 
-static const struct nvkm_gr_func
-gf100_gr_ = {
-	.dtor = gf100_gr_dtor,
-	.oneinit = gf100_gr_oneinit,
-	.init = gf100_gr_init_,
-	.fini = gf100_gr_fini,
-	.intr = gf100_gr_intr,
-	.units = gf100_gr_units,
-	.chan_new = gf100_gr_chan_new,
-	.object_get = gf100_gr_object_get,
-	.chsw_load = gf100_gr_chsw_load,
-	.ctxsw.pause = gf100_gr_fecs_stop_ctxsw,
-	.ctxsw.resume = gf100_gr_fecs_start_ctxsw,
-	.ctxsw.inst = gf100_gr_ctxsw_inst,
-};
-
 static const struct nvkm_falcon_func
 gf100_gr_flcn = {
-	.fbif = 0x600,
 	.load_imem = nvkm_falcon_v1_load_imem,
 	.load_dmem = nvkm_falcon_v1_load_dmem,
-	.read_dmem = nvkm_falcon_v1_read_dmem,
-	.bind_context = nvkm_falcon_v1_bind_context,
-	.wait_for_halt = nvkm_falcon_v1_wait_for_halt,
-	.clear_interrupt = nvkm_falcon_v1_clear_interrupt,
-	.set_start_addr = nvkm_falcon_v1_set_start_addr,
 	.start = nvkm_falcon_v1_start,
-	.enable = nvkm_falcon_v1_enable,
-	.disable = nvkm_falcon_v1_disable,
 };
 
-int
-gf100_gr_new_(const struct gf100_gr_fwif *fwif, struct nvkm_device *device,
-	      enum nvkm_subdev_type type, int inst, struct nvkm_gr **pgr)
-{
-	struct gf100_gr *gr;
-	int ret;
-
-	if (!(gr = kzalloc(sizeof(*gr), GFP_KERNEL)))
-		return -ENOMEM;
-	*pgr = &gr->base;
-
-	ret = nvkm_gr_ctor(&gf100_gr_, device, type, inst, true, &gr->base);
-	if (ret)
-		return ret;
-
-	fwif = nvkm_firmware_load(&gr->base.engine.subdev, fwif, "Gr", gr);
-	if (IS_ERR(fwif))
-		return PTR_ERR(fwif);
-
-	gr->func = fwif->func;
-
-	ret = nvkm_falcon_ctor(&gf100_gr_flcn, &gr->base.engine.subdev,
-			       "fecs", 0x409000, &gr->fecs.falcon);
-	if (ret)
-		return ret;
-
-	mutex_init(&gr->fecs.mutex);
-
-	ret = nvkm_falcon_ctor(&gf100_gr_flcn, &gr->base.engine.subdev,
-			       "gpccs", 0x41a000, &gr->gpccs.falcon);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 void
 gf100_gr_init_num_tpc_per_gpc(struct gf100_gr *gr, bool pd, bool ds)
 {
@@ -2146,6 +2189,29 @@ gf100_gr_init_400054(struct gf100_gr *gr)
 }
 
 void
+gf100_gr_init_exception2(struct gf100_gr *gr)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	nvkm_wr32(device, 0x40011c, 0xffffffff);
+	nvkm_wr32(device, 0x400134, 0xffffffff);
+}
+
+void
+gf100_gr_init_rop_exceptions(struct gf100_gr *gr)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+	int rop;
+
+	for (rop = 0; rop < gr->rop_nr; rop++) {
+		nvkm_wr32(device, ROP_UNIT(rop, 0x144), 0x40000000);
+		nvkm_wr32(device, ROP_UNIT(rop, 0x070), 0x40000000);
+		nvkm_wr32(device, ROP_UNIT(rop, 0x204), 0xffffffff);
+		nvkm_wr32(device, ROP_UNIT(rop, 0x208), 0xffffffff);
+	}
+}
+
+void
 gf100_gr_init_shader_exceptions(struct gf100_gr *gr, int gpc, int tpc)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
@@ -2252,21 +2318,47 @@ gf100_gr_init_vsc_stream_master(struct gf100_gr *gr)
 	nvkm_mask(device, TPC_UNIT(0, 0, 0x05c), 0x00000001, 0x00000001);
 }
 
+static int
+gf100_gr_reset(struct nvkm_gr *base)
+{
+	struct nvkm_subdev *subdev = &base->engine.subdev;
+	struct nvkm_device *device = subdev->device;
+	struct gf100_gr *gr = gf100_gr(base);
+
+	nvkm_mask(device, 0x400500, 0x00000001, 0x00000000);
+
+	WARN_ON(gf100_gr_fecs_halt_pipeline(gr));
+
+	subdev->func->fini(subdev, false);
+	nvkm_mc_disable(device, subdev->type, subdev->inst);
+	if (gr->func->gpccs.reset)
+		gr->func->gpccs.reset(gr);
+
+	nvkm_mc_enable(device, subdev->type, subdev->inst);
+	return subdev->func->init(subdev);
+}
+
 int
 gf100_gr_init(struct gf100_gr *gr)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
-	int gpc, tpc, rop;
+	int gpc, tpc;
 
-	if (gr->func->init_419bd8)
-		gr->func->init_419bd8(gr);
+	nvkm_mask(device, 0x400500, 0x00010001, 0x00000000);
 
 	gr->func->init_gpc_mmu(gr);
 
-	if (gr->sw_nonctx)
+	if (gr->sw_nonctx1) {
+		gf100_gr_mmio(gr, gr->sw_nonctx1);
+		gf100_gr_mmio(gr, gr->sw_nonctx2);
+		gf100_gr_mmio(gr, gr->sw_nonctx3);
+		gf100_gr_mmio(gr, gr->sw_nonctx4);
+	} else
+	if (gr->sw_nonctx) {
 		gf100_gr_mmio(gr, gr->sw_nonctx);
-	else
+	} else {
 		gf100_gr_mmio(gr, gr->func->mmio);
+	}
 
 	gf100_gr_wait_idle(gr);
 
@@ -2298,6 +2390,10 @@ gf100_gr_init(struct gf100_gr *gr)
 	nvkm_wr32(device, 0x400124, 0x00000002);
 
 	gr->func->init_fecs_exceptions(gr);
+
+	if (gr->func->init_40a790)
+		gr->func->init_40a790(gr);
+
 	if (gr->func->init_ds_hww_esr_2)
 		gr->func->init_ds_hww_esr_2(gr);
 
@@ -2346,19 +2442,14 @@ gf100_gr_init(struct gf100_gr *gr)
 		nvkm_wr32(device, GPC_UNIT(gpc, 0x2c94), 0xffffffff);
 	}
 
-	for (rop = 0; rop < gr->rop_nr; rop++) {
-		nvkm_wr32(device, ROP_UNIT(rop, 0x144), 0x40000000);
-		nvkm_wr32(device, ROP_UNIT(rop, 0x070), 0x40000000);
-		nvkm_wr32(device, ROP_UNIT(rop, 0x204), 0xffffffff);
-		nvkm_wr32(device, ROP_UNIT(rop, 0x208), 0xffffffff);
-	}
+	gr->func->init_rop_exceptions(gr);
 
 	nvkm_wr32(device, 0x400108, 0xffffffff);
 	nvkm_wr32(device, 0x400138, 0xffffffff);
 	nvkm_wr32(device, 0x400118, 0xffffffff);
 	nvkm_wr32(device, 0x400130, 0xffffffff);
-	nvkm_wr32(device, 0x40011c, 0xffffffff);
-	nvkm_wr32(device, 0x400134, 0xffffffff);
+	if (gr->func->init_exception2)
+		gr->func->init_exception2(gr);
 
 	if (gr->func->init_400054)
 		gr->func->init_400054(gr);
@@ -2371,6 +2462,18 @@ gf100_gr_init(struct gf100_gr *gr)
 	return gf100_gr_init_ctxctl(gr);
 }
 
+void
+gf100_gr_fecs_reset(struct gf100_gr *gr)
+{
+	struct nvkm_device *device = gr->base.engine.subdev.device;
+
+	nvkm_wr32(device, 0x409614, 0x00000070);
+	nvkm_usec(device, 10, NVKM_DELAY);
+	nvkm_mask(device, 0x409614, 0x00000700, 0x00000700);
+	nvkm_usec(device, 10, NVKM_DELAY);
+	nvkm_rd32(device, 0x409614);
+}
+
 #include "fuc/hubgf100.fuc3.h"
 
 struct gf100_gr_ucode
@@ -2391,6 +2494,22 @@ gf100_gr_gpccs_ucode = {
 	.data.size = sizeof(gf100_grgpc_data),
 };
 
+static const struct nvkm_gr_func
+gf100_gr_ = {
+	.dtor = gf100_gr_dtor,
+	.oneinit = gf100_gr_oneinit,
+	.init = gf100_gr_init_,
+	.fini = gf100_gr_fini,
+	.reset = gf100_gr_reset,
+	.units = gf100_gr_units,
+	.chan_new = gf100_gr_chan_new,
+	.object_get = gf100_gr_object_get,
+	.chsw_load = gf100_gr_chsw_load,
+	.ctxsw.pause = gf100_gr_fecs_stop_ctxsw,
+	.ctxsw.resume = gf100_gr_fecs_start_ctxsw,
+	.ctxsw.inst = gf100_gr_ctxsw_inst,
+};
+
 static const struct gf100_gr_func
 gf100_gr = {
 	.oneinit_tiles = gf100_gr_oneinit_tiles,
@@ -2406,10 +2525,13 @@ gf100_gr = {
 	.init_419eb4 = gf100_gr_init_419eb4,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gf100_gr_pack_mmio,
 	.fecs.ucode = &gf100_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gf100_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.grctx = &gf100_grctx,
@@ -2483,6 +2605,42 @@ gf100_gr_fwif[] = {
 };
 
 int
+gf100_gr_new_(const struct gf100_gr_fwif *fwif, struct nvkm_device *device,
+	      enum nvkm_subdev_type type, int inst, struct nvkm_gr **pgr)
+{
+	struct gf100_gr *gr;
+	int ret;
+
+	if (!(gr = kzalloc(sizeof(*gr), GFP_KERNEL)))
+		return -ENOMEM;
+	*pgr = &gr->base;
+
+	ret = nvkm_gr_ctor(&gf100_gr_, device, type, inst, true, &gr->base);
+	if (ret)
+		return ret;
+
+	fwif = nvkm_firmware_load(&gr->base.engine.subdev, fwif, "Gr", gr);
+	if (IS_ERR(fwif))
+		return PTR_ERR(fwif);
+
+	gr->func = fwif->func;
+
+	ret = nvkm_falcon_ctor(&gf100_gr_flcn, &gr->base.engine.subdev,
+			       "fecs", 0x409000, &gr->fecs.falcon);
+	if (ret)
+		return ret;
+
+	mutex_init(&gr->fecs.mutex);
+
+	ret = nvkm_falcon_ctor(&gf100_gr_flcn, &gr->base.engine.subdev,
+			       "gpccs", 0x41a000, &gr->gpccs.falcon);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+int
 gf100_gr_new(struct nvkm_device *device, enum nvkm_subdev_type type, int inst, struct nvkm_gr **pgr)
 {
 	return gf100_gr_new_(gf100_gr_fwif, device, type, inst, pgr);
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h
index c0038f906135..94ca7ac16acf 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h
@@ -44,19 +44,6 @@ struct nvkm_acr_lsfw;
 #define PPC_UNIT(t, m, r) (0x503000 + (t) * 0x8000 + (m) * 0x200 + (r))
 #define TPC_UNIT(t, m, r) (0x504000 + (t) * 0x8000 + (m) * 0x800 + (r))
 
-struct gf100_gr_data {
-	u32 size;
-	u32 align;
-	bool priv;
-};
-
-struct gf100_gr_mmio {
-	u32 addr;
-	u32 data;
-	u32 shift;
-	int buffer;
-};
-
 struct gf100_gr_zbc_color {
 	u32 format;
 	u32 ds[4];
@@ -101,13 +88,19 @@ struct gf100_gr {
 	 * using hardcoded arrays. To be allocated with vzalloc().
 	 */
 	struct gf100_gr_pack *sw_nonctx;
+	struct gf100_gr_pack *sw_nonctx1;
+	struct gf100_gr_pack *sw_nonctx2;
+	struct gf100_gr_pack *sw_nonctx3;
+	struct gf100_gr_pack *sw_nonctx4;
 	struct gf100_gr_pack *sw_ctx;
 	struct gf100_gr_pack *bundle;
+	struct gf100_gr_pack *bundle_veid;
+	struct gf100_gr_pack *bundle64;
 	struct gf100_gr_pack *method;
 
-	struct gf100_gr_zbc_color zbc_color[NVKM_LTC_MAX_ZBC_CNT];
-	struct gf100_gr_zbc_depth zbc_depth[NVKM_LTC_MAX_ZBC_CNT];
-	struct gf100_gr_zbc_stencil zbc_stencil[NVKM_LTC_MAX_ZBC_CNT];
+	struct gf100_gr_zbc_color zbc_color[NVKM_LTC_MAX_ZBC_COLOR_CNT];
+	struct gf100_gr_zbc_depth zbc_depth[NVKM_LTC_MAX_ZBC_DEPTH_CNT];
+	struct gf100_gr_zbc_stencil zbc_stencil[NVKM_LTC_MAX_ZBC_DEPTH_CNT];
 
 	u8 rop_nr;
 	u8 gpc_nr;
@@ -120,6 +113,12 @@ struct gf100_gr {
 	u8 ppc_tpc_nr[GPC_MAX][4];
 	u8 ppc_tpc_min;
 	u8 ppc_tpc_max;
+	u8 ppc_total;
+
+	struct nvkm_memory *pagepool;
+	struct nvkm_memory *bundle_cb;
+	struct nvkm_memory *attrib_cb;
+	struct nvkm_memory *unknown;
 
 	u8 screen_tile_row_offset;
 	u8 tile[TPC_MAX];
@@ -130,8 +129,6 @@ struct gf100_gr {
 	} sm[TPC_MAX];
 	u8 sm_nr;
 
-	struct gf100_gr_data mmio_data[4];
-	struct gf100_gr_mmio mmio_list[4096/8];
 	u32  size;
 	u32 *data;
 	u32 size_zcull;
@@ -139,6 +136,7 @@ struct gf100_gr {
 };
 
 int gf100_gr_fecs_bind_pointer(struct gf100_gr *, u32 inst);
+int gf100_gr_fecs_wfi_golden_save(struct gf100_gr *, u32 inst);
 
 struct gf100_gr_func_zbc {
 	void (*clear_color)(struct gf100_gr *, int zbc);
@@ -149,8 +147,9 @@ struct gf100_gr_func_zbc {
 };
 
 struct gf100_gr_func {
+	struct nvkm_intr *(*oneinit_intr)(struct gf100_gr *, enum nvkm_intr_type *);
 	void (*oneinit_tiles)(struct gf100_gr *);
-	void (*oneinit_sm_id)(struct gf100_gr *);
+	int (*oneinit_sm_id)(struct gf100_gr *);
 	int (*init)(struct gf100_gr *);
 	void (*init_419bd8)(struct gf100_gr *);
 	void (*init_gpc_mmu)(struct gf100_gr *);
@@ -164,6 +163,7 @@ struct gf100_gr_func {
 	void (*init_swdx_pes_mask)(struct gf100_gr *);
 	void (*init_fs)(struct gf100_gr *);
 	void (*init_fecs_exceptions)(struct gf100_gr *);
+	void (*init_40a790)(struct gf100_gr *);
 	void (*init_ds_hww_esr_2)(struct gf100_gr *);
 	void (*init_40601c)(struct gf100_gr *);
 	void (*init_sked_hww_esr)(struct gf100_gr *);
@@ -174,6 +174,8 @@ struct gf100_gr_func {
 	void (*init_tex_hww_esr)(struct gf100_gr *, int gpc, int tpc);
 	void (*init_504430)(struct gf100_gr *, int gpc, int tpc);
 	void (*init_shader_exceptions)(struct gf100_gr *, int gpc, int tpc);
+	void (*init_rop_exceptions)(struct gf100_gr *);
+	void (*init_exception2)(struct gf100_gr *);
 	void (*init_400054)(struct gf100_gr *);
 	void (*init_4188a4)(struct gf100_gr *);
 	void (*trap_mp)(struct gf100_gr *, int gpc, int tpc);
@@ -181,9 +183,11 @@ struct gf100_gr_func {
 	const struct gf100_gr_pack *mmio;
 	struct {
 		struct gf100_gr_ucode *ucode;
+		void (*reset)(struct gf100_gr *);
 	} fecs;
 	struct {
 		struct gf100_gr_ucode *ucode;
+		void (*reset)(struct gf100_gr *);
 	} gpccs;
 	int (*rops)(struct gf100_gr *);
 	int gpc_nr;
@@ -197,7 +201,7 @@ struct gf100_gr_func {
 
 int gf100_gr_rops(struct gf100_gr *);
 void gf100_gr_oneinit_tiles(struct gf100_gr *);
-void gf100_gr_oneinit_sm_id(struct gf100_gr *);
+int gf100_gr_oneinit_sm_id(struct gf100_gr *);
 int gf100_gr_init(struct gf100_gr *);
 void gf100_gr_init_vsc_stream_master(struct gf100_gr *);
 void gf100_gr_init_zcull(struct gf100_gr *);
@@ -208,9 +212,12 @@ void gf100_gr_init_419cc0(struct gf100_gr *);
 void gf100_gr_init_419eb4(struct gf100_gr *);
 void gf100_gr_init_tex_hww_esr(struct gf100_gr *, int, int);
 void gf100_gr_init_shader_exceptions(struct gf100_gr *, int, int);
+void gf100_gr_init_rop_exceptions(struct gf100_gr *);
+void gf100_gr_init_exception2(struct gf100_gr *);
 void gf100_gr_init_400054(struct gf100_gr *);
 void gf100_gr_init_num_tpc_per_gpc(struct gf100_gr *, bool, bool);
 extern const struct gf100_gr_func_zbc gf100_gr_zbc;
+void gf100_gr_fecs_reset(struct gf100_gr *);
 
 void gf117_gr_init_zcull(struct gf100_gr *);
 
@@ -226,9 +233,13 @@ void gm107_gr_init_shader_exceptions(struct gf100_gr *, int, int);
 void gm107_gr_init_400054(struct gf100_gr *);
 
 int gk20a_gr_init(struct gf100_gr *);
+int gk20a_gr_av_to_init_(struct nvkm_blob *, u8 count, u32 pitch, struct gf100_gr_pack **);
+int gk20a_gr_av_to_init(struct nvkm_blob *, struct gf100_gr_pack **);
+int gk20a_gr_aiv_to_init(struct nvkm_blob *, struct gf100_gr_pack **);
+int gk20a_gr_av_to_method(struct nvkm_blob *, struct gf100_gr_pack **);
 
 void gm200_gr_oneinit_tiles(struct gf100_gr *);
-void gm200_gr_oneinit_sm_id(struct gf100_gr *);
+int gm200_gr_oneinit_sm_id(struct gf100_gr *);
 int gm200_gr_rops(struct gf100_gr *);
 void gm200_gr_init_num_active_ltcs(struct gf100_gr *);
 void gm200_gr_init_ds_hww_esr_2(struct gf100_gr *);
@@ -242,14 +253,24 @@ extern const struct gf100_gr_func_zbc gp100_gr_zbc;
 
 void gp102_gr_init_swdx_pes_mask(struct gf100_gr *);
 extern const struct gf100_gr_func_zbc gp102_gr_zbc;
+int gp102_gr_zbc_stencil_get(struct gf100_gr *, int, const u32, const u32);
+void gp102_gr_zbc_clear_stencil(struct gf100_gr *, int);
 
 extern const struct gf100_gr_func gp107_gr;
 
+int gv100_gr_oneinit_sm_id(struct gf100_gr *);
+u32 gv100_gr_nonpes_aware_tpc(struct gf100_gr *gr, u32 gpc, u32 tpc);
 void gv100_gr_init_419bd8(struct gf100_gr *);
 void gv100_gr_init_504430(struct gf100_gr *, int, int);
 void gv100_gr_init_shader_exceptions(struct gf100_gr *, int, int);
+void gv100_gr_init_4188a4(struct gf100_gr *);
 void gv100_gr_trap_mp(struct gf100_gr *, int, int);
 
+int tu102_gr_av_to_init_veid(struct nvkm_blob *, struct gf100_gr_pack **);
+void tu102_gr_init_zcull(struct gf100_gr *);
+void tu102_gr_init_fs(struct gf100_gr *);
+void tu102_gr_init_fecs_exceptions(struct gf100_gr *);
+
 #define gf100_gr_chan(p) container_of((p), struct gf100_gr_chan, object)
 #include <core/object.h>
 
@@ -258,14 +279,14 @@ struct gf100_gr_chan {
 	struct gf100_gr *gr;
 	struct nvkm_vmm *vmm;
 
+	struct nvkm_vma *pagepool;
+	struct nvkm_vma *bundle_cb;
+	struct nvkm_vma *attrib_cb;
+	struct nvkm_vma *unknown;
+
 	struct nvkm_memory *mmio;
 	struct nvkm_vma *mmio_vma;
 	int mmio_nr;
-
-	struct {
-		struct nvkm_memory *mem;
-		struct nvkm_vma *vma;
-	} data[4];
 };
 
 void gf100_gr_ctxctl_debug(struct gf100_gr *);
@@ -279,7 +300,7 @@ struct gf100_gr_init {
 	u32 addr;
 	u8  count;
 	u32 pitch;
-	u32 data;
+	u64 data;
 };
 
 struct gf100_gr_pack {
@@ -403,6 +424,9 @@ int gf100_gr_load(struct gf100_gr *, int, const struct gf100_gr_fwif *);
 int gf100_gr_nofw(struct gf100_gr *, int, const struct gf100_gr_fwif *);
 
 int gk20a_gr_load_sw(struct gf100_gr *, const char *path, int ver);
+int gk20a_gr_load_net(struct gf100_gr *, const char *, const char *, int,
+		      int (*)(struct nvkm_blob *, struct gf100_gr_pack **),
+		      struct gf100_gr_pack **);
 
 int gm200_gr_nofw(struct gf100_gr *, int, const struct gf100_gr_fwif *);
 int gm200_gr_load(struct gf100_gr *, int, const struct gf100_gr_fwif *);
@@ -415,6 +439,8 @@ void gm20b_gr_acr_bld_patch(struct nvkm_acr *, u32, s64);
 
 extern const struct nvkm_acr_lsf_func gp108_gr_gpccs_acr;
 extern const struct nvkm_acr_lsf_func gp108_gr_fecs_acr;
+void gp108_gr_acr_bld_write(struct nvkm_acr *, u32, struct nvkm_acr_lsfw *);
+void gp108_gr_acr_bld_patch(struct nvkm_acr *, u32, s64);
 
 int gf100_gr_new_(const struct gf100_gr_fwif *, struct nvkm_device *, enum nvkm_subdev_type, int,
 		  struct nvkm_gr **);
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf104.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf104.c
index 3acd99c306f2..63bd29c22fe1 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf104.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf104.c
@@ -127,10 +127,13 @@ gf104_gr = {
 	.init_419eb4 = gf100_gr_init_419eb4,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gf104_gr_pack_mmio,
 	.fecs.ucode = &gf100_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gf100_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.grctx = &gf104_grctx,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf108.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf108.c
index ab3760e804b8..495a844f925f 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf108.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf108.c
@@ -125,10 +125,13 @@ gf108_gr = {
 	.init_419eb4 = gf100_gr_init_419eb4,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gf108_gr_pack_mmio,
 	.fecs.ucode = &gf100_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gf100_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.grctx = &gf108_grctx,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf110.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf110.c
index 616e2def1865..70fad235d161 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf110.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf110.c
@@ -99,10 +99,13 @@ gf110_gr = {
 	.init_419eb4 = gf100_gr_init_419eb4,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gf110_gr_pack_mmio,
 	.fecs.ucode = &gf100_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gf100_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.grctx = &gf110_grctx,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf117.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf117.c
index 669e7536970e..f12728248048 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf117.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf117.c
@@ -125,7 +125,9 @@ gf117_gr_init_zcull(struct gf100_gr *gr)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 	const u32 magicgpc918 = DIV_ROUND_UP(0x00800000, gr->tpc_total);
-	const u8 tile_nr = ALIGN(gr->tpc_total, 32);
+	/*TODO: fill in litter vals for gf117-gm2xx */
+	const u8 tile_nr = !gr->func->gpc_nr ? ALIGN(gr->tpc_total, 32) :
+			   (gr->func->gpc_nr * gr->func->tpc_nr);
 	u8 bank[GPC_MAX] = {}, gpc, i, j;
 	u32 data;
 
@@ -163,10 +165,13 @@ gf117_gr = {
 	.init_419eb4 = gf100_gr_init_419eb4,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gf117_gr_pack_mmio,
 	.fecs.ucode = &gf117_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gf117_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.ppc_nr = 1,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf119.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf119.c
index 5b09bda8110c..75ceb514c06e 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf119.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf119.c
@@ -190,10 +190,13 @@ gf119_gr = {
 	.init_419eb4 = gf100_gr_init_419eb4,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gf119_gr_pack_mmio,
 	.fecs.ucode = &gf100_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gf100_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.grctx = &gf119_grctx,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk104.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk104.c
index b680eaa0f350..e53ade24ad23 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk104.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk104.c
@@ -418,7 +418,7 @@ gk104_gr_init_ppc_exceptions(struct gf100_gr *gr)
 	int gpc, ppc;
 
 	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
-		for (ppc = 0; ppc < gr->ppc_nr[gpc]; ppc++) {
+		for (ppc = 0; ppc < gr->func->ppc_nr; ppc++) {
 			if (!(gr->ppc_mask[gpc] & (1 << ppc)))
 				continue;
 			nvkm_wr32(device, PPC_UNIT(gpc, ppc, 0x038), 0xc0000000);
@@ -470,10 +470,13 @@ gk104_gr = {
 	.init_ppc_exceptions = gk104_gr_init_ppc_exceptions,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gk104_gr_pack_mmio,
 	.fecs.ucode = &gk104_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gk104_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.ppc_nr = 1,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110.c
index 103e06a77e65..c7e1c5dbc6a9 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110.c
@@ -366,10 +366,13 @@ gk110_gr = {
 	.init_ppc_exceptions = gk104_gr_init_ppc_exceptions,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gk110_gr_pack_mmio,
 	.fecs.ucode = &gk110_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gk110_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.ppc_nr = 2,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110b.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110b.c
index 034d0b11a17d..458abae571bf 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110b.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110b.c
@@ -118,10 +118,13 @@ gk110b_gr = {
 	.init_ppc_exceptions = gk104_gr_init_ppc_exceptions,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gk110b_gr_pack_mmio,
 	.fecs.ucode = &gk110_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gk110_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.ppc_nr = 2,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk208.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk208.c
index 116d682f9f96..d3f6b65c21d2 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk208.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk208.c
@@ -176,10 +176,13 @@ gk208_gr = {
 	.init_ppc_exceptions = gk104_gr_init_ppc_exceptions,
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_shader_exceptions = gf100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gf100_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gk208_gr_pack_mmio,
 	.fecs.ucode = &gk208_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gk208_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.ppc_nr = 1,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c
index be0b2cefd8e8..035ea213f543 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c
@@ -33,47 +33,40 @@ struct gk20a_fw_av
 	u32 data;
 };
 
-static int
-gk20a_gr_av_to_init(struct gf100_gr *gr, const char *path, const char *name,
-		    int ver, struct gf100_gr_pack **ppack)
+int
+gk20a_gr_av_to_init_(struct nvkm_blob *blob, u8 count, u32 pitch, struct gf100_gr_pack **ppack)
 {
-	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
-	struct nvkm_blob blob;
 	struct gf100_gr_init *init;
 	struct gf100_gr_pack *pack;
 	int nent;
-	int ret;
 	int i;
 
-	ret = nvkm_firmware_load_blob(subdev, path, name, ver, &blob);
-	if (ret)
-		return ret;
-
-	nent = (blob.size / sizeof(struct gk20a_fw_av));
+	nent = (blob->size / sizeof(struct gk20a_fw_av));
 	pack = vzalloc((sizeof(*pack) * 2) + (sizeof(*init) * (nent + 1)));
-	if (!pack) {
-		ret = -ENOMEM;
-		goto end;
-	}
+	if (!pack)
+		return -ENOMEM;
 
 	init = (void *)(pack + 2);
 	pack[0].init = init;
 
 	for (i = 0; i < nent; i++) {
 		struct gf100_gr_init *ent = &init[i];
-		struct gk20a_fw_av *av = &((struct gk20a_fw_av *)blob.data)[i];
+		struct gk20a_fw_av *av = &((struct gk20a_fw_av *)blob->data)[i];
 
 		ent->addr = av->addr;
 		ent->data = av->data;
-		ent->count = 1;
-		ent->pitch = 1;
+		ent->count = ((ent->addr & 0xffff) != 0xe100) ? count : 1;
+		ent->pitch = pitch;
 	}
 
 	*ppack = pack;
+	return 0;
+}
 
-end:
-	nvkm_blob_dtor(&blob);
-	return ret;
+int
+gk20a_gr_av_to_init(struct nvkm_blob *blob, struct gf100_gr_pack **ppack)
+{
+	return gk20a_gr_av_to_init_(blob, 1, 1, ppack);
 }
 
 struct gk20a_fw_aiv
@@ -83,35 +76,25 @@ struct gk20a_fw_aiv
 	u32 data;
 };
 
-static int
-gk20a_gr_aiv_to_init(struct gf100_gr *gr, const char *path, const char *name,
-		     int ver, struct gf100_gr_pack **ppack)
+int
+gk20a_gr_aiv_to_init(struct nvkm_blob *blob, struct gf100_gr_pack **ppack)
 {
-	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
-	struct nvkm_blob blob;
 	struct gf100_gr_init *init;
 	struct gf100_gr_pack *pack;
 	int nent;
-	int ret;
 	int i;
 
-	ret = nvkm_firmware_load_blob(subdev, path, name, ver, &blob);
-	if (ret)
-		return ret;
-
-	nent = (blob.size / sizeof(struct gk20a_fw_aiv));
+	nent = (blob->size / sizeof(struct gk20a_fw_aiv));
 	pack = vzalloc((sizeof(*pack) * 2) + (sizeof(*init) * (nent + 1)));
-	if (!pack) {
-		ret = -ENOMEM;
-		goto end;
-	}
+	if (!pack)
+		return -ENOMEM;
 
 	init = (void *)(pack + 2);
 	pack[0].init = init;
 
 	for (i = 0; i < nent; i++) {
 		struct gf100_gr_init *ent = &init[i];
-		struct gk20a_fw_aiv *av = &((struct gk20a_fw_aiv *)blob.data)[i];
+		struct gk20a_fw_aiv *av = &((struct gk20a_fw_aiv *)blob->data)[i];
 
 		ent->addr = av->addr;
 		ent->data = av->data;
@@ -120,44 +103,30 @@ gk20a_gr_aiv_to_init(struct gf100_gr *gr, const char *path, const char *name,
 	}
 
 	*ppack = pack;
-
-end:
-	nvkm_blob_dtor(&blob);
-	return ret;
+	return 0;
 }
 
-static int
-gk20a_gr_av_to_method(struct gf100_gr *gr, const char *path, const char *name,
-		      int ver, struct gf100_gr_pack **ppack)
+int
+gk20a_gr_av_to_method(struct nvkm_blob *blob, struct gf100_gr_pack **ppack)
 {
-	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
-	struct nvkm_blob blob;
 	struct gf100_gr_init *init;
 	struct gf100_gr_pack *pack;
 	/* We don't suppose we will initialize more than 16 classes here... */
 	static const unsigned int max_classes = 16;
 	u32 classidx = 0, prevclass = 0;
 	int nent;
-	int ret;
 	int i;
 
-	ret = nvkm_firmware_load_blob(subdev, path, name, ver, &blob);
-	if (ret)
-		return ret;
-
-	nent = (blob.size / sizeof(struct gk20a_fw_av));
-
+	nent = (blob->size / sizeof(struct gk20a_fw_av));
 	pack = vzalloc((sizeof(*pack) * (max_classes + 1)) +
 		       (sizeof(*init) * (nent + max_classes + 1)));
-	if (!pack) {
-		ret = -ENOMEM;
-		goto end;
-	}
+	if (!pack)
+		return -ENOMEM;
 
 	init = (void *)(pack + max_classes + 1);
 
 	for (i = 0; i < nent; i++, init++) {
-		struct gk20a_fw_av *av = &((struct gk20a_fw_av *)blob.data)[i];
+		struct gk20a_fw_av *av = &((struct gk20a_fw_av *)blob->data)[i];
 		u32 class = av->addr & 0xffff;
 		u32 addr = (av->addr & 0xffff0000) >> 14;
 
@@ -169,8 +138,7 @@ gk20a_gr_av_to_method(struct gf100_gr *gr, const char *path, const char *name,
 			prevclass = class;
 			if (++classidx >= max_classes) {
 				vfree(pack);
-				ret = -ENOSPC;
-				goto end;
+				return -ENOSPC;
 			}
 		}
 
@@ -181,10 +149,7 @@ gk20a_gr_av_to_method(struct gf100_gr *gr, const char *path, const char *name,
 	}
 
 	*ppack = pack;
-
-end:
-	nvkm_blob_dtor(&blob);
-	return ret;
+	return 0;
 }
 
 static int
@@ -294,6 +259,7 @@ gk20a_gr = {
 	.init_rop_active_fbps = gk104_gr_init_rop_active_fbps,
 	.trap_mp = gf100_gr_trap_mp,
 	.set_hww_esr_report_mask = gk20a_gr_set_hww_esr_report_mask,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gf100_gr_rops,
 	.ppc_nr = 1,
 	.grctx = &gk20a_grctx,
@@ -308,12 +274,29 @@ gk20a_gr = {
 };
 
 int
+gk20a_gr_load_net(struct gf100_gr *gr, const char *path, const char *name, int ver,
+		  int (*load)(struct nvkm_blob *, struct gf100_gr_pack **),
+		  struct gf100_gr_pack **ppack)
+{
+	struct nvkm_blob blob;
+	int ret;
+
+	ret = nvkm_firmware_load_blob(&gr->base.engine.subdev, path, name, ver, &blob);
+	if (ret)
+		return ret;
+
+	ret = load(&blob, ppack);
+	nvkm_blob_dtor(&blob);
+	return 0;
+}
+
+int
 gk20a_gr_load_sw(struct gf100_gr *gr, const char *path, int ver)
 {
-	if (gk20a_gr_av_to_init(gr, path, "sw_nonctx", ver, &gr->sw_nonctx) ||
-	    gk20a_gr_aiv_to_init(gr, path, "sw_ctx", ver, &gr->sw_ctx) ||
-	    gk20a_gr_av_to_init(gr, path, "sw_bundle_init", ver, &gr->bundle) ||
-	    gk20a_gr_av_to_method(gr, path, "sw_method_init", ver, &gr->method))
+	if (gk20a_gr_load_net(gr, path, "sw_nonctx", ver, gk20a_gr_av_to_init, &gr->sw_nonctx) ||
+	    gk20a_gr_load_net(gr, path, "sw_ctx", ver, gk20a_gr_aiv_to_init, &gr->sw_ctx) ||
+	    gk20a_gr_load_net(gr, path, "sw_bundle_init", ver, gk20a_gr_av_to_init, &gr->bundle) ||
+	    gk20a_gr_load_net(gr, path, "sw_method_init", ver, gk20a_gr_av_to_method, &gr->method))
 		return -ENOENT;
 
 	return 0;
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm107.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm107.c
index 310987174cb5..797b828a943b 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm107.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm107.c
@@ -411,10 +411,13 @@ gm107_gr = {
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_504430 = gm107_gr_init_504430,
 	.init_shader_exceptions = gm107_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gm107_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
 	.mmio = gm107_gr_pack_mmio,
 	.fecs.ucode = &gm107_gr_fecs_ucode,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.gpccs.ucode = &gm107_gr_gpccs_ucode,
 	.rops = gf100_gr_rops,
 	.ppc_nr = 2,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm200.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm200.c
index 385cfd91b266..b5210b31c1b2 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm200.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm200.c
@@ -148,11 +148,11 @@ gm200_gr_tile_map_2_8[] = {
 	0, 1, 1, 0, 0, 1, 1, 0,
 };
 
-void
+int
 gm200_gr_oneinit_sm_id(struct gf100_gr *gr)
 {
 	/*XXX: There's a different algorithm here I've not yet figured out. */
-	gf100_gr_oneinit_sm_id(gr);
+	return gf100_gr_oneinit_sm_id(gr);
 }
 
 void
@@ -199,8 +199,11 @@ gm200_gr = {
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_504430 = gm107_gr_init_504430,
 	.init_shader_exceptions = gm107_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_400054 = gm107_gr_init_400054,
 	.trap_mp = gf100_gr_trap_mp,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gm200_gr_rops,
 	.tpc_nr = 4,
 	.ppc_nr = 2,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm20b.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm20b.c
index ec1c46e47e00..458cd1a00d3f 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm20b.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm20b.c
@@ -123,6 +123,7 @@ gm20b_gr = {
 	.init_rop_active_fbps = gk104_gr_init_rop_active_fbps,
 	.trap_mp = gf100_gr_trap_mp,
 	.set_hww_esr_report_mask = gm20b_gr_set_hww_esr_report_mask,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gm200_gr_rops,
 	.ppc_nr = 1,
 	.grctx = &gm20b_grctx,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp100.c
index 0550dd6f46f1..851e743d2cab 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp100.c
@@ -87,7 +87,7 @@ gp100_gr_init_419c9c(struct gf100_gr *gr)
 void
 gp100_gr_init_fecs_exceptions(struct gf100_gr *gr)
 {
-	nvkm_wr32(gr->base.engine.subdev.device, 0x409c24, 0x000f0002);
+	nvkm_wr32(gr->base.engine.subdev.device, 0x409c24, 0x000e0002);
 }
 
 void
@@ -119,7 +119,10 @@ gp100_gr = {
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_504430 = gm107_gr_init_504430,
 	.init_shader_exceptions = gp100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.trap_mp = gf100_gr_trap_mp,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gm200_gr_rops,
 	.gpc_nr = 6,
 	.tpc_nr = 5,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp102.c
index 5b001f374be0..0e223b7b5f0e 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp102.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp102.c
@@ -26,7 +26,7 @@
 
 #include <nvif/class.h>
 
-static void
+void
 gp102_gr_zbc_clear_stencil(struct gf100_gr *gr, int zbc)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
@@ -40,14 +40,14 @@ gp102_gr_zbc_clear_stencil(struct gf100_gr *gr, int zbc)
 			  gr->zbc_stencil[zbc].format << ((znum % 4) * 7));
 }
 
-static int
+int
 gp102_gr_zbc_stencil_get(struct gf100_gr *gr, int format,
 			 const u32 ds, const u32 l2)
 {
 	struct nvkm_ltc *ltc = gr->base.engine.subdev.device->ltc;
 	int zbc = -ENOSPC, i;
 
-	for (i = ltc->zbc_min; i <= ltc->zbc_max; i++) {
+	for (i = ltc->zbc_depth_min; i <= ltc->zbc_depth_max; i++) {
 		if (gr->zbc_stencil[i].format) {
 			if (gr->zbc_stencil[i].format != format)
 				continue;
@@ -115,7 +115,10 @@ gp102_gr = {
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_504430 = gm107_gr_init_504430,
 	.init_shader_exceptions = gp100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.trap_mp = gf100_gr_trap_mp,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gm200_gr_rops,
 	.gpc_nr = 6,
 	.tpc_nr = 5,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp104.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp104.c
index 2655574ec63b..6802cb9b199f 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp104.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp104.c
@@ -43,7 +43,10 @@ gp104_gr = {
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_504430 = gm107_gr_init_504430,
 	.init_shader_exceptions = gp100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.trap_mp = gf100_gr_trap_mp,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gm200_gr_rops,
 	.gpc_nr = 6,
 	.tpc_nr = 5,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp107.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp107.c
index adabc04d4f3a..cc2bb0d0a987 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp107.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp107.c
@@ -45,7 +45,10 @@ gp107_gr = {
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_504430 = gm107_gr_init_504430,
 	.init_shader_exceptions = gp100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.trap_mp = gf100_gr_trap_mp,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gm200_gr_rops,
 	.gpc_nr = 2,
 	.tpc_nr = 3,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp108.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp108.c
index 7310f0466bb7..311f703439e4 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp108.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp108.c
@@ -25,7 +25,7 @@
 
 #include <nvfw/flcn.h>
 
-static void
+void
 gp108_gr_acr_bld_patch(struct nvkm_acr *acr, u32 bld, s64 adjust)
 {
 	struct flcn_bl_dmem_desc_v2 hdr;
@@ -36,7 +36,7 @@ gp108_gr_acr_bld_patch(struct nvkm_acr *acr, u32 bld, s64 adjust)
 	flcn_bl_dmem_desc_v2_dump(&acr->subdev, &hdr);
 }
 
-static void
+void
 gp108_gr_acr_bld_write(struct nvkm_acr *acr, u32 bld,
 		       struct nvkm_acr_lsfw *lsfw)
 {
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp10b.c
index e13683b6e7b1..5008881ca079 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp10b.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gp10b.c
@@ -55,7 +55,10 @@ gp10b_gr = {
 	.init_tex_hww_esr = gf100_gr_init_tex_hww_esr,
 	.init_504430 = gm107_gr_init_504430,
 	.init_shader_exceptions = gp100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.trap_mp = gf100_gr_trap_mp,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gm200_gr_rops,
 	.gpc_nr = 1,
 	.tpc_nr = 2,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gv100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gv100.c
index 4d043c1173ea..7f7404a76140 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gv100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gv100.c
@@ -52,10 +52,11 @@ gv100_gr_trap_mp(struct gf100_gr *gr, int gpc, int tpc)
 	gv100_gr_trap_sm(gr, gpc, tpc, 1);
 }
 
-static void
+void
 gv100_gr_init_4188a4(struct gf100_gr *gr)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
+
 	nvkm_mask(device, 0x4188a4, 0x03000000, 0x03000000);
 }
 
@@ -65,7 +66,6 @@ gv100_gr_init_shader_exceptions(struct gf100_gr *gr, int gpc, int tpc)
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 	int sm;
 	for (sm = 0; sm < 0x100; sm += 0x80) {
-		nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x728 + sm), 0x0085eb64);
 		nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x610), 0x00000001);
 		nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x72c + sm), 0x00000004);
 	}
@@ -85,10 +85,202 @@ gv100_gr_init_419bd8(struct gf100_gr *gr)
 	nvkm_mask(device, 0x419bd8, 0x00000700, 0x00000000);
 }
 
+u32
+gv100_gr_nonpes_aware_tpc(struct gf100_gr *gr, u32 gpc, u32 tpc)
+{
+	u32 pes, temp, tpc_new = 0;
+
+	for (pes = 0; pes < gr->ppc_nr[gpc]; pes++) {
+		if (gr->ppc_tpc_mask[gpc][pes] & BIT(tpc))
+			break;
+
+		tpc_new += gr->ppc_tpc_nr[gpc][pes];
+	}
+
+	temp = (BIT(tpc) - 1) & gr->ppc_tpc_mask[gpc][pes];
+	temp = hweight32(temp);
+	return tpc_new + temp;
+}
+
+static int
+gv100_gr_scg_estimate_perf(struct gf100_gr *gr, unsigned long *gpc_tpc_mask,
+			   u32 disable_gpc, u32 disable_tpc, int *perf)
+{
+	const u32 scale_factor = 512UL;		/* Use fx23.9 */
+	const u32 pix_scale = 1024*1024UL;	/* Pix perf in [29:20] */
+	const u32 world_scale = 1024UL;		/* World performance in [19:10] */
+	const u32 tpc_scale = 1;		/* TPC balancing in [9:0] */
+	u32 scg_num_pes = 0;
+	u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */
+	u32 average_tpcs = 0; /* Average of # of TPCs per GPC */
+	u32 deviation; /* absolute diff between TPC# and average_tpcs, averaged across GPCs */
+	u32 norm_tpc_deviation;	/* deviation/max_tpc_per_gpc */
+	u32 tpc_balance;
+	u32 scg_gpc_pix_perf;
+	u32 scg_world_perf;
+	u32 gpc;
+	u32 pes;
+	int diff;
+	bool tpc_removed_gpc = false;
+	bool tpc_removed_pes = false;
+	u32 max_tpc_gpc = 0;
+	u32 num_tpc_mask;
+	u32 *num_tpc_gpc;
+	int ret = -EINVAL;
+
+	if (!(num_tpc_gpc = kcalloc(gr->gpc_nr, sizeof(*num_tpc_gpc), GFP_KERNEL)))
+		return -ENOMEM;
+
+	/* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */
+	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
+		num_tpc_mask = gpc_tpc_mask[gpc];
+
+		if ((gpc == disable_gpc) && num_tpc_mask & BIT(disable_tpc)) {
+			/* Safety check if a TPC is removed twice */
+			if (WARN_ON(tpc_removed_gpc))
+				goto done;
+
+			/* Remove logical TPC from set */
+			num_tpc_mask &= ~BIT(disable_tpc);
+			tpc_removed_gpc = true;
+		}
+
+		/* track balancing of tpcs across gpcs */
+		num_tpc_gpc[gpc] = hweight32(num_tpc_mask);
+		average_tpcs += num_tpc_gpc[gpc];
+
+		/* save the maximum numer of gpcs */
+		max_tpc_gpc = num_tpc_gpc[gpc] > max_tpc_gpc ? num_tpc_gpc[gpc] : max_tpc_gpc;
+
+		/*
+		 * Calculate ratio between TPC count and post-FS and post-SCG
+		 *
+		 * ratio represents relative throughput of the GPC
+		 */
+		scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc] / gr->tpc_nr[gpc];
+		if (min_scg_gpc_pix_perf > scg_gpc_pix_perf)
+			min_scg_gpc_pix_perf = scg_gpc_pix_perf;
+
+		/* Calculate # of surviving PES */
+		for (pes = 0; pes < gr->ppc_nr[gpc]; pes++) {
+			/* Count the number of TPC on the set */
+			num_tpc_mask = gr->ppc_tpc_mask[gpc][pes] & gpc_tpc_mask[gpc];
+
+			if ((gpc == disable_gpc) && (num_tpc_mask & BIT(disable_tpc))) {
+				if (WARN_ON(tpc_removed_pes))
+					goto done;
+
+				num_tpc_mask &= ~BIT(disable_tpc);
+				tpc_removed_pes = true;
+			}
+
+			if (hweight32(num_tpc_mask))
+				scg_num_pes++;
+		}
+	}
+
+	if (WARN_ON(!tpc_removed_gpc || !tpc_removed_pes))
+		goto done;
+
+	if (max_tpc_gpc == 0) {
+		*perf = 0;
+		goto done_ok;
+	}
+
+	/* Now calculate perf */
+	scg_world_perf = (scale_factor * scg_num_pes) / gr->ppc_total;
+	deviation = 0;
+	average_tpcs = scale_factor * average_tpcs / gr->gpc_nr;
+	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
+		diff = average_tpcs - scale_factor * num_tpc_gpc[gpc];
+		if (diff < 0)
+			diff = -diff;
+
+		deviation += diff;
+	}
+
+	deviation /= gr->gpc_nr;
+
+	norm_tpc_deviation = deviation / max_tpc_gpc;
+
+	tpc_balance = scale_factor - norm_tpc_deviation;
+
+	if ((tpc_balance > scale_factor)          ||
+	    (scg_world_perf > scale_factor)       ||
+	    (min_scg_gpc_pix_perf > scale_factor) ||
+	    (norm_tpc_deviation > scale_factor)) {
+		WARN_ON(1);
+		goto done;
+	}
+
+	*perf = (pix_scale * min_scg_gpc_pix_perf) +
+		(world_scale * scg_world_perf) +
+		(tpc_scale * tpc_balance);
+done_ok:
+	ret = 0;
+done:
+	kfree(num_tpc_gpc);
+	return ret;
+}
+
+int
+gv100_gr_oneinit_sm_id(struct gf100_gr *gr)
+{
+	unsigned long *gpc_tpc_mask;
+	u32 *tpc_table, *gpc_table;
+	u32 gpc, tpc, pes, gtpc;
+	int perf, maxperf, ret = 0;
+
+	gpc_tpc_mask = kcalloc(gr->gpc_nr, sizeof(*gpc_tpc_mask), GFP_KERNEL);
+	gpc_table = kcalloc(gr->tpc_total, sizeof(*gpc_table), GFP_KERNEL);
+	tpc_table = kcalloc(gr->tpc_total, sizeof(*tpc_table), GFP_KERNEL);
+	if (!gpc_table || !tpc_table || !gpc_tpc_mask) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	for (gpc = 0; gpc < gr->gpc_nr; gpc++) {
+		for (pes = 0; pes < gr->ppc_nr[gpc]; pes++)
+			gpc_tpc_mask[gpc] |= gr->ppc_tpc_mask[gpc][pes];
+	}
+
+	for (gtpc = 0; gtpc < gr->tpc_total; gtpc++) {
+		for (maxperf = -1, gpc = 0; gpc < gr->gpc_nr; gpc++) {
+			for_each_set_bit(tpc, &gpc_tpc_mask[gpc], gr->tpc_nr[gpc]) {
+				ret = gv100_gr_scg_estimate_perf(gr, gpc_tpc_mask, gpc, tpc, &perf);
+				if (ret)
+					goto done;
+
+				/* nvgpu does ">=" here, but this gets us RM's numbers. */
+				if (perf > maxperf) {
+					maxperf = perf;
+					gpc_table[gtpc] = gpc;
+					tpc_table[gtpc] = tpc;
+				}
+			}
+		}
+
+		gpc_tpc_mask[gpc_table[gtpc]] &= ~BIT(tpc_table[gtpc]);
+	}
+
+	/*TODO: build table for sm_per_tpc != 1, don't use yet, but might need later? */
+	for (gtpc = 0; gtpc < gr->tpc_total; gtpc++) {
+		gr->sm[gtpc].gpc = gpc_table[gtpc];
+		gr->sm[gtpc].tpc = tpc_table[gtpc];
+		gr->sm_nr++;
+	}
+
+done:
+	kfree(gpc_table);
+	kfree(tpc_table);
+	kfree(gpc_tpc_mask);
+	return ret;
+}
+
 static const struct gf100_gr_func
 gv100_gr = {
 	.oneinit_tiles = gm200_gr_oneinit_tiles,
-	.oneinit_sm_id = gm200_gr_oneinit_sm_id,
+	.oneinit_sm_id = gv100_gr_oneinit_sm_id,
 	.init = gf100_gr_init,
 	.init_419bd8 = gv100_gr_init_419bd8,
 	.init_gpc_mmu = gm200_gr_init_gpc_mmu,
@@ -103,11 +295,14 @@ gv100_gr = {
 	.init_ppc_exceptions = gk104_gr_init_ppc_exceptions,
 	.init_504430 = gv100_gr_init_504430,
 	.init_shader_exceptions = gv100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
 	.init_4188a4 = gv100_gr_init_4188a4,
 	.trap_mp = gv100_gr_trap_mp,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gm200_gr_rops,
 	.gpc_nr = 6,
-	.tpc_nr = 5,
+	.tpc_nr = 7,
 	.ppc_nr = 3,
 	.grctx = &gv100_grctx,
 	.zbc = &gp102_gr_zbc,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv04.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv04.c
index 0bc1a238de43..81bd682c2102 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv04.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv04.c
@@ -1192,7 +1192,7 @@ nv04_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		return -ENOMEM;
 	nvkm_object_ctor(&nv04_gr_chan, oclass, &chan->object);
 	chan->gr = gr;
-	chan->chid = fifoch->chid;
+	chan->chid = fifoch->id;
 	*pobject = &chan->object;
 
 	*ctx_reg(chan, NV04_PGRAPH_DEBUG_3) = 0xfad4ff31;
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv10.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv10.c
index 942450b33bc6..7fe6e58f6bab 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv10.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv10.c
@@ -1011,7 +1011,7 @@ nv10_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		return -ENOMEM;
 	nvkm_object_ctor(&nv10_gr_chan, oclass, &chan->object);
 	chan->gr = gr;
-	chan->chid = fifoch->chid;
+	chan->chid = fifoch->id;
 	*pobject = &chan->object;
 
 	NV_WRITE_CTX(0x00400e88, 0x08000000);
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.c
index 6bff10cee71b..75434f5de7ad 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.c
@@ -83,7 +83,7 @@ nv20_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		return -ENOMEM;
 	nvkm_object_ctor(&nv20_gr_chan, oclass, &chan->object);
 	chan->gr = gr;
-	chan->chid = fifoch->chid;
+	chan->chid = fifoch->id;
 	*pobject = &chan->object;
 
 	ret = nvkm_memory_new(gr->base.engine.subdev.device,
@@ -182,7 +182,7 @@ nv20_gr_intr(struct nvkm_gr *base)
 	struct nv20_gr *gr = nv20_gr(base);
 	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
 	struct nvkm_device *device = subdev->device;
-	struct nvkm_fifo_chan *chan;
+	struct nvkm_chan *chan;
 	u32 stat = nvkm_rd32(device, NV03_PGRAPH_INTR);
 	u32 nsource = nvkm_rd32(device, NV03_PGRAPH_NSOURCE);
 	u32 nstatus = nvkm_rd32(device, NV03_PGRAPH_NSTATUS);
@@ -196,7 +196,7 @@ nv20_gr_intr(struct nvkm_gr *base)
 	char msg[128], src[128], sta[128];
 	unsigned long flags;
 
-	chan = nvkm_fifo_chan_chid(device->fifo, chid, &flags);
+	chan = nvkm_chan_get_chid(&gr->base.engine, chid, &flags);
 
 	nvkm_wr32(device, NV03_PGRAPH_INTR, stat);
 	nvkm_wr32(device, NV04_PGRAPH_FIFO, 0x00000001);
@@ -209,11 +209,11 @@ nv20_gr_intr(struct nvkm_gr *base)
 				   "nstatus %08x [%s] ch %d [%s] subc %d "
 				   "class %04x mthd %04x data %08x\n",
 			   show, msg, nsource, src, nstatus, sta, chid,
-			   chan ? chan->object.client->name : "unknown",
+			   chan ? chan->name : "unknown",
 			   subc, class, mthd, data);
 	}
 
-	nvkm_fifo_chan_put(device->fifo, flags, &chan);
+	nvkm_chan_put(&chan, flags);
 }
 
 int
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv25.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv25.c
index f3a56f17d94a..94685e4d4f87 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv25.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv25.c
@@ -29,7 +29,7 @@ nv25_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		return -ENOMEM;
 	nvkm_object_ctor(&nv25_gr_chan, oclass, &chan->object);
 	chan->gr = gr;
-	chan->chid = fifoch->chid;
+	chan->chid = fifoch->id;
 	*pobject = &chan->object;
 
 	ret = nvkm_memory_new(gr->base.engine.subdev.device,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv2a.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv2a.c
index f268d2642d29..2d6273675291 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv2a.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv2a.c
@@ -29,7 +29,7 @@ nv2a_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		return -ENOMEM;
 	nvkm_object_ctor(&nv2a_gr_chan, oclass, &chan->object);
 	chan->gr = gr;
-	chan->chid = fifoch->chid;
+	chan->chid = fifoch->id;
 	*pobject = &chan->object;
 
 	ret = nvkm_memory_new(gr->base.engine.subdev.device,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv30.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv30.c
index e5737cdf2fa1..647bd6fede04 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv30.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv30.c
@@ -30,7 +30,7 @@ nv30_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		return -ENOMEM;
 	nvkm_object_ctor(&nv30_gr_chan, oclass, &chan->object);
 	chan->gr = gr;
-	chan->chid = fifoch->chid;
+	chan->chid = fifoch->id;
 	*pobject = &chan->object;
 
 	ret = nvkm_memory_new(gr->base.engine.subdev.device,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv34.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv34.c
index 1ab2da8ebf4e..2eae3fe4ef4e 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv34.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv34.c
@@ -29,7 +29,7 @@ nv34_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		return -ENOMEM;
 	nvkm_object_ctor(&nv34_gr_chan, oclass, &chan->object);
 	chan->gr = gr;
-	chan->chid = fifoch->chid;
+	chan->chid = fifoch->id;
 	*pobject = &chan->object;
 
 	ret = nvkm_memory_new(gr->base.engine.subdev.device,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv35.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv35.c
index 591260f5676b..657d7cdba369 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv35.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv35.c
@@ -29,7 +29,7 @@ nv35_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		return -ENOMEM;
 	nvkm_object_ctor(&nv35_gr_chan, oclass, &chan->object);
 	chan->gr = gr;
-	chan->chid = fifoch->chid;
+	chan->chid = fifoch->id;
 	*pobject = &chan->object;
 
 	ret = nvkm_memory_new(gr->base.engine.subdev.device,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv40.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv40.c
index 67f3535ff97e..d2df097a6cf6 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv40.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv40.c
@@ -275,8 +275,8 @@ nv40_gr_intr(struct nvkm_gr *base)
 				   "nstatus %08x [%s] ch %d [%08x %s] subc %d "
 				   "class %04x mthd %04x data %08x\n",
 			   show, msg, nsource, src, nstatus, sta,
-			   chan ? chan->fifo->chid : -1, inst << 4,
-			   chan ? chan->fifo->object.client->name : "unknown",
+			   chan ? chan->fifo->id : -1, inst << 4,
+			   chan ? chan->fifo->name : "unknown",
 			   subc, class, mthd, data);
 	}
 
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv50.c
index 563a10097e95..1ba18a8e380f 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv50.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv50.c
@@ -622,7 +622,7 @@ nv50_gr_intr(struct nvkm_gr *base)
 	struct nv50_gr *gr = nv50_gr(base);
 	struct nvkm_subdev *subdev = &gr->base.engine.subdev;
 	struct nvkm_device *device = subdev->device;
-	struct nvkm_fifo_chan *chan;
+	struct nvkm_chan *chan;
 	u32 stat = nvkm_rd32(device, 0x400100);
 	u32 inst = nvkm_rd32(device, 0x40032c) & 0x0fffffff;
 	u32 addr = nvkm_rd32(device, 0x400704);
@@ -637,10 +637,10 @@ nv50_gr_intr(struct nvkm_gr *base)
 	char msg[128];
 	int chid = -1;
 
-	chan = nvkm_fifo_chan_inst(device->fifo, (u64)inst << 12, &flags);
+	chan = nvkm_chan_get_inst(&gr->base.engine, (u64)inst << 12, &flags);
 	if (chan)  {
-		name = chan->object.client->name;
-		chid = chan->chid;
+		name = chan->name;
+		chid = chan->id;
 	}
 
 	if (show & 0x00100000) {
@@ -672,7 +672,7 @@ nv50_gr_intr(struct nvkm_gr *base)
 	if (nvkm_rd32(device, 0x400824) & (1 << 31))
 		nvkm_wr32(device, 0x400824, nvkm_rd32(device, 0x400824) & ~(1 << 31));
 
-	nvkm_fifo_chan_put(device->fifo, flags, &chan);
+	nvkm_chan_put(&chan, flags);
 }
 
 int
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/priv.h b/drivers/gpu/drm/nouveau/nvkm/engine/gr/priv.h
index 9b2c66e8be90..08d5c96e6458 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/priv.h
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/priv.h
@@ -17,6 +17,7 @@ struct nvkm_gr_func {
 	int (*oneinit)(struct nvkm_gr *);
 	int (*init)(struct nvkm_gr *);
 	int (*fini)(struct nvkm_gr *, bool);
+	int (*reset)(struct nvkm_gr *);
 	void (*intr)(struct nvkm_gr *);
 	void (*tile)(struct nvkm_gr *, int region, struct nvkm_fb_tile *);
 	int (*tlb_flush)(struct nvkm_gr *);
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c
index 1a8a21844e12..3b6c8100a242 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c
@@ -24,13 +24,13 @@
 
 #include <nvif/class.h>
 
-static void
+void
 tu102_gr_init_fecs_exceptions(struct gf100_gr *gr)
 {
-	nvkm_wr32(gr->base.engine.subdev.device, 0x409c24, 0x006f0002);
+	nvkm_wr32(gr->base.engine.subdev.device, 0x409c24, 0x006e0003);
 }
 
-static void
+void
 tu102_gr_init_fs(struct gf100_gr *gr)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
@@ -40,20 +40,21 @@ tu102_gr_init_fs(struct gf100_gr *gr)
 	gk104_grctx_generate_gpc_tpc_nr(gr);
 
 	for (sm = 0; sm < gr->sm_nr; sm++) {
-		nvkm_wr32(device, GPC_UNIT(gr->sm[sm].gpc, 0x0c10 +
-					   gr->sm[sm].tpc * 4), sm);
+		int tpc = gv100_gr_nonpes_aware_tpc(gr, gr->sm[sm].gpc, gr->sm[sm].tpc);
+
+		nvkm_wr32(device, GPC_UNIT(gr->sm[sm].gpc, 0x0c10 + tpc * 4), sm);
 	}
 
 	gm200_grctx_generate_dist_skip_table(gr);
 	gf100_gr_init_num_tpc_per_gpc(gr, true, true);
 }
 
-static void
+void
 tu102_gr_init_zcull(struct gf100_gr *gr)
 {
 	struct nvkm_device *device = gr->base.engine.subdev.device;
 	const u32 magicgpc918 = DIV_ROUND_UP(0x00800000, gr->tpc_total);
-	const u8 tile_nr = ALIGN(gr->tpc_total, 64);
+	const u8 tile_nr = gr->func->gpc_nr * gr->func->tpc_nr;
 	u8 bank[GPC_MAX] = {}, gpc, i, j;
 	u32 data;
 
@@ -93,7 +94,7 @@ tu102_gr_init_gpc_mmu(struct gf100_gr *gr)
 static const struct gf100_gr_func
 tu102_gr = {
 	.oneinit_tiles = gm200_gr_oneinit_tiles,
-	.oneinit_sm_id = gm200_gr_oneinit_sm_id,
+	.oneinit_sm_id = gv100_gr_oneinit_sm_id,
 	.init = gf100_gr_init,
 	.init_419bd8 = gv100_gr_init_419bd8,
 	.init_gpc_mmu = tu102_gr_init_gpc_mmu,
@@ -109,10 +110,14 @@ tu102_gr = {
 	.init_ppc_exceptions = gk104_gr_init_ppc_exceptions,
 	.init_504430 = gv100_gr_init_504430,
 	.init_shader_exceptions = gv100_gr_init_shader_exceptions,
+	.init_rop_exceptions = gf100_gr_init_rop_exceptions,
+	.init_exception2 = gf100_gr_init_exception2,
+	.init_4188a4 = gv100_gr_init_4188a4,
 	.trap_mp = gv100_gr_trap_mp,
+	.fecs.reset = gf100_gr_fecs_reset,
 	.rops = gm200_gr_rops,
 	.gpc_nr = 6,
-	.tpc_nr = 5,
+	.tpc_nr = 6,
 	.ppc_nr = 3,
 	.grctx = &tu102_grctx,
 	.zbc = &gp102_gr_zbc,
@@ -137,6 +142,7 @@ MODULE_FIRMWARE("nvidia/tu102/gr/sw_ctx.bin");
 MODULE_FIRMWARE("nvidia/tu102/gr/sw_nonctx.bin");
 MODULE_FIRMWARE("nvidia/tu102/gr/sw_bundle_init.bin");
 MODULE_FIRMWARE("nvidia/tu102/gr/sw_method_init.bin");
+MODULE_FIRMWARE("nvidia/tu102/gr/sw_veid_bundle_init.bin");
 
 MODULE_FIRMWARE("nvidia/tu104/gr/fecs_bl.bin");
 MODULE_FIRMWARE("nvidia/tu104/gr/fecs_inst.bin");
@@ -150,6 +156,7 @@ MODULE_FIRMWARE("nvidia/tu104/gr/sw_ctx.bin");
 MODULE_FIRMWARE("nvidia/tu104/gr/sw_nonctx.bin");
 MODULE_FIRMWARE("nvidia/tu104/gr/sw_bundle_init.bin");
 MODULE_FIRMWARE("nvidia/tu104/gr/sw_method_init.bin");
+MODULE_FIRMWARE("nvidia/tu104/gr/sw_veid_bundle_init.bin");
 
 MODULE_FIRMWARE("nvidia/tu106/gr/fecs_bl.bin");
 MODULE_FIRMWARE("nvidia/tu106/gr/fecs_inst.bin");
@@ -163,6 +170,7 @@ MODULE_FIRMWARE("nvidia/tu106/gr/sw_ctx.bin");
 MODULE_FIRMWARE("nvidia/tu106/gr/sw_nonctx.bin");
 MODULE_FIRMWARE("nvidia/tu106/gr/sw_bundle_init.bin");
 MODULE_FIRMWARE("nvidia/tu106/gr/sw_method_init.bin");
+MODULE_FIRMWARE("nvidia/tu106/gr/sw_veid_bundle_init.bin");
 
 MODULE_FIRMWARE("nvidia/tu117/gr/fecs_bl.bin");
 MODULE_FIRMWARE("nvidia/tu117/gr/fecs_inst.bin");
@@ -176,6 +184,7 @@ MODULE_FIRMWARE("nvidia/tu117/gr/sw_ctx.bin");
 MODULE_FIRMWARE("nvidia/tu117/gr/sw_nonctx.bin");
 MODULE_FIRMWARE("nvidia/tu117/gr/sw_bundle_init.bin");
 MODULE_FIRMWARE("nvidia/tu117/gr/sw_method_init.bin");
+MODULE_FIRMWARE("nvidia/tu117/gr/sw_veid_bundle_init.bin");
 
 MODULE_FIRMWARE("nvidia/tu116/gr/fecs_bl.bin");
 MODULE_FIRMWARE("nvidia/tu116/gr/fecs_inst.bin");
@@ -189,6 +198,26 @@ MODULE_FIRMWARE("nvidia/tu116/gr/sw_ctx.bin");
 MODULE_FIRMWARE("nvidia/tu116/gr/sw_nonctx.bin");
 MODULE_FIRMWARE("nvidia/tu116/gr/sw_bundle_init.bin");
 MODULE_FIRMWARE("nvidia/tu116/gr/sw_method_init.bin");
+MODULE_FIRMWARE("nvidia/tu116/gr/sw_veid_bundle_init.bin");
+
+int
+tu102_gr_av_to_init_veid(struct nvkm_blob *blob, struct gf100_gr_pack **ppack)
+{
+	return gk20a_gr_av_to_init_(blob, 64, 0x00100000, ppack);
+}
+
+int
+tu102_gr_load(struct gf100_gr *gr, int ver, const struct gf100_gr_fwif *fwif)
+{
+	int ret;
+
+	ret = gm200_gr_load(gr, ver, fwif);
+	if (ret)
+		return ret;
+
+	return gk20a_gr_load_net(gr, "gr/", "sw_veid_bundle_init", ver, tu102_gr_av_to_init_veid,
+				 &gr->bundle_veid);
+}
 
 static const struct gf100_gr_fwif
 tu102_gr_fwif[] = {