167 files changed, 5472 insertions, 650 deletions
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index 1b5042f134a8..80c1be5d566c 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -124,6 +124,13 @@ OPTIONS
 --group::
 	Show event group information together
 
+--demangle::
+	Demangle symbol names to human readable form. It's enabled by default,
+	disable with --no-demangle.
+
+--demangle-kernel::
+	Demangle kernel symbol names to human readable form (for C++ kernels).
+
 --percent-type::
 	Set annotation percent type from following choices:
 	  global-period, local-period, global-hits, local-hits
diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index bb167e32a1d7..cd8ce6e8ec12 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -57,7 +57,7 @@ OPTIONS
 -u::
 --update=::
 	Update specified file of the cache. Note that this doesn't remove
-	older entires since those may be still needed for annotating old
+	older entries since those may be still needed for annotating old
 	(or remote) perf.data. Only if there is already a cache which has
 	exactly same build-id, that is replaced by new one. It can be used
 	to update kallsyms and kernel dso to vmlinux in order to support
diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt
index 153bde14bbe0..154a1ced72b2 100644
--- a/tools/perf/Documentation/perf-config.txt
+++ b/tools/perf/Documentation/perf-config.txt
@@ -393,6 +393,12 @@ annotate.*::
 
 		This option works with tui, stdio2 browsers.
 
+	annotate.demangle::
+		Demangle symbol names to human readable form. Default is 'true'.
+
+	annotate.demangle_kernel::
+		Demangle kernel symbol names to human readable form. Default is 'true'.
+
 hist.*::
 	hist.percentage::
 		This option control the way to calculate overhead of filtered entries -
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index f546b5e9db05..f51f0000676e 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -112,6 +112,8 @@ OPTIONS
 	- ins_lat: Instruction latency in core cycles. This is the global instruction
 	  latency
 	- local_ins_lat: Local instruction latency version
+	- p_stage_cyc: On powerpc, this presents the number of cycles spent in a
+	  pipeline stage. And currently supported only on powerpc.
 
 	By default, comm, dso and symbol keys are used.
 	(i.e. --sort comm,dso,symbol)
@@ -224,6 +226,9 @@ OPTIONS
 --dump-raw-trace::
         Dump raw trace in ASCII.
 
+--disable-order::
+	Disable raw trace ordering.
+
 -g::
 --call-graph=<print_type,threshold[,print_limit],order,sort_key[,branch],value>::
         Display call chains using type, min percent threshold, print limit,
@@ -472,7 +477,7 @@ OPTIONS
 	but probably we'll make the default not to show the switch-on/off events
         on the --group mode and if there is only one event besides the off/on ones,
 	go straight to the histogram browser, just like 'perf report' with no events
-	explicitely specified does.
+	explicitly specified does.
 
 --itrace::
 	Options for decoding instruction tracing data. The options are:
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 08a1714494f8..6ec5960b08c3 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -93,6 +93,17 @@ report::
 
         1.102235068 seconds time elapsed
 
+--bpf-counters::
+	Use BPF programs to aggregate readings from perf_events.  This
+	allows multiple perf-stat sessions that are counting the same metric (cycles,
+	instructions, etc.) to share hardware counters.
+
+--bpf-attr-map::
+	With option "--bpf-counters", different perf-stat sessions share
+	information about shared BPF programs and maps via a pinned hashmap.
+	Use "--bpf-attr-map" to specify the path of this pinned hashmap.
+	The default path is /sys/fs/bpf/perf_attr_map.
+
 ifdef::HAVE_LIBPFM[]
 --pfm-events events::
 Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net)
@@ -142,7 +153,10 @@ Do not aggregate counts across all monitored CPUs.
 
 -n::
 --null::
-        null run - don't start any counters
+null run - Don't start any counters.
+
+This can be useful to measure just elapsed wall-clock time - or to assess the
+raw overhead of perf stat itself, without running any counters.
 
 -v::
 --verbose::
@@ -468,6 +482,15 @@ convenient for post processing.
 --summary::
 Print summary for interval mode (-I).
 
+--no-csv-summary::
+Don't print 'summary' at the first column for CVS summary output.
+This option must be used with -x and --summary.
+
+This option can be enabled in perf config by setting the variable
+'stat.no-csv-summary'.
+
+$ perf config stat.no-csv-summary=true
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index ee2024691d46..bba5ffb05463 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -317,7 +317,7 @@ Default is to monitor all CPUS.
 	but probably we'll make the default not to show the switch-on/off events
         on the --group mode and if there is only one event besides the off/on ones,
 	go straight to the histogram browser, just like 'perf top' with no events
-	explicitely specified does.
+	explicitly specified does.
 
 --stitch-lbr::
 	Show callgraph with stitched LBRs, which may have more complete
diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt
index c130a3c46a90..9c330cdfa973 100644
--- a/tools/perf/Documentation/perf.txt
+++ b/tools/perf/Documentation/perf.txt
@@ -76,3 +76,15 @@ SEE ALSO
 linkperf:perf-stat[1], linkperf:perf-top[1],
 linkperf:perf-record[1], linkperf:perf-report[1],
 linkperf:perf-list[1]
+
+linkperf:perf-annotate[1],linkperf:perf-archive[1],
+linkperf:perf-bench[1], linkperf:perf-buildid-cache[1],
+linkperf:perf-buildid-list[1], linkperf:perf-c2c[1],
+linkperf:perf-config[1], linkperf:perf-data[1], linkperf:perf-diff[1],
+linkperf:perf-evlist[1], linkperf:perf-ftrace[1],
+linkperf:perf-help[1], linkperf:perf-inject[1],
+linkperf:perf-intel-pt[1], linkperf:perf-kallsyms[1],
+linkperf:perf-kmem[1], linkperf:perf-kvm[1], linkperf:perf-lock[1],
+linkperf:perf-mem[1], linkperf:perf-probe[1], linkperf:perf-sched[1],
+linkperf:perf-script[1], linkperf:perf-test[1],
+linkperf:perf-trace[1], linkperf:perf-version[1]
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index d8e59d31399a..3514fe956ed1 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -32,7 +32,7 @@ ifneq ($(NO_SYSCALL_TABLE),1)
       NO_SYSCALL_TABLE := 0
     endif
   else
-    ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390))
+    ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390 mips))
       NO_SYSCALL_TABLE := 0
     endif
   endif
@@ -87,6 +87,13 @@ ifeq ($(ARCH),s390)
   CFLAGS += -fPIC -I$(OUTPUT)arch/s390/include/generated
 endif
 
+ifeq ($(ARCH),mips)
+  NO_PERF_REGS := 0
+  CFLAGS += -I$(OUTPUT)arch/mips/include/generated
+  CFLAGS += -I../../arch/mips/include/uapi -I../../arch/mips/include/generated/uapi
+  LIBUNWIND_LIBS = -lunwind -lunwind-mips
+endif
+
 ifeq ($(NO_PERF_REGS),0)
   $(call detected,CONFIG_PERF_REGS)
 endif
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index f6e609673de2..090fb9d62665 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1007,6 +1007,7 @@ python-clean:
 SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel)
 SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
 SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
+SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
 
 ifdef BUILD_BPF_SKEL
 BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool
@@ -1021,7 +1022,7 @@ $(BPFTOOL): | $(SKEL_TMP_OUT)
 		OUTPUT=$(SKEL_TMP_OUT)/ bootstrap
 
 $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
-	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf $(BPF_INCLUDE) \
+	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \
 	  -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
 
 $(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL)
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index c25c878fd06c..d942f118d32c 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -67,6 +67,7 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr,
 	char path[PATH_MAX];
 	int err = -EINVAL;
 	u32 val;
+	u64 contextid;
 
 	ptr = container_of(itr, struct cs_etm_recording, itr);
 	cs_etm_pmu = ptr->cs_etm_pmu;
@@ -86,25 +87,59 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr,
 		goto out;
 	}
 
+	/* User has configured for PID tracing, respects it. */
+	contextid = evsel->core.attr.config &
+			(BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_CTXTID2));
+
 	/*
-	 * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID tracing
-	 * is supported:
-	 *  0b00000 Context ID tracing is not supported.
-	 *  0b00100 Maximum of 32-bit Context ID size.
-	 *  All other values are reserved.
+	 * If user doesn't configure the contextid format, parse PMU format and
+	 * enable PID tracing according to the "contextid" format bits:
+	 *
+	 *   If bit ETM_OPT_CTXTID is set, trace CONTEXTIDR_EL1;
+	 *   If bit ETM_OPT_CTXTID2 is set, trace CONTEXTIDR_EL2.
 	 */
-	val = BMVAL(val, 5, 9);
-	if (!val || val != 0x4) {
-		err = -EINVAL;
-		goto out;
+	if (!contextid)
+		contextid = perf_pmu__format_bits(&cs_etm_pmu->format,
+						  "contextid");
+
+	if (contextid & BIT(ETM_OPT_CTXTID)) {
+		/*
+		 * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID
+		 * tracing is supported:
+		 *  0b00000 Context ID tracing is not supported.
+		 *  0b00100 Maximum of 32-bit Context ID size.
+		 *  All other values are reserved.
+		 */
+		val = BMVAL(val, 5, 9);
+		if (!val || val != 0x4) {
+			pr_err("%s: CONTEXTIDR_EL1 isn't supported\n",
+			       CORESIGHT_ETM_PMU_NAME);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (contextid & BIT(ETM_OPT_CTXTID2)) {
+		/*
+		 * TRCIDR2.VMIDOPT[30:29] != 0 and
+		 * TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid)
+		 * We can't support CONTEXTIDR in VMID if the size of the
+		 * virtual context id is < 32bit.
+		 * Any value of VMIDSIZE >= 4 (i.e, > 32bit) is fine for us.
+		 */
+		if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) {
+			pr_err("%s: CONTEXTIDR_EL2 isn't supported\n",
+			       CORESIGHT_ETM_PMU_NAME);
+			err = -EINVAL;
+			goto out;
+		}
 	}
 
 	/* All good, let the kernel know */
-	evsel->core.attr.config |= (1 << ETM_OPT_CTXTID);
+	evsel->core.attr.config |= contextid;
 	err = 0;
 
 out:
-
 	return err;
 }
 
@@ -173,17 +208,17 @@ static int cs_etm_set_option(struct auxtrace_record *itr,
 		    !cpu_map__has(online_cpus, i))
 			continue;
 
-		if (option & ETM_SET_OPT_CTXTID) {
+		if (option & BIT(ETM_OPT_CTXTID)) {
 			err = cs_etm_set_context_id(itr, evsel, i);
 			if (err)
 				goto out;
 		}
-		if (option & ETM_SET_OPT_TS) {
+		if (option & BIT(ETM_OPT_TS)) {
 			err = cs_etm_set_timestamp(itr, evsel, i);
 			if (err)
 				goto out;
 		}
-		if (option & ~(ETM_SET_OPT_MASK))
+		if (option & ~(BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS)))
 			/* Nothing else is currently supported */
 			goto out;
 	}
@@ -343,7 +378,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
 			opts->auxtrace_mmap_pages = roundup_pow_of_two(sz);
 		}
 
-		/* Snapshost size can't be bigger than the auxtrace area */
+		/* Snapshot size can't be bigger than the auxtrace area */
 		if (opts->auxtrace_snapshot_size >
 				opts->auxtrace_mmap_pages * (size_t)page_size) {
 			pr_err("Snapshot size %zu must not be greater than AUX area tracing mmap size %zu\n",
@@ -410,7 +445,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
 		evsel__set_sample_bit(cs_etm_evsel, CPU);
 
 		err = cs_etm_set_option(itr, cs_etm_evsel,
-					ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS);
+					BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS));
 		if (err)
 			goto out;
 	}
@@ -489,7 +524,9 @@ static u64 cs_etmv4_get_config(struct auxtrace_record *itr)
 		config |= BIT(ETM4_CFG_BIT_TS);
 	if (config_opts & BIT(ETM_OPT_RETSTK))
 		config |= BIT(ETM4_CFG_BIT_RETSTK);
-
+	if (config_opts & BIT(ETM_OPT_CTXTID2))
+		config |= BIT(ETM4_CFG_BIT_VMID) |
+			  BIT(ETM4_CFG_BIT_VMID_OPT);
 	return config;
 }
 
@@ -576,7 +613,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset,
 				struct auxtrace_record *itr,
 				struct perf_record_auxtrace_info *info)
 {
-	u32 increment;
+	u32 increment, nr_trc_params;
 	u64 magic;
 	struct cs_etm_recording *ptr =
 			container_of(itr, struct cs_etm_recording, itr);
@@ -611,6 +648,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset,
 
 		/* How much space was used */
 		increment = CS_ETMV4_PRIV_MAX;
+		nr_trc_params = CS_ETMV4_PRIV_MAX - CS_ETMV4_TRCCONFIGR;
 	} else {
 		magic = __perf_cs_etmv3_magic;
 		/* Get configuration register */
@@ -628,11 +666,13 @@ static void cs_etm_get_metadata(int cpu, u32 *offset,
 
 		/* How much space was used */
 		increment = CS_ETM_PRIV_MAX;
+		nr_trc_params = CS_ETM_PRIV_MAX - CS_ETM_ETMCR;
 	}
 
 	/* Build generic header portion */
 	info->priv[*offset + CS_ETM_MAGIC] = magic;
 	info->priv[*offset + CS_ETM_CPU] = cpu;
+	info->priv[*offset + CS_ETM_NR_TRC_PARAMS] = nr_trc_params;
 	/* Where the next CPU entry should start from */
 	*offset += increment;
 }
@@ -678,7 +718,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr,
 
 	/* First fill out the session header */
 	info->type = PERF_AUXTRACE_CS_ETM;
-	info->priv[CS_HEADER_VERSION_0] = 0;
+	info->priv[CS_HEADER_VERSION] = CS_HEADER_CURRENT_VERSION;
 	info->priv[CS_PMU_TYPE_CPUS] = type << 32;
 	info->priv[CS_PMU_TYPE_CPUS] |= nr_cpu;
 	info->priv[CS_ETM_SNAPSHOT] = ptr->snapshot_mode;
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index ead2f2275eee..9fcb4e68add9 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -2,6 +2,7 @@ perf-y += header.o
 perf-y += machine.o
 perf-y += perf_regs.o
 perf-y += tsc.o
+perf-y += pmu.o
 perf-y += kvm-stat.o
 perf-$(CONFIG_DWARF)     += dwarf-regs.o
 perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/arm64/util/machine.c b/tools/perf/arch/arm64/util/machine.c
index 40c5e0b5bda8..7e7714290a87 100644
--- a/tools/perf/arch/arm64/util/machine.c
+++ b/tools/perf/arch/arm64/util/machine.c
@@ -6,11 +6,11 @@
 #include "debug.h"
 #include "symbol.h"
 
-/* On arm64, kernel text segment start at high memory address,
+/* On arm64, kernel text segment starts at high memory address,
  * for example 0xffff 0000 8xxx xxxx. Modules start at a low memory
- * address, like 0xffff 0000 00ax xxxx. When only samll amount of
+ * address, like 0xffff 0000 00ax xxxx. When only small amount of
  * memory is used by modules, gap between end of module's text segment
- * and start of kernel text segment may be reach 2G.
+ * and start of kernel text segment may reach 2G.
  * Therefore do not fill this gap and do not assign it to the kernel dso map.
  */
 
diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c
index 2518cde18b34..476b037eea1c 100644
--- a/tools/perf/arch/arm64/util/perf_regs.c
+++ b/tools/perf/arch/arm64/util/perf_regs.c
@@ -108,7 +108,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 		/* [sp], [sp, NUM] or [sp,NUM] */
 		new_len = 7;	/* + ( % s p ) NULL */
 
-		/* If the arugment is [sp], need to fill offset '0' */
+		/* If the argument is [sp], need to fill offset '0' */
 		if (rm[2].rm_so == -1)
 			new_len += 1;
 		else
diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c
new file mode 100644
index 000000000000..d3259d61ca75
--- /dev/null
+++ b/tools/perf/arch/arm64/util/pmu.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../../util/cpumap.h"
+#include "../../util/pmu.h"
+
+struct pmu_events_map *pmu_events_map__find(void)
+{
+	struct perf_pmu *pmu = NULL;
+
+	while ((pmu = perf_pmu__scan(pmu))) {
+		if (!is_pmu_core(pmu->name))
+			continue;
+
+		/*
+		 * The cpumap should cover all CPUs. Otherwise, some CPUs may
+		 * not support some events or have different event IDs.
+		 */
+		if (pmu->cpus->nr != cpu__max_cpu())
+			return NULL;
+
+		return perf_pmu__find_map(pmu);
+	}
+
+	return NULL;
+}
diff --git a/tools/perf/arch/mips/Makefile b/tools/perf/arch/mips/Makefile
new file mode 100644
index 000000000000..8bc09072e3d6
--- /dev/null
+++ b/tools/perf/arch/mips/Makefile
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+endif
+
+# Syscall table generation for perf
+out    := $(OUTPUT)arch/mips/include/generated/asm
+header := $(out)/syscalls_n64.c
+sysprf := $(srctree)/tools/perf/arch/mips/entry/syscalls
+sysdef := $(sysprf)/syscall_n64.tbl
+systbl := $(sysprf)/mksyscalltbl
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+
+$(header): $(sysdef) $(systbl)
+	$(Q)$(SHELL) '$(systbl)' $(sysdef) > $@
+
+clean::
+	$(call QUIET_CLEAN, mips) $(RM) $(header)
+
+archheaders: $(header)
diff --git a/tools/perf/arch/mips/entry/syscalls/mksyscalltbl b/tools/perf/arch/mips/entry/syscalls/mksyscalltbl
new file mode 100644
index 000000000000..fb1f49451af6
--- /dev/null
+++ b/tools/perf/arch/mips/entry/syscalls/mksyscalltbl
@@ -0,0 +1,32 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generate system call table for perf. Derived from
+# s390 script.
+#
+# Author(s):  Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+# Changed by: Tiezhu Yang <yangtiezhu@loongson.cn>
+
+SYSCALL_TBL=$1
+
+if ! test -r $SYSCALL_TBL; then
+	echo "Could not read input file" >&2
+	exit 1
+fi
+
+create_table()
+{
+	local max_nr nr abi sc discard
+
+	echo 'static const char *syscalltbl_mips_n64[] = {'
+	while read nr abi sc discard; do
+		printf '\t[%d] = "%s",\n' $nr $sc
+		max_nr=$nr
+	done
+	echo '};'
+	echo "#define SYSCALLTBL_MIPS_N64_MAX_ID $max_nr"
+}
+
+grep -E "^[[:digit:]]+[[:space:]]+(n64)" $SYSCALL_TBL	\
+	|sort -k1 -n					\
+	|create_table
diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
new file mode 100644
index 000000000000..91649690b52f
--- /dev/null
+++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
@@ -0,0 +1,358 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# system call numbers and entry vectors for mips
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The <abi> is always "n64" for this file.
+#
+0	n64	read				sys_read
+1	n64	write				sys_write
+2	n64	open				sys_open
+3	n64	close				sys_close
+4	n64	stat				sys_newstat
+5	n64	fstat				sys_newfstat
+6	n64	lstat				sys_newlstat
+7	n64	poll				sys_poll
+8	n64	lseek				sys_lseek
+9	n64	mmap				sys_mips_mmap
+10	n64	mprotect			sys_mprotect
+11	n64	munmap				sys_munmap
+12	n64	brk				sys_brk
+13	n64	rt_sigaction			sys_rt_sigaction
+14	n64	rt_sigprocmask			sys_rt_sigprocmask
+15	n64	ioctl				sys_ioctl
+16	n64	pread64				sys_pread64
+17	n64	pwrite64			sys_pwrite64
+18	n64	readv				sys_readv
+19	n64	writev				sys_writev
+20	n64	access				sys_access
+21	n64	pipe				sysm_pipe
+22	n64	_newselect			sys_select
+23	n64	sched_yield			sys_sched_yield
+24	n64	mremap				sys_mremap
+25	n64	msync				sys_msync
+26	n64	mincore				sys_mincore
+27	n64	madvise				sys_madvise
+28	n64	shmget				sys_shmget
+29	n64	shmat				sys_shmat
+30	n64	shmctl				sys_old_shmctl
+31	n64	dup				sys_dup
+32	n64	dup2				sys_dup2
+33	n64	pause				sys_pause
+34	n64	nanosleep			sys_nanosleep
+35	n64	getitimer			sys_getitimer
+36	n64	setitimer			sys_setitimer
+37	n64	alarm				sys_alarm
+38	n64	getpid				sys_getpid
+39	n64	sendfile			sys_sendfile64
+40	n64	socket				sys_socket
+41	n64	connect				sys_connect
+42	n64	accept				sys_accept
+43	n64	sendto				sys_sendto
+44	n64	recvfrom			sys_recvfrom
+45	n64	sendmsg				sys_sendmsg
+46	n64	recvmsg				sys_recvmsg
+47	n64	shutdown			sys_shutdown
+48	n64	bind				sys_bind
+49	n64	listen				sys_listen
+50	n64	getsockname			sys_getsockname
+51	n64	getpeername			sys_getpeername
+52	n64	socketpair			sys_socketpair
+53	n64	setsockopt			sys_setsockopt
+54	n64	getsockopt			sys_getsockopt
+55	n64	clone				__sys_clone
+56	n64	fork				__sys_fork
+57	n64	execve				sys_execve
+58	n64	exit				sys_exit
+59	n64	wait4				sys_wait4
+60	n64	kill				sys_kill
+61	n64	uname				sys_newuname
+62	n64	semget				sys_semget
+63	n64	semop				sys_semop
+64	n64	semctl				sys_old_semctl
+65	n64	shmdt				sys_shmdt
+66	n64	msgget				sys_msgget
+67	n64	msgsnd				sys_msgsnd
+68	n64	msgrcv				sys_msgrcv
+69	n64	msgctl				sys_old_msgctl
+70	n64	fcntl				sys_fcntl
+71	n64	flock				sys_flock
+72	n64	fsync				sys_fsync
+73	n64	fdatasync			sys_fdatasync
+74	n64	truncate			sys_truncate
+75	n64	ftruncate			sys_ftruncate
+76	n64	getdents			sys_getdents
+77	n64	getcwd				sys_getcwd
+78	n64	chdir				sys_chdir
+79	n64	fchdir				sys_fchdir
+80	n64	rename				sys_rename
+81	n64	mkdir				sys_mkdir
+82	n64	rmdir				sys_rmdir
+83	n64	creat				sys_creat
+84	n64	link				sys_link
+85	n64	unlink				sys_unlink
+86	n64	symlink				sys_symlink
+87	n64	readlink			sys_readlink
+88	n64	chmod				sys_chmod
+89	n64	fchmod				sys_fchmod
+90	n64	chown				sys_chown
+91	n64	fchown				sys_fchown
+92	n64	lchown				sys_lchown
+93	n64	umask				sys_umask
+94	n64	gettimeofday			sys_gettimeofday
+95	n64	getrlimit			sys_getrlimit
+96	n64	getrusage			sys_getrusage
+97	n64	sysinfo				sys_sysinfo
+98	n64	times				sys_times
+99	n64	ptrace				sys_ptrace
+100	n64	getuid				sys_getuid
+101	n64	syslog				sys_syslog
+102	n64	getgid				sys_getgid
+103	n64	setuid				sys_setuid
+104	n64	setgid				sys_setgid
+105	n64	geteuid				sys_geteuid
+106	n64	getegid				sys_getegid
+107	n64	setpgid				sys_setpgid
+108	n64	getppid				sys_getppid
+109	n64	getpgrp				sys_getpgrp
+110	n64	setsid				sys_setsid
+111	n64	setreuid			sys_setreuid
+112	n64	setregid			sys_setregid
+113	n64	getgroups			sys_getgroups
+114	n64	setgroups			sys_setgroups
+115	n64	setresuid			sys_setresuid
+116	n64	getresuid			sys_getresuid
+117	n64	setresgid			sys_setresgid
+118	n64	getresgid			sys_getresgid
+119	n64	getpgid				sys_getpgid
+120	n64	setfsuid			sys_setfsuid
+121	n64	setfsgid			sys_setfsgid
+122	n64	getsid				sys_getsid
+123	n64	capget				sys_capget
+124	n64	capset				sys_capset
+125	n64	rt_sigpending			sys_rt_sigpending
+126	n64	rt_sigtimedwait			sys_rt_sigtimedwait
+127	n64	rt_sigqueueinfo			sys_rt_sigqueueinfo
+128	n64	rt_sigsuspend			sys_rt_sigsuspend
+129	n64	sigaltstack			sys_sigaltstack
+130	n64	utime				sys_utime
+131	n64	mknod				sys_mknod
+132	n64	personality			sys_personality
+133	n64	ustat				sys_ustat
+134	n64	statfs				sys_statfs
+135	n64	fstatfs				sys_fstatfs
+136	n64	sysfs				sys_sysfs
+137	n64	getpriority			sys_getpriority
+138	n64	setpriority			sys_setpriority
+139	n64	sched_setparam			sys_sched_setparam
+140	n64	sched_getparam			sys_sched_getparam
+141	n64	sched_setscheduler		sys_sched_setscheduler
+142	n64	sched_getscheduler		sys_sched_getscheduler
+143	n64	sched_get_priority_max		sys_sched_get_priority_max
+144	n64	sched_get_priority_min		sys_sched_get_priority_min
+145	n64	sched_rr_get_interval		sys_sched_rr_get_interval
+146	n64	mlock				sys_mlock
+147	n64	munlock				sys_munlock
+148	n64	mlockall			sys_mlockall
+149	n64	munlockall			sys_munlockall
+150	n64	vhangup				sys_vhangup
+151	n64	pivot_root			sys_pivot_root
+152	n64	_sysctl				sys_ni_syscall
+153	n64	prctl				sys_prctl
+154	n64	adjtimex			sys_adjtimex
+155	n64	setrlimit			sys_setrlimit
+156	n64	chroot				sys_chroot
+157	n64	sync				sys_sync
+158	n64	acct				sys_acct
+159	n64	settimeofday			sys_settimeofday
+160	n64	mount				sys_mount
+161	n64	umount2				sys_umount
+162	n64	swapon				sys_swapon
+163	n64	swapoff				sys_swapoff
+164	n64	reboot				sys_reboot
+165	n64	sethostname			sys_sethostname
+166	n64	setdomainname			sys_setdomainname
+167	n64	create_module			sys_ni_syscall
+168	n64	init_module			sys_init_module
+169	n64	delete_module			sys_delete_module
+170	n64	get_kernel_syms			sys_ni_syscall
+171	n64	query_module			sys_ni_syscall
+172	n64	quotactl			sys_quotactl
+173	n64	nfsservctl			sys_ni_syscall
+174	n64	getpmsg				sys_ni_syscall
+175	n64	putpmsg				sys_ni_syscall
+176	n64	afs_syscall			sys_ni_syscall
+# 177 reserved for security
+177	n64	reserved177			sys_ni_syscall
+178	n64	gettid				sys_gettid
+179	n64	readahead			sys_readahead
+180	n64	setxattr			sys_setxattr
+181	n64	lsetxattr			sys_lsetxattr
+182	n64	fsetxattr			sys_fsetxattr
+183	n64	getxattr			sys_getxattr
+184	n64	lgetxattr			sys_lgetxattr
+185	n64	fgetxattr			sys_fgetxattr
+186	n64	listxattr			sys_listxattr
+187	n64	llistxattr			sys_llistxattr
+188	n64	flistxattr			sys_flistxattr
+189	n64	removexattr			sys_removexattr
+190	n64	lremovexattr			sys_lremovexattr
+191	n64	fremovexattr			sys_fremovexattr
+192	n64	tkill				sys_tkill
+193	n64	reserved193			sys_ni_syscall
+194	n64	futex				sys_futex
+195	n64	sched_setaffinity		sys_sched_setaffinity
+196	n64	sched_getaffinity		sys_sched_getaffinity
+197	n64	cacheflush			sys_cacheflush
+198	n64	cachectl			sys_cachectl
+199	n64	sysmips				__sys_sysmips
+200	n64	io_setup			sys_io_setup
+201	n64	io_destroy			sys_io_destroy
+202	n64	io_getevents			sys_io_getevents
+203	n64	io_submit			sys_io_submit
+204	n64	io_cancel			sys_io_cancel
+205	n64	exit_group			sys_exit_group
+206	n64	lookup_dcookie			sys_lookup_dcookie
+207	n64	epoll_create			sys_epoll_create
+208	n64	epoll_ctl			sys_epoll_ctl
+209	n64	epoll_wait			sys_epoll_wait
+210	n64	remap_file_pages		sys_remap_file_pages
+211	n64	rt_sigreturn			sys_rt_sigreturn
+212	n64	set_tid_address			sys_set_tid_address
+213	n64	restart_syscall			sys_restart_syscall
+214	n64	semtimedop			sys_semtimedop
+215	n64	fadvise64			sys_fadvise64_64
+216	n64	timer_create			sys_timer_create
+217	n64	timer_settime			sys_timer_settime
+218	n64	timer_gettime			sys_timer_gettime
+219	n64	timer_getoverrun		sys_timer_getoverrun
+220	n64	timer_delete			sys_timer_delete
+221	n64	clock_settime			sys_clock_settime
+222	n64	clock_gettime			sys_clock_gettime
+223	n64	clock_getres			sys_clock_getres
+224	n64	clock_nanosleep			sys_clock_nanosleep
+225	n64	tgkill				sys_tgkill
+226	n64	utimes				sys_utimes
+227	n64	mbind				sys_mbind
+228	n64	get_mempolicy			sys_get_mempolicy
+229	n64	set_mempolicy			sys_set_mempolicy
+230	n64	mq_open				sys_mq_open
+231	n64	mq_unlink			sys_mq_unlink
+232	n64	mq_timedsend			sys_mq_timedsend
+233	n64	mq_timedreceive			sys_mq_timedreceive
+234	n64	mq_notify			sys_mq_notify
+235	n64	mq_getsetattr			sys_mq_getsetattr
+236	n64	vserver				sys_ni_syscall
+237	n64	waitid				sys_waitid
+# 238 was sys_setaltroot
+239	n64	add_key				sys_add_key
+240	n64	request_key			sys_request_key
+241	n64	keyctl				sys_keyctl
+242	n64	set_thread_area			sys_set_thread_area
+243	n64	inotify_init			sys_inotify_init
+244	n64	inotify_add_watch		sys_inotify_add_watch
+245	n64	inotify_rm_watch		sys_inotify_rm_watch
+246	n64	migrate_pages			sys_migrate_pages
+247	n64	openat				sys_openat
+248	n64	mkdirat				sys_mkdirat
+249	n64	mknodat				sys_mknodat
+250	n64	fchownat			sys_fchownat
+251	n64	futimesat			sys_futimesat
+252	n64	newfstatat			sys_newfstatat
+253	n64	unlinkat			sys_unlinkat
+254	n64	renameat			sys_renameat
+255	n64	linkat				sys_linkat
+256	n64	symlinkat			sys_symlinkat
+257	n64	readlinkat			sys_readlinkat
+258	n64	fchmodat			sys_fchmodat
+259	n64	faccessat			sys_faccessat
+260	n64	pselect6			sys_pselect6
+261	n64	ppoll				sys_ppoll
+262	n64	unshare				sys_unshare
+263	n64	splice				sys_splice
+264	n64	sync_file_range			sys_sync_file_range
+265	n64	tee				sys_tee
+266	n64	vmsplice			sys_vmsplice
+267	n64	move_pages			sys_move_pages
+268	n64	set_robust_list			sys_set_robust_list
+269	n64	get_robust_list			sys_get_robust_list
+270	n64	kexec_load			sys_kexec_load
+271	n64	getcpu				sys_getcpu
+272	n64	epoll_pwait			sys_epoll_pwait
+273	n64	ioprio_set			sys_ioprio_set
+274	n64	ioprio_get			sys_ioprio_get
+275	n64	utimensat			sys_utimensat
+276	n64	signalfd			sys_signalfd
+277	n64	timerfd				sys_ni_syscall
+278	n64	eventfd				sys_eventfd
+279	n64	fallocate			sys_fallocate
+280	n64	timerfd_create			sys_timerfd_create
+281	n64	timerfd_gettime			sys_timerfd_gettime
+282	n64	timerfd_settime			sys_timerfd_settime
+283	n64	signalfd4			sys_signalfd4
+284	n64	eventfd2			sys_eventfd2
+285	n64	epoll_create1			sys_epoll_create1
+286	n64	dup3				sys_dup3
+287	n64	pipe2				sys_pipe2
+288	n64	inotify_init1			sys_inotify_init1
+289	n64	preadv				sys_preadv
+290	n64	pwritev				sys_pwritev
+291	n64	rt_tgsigqueueinfo		sys_rt_tgsigqueueinfo
+292	n64	perf_event_open			sys_perf_event_open
+293	n64	accept4				sys_accept4
+294	n64	recvmmsg			sys_recvmmsg
+295	n64	fanotify_init			sys_fanotify_init
+296	n64	fanotify_mark			sys_fanotify_mark
+297	n64	prlimit64			sys_prlimit64
+298	n64	name_to_handle_at		sys_name_to_handle_at
+299	n64	open_by_handle_at		sys_open_by_handle_at
+300	n64	clock_adjtime			sys_clock_adjtime
+301	n64	syncfs				sys_syncfs
+302	n64	sendmmsg			sys_sendmmsg
+303	n64	setns				sys_setns
+304	n64	process_vm_readv		sys_process_vm_readv
+305	n64	process_vm_writev		sys_process_vm_writev
+306	n64	kcmp				sys_kcmp
+307	n64	finit_module			sys_finit_module
+308	n64	getdents64			sys_getdents64
+309	n64	sched_setattr			sys_sched_setattr
+310	n64	sched_getattr			sys_sched_getattr
+311	n64	renameat2			sys_renameat2
+312	n64	seccomp				sys_seccomp
+313	n64	getrandom			sys_getrandom
+314	n64	memfd_create			sys_memfd_create
+315	n64	bpf				sys_bpf
+316	n64	execveat			sys_execveat
+317	n64	userfaultfd			sys_userfaultfd
+318	n64	membarrier			sys_membarrier
+319	n64	mlock2				sys_mlock2
+320	n64	copy_file_range			sys_copy_file_range
+321	n64	preadv2				sys_preadv2
+322	n64	pwritev2			sys_pwritev2
+323	n64	pkey_mprotect			sys_pkey_mprotect
+324	n64	pkey_alloc			sys_pkey_alloc
+325	n64	pkey_free			sys_pkey_free
+326	n64	statx				sys_statx
+327	n64	rseq				sys_rseq
+328	n64	io_pgetevents			sys_io_pgetevents
+# 329 through 423 are reserved to sync up with other architectures
+424	n64	pidfd_send_signal		sys_pidfd_send_signal
+425	n64	io_uring_setup			sys_io_uring_setup
+426	n64	io_uring_enter			sys_io_uring_enter
+427	n64	io_uring_register		sys_io_uring_register
+428	n64	open_tree			sys_open_tree
+429	n64	move_mount			sys_move_mount
+430	n64	fsopen				sys_fsopen
+431	n64	fsconfig			sys_fsconfig
+432	n64	fsmount				sys_fsmount
+433	n64	fspick				sys_fspick
+434	n64	pidfd_open			sys_pidfd_open
+435	n64	clone3				__sys_clone3
+436	n64	close_range			sys_close_range
+437	n64	openat2				sys_openat2
+438	n64	pidfd_getfd			sys_pidfd_getfd
+439	n64	faccessat2			sys_faccessat2
+440	n64	process_madvise			sys_process_madvise
+441	n64	epoll_pwait2			sys_epoll_pwait2
diff --git a/tools/perf/arch/mips/include/dwarf-regs-table.h b/tools/perf/arch/mips/include/dwarf-regs-table.h
new file mode 100644
index 000000000000..5badbcd3c5ec
--- /dev/null
+++ b/tools/perf/arch/mips/include/dwarf-regs-table.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * dwarf-regs-table.h : Mapping of DWARF debug register numbers into
+ * register names.
+ *
+ * Copyright (C) 2013 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+#undef REG_DWARFNUM_NAME
+#define REG_DWARFNUM_NAME(reg, idx)	[idx] = "$" #reg
+static const char * const mips_regstr_tbl[] = {
+	"$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", "$9",
+	"$10", "$11", "$12", "$13", "$14", "$15", "$16", "$17", "$18", "$19",
+	"$20", "$21", "$22", "$23", "$24", "$25", "$26", "$27", "$28", "%29",
+	"$30", "$31",
+	REG_DWARFNUM_NAME(hi, 64),
+	REG_DWARFNUM_NAME(lo, 65),
+};
+#endif
diff --git a/tools/perf/arch/mips/include/perf_regs.h b/tools/perf/arch/mips/include/perf_regs.h
new file mode 100644
index 000000000000..ee73b36a14d1
--- /dev/null
+++ b/tools/perf/arch/mips/include/perf_regs.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include <linux/types.h>
+#include <asm/perf_regs.h>
+
+#define PERF_REGS_MAX PERF_REG_MIPS_MAX
+#define PERF_REG_IP PERF_REG_MIPS_PC
+#define PERF_REG_SP PERF_REG_MIPS_R29
+
+#define PERF_REGS_MASK ((1ULL << PERF_REG_MIPS_MAX) - 1)
+
+static inline const char *__perf_reg_name(int id)
+{
+	switch (id) {
+	case PERF_REG_MIPS_PC:
+		return "PC";
+	case PERF_REG_MIPS_R1:
+		return "$1";
+	case PERF_REG_MIPS_R2:
+		return "$2";
+	case PERF_REG_MIPS_R3:
+		return "$3";
+	case PERF_REG_MIPS_R4:
+		return "$4";
+	case PERF_REG_MIPS_R5:
+		return "$5";
+	case PERF_REG_MIPS_R6:
+		return "$6";
+	case PERF_REG_MIPS_R7:
+		return "$7";
+	case PERF_REG_MIPS_R8:
+		return "$8";
+	case PERF_REG_MIPS_R9:
+		return "$9";
+	case PERF_REG_MIPS_R10:
+		return "$10";
+	case PERF_REG_MIPS_R11:
+		return "$11";
+	case PERF_REG_MIPS_R12:
+		return "$12";
+	case PERF_REG_MIPS_R13:
+		return "$13";
+	case PERF_REG_MIPS_R14:
+		return "$14";
+	case PERF_REG_MIPS_R15:
+		return "$15";
+	case PERF_REG_MIPS_R16:
+		return "$16";
+	case PERF_REG_MIPS_R17:
+		return "$17";
+	case PERF_REG_MIPS_R18:
+		return "$18";
+	case PERF_REG_MIPS_R19:
+		return "$19";
+	case PERF_REG_MIPS_R20:
+		return "$20";
+	case PERF_REG_MIPS_R21:
+		return "$21";
+	case PERF_REG_MIPS_R22:
+		return "$22";
+	case PERF_REG_MIPS_R23:
+		return "$23";
+	case PERF_REG_MIPS_R24:
+		return "$24";
+	case PERF_REG_MIPS_R25:
+		return "$25";
+	case PERF_REG_MIPS_R28:
+		return "$28";
+	case PERF_REG_MIPS_R29:
+		return "$29";
+	case PERF_REG_MIPS_R30:
+		return "$30";
+	case PERF_REG_MIPS_R31:
+		return "$31";
+	default:
+		break;
+	}
+	return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/mips/util/Build b/tools/perf/arch/mips/util/Build
new file mode 100644
index 000000000000..51c8900a9a10
--- /dev/null
+++ b/tools/perf/arch/mips/util/Build
@@ -0,0 +1,3 @@
+perf-y += perf_regs.o
+perf-$(CONFIG_DWARF) += dwarf-regs.o
+perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/mips/util/dwarf-regs.c b/tools/perf/arch/mips/util/dwarf-regs.c
new file mode 100644
index 000000000000..25c13a91c2a7
--- /dev/null
+++ b/tools/perf/arch/mips/util/dwarf-regs.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2013 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <stdio.h>
+#include <dwarf-regs.h>
+
+static const char *mips_gpr_names[32] = {
+	"$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", "$9",
+	"$10", "$11", "$12", "$13", "$14", "$15", "$16", "$17", "$18", "$19",
+	"$20", "$21", "$22", "$23", "$24", "$25", "$26", "$27", "$28", "$29",
+	"$30", "$31"
+};
+
+const char *get_arch_regstr(unsigned int n)
+{
+	if (n < 32)
+		return mips_gpr_names[n];
+	if (n == 64)
+		return "hi";
+	if (n == 65)
+		return "lo";
+	return NULL;
+}
diff --git a/tools/perf/arch/mips/util/perf_regs.c b/tools/perf/arch/mips/util/perf_regs.c
new file mode 100644
index 000000000000..2864e2e3776d
--- /dev/null
+++ b/tools/perf/arch/mips/util/perf_regs.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../util/perf_regs.h"
+
+const struct sample_reg sample_reg_masks[] = {
+	SMPL_REG_END
+};
diff --git a/tools/perf/arch/mips/util/unwind-libunwind.c b/tools/perf/arch/mips/util/unwind-libunwind.c
new file mode 100644
index 000000000000..0d8c99c29da6
--- /dev/null
+++ b/tools/perf/arch/mips/util/unwind-libunwind.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+#include "util/debug.h"
+
+int libunwind__arch_reg_id(int regnum)
+{
+	switch (regnum) {
+	case UNW_MIPS_R1 ... UNW_MIPS_R25:
+		return regnum - UNW_MIPS_R1 + PERF_REG_MIPS_R1;
+	case UNW_MIPS_R28 ... UNW_MIPS_R31:
+		return regnum - UNW_MIPS_R28 + PERF_REG_MIPS_R28;
+	case UNW_MIPS_PC:
+		return PERF_REG_MIPS_PC;
+	default:
+		pr_err("unwind: invalid reg id %d\n", regnum);
+		return -EINVAL;
+	}
+}
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index b7945e5a543b..8a79c4126e5b 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -4,6 +4,8 @@ perf-y += kvm-stat.o
 perf-y += perf_regs.o
 perf-y += mem-events.o
 perf-y += sym-handling.o
+perf-y += evsel.o
+perf-y += event.o
 
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 perf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
new file mode 100644
index 000000000000..3bf441257466
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+
+#include "../../../util/event.h"
+#include "../../../util/synthetic-events.h"
+#include "../../../util/machine.h"
+#include "../../../util/tool.h"
+#include "../../../util/map.h"
+#include "../../../util/debug.h"
+
+void arch_perf_parse_sample_weight(struct perf_sample *data,
+				   const __u64 *array, u64 type)
+{
+	union perf_sample_weight weight;
+
+	weight.full = *array;
+	if (type & PERF_SAMPLE_WEIGHT)
+		data->weight = weight.full;
+	else {
+		data->weight = weight.var1_dw;
+		data->ins_lat = weight.var2_w;
+		data->p_stage_cyc = weight.var3_w;
+	}
+}
+
+void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
+					__u64 *array, u64 type)
+{
+	*array = data->weight;
+
+	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
+		*array &= 0xffffffff;
+		*array |= ((u64)data->ins_lat << 32);
+	}
+}
+
+const char *arch_perf_header_entry(const char *se_header)
+{
+	if (!strcmp(se_header, "Local INSTR Latency"))
+		return "Finish Cyc";
+	else if (!strcmp(se_header, "Pipeline Stage Cycle"))
+		return "Dispatch Cyc";
+	return se_header;
+}
+
+int arch_support_sort_key(const char *sort_key)
+{
+	if (!strcmp(sort_key, "p_stage_cyc"))
+		return 1;
+	return 0;
+}
diff --git a/tools/perf/arch/powerpc/util/evsel.c b/tools/perf/arch/powerpc/util/evsel.c
new file mode 100644
index 000000000000..2f733cdc8dbb
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/evsel.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include "util/evsel.h"
+
+void arch_evsel__set_sample_weight(struct evsel *evsel)
+{
+	evsel__set_sample_bit(evsel, WEIGHT_STRUCT);
+}
diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index eed9e5a42935..16510686c138 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -176,7 +176,7 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 }
 
 /*
- * Incase of powerpc architecture, pmu registers are programmable
+ * In case of powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
  * valid samples with default 'cycles' event. It is better to use
  * 'trace_imc/trace_cycles' event for guest profiling, since it
diff --git a/tools/perf/arch/powerpc/util/utils_header.h b/tools/perf/arch/powerpc/util/utils_header.h
index 5788eb1f1fe3..2baeb1c1ae85 100644
--- a/tools/perf/arch/powerpc/util/utils_header.h
+++ b/tools/perf/arch/powerpc/util/utils_header.h
@@ -10,6 +10,6 @@
 
 #define SPRN_PVR        0x11F   /* Processor Version Register */
 #define PVR_VER(pvr)    (((pvr) >>  16) & 0xFFFF) /* Version field */
-#define PVR_REV(pvr)    (((pvr) >>   0) & 0xFFFF) /* Revison field */
+#define PVR_REV(pvr)    (((pvr) >>   0) & 0xFFFF) /* Revision field */
 
 #endif /* __PERF_UTIL_HEADER_H */
diff --git a/tools/perf/arch/x86/tests/bp-modify.c b/tools/perf/arch/x86/tests/bp-modify.c
index adcacf1b6609..dffcf9b52153 100644
--- a/tools/perf/arch/x86/tests/bp-modify.c
+++ b/tools/perf/arch/x86/tests/bp-modify.c
@@ -73,7 +73,7 @@ static int bp_modify1(void)
 	/*
 	 * The parent does following steps:
 	 *  - creates a new breakpoint (id 0) for bp_2 function
-	 *  - changes that breakponit to bp_1 function
+	 *  - changes that breakpoint to bp_1 function
 	 *  - waits for the breakpoint to hit and checks
 	 *    it has proper rip of bp_1 function
 	 *  - detaches the child
diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c
index fca81b39b09f..207c56805c55 100644
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ b/tools/perf/arch/x86/util/perf_regs.c
@@ -165,7 +165,7 @@ static int sdt_init_op_regex(void)
 /*
  * Max x86 register name length is 5(ex: %r15d). So, 6th char
  * should always contain NULL. This helps to find register name
- * length using strlen, insted of maintaing one more variable.
+ * length using strlen, instead of maintaining one more variable.
  */
 #define SDT_REG_NAME_SIZE  6
 
@@ -207,7 +207,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 	 * and displacement 0 (Both sign and displacement 0 are
 	 * optional so it may be empty). Use one more character
 	 * to hold last NULL so that strlen can be used to find
-	 * prefix length, instead of maintaing one more variable.
+	 * prefix length, instead of maintaining one more variable.
 	 */
 	char prefix[3] = {0};
 
diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c
index 0a0ff1247c83..79d13dbc0a47 100644
--- a/tools/perf/bench/epoll-wait.c
+++ b/tools/perf/bench/epoll-wait.c
@@ -17,7 +17,7 @@
  * While the second model, enabled via --multiq option, uses multiple
  * queueing (which refers to one epoll instance per worker). For example,
  * short lived tcp connections in a high throughput httpd server will
- * ditribute the accept()'ing  connections across CPUs. In this case each
+ * distribute the accept()'ing  connections across CPUs. In this case each
  * worker does a limited  amount of processing.
  *
  *             [queue A]  ---> [worker]
@@ -198,7 +198,7 @@ static void *workerfn(void *arg)
 
 	do {
 		/*
-		 * Block undefinitely waiting for the IN event.
+		 * Block indefinitely waiting for the IN event.
 		 * In order to stress the epoll_wait(2) syscall,
 		 * call it event per event, instead of a larger
 		 * batch (max)limit.
diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c
index 280227e3ffd7..55d373b75791 100644
--- a/tools/perf/bench/inject-buildid.c
+++ b/tools/perf/bench/inject-buildid.c
@@ -372,7 +372,7 @@ static int inject_build_id(struct bench_data *data, u64 *max_rss)
 			len += synthesize_flush(data);
 	}
 
-	/* tihs makes the child to finish */
+	/* this makes the child to finish */
 	close(data->input_pipe[1]);
 
 	wait4(data->pid, &status, 0, &rusage);
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 20b87e29c96f..f2640179ada9 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -42,7 +42,7 @@
 #endif
 
 /*
- * Regular printout to the terminal, supressed if -q is specified:
+ * Regular printout to the terminal, suppressed if -q is specified:
  */
 #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)
 
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index a23ba6bb99b6..021d974c978e 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -239,7 +239,7 @@ static int evsel__add_sample(struct evsel *evsel, struct perf_sample *sample,
 	}
 
 	/*
-	 * XXX filtered samples can still have branch entires pointing into our
+	 * XXX filtered samples can still have branch entries pointing into our
 	 * symbol and are missed.
 	 */
 	process_branch_stack(sample->branch_stack, al, sample);
@@ -538,6 +538,10 @@ int cmd_annotate(int argc, const char **argv)
 		    "Strip first N entries of source file path name in programs (with --prefix)"),
 	OPT_STRING(0, "objdump", &annotate.opts.objdump_path, "path",
 		   "objdump binary to use for disassembly and annotations"),
+	OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
+		    "Enable symbol demangling"),
+	OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel,
+		    "Enable kernel symbol demangling"),
 	OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
 		    "Show event group information together"),
 	OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c
index 7c4a9d424a64..61929f63a047 100644
--- a/tools/perf/builtin-daemon.c
+++ b/tools/perf/builtin-daemon.c
@@ -6,7 +6,6 @@
 #include <linux/zalloc.h>
 #include <linux/string.h>
 #include <linux/limits.h>
-#include <linux/string.h>
 #include <string.h>
 #include <sys/file.h>
 #include <signal.h>
@@ -24,8 +23,6 @@
 #include <sys/signalfd.h>
 #include <sys/wait.h>
 #include <poll.h>
-#include <sys/stat.h>
-#include <time.h>
 #include "builtin.h"
 #include "perf.h"
 #include "debug.h"
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 878e04b1fab7..f52b3a799e76 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -1796,7 +1796,7 @@ static int ui_init(void)
 	data__for_each_file(i, d) {
 
 		/*
-		 * Baseline or compute realted columns:
+		 * Baseline or compute related columns:
 		 *
 		 *   PERF_HPP_DIFF__BASELINE
 		 *   PERF_HPP_DIFF__DELTA
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index a2f1e53f37a7..01326e370009 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -49,7 +49,7 @@ struct lock_stat {
 
 	/*
 	 * FIXME: evsel__intval() returns u64,
-	 * so address of lockdep_map should be dealed as 64bit.
+	 * so address of lockdep_map should be treated as 64bit.
 	 * Is there more better solution?
 	 */
 	void			*addr;		/* address of lockdep_map, used as ID */
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 2a845d6cac09..0d65c98794a8 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -84,6 +84,7 @@ struct report {
 	bool			nonany_branch_mode;
 	bool			group_set;
 	bool			stitch_lbr;
+	bool			disable_order;
 	int			max_stack;
 	struct perf_read_values	show_threads_values;
 	struct annotation_options annotation_opts;
@@ -1296,6 +1297,8 @@ int cmd_report(int argc, const char **argv)
 	OPTS_EVSWITCH(&report.evswitch),
 	OPT_BOOLEAN(0, "total-cycles", &report.total_cycles_mode,
 		    "Sort all blocks by 'Sampled Cycles%'"),
+	OPT_BOOLEAN(0, "disable-order", &report.disable_order,
+		    "Disable raw trace ordering"),
 	OPT_END()
 	};
 	struct perf_data data = {
@@ -1329,7 +1332,7 @@ int cmd_report(int argc, const char **argv)
 	if (report.mmaps_mode)
 		report.tasks_mode = true;
 
-	if (dump_trace)
+	if (dump_trace && report.disable_order)
 		report.tool.ordered_events = false;
 
 	if (quiet)
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 69c769b04a61..954ce2f594e9 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1712,7 +1712,7 @@ static int perf_sched__process_fork_event(struct perf_tool *tool,
 {
 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
 
-	/* run the fork event through the perf machineruy */
+	/* run the fork event through the perf machinery */
 	perf_event__process_fork(tool, event, sample, machine);
 
 	/* and then run additional processing needed for this command */
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 5915f19cee55..1280cbfad4db 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -314,8 +314,7 @@ static inline struct evsel_script *evsel_script(struct evsel *evsel)
 	return (struct evsel_script *)evsel->priv;
 }
 
-static struct evsel_script *perf_evsel_script__new(struct evsel *evsel,
-							struct perf_data *data)
+static struct evsel_script *evsel_script__new(struct evsel *evsel, struct perf_data *data)
 {
 	struct evsel_script *es = zalloc(sizeof(*es));
 
@@ -335,7 +334,7 @@ out_free:
 	return NULL;
 }
 
-static void perf_evsel_script__delete(struct evsel_script *es)
+static void evsel_script__delete(struct evsel_script *es)
 {
 	zfree(&es->filename);
 	fclose(es->fp);
@@ -343,7 +342,7 @@ static void perf_evsel_script__delete(struct evsel_script *es)
 	free(es);
 }
 
-static int perf_evsel_script__fprintf(struct evsel_script *es, FILE *fp)
+static int evsel_script__fprintf(struct evsel_script *es, FILE *fp)
 {
 	struct stat st;
 
@@ -2219,8 +2218,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event,
 
 	if (!evsel->priv) {
 		if (scr->per_event_dump) {
-			evsel->priv = perf_evsel_script__new(evsel,
-						scr->session->data);
+			evsel->priv = evsel_script__new(evsel, scr->session->data);
 		} else {
 			es = zalloc(sizeof(*es));
 			if (!es)
@@ -2475,7 +2473,7 @@ static void perf_script__fclose_per_event_dump(struct perf_script *script)
 	evlist__for_each_entry(evlist, evsel) {
 		if (!evsel->priv)
 			break;
-		perf_evsel_script__delete(evsel->priv);
+		evsel_script__delete(evsel->priv);
 		evsel->priv = NULL;
 	}
 }
@@ -2488,14 +2486,14 @@ static int perf_script__fopen_per_event_dump(struct perf_script *script)
 		/*
 		 * Already setup? I.e. we may be called twice in cases like
 		 * Intel PT, one for the intel_pt// and dummy events, then
-		 * for the evsels syntheized from the auxtrace info.
+		 * for the evsels synthesized from the auxtrace info.
 		 *
 		 * Ses perf_script__process_auxtrace_info.
 		 */
 		if (evsel->priv != NULL)
 			continue;
 
-		evsel->priv = perf_evsel_script__new(evsel, script->session->data);
+		evsel->priv = evsel_script__new(evsel, script->session->data);
 		if (evsel->priv == NULL)
 			goto out_err_fclose;
 	}
@@ -2530,8 +2528,8 @@ static void perf_script__exit_per_event_dump_stats(struct perf_script *script)
 	evlist__for_each_entry(script->session->evlist, evsel) {
 		struct evsel_script *es = evsel->priv;
 
-		perf_evsel_script__fprintf(es, stdout);
-		perf_evsel_script__delete(es);
+		evsel_script__fprintf(es, stdout);
+		evsel_script__delete(es);
 		evsel->priv = NULL;
 	}
 }
@@ -3085,7 +3083,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused,
  *
  * Fixme: All existing "xxx-record" are all in good formats "-e event ",
  * which is covered well now. And new parsing code should be added to
- * cover the future complexing formats like event groups etc.
+ * cover the future complex formats like event groups etc.
  */
 static int check_ev_match(char *dir_name, char *scriptname,
 			struct perf_session *session)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2e2e4a8345ea..2a2c15cac80a 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -792,6 +792,12 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	}
 
 	evlist__for_each_cpu (evsel_list, i, cpu) {
+		/*
+		 * bperf calls evsel__open_per_cpu() in bperf__load(), so
+		 * no need to call it again here.
+		 */
+		if (target.use_bpf)
+			break;
 		affinity__set(&affinity, cpu);
 
 		evlist__for_each_entry(evsel_list, counter) {
@@ -925,15 +931,15 @@ try_again_reset:
 	/*
 	 * Enable counters and exec the command:
 	 */
-	t0 = rdclock();
-	clock_gettime(CLOCK_MONOTONIC, &ref_time);
-
 	if (forks) {
 		evlist__start_workload(evsel_list);
 		err = enable_counters();
 		if (err)
 			return -1;
 
+		t0 = rdclock();
+		clock_gettime(CLOCK_MONOTONIC, &ref_time);
+
 		if (interval || timeout || evlist__ctlfd_initialized(evsel_list))
 			status = dispatch_events(forks, timeout, interval, &times);
 		if (child_pid != -1) {
@@ -954,6 +960,10 @@ try_again_reset:
 		err = enable_counters();
 		if (err)
 			return -1;
+
+		t0 = rdclock();
+		clock_gettime(CLOCK_MONOTONIC, &ref_time);
+
 		status = dispatch_events(forks, timeout, interval, &times);
 	}
 
@@ -1083,6 +1093,11 @@ void perf_stat__set_big_num(int set)
 	stat_config.big_num = (set != 0);
 }
 
+void perf_stat__set_no_csv_summary(int set)
+{
+	stat_config.no_csv_summary = (set != 0);
+}
+
 static int stat__set_big_num(const struct option *opt __maybe_unused,
 			     const char *s __maybe_unused, int unset)
 {
@@ -1146,6 +1161,10 @@ static struct option stat_options[] = {
 #ifdef HAVE_BPF_SKEL
 	OPT_STRING('b', "bpf-prog", &target.bpf_str, "bpf-prog-id",
 		   "stat events on existing bpf program id"),
+	OPT_BOOLEAN(0, "bpf-counters", &target.use_bpf,
+		    "use bpf program to count events"),
+	OPT_STRING(0, "bpf-attr-map", &target.attr_map, "attr-map-path",
+		   "path to perf_event_attr map"),
 #endif
 	OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
 		    "system-wide collection from all CPUs"),
@@ -1235,6 +1254,8 @@ static struct option stat_options[] = {
 		    "threads of same physical core"),
 	OPT_BOOLEAN(0, "summary", &stat_config.summary,
 		       "print summary for interval mode"),
+	OPT_BOOLEAN(0, "no-csv-summary", &stat_config.no_csv_summary,
+		       "don't print 'summary' for CSV summary output"),
 	OPT_BOOLEAN(0, "quiet", &stat_config.quiet,
 			"don't print output (useful with record)"),
 #ifdef HAVE_LIBPFM
@@ -1705,7 +1726,7 @@ static int add_default_attributes(void)
 	bzero(&errinfo, sizeof(errinfo));
 	if (transaction_run) {
 		/* Handle -T as -M transaction. Once platform specific metrics
-		 * support has been added to the json files, all archictures
+		 * support has been added to the json files, all architectures
 		 * will use this approach. To determine transaction support
 		 * on an architecture test for such a metric name.
 		 */
@@ -2459,7 +2480,7 @@ int cmd_stat(int argc, const char **argv)
 		/*
 		 * We synthesize the kernel mmap record just so that older tools
 		 * don't emit warnings about not being able to resolve symbols
-		 * due to /proc/sys/kernel/kptr_restrict settings and instear provide
+		 * due to /proc/sys/kernel/kptr_restrict settings and instead provide
 		 * a saner message about no samples being in the perf.data file.
 		 *
 		 * This also serves to suppress a warning about f_header.data.size == 0
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 3673c04d16b6..173ace43f845 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1607,7 +1607,7 @@ int cmd_top(int argc, const char **argv)
 	if (status) {
 		/*
 		 * Some arches do not provide a get_cpuid(), so just use pr_debug, otherwise
-		 * warn the user explicitely.
+		 * warn the user explicitly.
 		 */
 		eprintf(status == ENOSYS ? 1 : 0, verbose,
 			"Couldn't read the cpuid for this machine: %s\n",
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index dded93a2bc89..39eada935d4e 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -146,6 +146,7 @@ check arch/x86/lib/insn.c             '-I "^#include [\"<]\(../include/\)*asm/in
 check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl
 check_2 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl
 check_2 tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl
+check_2 tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl
 
 for i in $BEAUTY_FILES; do
   beauty_check $i -B
diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
index b80437971d80..a262dcd020f4 100644
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
@@ -262,7 +262,7 @@ int sys_enter(struct syscall_enter_args *args)
 	/*
 	 * Jump to syscall specific augmenter, even if the default one,
 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
-	 * unagmented tracepoint payload.
+	 * unaugmented tracepoint payload.
 	 */
 	bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
 
@@ -282,7 +282,7 @@ int sys_exit(struct syscall_exit_args *args)
 	/*
 	 * Jump to syscall specific return augmenter, even if the default one,
 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
-	 * unagmented tracepoint payload.
+	 * unaugmented tracepoint payload.
 	 */
 	bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
 	/*
diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c
index 88108598d6e9..526dcaf9f079 100644
--- a/tools/perf/jvmti/jvmti_agent.c
+++ b/tools/perf/jvmti/jvmti_agent.c
@@ -390,7 +390,7 @@ jvmti_write_code(void *agent, char const *sym,
 		rec.p.total_size += size;
 
 	/*
-	 * If JVM is multi-threaded, nultiple concurrent calls to agent
+	 * If JVM is multi-threaded, multiple concurrent calls to agent
 	 * may be possible, so protect file writes
 	 */
 	flockfile(fp);
@@ -457,7 +457,7 @@ jvmti_write_debug_info(void *agent, uint64_t code,
 	rec.p.total_size = size;
 
 	/*
-	 * If JVM is multi-threaded, nultiple concurrent calls to agent
+	 * If JVM is multi-threaded, multiple concurrent calls to agent
 	 * may be possible, so protect file writes
 	 */
 	flockfile(fp);
diff --git a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
index 75376c7cc072..913fb200ea52 100644
--- a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
+++ b/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
@@ -210,12 +210,24 @@
         "BriefDescription": "Attributable Level 2 data TLB refill"
     },
     {
+        "PublicDescription": "Attributable Level 2 instruction TLB refill.",
+        "EventCode": "0x2E",
+        "EventName": "L2I_TLB_REFILL",
+        "BriefDescription": "Attributable Level 2 instruction TLB refill."
+    },
+    {
         "PublicDescription": "Attributable Level 2 data or unified TLB access",
         "EventCode": "0x2F",
         "EventName": "L2D_TLB",
         "BriefDescription": "Attributable Level 2 data or unified TLB access"
     },
     {
+        "PublicDescription": "Attributable Level 2 instruction TLB access.",
+        "EventCode": "0x30",
+        "EventName": "L2I_TLB",
+        "BriefDescription": "Attributable Level 2 instruction TLB access."
+    },
+    {
         "PublicDescription": "Access to another socket in a multi-socket system",
         "EventCode": "0x31",
         "EventName": "REMOTE_ACCESS",
@@ -244,5 +256,221 @@
         "EventCode": "0x37",
         "EventName": "LL_CACHE_MISS_RD",
         "BriefDescription": "Last level cache miss, read"
+    },
+    {
+        "PublicDescription": "SIMD Instruction architecturally executed.",
+        "EventCode": "0x8000",
+        "EventName": "SIMD_INST_RETIRED",
+        "BriefDescription": "SIMD Instruction architecturally executed."
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, SVE.",
+        "EventCode": "0x8002",
+        "EventName": "SVE_INST_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, SVE."
+    },
+    {
+        "PublicDescription": "Microarchitectural operation, Operations speculatively executed.",
+        "EventCode": "0x8008",
+        "EventName": "UOP_SPEC",
+        "BriefDescription": "Microarchitectural operation, Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE Math accelerator Operations speculatively executed.",
+        "EventCode": "0x800E",
+        "EventName": "SVE_MATH_SPEC",
+        "BriefDescription": "SVE Math accelerator Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point Operations speculatively executed.",
+        "EventCode": "0x8010",
+        "EventName": "FP_SPEC",
+        "BriefDescription": "Floating-point Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point FMA Operations speculatively executed.",
+        "EventCode": "0x8028",
+        "EventName": "FP_FMA_SPEC",
+        "BriefDescription": "Floating-point FMA Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point reciprocal estimate Operations speculatively executed.",
+        "EventCode": "0x8034",
+        "EventName": "FP_RECPE_SPEC",
+        "BriefDescription": "Floating-point reciprocal estimate Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "floating-point convert Operations speculatively executed.",
+        "EventCode": "0x8038",
+        "EventName": "FP_CVT_SPEC",
+        "BriefDescription": "floating-point convert Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE integer Operations speculatively executed.",
+        "EventCode": "0x8043",
+        "EventName": "ASE_SVE_INT_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE integer Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE predicated Operations speculatively executed.",
+        "EventCode": "0x8074",
+        "EventName": "SVE_PRED_SPEC",
+        "BriefDescription": "SVE predicated Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE MOVPRFX Operations speculatively executed.",
+        "EventCode": "0x807C",
+        "EventName": "SVE_MOVPRFX_SPEC",
+        "BriefDescription": "SVE MOVPRFX Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE MOVPRFX unfused Operations speculatively executed.",
+        "EventCode": "0x807F",
+        "EventName": "SVE_MOVPRFX_U_SPEC",
+        "BriefDescription": "SVE MOVPRFX unfused Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE load Operations speculatively executed.",
+        "EventCode": "0x8085",
+        "EventName": "ASE_SVE_LD_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE store Operations speculatively executed.",
+        "EventCode": "0x8086",
+        "EventName": "ASE_SVE_ST_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Prefetch Operations speculatively executed.",
+        "EventCode": "0x8087",
+        "EventName": "PRF_SPEC",
+        "BriefDescription": "Prefetch Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "General-purpose register load Operations speculatively executed.",
+        "EventCode": "0x8089",
+        "EventName": "BASE_LD_REG_SPEC",
+        "BriefDescription": "General-purpose register load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "General-purpose register store Operations speculatively executed.",
+        "EventCode": "0x808A",
+        "EventName": "BASE_ST_REG_SPEC",
+        "BriefDescription": "General-purpose register store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE unpredicated load register Operations speculatively executed.",
+        "EventCode": "0x8091",
+        "EventName": "SVE_LDR_REG_SPEC",
+        "BriefDescription": "SVE unpredicated load register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE unpredicated store register Operations speculatively executed.",
+        "EventCode": "0x8092",
+        "EventName": "SVE_STR_REG_SPEC",
+        "BriefDescription": "SVE unpredicated store register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE load predicate register Operations speculatively executed.",
+        "EventCode": "0x8095",
+        "EventName": "SVE_LDR_PREG_SPEC",
+        "BriefDescription": "SVE load predicate register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE store predicate register Operations speculatively executed.",
+        "EventCode": "0x8096",
+        "EventName": "SVE_STR_PREG_SPEC",
+        "BriefDescription": "SVE store predicate register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE contiguous prefetch element Operations speculatively executed.",
+        "EventCode": "0x809F",
+        "EventName": "SVE_PRF_CONTIG_SPEC",
+        "BriefDescription": "SVE contiguous prefetch element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed.",
+        "EventCode": "0x80A5",
+        "EventName": "ASE_SVE_LD_MULTI_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed.",
+        "EventCode": "0x80A6",
+        "EventName": "ASE_SVE_ST_MULTI_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE gather-load Operations speculatively executed.",
+        "EventCode": "0x80AD",
+        "EventName": "SVE_LD_GATHER_SPEC",
+        "BriefDescription": "SVE gather-load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE scatter-store Operations speculatively executed.",
+        "EventCode": "0x80AE",
+        "EventName": "SVE_ST_SCATTER_SPEC",
+        "BriefDescription": "SVE scatter-store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE gather-prefetch Operations speculatively executed.",
+        "EventCode": "0x80AF",
+        "EventName": "SVE_PRF_GATHER_SPEC",
+        "BriefDescription": "SVE gather-prefetch Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE First-fault load Operations speculatively executed.",
+        "EventCode": "0x80BC",
+        "EventName": "SVE_LDFF_SPEC",
+        "BriefDescription": "SVE First-fault load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C0",
+        "EventName": "FP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C1",
+        "EventName": "FP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable half-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C2",
+        "EventName": "FP_HP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable half-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable half-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C3",
+        "EventName": "FP_HP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable half-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable single-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C4",
+        "EventName": "FP_SP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable single-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable single-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C5",
+        "EventName": "FP_SP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable single-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable double-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C6",
+        "EventName": "FP_DP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable double-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable double-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C7",
+        "EventName": "FP_DP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable double-precision floating-point element Operations speculatively executed."
     }
 ]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json
new file mode 100644
index 000000000000..b011af11bf94
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json
@@ -0,0 +1,8 @@
+[
+  {
+    "ArchStdEvent": "BR_MIS_PRED"
+  },
+  {
+    "ArchStdEvent": "BR_PRED"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json
new file mode 100644
index 000000000000..084e88d7df73
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json
@@ -0,0 +1,62 @@
+[
+  {
+    "PublicDescription": "This event counts read transactions from tofu controller to measured CMG.",
+    "EventCode": "0x314",
+    "EventName": "BUS_READ_TOTAL_TOFU",
+    "BriefDescription": "This event counts read transactions from tofu controller to measured CMG."
+  },
+  {
+    "PublicDescription": "This event counts read transactions from PCI controller to measured CMG.",
+    "EventCode": "0x315",
+    "EventName": "BUS_READ_TOTAL_PCI",
+    "BriefDescription": "This event counts read transactions from PCI controller to measured CMG."
+  },
+  {
+    "PublicDescription": "This event counts read transactions from measured CMG local memory to measured CMG.",
+    "EventCode": "0x316",
+    "EventName": "BUS_READ_TOTAL_MEM",
+    "BriefDescription": "This event counts read transactions from measured CMG local memory to measured CMG."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG0, if measured CMG is not CMG0.",
+    "EventCode": "0x318",
+    "EventName": "BUS_WRITE_TOTAL_CMG0",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG0, if measured CMG is not CMG0."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG1, if measured CMG is not CMG1.",
+    "EventCode": "0x319",
+    "EventName": "BUS_WRITE_TOTAL_CMG1",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG1, if measured CMG is not CMG1."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG2, if measured CMG is not CMG2.",
+    "EventCode": "0x31A",
+    "EventName": "BUS_WRITE_TOTAL_CMG2",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG2, if measured CMG is not CMG2."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG3, if measured CMG is not CMG3.",
+    "EventCode": "0x31B",
+    "EventName": "BUS_WRITE_TOTAL_CMG3",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG3, if measured CMG is not CMG3."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to tofu controller.",
+    "EventCode": "0x31C",
+    "EventName": "BUS_WRITE_TOTAL_TOFU",
+    "BriefDescription": "This event counts write transactions from measured CMG to tofu controller."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to PCI controller.",
+    "EventCode": "0x31D",
+    "EventName": "BUS_WRITE_TOTAL_PCI",
+    "BriefDescription": "This event counts write transactions from measured CMG to PCI controller."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to measured CMG local memory.",
+    "EventCode": "0x31E",
+    "EventName": "BUS_WRITE_TOTAL_MEM",
+    "BriefDescription": "This event counts write transactions from measured CMG to measured CMG local memory."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json
new file mode 100644
index 000000000000..2e341a951a10
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json
@@ -0,0 +1,128 @@
+[
+  {
+    "ArchStdEvent": "L1I_CACHE_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1I_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1D_CACHE_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1D_CACHE"
+  },
+  {
+    "ArchStdEvent": "L1D_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1I_CACHE"
+  },
+  {
+    "ArchStdEvent": "L1D_CACHE_WB"
+  },
+  {
+    "ArchStdEvent": "L2D_CACHE"
+  },
+  {
+    "ArchStdEvent": "L2D_CACHE_REFILL"
+  },
+  {
+    "ArchStdEvent": "L2D_CACHE_WB"
+  },
+  {
+    "ArchStdEvent": "L2D_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L2I_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L2D_TLB"
+  },
+  {
+    "ArchStdEvent": "L2I_TLB"
+  },
+  {
+    "PublicDescription": "This event counts L1D_CACHE_REFILL caused by software or hardware prefetch.",
+    "EventCode": "0x49",
+    "EventName": "L1D_CACHE_REFILL_PRF",
+    "BriefDescription": "This event counts L1D_CACHE_REFILL caused by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts L2D_CACHE_REFILL caused by software or hardware prefetch.",
+    "EventCode": "0x59",
+    "EventName": "L2D_CACHE_REFILL_PRF",
+    "BriefDescription": "This event counts L2D_CACHE_REFILL caused by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts L1D_CACHE_REFILL caused by demand access.",
+    "EventCode": "0x200",
+    "EventName": "L1D_CACHE_REFILL_DM",
+    "BriefDescription": "This event counts L1D_CACHE_REFILL caused by demand access."
+  },
+  {
+    "PublicDescription": "This event counts L1D_CACHE_REFILL caused by hardware prefetch.",
+    "EventCode": "0x202",
+    "EventName": "L1D_CACHE_REFILL_HWPRF",
+    "BriefDescription": "This event counts L1D_CACHE_REFILL caused by hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts outstanding L1D cache miss requests per cycle.",
+    "EventCode": "0x208",
+    "EventName": "L1_MISS_WAIT",
+    "BriefDescription": "This event counts outstanding L1D cache miss requests per cycle."
+  },
+  {
+    "PublicDescription": "This event counts outstanding L1I cache miss requests per cycle.",
+    "EventCode": "0x209",
+    "EventName": "L1I_MISS_WAIT",
+    "BriefDescription": "This event counts outstanding L1I cache miss requests per cycle."
+  },
+  {
+    "PublicDescription": "This event counts L2D_CACHE_REFILL caused by demand access.",
+    "EventCode": "0x300",
+    "EventName": "L2D_CACHE_REFILL_DM",
+    "BriefDescription": "This event counts L2D_CACHE_REFILL caused by demand access."
+  },
+  {
+    "PublicDescription": "This event counts L2D_CACHE_REFILL caused by hardware prefetch.",
+    "EventCode": "0x302",
+    "EventName": "L2D_CACHE_REFILL_HWPRF",
+    "BriefDescription": "This event counts L2D_CACHE_REFILL caused by hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts outstanding L2 cache miss requests per cycle.",
+    "EventCode": "0x308",
+    "EventName": "L2_MISS_WAIT",
+    "BriefDescription": "This event counts outstanding L2 cache miss requests per cycle."
+  },
+  {
+    "PublicDescription": "This event counts the number of times of L2 cache miss.",
+    "EventCode": "0x309",
+    "EventName": "L2_MISS_COUNT",
+    "BriefDescription": "This event counts the number of times of L2 cache miss."
+  },
+  {
+    "PublicDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch.",
+    "EventCode": "0x325",
+    "EventName": "L2D_SWAP_DM",
+    "BriefDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access.",
+    "EventCode": "0x326",
+    "EventName": "L2D_CACHE_MIBMCH_PRF",
+    "BriefDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access."
+  },
+  {
+    "PublicDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch.",
+    "EventCode": "0x396",
+    "EventName": "L2D_CACHE_SWAP_LOCAL",
+    "BriefDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts energy consumption per cycle of L2 cache.",
+    "EventCode": "0x3E0",
+    "EventName": "EA_L2",
+    "BriefDescription": "This event counts energy consumption per cycle of L2 cache."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json
new file mode 100644
index 000000000000..b16484628290
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json
@@ -0,0 +1,5 @@
+[
+  {
+    "ArchStdEvent": "CPU_CYCLES"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json
new file mode 100644
index 000000000000..348749c154c0
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json
@@ -0,0 +1,29 @@
+[
+  {
+    "ArchStdEvent": "EXC_TAKEN"
+  },
+  {
+    "ArchStdEvent": "EXC_UNDEF"
+  },
+  {
+    "ArchStdEvent": "EXC_SVC"
+  },
+  {
+    "ArchStdEvent": "EXC_PABORT"
+  },
+  {
+    "ArchStdEvent": "EXC_DABORT"
+  },
+  {
+    "ArchStdEvent": "EXC_IRQ"
+  },
+  {
+    "ArchStdEvent": "EXC_FIQ"
+  },
+  {
+    "ArchStdEvent": "EXC_SMC"
+  },
+  {
+    "ArchStdEvent": "EXC_HVC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json
new file mode 100644
index 000000000000..6d258b1080cf
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json
@@ -0,0 +1,131 @@
+[
+  {
+    "ArchStdEvent": "SW_INCR"
+  },
+  {
+    "ArchStdEvent": "INST_RETIRED"
+  },
+  {
+    "ArchStdEvent": "EXC_RETURN"
+  },
+  {
+    "ArchStdEvent": "CID_WRITE_RETIRED"
+  },
+  {
+    "ArchStdEvent": "INST_SPEC"
+  },
+  {
+    "ArchStdEvent": "LDREX_SPEC"
+  },
+  {
+    "ArchStdEvent": "STREX_SPEC"
+  },
+  {
+    "ArchStdEvent": "LD_SPEC"
+  },
+  {
+    "ArchStdEvent": "ST_SPEC"
+  },
+  {
+    "ArchStdEvent": "LDST_SPEC"
+  },
+  {
+    "ArchStdEvent": "DP_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SPEC"
+  },
+  {
+    "ArchStdEvent": "VFP_SPEC"
+  },
+  {
+    "ArchStdEvent": "PC_WRITE_SPEC"
+  },
+  {
+    "ArchStdEvent": "CRYPTO_SPEC"
+  },
+  {
+    "ArchStdEvent": "BR_IMMED_SPEC"
+  },
+  {
+    "ArchStdEvent": "BR_RETURN_SPEC"
+  },
+  {
+    "ArchStdEvent": "BR_INDIRECT_SPEC"
+  },
+  {
+    "ArchStdEvent": "ISB_SPEC"
+  },
+  {
+    "ArchStdEvent": "DSB_SPEC"
+  },
+  {
+    "ArchStdEvent": "DMB_SPEC"
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed zero blocking operations due to the 'DC ZVA' instruction.",
+    "EventCode": "0x9F",
+    "EventName": "DCZVA_SPEC",
+    "BriefDescription": "This event counts architecturally executed zero blocking operations due to the 'DC ZVA' instruction."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed floating-point move operations.",
+    "EventCode": "0x105",
+    "EventName": "FP_MV_SPEC",
+    "BriefDescription": "This event counts architecturally executed floating-point move operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed operations that using predicate register.",
+    "EventCode": "0x108",
+    "EventName": "PRD_SPEC",
+    "BriefDescription": "This event counts architecturally executed operations that using predicate register."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed inter-element manipulation operations.",
+    "EventCode": "0x109",
+    "EventName": "IEL_SPEC",
+    "BriefDescription": "This event counts architecturally executed inter-element manipulation operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed inter-register manipulation operations.",
+    "EventCode": "0x10A",
+    "EventName": "IREG_SPEC",
+    "BriefDescription": "This event counts architecturally executed inter-register manipulation operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed NOSIMD load operations that using SIMD&FP registers.",
+    "EventCode": "0x112",
+    "EventName": "FP_LD_SPEC",
+    "BriefDescription": "This event counts architecturally executed NOSIMD load operations that using SIMD&FP registers."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed NOSIMD store operations that using SIMD&FP registers.",
+    "EventCode": "0x113",
+    "EventName": "FP_ST_SPEC",
+    "BriefDescription": "This event counts architecturally executed NOSIMD store operations that using SIMD&FP registers."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed SIMD broadcast floating-point load operations.",
+    "EventCode": "0x11A",
+    "EventName": "BC_LD_SPEC",
+    "BriefDescription": "This event counts architecturally executed SIMD broadcast floating-point load operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed instructions, excluding the MOVPRFX instruction.",
+    "EventCode": "0x121",
+    "EventName": "EFFECTIVE_INST_SPEC",
+    "BriefDescription": "This event counts architecturally executed instructions, excluding the MOVPRFX instruction."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed operations that uses 'pre-index' as its addressing mode.",
+    "EventCode": "0x123",
+    "EventName": "PRE_INDEX_SPEC",
+    "BriefDescription": "This event counts architecturally executed operations that uses 'pre-index' as its addressing mode."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed operations that uses 'post-index' as its addressing mode.",
+    "EventCode": "0x124",
+    "EventName": "POST_INDEX_SPEC",
+    "BriefDescription": "This event counts architecturally executed operations that uses 'post-index' as its addressing mode."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json
new file mode 100644
index 000000000000..c1f6479e92b4
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json
@@ -0,0 +1,8 @@
+[
+  {
+    "PublicDescription": "This event counts energy consumption per cycle of CMG local memory.",
+    "EventCode": "0x3E8",
+    "EventName": "EA_MEMORY",
+    "BriefDescription": "This event counts energy consumption per cycle of CMG local memory."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json
new file mode 100644
index 000000000000..10c823ac26cc
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json
@@ -0,0 +1,188 @@
+[
+  {
+    "PublicDescription": "This event counts the occurrence count of the micro-operation split.",
+    "EventCode": "0x139",
+    "EventName": "UOP_SPLIT",
+    "BriefDescription": "This event counts the occurrence count of the micro-operation split."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no operation was committed because the oldest and uncommitted load/store/prefetch operation waits for memory access.",
+    "EventCode": "0x180",
+    "EventName": "LD_COMP_WAIT_L2_MISS",
+    "BriefDescription": "This event counts every cycle that no operation was committed because the oldest and uncommitted load/store/prefetch operation waits for memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for memory access.",
+    "EventCode": "0x181",
+    "EventName": "LD_COMP_WAIT_L2_MISS_EX",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L2 cache access.",
+    "EventCode": "0x182",
+    "EventName": "LD_COMP_WAIT_L1_MISS",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L2 cache access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L2 cache access.",
+    "EventCode": "0x183",
+    "EventName": "LD_COMP_WAIT_L1_MISS_EX",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L2 cache access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L1D cache, L2 cache and memory access.",
+    "EventCode": "0x184",
+    "EventName": "LD_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L1D cache, L2 cache and memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L1D cache, L2 cache and memory access.",
+    "EventCode": "0x185",
+    "EventName": "LD_COMP_WAIT_EX",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L1D cache, L2 cache and memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed due to the lack of an available prefetch port.",
+    "EventCode": "0x186",
+    "EventName": "LD_COMP_WAIT_PFP_BUSY",
+    "BriefDescription": "This event counts every cycle that no instruction was committed due to the lack of an available prefetch port."
+  },
+  {
+    "PublicDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by an integer load operation.",
+    "EventCode": "0x187",
+    "EventName": "LD_COMP_WAIT_PFP_BUSY_EX",
+    "BriefDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by an integer load operation."
+  },
+  {
+    "PublicDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by a software prefetch instruction.",
+    "EventCode": "0x188",
+    "EventName": "LD_COMP_WAIT_PFP_BUSY_SWPF",
+    "BriefDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by a software prefetch instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is an integer or floating-point/SIMD instruction.",
+    "EventCode": "0x189",
+    "EventName": "EU_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is an integer or floating-point/SIMD instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a floating-point/SIMD instruction.",
+    "EventCode": "0x18A",
+    "EventName": "FL_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a floating-point/SIMD instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a branch instruction.",
+    "EventCode": "0x18B",
+    "EventName": "BR_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a branch instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the CSE is empty.",
+    "EventCode": "0x18C",
+    "EventName": "ROB_EMPTY",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the CSE is empty."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the CSE is empty and the store port (SP) is full.",
+    "EventCode": "0x18D",
+    "EventName": "ROB_EMPTY_STQ_BUSY",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the CSE is empty and the store port (SP) is full."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that the instruction unit is halted by the WFE/WFI instruction.",
+    "EventCode": "0x18E",
+    "EventName": "WFE_WFI_CYCLE",
+    "BriefDescription": "This event counts every cycle that the instruction unit is halted by the WFE/WFI instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed, but counts at the time when commits MOVPRFX only.",
+    "EventCode": "0x190",
+    "EventName": "_0INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed, but counts at the time when commits MOVPRFX only."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that one instruction is committed.",
+    "EventCode": "0x191",
+    "EventName": "_1INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that one instruction is committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that two instructions are committed.",
+    "EventCode": "0x192",
+    "EventName": "_2INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that two instructions are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that three instructions are committed.",
+    "EventCode": "0x193",
+    "EventName": "_3INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that three instructions are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that four instructions are committed.",
+    "EventCode": "0x194",
+    "EventName": "_4INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that four instructions are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that only any micro-operations are committed.",
+    "EventCode": "0x198",
+    "EventName": "UOP_ONLY_COMMIT",
+    "BriefDescription": "This event counts every cycle that only any micro-operations are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that only the MOVPRFX instruction is committed.",
+    "EventCode": "0x199",
+    "EventName": "SINGLE_MOVPRFX_COMMIT",
+    "BriefDescription": "This event counts every cycle that only the MOVPRFX instruction is committed."
+  },
+  {
+    "PublicDescription": "This event counts energy consumption per cycle of core.",
+    "EventCode": "0x1E0",
+    "EventName": "EA_CORE",
+    "BriefDescription": "This event counts energy consumption per cycle of core."
+  },
+  {
+    "PublicDescription": "This event counts streaming prefetch requests to L1D cache generated by hardware prefetcher.",
+    "EventCode": "0x230",
+    "EventName": "L1HWPF_STREAM_PF",
+    "BriefDescription": "This event counts streaming prefetch requests to L1D cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts allocation type prefetch injection requests to L1D cache generated by hardware prefetcher.",
+    "EventCode": "0x231",
+    "EventName": "L1HWPF_INJ_ALLOC_PF",
+    "BriefDescription": "This event counts allocation type prefetch injection requests to L1D cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts non-allocation type prefetch injection requests to L1D cache generated by hardware prefetcher.",
+    "EventCode": "0x232",
+    "EventName": "L1HWPF_INJ_NOALLOC_PF",
+    "BriefDescription": "This event counts non-allocation type prefetch injection requests to L1D cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts streaming prefetch requests to L2 cache generated by hardware prefecher.",
+    "EventCode": "0x233",
+    "EventName": "L2HWPF_STREAM_PF",
+    "BriefDescription": "This event counts streaming prefetch requests to L2 cache generated by hardware prefecher."
+  },
+  {
+    "PublicDescription": "This event counts allocation type prefetch injection requests to L2 cache generated by hardware prefetcher.",
+    "EventCode": "0x234",
+    "EventName": "L2HWPF_INJ_ALLOC_PF",
+    "BriefDescription": "This event counts allocation type prefetch injection requests to L2 cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts non-allocation type prefetch injection requests to L2 cache generated by hardware prefetcher.",
+    "EventCode": "0x235",
+    "EventName": "L2HWPF_INJ_NOALLOC_PF",
+    "BriefDescription": "This event counts non-allocation type prefetch injection requests to L2 cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts prefetch requests to L2 cache generated by the other causes.",
+    "EventCode": "0x236",
+    "EventName": "L2HWPF_OTHER",
+    "BriefDescription": "This event counts prefetch requests to L2 cache generated by the other causes."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json
new file mode 100644
index 000000000000..dd7c97a9972b
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json
@@ -0,0 +1,194 @@
+[
+  {
+    "ArchStdEvent": "STALL_FRONTEND"
+  },
+  {
+    "ArchStdEvent": "STALL_BACKEND"
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EAGA pipeline.",
+    "EventCode": "0x1A0",
+    "EventName": "EAGA_VAL",
+    "BriefDescription": "This event counts valid cycles of EAGA pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EAGB pipeline.",
+    "EventCode": "0x1A1",
+    "EventName": "EAGB_VAL",
+    "BriefDescription": "This event counts valid cycles of EAGB pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EXA pipeline.",
+    "EventCode": "0x1A2",
+    "EventName": "EXA_VAL",
+    "BriefDescription": "This event counts valid cycles of EXA pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EXB pipeline.",
+    "EventCode": "0x1A3",
+    "EventName": "EXB_VAL",
+    "BriefDescription": "This event counts valid cycles of EXB pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of FLA pipeline.",
+    "EventCode": "0x1A4",
+    "EventName": "FLA_VAL",
+    "BriefDescription": "This event counts valid cycles of FLA pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of FLB pipeline.",
+    "EventCode": "0x1A5",
+    "EventName": "FLB_VAL",
+    "BriefDescription": "This event counts valid cycles of FLB pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of PRX pipeline.",
+    "EventCode": "0x1A6",
+    "EventName": "PRX_VAL",
+    "BriefDescription": "This event counts valid cycles of PRX pipeline."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in FLA pipeline, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x1B4",
+    "EventName": "FLA_VAL_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in FLA pipeline, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in FLB pipeline, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x1B5",
+    "EventName": "FLB_VAL_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in FLB pipeline, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L1D cache pipeline#0.",
+    "EventCode": "0x240",
+    "EventName": "L1_PIPE0_VAL",
+    "BriefDescription": "This event counts valid cycles of L1D cache pipeline#0."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L1D cache pipeline#1.",
+    "EventCode": "0x241",
+    "EventName": "L1_PIPE1_VAL",
+    "BriefDescription": "This event counts valid cycles of L1D cache pipeline#1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its sce bit of tagged address is 1.",
+    "EventCode": "0x250",
+    "EventName": "L1_PIPE0_VAL_IU_TAG_ADRS_SCE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its sce bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its pfe bit of tagged address is 1.",
+    "EventCode": "0x251",
+    "EventName": "L1_PIPE0_VAL_IU_TAG_ADRS_PFE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its pfe bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its sce bit of tagged address is 1.",
+    "EventCode": "0x252",
+    "EventName": "L1_PIPE1_VAL_IU_TAG_ADRS_SCE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its sce bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its pfe bit of tagged address is 1.",
+    "EventCode": "0x253",
+    "EventName": "L1_PIPE1_VAL_IU_TAG_ADRS_PFE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its pfe bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L1D cache pipeline#0.",
+    "EventCode": "0x260",
+    "EventName": "L1_PIPE0_COMP",
+    "BriefDescription": "This event counts completed requests in L1D cache pipeline#0."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L1D cache pipeline#1.",
+    "EventCode": "0x261",
+    "EventName": "L1_PIPE1_COMP",
+    "BriefDescription": "This event counts completed requests in L1D cache pipeline#1."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L1I cache pipeline.",
+    "EventCode": "0x268",
+    "EventName": "L1I_PIPE_COMP",
+    "BriefDescription": "This event counts completed requests in L1I cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L1I cache pipeline.",
+    "EventCode": "0x269",
+    "EventName": "L1I_PIPE_VAL",
+    "BriefDescription": "This event counts valid cycles of L1I cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts aborted requests in L1D pipelines that due to store-load interlock.",
+    "EventCode": "0x274",
+    "EventName": "L1_PIPE_ABORT_STLD_INTLK",
+    "BriefDescription": "This event counts aborted requests in L1D pipelines that due to store-load interlock."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its sector cache ID is not 0.",
+    "EventCode": "0x2A0",
+    "EventName": "L1_PIPE0_VAL_IU_NOT_SEC0",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its sector cache ID is not 0."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its sector cache ID is not 0.",
+    "EventCode": "0x2A1",
+    "EventName": "L1_PIPE1_VAL_IU_NOT_SEC0",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its sector cache ID is not 0."
+  },
+  {
+    "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 2 flows because 2 elements could not be combined.",
+    "EventCode": "0x2B0",
+    "EventName": "L1_PIPE_COMP_GATHER_2FLOW",
+    "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 2 flows because 2 elements could not be combined."
+  },
+  {
+    "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 1 flow because 2 elements could be combined.",
+    "EventCode": "0x2B1",
+    "EventName": "L1_PIPE_COMP_GATHER_1FLOW",
+    "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 1 flow because 2 elements could be combined."
+  },
+  {
+    "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 0 flow because both predicate values are 0.",
+    "EventCode": "0x2B2",
+    "EventName": "L1_PIPE_COMP_GATHER_0FLOW",
+    "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 0 flow because both predicate values are 0."
+  },
+  {
+    "PublicDescription": "This event counts the number of flows of the scatter instructions.",
+    "EventCode": "0x2B3",
+    "EventName": "L1_PIPE_COMP_SCATTER_1FLOW",
+    "BriefDescription": "This event counts the number of flows of the scatter instructions."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#0, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x2B8",
+    "EventName": "L1_PIPE0_COMP_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#0, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#1, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x2B9",
+    "EventName": "L1_PIPE1_COMP_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#1, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L2 cache pipeline.",
+    "EventCode": "0x330",
+    "EventName": "L2_PIPE_VAL",
+    "BriefDescription": "This event counts valid cycles of L2 cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L2 cache pipeline.",
+    "EventCode": "0x350",
+    "EventName": "L2_PIPE_COMP_ALL",
+    "BriefDescription": "This event counts completed requests in L2 cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access.",
+    "EventCode": "0x370",
+    "EventName": "L2_PIPE_COMP_PF_L2MIB_MCH",
+    "BriefDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json
new file mode 100644
index 000000000000..dc1b95e42c32
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json
@@ -0,0 +1,110 @@
+[
+  {
+    "ArchStdEvent": "SIMD_INST_RETIRED"
+  },
+  {
+    "ArchStdEvent": "SVE_INST_RETIRED"
+  },
+  {
+    "ArchStdEvent": "UOP_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_MATH_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_FMA_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_RECPE_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_CVT_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_INT_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_PRED_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_MOVPRFX_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_MOVPRFX_U_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_LD_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_ST_SPEC"
+  },
+  {
+    "ArchStdEvent": "PRF_SPEC"
+  },
+  {
+    "ArchStdEvent": "BASE_LD_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "BASE_ST_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LDR_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_STR_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LDR_PREG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_STR_PREG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_PRF_CONTIG_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_LD_MULTI_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_ST_MULTI_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LD_GATHER_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_ST_SCATTER_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_PRF_GATHER_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LDFF_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_FIXED_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_HP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_HP_FIXED_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SP_FIXED_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_DP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_DP_FIXED_OPS_SPEC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json
new file mode 100644
index 000000000000..dda8e59149d2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json
@@ -0,0 +1,233 @@
+[
+    {
+        "MetricExpr": "FETCH_BUBBLE / (4 * CPU_CYCLES)",
+        "PublicDescription": "Frontend bound L1 topdown metric",
+        "BriefDescription": "Frontend bound L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "frontend_bound"
+    },
+    {
+        "MetricExpr": "(INST_SPEC - INST_RETIRED) / (4 * CPU_CYCLES)",
+        "PublicDescription": "Bad Speculation L1 topdown metric",
+        "BriefDescription": "Bad Speculation L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "bad_speculation"
+    },
+    {
+        "MetricExpr": "INST_RETIRED / (CPU_CYCLES * 4)",
+        "PublicDescription": "Retiring L1 topdown metric",
+        "BriefDescription": "Retiring L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "retiring"
+    },
+    {
+        "MetricExpr": "1 - (frontend_bound + bad_speculation + retiring)",
+        "PublicDescription": "Backend Bound L1 topdown metric",
+        "BriefDescription": "Backend Bound L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "backend_bound"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x201d@ / CPU_CYCLES",
+        "PublicDescription": "Fetch latency bound L2 topdown metric",
+        "BriefDescription": "Fetch latency bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "fetch_latency_bound"
+    },
+    {
+        "MetricExpr": "frontend_bound - fetch_latency_bound",
+        "PublicDescription": "Fetch bandwidth bound L2 topdown metric",
+        "BriefDescription": "Fetch bandwidth bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "fetch_bandwidth_bound"
+    },
+    {
+        "MetricExpr": "(bad_speculation * BR_MIS_PRED) / (BR_MIS_PRED + armv8_pmuv3_0@event\\=0x2013@)",
+        "PublicDescription": "Branch mispredicts L2 topdown metric",
+        "BriefDescription": "Branch mispredicts L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "branch_mispredicts"
+    },
+    {
+        "MetricExpr": "bad_speculation - branch_mispredicts",
+        "PublicDescription": "Machine clears L2 topdown metric",
+        "BriefDescription": "Machine clears L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "machine_clears"
+    },
+    {
+        "MetricExpr": "(EXE_STALL_CYCLE - (MEM_STALL_ANYLOAD + armv8_pmuv3_0@event\\=0x7005@)) / CPU_CYCLES",
+        "PublicDescription": "Core bound L2 topdown metric",
+        "BriefDescription": "Core bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "core_bound"
+    },
+    {
+        "MetricExpr": "(MEM_STALL_ANYLOAD + armv8_pmuv3_0@event\\=0x7005@) / CPU_CYCLES",
+        "PublicDescription": "Memory bound L2 topdown metric",
+        "BriefDescription": "Memory bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "memory_bound"
+    },
+    {
+        "MetricExpr": "(((L2I_TLB - L2I_TLB_REFILL) * 15) + (L2I_TLB_REFILL * 100)) / CPU_CYCLES",
+        "PublicDescription": "Idle by itlb miss L3 topdown metric",
+        "BriefDescription": "Idle by itlb miss L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "idle_by_itlb_miss"
+    },
+    {
+        "MetricExpr": "(((L2I_CACHE - L2I_CACHE_REFILL) * 15) + (L2I_CACHE_REFILL * 100)) / CPU_CYCLES",
+        "PublicDescription": "Idle by icache miss L3 topdown metric",
+        "BriefDescription": "Idle by icache miss L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "idle_by_icache_miss"
+    },
+    {
+        "MetricExpr": "(BR_MIS_PRED * 5) / CPU_CYCLES",
+        "PublicDescription": "BP misp flush L3 topdown metric",
+        "BriefDescription": "BP misp flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "bp_misp_flush"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x2013@ * 5) / CPU_CYCLES",
+        "PublicDescription": "OOO flush L3 topdown metric",
+        "BriefDescription": "OOO flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "ooo_flush"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x1001@ * 5) / CPU_CYCLES",
+        "PublicDescription": "Static predictor flush L3 topdown metric",
+        "BriefDescription": "Static predictor flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "sp_flush"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x1010@ / BR_MIS_PRED",
+        "PublicDescription": "Indirect branch L3 topdown metric",
+        "BriefDescription": "Indirect branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "indirect_branch"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x1014@ + armv8_pmuv3_0@event\\=0x1018@) / BR_MIS_PRED",
+        "PublicDescription": "Push branch L3 topdown metric",
+        "BriefDescription": "Push branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "push_branch"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x100c@ / BR_MIS_PRED",
+        "PublicDescription": "Pop branch L3 topdown metric",
+        "BriefDescription": "Pop branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "pop_branch"
+    },
+    {
+        "MetricExpr": "(BR_MIS_PRED - armv8_pmuv3_0@event\\=0x1010@ - armv8_pmuv3_0@event\\=0x1014@ - armv8_pmuv3_0@event\\=0x1018@ - armv8_pmuv3_0@event\\=0x100c@) / BR_MIS_PRED",
+        "PublicDescription": "Other branch L3 topdown metric",
+        "BriefDescription": "Other branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "other_branch"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2012@ / armv8_pmuv3_0@event\\=0x2013@",
+        "PublicDescription": "Nuke flush L3 topdown metric",
+        "BriefDescription": "Nuke flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "nuke_flush"
+    },
+    {
+        "MetricExpr": "1 - nuke_flush",
+        "PublicDescription": "Other flush L3 topdown metric",
+        "BriefDescription": "Other flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "other_flush"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2010@ / CPU_CYCLES",
+        "PublicDescription": "Sync stall L3 topdown metric",
+        "BriefDescription": "Sync stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "sync_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2004@ / CPU_CYCLES",
+        "PublicDescription": "Rob stall L3 topdown metric",
+        "BriefDescription": "Rob stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "rob_stall"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x2006@ + armv8_pmuv3_0@event\\=0x2007@ + armv8_pmuv3_0@event\\=0x2008@) / CPU_CYCLES",
+        "PublicDescription": "Ptag stall L3 topdown metric",
+        "BriefDescription": "Ptag stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "ptag_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x201e@ / CPU_CYCLES",
+        "PublicDescription": "SaveOpQ stall L3 topdown metric",
+        "BriefDescription": "SaveOpQ stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "saveopq_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2005@ / CPU_CYCLES",
+        "PublicDescription": "PC buffer stall L3 topdown metric",
+        "BriefDescription": "PC buffer stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "pc_buffer_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x7002@ / CPU_CYCLES",
+        "PublicDescription": "Divider L3 topdown metric",
+        "BriefDescription": "Divider L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "divider"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x7003@ / CPU_CYCLES",
+        "PublicDescription": "FSU stall L3 topdown metric",
+        "BriefDescription": "FSU stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "fsu_stall"
+    },
+    {
+        "MetricExpr": "core_bound - divider - fsu_stall",
+        "PublicDescription": "EXE ports util L3 topdown metric",
+        "BriefDescription": "EXE ports util L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "exe_ports_util"
+    },
+    {
+        "MetricExpr": "(MEM_STALL_ANYLOAD - MEM_STALL_L1MISS) / CPU_CYCLES",
+        "PublicDescription": "L1 bound L3 topdown metric",
+        "BriefDescription": "L1 bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "l1_bound"
+    },
+    {
+        "MetricExpr": "(MEM_STALL_L1MISS - MEM_STALL_L2MISS) / CPU_CYCLES",
+        "PublicDescription": "L2 bound L3 topdown metric",
+        "BriefDescription": "L2 bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "l2_bound"
+    },
+    {
+        "MetricExpr": "MEM_STALL_L2MISS / CPU_CYCLES",
+        "PublicDescription": "Mem bound L3 topdown metric",
+        "BriefDescription": "Mem bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "mem_bound"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x7005@ / CPU_CYCLES",
+        "PublicDescription": "Store bound L3 topdown metric",
+        "BriefDescription": "Store bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "store_bound"
+    },
+]
diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv
index 0d609149b82a..c43591d831b8 100644
--- a/tools/perf/pmu-events/arch/arm64/mapfile.csv
+++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv
@@ -20,5 +20,6 @@
 0x00000000410fd0c0,v1,arm/cortex-a76-n1,core
 0x00000000420f5160,v1,cavium/thunderx2,core
 0x00000000430f0af0,v1,cavium/thunderx2,core
+0x00000000460f0010,v1,fujitsu/a64fx,core
 0x00000000480fd010,v1,hisilicon/hip08,core
 0x00000000500f0000,v1,ampere/emag,core
diff --git a/tools/perf/pmu-events/arch/powerpc/power8/metrics.json b/tools/perf/pmu-events/arch/powerpc/power8/metrics.json
index fc4aa6c2ddc9..4e25525b7da6 100644
--- a/tools/perf/pmu-events/arch/powerpc/power8/metrics.json
+++ b/tools/perf/pmu-events/arch/powerpc/power8/metrics.json
@@ -885,37 +885,37 @@
         "MetricName": "flush_rate_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (11 to 14) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (11 to 14) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_11_14_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_11to14_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (15 to 17) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (15 to 17) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_15_17_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_15to17_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization 18+ as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization 18+ as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_18_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_18plus_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (1 to 2) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (1 to 2) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_1_2_ENTRIES /  ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_1to2_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (3 to 6) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (3 to 6) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_3_6_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_3to6_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (7 to 10) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (7 to 10) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_7_10_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_7to10_slots_percent"
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/metrics.json b/tools/perf/pmu-events/arch/powerpc/power9/metrics.json
index f8784c608479..db86ba36224d 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/metrics.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/metrics.json
@@ -1210,156 +1210,24 @@
         "MetricName": "inst_from_rmem_percent"
     },
     {
-        "BriefDescription": "%L2 Modified CO Cache read Utilization (4 pclks per disp attempt)",
-        "MetricExpr": "((PM_L2_CASTOUT_MOD/2)*4)/ PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_co_m_rd_util"
-    },
-    {
-        "BriefDescription": "L2 dcache invalidates per run inst (per core)",
-        "MetricExpr": "(PM_L2_DC_INV / 2) / PM_RUN_INST_CMPL * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_dc_inv_rate_percent"
-    },
-    {
         "BriefDescription": "Demand load misses as a % of L2 LD dispatches (per thread)",
         "MetricExpr": "PM_L1_DCACHE_RELOAD_VALID / (PM_L2_LD / 2) * 100",
         "MetricGroup": "l2_stats",
         "MetricName": "l2_dem_ld_disp_percent"
     },
     {
-        "BriefDescription": "L2 Icache invalidates per run inst (per core)",
-        "MetricExpr": "(PM_L2_IC_INV / 2) / PM_RUN_INST_CMPL * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ic_inv_rate_percent"
-    },
-    {
-        "BriefDescription": "L2 Inst misses as a % of total L2 Inst dispatches (per thread)",
-        "MetricExpr": "PM_L2_INST_MISS / PM_L2_INST * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_inst_miss_ratio_percent"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L2 Load hits",
-        "MetricExpr": "(PM_L2_LD_HIT / PM_RUN_CYC) / 2",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_hit_frequency"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L2 Load misses",
-        "MetricExpr": "(PM_L2_LD_MISS / PM_RUN_CYC) / 2",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_miss_frequency"
-    },
-    {
-        "BriefDescription": "L2 Load misses as a % of total L2 Load dispatches (per thread)",
-        "MetricExpr": "PM_L2_LD_MISS / PM_L2_LD * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_miss_ratio_percent"
-    },
-    {
-        "BriefDescription": "% L2 load disp attempts Cache read Utilization (4 pclks per disp attempt)",
-        "MetricExpr": "((PM_L2_RCLD_DISP/2)*4)/ PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_rd_util"
-    },
-    {
-        "BriefDescription": "L2 load misses that require a cache write (4 pclks per disp attempt) % of pclks",
-        "MetricExpr": "((( PM_L2_LD_DISP - PM_L2_LD_HIT)/2)*4)/ PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ldmiss_wr_util"
-    },
-    {
-        "BriefDescription": "L2 local pump prediction success",
-        "MetricExpr": "PM_L2_LOC_GUESS_CORRECT / (PM_L2_LOC_GUESS_CORRECT + PM_L2_LOC_GUESS_WRONG) * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_local_pred_correct_percent"
-    },
-    {
-        "BriefDescription": "L2 COs that were in M,Me,Mu state as a % of all L2 COs",
-        "MetricExpr": "PM_L2_CASTOUT_MOD / (PM_L2_CASTOUT_MOD + PM_L2_CASTOUT_SHR) * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_mod_co_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Load RC dispatch atampts that failed because of address collisions and cclass conflicts",
-        "MetricExpr": "(PM_L2_RCLD_DISP_FAIL_ADDR )/ PM_L2_RCLD_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_ld_disp_addr_fail_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Load RC dispatch attempts that failed",
-        "MetricExpr": "(PM_L2_RCLD_DISP_FAIL_ADDR + PM_L2_RCLD_DISP_FAIL_OTHER)/ PM_L2_RCLD_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_ld_disp_fail_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Store RC dispatch atampts that failed because of address collisions and cclass conflicts",
-        "MetricExpr": "PM_L2_RCST_DISP_FAIL_ADDR / PM_L2_RCST_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_st_disp_addr_fail_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Store RC dispatch attempts that failed",
-        "MetricExpr": "(PM_L2_RCST_DISP_FAIL_ADDR + PM_L2_RCST_DISP_FAIL_OTHER)/ PM_L2_RCST_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_st_disp_fail_percent"
-    },
-    {
-        "BriefDescription": "L2 Cache Read Utilization (per core)",
-        "MetricExpr": "(((PM_L2_RCLD_DISP/2)*4)/ PM_RUN_CYC * 100) + (((PM_L2_RCST_DISP/2)*4)/PM_RUN_CYC * 100) + (((PM_L2_CASTOUT_MOD/2)*4)/PM_RUN_CYC * 100)",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rd_util_percent"
-    },
-    {
-        "BriefDescription": "L2 COs that were in T,Te,Si,S state as a % of all L2 COs",
-        "MetricExpr": "PM_L2_CASTOUT_SHR / (PM_L2_CASTOUT_MOD + PM_L2_CASTOUT_SHR) * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_shr_co_percent"
-    },
-    {
         "BriefDescription": "L2 Store misses as a % of total L2 Store dispatches (per thread)",
         "MetricExpr": "PM_L2_ST_MISS / PM_L2_ST * 100",
         "MetricGroup": "l2_stats",
         "MetricName": "l2_st_miss_ratio_percent"
     },
     {
-        "BriefDescription": "% L2 store disp attempts Cache read Utilization (4 pclks per disp attempt)",
-        "MetricExpr": "((PM_L2_RCST_DISP/2)*4) / PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_st_rd_util"
-    },
-    {
         "BriefDescription": "L2 stores that require a cache write (4 pclks per disp attempt) % of pclks",
         "MetricExpr": "((PM_L2_ST_DISP/2)*4) / PM_RUN_CYC * 100",
         "MetricGroup": "l2_stats",
         "MetricName": "l2_st_wr_util"
     },
     {
-        "BriefDescription": "L2 Cache Write Utilization (per core)",
-        "MetricExpr": "((((PM_L2_LD_DISP - PM_L2_LD_HIT)/2)*4) / PM_RUN_CYC * 100) + (((PM_L2_ST_DISP/2)*4) / PM_RUN_CYC * 100)",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_wr_util_percent"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L3 Load hits",
-        "MetricExpr": "(PM_L3_LD_HIT / PM_RUN_CYC) / 2",
-        "MetricGroup": "l3_stats",
-        "MetricName": "l3_ld_hit_frequency"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L3 Load misses",
-        "MetricExpr": "(PM_L3_LD_MISS / PM_RUN_CYC) / 2",
-        "MetricGroup": "l3_stats",
-        "MetricName": "l3_ld_miss_frequency"
-    },
-    {
-        "BriefDescription": "Average number of Write-in machines used. 1 of 8 WI machines is sampled every L3 cycle",
-        "MetricExpr": "(PM_L3_WI_USAGE / PM_RUN_CYC) * 8",
-        "MetricGroup": "l3_stats",
-        "MetricName": "l3_wi_usage"
-    },
-    {
         "BriefDescription": "Average icache miss latency",
         "MetricExpr": "PM_IC_DEMAND_CYC / PM_IC_DEMAND_REQ",
         "MetricGroup": "latency",
@@ -1823,7 +1691,7 @@
         "MetricName": "custom_secs"
     },
     {
-        "BriefDescription": "Percentage Cycles atleast one instruction dispatched",
+        "BriefDescription": "Percentage Cycles at least one instruction dispatched",
         "MetricExpr": "PM_1PLUS_PPC_DISP / PM_CYC * 100",
         "MetricName": "cycles_atleast_one_inst_dispatched_percent"
     },
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/cache.json b/tools/perf/pmu-events/arch/x86/amdzen1/cache.json
index 4ea7ec4f496e..0d46cb82bd52 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/cache.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/cache.json
@@ -38,31 +38,31 @@
     "EventName": "ic_fetch_stall.ic_stall_any",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_dq_empty",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_back_pressure",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ic_cache_inval.l2_invalidating_probe",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_cache_inval.fill_invalidated",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "bp_tlb_rel",
@@ -97,25 +97,25 @@
     "EventName": "l2_request_g1.change_to_x",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g1.prefetch_l2_cmd",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g1.l2_hw_pf",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g1.group2",
     "EventCode": "0x60",
     "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_request_g1.all_no_prefetch",
@@ -150,31 +150,31 @@
     "EventName": "l2_request_g2.ic_rd_sized_nc",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g2.smc_inval",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g2.bus_locks_originator",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g2.bus_locks_responses",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_latency.l2_cycles_waiting_on_fills",
     "EventCode": "0x62",
     "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_wcb_req.wcb_write",
@@ -192,13 +192,13 @@
     "EventName": "l2_wcb_req.zero_byte_store",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_wcb_req.cl_zero",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
@@ -228,37 +228,37 @@
     "EventName": "l2_cache_req_stat.ls_rd_blk_c",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_x",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_s",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit clean line in L2.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_miss",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ic_access_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).",
-    "UMask": "0x9"
+    "UMask": "0x09"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
@@ -270,12 +270,12 @@
     "EventName": "l2_fill_pending.l2_fill_busy",
     "EventCode": "0x6d",
     "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_pf_hit_l2",
     "EventCode": "0x70",
-    "BriefDescription": "L2 prefetch hit in L2.",
+    "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.",
     "UMask": "0xff"
   },
   {
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/core.json b/tools/perf/pmu-events/arch/x86/amdzen1/core.json
index 653b11b23399..4dceeabc4a9f 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/core.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/core.json
@@ -68,21 +68,21 @@
     "EventCode": "0xcb",
     "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.mmx_instr",
     "EventCode": "0xcb",
     "BriefDescription": "MMX instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.x87_instr",
     "EventCode": "0xcb",
     "BriefDescription": "x87 instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_cond",
@@ -103,19 +103,19 @@
     "EventName": "ex_tagged_ibs_ops.ibs_count_rollover",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_fus_brnch_inst",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json
index a35542bd3b36..3995b528ebd6 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json
@@ -39,35 +39,35 @@
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to all fpu pipes.",
     "PublicDescription": "The number of operations (uOps) and dual-pipe uOps dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to all pipes.",
-    "UMask": "0xf"
+    "UMask": "0x0f"
   },
   {
     "EventName": "fpu_pipe_assignment.total3",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 3.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fpu_pipe_assignment.total2",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 2.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fpu_pipe_assignment.total1",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 1.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fpu_pipe_assignment.total0",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps  on pipe 0.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_sched_empty",
@@ -79,28 +79,28 @@
     "EventCode": "0x02",
     "BriefDescription": "All Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8.",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "EventName": "fp_retx87_fp_ops.div_sqr_r_ops",
     "EventCode": "0x02",
     "BriefDescription": "Divide and square root Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Divide and square root Ops.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_retx87_fp_ops.mul_ops",
     "EventCode": "0x02",
     "BriefDescription": "Multiply Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Multiply Ops.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_retx87_fp_ops.add_sub_ops",
     "EventCode": "0x02",
     "BriefDescription": "Add/subtract Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Add/subtract Ops.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.all",
@@ -142,83 +142,83 @@
     "EventCode": "0x03",
     "BriefDescription": "Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.sp_div_flops",
     "EventCode": "0x03",
     "BriefDescription": "Single-precision divide/square root FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision divide/square root FLOPS.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.sp_mult_flops",
     "EventCode": "0x03",
     "BriefDescription": "Single-precision multiply FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision multiply FLOPS.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.sp_add_sub_flops",
     "EventCode": "0x03",
     "BriefDescription": "Single-precision add/subtract FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision add/subtract FLOPS.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.optimized",
     "EventCode": "0x04",
     "BriefDescription": "Number of Scalar Ops optimized.",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Scalar Ops optimized.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.opt_potential",
     "EventCode": "0x04",
     "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass).",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Ops that are candidates for optimization (have Z-bit either set or pass).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops eliminated.",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of SSE Move Ops eliminated.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops.",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of SSE Move Ops.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.",
     "PublicDescription": "The number of serializing Ops retired. x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 bottom-executing uOps retired.",
     "PublicDescription": "The number of serializing Ops retired. x87 bottom-executing uOps retired.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.",
     "PublicDescription": "The number of serializing Ops retired. SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "SSE bottom-executing uOps retired.",
     "PublicDescription": "The number of serializing Ops retired. SSE bottom-executing uOps retired.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/memory.json b/tools/perf/pmu-events/arch/x86/amdzen1/memory.json
index b33a3c308019..385022fb026e 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/memory.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/memory.json
@@ -3,25 +3,25 @@
     "EventName": "ls_locks.bus_lock",
     "EventCode": "0x25",
     "BriefDescription": "Bus lock when a locked operations crosses a cache boundary or is done on an uncacheable memory type.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_dispatch.ld_st_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Counts the number of operations dispatched to the LS unit. Unit Masks ADDed. Load-op-Stores.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_dispatch.store_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Counts the number of stores dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_dispatch.ld_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Counts the number of loads dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_stlf",
@@ -37,13 +37,13 @@
     "EventName": "ls_mab_alloc.dc_prefetcher",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB allocates by type - DC prefetcher.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_mab_alloc.stores",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB allocates by type - stores.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_mab_alloc.loads",
@@ -85,61 +85,61 @@
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 1G size.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 2M size.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_32k_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 32K size.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 4K size.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_tablewalker.iside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on I-side.",
-    "UMask": "0xc"
+    "UMask": "0x0c"
   },
   {
     "EventName": "ls_tablewalker.ic_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 1.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_tablewalker.ic_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 0.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_tablewalker.dside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on D-side.",
-    "UMask": "0x3"
+    "UMask": "0x03"
   },
   {
     "EventName": "ls_tablewalker.dc_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_tablewalker.dc_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_misal_accesses",
@@ -150,31 +150,31 @@
     "EventName": "ls_pref_instr_disp.prefetch_nta",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions (PREFETCHNTA instruction) Dispatched.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_pref_instr_disp.store_prefetch_w",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions (3DNow PREFETCHW instruction) Dispatched.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_pref_instr_disp.load_prefetch_w",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched. Prefetch, Prefetch_T0_T1_T2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_inef_sw_pref.mab_mch_cnt",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_not_halted_cyc",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/other.json b/tools/perf/pmu-events/arch/x86/amdzen1/other.json
index ff780098d36e..7626986ce1fb 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/other.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/other.json
@@ -3,13 +3,13 @@
     "EventName": "ic_oc_mode_switch.oc_ic_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. OC to IC mode switch.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_oc_mode_switch.ic_oc_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. IC to OC mode switch.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.retire_token_stall",
@@ -33,24 +33,24 @@
     "EventName": "de_dis_dispatch_token_stalls0.alsq3_0_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 3_0 Tokens unavailable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq3_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 3 Tokens unavailable.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq2_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 2 Tokens unavailable.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq1_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 1 Tokens unavailable.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json
index 2cfe2d2f3bfd..bf5083c1c260 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json
@@ -10,7 +10,7 @@
     "EventName": "all_dc_accesses",
     "EventCode": "0x29",
     "BriefDescription": "All L1 Data Cache Accesses",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "MetricName": "all_l2_cache_accesses",
@@ -79,10 +79,10 @@
     "UMask": "0x70"
   },
   {
-    "MetricName": "l2_cache_hits_from_l2_hwpf",
+    "EventName": "l2_cache_hits_from_l2_hwpf",
+    "EventCode": "0x70",
     "BriefDescription": "L2 Cache Hits from L2 HWPF",
-    "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
-    "MetricGroup": "l2_cache"
+    "UMask": "0xff"
   },
   {
     "EventName": "l3_accesses",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/branch.json b/tools/perf/pmu-events/arch/x86/amdzen2/branch.json
index ef4166a66288..84fb43fa59ad 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/branch.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/branch.json
@@ -24,25 +24,25 @@
     "EventName": "bp_l1_tlb_fetch_hit",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB.",
-    "UMask": "0xFF"
+    "UMask": "0xff"
   },
   {
     "EventName": "bp_l1_tlb_fetch_hit.if1g",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 1GB page.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "bp_l1_tlb_fetch_hit.if2m",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 2MB page.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "bp_l1_tlb_fetch_hit.if4k",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 4KB page.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "bp_tlb_rel",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json
index f61b982f83ca..c858fb9477e3 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json
@@ -27,25 +27,25 @@
     "EventName": "l2_request_g1.change_to_x",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g1.prefetch_l2_cmd",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g1.l2_hw_pf",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g1.group2",
     "EventCode": "0x60",
     "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_request_g1.all_no_prefetch",
@@ -80,31 +80,31 @@
     "EventName": "l2_request_g2.ic_rd_sized_nc",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g2.smc_inval",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g2.bus_locks_originator",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g2.bus_locks_responses",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_latency.l2_cycles_waiting_on_fills",
     "EventCode": "0x62",
     "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_wcb_req.wcb_write",
@@ -122,13 +122,13 @@
     "EventName": "l2_wcb_req.zero_byte_store",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_wcb_req.cl_zero",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
@@ -158,37 +158,37 @@
     "EventName": "l2_cache_req_stat.ls_rd_blk_c",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_x",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_s",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit clean line in L2.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_miss",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ic_access_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).",
-    "UMask": "0x9"
+    "UMask": "0x09"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
@@ -200,12 +200,12 @@
     "EventName": "l2_fill_pending.l2_fill_busy",
     "EventCode": "0x6d",
     "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_pf_hit_l2",
     "EventCode": "0x70",
-    "BriefDescription": "L2 prefetch hit in L2.",
+    "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.",
     "UMask": "0xff"
   },
   {
@@ -255,19 +255,19 @@
     "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g",
     "EventCode": "0x85",
     "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 1GB page.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m",
     "EventCode": "0x85",
     "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 2MB page.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k",
     "EventCode": "0x85",
     "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 4KB page.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "bp_snp_re_sync",
@@ -278,43 +278,43 @@
     "EventName": "ic_fetch_stall.ic_stall_any",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_dq_empty",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_back_pressure",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ic_cache_inval.l2_invalidating_probe",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_cache_inval.fill_invalidated",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ic_oc_mode_switch.oc_ic_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. OC to IC mode switch.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_oc_mode_switch.ic_oc_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. IC to OC mode switch.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l3_request_g1.caching_l3_cache_accesses",
@@ -353,7 +353,7 @@
   },
   {
     "EventName": "xi_ccx_sdp_req1.all_l3_miss_req_typs",
-    "EventCode": "0x9A",
+    "EventCode": "0x9a",
     "BriefDescription": "All L3 Miss Request Types. Ignores SliceMask and ThreadMask.",
     "UMask": "0x3f",
     "Unit": "L3PMC"
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/core.json b/tools/perf/pmu-events/arch/x86/amdzen2/core.json
index 4b75183da94a..bed14829f0bc 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/core.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/core.json
@@ -68,21 +68,21 @@
     "EventCode": "0xcb",
     "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.mmx_instr",
     "EventCode": "0xcb",
     "BriefDescription": "MMX instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.x87_instr",
     "EventCode": "0xcb",
     "BriefDescription": "x87 instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_cond",
@@ -108,19 +108,19 @@
     "EventName": "ex_tagged_ibs_ops.ibs_count_rollover",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_fus_brnch_inst",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json
index 622a0c420e46..91ed96f2580b 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json
@@ -4,35 +4,35 @@
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps.",
     "PublicDescription": "Total number of fp uOps. The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS.",
-    "UMask": "0xf"
+    "UMask": "0x0f"
   },
   {
     "EventName": "fpu_pipe_assignment.total3",
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to pipe 3.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fpu_pipe_assignment.total2",
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to pipe 2.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fpu_pipe_assignment.total1",
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to pipe 1.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fpu_pipe_assignment.total0",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps  on pipe 0.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.all",
@@ -45,96 +45,96 @@
     "EventCode": "0x03",
     "BriefDescription": "Multiply-add FLOPS. Multiply-add counts as 2 FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
     "PublicDescription": "",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.div_flops",
     "EventCode": "0x03",
     "BriefDescription": "Divide/square root FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.mult_flops",
     "EventCode": "0x03",
     "BriefDescription": "Multiply FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.add_sub_flops",
     "EventCode": "0x03",
     "BriefDescription": "Add/subtract FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.optimized",
     "EventCode": "0x04",
     "BriefDescription": "Number of Scalar Ops optimized. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.opt_potential",
     "EventCode": "0x04",
     "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass). This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops eliminated. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "SSE bottom-executing uOps retired. The number of serializing Ops retired.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "The number of serializing Ops retired. SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 bottom-executing uOps retired. The number of serializing Ops retired.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits. The number of serializing Ops retired.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_disp_faults.ymm_spill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. YMM spill fault.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_disp_faults.ymm_fill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. YMM fill fault.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_disp_faults.xmm_fill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. XMM fill fault.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_disp_faults.x87_fill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. x87 fill fault.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/memory.json b/tools/perf/pmu-events/arch/x86/amdzen2/memory.json
index 715046b339cb..89822b9ddb79 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/memory.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/memory.json
@@ -4,31 +4,31 @@
     "EventCode": "0x24",
     "BriefDescription": "Non-forwardable conflict; used to reduce STLI's via software. All reasons. Store To Load Interlock (STLI) are loads that were unable to complete because of a possible match with an older store, and the older store could not do STLF for some reason.",
     "PublicDescription" : "Store-to-load conflicts: A load was unable to complete due to a non-forwardable conflict with an older store. Most commonly, a load's address range partially but not completely overlaps with an uncompleted older store. Software can avoid this problem by using same-size and same-alignment loads and stores when accessing the same data. Vector/SIMD code is particularly susceptible to this problem; software should construct wide vector stores by manipulating vector elements in registers using shuffle/blend/swap instructions prior to storing to memory, instead of using narrow element-by-element stores.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_locks.spec_lock_hi_spec",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. High speculative cacheable lock speculation succeeded.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_locks.spec_lock_lo_spec",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. Low speculative cacheable lock speculation succeeded.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_locks.non_spec_lock",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. Non-speculative lock succeeded.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_locks.bus_lock",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. Bus lock when a locked operations crosses a cache boundary or is done on an uncacheable memory type. Comparable to legacy bus lock.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_ret_cl_flush",
@@ -44,33 +44,33 @@
     "EventName": "ls_dispatch.ld_st_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Dispatch of a single op that performs a load from and store to the same memory address. Number of single ops that do load/store to an address.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_dispatch.store_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Number of stores dispatched. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_dispatch.ld_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Number of loads dispatched. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_smi_rx",
-    "EventCode": "0x2B",
+    "EventCode": "0x2b",
     "BriefDescription": "Number of SMIs received."
   },
   {
     "EventName": "ls_int_taken",
-    "EventCode": "0x2C",
+    "EventCode": "0x2c",
     "BriefDescription": "Number of interrupts taken."
   },
   {
     "EventName": "ls_rdtsc",
-    "EventCode": "0x2D",
+    "EventCode": "0x2d",
     "BriefDescription": "Number of reads of the TSC (RDTSC instructions). The count is speculative."
   },
   {
@@ -93,19 +93,19 @@
     "EventName": "ls_mab_alloc.dc_prefetcher",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB Allocates by Type. DC prefetcher.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_mab_alloc.stores",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB Allocates by Type. Stores.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_mab_alloc.loads",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB Allocates by Type. Loads.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_refills_from_sys.ls_mabresp_rmt_dram",
@@ -123,19 +123,19 @@
     "EventName": "ls_refills_from_sys.ls_mabresp_lcl_dram",
     "EventCode": "0x43",
     "BriefDescription": "Demand Data Cache Fills by Data Source. DRAM or IO from this thread's die.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_refills_from_sys.ls_mabresp_lcl_cache",
     "EventCode": "0x43",
     "BriefDescription": "Demand Data Cache Fills by Data Source. Hit in cache; local CCX (not Local L2), or Remote CCX and the address's Home Node is on this thread's die.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_refills_from_sys.ls_mabresp_lcl_l2",
     "EventCode": "0x43",
     "BriefDescription": "Demand Data Cache Fills by Data Source. Local L2 hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.all",
@@ -171,61 +171,61 @@
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that hit in the L2 TLB.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that hit in the L2 TLB.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload hit a coalesced page.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that hit in the L2 TLB.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_tablewalker.iside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on I-side.",
-    "UMask": "0xc"
+    "UMask": "0x0c"
   },
   {
     "EventName": "ls_tablewalker.ic_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 1.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_tablewalker.ic_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 0.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_tablewalker.dside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on D-side.",
-    "UMask": "0x3"
+    "UMask": "0x03"
   },
   {
     "EventName": "ls_tablewalker.dc_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_tablewalker.dc_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_misal_accesses",
@@ -242,31 +242,31 @@
     "EventName": "ls_pref_instr_disp.prefetch_nta",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchNTA instruction. See docAPM3 PREFETCHlevel.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_pref_instr_disp.prefetch_w",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). See docAPM3 PREFETCHW.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_pref_instr_disp.prefetch",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). Prefetch_T0_T1_T2. PrefetchT0, T1 and T2 instructions. See docAPM3 PREFETCHlevel.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_inef_sw_pref.mab_mch_cnt",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_rmt_dram",
@@ -284,49 +284,49 @@
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_dram",
     "EventCode": "0x59",
     "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. DRAM or IO from this thread's die.  From DRAM (home node local).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_cache",
     "EventCode": "0x59",
     "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From another cache (home node local).",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_l2",
     "EventCode": "0x59",
     "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. Local L2 hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_rmt_dram",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM (home node remote).",
     "UMask": "0x40"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_rmt_cache",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From another cache (home node remote).",
     "UMask": "0x10"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_dram",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM (home node local).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_cache",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From another cache (home node local).",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_l2",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. Local L2 hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_not_halted_cyc",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/other.json b/tools/perf/pmu-events/arch/x86/amdzen2/other.json
index e94994d4a60e..1bdf106ca785 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/other.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/other.json
@@ -14,13 +14,13 @@
     "EventName": "de_dis_uops_from_decoder.opcache_dispatched",
     "EventCode": "0xaa",
     "BriefDescription": "Count of dispatched Ops from OpCache.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_uops_from_decoder.decoder_dispatched",
     "EventCode": "0xaa",
     "BriefDescription": "Count of dispatched Ops from Decoder.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.fp_misc_rsrc_stall",
@@ -50,25 +50,25 @@
     "EventName": "de_dis_dispatch_token_stalls1.int_sched_misc_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Scheduler miscellaneous resource stall.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.store_queue_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Store queue resource stall. Applies to all ops with store semantics.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.load_queue_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Load queue resource stall. Applies to all ops with load semantics.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.int_phy_reg_file_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Physical Register File resource stall. Applies to all ops that have an integer destination register.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.sc_agu_dispatch_stall",
@@ -92,24 +92,24 @@
     "EventName": "de_dis_dispatch_token_stalls0.alu_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALU tokens total unavailable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq3_0_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ3_0_TokenStall.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq2_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 2 Tokens unavailable.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq1_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 1 Tokens unavailable.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json
index 2ef91e25e661..a71694a043ba 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json
@@ -10,7 +10,7 @@
     "EventName": "all_dc_accesses",
     "EventCode": "0x29",
     "BriefDescription": "All L1 Data Cache Accesses",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "MetricName": "all_l2_cache_accesses",
@@ -79,10 +79,10 @@
     "UMask": "0x70"
   },
   {
-    "MetricName": "l2_cache_hits_from_l2_hwpf",
+    "EventName": "l2_cache_hits_from_l2_hwpf",
+    "EventCode": "0x70",
     "BriefDescription": "L2 Cache Hits from L2 HWPF",
-    "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
-    "MetricGroup": "l2_cache"
+    "UMask": "0xff"
   },
   {
     "EventName": "l3_accesses",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/branch.json b/tools/perf/pmu-events/arch/x86/amdzen3/branch.json
new file mode 100644
index 000000000000..018a7fe94fb9
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/branch.json
@@ -0,0 +1,53 @@
+[
+  {
+    "EventName": "bp_l1_btb_correct",
+    "EventCode": "0x8a",
+    "BriefDescription": "L1 Branch Prediction Overrides Existing Prediction (speculative)."
+  },
+  {
+    "EventName": "bp_l2_btb_correct",
+    "EventCode": "0x8b",
+    "BriefDescription": "L2 Branch Prediction Overrides Existing Prediction (speculative)."
+  },
+  {
+    "EventName": "bp_dyn_ind_pred",
+    "EventCode": "0x8e",
+    "BriefDescription": "Dynamic Indirect Predictions.",
+    "PublicDescription": "The number of times a branch used the indirect predictor to make a prediction."
+  },
+  {
+    "EventName": "bp_de_redirect",
+    "EventCode": "0x91",
+    "BriefDescription": "Decode Redirects",
+    "PublicDescription": "The number of times the instruction decoder overrides the predicted target."
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if1g",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instruction TLB hit (1G page size).",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if2m",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instruction TLB hit (2M page size).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if4k",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instrcution TLB hit (4K or 16K page size).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_tlb_rel",
+    "EventCode": "0x99",
+    "BriefDescription": "The number of ITLB reload requests."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/cache.json b/tools/perf/pmu-events/arch/x86/amdzen3/cache.json
new file mode 100644
index 000000000000..fa1d7499a2e3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/cache.json
@@ -0,0 +1,402 @@
+[
+  {
+    "EventName": "l2_request_g1.rd_blk_l",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache reads (including hardware and software prefetch).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_request_g1.rd_blk_x",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache stores.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g1.ls_rd_blk_c_s",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache shared reads.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g1.cacheable_ic_read",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Instruction cache reads.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_request_g1.change_to_x",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_request_g1.prefetch_l2_cmd",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_request_g1.l2_hw_pf",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_request_g1.group2",
+    "EventCode": "0x60",
+    "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_request_g1.all_no_prefetch",
+    "EventCode": "0x60",
+    "UMask": "0xf9"
+  },
+  {
+    "EventName": "l2_request_g2.group1",
+    "EventCode": "0x61",
+    "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g1 (PMCx060).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Data cache read sized.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized_nc",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Data cache read sized non-cacheable.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g2.ic_rd_sized",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_request_g2.ic_rd_sized_nc",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_request_g2.smc_inval",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_request_g2.bus_locks_originator",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_request_g2.bus_locks_responses",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_latency.l2_cycles_waiting_on_fills",
+    "EventCode": "0x62",
+    "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_wcb_req.wcb_write",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB write requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) write requests.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_wcb_req.wcb_close",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB close requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) close requests.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_wcb_req.zero_byte_store",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_wcb_req.cl_zero",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache shared read hit in L2",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache read hit in L2. Modifiable.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache read hit non-modifiable line in L2.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache store or state change hit in L2.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_c",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types). Use l2_cache_misses_from_dc_misses instead.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit non-modifiable line in L2.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_miss",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2. Use l2_cache_misses_from_ic_miss instead.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_access_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request hit in L2 and Data cache request hit in L2 (all types).",
+    "UMask": "0xf6"
+  },
+  {
+    "EventName": "l2_fill_pending.l2_fill_busy",
+    "EventCode": "0x6d",
+    "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_pf_hit_l2",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_hit_l3",
+    "EventCode": "0x71",
+    "BriefDescription": "L2 prefetcher hits in L3. Counts all L2 prefetches accepted by the L2 pipeline which miss the L2 cache and hit the L3.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_l3",
+    "EventCode": "0x72",
+    "BriefDescription": "L2 prefetcher misses in L3. Counts all L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3 caches.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ic_fw32",
+    "EventCode": "0x80",
+    "BriefDescription": "The number of 32B fetch windows transferred from IC pipe to DE instruction decoder (includes non-cacheable and cacheable fill responses)."
+  },
+  {
+    "EventName": "ic_fw32_miss",
+    "EventCode": "0x81",
+    "BriefDescription": "The number of 32B fetch windows tried to read the L1 IC and missed in the full tag."
+  },
+  {
+    "EventName": "ic_cache_fill_l2",
+    "EventCode": "0x82",
+    "BriefDescription": "Instruction Cache Refills from L2. The number of 64 byte instruction cache line was fulfilled from the L2 cache."
+  },
+  {
+    "EventName": "ic_cache_fill_sys",
+    "EventCode": "0x83",
+    "BriefDescription": "Instruction Cache Refills from System. The number of 64 byte instruction cache line fulfilled from system memory or another cache."
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_hit",
+    "EventCode": "0x84",
+    "BriefDescription": "L1 ITLB Miss, L2 ITLB Hit. The number of instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB."
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for >4K Coalesced page.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for 1G page.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for 2M page.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk to 4K page.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_snp_re_sync",
+    "EventCode": "0x86",
+    "BriefDescription": "The number of pipeline restarts caused by invalidating probes that hit on the instruction stream currently being executed. This would happen if the active instruction stream was being modified by another processor in an MP system - typically a highly unlikely event."
+  },
+  {
+    "EventName": "ic_fetch_stall.ic_stall_any",
+    "EventCode": "0x87",
+    "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ic_fetch_stall.ic_stall_dq_empty",
+    "EventCode": "0x87",
+    "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_fetch_stall.ic_stall_back_pressure",
+    "EventCode": "0x87",
+    "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ic_cache_inval.l2_invalidating_probe",
+    "EventCode": "0x8c",
+    "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_cache_inval.fill_invalidated",
+    "EventCode": "0x8c",
+    "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ic_tag_hit_miss.all_instruction_cache_accesses",
+    "EventCode": "0x18e",
+    "BriefDescription": "All Instruction Cache Accesses. Counts various IC tag related hit and miss events.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "ic_tag_hit_miss.instruction_cache_miss",
+    "EventCode": "0x18e",
+    "BriefDescription": "Instruction Cache Miss. Counts various IC tag related hit and miss events.",
+    "UMask": "0x18"
+  },
+  {
+    "EventName": "ic_tag_hit_miss.instruction_cache_hit",
+    "EventCode": "0x18e",
+    "BriefDescription": "Instruction Cache Hit. Counts various IC tag related hit and miss events.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "ic_oc_mode_switch.oc_ic_mode_switch",
+    "EventCode": "0x28a",
+    "BriefDescription": "OC Mode Switch. OC to IC mode switch.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_oc_mode_switch.ic_oc_mode_switch",
+    "EventCode": "0x28a",
+    "BriefDescription": "OC Mode Switch. IC to OC mode switch.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "op_cache_hit_miss.all_op_cache_accesses",
+    "EventCode": "0x28f",
+    "BriefDescription": "All Op Cache accesses. Counts Op Cache micro-tag hit/miss events",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "op_cache_hit_miss.op_cache_miss",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op Cache Miss. Counts Op Cache micro-tag hit/miss events",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "op_cache_hit_miss.op_cache_hit",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op Cache Hit. Counts Op Cache micro-tag hit/miss events",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "l3_request_g1.caching_l3_cache_accesses",
+    "EventCode": "0x01",
+    "BriefDescription": "Caching: L3 cache accesses",
+    "UMask": "0x80",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_lookup_state.all_l3_req_typs",
+    "EventCode": "0x04",
+    "BriefDescription": "All L3 Request Types. All L3 cache Requests",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_comb_clstr_state.other_l3_miss_typs",
+    "EventCode": "0x06",
+    "BriefDescription": "Other L3 Miss Request Types",
+    "UMask": "0xfe",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_comb_clstr_state.request_miss",
+    "EventCode": "0x06",
+    "BriefDescription": "L3 cache misses",
+    "UMask": "0x01",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "xi_sys_fill_latency",
+    "EventCode": "0x90",
+    "BriefDescription": "L3 Cache Miss Latency. Total cycles for all transactions divided by 16. Ignores SliceMask and ThreadMask.",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "xi_ccx_sdp_req1",
+    "EventCode": "0x9a",
+    "BriefDescription": "L3 Misses by Request Type. Ignores SliceID, EnAllSlices, CoreID, EnAllCores and ThreadMask. Requires unit mask 0xFF to engage event for counting.",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/core.json b/tools/perf/pmu-events/arch/x86/amdzen3/core.json
new file mode 100644
index 000000000000..4e27a2be359e
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/core.json
@@ -0,0 +1,137 @@
+[
+  {
+    "EventName": "ex_ret_instr",
+    "EventCode": "0xc0",
+    "BriefDescription": "Retired Instructions."
+  },
+  {
+    "EventName": "ex_ret_ops",
+    "EventCode": "0xc1",
+    "BriefDescription": "Retired Ops. Use macro_ops_retired instead.",
+    "PublicDescription": "The number of macro-ops retired."
+  },
+  {
+    "EventName": "ex_ret_brn",
+    "EventCode": "0xc2",
+    "BriefDescription": "Retired Branch Instructions.",
+    "PublicDescription": "The number of branch instructions retired. This includes all types of architectural control flow changes, including exceptions and interrupts."
+  },
+  {
+    "EventName": "ex_ret_brn_misp",
+    "EventCode": "0xc3",
+    "BriefDescription": "Retired Branch Instructions Mispredicted.",
+    "PublicDescription": "The number of retired branch instructions, that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn",
+    "EventCode": "0xc4",
+    "BriefDescription": "Retired Taken Branch Instructions.",
+    "PublicDescription": "The number of taken branches that were retired. This includes all types of architectural control flow changes, including exceptions and interrupts."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn_misp",
+    "EventCode": "0xc5",
+    "BriefDescription": "Retired Taken Branch Instructions Mispredicted.",
+    "PublicDescription": "The number of retired taken branch instructions that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_far",
+    "EventCode": "0xc6",
+    "BriefDescription": "Retired Far Control Transfers.",
+    "PublicDescription": "The number of far control transfers retired including far call/jump/return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts. Far control transfers are not subject to branch prediction."
+  },
+  {
+    "EventName": "ex_ret_brn_resync",
+    "EventCode": "0xc7",
+    "BriefDescription": "Retired Branch Resyncs.",
+    "PublicDescription": "The number of resync branches. These reflect pipeline restarts due to certain microcode assists and events such as writes to the active instruction stream, among other things. Each occurrence reflects a restart penalty similar to a branch mispredict. This is relatively rare."
+  },
+  {
+    "EventName": "ex_ret_near_ret",
+    "EventCode": "0xc8",
+    "BriefDescription": "Retired Near Returns.",
+    "PublicDescription": "The number of near return instructions (RET or RET Iw) retired."
+  },
+  {
+    "EventName": "ex_ret_near_ret_mispred",
+    "EventCode": "0xc9",
+    "BriefDescription": "Retired Near Returns Mispredicted.",
+    "PublicDescription": "The number of near returns retired that were not correctly predicted by the return address predictor. Each such mispredict incurs the same penalty as a mispredicted conditional branch instruction."
+  },
+  {
+    "EventName": "ex_ret_brn_ind_misp",
+    "EventCode": "0xca",
+    "BriefDescription": "Retired Indirect Branch Instructions Mispredicted.",
+    "PublicDescription": "The number of indirect branches retired that were not correctly predicted. Each such mispredict incurs the same penalty as a mispredicted conditional branch instruction. Note that only EX mispredicts are counted."
+  },
+  {
+    "EventName": "ex_ret_mmx_fp_instr.sse_instr",
+    "EventCode": "0xcb",
+    "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
+    "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_ret_mmx_fp_instr.mmx_instr",
+    "EventCode": "0xcb",
+    "BriefDescription": "MMX instructions.",
+    "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_ret_mmx_fp_instr.x87_instr",
+    "EventCode": "0xcb",
+    "BriefDescription": "x87 instructions.",
+    "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_ret_ind_brch_instr",
+    "EventCode": "0xcc",
+    "BriefDescription": "Retired Indirect Branch Instructions. The number of indirect branches retired."
+  },
+  {
+    "EventName": "ex_ret_cond",
+    "EventCode": "0xd1",
+    "BriefDescription": "Retired Conditional Branch Instructions."
+  },
+  {
+    "EventName": "ex_div_busy",
+    "EventCode": "0xd3",
+    "BriefDescription": "Div Cycles Busy count."
+  },
+  {
+    "EventName": "ex_div_count",
+    "EventCode": "0xd4",
+    "BriefDescription": "Div Op Count."
+  },
+  {
+    "EventName": "ex_ret_msprd_brnch_instr_dir_msmtch",
+    "EventCode": "0x1c7",
+    "BriefDescription": "Retired Mispredicted Branch Instructions due to Direction Mismatch",
+    "PublicDescription": "The number of retired conditional branch instructions that were not correctly predicted because of a branch direction mismatch."
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.ibs_count_rollover",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_ret_fused_instr",
+    "EventCode": "0x1d0",
+    "BriefDescription": "Counts retired Fused Instructions."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json b/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json
new file mode 100644
index 000000000000..40271df40015
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json
@@ -0,0 +1,98 @@
+[
+  {
+    "EventName": "remote_outbound_data_controller_0",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 0",
+    "EventCode": "0x7c7",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "remote_outbound_data_controller_1",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 1",
+    "EventCode": "0x807",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "remote_outbound_data_controller_2",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 2",
+    "EventCode": "0x847",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "remote_outbound_data_controller_3",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 3",
+    "EventCode": "0x887",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_0",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x07",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_1",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x47",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_2",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x87",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_3",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0xc7",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_4",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x107",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_5",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x147",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_6",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x187",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_7",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x1c7",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json
new file mode 100644
index 000000000000..98cfcb9c78ec
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json
@@ -0,0 +1,139 @@
+[
+  {
+    "EventName": "fpu_pipe_assignment.total",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number of fp uOps.",
+    "PublicDescription": "Total number of fp uOps. The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total3",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number uOps assigned to pipe 3.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total2",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number uOps assigned to pipe 2.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total1",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number uOps assigned to pipe 1.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total0",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number of fp uOps  on pipe 0.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.all",
+    "EventCode": "0x03",
+    "BriefDescription": "All FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mac_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Multiply-Accumulate FLOPs. Each MAC operation is counted as 2 FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.div_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Divide/square root FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mult_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Multiply FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.add_sub_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Add/subtract FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.optimized",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of Scalar Ops optimized. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.opt_potential",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass). This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of SSE Move Ops eliminated. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of SSE Move Ops. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.sse_bot_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "SSE/AVX bottom-executing ops retired. The number of serializing Ops retired.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.sse_ctrl_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "SSE/AVX control word mispredict traps. The number of serializing Ops retired.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.x87_bot_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "x87 bottom-executing ops retired. The number of serializing Ops retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.x87_ctrl_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits. The number of serializing Ops retired.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_spill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. YMM spill fault.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. YMM fill fault.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_disp_faults.xmm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. XMM fill fault.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_disp_faults.x87_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. x87 fill fault.",
+    "UMask": "0x01"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/memory.json b/tools/perf/pmu-events/arch/x86/amdzen3/memory.json
new file mode 100644
index 000000000000..a2833955dcd2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/memory.json
@@ -0,0 +1,428 @@
+[
+  {
+    "EventName": "ls_bad_status2.stli_other",
+    "EventCode": "0x24",
+    "BriefDescription": "Non-forwardable conflict; used to reduce STLI's via software. All reasons. Store To Load Interlock (STLI) are loads that were unable to complete because of a possible match with an older store, and the older store could not do STLF for some reason.",
+    "PublicDescription" : "Store-to-load conflicts: A load was unable to complete due to a non-forwardable conflict with an older store. Most commonly, a load's address range partially but not completely overlaps with an uncompleted older store. Software can avoid this problem by using same-size and same-alignment loads and stores when accessing the same data. Vector/SIMD code is particularly susceptible to this problem; software should construct wide vector stores by manipulating vector elements in registers using shuffle/blend/swap instructions prior to storing to memory, instead of using narrow element-by-element stores.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_locks.spec_lock_hi_spec",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. High speculative cacheable lock speculation succeeded.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_locks.spec_lock_lo_spec",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. Low speculative cacheable lock speculation succeeded.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_locks.non_spec_lock",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. Non-speculative lock succeeded.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_locks.bus_lock",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. Comparable to legacy bus lock.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_ret_cl_flush",
+    "EventCode": "0x26",
+    "BriefDescription": "The number of retired CLFLUSH instructions. This is a non-speculative event."
+  },
+  {
+    "EventName": "ls_ret_cpuid",
+    "EventCode": "0x27",
+    "BriefDescription": "The number of CPUID instructions retired."
+  },
+  {
+    "EventName": "ls_dispatch.ld_st_dispatch",
+    "EventCode": "0x29",
+    "BriefDescription": "Load-op-Store Dispatch. Dispatch of a single op that performs a load from and store to the same memory address. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dispatch.store_dispatch",
+    "EventCode": "0x29",
+    "BriefDescription": "Dispatch of a single op that performs a memory store. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dispatch.ld_dispatch",
+    "EventCode": "0x29",
+    "BriefDescription": "Dispatch of a single op that performs a memory load. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_smi_rx",
+    "EventCode": "0x2b",
+    "BriefDescription": "Counts the number of SMIs received."
+  },
+  {
+    "EventName": "ls_int_taken",
+    "EventCode": "0x2c",
+    "BriefDescription": "Counts the number of interrupts taken."
+  },
+  {
+    "EventName": "ls_rdtsc",
+    "EventCode": "0x2d",
+    "BriefDescription": "Number of reads of the TSC (RDTSC instructions). The count is speculative."
+  },
+  {
+    "EventName": "ls_stlf",
+    "EventCode": "0x35",
+    "BriefDescription": "Number of STLF hits."
+  },
+  {
+    "EventName": "ls_st_commit_cancel2.st_commit_cancel_wcb_full",
+    "EventCode": "0x37",
+    "BriefDescription": "A non-cacheable store and the non-cacheable commit buffer is full.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dc_accesses",
+    "EventCode": "0x40",
+    "BriefDescription": "Number of accesses to the dcache for load/store references.",
+    "PublicDescription": "The number of accesses to the data cache for load and store references. This may include certain microcode scratchpad accesses, although these are generally rare. Each increment represents an eight-byte access, although the instruction may only be accessing a portion of that. This event is a speculative event."
+  },
+  {
+    "EventName": "ls_mab_alloc.all_allocations",
+    "EventCode": "0x41",
+    "BriefDescription": "All Allocations. Counts when a LS pipe allocates a MAB entry.",
+    "UMask": "0x7f"
+  },
+  {
+    "EventName": "ls_mab_alloc.hardware_prefetcher_allocations",
+    "EventCode": "0x41",
+    "BriefDescription": "Hardware Prefetcher Allocations. Counts when a LS pipe allocates a MAB entry.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_mab_alloc.load_store_allocations",
+    "EventCode": "0x41",
+    "BriefDescription": "Load Store Allocations. Counts when a LS pipe allocates a MAB entry.",
+    "UMask": "0x3f"
+  },
+  {
+    "EventName": "ls_mab_alloc.dc_prefetcher",
+    "EventCode": "0x41",
+    "BriefDescription": "LS MAB Allocates by Type. DC prefetcher.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_mab_alloc.stores",
+    "EventCode": "0x41",
+    "BriefDescription": "LS MAB Allocates by Type. Stores.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_mab_alloc.loads",
+    "EventCode": "0x41",
+    "BriefDescription": "LS MAB Allocates by Type. Loads.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.mem_io_remote",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.ext_cache_remote",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.mem_io_local",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.ext_cache_local",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.int_cache",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.lcl_l2",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.mem_io_remote",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.ext_cache_remote",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.mem_io_local",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.ext_cache_local",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.int_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.lcl_l2",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.all",
+    "EventCode": "0x45",
+    "BriefDescription": "All L1 DTLB Misses or Reloads. Use l1_dtlb_misses instead.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that also missed in the L2 TLB.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that also missed in the L2 TLB.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload coalesced page that also missed in the L2 TLB.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that missed the L2 TLB.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that hit in the L2 TLB.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that hit in the L2 TLB.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a coalesced page that hit in the L2 TLB.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that hit in the L2 TLB.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_tablewalker.iside",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks on I-side.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "ls_tablewalker.ic_type1",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks IC Type 1.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_tablewalker.ic_type0",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks IC Type 0.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_tablewalker.dside",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks on D-side.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_tablewalker.dc_type1",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks DC Type 1.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_tablewalker.dc_type0",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks DC Type 0.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_misal_loads.ma4k",
+    "EventCode": "0x47",
+    "BriefDescription": "The number of 4KB misaligned (i.e., page crossing) loads.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_misal_loads.ma64",
+    "EventCode": "0x47",
+    "BriefDescription": "The number of 64B misaligned (i.e., cacheline crossing) loads.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_pref_instr_disp",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative).",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_nta",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchNTA instruction. See docAPM3 PREFETCHlevel.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_w",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchW instruction. See docAPM3 PREFETCHW.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchT0, T1 and T2 instructions. See docAPM3 PREFETCHlevel.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.mab_mch_cnt",
+    "EventCode": "0x52",
+    "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit",
+    "EventCode": "0x52",
+    "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.mem_io_remote",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.ext_cache_remote",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.mem_io_local",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.ext_cache_local",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.int_cache",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.lcl_l2",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.mem_io_remote",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.ext_cache_remote",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.mem_io_local",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.ext_cache_local",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.int_cache",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.lcl_l2",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_alloc_mab_count",
+    "EventCode": "0x5f",
+    "BriefDescription": "Count of Allocated Mabs",
+    "PublicDescription": "This event counts the in-flight L1 data cache misses (allocated Miss Address Buffers) divided by 4 and rounded down each cycle unless used with the MergeEvent functionality. If the MergeEvent is used, it counts the exact number of outstanding L1 data cache misses. See 2.1.17.3 [Large Increment per Cycle Events]."
+  },
+  {
+    "EventName": "ls_not_halted_cyc",
+    "EventCode": "0x76",
+    "BriefDescription": "Cycles not in Halt."
+  },
+  {
+    "EventName": "ls_tlb_flush.all_tlb_flushes",
+    "EventCode": "0x78",
+    "BriefDescription": "All TLB Flushes. Requires unit mask 0xFF to engage event for counting. Use all_tlbs_flushed instead",
+    "UMask": "0xff"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/other.json b/tools/perf/pmu-events/arch/x86/amdzen3/other.json
new file mode 100644
index 000000000000..7da5d0791ea3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/other.json
@@ -0,0 +1,103 @@
+[
+  {
+    "EventName": "de_dis_uop_queue_empty_di0",
+    "EventCode": "0xa9",
+    "BriefDescription": "Cycles where the Micro-Op Queue is empty."
+  },
+  {
+    "EventName": "de_dis_cops_from_decoder.disp_op_type.any_integer_dispatch",
+    "EventCode": "0xab",
+    "BriefDescription": "Any Integer dispatch. Types of Oops Dispatched from Decoder.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dis_cops_from_decoder.disp_op_type.any_fp_dispatch",
+    "EventCode": "0xab",
+    "BriefDescription": "Any FP dispatch. Types of Oops Dispatched from Decoder.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.fp_flush_recovery_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. FP Flush recovery stall.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.fp_sch_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. FP scheduler resource stall. Applies to ops that use the FP scheduler.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.fp_reg_file_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Floating point register file resource stall. Applies to all FP ops that have a destination register.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.taken_brnch_buffer_rsrc",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Taken branch buffer resource stall.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.int_sched_misc_token_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Scheduler miscellaneous resource stall.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.store_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Store Queue resource stall. Applies to all ops with store semantics.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.load_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Load Queue resource stall. Applies to all ops with load semantics.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.int_phy_reg_file_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Integer Physical Register File resource stall. Integer Physical Register File, applies to all ops that have an integer destination register.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.retire_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Insufficient Retire Queue tokens available.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.agsq_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. AGSQ Tokens unavailable.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch3_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 3 available.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch2_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 2 available.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch1_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 1 available.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch0_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 0 available.",
+    "UMask": "0x01"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json
new file mode 100644
index 000000000000..988cf68ae825
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json
@@ -0,0 +1,214 @@
+[
+  {
+    "MetricName": "branch_misprediction_ratio",
+    "BriefDescription": "Execution-Time Branch Misprediction Ratio (Non-Speculative)",
+    "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)",
+    "MetricGroup": "branch_prediction",
+    "ScaleUnit": "100%"
+  },
+  {
+    "EventName": "all_data_cache_accesses",
+    "EventCode": "0x29",
+    "BriefDescription": "All L1 Data Cache Accesses",
+    "UMask": "0x07"
+  },
+  {
+    "MetricName": "all_l2_cache_accesses",
+    "BriefDescription": "All L2 Cache Accesses",
+    "MetricExpr": "l2_request_g1.all_no_prefetch + l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "EventName": "l2_cache_accesses_from_ic_misses",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 Cache Accesses from L1 Instruction Cache Misses (including prefetch)",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_cache_accesses_from_dc_misses",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 Cache Accesses from L1 Data Cache Misses (including prefetch)",
+    "UMask": "0xe8"
+  },
+  {
+    "MetricName": "l2_cache_accesses_from_l2_hwpf",
+    "BriefDescription": "L2 Cache Accesses from L2 HWPF",
+    "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "MetricName": "all_l2_cache_misses",
+    "BriefDescription": "All L2 Cache Misses",
+    "MetricExpr": "l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "EventName": "l2_cache_misses_from_ic_miss",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Misses from L1 Instruction Cache Misses",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_misses_from_dc_misses",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Misses from L1 Data Cache Misses",
+    "UMask": "0x08"
+  },
+  {
+    "MetricName": "l2_cache_misses_from_l2_hwpf",
+    "BriefDescription": "L2 Cache Misses from L2 Cache HWPF",
+    "MetricExpr": "l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "MetricName": "all_l2_cache_hits",
+    "BriefDescription": "All L2 Cache Hits",
+    "MetricExpr": "l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "EventName": "l2_cache_hits_from_ic_misses",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Hits from L1 Instruction Cache Misses",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "l2_cache_hits_from_dc_misses",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Hits from L1 Data Cache Misses",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "l2_cache_hits_from_l2_hwpf",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 Cache Hits from L2 Cache HWPF",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l3_cache_accesses",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 Cache Accesses",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_misses",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 Misses (includes cacheline state change requests)",
+    "UMask": "0x01",
+    "Unit": "L3PMC"
+  },
+  {
+    "MetricName": "l3_read_miss_latency",
+    "BriefDescription": "Average L3 Read Miss Latency (in core clocks)",
+    "MetricExpr": "(xi_sys_fill_latency * 16) / xi_ccx_sdp_req1",
+    "MetricGroup": "l3_cache",
+    "ScaleUnit": "1core clocks"
+  },
+  {
+    "MetricName": "op_cache_fetch_miss_ratio",
+    "BriefDescription": "Op Cache (64B) Fetch Miss Ratio",
+    "MetricExpr": "d_ratio(op_cache_hit_miss.op_cache_miss, op_cache_hit_miss.all_op_cache_accesses)",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "MetricName": "ic_fetch_miss_ratio",
+    "BriefDescription": "Instruction Cache (32B) Fetch Miss Ratio",
+    "MetricExpr": "d_ratio(ic_tag_hit_miss.instruction_cache_miss, ic_tag_hit_miss.all_instruction_cache_accesses)",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "100%"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_memory",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From Memory",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_remote_node",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From Remote Node",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_within_same_ccx",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From within same CCX",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_external_ccx_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From External CCX Cache",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "l1_data_cache_fills_all",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: All",
+    "UMask": "0xff"
+  },
+  {
+    "MetricName": "l1_itlb_misses",
+    "BriefDescription": "L1 ITLB Misses",
+    "MetricExpr": "bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss",
+    "MetricGroup": "tlb"
+  },
+  {
+    "EventName": "l2_itlb_misses",
+    "EventCode": "0x85",
+    "BriefDescription": "L2 ITLB Misses & Instruction page walks",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "l1_dtlb_misses",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Misses",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_dtlb_misses",
+    "EventCode": "0x45",
+    "BriefDescription": "L2 DTLB Misses & Data page walks",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "all_tlbs_flushed",
+    "EventCode": "0x78",
+    "BriefDescription": "All TLBs Flushed",
+    "UMask": "0xff"
+  },
+  {
+    "MetricName": "macro_ops_dispatched",
+    "BriefDescription": "Macro-ops Dispatched",
+    "MetricExpr": "de_dis_cops_from_decoder.disp_op_type.any_integer_dispatch + de_dis_cops_from_decoder.disp_op_type.any_fp_dispatch",
+    "MetricGroup": "decoder"
+  },
+  {
+    "EventName": "sse_avx_stalls",
+    "EventCode": "0x0e",
+    "BriefDescription": "Mixed SSE/AVX Stalls",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "macro_ops_retired",
+    "EventCode": "0xc1",
+    "BriefDescription": "Macro-ops Retired"
+  },
+  {
+    "MetricName": "all_remote_links_outbound",
+    "BriefDescription": "Approximate: Outbound data bytes for all Remote Links for a node (die)",
+    "MetricExpr": "remote_outbound_data_controller_0 + remote_outbound_data_controller_1 + remote_outbound_data_controller_2 + remote_outbound_data_controller_3",
+    "MetricGroup": "data_fabric",
+    "PerPkg": "1",
+    "ScaleUnit": "3e-5MiB"
+  },
+  {
+    "MetricName": "nps1_die_to_dram",
+    "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die) (may need --metric-no-group)",
+    "MetricExpr": "dram_channel_data_controller_0 + dram_channel_data_controller_1 + dram_channel_data_controller_2 + dram_channel_data_controller_3 + dram_channel_data_controller_4 + dram_channel_data_controller_5 + dram_channel_data_controller_6 + dram_channel_data_controller_7",
+    "MetricGroup": "data_fabric",
+    "PerPkg": "1",
+    "ScaleUnit": "6.1e-5MiB"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 2f2a209e87e1..9f97b32533a0 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -38,4 +38,4 @@ GenuineIntel-6-7E,v1,icelake,core
 GenuineIntel-6-86,v1,tremontx,core
 AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core
 AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core
-AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen2,core
+AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen3,core
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index e1f3f5c8c550..33aa3c885eaf 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -1149,7 +1149,7 @@ static int process_one_file(const char *fpath, const struct stat *sb,
 	 * and directory tree could result in build failure due to table
 	 * names not being found.
 	 *
-	 * Atleast for now, be strict with processing JSON file names.
+	 * At least for now, be strict with processing JSON file names.
 	 * i.e. if JSON file name cannot be mapped to C-style table name,
 	 * fail.
 	 */
diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py
index ea0c8b90a783..a0cfc7fe5908 100644
--- a/tools/perf/scripts/python/netdev-times.py
+++ b/tools/perf/scripts/python/netdev-times.py
@@ -356,7 +356,7 @@ def handle_irq_softirq_exit(event_info):
 		return
 	rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time,
 			'irq_list':irq_list, 'event_list':event_list}
-	# merge information realted to a NET_RX softirq
+	# merge information related to a NET_RX softirq
 	receive_hunk_list.append(rec_data)
 
 def handle_napi_poll(event_info):
diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c
index cc9fbcedb364..ef37353636d8 100644
--- a/tools/perf/tests/bp_signal.c
+++ b/tools/perf/tests/bp_signal.c
@@ -225,11 +225,11 @@ int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused
 	 *
 	 * The test case check following error conditions:
 	 * - we get stuck in signal handler because of debug
-	 *   exception being triggered receursively due to
+	 *   exception being triggered recursively due to
 	 *   the wrong RF EFLAG management
 	 *
 	 * - we never trigger the sig_handler breakpoint due
-	 *   to the rong RF EFLAG management
+	 *   to the wrong RF EFLAG management
 	 *
 	 */
 
@@ -242,7 +242,7 @@ int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused
 	ioctl(fd3, PERF_EVENT_IOC_ENABLE, 0);
 
 	/*
-	 * Kick off the test by trigering 'fd1'
+	 * Kick off the test by triggering 'fd1'
 	 * breakpoint.
 	 */
 	test_function();
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index 2fdc7b2f996e..9866cddebf23 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -658,7 +658,7 @@ static int do_test_code_reading(bool try_kcore)
 				/*
 				 * Both cpus and threads are now owned by evlist
 				 * and will be freed by following perf_evlist__set_maps
-				 * call. Getting refference to keep them alive.
+				 * call. Getting reference to keep them alive.
 				 */
 				perf_cpu_map__get(cpus);
 				perf_thread_map__get(threads);
diff --git a/tools/perf/tests/demangle-ocaml-test.c b/tools/perf/tests/demangle-ocaml-test.c
index a273ed5163d7..0043be812355 100644
--- a/tools/perf/tests/demangle-ocaml-test.c
+++ b/tools/perf/tests/demangle-ocaml-test.c
@@ -19,14 +19,14 @@ int test__demangle_ocaml(struct test *test __maybe_unused, int subtest __maybe_u
 		{ "main",
 		  NULL },
 		{ "camlStdlib__array__map_154",
-		  "Stdlib.array.map" },
+		  "Stdlib.array.map_154" },
 		{ "camlStdlib__anon_fn$5bstdlib$2eml$3a334$2c0$2d$2d54$5d_1453",
-		  "Stdlib.anon_fn[stdlib.ml:334,0--54]" },
+		  "Stdlib.anon_fn[stdlib.ml:334,0--54]_1453" },
 		{ "camlStdlib__bytes__$2b$2b_2205",
-		  "Stdlib.bytes.++" },
+		  "Stdlib.bytes.++_2205" },
 	};
 
-	for (i = 0; i < sizeof(test_cases) / sizeof(test_cases[0]); i++) {
+	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
 		buf = ocaml_demangle_sym(test_cases[i].mangled);
 		if ((buf == NULL && test_cases[i].demangled != NULL)
 				|| (buf != NULL && test_cases[i].demangled == NULL)
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c
index 3f2e1a581247..890cb1f5bf53 100644
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -47,7 +47,7 @@ static struct sample fake_samples[] = {
 };
 
 /*
- * Will be casted to struct ip_callchain which has all 64 bit entries
+ * Will be cast to struct ip_callchain which has all 64 bit entries
  * of nr and ips[].
  */
 static u64 fake_callchains[][10] = {
@@ -297,7 +297,7 @@ out:
 	return err;
 }
 
-/* callcain + NO children */
+/* callchain + NO children */
 static int test2(struct evsel *evsel, struct machine *machine)
 {
 	int err;
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index a7f6661e6112..026c54743311 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -20,7 +20,7 @@
 
 #if defined(__s390x__)
 /* Return true if kvm module is available and loaded. Test this
- * and retun success when trace point kvm_s390_create_vm
+ * and return success when trace point kvm_s390_create_vm
  * exists. Otherwise this test always fails.
  */
 static bool kvm_s390_create_vm_valid(void)
diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c
index 6dc1db1626ad..4968c4106254 100644
--- a/tools/perf/tests/parse-metric.c
+++ b/tools/perf/tests/parse-metric.c
@@ -98,7 +98,7 @@ static u64 find_value(const char *name, struct value *values)
 		if (!strcmp(name, v->event))
 			return v->val;
 		v++;
-	};
+	}
 	return 0;
 }
 
@@ -186,7 +186,7 @@ static int __compute_metric(const char *name, struct value *vals,
 		*ratio2 = compute_single(&metric_events, evlist, &st, name2);
 
 out:
-	/* ... clenup. */
+	/* ... cleanup. */
 	metricgroup__rblist_exit(&metric_events);
 	runtime_stat__exit(&st);
 	evlist__free_stats(evlist);
diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c
index 0ca6a5a53523..b8aff8fb50d8 100644
--- a/tools/perf/tests/pmu-events.c
+++ b/tools/perf/tests/pmu-events.c
@@ -12,6 +12,7 @@
 #include "util/evlist.h"
 #include "util/expr.h"
 #include "util/parse-events.h"
+#include "metricgroup.h"
 
 struct perf_pmu_test_event {
 	/* used for matching against events from generated pmu-events.c */
@@ -471,9 +472,74 @@ static void expr_failure(const char *msg,
 	pr_debug("On expression %s\n", pe->metric_expr);
 }
 
+struct metric {
+	struct list_head list;
+	struct metric_ref metric_ref;
+};
+
+static int resolve_metric_simple(struct expr_parse_ctx *pctx,
+				 struct list_head *compound_list,
+				 struct pmu_events_map *map,
+				 const char *metric_name)
+{
+	struct hashmap_entry *cur, *cur_tmp;
+	struct metric *metric, *tmp;
+	size_t bkt;
+	bool all;
+	int rc;
+
+	do {
+		all = true;
+		hashmap__for_each_entry_safe((&pctx->ids), cur, cur_tmp, bkt) {
+			struct metric_ref *ref;
+			struct pmu_event *pe;
+
+			pe = metricgroup__find_metric(cur->key, map);
+			if (!pe)
+				continue;
+
+			if (!strcmp(metric_name, (char *)cur->key)) {
+				pr_warning("Recursion detected for metric %s\n", metric_name);
+				rc = -1;
+				goto out_err;
+			}
+
+			all = false;
+
+			/* The metric key itself needs to go out.. */
+			expr__del_id(pctx, cur->key);
+
+			metric = malloc(sizeof(*metric));
+			if (!metric) {
+				rc = -ENOMEM;
+				goto out_err;
+			}
+
+			ref = &metric->metric_ref;
+			ref->metric_name = pe->metric_name;
+			ref->metric_expr = pe->metric_expr;
+			list_add_tail(&metric->list, compound_list);
+
+			rc = expr__find_other(pe->metric_expr, NULL, pctx, 0);
+			if (rc)
+				goto out_err;
+			break; /* The hashmap has been modified, so restart */
+		}
+	} while (!all);
+
+	return 0;
+
+out_err:
+	list_for_each_entry_safe(metric, tmp, compound_list, list)
+		free(metric);
+
+	return rc;
+
+}
+
 static int test_parsing(void)
 {
-	struct pmu_events_map *cpus_map = perf_pmu__find_map(NULL);
+	struct pmu_events_map *cpus_map = pmu_events_map__find();
 	struct pmu_events_map *map;
 	struct pmu_event *pe;
 	int i, j, k;
@@ -488,7 +554,9 @@ static int test_parsing(void)
 			break;
 		j = 0;
 		for (;;) {
+			struct metric *metric, *tmp;
 			struct hashmap_entry *cur;
+			LIST_HEAD(compound_list);
 			size_t bkt;
 
 			pe = &map->table[j++];
@@ -504,6 +572,13 @@ static int test_parsing(void)
 				continue;
 			}
 
+			if (resolve_metric_simple(&ctx, &compound_list, map,
+						  pe->metric_name)) {
+				expr_failure("Could not resolve metrics", map, pe);
+				ret++;
+				goto exit; /* Don't tolerate errors due to severity */
+			}
+
 			/*
 			 * Add all ids with a made up value. The value may
 			 * trigger divide by zero when subtracted and so try to
@@ -519,6 +594,11 @@ static int test_parsing(void)
 					ret++;
 			}
 
+			list_for_each_entry_safe(metric, tmp, &compound_list, list) {
+				expr__add_ref(&ctx, &metric->metric_ref);
+				free(metric);
+			}
+
 			if (expr__parse(&result, &ctx, pe->metric_expr, 0)) {
 				expr_failure("Parse failed", map, pe);
 				ret++;
@@ -527,6 +607,7 @@ static int test_parsing(void)
 		}
 	}
 	/* TODO: fail when not ok */
+exit:
 	return ret == 0 ? TEST_OK : TEST_SKIP;
 }
 
diff --git a/tools/perf/tests/shell/buildid.sh b/tools/perf/tests/shell/buildid.sh
index 416af614bbe0..f05670d1e39e 100755
--- a/tools/perf/tests/shell/buildid.sh
+++ b/tools/perf/tests/shell/buildid.sh
@@ -14,18 +14,56 @@ if ! [ -x "$(command -v cc)" ]; then
 	exit 2
 fi
 
+# check what we need to test windows binaries
+add_pe=1
+run_pe=1
+if ! perf version --build-options | grep -q 'libbfd: .* on '; then
+	echo "WARNING: perf not built with libbfd. PE binaries will not be tested."
+	add_pe=0
+	run_pe=0
+fi
+if ! which wine > /dev/null; then
+	echo "WARNING: wine not found. PE binaries will not be run."
+	run_pe=0
+fi
+
+# set up wine
+if [ ${run_pe} -eq 1 ]; then
+	wineprefix=$(mktemp -d /tmp/perf.wineprefix.XXX)
+	export WINEPREFIX=${wineprefix}
+	# clear display variables to prevent wine from popping up dialogs
+	unset DISPLAY
+	unset WAYLAND_DISPLAY
+fi
+
 ex_md5=$(mktemp /tmp/perf.ex.MD5.XXX)
 ex_sha1=$(mktemp /tmp/perf.ex.SHA1.XXX)
+ex_pe=$(dirname $0)/../pe-file.exe
 
 echo 'int main(void) { return 0; }' | cc -Wl,--build-id=sha1 -o ${ex_sha1} -x c -
 echo 'int main(void) { return 0; }' | cc -Wl,--build-id=md5 -o ${ex_md5} -x c -
 
-echo "test binaries: ${ex_sha1} ${ex_md5}"
+echo "test binaries: ${ex_sha1} ${ex_md5} ${ex_pe}"
 
 check()
 {
-	id=`readelf -n ${1} 2>/dev/null | grep 'Build ID' | awk '{print $3}'`
-
+	case $1 in
+	*.exe)
+		# We don't have a tool that can pull a nicely formatted build-id out of
+		# a PE file, but we can extract the whole section with objcopy and
+		# format it ourselves. The .buildid section is a Debug Directory
+		# containing a CodeView entry:
+		#     https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#debug-directory-image-only
+		#     https://github.com/dotnet/runtime/blob/da94c022576a5c3bbc0e896f006565905eb137f9/docs/design/specs/PE-COFF.md
+		# The build-id starts at byte 33 and must be rearranged into a GUID.
+		id=`objcopy -O binary --only-section=.buildid $1 /dev/stdout | \
+			cut -c 33-48 | hexdump -ve '/1 "%02x"' | \
+			sed 's@^\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(.*\)0a$@\4\3\2\1\6\5\8\7\9@'`
+		;;
+	*)
+		id=`readelf -n ${1} 2>/dev/null | grep 'Build ID' | awk '{print $3}'`
+		;;
+	esac
 	echo "build id: ${id}"
 
 	link=${build_id_dir}/.build-id/${id:0:2}/${id:2}
@@ -50,7 +88,7 @@ check()
 		exit 1
 	fi
 
-	${perf} buildid-cache -l | grep $id
+	${perf} buildid-cache -l | grep ${id}
 	if [ $? -ne 0 ]; then
 		echo "failed: ${id} is not reported by \"perf buildid-cache -l\""
 		exit 1
@@ -79,16 +117,20 @@ test_record()
 {
 	data=$(mktemp /tmp/perf.data.XXX)
 	build_id_dir=$(mktemp -d /tmp/perf.debug.XXX)
+	log=$(mktemp /tmp/perf.log.XXX)
 	perf="perf --buildid-dir ${build_id_dir}"
 
-	${perf} record --buildid-all -o ${data} ${1}
+	echo "running: perf record $@"
+	${perf} record --buildid-all -o ${data} $@ &> ${log}
 	if [ $? -ne 0 ]; then
-		echo "failed: record ${1}"
+		echo "failed: record $@"
+		echo "see log: ${log}"
 		exit 1
 	fi
 
-	check ${1}
+	check ${@: -1}
 
+	rm -f ${log}
 	rm -rf ${build_id_dir}
 	rm -rf ${data}
 }
@@ -96,12 +138,21 @@ test_record()
 # add binaries manual via perf buildid-cache -a
 test_add ${ex_sha1}
 test_add ${ex_md5}
+if [ ${add_pe} -eq 1 ]; then
+	test_add ${ex_pe}
+fi
 
 # add binaries via perf record post processing
 test_record ${ex_sha1}
 test_record ${ex_md5}
+if [ ${run_pe} -eq 1 ]; then
+	test_record wine ${ex_pe}
+fi
 
 # cleanup
 rm ${ex_sha1} ${ex_md5}
+if [ ${run_pe} -eq 1 ]; then
+	rm -r ${wineprefix}
+fi
 
 exit ${err}
diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh
index 58984380b211..45fc24af5b07 100755
--- a/tools/perf/tests/shell/daemon.sh
+++ b/tools/perf/tests/shell/daemon.sh
@@ -98,6 +98,23 @@ check_line_other()
 	fi
 }
 
+daemon_exit()
+{
+	local config=$1
+
+	local line=`perf daemon --config ${config} -x: | head -1`
+	local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'`
+
+	# Reset trap handler.
+	trap - SIGINT SIGTERM
+
+	# stop daemon
+	perf daemon stop --config ${config}
+
+	# ... and wait for the pid to go away
+	tail --pid=${pid} -f /dev/null
+}
+
 daemon_start()
 {
 	local config=$1
@@ -105,29 +122,24 @@ daemon_start()
 
 	perf daemon start --config ${config}
 
+	# Clean up daemon if interrupted.
+	trap "echo 'FAILED: Signal caught'; daemon_exit ${config}; exit 1" SIGINT SIGTERM
+
 	# wait for the session to ping
 	local state="FAIL"
+	local retries=0
 	while [ "${state}" != "OK" ]; do
 		state=`perf daemon ping --config ${config} --session ${session} | awk '{ print $1 }'`
 		sleep 0.05
+		retries=$((${retries} +1))
+		if [ ${retries} -ge 600 ]; then
+			echo "FAILED: Timeout waiting for daemon to ping"
+			daemon_exit ${config}
+			exit 1
+		fi
 	done
 }
 
-daemon_exit()
-{
-	local base=$1
-	local config=$2
-
-	local line=`perf daemon --config ${config} -x: | head -1`
-	local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'`
-
-	# stop daemon
-	perf daemon stop --config ${config}
-
-	# ... and wait for the pid to go away
-	tail --pid=${pid} -f /dev/null
-}
-
 test_list()
 {
 	echo "test daemon list"
@@ -171,7 +183,7 @@ EOF
 			 ${base}/session-time/ack "0"
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	rm -rf ${base}
 	rm -f ${config}
@@ -288,7 +300,7 @@ EOF
 	done
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	rm -rf ${base}
 	rm -f ${config}
@@ -333,7 +345,7 @@ EOF
 	fi
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	# check that sessions are gone
 	if [ -d "/proc/${pid_size}" ]; then
@@ -374,7 +386,7 @@ EOF
 	perf daemon signal --config ${config}
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	# count is 2 perf.data for signals and 1 for perf record finished
 	count=`ls ${base}/session-test/ | grep perf.data | wc -l`
@@ -420,7 +432,7 @@ EOF
 	fi
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	rm -rf ${base}
 	rm -f ${config}
@@ -457,7 +469,7 @@ EOF
 	fi
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	rm -rf ${base}
 	rm -f ${config}
diff --git a/tools/perf/tests/shell/stat+csv_summary.sh b/tools/perf/tests/shell/stat+csv_summary.sh
new file mode 100755
index 000000000000..5571ff75eb42
--- /dev/null
+++ b/tools/perf/tests/shell/stat+csv_summary.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# perf stat csv summary test
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+#
+#     1.001364330 9224197  cycles 8012885033 100.00
+#         summary 9224197  cycles 8012885033 100.00
+#
+perf stat -e cycles  -x' ' -I1000 --interval-count 1 --summary 2>&1 | \
+grep -e summary | \
+while read summary num event run pct
+do
+	if [ $summary != "summary" ]; then
+		exit 1
+	fi
+done
+
+#
+#     1.001360298 9148534  cycles 8012853854 100.00
+#9148534  cycles 8012853854 100.00
+#
+perf stat -e cycles  -x' ' -I1000 --interval-count 1 --summary --no-csv-summary 2>&1 | \
+grep -e summary | \
+while read num event run pct
+do
+	exit 1
+done
+
+exit 0
diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh
new file mode 100755
index 000000000000..22eb31e48ca7
--- /dev/null
+++ b/tools/perf/tests/shell/stat_bpf_counters.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# perf stat --bpf-counters test
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# check whether $2 is within +/- 10% of $1
+compare_number()
+{
+       first_num=$1
+       second_num=$2
+
+       # upper bound is first_num * 110%
+       upper=$(( $first_num + $first_num / 10 ))
+       # lower bound is first_num * 90%
+       lower=$(( $first_num - $first_num / 10 ))
+
+       if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then
+               echo "The difference between $first_num and $second_num are greater than 10%."
+               exit 1
+       fi
+}
+
+# skip if --bpf-counters is not supported
+perf stat --bpf-counters true > /dev/null 2>&1 || exit 2
+
+base_cycles=$(perf stat --no-big-num -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}')
+bpf_cycles=$(perf stat --no-big-num --bpf-counters -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}')
+
+compare_number $base_cycles $bpf_cycles
+exit 0
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 74748ed75b2c..050489807a47 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -80,7 +80,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	 *   CPU 1 is on core_id 1 and physical_package_id 3
 	 *
 	 *   Core_id and physical_package_id are platform and architecture
-	 *   dependend and might have higher numbers than the CPU id.
+	 *   dependent and might have higher numbers than the CPU id.
 	 *   This actually depends on the configuration.
 	 *
 	 *  In this case process_cpu_topology() prints error message:
diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h
index 385894b4a8bb..b8fc5c53ba6f 100644
--- a/tools/perf/trace/beauty/include/linux/socket.h
+++ b/tools/perf/trace/beauty/include/linux/socket.h
@@ -85,7 +85,7 @@ struct mmsghdr {
 
 /*
  *	POSIX 1003.1g - ancillary data object information
- *	Ancillary data consits of a sequence of pairs of
+ *	Ancillary data consists of a sequence of pairs of
  *	(cmsghdr, cmsg_data[])
  */
 
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 35b82caf8090..ad0a70f0edaf 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -381,6 +381,25 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser)
 	return true;
 }
 
+#define SYM_TITLE_MAX_SIZE (PATH_MAX + 64)
+
+static void annotate_browser__show_full_location(struct ui_browser *browser)
+{
+	struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
+	struct disasm_line *cursor = disasm_line(ab->selection);
+	struct annotation_line *al = &cursor->al;
+
+	if (al->offset != -1)
+		ui_helpline__puts("Only available for source code lines.");
+	else if (al->fileloc == NULL)
+		ui_helpline__puts("No source file location.");
+	else {
+		char help_line[SYM_TITLE_MAX_SIZE];
+		sprintf (help_line, "Source file location: %s", al->fileloc);
+		ui_helpline__puts(help_line);
+	}
+}
+
 static void ui_browser__init_asm_mode(struct ui_browser *browser)
 {
 	struct annotation *notes = browser__annotation(browser);
@@ -388,8 +407,6 @@ static void ui_browser__init_asm_mode(struct ui_browser *browser)
 	browser->nr_entries = notes->nr_asm_entries;
 }
 
-#define SYM_TITLE_MAX_SIZE (PATH_MAX + 64)
-
 static int sym_title(struct symbol *sym, struct map *map, char *title,
 		     size_t sz, int percent_type)
 {
@@ -398,7 +415,7 @@ static int sym_title(struct symbol *sym, struct map *map, char *title,
 }
 
 /*
- * This can be called from external jumps, i.e. jumps from one functon
+ * This can be called from external jumps, i.e. jumps from one function
  * to another, like from the kernel's entry_SYSCALL_64 function to the
  * swapgs_restore_regs_and_return_to_usermode() function.
  *
@@ -747,6 +764,7 @@ static int annotate_browser__run(struct annotate_browser *browser,
 		"c             Show min/max cycle\n"
 		"/             Search string\n"
 		"k             Toggle line numbers\n"
+		"l             Show full source file location\n"
 		"P             Print to [symbol_name].annotation file.\n"
 		"r             Run available scripts\n"
 		"p             Toggle percent type [local/global]\n"
@@ -760,6 +778,9 @@ static int annotate_browser__run(struct annotate_browser *browser,
 		case 'k':
 			notes->options->show_linenr = !notes->options->show_linenr;
 			continue;
+		case 'l':
+			annotate_browser__show_full_location (&browser->b);
+			continue;
 		case 'H':
 			nd = browser->curr_hot;
 			break;
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 3b9818ee9546..bcfd0a45953b 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -117,7 +117,7 @@ static void hist_browser__update_rows(struct hist_browser *hb)
 	browser->rows -= browser->extra_title_lines;
 	/*
 	 * Verify if we were at the last line and that line isn't
-	 * visibe because we now show the header line(s).
+	 * visible because we now show the header line(s).
 	 */
 	index_row = browser->index - browser->top_idx;
 	if (index_row >= browser->rows)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index e60841b86d27..18eee25b4976 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1161,6 +1161,7 @@ struct annotate_args {
 	s64			  offset;
 	char			  *line;
 	int			  line_nr;
+	char			  *fileloc;
 };
 
 static void annotation_line__init(struct annotation_line *al,
@@ -1170,6 +1171,7 @@ static void annotation_line__init(struct annotation_line *al,
 	al->offset = args->offset;
 	al->line = strdup(args->line);
 	al->line_nr = args->line_nr;
+	al->fileloc = args->fileloc;
 	al->data_nr = nr;
 }
 
@@ -1482,7 +1484,7 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
  */
 static int symbol__parse_objdump_line(struct symbol *sym,
 				      struct annotate_args *args,
-				      char *parsed_line, int *line_nr)
+				      char *parsed_line, int *line_nr, char **fileloc)
 {
 	struct map *map = args->ms.map;
 	struct annotation *notes = symbol__annotation(sym);
@@ -1494,6 +1496,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 	/* /filename:linenr ? Save line number and ignore. */
 	if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) {
 		*line_nr = atoi(parsed_line + match[1].rm_so);
+		*fileloc = strdup(parsed_line);
 		return 0;
 	}
 
@@ -1513,6 +1516,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 	args->offset  = offset;
 	args->line    = parsed_line;
 	args->line_nr = *line_nr;
+	args->fileloc = *fileloc;
 	args->ms.sym  = sym;
 
 	dl = disasm_line__new(args);
@@ -1807,6 +1811,7 @@ static int symbol__disassemble_bpf(struct symbol *sym,
 			args->offset = -1;
 			args->line = strdup(srcline);
 			args->line_nr = 0;
+			args->fileloc = NULL;
 			args->ms.sym  = sym;
 			dl = disasm_line__new(args);
 			if (dl) {
@@ -1818,6 +1823,7 @@ static int symbol__disassemble_bpf(struct symbol *sym,
 		args->offset = pc;
 		args->line = buf + prev_buf_size;
 		args->line_nr = 0;
+		args->fileloc = NULL;
 		args->ms.sym  = sym;
 		dl = disasm_line__new(args);
 		if (dl)
@@ -1852,6 +1858,7 @@ symbol__disassemble_bpf_image(struct symbol *sym,
 	args->offset = -1;
 	args->line = strdup("to be implemented");
 	args->line_nr = 0;
+	args->fileloc = NULL;
 	dl = disasm_line__new(args);
 	if (dl)
 		annotation_line__add(&dl->al, &notes->src->source);
@@ -1933,6 +1940,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 	bool delete_extract = false;
 	bool decomp = false;
 	int lineno = 0;
+	char *fileloc = NULL;
 	int nline;
 	char *line;
 	size_t line_len;
@@ -2060,7 +2068,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 		 * See disasm_line__new() and struct disasm_line::line_nr.
 		 */
 		if (symbol__parse_objdump_line(sym, args, expanded_line,
-					       &lineno) < 0)
+					       &lineno, &fileloc) < 0)
 			break;
 		nline++;
 	}
@@ -3144,6 +3152,10 @@ static int annotation__config(const char *var, const char *value, void *data)
 		opt->use_offset = perf_config_bool("use_offset", value);
 	} else if (!strcmp(var, "annotate.disassembler_style")) {
 		opt->disassembler_style = value;
+	} else if (!strcmp(var, "annotate.demangle")) {
+		symbol_conf.demangle = perf_config_bool("demangle", value);
+	} else if (!strcmp(var, "annotate.demangle_kernel")) {
+		symbol_conf.demangle_kernel = perf_config_bool("demangle_kernel", value);
 	} else {
 		pr_debug("%s variable unknown, ignoring...", var);
 	}
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 096cdaf21b01..3757416bcf46 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -84,6 +84,7 @@ struct annotation_options {
 	     print_lines,
 	     full_path,
 	     show_linenr,
+	     show_fileloc,
 	     show_nr_jumps,
 	     show_minmax_cycle,
 	     show_asm_raw,
@@ -136,6 +137,7 @@ struct annotation_line {
 	s64			 offset;
 	char			*line;
 	int			 line_nr;
+	char			*fileloc;
 	int			 jump_sources;
 	float			 ipc;
 	u64			 cycles;
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 9087f1bffd3d..fbb3c4057c30 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -671,7 +671,7 @@ int bpf__probe(struct bpf_object *obj)
 		 * After probing, let's consider prologue, which
 		 * adds program fetcher to BPF programs.
 		 *
-		 * hook_load_preprocessorr() hooks pre-processor
+		 * hook_load_preprocessor() hooks pre-processor
 		 * to bpf_program, let it generate prologue
 		 * dynamically during loading.
 		 */
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index 04f89120b323..81d1df3c4ec0 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -5,6 +5,7 @@
 #include <assert.h>
 #include <limits.h>
 #include <unistd.h>
+#include <sys/file.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <linux/err.h>
@@ -12,14 +13,45 @@
 #include <bpf/bpf.h>
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
+#include <api/fs/fs.h>
 
 #include "bpf_counter.h"
 #include "counts.h"
 #include "debug.h"
 #include "evsel.h"
+#include "evlist.h"
 #include "target.h"
+#include "cpumap.h"
+#include "thread_map.h"
 
 #include "bpf_skel/bpf_prog_profiler.skel.h"
+#include "bpf_skel/bperf_u.h"
+#include "bpf_skel/bperf_leader.skel.h"
+#include "bpf_skel/bperf_follower.skel.h"
+
+/*
+ * bperf uses a hashmap, the attr_map, to track all the leader programs.
+ * The hashmap is pinned in bpffs. flock() on this file is used to ensure
+ * no concurrent access to the attr_map.  The key of attr_map is struct
+ * perf_event_attr, and the value is struct perf_event_attr_map_entry.
+ *
+ * struct perf_event_attr_map_entry contains two __u32 IDs, bpf_link of the
+ * leader prog, and the diff_map. Each perf-stat session holds a reference
+ * to the bpf_link to make sure the leader prog is attached to sched_switch
+ * tracepoint.
+ *
+ * Since the hashmap only contains IDs of the bpf_link and diff_map, it
+ * does not hold any references to the leader program. Once all perf-stat
+ * sessions of these events exit, the leader prog, its maps, and the
+ * perf_events will be freed.
+ */
+struct perf_event_attr_map_entry {
+	__u32 link_id;
+	__u32 diff_map_id;
+};
+
+#define DEFAULT_ATTR_MAP_PATH "fs/bpf/perf_attr_map"
+#define ATTR_MAP_SIZE 16
 
 static inline void *u64_to_ptr(__u64 ptr)
 {
@@ -274,17 +306,494 @@ struct bpf_counter_ops bpf_program_profiler_ops = {
 	.install_pe = bpf_program_profiler__install_pe,
 };
 
+static __u32 bpf_link_get_id(int fd)
+{
+	struct bpf_link_info link_info = {0};
+	__u32 link_info_len = sizeof(link_info);
+
+	bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
+	return link_info.id;
+}
+
+static __u32 bpf_link_get_prog_id(int fd)
+{
+	struct bpf_link_info link_info = {0};
+	__u32 link_info_len = sizeof(link_info);
+
+	bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
+	return link_info.prog_id;
+}
+
+static __u32 bpf_map_get_id(int fd)
+{
+	struct bpf_map_info map_info = {0};
+	__u32 map_info_len = sizeof(map_info);
+
+	bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len);
+	return map_info.id;
+}
+
+static int bperf_lock_attr_map(struct target *target)
+{
+	char path[PATH_MAX];
+	int map_fd, err;
+
+	if (target->attr_map) {
+		scnprintf(path, PATH_MAX, "%s", target->attr_map);
+	} else {
+		scnprintf(path, PATH_MAX, "%s/%s", sysfs__mountpoint(),
+			  DEFAULT_ATTR_MAP_PATH);
+	}
+
+	if (access(path, F_OK)) {
+		map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
+					sizeof(struct perf_event_attr),
+					sizeof(struct perf_event_attr_map_entry),
+					ATTR_MAP_SIZE, 0);
+		if (map_fd < 0)
+			return -1;
+
+		err = bpf_obj_pin(map_fd, path);
+		if (err) {
+			/* someone pinned the map in parallel? */
+			close(map_fd);
+			map_fd = bpf_obj_get(path);
+			if (map_fd < 0)
+				return -1;
+		}
+	} else {
+		map_fd = bpf_obj_get(path);
+		if (map_fd < 0)
+			return -1;
+	}
+
+	err = flock(map_fd, LOCK_EX);
+	if (err) {
+		close(map_fd);
+		return -1;
+	}
+	return map_fd;
+}
+
+/* trigger the leader program on a cpu */
+static int bperf_trigger_reading(int prog_fd, int cpu)
+{
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+			    .ctx_in = NULL,
+			    .ctx_size_in = 0,
+			    .flags = BPF_F_TEST_RUN_ON_CPU,
+			    .cpu = cpu,
+			    .retval = 0,
+		);
+
+	return bpf_prog_test_run_opts(prog_fd, &opts);
+}
+
+static int bperf_check_target(struct evsel *evsel,
+			      struct target *target,
+			      enum bperf_filter_type *filter_type,
+			      __u32 *filter_entry_cnt)
+{
+	if (evsel->leader->core.nr_members > 1) {
+		pr_err("bpf managed perf events do not yet support groups.\n");
+		return -1;
+	}
+
+	/* determine filter type based on target */
+	if (target->system_wide) {
+		*filter_type = BPERF_FILTER_GLOBAL;
+		*filter_entry_cnt = 1;
+	} else if (target->cpu_list) {
+		*filter_type = BPERF_FILTER_CPU;
+		*filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel));
+	} else if (target->tid) {
+		*filter_type = BPERF_FILTER_PID;
+		*filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
+	} else if (target->pid || evsel->evlist->workload.pid != -1) {
+		*filter_type = BPERF_FILTER_TGID;
+		*filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
+	} else {
+		pr_err("bpf managed perf events do not yet support these targets.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static	struct perf_cpu_map *all_cpu_map;
+
+static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
+				       struct perf_event_attr_map_entry *entry)
+{
+	struct bperf_leader_bpf *skel = bperf_leader_bpf__open();
+	int link_fd, diff_map_fd, err;
+	struct bpf_link *link = NULL;
+
+	if (!skel) {
+		pr_err("Failed to open leader skeleton\n");
+		return -1;
+	}
+
+	bpf_map__resize(skel->maps.events, libbpf_num_possible_cpus());
+	err = bperf_leader_bpf__load(skel);
+	if (err) {
+		pr_err("Failed to load leader skeleton\n");
+		goto out;
+	}
+
+	err = -1;
+	link = bpf_program__attach(skel->progs.on_switch);
+	if (!link) {
+		pr_err("Failed to attach leader program\n");
+		goto out;
+	}
+
+	link_fd = bpf_link__fd(link);
+	diff_map_fd = bpf_map__fd(skel->maps.diff_readings);
+	entry->link_id = bpf_link_get_id(link_fd);
+	entry->diff_map_id = bpf_map_get_id(diff_map_fd);
+	err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY);
+	assert(err == 0);
+
+	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
+	assert(evsel->bperf_leader_link_fd >= 0);
+
+	/*
+	 * save leader_skel for install_pe, which is called within
+	 * following evsel__open_per_cpu call
+	 */
+	evsel->leader_skel = skel;
+	evsel__open_per_cpu(evsel, all_cpu_map, -1);
+
+out:
+	bperf_leader_bpf__destroy(skel);
+	bpf_link__destroy(link);
+	return err;
+}
+
+static int bperf__load(struct evsel *evsel, struct target *target)
+{
+	struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
+	int attr_map_fd, diff_map_fd = -1, err;
+	enum bperf_filter_type filter_type;
+	__u32 filter_entry_cnt, i;
+
+	if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
+		return -1;
+
+	if (!all_cpu_map) {
+		all_cpu_map = perf_cpu_map__new(NULL);
+		if (!all_cpu_map)
+			return -1;
+	}
+
+	evsel->bperf_leader_prog_fd = -1;
+	evsel->bperf_leader_link_fd = -1;
+
+	/*
+	 * Step 1: hold a fd on the leader program and the bpf_link, if
+	 * the program is not already gone, reload the program.
+	 * Use flock() to ensure exclusive access to the perf_event_attr
+	 * map.
+	 */
+	attr_map_fd = bperf_lock_attr_map(target);
+	if (attr_map_fd < 0) {
+		pr_err("Failed to lock perf_event_attr map\n");
+		return -1;
+	}
+
+	err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry);
+	if (err) {
+		err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY);
+		if (err)
+			goto out;
+	}
+
+	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id);
+	if (evsel->bperf_leader_link_fd < 0 &&
+	    bperf_reload_leader_program(evsel, attr_map_fd, &entry))
+		goto out;
+
+	/*
+	 * The bpf_link holds reference to the leader program, and the
+	 * leader program holds reference to the maps. Therefore, if
+	 * link_id is valid, diff_map_id should also be valid.
+	 */
+	evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id(
+		bpf_link_get_prog_id(evsel->bperf_leader_link_fd));
+	assert(evsel->bperf_leader_prog_fd >= 0);
+
+	diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id);
+	assert(diff_map_fd >= 0);
+
+	/*
+	 * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check
+	 * whether the kernel support it
+	 */
+	err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0);
+	if (err) {
+		pr_err("The kernel does not support test_run for raw_tp BPF programs.\n"
+		       "Therefore, --use-bpf might show inaccurate readings\n");
+		goto out;
+	}
+
+	/* Step 2: load the follower skeleton */
+	evsel->follower_skel = bperf_follower_bpf__open();
+	if (!evsel->follower_skel) {
+		pr_err("Failed to open follower skeleton\n");
+		goto out;
+	}
+
+	/* attach fexit program to the leader program */
+	bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX,
+				       evsel->bperf_leader_prog_fd, "on_switch");
+
+	/* connect to leader diff_reading map */
+	bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd);
+
+	/* set up reading map */
+	bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
+				 filter_entry_cnt);
+	/* set up follower filter based on target */
+	bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
+				 filter_entry_cnt);
+	err = bperf_follower_bpf__load(evsel->follower_skel);
+	if (err) {
+		pr_err("Failed to load follower skeleton\n");
+		bperf_follower_bpf__destroy(evsel->follower_skel);
+		evsel->follower_skel = NULL;
+		goto out;
+	}
+
+	for (i = 0; i < filter_entry_cnt; i++) {
+		int filter_map_fd;
+		__u32 key;
+
+		if (filter_type == BPERF_FILTER_PID ||
+		    filter_type == BPERF_FILTER_TGID)
+			key = evsel->core.threads->map[i].pid;
+		else if (filter_type == BPERF_FILTER_CPU)
+			key = evsel->core.cpus->map[i];
+		else
+			break;
+
+		filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
+		bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
+	}
+
+	evsel->follower_skel->bss->type = filter_type;
+
+	err = bperf_follower_bpf__attach(evsel->follower_skel);
+
+out:
+	if (err && evsel->bperf_leader_link_fd >= 0)
+		close(evsel->bperf_leader_link_fd);
+	if (err && evsel->bperf_leader_prog_fd >= 0)
+		close(evsel->bperf_leader_prog_fd);
+	if (diff_map_fd >= 0)
+		close(diff_map_fd);
+
+	flock(attr_map_fd, LOCK_UN);
+	close(attr_map_fd);
+
+	return err;
+}
+
+static int bperf__install_pe(struct evsel *evsel, int cpu, int fd)
+{
+	struct bperf_leader_bpf *skel = evsel->leader_skel;
+
+	return bpf_map_update_elem(bpf_map__fd(skel->maps.events),
+				   &cpu, &fd, BPF_ANY);
+}
+
+/*
+ * trigger the leader prog on each cpu, so the accum_reading map could get
+ * the latest readings.
+ */
+static int bperf_sync_counters(struct evsel *evsel)
+{
+	int num_cpu, i, cpu;
+
+	num_cpu = all_cpu_map->nr;
+	for (i = 0; i < num_cpu; i++) {
+		cpu = all_cpu_map->map[i];
+		bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu);
+	}
+	return 0;
+}
+
+static int bperf__enable(struct evsel *evsel)
+{
+	evsel->follower_skel->bss->enabled = 1;
+	return 0;
+}
+
+static int bperf__read(struct evsel *evsel)
+{
+	struct bperf_follower_bpf *skel = evsel->follower_skel;
+	__u32 num_cpu_bpf = cpu__max_cpu();
+	struct bpf_perf_event_value values[num_cpu_bpf];
+	int reading_map_fd, err = 0;
+	__u32 i, j, num_cpu;
+
+	bperf_sync_counters(evsel);
+	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
+
+	for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
+		__u32 cpu;
+
+		err = bpf_map_lookup_elem(reading_map_fd, &i, values);
+		if (err)
+			goto out;
+		switch (evsel->follower_skel->bss->type) {
+		case BPERF_FILTER_GLOBAL:
+			assert(i == 0);
+
+			num_cpu = all_cpu_map->nr;
+			for (j = 0; j < num_cpu; j++) {
+				cpu = all_cpu_map->map[j];
+				perf_counts(evsel->counts, cpu, 0)->val = values[cpu].counter;
+				perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled;
+				perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running;
+			}
+			break;
+		case BPERF_FILTER_CPU:
+			cpu = evsel->core.cpus->map[i];
+			perf_counts(evsel->counts, i, 0)->val = values[cpu].counter;
+			perf_counts(evsel->counts, i, 0)->ena = values[cpu].enabled;
+			perf_counts(evsel->counts, i, 0)->run = values[cpu].running;
+			break;
+		case BPERF_FILTER_PID:
+		case BPERF_FILTER_TGID:
+			perf_counts(evsel->counts, 0, i)->val = 0;
+			perf_counts(evsel->counts, 0, i)->ena = 0;
+			perf_counts(evsel->counts, 0, i)->run = 0;
+
+			for (cpu = 0; cpu < num_cpu_bpf; cpu++) {
+				perf_counts(evsel->counts, 0, i)->val += values[cpu].counter;
+				perf_counts(evsel->counts, 0, i)->ena += values[cpu].enabled;
+				perf_counts(evsel->counts, 0, i)->run += values[cpu].running;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+out:
+	return err;
+}
+
+static int bperf__destroy(struct evsel *evsel)
+{
+	bperf_follower_bpf__destroy(evsel->follower_skel);
+	close(evsel->bperf_leader_prog_fd);
+	close(evsel->bperf_leader_link_fd);
+	return 0;
+}
+
+/*
+ * bperf: share hardware PMCs with BPF
+ *
+ * perf uses performance monitoring counters (PMC) to monitor system
+ * performance. The PMCs are limited hardware resources. For example,
+ * Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu.
+ *
+ * Modern data center systems use these PMCs in many different ways:
+ * system level monitoring, (maybe nested) container level monitoring, per
+ * process monitoring, profiling (in sample mode), etc. In some cases,
+ * there are more active perf_events than available hardware PMCs. To allow
+ * all perf_events to have a chance to run, it is necessary to do expensive
+ * time multiplexing of events.
+ *
+ * On the other hand, many monitoring tools count the common metrics
+ * (cycles, instructions). It is a waste to have multiple tools create
+ * multiple perf_events of "cycles" and occupy multiple PMCs.
+ *
+ * bperf tries to reduce such wastes by allowing multiple perf_events of
+ * "cycles" or "instructions" (at different scopes) to share PMUs. Instead
+ * of having each perf-stat session to read its own perf_events, bperf uses
+ * BPF programs to read the perf_events and aggregate readings to BPF maps.
+ * Then, the perf-stat session(s) reads the values from these BPF maps.
+ *
+ *                                ||
+ *       shared progs and maps <- || -> per session progs and maps
+ *                                ||
+ *   ---------------              ||
+ *   | perf_events |              ||
+ *   ---------------       fexit  ||      -----------------
+ *          |             --------||----> | follower prog |
+ *       --------------- /        || ---  -----------------
+ * cs -> | leader prog |/         ||/        |         |
+ *   --> ---------------         /||  --------------  ------------------
+ *  /       |         |         / ||  | filter map |  | accum_readings |
+ * /  ------------  ------------  ||  --------------  ------------------
+ * |  | prev map |  | diff map |  ||                        |
+ * |  ------------  ------------  ||                        |
+ *  \                             ||                        |
+ * = \ ==================================================== | ============
+ *    \                                                    /   user space
+ *     \                                                  /
+ *      \                                                /
+ *    BPF_PROG_TEST_RUN                    BPF_MAP_LOOKUP_ELEM
+ *        \                                            /
+ *         \                                          /
+ *          \------  perf-stat ----------------------/
+ *
+ * The figure above shows the architecture of bperf. Note that the figure
+ * is divided into 3 regions: shared progs and maps (top left), per session
+ * progs and maps (top right), and user space (bottom).
+ *
+ * The leader prog is triggered on each context switch (cs). The leader
+ * prog reads perf_events and stores the difference (current_reading -
+ * previous_reading) to the diff map. For the same metric, e.g. "cycles",
+ * multiple perf-stat sessions share the same leader prog.
+ *
+ * Each perf-stat session creates a follower prog as fexit program to the
+ * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38)
+ * follower progs to the same leader prog. The follower prog checks current
+ * task and processor ID to decide whether to add the value from the diff
+ * map to its accumulated reading map (accum_readings).
+ *
+ * Finally, perf-stat user space reads the value from accum_reading map.
+ *
+ * Besides context switch, it is also necessary to trigger the leader prog
+ * before perf-stat reads the value. Otherwise, the accum_reading map may
+ * not have the latest reading from the perf_events. This is achieved by
+ * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) to each CPU.
+ *
+ * Comment before the definition of struct perf_event_attr_map_entry
+ * describes how different sessions of perf-stat share information about
+ * the leader prog.
+ */
+
+struct bpf_counter_ops bperf_ops = {
+	.load       = bperf__load,
+	.enable     = bperf__enable,
+	.read       = bperf__read,
+	.install_pe = bperf__install_pe,
+	.destroy    = bperf__destroy,
+};
+
+static inline bool bpf_counter_skip(struct evsel *evsel)
+{
+	return list_empty(&evsel->bpf_counter_list) &&
+		evsel->follower_skel == NULL;
+}
+
 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return 0;
 	return evsel->bpf_counter_ops->install_pe(evsel, cpu, fd);
 }
 
 int bpf_counter__load(struct evsel *evsel, struct target *target)
 {
-	if (target__has_bpf(target))
+	if (target->bpf_str)
 		evsel->bpf_counter_ops = &bpf_program_profiler_ops;
+	else if (target->use_bpf)
+		evsel->bpf_counter_ops = &bperf_ops;
 
 	if (evsel->bpf_counter_ops)
 		return evsel->bpf_counter_ops->load(evsel, target);
@@ -293,21 +802,21 @@ int bpf_counter__load(struct evsel *evsel, struct target *target)
 
 int bpf_counter__enable(struct evsel *evsel)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return 0;
 	return evsel->bpf_counter_ops->enable(evsel);
 }
 
 int bpf_counter__read(struct evsel *evsel)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return -EAGAIN;
 	return evsel->bpf_counter_ops->read(evsel);
 }
 
 void bpf_counter__destroy(struct evsel *evsel)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return;
 	evsel->bpf_counter_ops->destroy(evsel);
 	evsel->bpf_counter_ops = NULL;
diff --git a/tools/perf/util/bpf_counter.h b/tools/perf/util/bpf_counter.h
index 2eca210e5dc1..cb9c532e0a07 100644
--- a/tools/perf/util/bpf_counter.h
+++ b/tools/perf/util/bpf_counter.h
@@ -38,7 +38,7 @@ int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd);
 
 #else /* HAVE_BPF_SKEL */
 
-#include<linux/err.h>
+#include <linux/err.h>
 
 static inline int bpf_counter__load(struct evsel *evsel __maybe_unused,
 				    struct target *target __maybe_unused)
diff --git a/tools/perf/util/bpf_skel/bperf.h b/tools/perf/util/bpf_skel/bperf.h
new file mode 100644
index 000000000000..186a5551ddb9
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+
+#ifndef __BPERF_STAT_H
+#define __BPERF_STAT_H
+
+typedef struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_perf_event_value));
+	__uint(max_entries, 1);
+} reading_map;
+
+#endif /* __BPERF_STAT_H */
diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
new file mode 100644
index 000000000000..b8fa3cb2da23
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bperf.h"
+#include "bperf_u.h"
+
+reading_map diff_readings SEC(".maps");
+reading_map accum_readings SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} filter SEC(".maps");
+
+enum bperf_filter_type type = 0;
+int enabled = 0;
+
+SEC("fexit/XXX")
+int BPF_PROG(fexit_XXX)
+{
+	struct bpf_perf_event_value *diff_val, *accum_val;
+	__u32 filter_key, zero = 0;
+	__u32 *accum_key;
+
+	if (!enabled)
+		return 0;
+
+	switch (type) {
+	case BPERF_FILTER_GLOBAL:
+		accum_key = &zero;
+		goto do_add;
+	case BPERF_FILTER_CPU:
+		filter_key = bpf_get_smp_processor_id();
+		break;
+	case BPERF_FILTER_PID:
+		filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
+		break;
+	case BPERF_FILTER_TGID:
+		filter_key = bpf_get_current_pid_tgid() >> 32;
+		break;
+	default:
+		return 0;
+	}
+
+	accum_key = bpf_map_lookup_elem(&filter, &filter_key);
+	if (!accum_key)
+		return 0;
+
+do_add:
+	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
+	if (!diff_val)
+		return 0;
+
+	accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
+	if (!accum_val)
+		return 0;
+
+	accum_val->counter += diff_val->counter;
+	accum_val->enabled += diff_val->enabled;
+	accum_val->running += diff_val->running;
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
new file mode 100644
index 000000000000..4f70d1459e86
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bperf.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(int));
+	__uint(map_flags, BPF_F_PRESERVE_ELEMS);
+} events SEC(".maps");
+
+reading_map prev_readings SEC(".maps");
+reading_map diff_readings SEC(".maps");
+
+SEC("raw_tp/sched_switch")
+int BPF_PROG(on_switch)
+{
+	struct bpf_perf_event_value val, *prev_val, *diff_val;
+	__u32 key = bpf_get_smp_processor_id();
+	__u32 zero = 0;
+	long err;
+
+	prev_val = bpf_map_lookup_elem(&prev_readings, &zero);
+	if (!prev_val)
+		return 0;
+
+	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
+	if (!diff_val)
+		return 0;
+
+	err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
+	if (err)
+		return 0;
+
+	diff_val->counter = val.counter - prev_val->counter;
+	diff_val->enabled = val.enabled - prev_val->enabled;
+	diff_val->running = val.running - prev_val->running;
+	*prev_val = val;
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h
new file mode 100644
index 000000000000..1ce0c2c905c1
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_u.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+
+#ifndef __BPERF_STAT_U_H
+#define __BPERF_STAT_U_H
+
+enum bperf_filter_type {
+	BPERF_FILTER_GLOBAL = 1,
+	BPERF_FILTER_CPU,
+	BPERF_FILTER_PID,
+	BPERF_FILTER_TGID,
+};
+
+#endif /* __BPERF_STAT_U_H */
diff --git a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
index c7cec92d0236..ab12b4c4ece2 100644
--- a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
+++ b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
@@ -52,7 +52,7 @@ int BPF_PROG(fentry_XXX)
 static inline void
 fexit_update_maps(struct bpf_perf_event_value *after)
 {
-	struct bpf_perf_event_value *before, diff, *accum;
+	struct bpf_perf_event_value *before, diff;
 	__u32 zero = 0;
 
 	before = bpf_map_lookup_elem(&fentry_readings, &zero);
@@ -78,7 +78,6 @@ int BPF_PROG(fexit_XXX)
 {
 	struct bpf_perf_event_value reading;
 	__u32 cpu = bpf_get_smp_processor_id();
-	__u32 one = 1, zero = 0;
 	int err;
 
 	/* read all events before updating the maps, to reduce error */
diff --git a/tools/perf/util/call-path.h b/tools/perf/util/call-path.h
index 6b3229106f16..5875cfc8106e 100644
--- a/tools/perf/util/call-path.h
+++ b/tools/perf/util/call-path.h
@@ -23,7 +23,7 @@
  * @children: tree of call paths of functions called
  *
  * In combination with the call_return structure, the call_path structure
- * defines a context-sensitve call-graph.
+ * defines a context-sensitive call-graph.
  */
 struct call_path {
 	struct call_path *parent;
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 1b60985690bb..8e2777133bd9 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -877,7 +877,7 @@ append_chain_children(struct callchain_node *root,
 	if (!node)
 		return -1;
 
-	/* lookup in childrens */
+	/* lookup in children */
 	while (*p) {
 		enum match_result ret;
 
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 6984c77068a3..6bcb5ef221f8 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -457,6 +457,9 @@ static int perf_stat_config(const char *var, const char *value)
 	if (!strcmp(var, "stat.big-num"))
 		perf_stat__set_big_num(perf_config_bool(var, value));
 
+	if (!strcmp(var, "stat.no-csv-summary"))
+		perf_stat__set_no_csv_summary(perf_config_bool(var, value));
+
 	/* Add other config variables here. */
 	return 0;
 }
@@ -699,7 +702,7 @@ static int collect_config(const char *var, const char *value,
 	/* perf_config_set can contain both user and system config items.
 	 * So we should know where each value is from.
 	 * The classification would be needed when a particular config file
-	 * is overwrited by setting feature i.e. set_config().
+	 * is overwritten by setting feature i.e. set_config().
 	 */
 	if (strcmp(config_file_name, perf_etc_perfconfig()) == 0) {
 		section->from_system_config = true;
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 3f4bc4050477..059bcec3f651 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -6,6 +6,7 @@
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
  */
 
+#include <linux/coresight-pmu.h>
 #include <linux/err.h>
 #include <linux/list.h>
 #include <linux/zalloc.h>
@@ -316,7 +317,7 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq,
 	 * This is the first timestamp we've seen since the beginning of traces
 	 * or a discontinuity.  Since timestamps packets are generated *after*
 	 * range packets have been generated, we need to estimate the time at
-	 * which instructions started by substracting the number of instructions
+	 * which instructions started by subtracting the number of instructions
 	 * executed to the timestamp.
 	 */
 	packet_queue->timestamp = elem->timestamp - packet_queue->instr_count;
@@ -491,13 +492,42 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
 			const ocsd_generic_trace_elem *elem,
 			const uint8_t trace_chan_id)
 {
-	pid_t tid;
+	pid_t tid = -1;
+	static u64 pid_fmt;
+	int ret;
 
-	/* Ignore PE_CONTEXT packets that don't have a valid contextID */
-	if (!elem->context.ctxt_id_valid)
+	/*
+	 * As all the ETMs run at the same exception level, the system should
+	 * have the same PID format crossing CPUs.  So cache the PID format
+	 * and reuse it for sequential decoding.
+	 */
+	if (!pid_fmt) {
+		ret = cs_etm__get_pid_fmt(trace_chan_id, &pid_fmt);
+		if (ret)
+			return OCSD_RESP_FATAL_SYS_ERR;
+	}
+
+	/*
+	 * Process the PE_CONTEXT packets if we have a valid contextID or VMID.
+	 * If the kernel is running at EL2, the PID is traced in CONTEXTIDR_EL2
+	 * as VMID, Bit ETM_OPT_CTXTID2 is set in this case.
+	 */
+	switch (pid_fmt) {
+	case BIT(ETM_OPT_CTXTID):
+		if (elem->context.ctxt_id_valid)
+			tid = elem->context.context_id;
+		break;
+	case BIT(ETM_OPT_CTXTID2):
+		if (elem->context.vmid_valid)
+			tid = elem->context.vmid;
+		break;
+	default:
+		break;
+	}
+
+	if (tid == -1)
 		return OCSD_RESP_CONT;
 
-	tid =  elem->context.context_id;
 	if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
 		return OCSD_RESP_FATAL_SYS_ERR;
 
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index a2a369e2fbb6..7e63e7dedc33 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/bitops.h>
+#include <linux/coresight-pmu.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -156,11 +157,52 @@ int cs_etm__get_cpu(u8 trace_chan_id, int *cpu)
 	return 0;
 }
 
+/*
+ * The returned PID format is presented by two bits:
+ *
+ *   Bit ETM_OPT_CTXTID: CONTEXTIDR or CONTEXTIDR_EL1 is traced;
+ *   Bit ETM_OPT_CTXTID2: CONTEXTIDR_EL2 is traced.
+ *
+ * It's possible that the two bits ETM_OPT_CTXTID and ETM_OPT_CTXTID2
+ * are enabled at the same time when the session runs on an EL2 kernel.
+ * This means the CONTEXTIDR_EL1 and CONTEXTIDR_EL2 both will be
+ * recorded in the trace data, the tool will selectively use
+ * CONTEXTIDR_EL2 as PID.
+ */
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt)
+{
+	struct int_node *inode;
+	u64 *metadata, val;
+
+	inode = intlist__find(traceid_list, trace_chan_id);
+	if (!inode)
+		return -EINVAL;
+
+	metadata = inode->priv;
+
+	if (metadata[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) {
+		val = metadata[CS_ETM_ETMCR];
+		/* CONTEXTIDR is traced */
+		if (val & BIT(ETM_OPT_CTXTID))
+			*pid_fmt = BIT(ETM_OPT_CTXTID);
+	} else {
+		val = metadata[CS_ETMV4_TRCCONFIGR];
+		/* CONTEXTIDR_EL2 is traced */
+		if (val & (BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT)))
+			*pid_fmt = BIT(ETM_OPT_CTXTID2);
+		/* CONTEXTIDR_EL1 is traced */
+		else if (val & BIT(ETM4_CFG_BIT_CTXTID))
+			*pid_fmt = BIT(ETM_OPT_CTXTID);
+	}
+
+	return 0;
+}
+
 void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
 					      u8 trace_chan_id)
 {
 	/*
-	 * Wnen a timestamp packet is encountered the backend code
+	 * When a timestamp packet is encountered the backend code
 	 * is stopped so that the front end has time to process packets
 	 * that were accumulated in the traceID queue.  Since there can
 	 * be more than one channel per cs_etm_queue, we need to specify
@@ -1655,7 +1697,7 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
 		 * | 1 1 0 1 1 1 1 1 |  imm8  |
 		 * +-----------------+--------+
 		 *
-		 * According to the specifiction, it only defines SVC for T32
+		 * According to the specification, it only defines SVC for T32
 		 * with 16 bits instruction and has no definition for 32bits;
 		 * so below only read 2 bytes as instruction size for T32.
 		 */
@@ -1887,7 +1929,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq,
 
 		/*
 		 * If the previous packet is an exception return packet
-		 * and the return address just follows SVC instuction,
+		 * and the return address just follows SVC instruction,
 		 * it needs to calibrate the previous packet sample flags
 		 * as PERF_IP_FLAG_SYSCALLRET.
 		 */
@@ -1961,7 +2003,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq,
 		 * contain exception type related info so we cannot decide
 		 * the exception type purely based on exception return packet.
 		 * If we record the exception number from exception packet and
-		 * reuse it for excpetion return packet, this is not reliable
+		 * reuse it for exception return packet, this is not reliable
 		 * due the trace can be discontinuity or the interrupt can
 		 * be nested, thus the recorded exception number cannot be
 		 * used for exception return packet for these two cases.
@@ -2435,7 +2477,7 @@ static bool cs_etm__is_timeless_decoding(struct cs_etm_auxtrace *etm)
 }
 
 static const char * const cs_etm_global_header_fmts[] = {
-	[CS_HEADER_VERSION_0]	= "	Header version		       %llx\n",
+	[CS_HEADER_VERSION]	= "	Header version		       %llx\n",
 	[CS_PMU_TYPE_CPUS]	= "	PMU type/num cpus	       %llx\n",
 	[CS_ETM_SNAPSHOT]	= "	Snapshot		       %llx\n",
 };
@@ -2443,6 +2485,7 @@ static const char * const cs_etm_global_header_fmts[] = {
 static const char * const cs_etm_priv_fmts[] = {
 	[CS_ETM_MAGIC]		= "	Magic number		       %llx\n",
 	[CS_ETM_CPU]		= "	CPU			       %lld\n",
+	[CS_ETM_NR_TRC_PARAMS]	= "	NR_TRC_PARAMS		       %llx\n",
 	[CS_ETM_ETMCR]		= "	ETMCR			       %llx\n",
 	[CS_ETM_ETMTRACEIDR]	= "	ETMTRACEIDR		       %llx\n",
 	[CS_ETM_ETMCCER]	= "	ETMCCER			       %llx\n",
@@ -2452,6 +2495,7 @@ static const char * const cs_etm_priv_fmts[] = {
 static const char * const cs_etmv4_priv_fmts[] = {
 	[CS_ETM_MAGIC]		= "	Magic number		       %llx\n",
 	[CS_ETM_CPU]		= "	CPU			       %lld\n",
+	[CS_ETM_NR_TRC_PARAMS]	= "	NR_TRC_PARAMS		       %llx\n",
 	[CS_ETMV4_TRCCONFIGR]	= "	TRCCONFIGR		       %llx\n",
 	[CS_ETMV4_TRCTRACEIDR]	= "	TRCTRACEIDR		       %llx\n",
 	[CS_ETMV4_TRCIDR0]	= "	TRCIDR0			       %llx\n",
@@ -2461,26 +2505,167 @@ static const char * const cs_etmv4_priv_fmts[] = {
 	[CS_ETMV4_TRCAUTHSTATUS] = "	TRCAUTHSTATUS		       %llx\n",
 };
 
-static void cs_etm__print_auxtrace_info(__u64 *val, int num)
+static const char * const param_unk_fmt =
+	"	Unknown parameter [%d]	       %llx\n";
+static const char * const magic_unk_fmt =
+	"	Magic number Unknown	       %llx\n";
+
+static int cs_etm__print_cpu_metadata_v0(__u64 *val, int *offset)
 {
-	int i, j, cpu = 0;
+	int i = *offset, j, nr_params = 0, fmt_offset;
+	__u64 magic;
 
-	for (i = 0; i < CS_HEADER_VERSION_0_MAX; i++)
-		fprintf(stdout, cs_etm_global_header_fmts[i], val[i]);
+	/* check magic value */
+	magic = val[i + CS_ETM_MAGIC];
+	if ((magic != __perf_cs_etmv3_magic) &&
+	    (magic != __perf_cs_etmv4_magic)) {
+		/* failure - note bad magic value */
+		fprintf(stdout, magic_unk_fmt, magic);
+		return -EINVAL;
+	}
+
+	/* print common header block */
+	fprintf(stdout, cs_etm_priv_fmts[CS_ETM_MAGIC], val[i++]);
+	fprintf(stdout, cs_etm_priv_fmts[CS_ETM_CPU], val[i++]);
+
+	if (magic == __perf_cs_etmv3_magic) {
+		nr_params = CS_ETM_NR_TRC_PARAMS_V0;
+		fmt_offset = CS_ETM_ETMCR;
+		/* after common block, offset format index past NR_PARAMS */
+		for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
+			fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
+	} else if (magic == __perf_cs_etmv4_magic) {
+		nr_params = CS_ETMV4_NR_TRC_PARAMS_V0;
+		fmt_offset = CS_ETMV4_TRCCONFIGR;
+		/* after common block, offset format index past NR_PARAMS */
+		for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
+			fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
+	}
+	*offset = i;
+	return 0;
+}
+
+static int cs_etm__print_cpu_metadata_v1(__u64 *val, int *offset)
+{
+	int i = *offset, j, total_params = 0;
+	__u64 magic;
+
+	magic = val[i + CS_ETM_MAGIC];
+	/* total params to print is NR_PARAMS + common block size for v1 */
+	total_params = val[i + CS_ETM_NR_TRC_PARAMS] + CS_ETM_COMMON_BLK_MAX_V1;
 
-	for (i = CS_HEADER_VERSION_0_MAX; cpu < num; cpu++) {
-		if (val[i] == __perf_cs_etmv3_magic)
-			for (j = 0; j < CS_ETM_PRIV_MAX; j++, i++)
+	if (magic == __perf_cs_etmv3_magic) {
+		for (j = 0; j < total_params; j++, i++) {
+			/* if newer record - could be excess params */
+			if (j >= CS_ETM_PRIV_MAX)
+				fprintf(stdout, param_unk_fmt, j, val[i]);
+			else
 				fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
-		else if (val[i] == __perf_cs_etmv4_magic)
-			for (j = 0; j < CS_ETMV4_PRIV_MAX; j++, i++)
+		}
+	} else if (magic == __perf_cs_etmv4_magic) {
+		for (j = 0; j < total_params; j++, i++) {
+			/* if newer record - could be excess params */
+			if (j >= CS_ETMV4_PRIV_MAX)
+				fprintf(stdout, param_unk_fmt, j, val[i]);
+			else
 				fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
-		else
-			/* failure.. return */
+		}
+	} else {
+		/* failure - note bad magic value and error out */
+		fprintf(stdout, magic_unk_fmt, magic);
+		return -EINVAL;
+	}
+	*offset = i;
+	return 0;
+}
+
+static void cs_etm__print_auxtrace_info(__u64 *val, int num)
+{
+	int i, cpu = 0, version, err;
+
+	/* bail out early on bad header version */
+	version = val[0];
+	if (version > CS_HEADER_CURRENT_VERSION) {
+		/* failure.. return */
+		fprintf(stdout, "	Unknown Header Version = %x, ", version);
+		fprintf(stdout, "Version supported <= %x\n", CS_HEADER_CURRENT_VERSION);
+		return;
+	}
+
+	for (i = 0; i < CS_HEADER_VERSION_MAX; i++)
+		fprintf(stdout, cs_etm_global_header_fmts[i], val[i]);
+
+	for (i = CS_HEADER_VERSION_MAX; cpu < num; cpu++) {
+		if (version == 0)
+			err = cs_etm__print_cpu_metadata_v0(val, &i);
+		else if (version == 1)
+			err = cs_etm__print_cpu_metadata_v1(val, &i);
+		if (err)
 			return;
 	}
 }
 
+/*
+ * Read a single cpu parameter block from the auxtrace_info priv block.
+ *
+ * For version 1 there is a per cpu nr_params entry. If we are handling
+ * version 1 file, then there may be less, the same, or more params
+ * indicated by this value than the compile time number we understand.
+ *
+ * For a version 0 info block, there are a fixed number, and we need to
+ * fill out the nr_param value in the metadata we create.
+ */
+static u64 *cs_etm__create_meta_blk(u64 *buff_in, int *buff_in_offset,
+				    int out_blk_size, int nr_params_v0)
+{
+	u64 *metadata = NULL;
+	int hdr_version;
+	int nr_in_params, nr_out_params, nr_cmn_params;
+	int i, k;
+
+	metadata = zalloc(sizeof(*metadata) * out_blk_size);
+	if (!metadata)
+		return NULL;
+
+	/* read block current index & version */
+	i = *buff_in_offset;
+	hdr_version = buff_in[CS_HEADER_VERSION];
+
+	if (!hdr_version) {
+	/* read version 0 info block into a version 1 metadata block  */
+		nr_in_params = nr_params_v0;
+		metadata[CS_ETM_MAGIC] = buff_in[i + CS_ETM_MAGIC];
+		metadata[CS_ETM_CPU] = buff_in[i + CS_ETM_CPU];
+		metadata[CS_ETM_NR_TRC_PARAMS] = nr_in_params;
+		/* remaining block params at offset +1 from source */
+		for (k = CS_ETM_COMMON_BLK_MAX_V1 - 1; k < nr_in_params; k++)
+			metadata[k + 1] = buff_in[i + k];
+		/* version 0 has 2 common params */
+		nr_cmn_params = 2;
+	} else {
+	/* read version 1 info block - input and output nr_params may differ */
+		/* version 1 has 3 common params */
+		nr_cmn_params = 3;
+		nr_in_params = buff_in[i + CS_ETM_NR_TRC_PARAMS];
+
+		/* if input has more params than output - skip excess */
+		nr_out_params = nr_in_params + nr_cmn_params;
+		if (nr_out_params > out_blk_size)
+			nr_out_params = out_blk_size;
+
+		for (k = CS_ETM_MAGIC; k < nr_out_params; k++)
+			metadata[k] = buff_in[i + k];
+
+		/* record the actual nr params we copied */
+		metadata[CS_ETM_NR_TRC_PARAMS] = nr_out_params - nr_cmn_params;
+	}
+
+	/* adjust in offset by number of in params used */
+	i += nr_in_params + nr_cmn_params;
+	*buff_in_offset = i;
+	return metadata;
+}
+
 int cs_etm__process_auxtrace_info(union perf_event *event,
 				  struct perf_session *session)
 {
@@ -2492,11 +2677,12 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	int info_header_size;
 	int total_size = auxtrace_info->header.size;
 	int priv_size = 0;
-	int num_cpu;
-	int err = 0, idx = -1;
-	int i, j, k;
+	int num_cpu, trcidr_idx;
+	int err = 0;
+	int i, j;
 	u64 *ptr, *hdr = NULL;
 	u64 **metadata = NULL;
+	u64 hdr_version;
 
 	/*
 	 * sizeof(auxtrace_info_event::type) +
@@ -2512,16 +2698,21 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	/* First the global part */
 	ptr = (u64 *) auxtrace_info->priv;
 
-	/* Look for version '0' of the header */
-	if (ptr[0] != 0)
+	/* Look for version of the header */
+	hdr_version = ptr[0];
+	if (hdr_version > CS_HEADER_CURRENT_VERSION) {
+		/* print routine will print an error on bad version */
+		if (dump_trace)
+			cs_etm__print_auxtrace_info(auxtrace_info->priv, 0);
 		return -EINVAL;
+	}
 
-	hdr = zalloc(sizeof(*hdr) * CS_HEADER_VERSION_0_MAX);
+	hdr = zalloc(sizeof(*hdr) * CS_HEADER_VERSION_MAX);
 	if (!hdr)
 		return -ENOMEM;
 
 	/* Extract header information - see cs-etm.h for format */
-	for (i = 0; i < CS_HEADER_VERSION_0_MAX; i++)
+	for (i = 0; i < CS_HEADER_VERSION_MAX; i++)
 		hdr[i] = ptr[i];
 	num_cpu = hdr[CS_PMU_TYPE_CPUS] & 0xffffffff;
 	pmu_type = (unsigned int) ((hdr[CS_PMU_TYPE_CPUS] >> 32) &
@@ -2552,35 +2743,31 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	 */
 	for (j = 0; j < num_cpu; j++) {
 		if (ptr[i] == __perf_cs_etmv3_magic) {
-			metadata[j] = zalloc(sizeof(*metadata[j]) *
-					     CS_ETM_PRIV_MAX);
-			if (!metadata[j]) {
-				err = -ENOMEM;
-				goto err_free_metadata;
-			}
-			for (k = 0; k < CS_ETM_PRIV_MAX; k++)
-				metadata[j][k] = ptr[i + k];
+			metadata[j] =
+				cs_etm__create_meta_blk(ptr, &i,
+							CS_ETM_PRIV_MAX,
+							CS_ETM_NR_TRC_PARAMS_V0);
 
 			/* The traceID is our handle */
-			idx = metadata[j][CS_ETM_ETMTRACEIDR];
-			i += CS_ETM_PRIV_MAX;
+			trcidr_idx = CS_ETM_ETMTRACEIDR;
+
 		} else if (ptr[i] == __perf_cs_etmv4_magic) {
-			metadata[j] = zalloc(sizeof(*metadata[j]) *
-					     CS_ETMV4_PRIV_MAX);
-			if (!metadata[j]) {
-				err = -ENOMEM;
-				goto err_free_metadata;
-			}
-			for (k = 0; k < CS_ETMV4_PRIV_MAX; k++)
-				metadata[j][k] = ptr[i + k];
+			metadata[j] =
+				cs_etm__create_meta_blk(ptr, &i,
+							CS_ETMV4_PRIV_MAX,
+							CS_ETMV4_NR_TRC_PARAMS_V0);
 
 			/* The traceID is our handle */
-			idx = metadata[j][CS_ETMV4_TRCTRACEIDR];
-			i += CS_ETMV4_PRIV_MAX;
+			trcidr_idx = CS_ETMV4_TRCTRACEIDR;
+		}
+
+		if (!metadata[j]) {
+			err = -ENOMEM;
+			goto err_free_metadata;
 		}
 
 		/* Get an RB node for this CPU */
-		inode = intlist__findnew(traceid_list, idx);
+		inode = intlist__findnew(traceid_list, metadata[j][trcidr_idx]);
 
 		/* Something went wrong, no need to continue */
 		if (!inode) {
@@ -2601,7 +2788,7 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	}
 
 	/*
-	 * Each of CS_HEADER_VERSION_0_MAX, CS_ETM_PRIV_MAX and
+	 * Each of CS_HEADER_VERSION_MAX, CS_ETM_PRIV_MAX and
 	 * CS_ETMV4_PRIV_MAX mark how many double words are in the
 	 * global metadata, and each cpu's metadata respectively.
 	 * The following tests if the correct number of double words was
@@ -2703,6 +2890,12 @@ err_free_traceid_list:
 	intlist__delete(traceid_list);
 err_free_hdr:
 	zfree(&hdr);
-
+	/*
+	 * At this point, as a minimum we have valid header. Dump the rest of
+	 * the info section - the print routines will error out on structural
+	 * issues.
+	 */
+	if (dump_trace)
+		cs_etm__print_auxtrace_info(auxtrace_info->priv, num_cpu);
 	return err;
 }
diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
index 4ad925d6d799..36428918411e 100644
--- a/tools/perf/util/cs-etm.h
+++ b/tools/perf/util/cs-etm.h
@@ -12,28 +12,43 @@
 
 struct perf_session;
 
-/* Versionning header in case things need tro change in the future.  That way
+/*
+ * Versioning header in case things need to change in the future.  That way
  * decoding of old snapshot is still possible.
  */
 enum {
 	/* Starting with 0x0 */
-	CS_HEADER_VERSION_0,
+	CS_HEADER_VERSION,
 	/* PMU->type (32 bit), total # of CPUs (32 bit) */
 	CS_PMU_TYPE_CPUS,
 	CS_ETM_SNAPSHOT,
-	CS_HEADER_VERSION_0_MAX,
+	CS_HEADER_VERSION_MAX,
 };
 
+/*
+ * Update the version for new format.
+ *
+ * New version 1 format adds a param count to the per cpu metadata.
+ * This allows easy adding of new metadata parameters.
+ * Requires that new params always added after current ones.
+ * Also allows client reader to handle file versions that are different by
+ * checking the number of params in the file vs the number expected.
+ */
+#define CS_HEADER_CURRENT_VERSION 1
+
 /* Beginning of header common to both ETMv3 and V4 */
 enum {
 	CS_ETM_MAGIC,
 	CS_ETM_CPU,
+	/* Number of trace config params in following ETM specific block */
+	CS_ETM_NR_TRC_PARAMS,
+	CS_ETM_COMMON_BLK_MAX_V1,
 };
 
 /* ETMv3/PTM metadata */
 enum {
 	/* Dynamic, configurable parameters */
-	CS_ETM_ETMCR = CS_ETM_CPU + 1,
+	CS_ETM_ETMCR = CS_ETM_COMMON_BLK_MAX_V1,
 	CS_ETM_ETMTRACEIDR,
 	/* RO, taken from sysFS */
 	CS_ETM_ETMCCER,
@@ -41,10 +56,13 @@ enum {
 	CS_ETM_PRIV_MAX,
 };
 
+/* define fixed version 0 length - allow new format reader to read old files. */
+#define CS_ETM_NR_TRC_PARAMS_V0 (CS_ETM_ETMIDR - CS_ETM_ETMCR + 1)
+
 /* ETMv4 metadata */
 enum {
 	/* Dynamic, configurable parameters */
-	CS_ETMV4_TRCCONFIGR = CS_ETM_CPU + 1,
+	CS_ETMV4_TRCCONFIGR = CS_ETM_COMMON_BLK_MAX_V1,
 	CS_ETMV4_TRCTRACEIDR,
 	/* RO, taken from sysFS */
 	CS_ETMV4_TRCIDR0,
@@ -55,9 +73,12 @@ enum {
 	CS_ETMV4_PRIV_MAX,
 };
 
+/* define fixed version 0 length - allow new format reader to read old files. */
+#define CS_ETMV4_NR_TRC_PARAMS_V0 (CS_ETMV4_TRCAUTHSTATUS - CS_ETMV4_TRCCONFIGR + 1)
+
 /*
  * ETMv3 exception encoding number:
- * See Embedded Trace Macrocell spcification (ARM IHI 0014Q)
+ * See Embedded Trace Macrocell specification (ARM IHI 0014Q)
  * table 7-12 Encoding of Exception[3:0] for non-ARMv7-M processors.
  */
 enum {
@@ -162,7 +183,7 @@ struct cs_etm_packet_queue {
 
 #define BMVAL(val, lsb, msb)	((val & GENMASK(msb, lsb)) >> lsb)
 
-#define CS_ETM_HEADER_SIZE (CS_HEADER_VERSION_0_MAX * sizeof(u64))
+#define CS_ETM_HEADER_SIZE (CS_HEADER_VERSION_MAX * sizeof(u64))
 
 #define __perf_cs_etmv3_magic 0x3030303030303030ULL
 #define __perf_cs_etmv4_magic 0x4040404040404040ULL
@@ -173,6 +194,7 @@ struct cs_etm_packet_queue {
 int cs_etm__process_auxtrace_info(union perf_event *event,
 				  struct perf_session *session);
 int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
 int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
 			 pid_t tid, u8 trace_chan_id);
 bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 8b67bd97d122..a9c375e63e73 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -949,7 +949,7 @@ static char *change_name(char *name, char *orig_name, int dup)
 	/*
 	 * Add '_' prefix to potential keywork.  According to
 	 * Mathieu Desnoyers (https://lore.kernel.org/lkml/1074266107.40857.1422045946295.JavaMail.zimbra@efficios.com),
-	 * futher CTF spec updating may require us to use '$'.
+	 * further CTF spec updating may require us to use '$'.
 	 */
 	if (dup < 0)
 		len = strlen(name) + sizeof("_");
diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c
index 39c05200ed65..ddf33d58bcd3 100644
--- a/tools/perf/util/demangle-java.c
+++ b/tools/perf/util/demangle-java.c
@@ -147,7 +147,7 @@ error:
  * Demangle Java function signature (openJDK, not GCJ)
  * input:
  * 	str: string to parse. String is not modified
- *    flags: comobination of JAVA_DEMANGLE_* flags to modify demangling
+ *    flags: combination of JAVA_DEMANGLE_* flags to modify demangling
  * return:
  *	if input can be demangled, then a newly allocated string is returned.
  *	if input cannot be demangled, then NULL is returned
@@ -164,7 +164,7 @@ java_demangle_sym(const char *str, int flags)
 	if (!str)
 		return NULL;
 
-	/* find start of retunr type */
+	/* find start of return type */
 	p = strrchr(str, ')');
 	if (!p)
 		return NULL;
diff --git a/tools/perf/util/demangle-ocaml.c b/tools/perf/util/demangle-ocaml.c
index 3df14e67c622..9d707bb60b4b 100644
--- a/tools/perf/util/demangle-ocaml.c
+++ b/tools/perf/util/demangle-ocaml.c
@@ -64,17 +64,5 @@ ocaml_demangle_sym(const char *sym)
 	}
 	result[j] = '\0';
 
-	/* scan backwards to remove an "_" followed by decimal digits */
-	if (j != 0 && isdigit(result[j - 1])) {
-		while (--j) {
-			if (!isdigit(result[j])) {
-				break;
-			}
-		}
-		if (result[j] == '_') {
-			result[j] = '\0';
-		}
-	}
-
 	return result;
 }
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index cd2fe64a3c5d..52e7101c5609 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -216,7 +216,7 @@ struct dso {
 
 /* dso__for_each_symbol - iterate over the symbols of given type
  *
- * @dso: the 'struct dso *' in which symbols itereated
+ * @dso: the 'struct dso *' in which symbols are iterated
  * @pos: the 'struct symbol *' to use as a loop cursor
  * @n: the 'struct rb_node *' to use as a temporary storage
  */
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 7b2d471a6419..b2f4920e19a6 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -91,7 +91,7 @@ static Dwarf_Line *cu_getsrc_die(Dwarf_Die *cu_die, Dwarf_Addr addr)
 			return NULL;
 	} while (laddr == addr);
 	l++;
-	/* Going foward to find the statement line */
+	/* Going forward to find the statement line */
 	do {
 		line = dwarf_onesrcline(lines, l++);
 		if (!line || dwarf_lineaddr(line, &laddr) != 0 ||
@@ -177,7 +177,7 @@ int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
  * die_get_linkage_name - Get the linkage name of the object
  * @dw_die: A DIE of the object
  *
- * Get the linkage name attiribute of given @dw_die.
+ * Get the linkage name attribute of given @dw_die.
  * For C++ binary, the linkage name will be the mangled symbol.
  */
 const char *die_get_linkage_name(Dwarf_Die *dw_die)
@@ -739,7 +739,7 @@ static int __die_walk_instances_cb(Dwarf_Die *inst, void *data)
  * @data: user data
  *
  * Walk on the instances of give @in_die. @in_die must be an inlined function
- * declartion. This returns the return value of @callback if it returns
+ * declaration. This returns the return value of @callback if it returns
  * non-zero value, or -ENOENT if there is no instance.
  */
 int die_walk_instances(Dwarf_Die *or_die, int (*callback)(Dwarf_Die *, void *),
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index 506006e0cf66..cb99646843a9 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -22,7 +22,7 @@ const char *cu_get_comp_dir(Dwarf_Die *cu_die);
 int cu_find_lineinfo(Dwarf_Die *cudie, unsigned long addr,
 		     const char **fname, int *lineno);
 
-/* Walk on funcitons at given address */
+/* Walk on functions at given address */
 int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
 			 int (*callback)(Dwarf_Die *, void *), void *data);
 
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 1b49ecee5aff..3fa4486742cd 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -24,6 +24,7 @@
 #include "../arch/s390/include/dwarf-regs-table.h"
 #include "../arch/sparc/include/dwarf-regs-table.h"
 #include "../arch/xtensa/include/dwarf-regs-table.h"
+#include "../arch/mips/include/dwarf-regs-table.h"
 
 #define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL)
 
@@ -53,6 +54,8 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine)
 		return __get_dwarf_regstr(sparc_regstr_tbl, n);
 	case EM_XTENSA:
 		return __get_dwarf_regstr(xtensa_regstr_tbl, n);
+	case EM_MIPS:
+		return __get_dwarf_regstr(mips_regstr_tbl, n);
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 	}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index f603edbbbc6f..8a62fb39e365 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -147,6 +147,7 @@ struct perf_sample {
 	u8  cpumode;
 	u16 misc;
 	u16 ins_lat;
+	u16 p_stage_cyc;
 	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
 	char insn[MAX_INSN];
 	void *raw_data;
@@ -427,5 +428,7 @@ char *get_page_size_name(u64 size, char *str);
 
 void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
 void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
+const char *arch_perf_header_entry(const char *se_header);
+int arch_support_sort_key(const char *sort_key);
 
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/events_stats.h b/tools/perf/util/events_stats.h
index 859cb34fcff2..631a4af2ed86 100644
--- a/tools/perf/util/events_stats.h
+++ b/tools/perf/util/events_stats.h
@@ -21,7 +21,7 @@
  * all struct perf_record_lost_samples.lost fields reported.
  *
  * The total_period is needed because by default auto-freq is used, so
- * multipling nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get
+ * multiplying nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get
  * the total number of low level events, it is necessary to to sum all struct
  * perf_record_sample.period and stash the result in total_period.
  */
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 882cd1f721d9..f1c79ecf8107 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -36,6 +36,7 @@
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/prctl.h>
 
 #include <linux/bitops.h>
 #include <linux/hash.h>
@@ -1209,7 +1210,7 @@ bool evlist__valid_read_format(struct evlist *evlist)
 		}
 	}
 
-	/* PERF_SAMPLE_READ imples PERF_FORMAT_ID. */
+	/* PERF_SAMPLE_READ implies PERF_FORMAT_ID. */
 	if ((sample_type & PERF_SAMPLE_READ) &&
 	    !(read_format & PERF_FORMAT_ID)) {
 		return false;
@@ -1406,6 +1407,13 @@ int evlist__prepare_workload(struct evlist *evlist, struct target *target, const
 		fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
 
 		/*
+		 * Change the name of this process not to confuse --exclude-perf users
+		 * that sees 'perf' in the window up to the execvp() and thinks that
+		 * perf samples are not being excluded.
+		 */
+		prctl(PR_SET_NAME, "perf-exec");
+
+		/*
 		 * Tell the parent we're ready to go
 		 */
 		close(child_ready_pipe[1]);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 7ecbc8e2fbfa..2d2614eeaa20 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -621,7 +621,7 @@ const char *evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX][EVSEL__MAX_AL
 #define COP(x)		(1 << x)
 
 /*
- * cache operartion stat
+ * cache operation stat
  * L1I : Read and prefetch only
  * ITLB and BPU : Read-only
  */
@@ -2275,7 +2275,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 		/*
 		 * Undo swap of u64, then swap on individual u32s,
 		 * get the size of the raw area and undo all of the
-		 * swap. The pevent interface handles endianity by
+		 * swap. The pevent interface handles endianness by
 		 * itself.
 		 */
 		if (swapped) {
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 6026487353dd..eccc4fd5b3eb 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -20,6 +20,8 @@ union perf_event;
 struct bpf_counter_ops;
 struct target;
 struct hashmap;
+struct bperf_leader_bpf;
+struct bperf_follower_bpf;
 
 typedef int (evsel__sb_cb_t)(union perf_event *event, void *data);
 
@@ -130,8 +132,24 @@ struct evsel {
 	 * See also evsel__has_callchain().
 	 */
 	__u64			synth_sample_type;
-	struct list_head	bpf_counter_list;
+
+	/*
+	 * bpf_counter_ops serves two use cases:
+	 *   1. perf-stat -b          counting events used byBPF programs
+	 *   2. perf-stat --use-bpf   use BPF programs to aggregate counts
+	 */
 	struct bpf_counter_ops	*bpf_counter_ops;
+
+	/* for perf-stat -b */
+	struct list_head	bpf_counter_list;
+
+	/* for perf-stat --use-bpf */
+	int			bperf_leader_prog_fd;
+	int			bperf_leader_link_fd;
+	union {
+		struct bperf_leader_bpf *leader_skel;
+		struct bperf_follower_bpf *follower_skel;
+	};
 };
 
 struct perf_missing_features {
@@ -157,7 +175,6 @@ struct perf_missing_features {
 extern struct perf_missing_features perf_missing_features;
 
 struct perf_cpu_map;
-struct target;
 struct thread_map;
 struct record_opts;
 
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
index dcf8d19b83c8..85df3e4771e4 100644
--- a/tools/perf/util/expr.h
+++ b/tools/perf/util/expr.h
@@ -3,7 +3,7 @@
 #define PARSE_CTX_H 1
 
 // There are fixes that need to land upstream before we can use libbpf's headers,
-// for now use our copy uncoditionally, since the data structures at this point
+// for now use our copy unconditionally, since the data structures at this point
 // are exactly the same, no problem.
 //#ifdef HAVE_LIBBPF_SUPPORT
 //#include <bpf/hashmap.h>
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 20effdff76ce..aa1e42518d37 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -127,7 +127,7 @@ static int __do_write_buf(struct feat_fd *ff,  const void *buf, size_t size)
 	return 0;
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 int do_write(struct feat_fd *ff, const void *buf, size_t size)
 {
 	if (!ff->buf)
@@ -135,7 +135,7 @@ int do_write(struct feat_fd *ff, const void *buf, size_t size)
 	return __do_write_buf(ff, buf, size);
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 static int do_write_bitmap(struct feat_fd *ff, unsigned long *set, u64 size)
 {
 	u64 *p = (u64 *) set;
@@ -154,7 +154,7 @@ static int do_write_bitmap(struct feat_fd *ff, unsigned long *set, u64 size)
 	return 0;
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 int write_padded(struct feat_fd *ff, const void *bf,
 		 size_t count, size_t count_aligned)
 {
@@ -170,7 +170,7 @@ int write_padded(struct feat_fd *ff, const void *bf,
 #define string_size(str)						\
 	(PERF_ALIGN((strlen(str) + 1), NAME_ALIGN) + sizeof(u32))
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 static int do_write_string(struct feat_fd *ff, const char *str)
 {
 	u32 len, olen;
@@ -266,7 +266,7 @@ static char *do_read_string(struct feat_fd *ff)
 	return NULL;
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 static int do_read_bitmap(struct feat_fd *ff, unsigned long **pset, u64 *psize)
 {
 	unsigned long *set;
@@ -2874,7 +2874,7 @@ static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused)
 	int err = -1;
 
 	if (ff->ph->needs_swap) {
-		pr_warning("interpreting bpf_prog_info from systems with endianity is not yet supported\n");
+		pr_warning("interpreting bpf_prog_info from systems with endianness is not yet supported\n");
 		return 0;
 	}
 
@@ -2942,7 +2942,7 @@ static int process_bpf_btf(struct feat_fd *ff, void *data __maybe_unused)
 	int err = -1;
 
 	if (ff->ph->needs_swap) {
-		pr_warning("interpreting btf from systems with endianity is not yet supported\n");
+		pr_warning("interpreting btf from systems with endianness is not yet supported\n");
 		return 0;
 	}
 
@@ -3481,11 +3481,11 @@ static const size_t attr_pipe_abi_sizes[] = {
 };
 
 /*
- * In the legacy pipe format, there is an implicit assumption that endiannesss
+ * In the legacy pipe format, there is an implicit assumption that endianness
  * between host recording the samples, and host parsing the samples is the
  * same. This is not always the case given that the pipe output may always be
  * redirected into a file and analyzed on a different machine with possibly a
- * different endianness and perf_event ABI revsions in the perf tool itself.
+ * different endianness and perf_event ABI revisions in the perf tool itself.
  */
 static int try_all_pipe_abis(uint64_t hdr_sz, struct perf_header *ph)
 {
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index c82f5fc26af8..9299ee535518 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -211,6 +211,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 	hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10);
 	hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13);
 	hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13);
+	hists__new_col_len(hists, HISTC_P_STAGE_CYC, 13);
 	if (symbol_conf.nanosecs)
 		hists__new_col_len(hists, HISTC_TIME, 16);
 	else
@@ -289,13 +290,14 @@ static long hist_time(unsigned long htime)
 }
 
 static void he_stat__add_period(struct he_stat *he_stat, u64 period,
-				u64 weight, u64 ins_lat)
+				u64 weight, u64 ins_lat, u64 p_stage_cyc)
 {
 
 	he_stat->period		+= period;
 	he_stat->weight		+= weight;
 	he_stat->nr_events	+= 1;
 	he_stat->ins_lat	+= ins_lat;
+	he_stat->p_stage_cyc	+= p_stage_cyc;
 }
 
 static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
@@ -308,6 +310,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
 	dest->nr_events		+= src->nr_events;
 	dest->weight		+= src->weight;
 	dest->ins_lat		+= src->ins_lat;
+	dest->p_stage_cyc		+= src->p_stage_cyc;
 }
 
 static void he_stat__decay(struct he_stat *he_stat)
@@ -597,6 +600,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 	u64 period = entry->stat.period;
 	u64 weight = entry->stat.weight;
 	u64 ins_lat = entry->stat.ins_lat;
+	u64 p_stage_cyc = entry->stat.p_stage_cyc;
 	bool leftmost = true;
 
 	p = &hists->entries_in->rb_root.rb_node;
@@ -615,11 +619,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 
 		if (!cmp) {
 			if (sample_self) {
-				he_stat__add_period(&he->stat, period, weight, ins_lat);
+				he_stat__add_period(&he->stat, period, weight, ins_lat, p_stage_cyc);
 				hist_entry__add_callchain_period(he, period);
 			}
 			if (symbol_conf.cumulate_callchain)
-				he_stat__add_period(he->stat_acc, period, weight, ins_lat);
+				he_stat__add_period(he->stat_acc, period, weight, ins_lat, p_stage_cyc);
 
 			/*
 			 * This mem info was allocated from sample__resolve_mem
@@ -731,6 +735,7 @@ __hists__add_entry(struct hists *hists,
 			.period	= sample->period,
 			.weight = sample->weight,
 			.ins_lat = sample->ins_lat,
+			.p_stage_cyc = sample->p_stage_cyc,
 		},
 		.parent = sym_parent,
 		.filtered = symbol__parent_filter(sym_parent) | al->filtered,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 3c537232294b..e2faa745c8d6 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -75,6 +75,7 @@ enum hist_column {
 	HISTC_MEM_BLOCKED,
 	HISTC_LOCAL_INS_LAT,
 	HISTC_GLOBAL_INS_LAT,
+	HISTC_P_STAGE_CYC,
 	HISTC_NR_COLS, /* Last entry */
 };
 
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index f6e28ac231b7..8658d42ce57a 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -3569,7 +3569,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
 	/*
 	 * Since this thread will not be kept in any rbtree not in a
 	 * list, initialize its list node so that at thread__put() the
-	 * current thread lifetime assuption is kept and we don't segfault
+	 * current thread lifetime assumption is kept and we don't segfault
 	 * at list_del_init().
 	 */
 	INIT_LIST_HEAD(&pt->unknown_thread->node);
diff --git a/tools/perf/util/levenshtein.c b/tools/perf/util/levenshtein.c
index a217ecf0359d..6a6712635aa4 100644
--- a/tools/perf/util/levenshtein.c
+++ b/tools/perf/util/levenshtein.c
@@ -30,7 +30,7 @@
  *
  * It does so by calculating the costs of the path ending in characters
  * i (in string1) and j (in string2), respectively, given that the last
- * operation is a substition, a swap, a deletion, or an insertion.
+ * operation is a substitution, a swap, a deletion, or an insertion.
  *
  * This implementation allows the costs to be weighted:
  *
diff --git a/tools/perf/util/libunwind/arm64.c b/tools/perf/util/libunwind/arm64.c
index 6b4e5a0892f8..c397be0c2e32 100644
--- a/tools/perf/util/libunwind/arm64.c
+++ b/tools/perf/util/libunwind/arm64.c
@@ -4,7 +4,7 @@
  * generic one.
  *
  * The function 'LIBUNWIND__ARCH_REG_ID' name is set according to arch
- * name and the defination of this function is included directly from
+ * name and the definition of this function is included directly from
  * 'arch/arm64/util/unwind-libunwind.c', to make sure that this function
  * is defined no matter what arch the host is.
  *
diff --git a/tools/perf/util/libunwind/x86_32.c b/tools/perf/util/libunwind/x86_32.c
index 21c216c40a3b..b2b92d030aef 100644
--- a/tools/perf/util/libunwind/x86_32.c
+++ b/tools/perf/util/libunwind/x86_32.c
@@ -4,7 +4,7 @@
  * generic one.
  *
  * The function 'LIBUNWIND__ARCH_REG_ID' name is set according to arch
- * name and the defination of this function is included directly from
+ * name and the definition of this function is included directly from
  * 'arch/x86/util/unwind-libunwind.c', to make sure that this function
  * is defined no matter what arch the host is.
  *
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index dbdffb6673fe..3ceaf7ef3301 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -471,7 +471,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf,
 
 	/*
 	 * This is an optional work. Even it fail we can continue our
-	 * work. Needn't to check error return.
+	 * work. Needn't check error return.
 	 */
 	llvm__get_kbuild_opts(&kbuild_dir, &kbuild_include_opts);
 
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b5c2d8be4144..3ff4936a15a4 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -905,7 +905,7 @@ static struct map *machine__addnew_module_map(struct machine *machine, u64 start
 
 	maps__insert(&machine->kmaps, map);
 
-	/* Put the map here because maps__insert alread got it */
+	/* Put the map here because maps__insert already got it */
 	map__put(map);
 out:
 	/* put the dso here, corresponding to  machine__findnew_module_dso */
@@ -1952,7 +1952,7 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
 	 * maps because that is what the kernel just did.
 	 *
 	 * But when synthesizing, this should not be done.  If we do, we end up
-	 * with overlapping maps as we process the sythesized MMAP2 events that
+	 * with overlapping maps as we process the synthesized MMAP2 events that
 	 * get delivered shortly thereafter.
 	 *
 	 * Use the FORK event misc flags in an internal way to signal this
@@ -2038,8 +2038,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
 static bool symbol__match_regex(struct symbol *sym, regex_t *regex)
 {
 	if (!regexec(regex, sym->name, 0, NULL, 0))
-		return 1;
-	return 0;
+		return true;
+	return false;
 }
 
 static void ip__resolve_ams(struct thread *thread,
@@ -2518,7 +2518,7 @@ static bool has_stitched_lbr(struct thread *thread,
 
 	/*
 	 * Check if there are identical LBRs between two samples.
-	 * Identicall LBRs must have same from, to and flags values. Also,
+	 * Identical LBRs must have same from, to and flags values. Also,
 	 * they have to be saved in the same LBR registers (same physical
 	 * index).
 	 *
@@ -2588,7 +2588,7 @@ err:
 }
 
 /*
- * Recolve LBR callstack chain sample
+ * Resolve LBR callstack chain sample
  * Return:
  * 1 on success get LBR callchain information
  * 0 no available LBR callchain information, should try fp
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 9f32825c98d8..d32f5b28c1fb 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -75,7 +75,7 @@ struct thread;
 
 /* map__for_each_symbol - iterate over the symbols in the given map
  *
- * @map: the 'struct map *' in which symbols itereated
+ * @map: the 'struct map *' in which symbols are iterated
  * @pos: the 'struct symbol *' to use as a loop cursor
  * @n: the 'struct rb_node *' to use as a temporary storage
  * Note: caller must ensure map->dso is not NULL (map is loaded).
@@ -86,7 +86,7 @@ struct thread;
 /* map__for_each_symbol_with_name - iterate over the symbols in the given map
  *                                  that have the given name
  *
- * @map: the 'struct map *' in which symbols itereated
+ * @map: the 'struct map *' in which symbols are iterated
  * @sym_name: the symbol name
  * @pos: the 'struct symbol *' to use as a loop cursor
  */
diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h
index 755cef7e0625..cacdebd65b8a 100644
--- a/tools/perf/util/mem-events.h
+++ b/tools/perf/util/mem-events.h
@@ -44,7 +44,6 @@ bool is_mem_loads_aux_event(struct evsel *leader);
 
 void perf_mem_events__list(void);
 
-struct mem_info;
 int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
 int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
 int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
@@ -81,7 +80,7 @@ struct c2c_stats {
 	u32	rmt_dram;            /* count of loads miss to remote DRAM */
 	u32	blk_data;            /* count of loads blocked by data */
 	u32	blk_addr;            /* count of loads blocked by address conflict */
-	u32	nomap;               /* count of load/stores with no phys adrs */
+	u32	nomap;               /* count of load/stores with no phys addrs */
 	u32	noparse;             /* count of unparsable data sources */
 };
 
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 26c990e32378..8336dd8e8098 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -181,7 +181,7 @@ static bool evsel_same_pmu(struct evsel *ev1, struct evsel *ev2)
  * @pctx: the parse context for the metric expression.
  * @metric_no_merge: don't attempt to share events for the metric with other
  * metrics.
- * @has_constraint: is there a contraint on the group of events? In which case
+ * @has_constraint: is there a constraint on the group of events? In which case
  * the events won't be grouped.
  * @metric_events: out argument, null terminated array of evsel's associated
  * with the metric.
@@ -618,7 +618,7 @@ static int metricgroup__print_sys_event_iter(struct pmu_event *pe, void *data)
 void metricgroup__print(bool metrics, bool metricgroups, char *filter,
 			bool raw, bool details)
 {
-	struct pmu_events_map *map = perf_pmu__find_map(NULL);
+	struct pmu_events_map *map = pmu_events_map__find();
 	struct pmu_event *pe;
 	int i;
 	struct rblist groups;
@@ -900,7 +900,8 @@ static int __add_metric(struct list_head *metric_list,
 		    (match_metric(__pe->metric_group, __metric) ||	\
 		     match_metric(__pe->metric_name, __metric)))
 
-static struct pmu_event *find_metric(const char *metric, struct pmu_events_map *map)
+struct pmu_event *metricgroup__find_metric(const char *metric,
+					   struct pmu_events_map *map)
 {
 	struct pmu_event *pe;
 	int i;
@@ -985,7 +986,7 @@ static int __resolve_metric(struct metric *m,
 			struct expr_id *parent;
 			struct pmu_event *pe;
 
-			pe = find_metric(cur->key, map);
+			pe = metricgroup__find_metric(cur->key, map);
 			if (!pe)
 				continue;
 
@@ -1253,8 +1254,7 @@ int metricgroup__parse_groups(const struct option *opt,
 			      struct rblist *metric_events)
 {
 	struct evlist *perf_evlist = *(struct evlist **)opt->value;
-	struct pmu_events_map *map = perf_pmu__find_map(NULL);
-
+	struct pmu_events_map *map = pmu_events_map__find();
 
 	return parse_groups(perf_evlist, str, metric_no_group,
 			    metric_no_merge, NULL, metric_events, map);
@@ -1273,7 +1273,7 @@ int metricgroup__parse_groups_test(struct evlist *evlist,
 
 bool metricgroup__has_metric(const char *metric)
 {
-	struct pmu_events_map *map = perf_pmu__find_map(NULL);
+	struct pmu_events_map *map = pmu_events_map__find();
 	struct pmu_event *pe;
 	int i;
 
diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
index ed1b9392e624..cc4a92492a61 100644
--- a/tools/perf/util/metricgroup.h
+++ b/tools/perf/util/metricgroup.h
@@ -9,7 +9,6 @@
 
 struct evlist;
 struct evsel;
-struct evlist;
 struct option;
 struct rblist;
 struct pmu_events_map;
@@ -44,7 +43,8 @@ int metricgroup__parse_groups(const struct option *opt,
 			      bool metric_no_group,
 			      bool metric_no_merge,
 			      struct rblist *metric_events);
-
+struct pmu_event *metricgroup__find_metric(const char *metric,
+					   struct pmu_events_map *map);
 int metricgroup__parse_groups_test(struct evlist *evlist,
 				   struct pmu_events_map *map,
 				   const char *str,
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index c0c0fab22cb8..8123d218ad17 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -846,9 +846,9 @@ split_bpf_config_terms(struct list_head *evt_head_config,
 	struct parse_events_term *term, *temp;
 
 	/*
-	 * Currectly, all possible user config term
+	 * Currently, all possible user config term
 	 * belong to bpf object. parse_events__is_hardcoded_term()
-	 * happends to be a good flag.
+	 * happens to be a good flag.
 	 *
 	 * See parse_events_config_bpf() and
 	 * config_term_tracepoint().
@@ -898,7 +898,7 @@ int parse_events_load_bpf(struct parse_events_state *parse_state,
 
 	/*
 	 * Caller doesn't know anything about obj_head_config,
-	 * so combine them together again before returnning.
+	 * so combine them together again before returning.
 	 */
 	if (head_config)
 		list_splice_tail(&obj_head_config, head_config);
@@ -1185,10 +1185,10 @@ do {									   \
 	}
 
 	/*
-	 * Check term availbility after basic checking so
+	 * Check term availability after basic checking so
 	 * PARSE_EVENTS__TERM_TYPE_USER can be found and filtered.
 	 *
-	 * If check availbility at the entry of this function,
+	 * If check availability at the entry of this function,
 	 * user will see "'<sysfs term>' is not usable in 'perf stat'"
 	 * if an invalid config term is provided for legacy events
 	 * (for example, instructions/badterm/...), which is confusing.
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 46fd0f998484..286d5e415bdc 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -717,6 +717,11 @@ struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu)
 	return map;
 }
 
+struct pmu_events_map *__weak pmu_events_map__find(void)
+{
+	return perf_pmu__find_map(NULL);
+}
+
 bool pmu_uncore_alias_match(const char *pmu_name, const char *name)
 {
 	char *tmp = NULL, *tok, *str;
@@ -1069,7 +1074,7 @@ int perf_pmu__format_type(struct list_head *formats, const char *name)
 
 /*
  * Sets value based on the format definition (format parameter)
- * and unformated value (value parameter).
+ * and unformatted value (value parameter).
  */
 static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v,
 			     bool zero)
@@ -1408,7 +1413,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
 	}
 
 	/*
-	 * if no unit or scale foundin aliases, then
+	 * if no unit or scale found in aliases, then
 	 * set defaults as for evsel
 	 * unit cannot left to NULL
 	 */
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 160b0f561771..1f1749ba830f 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -114,6 +114,7 @@ void pmu_add_cpu_aliases_map(struct list_head *head, struct perf_pmu *pmu,
 			     struct pmu_events_map *map);
 
 struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu);
+struct pmu_events_map *pmu_events_map__find(void);
 bool pmu_uncore_alias_match(const char *pmu_name, const char *name);
 void perf_pmu_free_alias(struct perf_pmu_alias *alias);
 
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index a9cff3a50ddf..a78c8d59a555 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -3228,7 +3228,7 @@ errout:
 	return err;
 }
 
-/* Concatinate two arrays */
+/* Concatenate two arrays */
 static void *memcat(void *a, size_t sz_a, void *b, size_t sz_b)
 {
 	void *ret;
@@ -3258,7 +3258,7 @@ concat_probe_trace_events(struct probe_trace_event **tevs, int *ntevs,
 	if (*ntevs + ntevs2 > probe_conf.max_probes)
 		ret = -E2BIG;
 	else {
-		/* Concatinate the array of probe_trace_event */
+		/* Concatenate the array of probe_trace_event */
 		new_tevs = memcat(*tevs, (*ntevs) * sizeof(**tevs),
 				  *tevs2, ntevs2 * sizeof(**tevs2));
 		if (!new_tevs)
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 1b118c9c86a6..866f2d514d72 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -164,7 +164,7 @@ static struct probe_trace_arg_ref *alloc_trace_arg_ref(long offs)
 /*
  * Convert a location into trace_arg.
  * If tvar == NULL, this just checks variable can be converted.
- * If fentry == true and vr_die is a parameter, do huristic search
+ * If fentry == true and vr_die is a parameter, do heuristic search
  * for the location fuzzed by function entry mcount.
  */
 static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr,
@@ -498,7 +498,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 			       " nor array.\n", varname);
 			return -EINVAL;
 		}
-		/* While prcessing unnamed field, we don't care about this */
+		/* While processing unnamed field, we don't care about this */
 		if (field->ref && dwarf_diename(vr_die)) {
 			pr_err("Semantic error: %s must be referred by '.'\n",
 			       field->name);
@@ -1832,7 +1832,7 @@ static int line_range_walk_cb(const char *fname, int lineno,
 	    (lf->lno_s > lineno || lf->lno_e < lineno))
 		return 0;
 
-	/* Make sure this line can be reversable */
+	/* Make sure this line can be reversible */
 	if (cu_find_lineinfo(&lf->cu_die, addr, &__fname, &__lineno) > 0
 	    && (lineno != __lineno || strcmp(fname, __fname)))
 		return 0;
diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c
index 078a71773565..8130b56aa04b 100644
--- a/tools/perf/util/s390-cpumsf.c
+++ b/tools/perf/util/s390-cpumsf.c
@@ -45,7 +45,7 @@
  * the data portion is mmap()'ed.
  *
  * To sort the queues in chronological order, all queue access is controlled
- * by the auxtrace_heap. This is basicly a stack, each stack element has two
+ * by the auxtrace_heap. This is basically a stack, each stack element has two
  * entries, the queue number and a time stamp. However the stack is sorted by
  * the time stamps. The highest time stamp is at the bottom the lowest
  * (nearest) time stamp is at the top. That sort order is maintained at all
@@ -65,11 +65,11 @@
  * stamp of the last processed entry of the auxtrace_buffer replaces the
  * current auxtrace_heap top.
  *
- * 3. Auxtrace_queues might run of out data and are feeded by the
+ * 3. Auxtrace_queues might run of out data and are fed by the
  * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
  *
  * Event Generation
- * Each sampling-data entry in the auxilary trace data generates a perf sample.
+ * Each sampling-data entry in the auxiliary trace data generates a perf sample.
  * This sample is filled
  * with data from the auxtrace such as PID/TID, instruction address, CPU state,
  * etc. This sample is processed with perf_session__deliver_synth_event() to
@@ -575,7 +575,7 @@ static unsigned long long get_trailer_time(const unsigned char *buf)
  * pointer to the queue, the second parameter is the time stamp. This
  * is the time stamp:
  * - of the event that triggered this processing.
- * - or the time stamp when the last proccesing of this queue stopped.
+ * - or the time stamp when the last processing of this queue stopped.
  *   In this case it stopped at a 4KB page boundary and record the
  *   position on where to continue processing on the next invocation
  *   (see buffer->use_data and buffer->use_size).
@@ -640,7 +640,7 @@ static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
 			goto out;
 		}
 
-		pos += dsdes;	/* Skip diagnositic entry */
+		pos += dsdes;	/* Skip diagnostic entry */
 
 		/* Check for trailer entry */
 		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
diff --git a/tools/perf/util/s390-sample-raw.c b/tools/perf/util/s390-sample-raw.c
index cfcf8d534d76..08ec3c3ae0ee 100644
--- a/tools/perf/util/s390-sample-raw.c
+++ b/tools/perf/util/s390-sample-raw.c
@@ -160,11 +160,9 @@ static void s390_cpumcfdg_dump(struct perf_sample *sample)
 	const char *color = PERF_COLOR_BLUE;
 	struct cf_ctrset_entry *cep, ce;
 	struct pmu_events_map *map;
-	struct perf_pmu pmu;
 	u64 *p;
 
-	memset(&pmu, 0, sizeof(pmu));
-	map = perf_pmu__find_map(&pmu);
+	map = pmu_events_map__find();
 	while (offset < len) {
 		cep = (struct cf_ctrset_entry *)(buf + offset);
 
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index c83c2c6564e0..4e4aa4c97ac5 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -1531,7 +1531,7 @@ static void set_table_handlers(struct tables *tables)
 		 * Attempt to use the call path root from the call return
 		 * processor, if the call return processor is in use. Otherwise,
 		 * we allocate a new call path root. This prevents exporting
-		 * duplicate call path ids when both are in use simultaniously.
+		 * duplicate call path ids when both are in use simultaneously.
 		 */
 		if (tables->dbe.crp)
 			tables->dbe.cpr = tables->dbe.crp->cpr;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 859832a82496..eba3769be3f1 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1069,7 +1069,7 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample)
 		 * in "to" register.
 		 * For example, there is a call stack
 		 * "A"->"B"->"C"->"D".
-		 * The LBR registers will recorde like
+		 * The LBR registers will be recorded like
 		 * "C"->"D", "B"->"C", "A"->"B".
 		 * So only the first "to" register and all "from"
 		 * registers are needed to construct the whole stack.
@@ -1302,8 +1302,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
 
 	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
 		printf("... weight: %" PRIu64 "", sample->weight);
-			if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
+			if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
 				printf(",0x%"PRIx16"", sample->ins_lat);
+				printf(",0x%"PRIx16"", sample->p_stage_cyc);
+			}
 		printf("\n");
 	}
 
@@ -1584,7 +1586,7 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 		return tool->event_update(tool, event, &session->evlist);
 	case PERF_RECORD_HEADER_EVENT_TYPE:
 		/*
-		 * Depreceated, but we need to handle it for sake
+		 * Deprecated, but we need to handle it for sake
 		 * of old data files create in pipe mode.
 		 */
 		return 0;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 552b590485bf..88ce47f2547e 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -25,6 +25,7 @@
 #include <traceevent/event-parse.h>
 #include "mem-events.h"
 #include "annotate.h"
+#include "event.h"
 #include "time-utils.h"
 #include "cgroup.h"
 #include "machine.h"
@@ -36,7 +37,7 @@ const char	default_parent_pattern[] = "^sys_|^do_page_fault";
 const char	*parent_pattern = default_parent_pattern;
 const char	*default_sort_order = "comm,dso,symbol";
 const char	default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
-const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat";
+const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,p_stage_cyc";
 const char	default_top_sort_order[] = "dso,symbol";
 const char	default_diff_sort_order[] = "dso,symbol";
 const char	default_tracepoint_sort_order[] = "trace";
@@ -45,6 +46,8 @@ const char	*field_order;
 regex_t		ignore_callees_regex;
 int		have_ignore_callees = 0;
 enum sort_mode	sort__mode = SORT_MODE__NORMAL;
+const char	*dynamic_headers[] = {"local_ins_lat", "p_stage_cyc"};
+const char	*arch_specific_sort_keys[] = {"p_stage_cyc"};
 
 /*
  * Replaces all occurrences of a char used with the:
@@ -1408,6 +1411,25 @@ struct sort_entry sort_global_ins_lat = {
 	.se_width_idx	= HISTC_GLOBAL_INS_LAT,
 };
 
+static int64_t
+sort__global_p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return left->stat.p_stage_cyc - right->stat.p_stage_cyc;
+}
+
+static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
+					size_t size, unsigned int width)
+{
+	return repsep_snprintf(bf, size, "%-*u", width, he->stat.p_stage_cyc);
+}
+
+struct sort_entry sort_p_stage_cyc = {
+	.se_header      = "Pipeline Stage Cycle",
+	.se_cmp         = sort__global_p_stage_cyc_cmp,
+	.se_snprintf	= hist_entry__p_stage_cyc_snprintf,
+	.se_width_idx	= HISTC_P_STAGE_CYC,
+};
+
 struct sort_entry sort_mem_daddr_sym = {
 	.se_header	= "Data Symbol",
 	.se_cmp		= sort__daddr_cmp,
@@ -1816,6 +1838,21 @@ struct sort_dimension {
 	int			taken;
 };
 
+int __weak arch_support_sort_key(const char *sort_key __maybe_unused)
+{
+	return 0;
+}
+
+const char * __weak arch_perf_header_entry(const char *se_header)
+{
+	return se_header;
+}
+
+static void sort_dimension_add_dynamic_header(struct sort_dimension *sd)
+{
+	sd->entry->se_header = arch_perf_header_entry(sd->entry->se_header);
+}
+
 #define DIM(d, n, func) [d] = { .name = n, .entry = &(func) }
 
 static struct sort_dimension common_sort_dimensions[] = {
@@ -1841,6 +1878,7 @@ static struct sort_dimension common_sort_dimensions[] = {
 	DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
 	DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
 	DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
+	DIM(SORT_PIPELINE_STAGE_CYC, "p_stage_cyc", sort_p_stage_cyc),
 };
 
 #undef DIM
@@ -2739,7 +2777,20 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 			struct evlist *evlist,
 			int level)
 {
-	unsigned int i;
+	unsigned int i, j;
+
+	/*
+	 * Check to see if there are any arch specific
+	 * sort dimensions not applicable for the current
+	 * architecture. If so, Skip that sort key since
+	 * we don't want to display it in the output fields.
+	 */
+	for (j = 0; j < ARRAY_SIZE(arch_specific_sort_keys); j++) {
+		if (!strcmp(arch_specific_sort_keys[j], tok) &&
+				!arch_support_sort_key(tok)) {
+			return 0;
+		}
+	}
 
 	for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) {
 		struct sort_dimension *sd = &common_sort_dimensions[i];
@@ -2747,6 +2798,11 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 		if (strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
+		for (j = 0; j < ARRAY_SIZE(dynamic_headers); j++) {
+			if (!strcmp(dynamic_headers[j], sd->name))
+				sort_dimension_add_dynamic_header(sd);
+		}
+
 		if (sd->entry == &sort_parent) {
 			int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
 			if (ret) {
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 63f67a3f3630..87a092645aa7 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -51,6 +51,7 @@ struct he_stat {
 	u64			period_guest_us;
 	u64			weight;
 	u64			ins_lat;
+	u64			p_stage_cyc;
 	u32			nr_events;
 };
 
@@ -234,6 +235,7 @@ enum sort_type {
 	SORT_CODE_PAGE_SIZE,
 	SORT_LOCAL_INS_LAT,
 	SORT_GLOBAL_INS_LAT,
+	SORT_PIPELINE_STAGE_CYC,
 
 	/* branch stack specific sort keys */
 	__SORT_BRANCH_STACK,
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 7f09cdaf5b60..d3137bc17065 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -439,6 +439,12 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int
 		if (counter->cgrp)
 			os.nfields++;
 	}
+
+	if (!config->no_csv_summary && config->csv_output &&
+	    config->summary && !config->interval) {
+		fprintf(config->output, "%16s%s", "summary", config->csv_sep);
+	}
+
 	if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
 		if (config->metric_only) {
 			pm(config, &os, NULL, "", "", 0);
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 6ccf21a72f06..3f800e71126f 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -9,6 +9,7 @@
 #include "expr.h"
 #include "metricgroup.h"
 #include "cgroup.h"
+#include "units.h"
 #include <linux/zalloc.h>
 
 /*
@@ -1270,18 +1271,15 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL,
 				evsel->name, evsel->metric_name, NULL, 1, cpu, out, st);
 	} else if (runtime_stat_n(st, STAT_NSECS, cpu, &rsd) != 0) {
-		char unit = 'M';
-		char unit_buf[10];
+		char unit = ' ';
+		char unit_buf[10] = "/sec";
 
 		total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd);
-
 		if (total)
-			ratio = 1000.0 * avg / total;
-		if (ratio < 0.001) {
-			ratio *= 1000;
-			unit = 'K';
-		}
-		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
+			ratio = convert_unit_double(1000000000.0 * avg / total, &unit);
+
+		if (unit != ' ')
+			snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
 		print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio);
 	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
 		print_smi_cost(config, cpu, out, st, &rsd);
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index c400f8dde017..2db46b9bebd0 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -76,8 +76,7 @@ double rel_stddev_stats(double stddev, double avg)
 	return pct;
 }
 
-bool __perf_evsel_stat__is(struct evsel *evsel,
-			   enum perf_stat_evsel_id id)
+bool __perf_stat_evsel__is(struct evsel *evsel, enum perf_stat_evsel_id id)
 {
 	struct perf_stat_evsel *ps = evsel->stats;
 
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index d85c292148bb..48e6a06233fa 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -128,6 +128,7 @@ struct perf_stat_config {
 	bool			 all_user;
 	bool			 percore_show_thread;
 	bool			 summary;
+	bool			 no_csv_summary;
 	bool			 metric_no_group;
 	bool			 metric_no_merge;
 	bool			 stop_read_counter;
@@ -160,6 +161,7 @@ struct perf_stat_config {
 };
 
 void perf_stat__set_big_num(int set);
+void perf_stat__set_no_csv_summary(int set);
 
 void update_stats(struct stats *stats, u64 val);
 double avg_stats(struct stats *stats);
@@ -187,11 +189,10 @@ struct perf_aggr_thread_value {
 	u64 ena;
 };
 
-bool __perf_evsel_stat__is(struct evsel *evsel,
-			   enum perf_stat_evsel_id id);
+bool __perf_stat_evsel__is(struct evsel *evsel, enum perf_stat_evsel_id id);
 
 #define perf_stat_evsel__is(evsel, id) \
-	__perf_evsel_stat__is(evsel, PERF_STAT_EVSEL_ID__ ## id)
+	__perf_stat_evsel__is(evsel, PERF_STAT_EVSEL_ID__ ## id)
 
 extern struct runtime_stat rt_stat;
 extern struct stats walltime_nsecs_stats;
diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h
index ea94d8628980..be94d7046fa0 100644
--- a/tools/perf/util/strbuf.h
+++ b/tools/perf/util/strbuf.h
@@ -12,7 +12,7 @@
  *    build complex strings/buffers whose final size isn't easily known.
  *
  *    It is NOT legal to copy the ->buf pointer away.
- *    `strbuf_detach' is the operation that detachs a buffer from its shell
+ *    `strbuf_detach' is the operation that detaches a buffer from its shell
  *    while keeping the shell valid wrt its invariants.
  *
  * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
diff --git a/tools/perf/util/strfilter.h b/tools/perf/util/strfilter.h
index e0c25a40f796..c05aca9ca582 100644
--- a/tools/perf/util/strfilter.h
+++ b/tools/perf/util/strfilter.h
@@ -8,8 +8,8 @@
 
 /* A node of string filter */
 struct strfilter_node {
-	struct strfilter_node *l;	/* Tree left branche (for &,|) */
-	struct strfilter_node *r;	/* Tree right branche (for !,&,|) */
+	struct strfilter_node *l;	/* Tree left branch (for &,|) */
+	struct strfilter_node *r;	/* Tree right branch (for !,&,|) */
 	const char *p;		/* Operator or rule */
 };
 
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 6dff843fd883..4c56aa837434 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1058,7 +1058,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map,
 		curr_dso->symtab_type = dso->symtab_type;
 		maps__insert(kmaps, curr_map);
 		/*
-		 * Add it before we drop the referece to curr_map, i.e. while
+		 * Add it before we drop the reference to curr_map, i.e. while
 		 * we still are sure to have a reference to this DSO via
 		 * *curr_map->dso.
 		 */
diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c
index 35c936ce33ef..2664fb65e47a 100644
--- a/tools/perf/util/symbol_fprintf.c
+++ b/tools/perf/util/symbol_fprintf.c
@@ -68,7 +68,7 @@ size_t dso__fprintf_symbols_by_name(struct dso *dso,
 
 	for (nd = rb_first_cached(&dso->symbol_names); nd; nd = rb_next(nd)) {
 		pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
-		fprintf(fp, "%s\n", pos->sym.name);
+		ret += fprintf(fp, "%s\n", pos->sym.name);
 	}
 
 	return ret;
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index dff178103ce5..35aa0c0f7cd9 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -1211,7 +1211,7 @@ static size_t mask_size(struct perf_cpu_map *map, int *max)
 	*max = 0;
 
 	for (i = 0; i < map->nr; i++) {
-		/* bit possition of the cpu is + 1 */
+		/* bit position of the cpu is + 1 */
 		int bit = map->map[i] + 1;
 
 		if (bit > *max)
@@ -1237,7 +1237,7 @@ void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int
 	 *   mask  = size of 'struct perf_record_record_cpu_map' +
 	 *           maximum cpu bit converted to size of longs
 	 *
-	 * and finaly + the size of 'struct perf_record_cpu_map_data'.
+	 * and finally + the size of 'struct perf_record_cpu_map_data'.
 	 */
 	size_cpus = cpus_size(map);
 	size_mask = mask_size(map, max);
diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c
index 03bd99d3be16..a2e906858891 100644
--- a/tools/perf/util/syscalltbl.c
+++ b/tools/perf/util/syscalltbl.c
@@ -34,6 +34,10 @@ static const char **syscalltbl_native = syscalltbl_powerpc_32;
 #include <asm/syscalls.c>
 const int syscalltbl_native_max_id = SYSCALLTBL_ARM64_MAX_ID;
 static const char **syscalltbl_native = syscalltbl_arm64;
+#elif defined(__mips__)
+#include <asm/syscalls_n64.c>
+const int syscalltbl_native_max_id = SYSCALLTBL_MIPS_N64_MAX_ID;
+static const char **syscalltbl_native = syscalltbl_mips_n64;
 #endif
 
 struct syscall {
diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h
index f132c6c2eef8..1bce3eb28ef2 100644
--- a/tools/perf/util/target.h
+++ b/tools/perf/util/target.h
@@ -16,6 +16,8 @@ struct target {
 	bool	     uses_mmap;
 	bool	     default_per_cpu;
 	bool	     per_thread;
+	bool	     use_bpf;
+	const char   *attr_map;
 };
 
 enum target_errno {
@@ -66,7 +68,7 @@ static inline bool target__has_cpu(struct target *target)
 
 static inline bool target__has_bpf(struct target *target)
 {
-	return target->bpf_str;
+	return target->bpf_str || target->use_bpf;
 }
 
 static inline bool target__none(struct target *target)
diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h
index 3bc47a42af8e..b3cd09beb62f 100644
--- a/tools/perf/util/thread-stack.h
+++ b/tools/perf/util/thread-stack.h
@@ -16,7 +16,6 @@ struct comm;
 struct ip_callchain;
 struct symbol;
 struct dso;
-struct comm;
 struct perf_sample;
 struct addr_location;
 struct call_path;
diff --git a/tools/perf/util/units.c b/tools/perf/util/units.c
index a46762aec4c9..32c39cfe209b 100644
--- a/tools/perf/util/units.c
+++ b/tools/perf/util/units.c
@@ -33,28 +33,35 @@ unsigned long parse_tag_value(const char *str, struct parse_tag *tags)
 	return (unsigned long) -1;
 }
 
-unsigned long convert_unit(unsigned long value, char *unit)
+double convert_unit_double(double value, char *unit)
 {
 	*unit = ' ';
 
-	if (value > 1000) {
-		value /= 1000;
+	if (value > 1000.0) {
+		value /= 1000.0;
 		*unit = 'K';
 	}
 
-	if (value > 1000) {
-		value /= 1000;
+	if (value > 1000.0) {
+		value /= 1000.0;
 		*unit = 'M';
 	}
 
-	if (value > 1000) {
-		value /= 1000;
+	if (value > 1000.0) {
+		value /= 1000.0;
 		*unit = 'G';
 	}
 
 	return value;
 }
 
+unsigned long convert_unit(unsigned long value, char *unit)
+{
+	double v = convert_unit_double((double)value, unit);
+
+	return (unsigned long)v;
+}
+
 int unit_number__scnprintf(char *buf, size_t size, u64 n)
 {
 	char unit[4] = "BKMG";
diff --git a/tools/perf/util/units.h b/tools/perf/util/units.h
index 99263b6a23f7..ea43e74e3240 100644
--- a/tools/perf/util/units.h
+++ b/tools/perf/util/units.h
@@ -12,6 +12,7 @@ struct parse_tag {
 
 unsigned long parse_tag_value(const char *str, struct parse_tag *tags);
 
+double convert_unit_double(double value, char *unit);
 unsigned long convert_unit(unsigned long value, char *unit);
 int unit_number__scnprintf(char *buf, size_t size, u64 n);
 
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index 9aededc0bc06..71a353349181 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -82,7 +82,7 @@ UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
 #define DW_EH_PE_funcrel	0x40	/* start-of-procedure-relative */
 #define DW_EH_PE_aligned	0x50	/* aligned pointer */
 
-/* Flags intentionaly not handled, since they're not needed:
+/* Flags intentionally not handled, since they're not needed:
  * #define DW_EH_PE_indirect      0x80
  * #define DW_EH_PE_uleb128       0x01
  * #define DW_EH_PE_udata2        0x02