diff options
12 files changed, 7011 insertions, 216 deletions
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json index 73b6865a769d..6789285555f0 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json @@ -47,7 +47,7 @@ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", "MetricGroup": "TopdownL1", "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided." + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.", @@ -130,44 +130,26 @@ "MetricName": "FLOPc_SMT" }, { - "BriefDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width)", + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", - "PublicDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width). Values > 1 are possible due to Fused-Multiply Add (FMA) counting." + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width). SMT version; use when SMT is enabled and measuring per logical CPU.", + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.", "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", "MetricGroup": "Cor;Flops;HPC_SMT", "MetricName": "FP_Arith_Utilization_SMT", - "PublicDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width). Values > 1 are possible due to Fused-Multiply Add (FMA) counting. SMT version; use when SMT is enabled and measuring per logical CPU." + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "Branch_Misprediction_Cost" - }, - { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts_SMT", - "MetricName": "Branch_Misprediction_Cost_SMT" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "IpMispredict" - }, - { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", "MetricGroup": "SMT", @@ -204,7 +186,7 @@ "MetricName": "IpTB" }, { - "BriefDescription": "Branch instructions per taken branch.", + "BriefDescription": "Branch instructions per taken branch. ", "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;PGO", "MetricName": "BpTkBranch" @@ -257,41 +239,52 @@ "MetricName": "Instructions" }, { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "Retire" + }, + { + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "Execute" + }, + { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load instructions (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "Load_Miss_Real_Latency", - "PublicDescription": "Actual Average Latency for L1 data-cache miss demand load instructions (in core cycles). Latency may be overestimated for multi-load instructions - e.g. repeat strings." + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "IpMispredict" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", - "MetricName": "MLP" + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "Branch_Misprediction_Cost" }, { - "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "L1D_Cache_Fill_BW" + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts_SMT", + "MetricName": "Branch_Misprediction_Cost_SMT" }, { - "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "L2_Cache_Fill_BW" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "Load_Miss_Real_Latency" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "L3_Cache_Fill_BW" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -306,13 +299,13 @@ "MetricName": "L2MPKI" }, { - "BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)", + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", "MetricGroup": "Mem;CacheMisses;Offcore", "MetricName": "L2MPKI_All" }, { - "BriefDescription": "L2 cache misses per kilo instruction for all demand loads (including speculative)", + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "Mem;CacheMisses", "MetricName": "L2MPKI_Load" @@ -349,6 +342,48 @@ "MetricName": "Page_Walks_Utilization_SMT" }, { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L1D_Cache_Fill_BW" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L2_Cache_Fill_BW" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L3_Cache_Fill_BW" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L1D_Cache_Fill_BW_1T" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L2_Cache_Fill_BW_1T" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L3_Cache_Fill_BW_1T" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "0", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "L3_Cache_Access_BW_1T" + }, + { "BriefDescription": "Average CPU Utilization", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "HPC;Summary", @@ -364,7 +399,8 @@ "BriefDescription": "Giga Floating Point Operations Per Second", "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time", "MetricGroup": "Cor;Flops;HPC", - "MetricName": "GFLOPs" + "MetricName": "GFLOPs", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/cache.json b/tools/perf/pmu-events/arch/x86/broadwellde/cache.json index 0f4de912d099..4b77181b2c53 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/cache.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/cache.json @@ -806,4 +806,4 @@ "SampleAfterValue": "100003", "UMask": "0x10" } -]
\ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/floating-point.json b/tools/perf/pmu-events/arch/x86/broadwellde/floating-point.json index fdf5dc40b835..46cf18490140 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/floating-point.json @@ -190,4 +190,4 @@ "SampleAfterValue": "2000003", "UMask": "0x3" } -]
\ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/frontend.json b/tools/perf/pmu-events/arch/x86/broadwellde/frontend.json index f0bcb945ff76..37ce8034b2ed 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/frontend.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/frontend.json @@ -292,4 +292,4 @@ "SampleAfterValue": "2000003", "UMask": "0x1" } -]
\ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/memory.json b/tools/perf/pmu-events/arch/x86/broadwellde/memory.json index 604059e7eb58..a3a5cc6dab42 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/memory.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/memory.json @@ -429,4 +429,4 @@ "SampleAfterValue": "2000003", "UMask": "0x40" } -]
\ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/other.json b/tools/perf/pmu-events/arch/x86/broadwellde/other.json index 4b360fe96698..917d145d5227 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/other.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/other.json @@ -41,4 +41,4 @@ "SampleAfterValue": "2000003", "UMask": "0x1" } -]
\ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/pipeline.json b/tools/perf/pmu-events/arch/x86/broadwellde/pipeline.json index 7580b8af0d13..85654037b768 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/pipeline.json @@ -1378,4 +1378,4 @@ "SampleAfterValue": "2000003", "UMask": "0x1" } -]
\ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json index 58ed6d33d1f4..caadbca1b15b 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json @@ -1,316 +1,3826 @@ [ { - "BriefDescription": "Uncore cache clock ticks", + "BriefDescription": "Bounce Control", + "Counter": "0,1,2,3", + "EventCode": "0xA", + "EventName": "UNC_C_BOUNCE_CONTROL", + "PerPkg": "1", + "Unit": "CBO" + }, + { + "BriefDescription": "Uncore Clocks", "Counter": "0,1,2,3", "EventName": "UNC_C_CLOCKTICKS", "PerPkg": "1", "Unit": "CBO" }, { - "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)", + "BriefDescription": "Counter 0 Occupancy", + "Counter": "0,1,2,3", + "EventCode": "0x1F", + "EventName": "UNC_C_COUNTER0_OCCUPANCY", + "PerPkg": "1", + "PublicDescription": "Since occupancy counts can only be captured in the Cbo's 0 counter, this event allows a user to capture occupancy related information by filtering the Cb0 occupancy count captured in Counter 0. The filtering available is found in the control register - threshold, invert and edge detect. E.g. setting threshold to 1 can effectively monitor how many cycles the monitored queue has an entry.", + "Unit": "CBO" + }, + { + "BriefDescription": "FaST wire asserted", + "Counter": "0,1", + "EventCode": "0x9", + "EventName": "UNC_C_FAST_ASSERTED", + "PerPkg": "1", + "PublicDescription": "Counts the number of cycles either the local distress or incoming distress signals are asserted. Incoming distress includes both up and dn.", + "Unit": "CBO" + }, + { + "BriefDescription": "Cache Lookups; Any Request", "Counter": "0,1,2,3", "EventCode": "0x34", "EventName": "UNC_C_LLC_LOOKUP.ANY", - "Filter": "filter_state=0x1", + "Filter": "CBoFilter0[23:17]", "PerPkg": "1", - "ScaleUnit": "64Bytes", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CBoGlCtrl[22:18] bits correspond to [FMESI] state.; Filters for any transaction originating from the IPQ or IRQ. This does not include lookups originating from the ISMQ.", "UMask": "0x11", "Unit": "CBO" }, { - "BriefDescription": "M line evictions from LLC (writebacks to memory)", + "BriefDescription": "Cache Lookups; Data Read Request", + "Counter": "0,1,2,3", + "EventCode": "0x34", + "EventName": "UNC_C_LLC_LOOKUP.DATA_READ", + "Filter": "CBoFilter0[23:17]", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CBoGlCtrl[22:18] bits correspond to [FMESI] state.; Read transactions", + "UMask": "0x3", + "Unit": "CBO" + }, + { + "BriefDescription": "Cache Lookups; Lookups that Match NID", + "Counter": "0,1,2,3", + "EventCode": "0x34", + "EventName": "UNC_C_LLC_LOOKUP.NID", + "Filter": "CBoFilter0[23:17]", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CBoGlCtrl[22:18] bits correspond to [FMESI] state.; Qualify one of the other subevents by the Target NID. The NID is programmed in Cn_MSR_PMON_BOX_FILTER.nid. In conjunction with STATE = I, it is possible to monitor misses to specific NIDs in the system.", + "UMask": "0x41", + "Unit": "CBO" + }, + { + "BriefDescription": "Cache Lookups; Any Read Request", + "Counter": "0,1,2,3", + "EventCode": "0x34", + "EventName": "UNC_C_LLC_LOOKUP.READ", + "Filter": "CBoFilter0[22:18]", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CBoGlCtrl[22:18] bits correspond to [FMESI] state.; Read transactions", + "UMask": "0x21", + "Unit": "CBO" + }, + { + "BriefDescription": "Cache Lookups; External Snoop Request", + "Counter": "0,1,2,3", + "EventCode": "0x34", + "EventName": "UNC_C_LLC_LOOKUP.REMOTE_SNOOP", + "Filter": "CBoFilter0[23:17]", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CBoGlCtrl[22:18] bits correspond to [FMESI] state.; Filters for only snoop requests coming from the remote socket(s) through the IPQ.", + "UMask": "0x9", + "Unit": "CBO" + }, + { + "BriefDescription": "Cache Lookups; Write Requests", + "Counter": "0,1,2,3", + "EventCode": "0x34", + "EventName": "UNC_C_LLC_LOOKUP.WRITE", + "Filter": "CBoFilter0[23:17]", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CBoGlCtrl[22:18] bits correspond to [FMESI] state.; Writeback transactions from L2 to the LLC This includes all write transactions -- both Cachable and UC.", + "UMask": "0x5", + "Unit": "CBO" + }, + { + "BriefDescription": "Lines Victimized; Lines in E state", + "Counter": "0,1,2,3", + "EventCode": "0x37", + "EventName": "UNC_C_LLC_VICTIMS.E_STATE", + "PerPkg": "1", + "PublicDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "UMask": "0x2", + "Unit": "CBO" + }, + { + "BriefDescription": "Lines Victimized", + "Counter": "0,1,2,3", + "EventCode": "0x37", + "EventName": "UNC_C_LLC_VICTIMS.F_STATE", + "PerPkg": "1", + "PublicDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "UMask": "0x8", + "Unit": "CBO" + }, + { + "BriefDescription": "Lines Victimized; Lines in S State", + "Counter": "0,1,2,3", + "EventCode": "0x37", + "EventName": "UNC_C_LLC_VICTIMS.I_STATE", + "PerPkg": "1", + "PublicDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "UMask": "0x4", + "Unit": "CBO" + }, + { + "BriefDescription": "Lines Victimized", + "Counter": "0,1,2,3", + "EventCode": "0x37", + "EventName": "UNC_C_LLC_VICTIMS.MISS", + "PerPkg": "1", + "PublicDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "UMask": "0x10", + "Unit": "CBO" + }, + { + "BriefDescription": "Lines Victimized; Lines in M state", "Counter": "0,1,2,3", "EventCode": "0x37", "EventName": "UNC_C_LLC_VICTIMS.M_STATE", "PerPkg": "1", - "ScaleUnit": "64Bytes", + "PublicDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", "UMask": "0x1", "Unit": "CBO" }, { - "BriefDescription": "LLC misses - demand and prefetch data reads - excludes LLC prefetches. Derived from unc_c_tor_inserts.miss_opcode", + "BriefDescription": "Lines Victimized; Victimized Lines that Match NID", "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_MISSES.DATA_READ", - "Filter": "filter_opc=0x182", + "EventCode": "0x37", + "EventName": "UNC_C_LLC_VICTIMS.NID", + "Filter": "CBoFilter1[17:10]", "PerPkg": "1", - "ScaleUnit": "64Bytes", + "PublicDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.; Qualify one of the other subevents by the Target NID. The NID is programmed in Cn_MSR_PMON_BOX_FILTER.nid. In conjunction with STATE = I, it is possible to monitor misses to specific NIDs in the system.", + "UMask": "0x40", + "Unit": "CBO" + }, + { + "BriefDescription": "Cbo Misc; DRd hitting non-M with raw CV=0", + "Counter": "0,1,2,3", + "EventCode": "0x39", + "EventName": "UNC_C_MISC.CVZERO_PREFETCH_MISS", + "PerPkg": "1", + "PublicDescription": "Miscellaneous events in the Cbo.", + "UMask": "0x20", + "Unit": "CBO" + }, + { + "BriefDescription": "Cbo Misc; Clean Victim with raw CV=0", + "Counter": "0,1,2,3", + "EventCode": "0x39", + "EventName": "UNC_C_MISC.CVZERO_PREFETCH_VICTIM", + "PerPkg": "1", + "PublicDescription": "Miscellaneous events in the Cbo.", + "UMask": "0x10", + "Unit": "CBO" + }, + { + "BriefDescription": "Cbo Misc; RFO HitS", + "Counter": "0,1,2,3", + "EventCode": "0x39", + "EventName": "UNC_C_MISC.RFO_HIT_S", + "PerPkg": "1", + "PublicDescription": "Miscellaneous events in the Cbo.; Number of times that an RFO hit in S state. This is useful for determining if it might be good for a workload to use RspIWB instead of RspSWB.", + "UMask": "0x8", + "Unit": "CBO" + }, + { + "BriefDescription": "Cbo Misc; Silent Snoop Eviction", + "Counter": "0,1,2,3", + "EventCode": "0x39", + "EventName": "UNC_C_MISC.RSPI_WAS_FSE", + "PerPkg": "1", + "PublicDescription": "Miscellaneous events in the Cbo.; Counts the number of times when a Snoop hit in FSE states and triggered a silent eviction. This is useful because this information is lost in the PRE encodings.", + "UMask": "0x1", + "Unit": "CBO" + }, + { + "BriefDescription": "Cbo Misc", + "Counter": "0,1,2,3", + "EventCode": "0x39", + "EventName": "UNC_C_MISC.STARTED", + "PerPkg": "1", + "PublicDescription": "Miscellaneous events in the Cbo.", + "UMask": "0x4", + "Unit": "CBO" + }, + { + "BriefDescription": "Cbo Misc; Write Combining Aliasing", + "Counter": "0,1,2,3", + "EventCode": "0x39", + "EventName": "UNC_C_MISC.WC_ALIASING", + "PerPkg": "1", + "PublicDescription": "Miscellaneous events in the Cbo.; Counts the number of times that a USWC write (WCIL(F)) transaction hit in the LLC in M state, triggering a WBMtoI followed by the USWC write. This occurs when there is WC aliasing.", + "UMask": "0x2", + "Unit": "CBO" + }, + { + "BriefDescription": "LRU Queue; LRU Age 0", + "Counter": "0,1,2,3", + "EventCode": "0x3C", + "EventName": "UNC_C_QLRU.AGE0", + "PerPkg": "1", + "PublicDescription": "How often age was set to 0", + "UMask": "0x1", + "Unit": "CBO" + }, + { + "BriefDescription": "LRU Queue; LRU Age 1", + "Counter": "0,1,2,3", + "EventCode": "0x3C", + "EventName": "UNC_C_QLRU.AGE1", + "PerPkg": "1", + "PublicDescription": "How often age was set to 1", + "UMask": "0x2", + "Unit": "CBO" + }, + { + "BriefDescription": "LRU Queue; LRU Age 2", + "Counter": "0,1,2,3", + "EventCode": "0x3C", + "EventName": "UNC_C_QLRU.AGE2", + "PerPkg": "1", + "PublicDescription": "How often age was set to 2", + "UMask": "0x4", + "Unit": "CBO" + }, + { + "BriefDescription": "LRU Queue; LRU Age 3", + "Counter": "0,1,2,3", + "EventCode": "0x3C", + "EventName": "UNC_C_QLRU.AGE3", + "PerPkg": "1", + "PublicDescription": "How often age was set to 3", + "UMask": "0x8", + "Unit": "CBO" + }, + { + "BriefDescription": "LRU Queue; LRU Bits Decremented", + "Counter": "0,1,2,3", + "EventCode": "0x3C", + "EventName": "UNC_C_QLRU.LRU_DECREMENT", + "PerPkg": "1", + "PublicDescription": "How often all LRU bits were decremented by 1", + "UMask": "0x10", + "Unit": "CBO" + }, + { + "BriefDescription": "LRU Queue; Non-0 Aged Victim", + "Counter": "0,1,2,3", + "EventCode": "0x3C", + "EventName": "UNC_C_QLRU.VICTIM_NON_ZERO", + "PerPkg": "1", + "PublicDescription": "How often we picked a victim that had a non-zero age", + "UMask": "0x20", + "Unit": "CBO" + }, + { + "BriefDescription": "AD Ring In Use; All", + "Counter": "0,1,2,3", + "EventCode": "0x1B", + "EventName": "UNC_C_RING_AD_USED.ALL", + "PerPkg": "1", + "PublicDescription": "Counts the number of cycles that the AD ring is being used at this ring stop. This includes when packets are passing by and when packets are being sunk, but does not include when packets are being sent from the ring stop. We really have two rings in BDX -- a clockwise ring and a counter-clockwise ring. On the left side of the ring, the UP direction is on the clockwise ring and DN is on the counter-clockwise ring. On the right side of the ring, this is reversed. The first half of the CBos are on the left side of the ring, and the 2nd half are on the right side of the ring. In other words (for example), in a 4c part, Cbo 0 UP AD is NOT the same ring as CBo 2 UP AD because they are on opposite sides of the ring.", + "UMask": "0xF", + "Unit": "CBO" + }, + { + "BriefDescription": "AD Ring In Use; Down", + "Counter": "0,1,2,3", + "EventCode": "0x1B", + "EventName": "UNC_C_RING_AD_USED.CCW", + "PerPkg": "1", + "PublicDescription": "Counts the number of cycles that the AD ring is being used at this ring stop. This includes when packets are passing by and when packets are being sunk, but does not include when packets are being sent from the ring stop. We really have two rings in BDX -- a clockwise ring and a counter-clockwise ring. On the left side of the ring, the UP direction is on the clockwise ring and DN is on the counter-clockwise ring. On the right side of the ring, this is reversed. The first half of the CBos are on the left side of the ring, and the 2nd half are on the right side of the ring. In other words (for example), in a 4c part, Cbo 0 UP AD is NOT the same ring as CBo 2 UP AD because they are on opposite sides of the ring.", + "UMask": "0xC", + "Unit": "CBO" + }, + { + "BriefDescription": "AD Ring In Use; Up", + "Counter": "0,1,2,3", + "EventCode": "0x1B", + "EventName": "UNC_C_RING_AD_USED.CW", + "PerPkg": "1", + "PublicDescription": "Counts the number of cycles that the AD ring is being used at this ring stop. This includes when packets are passing by and when packets are being sunk, but does not include when packets are being sent from the ring stop. We really have two rings in BDX -- a clockwise ring and a counter-clockwise ring. On the left side of the ring, the UP direction is on the clockwise ring and DN is on the counter-clockwise ring. On the right side of the ring, this is reversed. The first half of the CBos are on the left side of the ring, and the 2nd half are on the right side of the ring. In other words (for example), in a 4c part, Cbo 0 UP AD is NOT the same ring as CBo 2 UP AD because they are on opposite sides of the ring.", "UMask": "0x3", "Unit": "CBO" }, { - "BriefDescription": "LLC misses - Uncacheable reads (from cpu) . Derived from unc_c_tor_inserts.miss_opcode", + "BriefDescription": "AD Ring In Use; Down and Even", "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_MISSES.UNCACHEABLE", - "Filter": "filter_opc=0x187", + "EventCode": "0x1B", + "EventName": "U |
