From 90503f9ffee927c3abdc94a4862d13ae71ea9442 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:39 -0800 Subject: powercap: intel_rapl: Use unit conversion macros from units.h Replace hardcoded numeric constants with standard unit conversion macros from linux/units.h for better code clarity and self-documentation. Add MICROJOULE_PER_JOULE and NANOJOULE_PER_JOULE to units.h to support energy unit conversions, following the existing pattern for power units. No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/units.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index 80d57c50b9e3..c6d78988613a 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -57,6 +57,9 @@ #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL +#define MICROJOULE_PER_JOULE 1000000UL +#define NANOJOULE_PER_JOULE 1000000000UL + #define BYTES_PER_KBIT (KILO / BITS_PER_BYTE) #define BYTES_PER_MBIT (MEGA / BITS_PER_BYTE) #define BYTES_PER_GBIT (GIGA / BITS_PER_BYTE) -- cgit v1.2.3 From d7ca7d1488cc916dbf0a6a594abbda81d4eaeee9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:40 -0800 Subject: powercap: intel_rapl: Allow interface drivers to configure rapl_defaults RAPL default settings vary across different RAPL interfaces (MSR, TPMI, MMIO). Currently, these defaults are stored in the common RAPL driver, which requires interface-specific handling logic and makes the common layer unnecessarily complex. There is no strong reason for the common code to own these defaults, since they are inherently interface-specific. To prepare for moving default configuration into the individual interface drivers, 1. Move struct rapl_defaults into a shared header so that interface drivers can directly populate their own default settings. 2. Change the @defaults field in struct rapl_if_priv from void * to const struct rapl_defaults * to improve type safety and readability and update the common driver to use the typed defaults structure. 3. Update all internal getter functions and local pointers to use const struct rapl_defaults * to maintain const-correctness. 4. Rename and export the common helper functions (check_unit, set_floor_freq, compute_time_window) so interface drivers may reuse or override them as appropriate. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL default settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-9-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index fa1f328d6712..6d694099a3ad 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -128,6 +128,16 @@ struct reg_action { int err; }; +struct rapl_defaults { + u8 floor_freq_reg_addr; + int (*check_unit)(struct rapl_domain *rd); + void (*set_floor_freq)(struct rapl_domain *rd, bool mode); + u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, bool to_raw); + unsigned int dram_domain_energy_unit; + unsigned int psys_domain_energy_unit; + bool spr_psys_bits; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -142,7 +152,7 @@ struct reg_action { * registers. * @write_raw: Callback for writing RAPL interface specific * registers. - * @defaults: internal pointer to interface default settings + * @defaults: pointer to default settings * @rpi: internal pointer to interface primitive info */ struct rapl_if_priv { @@ -154,7 +164,7 @@ struct rapl_if_priv { int limits[RAPL_DOMAIN_MAX]; int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); - void *defaults; + const struct rapl_defaults *defaults; void *rpi; }; @@ -211,6 +221,9 @@ void rapl_remove_package_cpuslocked(struct rapl_package *rp); struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu); struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); +int rapl_default_check_unit(struct rapl_domain *rd); +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode); +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw); #ifdef CONFIG_PERF_EVENTS int rapl_package_add_pmu(struct rapl_package *rp); -- cgit v1.2.3 From 16c1e8385b3bb65d412d7a60107f8894587c63fa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Mar 2026 15:25:44 -0400 Subject: cpufreq: optimize policy_is_shared() The switch to cpumask_nth() over cpumask_weight(), as it may return earlier - as soon as the function counts the required number of CPUs. Signed-off-by: Yury Norov Acked-by: Viresh Kumar Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/20260314192544.605914-1-ynorov@nvidia.com Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..8ca2bcb3d7ae 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -232,7 +232,7 @@ static inline bool policy_is_inactive(struct cpufreq_policy *policy) static inline bool policy_is_shared(struct cpufreq_policy *policy) { - return cpumask_weight(policy->cpus) > 1; + return cpumask_nth(1, policy->cpus) < nr_cpumask_bits; } #ifdef CONFIG_CPU_FREQ -- cgit v1.2.3 From f3b536878a3cf47e5193a96176a3ca2aaf0d848f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Mar 2026 22:14:44 -0700 Subject: powercap: correct kernel-doc function parameter names Use the correct function parameter names in kernel-doc comments to avoid these warnings: Warning: include/linux/powercap.h:254 function parameter 'name' not described in 'powercap_register_control_type' Warning: include/linux/powercap.h:298 function parameter 'nr_constraints' not described in 'powercap_register_zone' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260312051444.685136-1-rdunlap@infradead.org Signed-off-by: Rafael J. Wysocki --- include/linux/powercap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/powercap.h b/include/linux/powercap.h index 3d557bbcd2c7..603419db924c 100644 --- a/include/linux/powercap.h +++ b/include/linux/powercap.h @@ -238,7 +238,7 @@ static inline void *powercap_get_zone_data(struct powercap_zone *power_zone) * Advantage of this parameter is that client can embed * this data in its data structures and allocate in a * single call, preventing multiple allocations. -* @control_type_name: The Name of this control_type, which will be shown +* @name: The Name of this control_type, which will be shown * in the sysfs Interface. * @ops: Callbacks for control type. This parameter is optional. * @@ -277,7 +277,7 @@ int powercap_unregister_control_type(struct powercap_control_type *instance); * @name: A name for this zone. * @parent: A pointer to the parent power zone instance if any or NULL * @ops: Pointer to zone operation callback structure. -* @no_constraints: Number of constraints for this zone +* @nr_constraints: Number of constraints for this zone * @const_ops: Pointer to constraint callback structure * * Register a power zone under a given control type. A power zone must register -- cgit v1.2.3 From 8765715b4e8a1dd24ab5d507c42fc0bcd3d83f5c Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Fri, 13 Mar 2026 11:53:27 -0700 Subject: powercap: intel_rapl: Remove unused AVERAGE_POWER primitive The AVERAGE_POWER primitive and RAPL_PRIMITIVE_DERIVED flag are not used anywhere in the code. Remove them to simplify the primitive handling logic. No functional changes. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260313185333.2370733-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 6d694099a3ad..9e6bd654be1f 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -77,7 +77,6 @@ enum rapl_primitives { PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW2, /* below are not raw primitive data */ - AVERAGE_POWER, NR_RAPL_PRIMITIVES, }; -- cgit v1.2.3 From 6e39ba4e5a82aa5469b2ac517b74a71accb0540f Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 26 Mar 2026 21:44:01 +0100 Subject: cpufreq: Add boost_freq_req QoS request The Power Management Quality of Service (PM QoS) allows to aggregate constraints from multiple entities. It is currently used to manage the min/max frequency of a given policy. Frequency constraints can come for instance from: - Thermal framework: acpi_thermal_cpufreq_init() - Firmware: _PPC objects: acpi_processor_ppc_init() - User: by setting policyX/scaling_[min|max]_freq The minimum of the max frequency constraints is used to compute the resulting maximum allowed frequency. When enabling boost frequencies, the same frequency request object (policy->max_freq_req) as to handle requests from users is used. As a result, when setting: - scaling_max_freq - boost The last sysfs file used overwrites the request from the other sysfs file. To avoid this, create a per-policy boost_freq_req to save the boost constraints instead of overwriting the last scaling_max_freq constraint. policy_set_boost() calls the cpufreq set_boost callback. Update the newly added boost_freq_req request from there: - whenever boost is toggled - to cover all possible paths In the existing .set_boost() callbacks: - Don't update policy->max as this is done through the qos notifier cpufreq_notifier_max() which calls cpufreq_set_policy(). - Remove freq_qos_update_request() calls as the qos request is now done in policy_set_boost() and updates the new boost_freq_req $ ## Init state scaling_max_freq:1000000 cpuinfo_max_freq:1000000 $ echo 700000 > scaling_max_freq scaling_max_freq:700000 cpuinfo_max_freq:1000000 $ echo 1 > ../boost scaling_max_freq:1200000 cpuinfo_max_freq:1200000 $ echo 800000 > scaling_max_freq scaling_max_freq:800000 cpuinfo_max_freq:1200000 $ ## Final step: $ ## Without the patches: $ echo 0 > ../boost scaling_max_freq:1000000 cpuinfo_max_freq:1000000 $ ## With the patches: $ echo 0 > ../boost scaling_max_freq:800000 cpuinfo_max_freq:1000000 Note: cpufreq_frequency_table_cpuinfo() updates policy->min and max from: A. cpufreq_boost_set_sw() \-cpufreq_frequency_table_cpuinfo() B. cpufreq_policy_online() \-cpufreq_table_validate_and_sort() \-cpufreq_frequency_table_cpuinfo() Keep these updates as some drivers expect policy->min and max to be set through B. Reviewed-by: Lifeng Zheng Signed-off-by: Pierre Gondois Acked-by: Viresh Kumar Link: https://patch.msgid.link/20260326204404.1401849-3-pierre.gondois@arm.com Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 8ca2bcb3d7ae..b6f6c7d06912 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -81,6 +81,7 @@ struct cpufreq_policy { struct freq_constraints constraints; struct freq_qos_request *min_freq_req; struct freq_qos_request *max_freq_req; + struct freq_qos_request *boost_freq_req; struct cpufreq_frequency_table *freq_table; enum cpufreq_table_sorting freq_table_sorted; -- cgit v1.2.3 From 9266b4da051a410d9e6c5c0b0ef0c877855aa1b8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 31 Mar 2026 10:33:46 +0530 Subject: cpufreq: Allocate QoS freq_req objects with policy A recent change exposed a bug in the error path: if freq_qos_add_request(boost_freq_req) fails, min_freq_req may remain a valid pointer even though it was never successfully added. During policy teardown, this leads to an unconditional call to freq_qos_remove_request(), triggering a WARN. The current design allocates all three freq_req objects together, making the lifetime rules unclear and error handling fragile. Simplify this by allocating the QoS freq_req objects at policy allocation time. The policy itself is dynamically allocated, and two of the three requests are always needed anyway. This ensures consistent lifetime management and eliminates the inconsistent state in failure paths. Reported-by: Zhongqiu Han Fixes: 6e39ba4e5a82 ("cpufreq: Add boost_freq_req QoS request") Signed-off-by: Viresh Kumar Reviewed-by: Lifeng Zheng Tested-by: Pierre Gondois Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/a293f29d841b86c51f34699c6e717e01858d8ada.1774933424.git.viresh.kumar@linaro.org Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b6f6c7d06912..9b10eb486ece 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -79,9 +79,9 @@ struct cpufreq_policy { * called, but you're in IRQ context */ struct freq_constraints constraints; - struct freq_qos_request *min_freq_req; - struct freq_qos_request *max_freq_req; - struct freq_qos_request *boost_freq_req; + struct freq_qos_request min_freq_req; + struct freq_qos_request max_freq_req; + struct freq_qos_request boost_freq_req; struct cpufreq_frequency_table *freq_table; enum cpufreq_table_sorting freq_table_sorted; -- cgit v1.2.3 From 04bcbed4cd33495d05ba98857a748e416ab603b7 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:46 -0700 Subject: powercap: intel_rapl: Move primitive info to header for interface drivers RAPL primitive information varies across different RAPL interfaces (MSR, TPMI, MMIO). Keeping them in the common code adds no benefit, but requires interface-specific handling logic and makes the common layer unnecessarily complex. Move the primitive info infrastructure to the shared header to allow interface drivers to configure RAPL primitives. Specific changes: 1. Move struct rapl_primitive_info, enum unit_type, and PRIMITIVE_INFO_INIT macro to intel_rapl.h. 2. Change the @rpi field in struct rapl_if_priv from void * to struct rapl_primitive_info * to improve type safety and eliminate unnecessary casts. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL primitive settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-4-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 9e6bd654be1f..01f290de3586 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -137,6 +137,34 @@ struct rapl_defaults { bool spr_psys_bits; }; +#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ + .name = #p, \ + .mask = m, \ + .shift = s, \ + .id = i, \ + .unit = u, \ + .flag = f \ + } + +enum unit_type { + ARBITRARY_UNIT, /* no translation */ + POWER_UNIT, + ENERGY_UNIT, + TIME_UNIT, +}; + +/* per domain data. used to describe individual knobs such that access function + * can be consolidated into one instead of many inline functions. + */ +struct rapl_primitive_info { + const char *name; + u64 mask; + int shift; + enum rapl_domain_reg_id id; + enum unit_type unit; + u32 flag; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -152,7 +180,7 @@ struct rapl_defaults { * @write_raw: Callback for writing RAPL interface specific * registers. * @defaults: pointer to default settings - * @rpi: internal pointer to interface primitive info + * @rpi: pointer to interface primitive info */ struct rapl_if_priv { enum rapl_if_type type; @@ -164,7 +192,7 @@ struct rapl_if_priv { int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); const struct rapl_defaults *defaults; - void *rpi; + struct rapl_primitive_info *rpi; }; #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From c3bb8d4f5d802ec1a16f018e82030bccb7a053a4 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:50 -0700 Subject: powercap: intel_rapl: Consolidate PL4 and PMU support flags into rapl_defaults Currently, PL4 and MSR-based RAPL PMU support are detected using separate CPU ID tables (pl4_support_ids and pmu_support_ids) in the MSR driver probe path. This creates a maintenance burden since adding a new CPU requires updates in two places: the rapl_ids table and one or both of these capability tables. Consolidate PL4 and PMU capability information directly into struct rapl_defaults by adding msr_pl4_support and msr_pmu_support flags. This allows per-CPU capability to be expressed in a single place alongside other per-CPU defaults, eliminating the duplicate CPU ID tables entirely. No functional changes are intended. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Acked-by: Srinivas Pandruvada Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 01f290de3586..328004f605c3 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -135,6 +135,8 @@ struct rapl_defaults { unsigned int dram_domain_energy_unit; unsigned int psys_domain_energy_unit; bool spr_psys_bits; + bool msr_pl4_support; + bool msr_pmu_support; }; #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ -- cgit v1.2.3 From c03791085adcd61fa9b766ab303c7d0941d7378d Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 16 Mar 2026 08:18:49 +0000 Subject: cpufreq: Pass the policy to cpufreq_driver->adjust_perf() cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent writer(s), however amd-pstate depends on fetching the cpudata via the policy's driver data which necessitates grabbing the reference. Since schedutil governor can call "cpufreq_driver->update_perf()" during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled, fetching the policy object using the cpufreq_cpu_get() helper in the scheduler fast-path leads to "BUG: scheduling while atomic" on PREEMPT_RT [1]. Pass the cached cpufreq policy object in sg_policy to the update_perf() instead of just the CPU. The CPU can be inferred using "policy->cpu". The lifetime of cpufreq_policy object outlasts that of the governor and the cpufreq driver (allocated when the CPU is onlined and only reclaimed when the CPU is offlined / the CPU device is removed) which makes it safe to be referenced throughout the governor's lifetime. Closes:https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1] Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State") Reported-by: Bert Karwatzki Acked-by: Viresh Kumar Signed-off-by: K Prateek Nayak Acked-by: Gary Guo # Rust Reviewed-by: Gautham R. Shenoy Reviewed-by: Zhongqiu Han Link: https://lore.kernel.org/r/20260316081849.19368-3-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello (AMD) --- include/linux/cpufreq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..4317c5a312bd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -372,7 +372,7 @@ struct cpufreq_driver { * conditions) scale invariance can be disabled, which causes the * schedutil governor to fall back to the latter. */ - void (*adjust_perf)(unsigned int cpu, + void (*adjust_perf)(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); @@ -617,7 +617,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); -void cpufreq_driver_adjust_perf(unsigned int cpu, +void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); -- cgit v1.2.3