Re: [PATCH] Separate performance counter reservation from nmi watchdog

From: Björn Steinbrink
Date: Wed Jun 13 2007 - 12:46:44 EST


On 2007.06.13 03:41:36 +0200, Björn Steinbrink wrote:
> On 2007.06.12 21:07:30 +0200, Björn Steinbrink wrote:
> > On 2007.06.12 08:02:46 -0700, Stephane Eranian wrote:
> > > * the fill_in_addresses() callback for X86 invokes the NMI watchdog
> > > reserve_*_nmi() register allocation routines. This is done regardless
> > > of whether the NMI watchdog is active. When the NMI watchdog is not
> > > active, the allocator will satisfy the allocation for the first MSR
> > > of each type (counter or control), but then it will reject any
> > > request for the others. You end up working with a single
> > > counter/control register.
> >
> > Hm, ouch. I'll try to move the reservation parts into a separate system.
>
> Ok, here's the first try. The performance counter reservation system has
> been moved into its own file and separated from the nmi watchdog. Also,
> the probing rules were relaxed a bit, as they were more restrictive in
> the watchdog code than in the oprofile code.
>
> The new code never allows to reserve a performance counter or event
> selection register when the probing failed, instead of allowing one
> random register to be reserved.
>
> While moving the code around, I noticed that the PerfMon nmi watchdog
> actually reserved the wrong MSRs, as the generic reservation function
> always took the base register. As the respective attributes are no
> longer used as base regs, we can now store the correct value there and
> keep using the generic function.
>
> Being unfamiliar with the kernel init process, I simply put the probing
> call right before the nmi watchdog initialization, but that's probably
> wrong (and dependent on local APIC on i386), so I'd be glad if someone
> could point out a better location.

JFYI, this should be 2.6.22 material, as the dependency on the nmi
watchdog being probed was added by the cleanup in commit
09198e68501a7e34737cd9264d266f42429abcdc. So it's a regression over
2.6.21.

> Thanks,
> Björn
>
>
> From: Björn Steinbrink <B.Steinbrink@xxxxxx>
>
> Separate the performance counter reservation system from the nmi
> watchdog to allow usage of all performance counters even if the nmi
> watchdog is not used.
>
> Fixed the reservation of the wrong performance counter for the PerfMon
> based nmi watchdog along the way.
>
> Signed-off-by: Björn Steinbrink <B.Steinbrink@xxxxxx>
> ---
> diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
> index 67824f3..88b74e3 100644
> --- a/arch/i386/kernel/apic.c
> +++ b/arch/i386/kernel/apic.c
> @@ -1009,6 +1009,7 @@ void __devinit setup_local_APIC(void)
> value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
> apic_write_around(APIC_LVTT, value);
>
> + probe_performance_counters();
> setup_apic_nmi_watchdog(NULL);
> apic_pm_activate();
> }
> diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile
> index 74f27a4..882364d 100644
> --- a/arch/i386/kernel/cpu/Makefile
> +++ b/arch/i386/kernel/cpu/Makefile
> @@ -18,4 +18,4 @@ obj-$(CONFIG_X86_MCE) += mcheck/
> obj-$(CONFIG_MTRR) += mtrr/
> obj-$(CONFIG_CPU_FREQ) += cpufreq/
>
> -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
> +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr.o perfctr-watchdog.o
> diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c
> index 2b04c8f..6187097 100644
> --- a/arch/i386/kernel/cpu/perfctr-watchdog.c
> +++ b/arch/i386/kernel/cpu/perfctr-watchdog.c
> @@ -8,9 +8,7 @@
> Original code for K7/P6 written by Keith Owens */
>
> #include <linux/percpu.h>
> -#include <linux/module.h>
> #include <linux/kernel.h>
> -#include <linux/bitops.h>
> #include <linux/smp.h>
> #include <linux/nmi.h>
> #include <asm/apic.h>
> @@ -36,105 +34,8 @@ struct wd_ops {
>
> static struct wd_ops *wd_ops;
>
> -/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
> - * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
> - */
> -#define NMI_MAX_COUNTER_BITS 66
> -
> -/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
> - * evtsel_nmi_owner tracks the ownership of the event selection
> - * - different performance counters/ event selection may be reserved for
> - * different subsystems this reservation system just tries to coordinate
> - * things a little
> - */
> -static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
> -static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
> -
> static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
>
> -/* converts an msr to an appropriate reservation bit */
> -static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
> -{
> - return wd_ops ? msr - wd_ops->perfctr : 0;
> -}
> -
> -/* converts an msr to an appropriate reservation bit */
> -/* returns the bit offset of the event selection register */
> -static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
> -{
> - return wd_ops ? msr - wd_ops->evntsel : 0;
> -}
> -
> -/* checks for a bit availability (hack for oprofile) */
> -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
> -{
> - BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> -
> - return (!test_bit(counter, perfctr_nmi_owner));
> -}
> -
> -/* checks the an msr for availability */
> -int avail_to_resrv_perfctr_nmi(unsigned int msr)
> -{
> - unsigned int counter;
> -
> - counter = nmi_perfctr_msr_to_bit(msr);
> - BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> -
> - return (!test_bit(counter, perfctr_nmi_owner));
> -}
> -
> -int reserve_perfctr_nmi(unsigned int msr)
> -{
> - unsigned int counter;
> -
> - counter = nmi_perfctr_msr_to_bit(msr);
> - BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> -
> - if (!test_and_set_bit(counter, perfctr_nmi_owner))
> - return 1;
> - return 0;
> -}
> -
> -void release_perfctr_nmi(unsigned int msr)
> -{
> - unsigned int counter;
> -
> - counter = nmi_perfctr_msr_to_bit(msr);
> - BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> -
> - clear_bit(counter, perfctr_nmi_owner);
> -}
> -
> -int reserve_evntsel_nmi(unsigned int msr)
> -{
> - unsigned int counter;
> -
> - counter = nmi_evntsel_msr_to_bit(msr);
> - BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> -
> - if (!test_and_set_bit(counter, evntsel_nmi_owner))
> - return 1;
> - return 0;
> -}
> -
> -void release_evntsel_nmi(unsigned int msr)
> -{
> - unsigned int counter;
> -
> - counter = nmi_evntsel_msr_to_bit(msr);
> - BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> -
> - clear_bit(counter, evntsel_nmi_owner);
> -}
> -
> -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
> -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
> -EXPORT_SYMBOL(reserve_perfctr_nmi);
> -EXPORT_SYMBOL(release_perfctr_nmi);
> -EXPORT_SYMBOL(reserve_evntsel_nmi);
> -EXPORT_SYMBOL(release_evntsel_nmi);
> -
> void disable_lapic_nmi_watchdog(void)
> {
> BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
> @@ -568,8 +469,8 @@ static struct wd_ops intel_arch_wd_ops = {
> .setup = setup_intel_arch_watchdog,
> .rearm = p6_rearm,
> .stop = single_msr_stop_watchdog,
> - .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
> - .evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
> + .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
> + .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
> };
>
> static void probe_nmi_watchdog(void)
> diff --git a/arch/i386/kernel/cpu/perfctr.c b/arch/i386/kernel/cpu/perfctr.c
> new file mode 100644
> index 0000000..63e68a3
> --- /dev/null
> +++ b/arch/i386/kernel/cpu/perfctr.c
> @@ -0,0 +1,175 @@
> +#include <linux/bitops.h>
> +#include <linux/module.h>
> +#include <asm/msr-index.h>
> +#include <asm/intel_arch_perfmon.h>
> +
> +struct perfctr_base_regs {
> + unsigned int perfctr;
> + unsigned int evntsel;
> +};
> +
> +static struct perfctr_base_regs *perfctr_base_regs;
> +
> +static struct perfctr_base_regs k7_base_regs = {
> + .perfctr = MSR_K7_PERFCTR0,
> + .evntsel = MSR_K7_EVNTSEL0
> +};
> +
> +static struct perfctr_base_regs p4_base_regs = {
> + .perfctr = MSR_P4_BPU_PERFCTR0,
> + .evntsel = MSR_P4_BSU_ESCR0
> +};
> +
> +static struct perfctr_base_regs p6_base_regs = {
> + .perfctr = MSR_P6_PERFCTR0,
> + .evntsel = MSR_P6_EVNTSEL0
> +};
> +
> +static struct perfctr_base_regs arch_perfmon_base_regs = {
> + .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
> + .evntsel = MSR_ARCH_PERFMON_EVENTSEL0
> +};
> +
> +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
> + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
> + */
> +#define NMI_MAX_COUNTER_BITS 66
> +
> +/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
> + * evtsel_nmi_owner tracks the ownership of the event selection
> + * - different performance counters/ event selection may be reserved for
> + * different subsystems this reservation system just tries to coordinate
> + * things a little
> + */
> +static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
> +static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
> +
> +void __devinit probe_performance_counters(void)
> +{
> + switch (boot_cpu_data.x86_vendor) {
> + case X86_VENDOR_AMD:
> + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
> + boot_cpu_data.x86 != 16)
> + return;
> +
> + perfctr_base_regs = &k7_base_regs;
> + break;
> +
> + case X86_VENDOR_INTEL:
> + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
> + perfctr_base_regs = &arch_perfmon_base_regs;
> + break;
> + }
> + switch (boot_cpu_data.x86) {
> + case 6:
> + perfctr_base_regs = &p6_base_regs;
> + break;
> +
> + case 15:
> + perfctr_base_regs = &p4_base_regs;
> + break;
> + }
> + break;
> + }
> +}
> +
> +/* converts an msr to an appropriate reservation bit */
> +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
> +{
> + return msr - perfctr_base_regs->perfctr;
> +}
> +
> +/* converts an msr to an appropriate reservation bit */
> +/* returns the bit offset of the event selection register */
> +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
> +{
> + return msr - perfctr_base_regs->evntsel;
> +}
> +
> +/* checks for a bit availability (hack for oprofile) */
> +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
> +{
> + if (!perfctr_base_regs)
> + return 0;
> +
> + BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> +
> + return (!test_bit(counter, perfctr_nmi_owner));
> +}
> +
> +/* checks the an msr for availability */
> +int avail_to_resrv_perfctr_nmi(unsigned int msr)
> +{
> + unsigned int counter;
> +
> + if (!perfctr_base_regs)
> + return 0;
> +
> + counter = nmi_perfctr_msr_to_bit(msr);
> + BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> +
> + return (!test_bit(counter, perfctr_nmi_owner));
> +}
> +
> +int reserve_perfctr_nmi(unsigned int msr)
> +{
> + unsigned int counter;
> +
> + if (!perfctr_base_regs)
> + return 0;
> +
> + counter = nmi_perfctr_msr_to_bit(msr);
> + BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> +
> + if (!test_and_set_bit(counter, perfctr_nmi_owner))
> + return 1;
> + return 0;
> +}
> +
> +void release_perfctr_nmi(unsigned int msr)
> +{
> + unsigned int counter;
> +
> + if (!perfctr_base_regs)
> + return 0;
> +
> + counter = nmi_perfctr_msr_to_bit(msr);
> + BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> +
> + clear_bit(counter, perfctr_nmi_owner);
> +}
> +
> +int reserve_evntsel_nmi(unsigned int msr)
> +{
> + unsigned int counter;
> +
> + if (!perfctr_base_regs)
> + return 0;
> +
> + counter = nmi_evntsel_msr_to_bit(msr);
> + BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> +
> + if (!test_and_set_bit(counter, evntsel_nmi_owner))
> + return 1;
> + return 0;
> +}
> +
> +void release_evntsel_nmi(unsigned int msr)
> +{
> + unsigned int counter;
> +
> + if (!perfctr_base_regs)
> + return 0;
> +
> + counter = nmi_evntsel_msr_to_bit(msr);
> + BUG_ON(counter > NMI_MAX_COUNTER_BITS);
> +
> + clear_bit(counter, evntsel_nmi_owner);
> +}
> +
> +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
> +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
> +EXPORT_SYMBOL(reserve_perfctr_nmi);
> +EXPORT_SYMBOL(release_perfctr_nmi);
> +EXPORT_SYMBOL(reserve_evntsel_nmi);
> +EXPORT_SYMBOL(release_evntsel_nmi);
> diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
> index de1de8a..cc7587d 100644
> --- a/arch/x86_64/kernel/Makefile
> +++ b/arch/x86_64/kernel/Makefile
> @@ -9,7 +9,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
> x8664_ksyms.o i387.o syscall.o vsyscall.o \
> setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
> pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \
> - perfctr-watchdog.o
> + perfctr.o perfctr-watchdog.o
>
> obj-$(CONFIG_STACKTRACE) += stacktrace.o
> obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o
> @@ -60,4 +60,5 @@ i8237-y += ../../i386/kernel/i8237.o
> msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
> alternative-y += ../../i386/kernel/alternative.o
> pcspeaker-y += ../../i386/kernel/pcspeaker.o
> +perfctr-y += ../../i386/kernel/cpu/perfctr.o
> perfctr-watchdog-y += ../../i386/kernel/cpu/perfctr-watchdog.o
> diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
> index 1b0e07b..892ebf8 100644
> --- a/arch/x86_64/kernel/apic.c
> +++ b/arch/x86_64/kernel/apic.c
> @@ -453,6 +453,7 @@ void __cpuinit setup_local_APIC (void)
> oldvalue, value);
> }
>
> + probe_performance_counters();
> nmi_watchdog_default();
> setup_apic_nmi_watchdog(NULL);
> apic_pm_activate();
> diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h
> index fb1e133..736af6f 100644
> --- a/include/asm-i386/nmi.h
> +++ b/include/asm-i386/nmi.h
> @@ -18,6 +18,7 @@
> int do_nmi_callback(struct pt_regs *regs, int cpu);
>
> extern int nmi_watchdog_enabled;
> +extern void probe_performance_counters(void);
> extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
> extern int avail_to_resrv_perfctr_nmi(unsigned int);
> extern int reserve_perfctr_nmi(unsigned int);
> diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
> index d0a7f53..d45fc62 100644
> --- a/include/asm-x86_64/nmi.h
> +++ b/include/asm-x86_64/nmi.h
> @@ -46,6 +46,7 @@ extern int unknown_nmi_panic;
> extern int nmi_watchdog_enabled;
>
> extern int check_nmi_watchdog(void);
> +extern void probe_performance_counters(void);
> extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
> extern int avail_to_resrv_perfctr_nmi(unsigned int);
> extern int reserve_perfctr_nmi(unsigned int);
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/