[RFT/PATCH v2 5/6] x86-64: Move vread_tsc into a new file with sensible options

From: Andy Lutomirski
Date: Wed Apr 06 2011 - 22:05:15 EST


vread_tsc is short and hot, and it's userspace code, so the usual
reasons to keep frame pointers around, enable -pg, and turn off
sibling calls don't apply.

(OK, turning off sibling calls has no effect. But it might
someday...)

As an added benefit, tsc.c is profilable now.
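
To make the "userspace" point concrete: vread_tsc only ever runs via
the vDSO / vsyscall fast path of ordinary libc time calls, in user
mode, in the calling task.  For example (illustration only, not part
of the patch), with the TSC clocksource selected the fast path of the
call below never enters the kernel and ends up executing vread_tsc:

	#include <stdio.h>
	#include <sys/time.h>

	int main(void)
	{
		struct timeval tv;

		/* vDSO/vsyscall fast path -> vread_tsc, no syscall */
		gettimeofday(&tv, NULL);
		printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
		return 0;
	}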

Signed-off-by: Andy Lutomirski <luto@xxxxxxx>
---
arch/x86/include/asm/tsc.h     |  4 +++
arch/x86/kernel/Makefile       |  8 +++--
arch/x86/kernel/tsc.c          | 53 --------------------------------------
arch/x86/kernel/vread_tsc_64.c | 55 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 64 insertions(+), 56 deletions(-)
create mode 100644 arch/x86/kernel/vread_tsc_64.c
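
One note on the inline asm being moved (this patch does not change
it): besides assembling the 64-bit counter, it leaves %rdx holding a
value that is always zero but, as far as the CPU can tell, depends on
the rdtsc result; cycle_last is then loaded from an address offset by
that zero, so the load cannot be hoisted above the rdtsc.  Roughly, it
computes the following (hypothetical helper, illustration only).  The
real code has to keep all of it inside one asm statement: written as
plain C like this, the compiler can see that 'zero' is always 0 and
would discard the dependency.

	#include <linux/types.h>

	/* Illustration only, not in the tree. */
	static inline u64 rdtsc_and_fake_zero(u64 *zero)
	{
		unsigned int lo, hi;

		asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
		*zero = ((u64)hi << 32) << 32;	/* always 0, but data-dependent on rdtsc */
		return ((u64)hi << 32) | lo;	/* the full 64-bit counter */
	}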

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132f..8f2b3c6 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern unsigned long native_calibrate_tsc(void);

+#ifdef CONFIG_X86_64
+extern cycles_t vread_tsc(void);
+#endif
+
/*
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2..7626fb8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)

ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
-CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
CFLAGS_REMOVE_pvclock.o = -pg
@@ -24,13 +23,16 @@ endif
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
-CFLAGS_tsc.o := $(nostackp)
+CFLAGS_vread_tsc_64.o := $(nostackp)
CFLAGS_paravirt.o := $(nostackp)
GCOV_PROFILE_vsyscall_64.o := n
GCOV_PROFILE_hpet.o := n
GCOV_PROFILE_tsc.o := n
GCOV_PROFILE_paravirt.o := n

+# vread_tsc_64 is hot and should be fully optimized:
+CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-omit-frame-pointer -fno-optimize-sibling-calls
+
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
@@ -39,7 +41,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 69ff619..5346381 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,59 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
		ret : clocksource_tsc.cycle_last;
}

-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
-	cycle_t ret;
-	u64 zero, last;
-
-	/*
-	 * rdtsc is unordered, and we want it to be ordered like
-	 * a load with respect to other CPUs (and we don't want
-	 * it to execute absurdly early wrt code on this CPU).
-	 * rdtsc_barrier() is a barrier that provides this ordering
-	 * with respect to *earlier* loads. (Which barrier to use
-	 * depends on the CPU.)
-	 */
-	rdtsc_barrier();
-
-	asm volatile ("rdtsc\n\t"
-		      "shl $0x20,%%rdx\n\t"
-		      "or %%rdx,%%rax\n\t"
-		      "shl $0x20,%%rdx"
-		      : "=a" (ret), "=d" (zero) : : "cc");
-
-	/*
-	 * zero == 0, but as far as the processor is concerned, zero
-	 * depends on the output of rdtsc. So we can use it as a
-	 * load barrier by loading something that depends on it.
-	 * x86-64 keeps all loads in order wrt each other, so this
-	 * ensures that rdtsc is ordered wrt all later loads.
-	 */
-
-	/*
-	 * This doesn't multiply 'zero' by anything, which generates
-	 * very slightly nicer code than multiplying it by 8.
-	 */
-	last = *( (cycle_t *)
-		  ((char *)&VVAR(vsyscall_gtod_data).clock.cycle_last + zero) );
-
-	if (likely(ret >= last))
-		return ret;
-
-	/*
-	 * GCC likes to generate cmov here, but this branch is extremely
-	 * predictable (it's just a funciton of time and the likely is
-	 * very likely) and there's a data dependence, so force GCC
-	 * to generate a branch instead. I don't barrier() because
-	 * we don't actually need a barrier, and if this function
-	 * ever gets inlined it will generate worse code.
-	 */
-	asm volatile ("");
-	return last;
-}
-#endif
-
static void resume_tsc(struct clocksource *cs)
{
	clocksource_tsc.cycle_last = 0;
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 0000000..856330e
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,55 @@
+/* This code runs in userspace. */
+
+#define DISABLE_BRANCH_PROFILING
+#include <asm/vgtod.h>
+
+notrace cycle_t __vsyscall_fn vread_tsc(void)
+{
+	cycle_t ret;
+	u64 zero, last;
+
+	/*
+	 * rdtsc is unordered, and we want it to be ordered like
+	 * a load with respect to other CPUs (and we don't want
+	 * it to execute absurdly early wrt code on this CPU).
+	 * rdtsc_barrier() is a barrier that provides this ordering
+	 * with respect to *earlier* loads. (Which barrier to use
+	 * depends on the CPU.)
+	 */
+	rdtsc_barrier();
+
+	asm volatile ("rdtsc\n\t"
+		      "shl $0x20,%%rdx\n\t"
+		      "or %%rdx,%%rax\n\t"
+		      "shl $0x20,%%rdx"
+		      : "=a" (ret), "=d" (zero) : : "cc");
+
+	/*
+	 * zero == 0, but as far as the processor is concerned, zero
+	 * depends on the output of rdtsc. So we can use it as a
+	 * load barrier by loading something that depends on it.
+	 * x86-64 keeps all loads in order wrt each other, so this
+	 * ensures that rdtsc is ordered wrt all later loads.
+	 */
+
+	/*
+	 * This doesn't multiply 'zero' by anything, which generates
+	 * very slightly nicer code than multiplying it by 8.
+	 */
+	last = *( (cycle_t *)
+		  ((char *)&VVAR(vsyscall_gtod_data).clock.cycle_last + zero) );
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a function of time and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead. I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
--
1.7.4
