vDSO for ppc64 : Preliminary release #3

From: Benjamin Herrenschmidt
Date: Thu Sep 09 2004 - 03:52:09 EST


Here's a new drop of the vDSO, patch for ppc64 against the current bk.

Lots of changes since last version. It actually provides a working .so
to userland that glibc can use provided it's patched properly (see
note below). The only functions provided at this point are the 32 and
64 bits versions of the fully-userland gettimeofday in addition to
the signal trampolines. I'm now working on adding more.

The vDSO is no longer above the stack. It's now mapped at 0x100000,
still using a real VMA though, read only by default but ptrace and
mprotect can be used. (gdb breakpoints do work).

Many thanks to Alan Modra for his help at various stages of the
process.

What is missing is:

- more functions, especially the cpu-specific ones. on the todo list
for those are: uname, get_syscall_bitmap() (arjan suggestion), cache
sync (dcache/icache flush), memcpy, memset, atomics & locks. Note that
we may do an early release with not all the functions in.
- cfi infos for the signal trampoline (beeing done)
- definition of a phdr allowing userland apps to request the
vdso to either not exist or be mapped at a different address
- possible issues with Makefiles to be ironed out

About glibc:

Without glibc changes, the vdso patch will give you the signal
trampolines out of the stack at least. Currently, there are several
changes required to glibc, I don't have a clean patch to provide yet
and it's still not clear what Ulrich Drepper will accept or not.

- First, we need to enable using a vdso when available on ppc &
ppc64. This is rather simple, I expect Ulrich to commit a patch for
that soon. The only issue is that we don't use the vdso for syscalls,
so some stuff related to that doesn't exist on ppc and compilation
fails in a couple of places without some changes for which Ulrich will
find the right solution I hope.

- There seem to be a small bug in the ld.so code that deals with the
vdso regarding the way it calculates the map & text size, I'll have a
patch for that.

- The current glibc code for dealing with vdso's is not completely
appropriate for ppc64 in particular since we do need relocations to be
performed on the OPD section (thanks mprotect + COW, it actually works)
if the library is ever mapped at a differnet address than it's native
0x100000 (via the new phdr for example).
The current glibc code forces l_relocated to 1 for all vdso's (which is
fine for archs without need to relocate function descriptors).
It is not clear what to do here. I would like the code to set l_relocated
to 1 only if l_addr is 0 (native address) but that _might_ trigger some
overhead of checking relocations on archs that don't care. We may want to
add an arch macro hook in there, maybe a DL_POST_PROCESS_VDSO or so ...

- We need to "hook" some glibc functions to call the vDSO for things
that are provided by. I'll let glibc experts decide how to do that (for
example have it call the vdso gettimeofday instead of the syscall, etc...)


In the meantime, here's the latest drop, comments welcome:

diff -urN linux-2.5/arch/ppc64/kernel/Makefile linux-vdso/arch/ppc64/kernel/Makefile
--- linux-2.5/arch/ppc64/kernel/Makefile 2004-08-26 15:46:30.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/Makefile 2004-09-07 18:25:37.000000000 +1000
@@ -11,7 +11,7 @@
udbg.o binfmt_elf32.o sys_ppc32.o ioctl32.o \
ptrace32.o signal32.o rtc.o init_task.o \
lmb.o cputable.o cpu_setup_power4.o idle_power4.o \
- iommu.o sysfs.o vio.o
+ iommu.o sysfs.o vio.o vdso.o

obj-$(CONFIG_PPC_OF) += of_device.o

diff -urN linux-2.5/arch/ppc64/kernel/asm-offsets.c linux-vdso/arch/ppc64/kernel/asm-offsets.c
--- linux-2.5/arch/ppc64/kernel/asm-offsets.c 2004-09-09 17:33:52.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/asm-offsets.c 2004-09-09 17:56:36.000000000 +1000
@@ -22,6 +22,7 @@
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/time.h>
#include <linux/hardirq.h>
#include <asm/io.h>
#include <asm/page.h>
@@ -36,6 +37,8 @@
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/cputable.h>
+#include <asm/systemcfg.h>
+#include <asm/compat.h>

#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -170,5 +173,20 @@
DEFINE(CPU_SPEC_FEATURES, offsetof(struct cpu_spec, cpu_features));
DEFINE(CPU_SPEC_SETUP, offsetof(struct cpu_spec, cpu_setup));

+ /* systemcfg offsets for use by vdso */
+ DEFINE(CFG_TB_ORIG_STAMP, offsetof(struct systemcfg, tb_orig_stamp));
+ DEFINE(CFG_TB_TICKS_PER_SEC, offsetof(struct systemcfg, tb_ticks_per_sec));
+ DEFINE(CFG_TB_TO_XS, offsetof(struct systemcfg, tb_to_xs));
+ DEFINE(CFG_STAMP_XSEC, offsetof(struct systemcfg, stamp_xsec));
+ DEFINE(CFG_TB_UPDATE_COUNT, offsetof(struct systemcfg, tb_update_count));
+ DEFINE(CFG_TZ_MINUTEWEST, offsetof(struct systemcfg, tz_minuteswest));
+ DEFINE(CFG_TZ_DSTTIME, offsetof(struct systemcfg, tz_dsttime));
+ DEFINE(TVAL64_TV_SEC, offsetof(struct timeval, tv_sec));
+ DEFINE(TVAL64_TV_USEC, offsetof(struct timeval, tv_usec));
+ DEFINE(TVAL32_TV_SEC, offsetof(struct compat_timeval, tv_sec));
+ DEFINE(TVAL32_TV_USEC, offsetof(struct compat_timeval, tv_usec));
+ DEFINE(TZONE_TZ_MINWEST, offsetof(struct timezone, tz_minuteswest));
+ DEFINE(TZONE_TZ_DSTTIME, offsetof(struct timezone, tz_dsttime));
+
return 0;
}
diff -urN linux-2.5/arch/ppc64/kernel/signal.c linux-vdso/arch/ppc64/kernel/signal.c
--- linux-2.5/arch/ppc64/kernel/signal.c 2004-08-26 15:46:30.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/signal.c 2004-09-07 18:25:41.000000000 +1000
@@ -34,6 +34,7 @@
#include <asm/ppcdebug.h>
#include <asm/unistd.h>
#include <asm/cacheflush.h>
+#include <asm/vdso.h>

#define DEBUG_SIG 0

@@ -412,10 +413,14 @@
goto badframe;

/* Set up to return from userspace. */
- err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
- if (err)
- goto badframe;
-
+ if (vdso64_rt_sigtramp && current->thread.vdso_base) {
+ regs->link = current->thread.vdso_base + vdso64_rt_sigtramp;
+ } else {
+ err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
+ if (err)
+ goto badframe;
+ regs->link = (unsigned long) &frame->tramp[0];
+ }
funct_desc_ptr = (func_descr_t __user *) ka->sa.sa_handler;

/* Allocate a dummy caller frame for the signal handler. */
@@ -424,7 +429,6 @@

/* Set up "regs" so we "return" to the signal handler. */
err |= get_user(regs->nip, &funct_desc_ptr->entry);
- regs->link = (unsigned long) &frame->tramp[0];
regs->gpr[1] = newsp;
err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
regs->gpr[3] = signr;
diff -urN linux-2.5/arch/ppc64/kernel/signal32.c linux-vdso/arch/ppc64/kernel/signal32.c
--- linux-2.5/arch/ppc64/kernel/signal32.c 2004-08-26 15:46:30.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/signal32.c 2004-09-09 17:05:06.000000000 +1000
@@ -30,6 +30,7 @@
#include <asm/ppcdebug.h>
#include <asm/unistd.h>
#include <asm/cacheflush.h>
+#include <asm/vdso.h>

#define DEBUG_SIG 0

@@ -677,18 +678,24 @@

/* Save user registers on the stack */
frame = &rt_sf->uc.uc_mcontext;
- if (save_user_regs(regs, frame, __NR_rt_sigreturn))
- goto badframe;
-
if (put_user(regs->gpr[1], (unsigned long __user *)newsp))
goto badframe;
+
+ if (vdso32_rt_sigtramp && current->thread.vdso_base) {
+ if (save_user_regs(regs, frame, 0))
+ goto badframe;
+ regs->link = current->thread.vdso_base + vdso32_rt_sigtramp;
+ } else {
+ if (save_user_regs(regs, frame, __NR_rt_sigreturn))
+ goto badframe;
+ regs->link = (unsigned long) frame->tramp;
+ }
regs->gpr[1] = (unsigned long) newsp;
regs->gpr[3] = sig;
regs->gpr[4] = (unsigned long) &rt_sf->info;
regs->gpr[5] = (unsigned long) &rt_sf->uc;
regs->gpr[6] = (unsigned long) rt_sf;
regs->nip = (unsigned long) ka->sa.sa_handler;
- regs->link = (unsigned long) frame->tramp;
regs->trap = 0;
regs->result = 0;

@@ -842,8 +849,15 @@
|| __put_user(sig, &sc->signal))
goto badframe;

- if (save_user_regs(regs, &frame->mctx, __NR_sigreturn))
- goto badframe;
+ if (vdso32_sigtramp && current->thread.vdso_base) {
+ if (save_user_regs(regs, &frame->mctx, 0))
+ goto badframe;
+ regs->link = current->thread.vdso_base + vdso32_sigtramp;
+ } else {
+ if (save_user_regs(regs, &frame->mctx, __NR_sigreturn))
+ goto badframe;
+ regs->link = (unsigned long) frame->mctx.tramp;
+ }

if (put_user(regs->gpr[1], (unsigned long __user *)newsp))
goto badframe;
@@ -851,7 +865,6 @@
regs->gpr[3] = sig;
regs->gpr[4] = (unsigned long) sc;
regs->nip = (unsigned long) ka->sa.sa_handler;
- regs->link = (unsigned long) frame->mctx.tramp;
regs->trap = 0;
regs->result = 0;

diff -urN linux-2.5/arch/ppc64/kernel/smp.c linux-vdso/arch/ppc64/kernel/smp.c
--- linux-2.5/arch/ppc64/kernel/smp.c 2004-09-09 17:33:52.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/smp.c 2004-09-09 17:56:36.000000000 +1000
@@ -784,7 +784,7 @@
* For now we leave it which means the time can be some
* number of msecs off until someone does a settimeofday()
*/
- do_gtod.tb_orig_stamp = tb_last_stamp;
+ do_gtod.varp->tb_orig_stamp = tb_last_stamp;
systemcfg->tb_orig_stamp = tb_last_stamp;
#endif

diff -urN linux-2.5/arch/ppc64/kernel/time.c linux-vdso/arch/ppc64/kernel/time.c
--- linux-2.5/arch/ppc64/kernel/time.c 2004-09-06 16:43:57.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/time.c 2004-09-09 16:43:21.000000000 +1000
@@ -158,6 +158,51 @@
}
}

+/*
+ * When the timebase - tb_orig_stamp gets too big, we do a manipulation
+ * between tb_orig_stamp and stamp_xsec. The goal here is to keep the
+ * difference tb - tb_orig_stamp small enough to always fit inside a
+ * 32 bits number. This is a requirement of our fast 32 bits userland
+ * implementation in the vdso. If we "miss" a call to this function
+ * (interrupt latency, CPU locked in a spinlock, ...) and we end up
+ * with a too big difference, then the vdso will fallback to calling
+ * the syscall
+ */
+static __inline__ void timer_recalc_offset( unsigned long cur_tb )
+{
+ struct gettimeofday_vars * temp_varp;
+ unsigned temp_idx;
+ unsigned long offset, new_stamp_xsec, new_tb_orig_stamp;
+
+ if (((cur_tb - do_gtod.varp->tb_orig_stamp) & 0x80000000u) == 0)
+ return;
+
+ if (do_gtod.var_idx == 0) {
+ temp_varp = &do_gtod.vars[1];
+ temp_idx = 1;
+ } else {
+ temp_varp = &do_gtod.vars[0];
+ temp_idx = 0;
+ }
+
+ new_tb_orig_stamp = cur_tb;
+ offset = new_tb_orig_stamp - do_gtod.varp->tb_orig_stamp;
+ new_stamp_xsec = do_gtod.varp->stamp_xsec + mulhdu(offset, do_gtod.varp->tb_to_xs);
+
+ temp_varp->tb_orig_stamp = new_tb_orig_stamp;
+ temp_varp->stamp_xsec = new_stamp_xsec;
+ mb();
+ do_gtod.varp = temp_varp;
+ do_gtod.var_idx = temp_idx;
+
+ ++(systemcfg->tb_update_count);
+ wmb();
+ systemcfg->tb_orig_stamp = new_tb_orig_stamp;
+ systemcfg->stamp_xsec = new_stamp_xsec;
+ wmb();
+ ++(systemcfg->tb_update_count);
+}
+
#ifdef CONFIG_SMP
unsigned long profile_pc(struct pt_regs *regs)
{
@@ -278,6 +323,7 @@
write_seqlock(&xtime_lock);
tb_last_stamp = lpaca->next_jiffy_update_tb;
do_timer(regs);
+ timer_recalc_offset( cur_tb );
timer_sync_xtime( cur_tb );
timer_check_rtc();
write_sequnlock(&xtime_lock);
@@ -331,8 +377,8 @@
* if done in units of 1/2^20 rather than microseconds.
* The conversion to microseconds at the end is done
* without a divide (and in fact, without a multiply) */
- tb_ticks = get_tb() - do_gtod.tb_orig_stamp;
temp_varp = do_gtod.varp;
+ tb_ticks = get_tb() - temp_varp->tb_orig_stamp;
temp_tb_to_xs = temp_varp->tb_to_xs;
temp_stamp_xsec = temp_varp->stamp_xsec;
tb_xsec = mulhdu( tb_ticks, temp_tb_to_xs );
@@ -394,7 +440,9 @@
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;

- delta_xsec = mulhdu( (tb_last_stamp-do_gtod.tb_orig_stamp), do_gtod.varp->tb_to_xs );
+ delta_xsec = mulhdu( (tb_last_stamp-do_gtod.varp->tb_orig_stamp),
+ do_gtod.varp->tb_to_xs );
+
new_xsec = (new_nsec * XSEC_PER_SEC) / NSEC_PER_SEC;
new_xsec += new_sec * XSEC_PER_SEC;
if ( new_xsec > delta_xsec ) {
@@ -407,7 +455,7 @@
* before 1970 ... eg. we booted ten days ago, and we are setting
* the time to Jan 5, 1970 */
do_gtod.varp->stamp_xsec = new_xsec;
- do_gtod.tb_orig_stamp = tb_last_stamp;
+ do_gtod.varp->tb_orig_stamp = tb_last_stamp;
systemcfg->stamp_xsec = new_xsec;
systemcfg->tb_orig_stamp = tb_last_stamp;
}
@@ -510,9 +558,9 @@
xtime.tv_sec = mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
tm.tm_hour, tm.tm_min, tm.tm_sec);
tb_last_stamp = get_tb();
- do_gtod.tb_orig_stamp = tb_last_stamp;
do_gtod.varp = &do_gtod.vars[0];
do_gtod.var_idx = 0;
+ do_gtod.varp->tb_orig_stamp = tb_last_stamp;
do_gtod.varp->stamp_xsec = xtime.tv_sec * XSEC_PER_SEC;
do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
do_gtod.varp->tb_to_xs = tb_to_xs;
@@ -630,12 +678,12 @@
stamp_xsec which is the time (in 1/2^20 second units) corresponding to tb_orig_stamp. This
new value of stamp_xsec compensates for the change in frequency (implied by the new tb_to_xs)
which guarantees that the current time remains the same */
- tb_ticks = get_tb() - do_gtod.tb_orig_stamp;
+ write_seqlock_irqsave( &xtime_lock, flags );
+ tb_ticks = get_tb() - do_gtod.varp->tb_orig_stamp;
div128_by_32( 1024*1024, 0, new_tb_ticks_per_sec, &divres );
new_tb_to_xs = divres.result_low;
new_xsec = mulhdu( tb_ticks, new_tb_to_xs );

- write_seqlock_irqsave( &xtime_lock, flags );
old_xsec = mulhdu( tb_ticks, do_gtod.varp->tb_to_xs );
new_stamp_xsec = do_gtod.varp->stamp_xsec + old_xsec - new_xsec;

diff -urN linux-2.5/arch/ppc64/kernel/vdso.c linux-vdso/arch/ppc64/kernel/vdso.c
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso.c 2004-09-07 19:11:29.000000000 +1000
@@ -0,0 +1,468 @@
+/*
+ * linux/arch/ppc64/kernel/vdso.c
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp.
+ * <benh@xxxxxxxxxxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/security.h>
+#include <linux/bootmem.h>
+
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/vdso.h>
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define DBG(fmt...) printk(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+
+/*
+ * The vDSOs themselves are here
+ */
+extern char vdso64_start, vdso64_end;
+extern char vdso32_start, vdso32_end;
+
+static void *vdso64_kbase = &vdso64_start;
+static void *vdso32_kbase = &vdso32_start;
+
+unsigned int vdso64_pages;
+unsigned int vdso32_pages;
+
+/* Signal trampolines user addresses */
+
+unsigned long vdso64_rt_sigtramp;
+unsigned long vdso32_sigtramp;
+unsigned long vdso32_rt_sigtramp;
+
+/*
+ * Some infos carried around for each of them during parsing at
+ * boot time.
+ */
+struct lib32_elfinfo
+{
+ Elf32_Ehdr *hdr; /* ptr to ELF */
+ Elf32_Sym *dynsym; /* ptr to .dynsym section */
+ unsigned long dynsymsize; /* size of .dynsym section */
+ char *dynstr; /* ptr to .dynstr section */
+ unsigned long text; /* offset of .text section in .so */
+};
+
+struct lib64_elfinfo
+{
+ Elf64_Ehdr *hdr;
+ Elf64_Sym *dynsym;
+ unsigned long dynsymsize;
+ char *dynstr;
+ unsigned long text;
+};
+
+
+#ifdef __DEBUG
+static void dump_one_vdso_page(struct page *pg, struct page *upg)
+{
+ printk("kpg: %p (c:%d,f:%08lx)", __va(page_to_pfn(pg) << PAGE_SHIFT),
+ page_count(pg),
+ pg->flags);
+ if (upg/* && pg != upg*/) {
+ printk(" upg: %p (c:%d,f:%08lx)", __va(page_to_pfn(upg) << PAGE_SHIFT),
+ page_count(upg),
+ upg->flags);
+ }
+ printk("\n");
+}
+
+static void dump_vdso_pages(struct vm_area_struct * vma)
+{
+ int i;
+
+ if (!vma || test_thread_flag(TIF_32BIT)) {
+ printk("vDSO32 @ %016lx:\n", (unsigned long)vdso32_kbase);
+ for (i=0; i<vdso32_pages; i++) {
+ struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
+ struct page *upg = (vma && vma->vm_mm) ?
+ follow_page(vma->vm_mm, vma->vm_start + i*PAGE_SIZE, 0)
+ : NULL;
+ dump_one_vdso_page(pg, upg);
+ }
+ }
+ if (!vma || !test_thread_flag(TIF_32BIT)) {
+ printk("vDSO64 @ %016lx:\n", (unsigned long)vdso64_kbase);
+ for (i=0; i<vdso64_pages; i++) {
+ struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
+ struct page *upg = (vma && vma->vm_mm) ?
+ follow_page(vma->vm_mm, vma->vm_start + i*PAGE_SIZE, 0)
+ : NULL;
+ dump_one_vdso_page(pg, upg);
+ }
+ }
+}
+#endif /* DEBUG */
+
+/*
+ * Keep a dummy vma_close for now, it will prevent VMA merging.
+ */
+static void vdso_vma_close(struct vm_area_struct * vma)
+{
+}
+
+/*
+ * Our nopage() function, maps in the actual vDSO kernel pages, they will
+ * be mapped read-only by do_no_page(), and eventually COW'ed, either
+ * right away for an initial write access, or by do_wp_page().
+ */
+static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
+ unsigned long address, int *type)
+{
+ unsigned long offset = address - vma->vm_start;
+ struct page *pg;
+ void *vbase = test_thread_flag(TIF_32BIT) ? vdso32_kbase : vdso64_kbase;
+
+ DBG("vdso_vma_nopage(current: %s, address: %016lx, off: %lx)\n",
+ current->comm, address, offset);
+
+ if (address < vma->vm_start || address > vma->vm_end)
+ return NOPAGE_SIGBUS;
+
+ /*
+ * Last page is systemcfg, special handling here, no get_page() a
+ * this is a reserved page
+ */
+ if ((vma->vm_end - address) <= PAGE_SIZE)
+ return virt_to_page(SYSTEMCFG_VIRT_ADDR);
+
+ pg = virt_to_page(vbase + offset);
+ get_page(pg);
+ DBG(" ->page count: %d\n", page_count(pg));
+
+ return pg;
+}
+
+static struct vm_operations_struct vdso_vmops = {
+ .close = vdso_vma_close,
+ .nopage = vdso_vma_nopage,
+};
+
+/*
+ * This is called from binfmt_elf, we create the special vma for the
+ * vDSO and insert it into the mm struct tree
+ */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long vdso_pages = test_thread_flag(TIF_32BIT) ?
+ vdso32_pages : vdso64_pages;
+
+ /* vDSO has a problem and was disabled, just don't "enable" it for the
+ * process
+ */
+ if (vdso_pages == 0) {
+ current->thread.vdso_base = 0;
+ return 0;
+ }
+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (vma == NULL)
+ return -ENOMEM;
+ if (security_vm_enough_memory(vdso_pages)) {
+ kmem_cache_free(vm_area_cachep, vma);
+ return -ENOMEM;
+ }
+ memset(vma, 0, sizeof(*vma));
+
+ /*
+ * pick a base address for the vDSO in process space. We have a default
+ * base of 1Mb on which we had a random offset up to 1Mb.
+ * XXX: Add possibility for a program header to specify that location
+ */
+ current->thread.vdso_base = 0x00100000
+ + 0xaa000;/* + ((unsigned long)vma & 0x000ff000); */
+
+ vma->vm_mm = mm;
+ vma->vm_start = current->thread.vdso_base;
+
+ /*
+ * the VMA size is one page more than the vDSO since systemcfg
+ * is mapped in the last one
+ */
+ vma->vm_end = vma->vm_start + ((vdso_pages + 1) << PAGE_SHIFT);
+
+ /*
+ * our vma flags don't have VM_WRITE so by default, the process isn't allowed
+ * to write those pages.
+ * gdb can break that with ptrace interface, and thus trigger COW on those
+ * pages but it's then your responsibility to never do that on the "data" page
+ * of the vDSO or you'll stop getting kernel updates and your nice userland
+ * gettimeofday will be totally dead. It's fine to use that for setting
+ * breakpoints in the vDSO code pages though
+ */
+ vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+ vma->vm_flags |= mm->def_flags;
+ vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
+ vma->vm_ops = &vdso_vmops;
+
+ down_write(&mm->mmap_sem);
+ insert_vm_struct(mm, vma);
+ mm->total_vm += (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ up_write(&mm->mmap_sem);
+
+ return 0;
+}
+
+static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname,
+ unsigned long *size)
+{
+ Elf32_Shdr *sechdrs;
+ unsigned int i;
+ char *secnames;
+
+ /* Grab section headers and strings so we can tell who is who */
+ sechdrs = (void *)ehdr + ehdr->e_shoff;
+ secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset;
+
+ /* Find the section they want */
+ for (i = 1; i < ehdr->e_shnum; i++) {
+ if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) {
+ if (size)
+ *size = sechdrs[i].sh_size;
+ return (void *)ehdr + sechdrs[i].sh_offset;
+ }
+ }
+ *size = 0;
+ return NULL;
+}
+
+static void * __init find_section64(Elf64_Ehdr *ehdr, const char *secname,
+ unsigned long *size)
+{
+ Elf64_Shdr *sechdrs;
+ unsigned int i;
+ char *secnames;
+
+ /* Grab section headers and strings so we can tell who is who */
+ sechdrs = (void *)ehdr + ehdr->e_shoff;
+ secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset;
+
+ /* Find the section they want */
+ for (i = 1; i < ehdr->e_shnum; i++) {
+ if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) {
+ if (size)
+ *size = sechdrs[i].sh_size;
+ return (void *)ehdr + sechdrs[i].sh_offset;
+ }
+ }
+ if (size)
+ *size = 0;
+ return NULL;
+}
+
+static Elf32_Sym * __init find_symbol32(struct lib32_elfinfo *lib, const char *symname)
+{
+ unsigned int i;
+
+ for (i = 0; i < (lib->dynsymsize / sizeof(Elf32_Sym)); i++) {
+ if (lib->dynsym[i].st_name == 0)
+ continue;
+ if (strcmp(symname, lib->dynstr + lib->dynsym[i].st_name) == 0)
+ return &lib->dynsym[i];
+ }
+ return NULL;
+}
+
+static Elf64_Sym * __init find_symbol64(struct lib64_elfinfo *lib, const char *symname)
+{
+ unsigned int i;
+
+ for (i = 0; i < (lib->dynsymsize / sizeof(Elf64_Sym)); i++) {
+ if (lib->dynsym[i].st_name == 0)
+ continue;
+ if (strcmp(symname, lib->dynstr + lib->dynsym[i].st_name) == 0)
+ return &lib->dynsym[i];
+ }
+ return NULL;
+}
+
+/* Note that we assume the section is .text and the symbol is relative to
+ * the library base
+ */
+static unsigned long __init find_function32(struct lib32_elfinfo *lib, const char *symname)
+{
+ Elf32_Sym *sym = find_symbol32(lib, symname);
+
+ if (sym == NULL) {
+ printk(KERN_WARNING "vDSO32: function %s not found !\n", symname);
+ return 0;
+ }
+ return sym->st_value - VDSO32_BASE;
+}
+
+/* Note that we assume the section is .text and the symbol is relative to
+ * the library base
+ */
+static unsigned long __init find_function64(struct lib64_elfinfo *lib, const char *symname)
+{
+ Elf64_Sym *sym = find_symbol64(lib, symname);
+
+ if (sym == NULL) {
+ printk(KERN_WARNING "vDSO64: function %s not found !\n", symname);
+ return 0;
+ }
+ return sym->st_value - VDSO64_BASE;
+}
+
+
+static __init int vdso_do_find_sections(struct lib32_elfinfo *v32,
+ struct lib64_elfinfo *v64)
+{
+ void *sect;
+
+ /*
+ * Locate symbol tables & text section
+ */
+
+ v32->dynsym = find_section32(v32->hdr, ".dynsym", &v32->dynsymsize);
+ v32->dynstr = find_section32(v32->hdr, ".dynstr", NULL);
+ if (v32->dynsym == NULL || v32->dynstr == NULL) {
+ printk(KERN_ERR "vDSO32: a required symbol section was not found\n");
+ return -1;
+ }
+ sect = find_section32(v32->hdr, ".text", NULL);
+ if (sect == NULL) {
+ printk(KERN_ERR "vDSO32: the .text section was not found\n");
+ return -1;
+ }
+ v32->text = sect - vdso32_kbase;
+
+ v64->dynsym = find_section64(v64->hdr, ".dynsym", &v64->dynsymsize);
+ v64->dynstr = find_section64(v64->hdr, ".dynstr", NULL);
+ if (v64->dynsym == NULL || v64->dynstr == NULL) {
+ printk(KERN_ERR "vDSO64: a required symbol section was not found\n");
+ return -1;
+ }
+ sect = find_section64(v64->hdr, ".text", NULL);
+ if (sect == NULL) {
+ printk(KERN_ERR "vDSO64: the .text section was not found\n");
+ return -1;
+ }
+ v64->text = sect - vdso64_kbase;
+
+ return 0;
+}
+
+static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32,
+ struct lib64_elfinfo *v64)
+{
+ /*
+ * Find signal trampolines
+ */
+
+ vdso64_rt_sigtramp = find_function64(v64, "__v_sigtramp_rt64");
+ vdso32_sigtramp = find_function32(v32, "__v_sigtramp32");
+ vdso32_rt_sigtramp = find_function32(v32, "__v_sigtramp_rt32");
+}
+
+static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
+ struct lib64_elfinfo *v64)
+{
+ Elf32_Sym *sym32;
+ Elf64_Sym *sym64;
+
+ sym32 = find_symbol32(v32, "__v_datapage_offset");
+ if (sym32 == NULL) {
+ printk(KERN_ERR "vDSO32: Can't find symbol __v_datapage_offset !\n");
+ return -1;
+ }
+ *((int *)(vdso32_kbase + (sym32->st_value - VDSO32_BASE))) =
+ (vdso32_pages << PAGE_SHIFT) - (sym32->st_value - VDSO32_BASE);
+
+ sym64 = find_symbol64(v64, "__v_datapage_offset");
+ if (sym64 == NULL) {
+ printk(KERN_ERR "vDSO64: Can't find symbol __v_datapage_offset !\n");
+ return -1;
+ }
+ *((int *)(vdso64_kbase + sym64->st_value - VDSO64_BASE)) =
+ (vdso64_pages << PAGE_SHIFT) - (sym64->st_value - VDSO64_BASE);
+
+ return 0;
+}
+
+static __init int vdso_setup(void)
+{
+ struct lib32_elfinfo v32;
+ struct lib64_elfinfo v64;
+
+ v32.hdr = vdso32_kbase;
+ v64.hdr = vdso64_kbase;
+
+ if (vdso_do_find_sections(&v32, &v64))
+ return -1;
+
+ if (vdso_fixup_datapage(&v32, &v64))
+ return -1;
+
+ vdso_setup_trampolines(&v32, &v64);
+
+ return 0;
+}
+
+void __init vdso_init(void)
+{
+ int i;
+
+ vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
+ vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
+
+ DBG("vdso64_kbase: %p, 0x%x pages, vdso32_kbase: %p, 0x%x pages\n",
+ vdso64_kbase, vdso64_pages, vdso32_kbase, vdso32_pages);
+
+ /*
+ * Initialize the vDSO images in memory, that is do necessary
+ * fixups of vDSO symbols, locate trampolines, etc...
+ */
+ if (vdso_setup()) {
+ printk(KERN_ERR "vDSO setup failure, not enabled !\n");
+ /* XXX should free pages here ? */
+ vdso64_pages = vdso32_pages = 0;
+ return;
+ }
+
+ /* Make sure pages are in the correct state */
+ for (i = 0; i < vdso64_pages; i++) {
+ struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
+ ClearPageReserved(pg);
+ get_page(pg);
+ }
+ for (i = 0; i < vdso32_pages; i++) {
+ struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
+ ClearPageReserved(pg);
+ get_page(pg);
+ }
+}
diff -urN linux-2.5/arch/ppc64/kernel/vdso32/Makefile linux-vdso/arch/ppc64/kernel/vdso32/Makefile
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso32/Makefile 2004-09-07 18:25:44.000000000 +1000
@@ -0,0 +1,51 @@
+# Choose compiler
+#
+# XXX FIXME: We probably want to enforce using a biarch compiler by default
+# and thus use (CC) with -m64, while letting the user pass a
+# CROSS32_COMPILE prefix if wanted. Same goes for the zImage
+# wrappers
+#
+
+CROSS32_COMPILE ?=
+
+CROSS32CC := $(CROSS32_COMPILE)gcc
+CROSS32AS := $(CROSS32_COMPILE)as
+
+# List of files in the vdso, has to be asm only for now
+
+src-vdso32 = sigtramp.S gettimeofday.S datapage.S
+
+# Build rules
+
+obj-vdso32 := $(addsuffix .o, $(basename $(src-vdso32)))
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+src-vdso32 := $(addprefix $(src)/, $(src-vdso32))
+
+VDSO32_CFLAGS := -shared -s -fno-common -Iinclude -fno-builtin -nostdlib
+VDSO32_CFLAGS += -Wl,-soname=linux-vdso32.so.1
+VDSO32_AFLAGS := -D__ASSEMBLY__ -D__KERNEL__ -D__VDSO32__ -s -nostdinc -Iinclude
+
+obj-y += vdso32_wrapper.o
+extra-y += vdso32.lds
+CPPFLAGS_vdso32.lds += -P -C -U$(ARCH)
+
+# Force dependency (incbin is bad)
+$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
+
+# link rule for the .so file, .lds has to be first
+$(obj)/vdso32.so: $(src)/vdso32.lds $(obj-vdso32)
+ $(call if_changed,vdso32ld)
+
+# assembly rules for the .S files
+# This is probably wrong with split src & obj trees
+$(obj-vdso32): %.o: %.S
+ $(call if_changed_dep,vdso32as)
+
+# actual build commands
+quiet_cmd_vdso32ld = VDSO32L $@
+ cmd_vdso32ld = $(CROSS32CC) -Wp,-MD,$(depfile) $(VDSO32_CFLAGS) \
+ -Wl,-T $^ -o $@
+quiet_cmd_vdso32as = VDSO32A $@
+ cmd_vdso32as = $(CROSS32CC) -Wp,-MD,$(depfile) $(VDSO32_AFLAGS) -c -o $@ $^
+
+targets += vdso32.so
diff -urN linux-2.5/arch/ppc64/kernel/vdso32/datapage.S linux-vdso/arch/ppc64/kernel/vdso32/datapage.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso32/datapage.S 2004-09-09 16:43:13.000000000 +1000
@@ -0,0 +1,28 @@
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+
+/* HIDE THIS ONE FROM USERSPACE */
+
+ .text
+V_FUNCTION_BEGIN(__v_get_datapage)
+ /* We don't want that exposed or overridable as we want other objects
+ * to be able to bl directly to here
+ */
+ .protected __v_get_datapage
+
+ mflr r0
+ .cfi_register lr,r0
+
+ bcl 20,31,1f
+ .global __v_datapage_offset;
+__v_datapage_offset:
+ .long 0
+1:
+ mflr r4
+ lwz r3,0(r4)
+ add r3,r3,r4
+ mtlr r0
+ blr
+V_FUNCTION_END(__v_get_datapage)
diff -urN linux-2.5/arch/ppc64/kernel/vdso32/gettimeofday.S linux-vdso/arch/ppc64/kernel/vdso32/gettimeofday.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso32/gettimeofday.S 2004-09-09 16:43:13.000000000 +1000
@@ -0,0 +1,138 @@
+/*
+ * Userland implementation of gettimeofday() for 32 bits processes in a
+ * ppc64 kernel for use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@xxxxxxxxxxxxxxxxxxx),
+ * IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/offsets.h>
+#include <asm/unistd.h>
+
+ .text
+/*
+ * Exact prototype of gettimeofday
+ *
+ * int _v_gettimeofday(struct timeval *tv, struct timezone *tz);
+ *
+ */
+V_FUNCTION_BEGIN(_v_gettimeofday)
+ mflr r12
+ .cfi_register lr,r12
+
+ mr r10,r3 /* r10 saves tv */
+ mr r11,r4 /* r11 saves tz */
+ bl __v_get_datapage /* get data page */
+ mr r9, r3 /* datapage ptr in r9 */
+ bl __v_do_get_xsec /* get xsec from tb & kernel */
+ bne- 2f /* out of line -> do syscall */
+
+ /* seconds are xsec >> 20 */
+ rlwinm r5,r4,12,20,31
+ rlwimi r5,r3,12,0,19
+ stw r5,TVAL32_TV_SEC(r10)
+
+ /* get remaining xsec and convert to usec. we scale
+ * up remaining xsec by 12 bits and get the top 32 bits
+ * of the multiplication
+ */
+ rlwinm r5,r4,12,0,19
+ lis r6,1000000@h
+ ori r6,r6,1000000@l
+ mulhwu r5,r5,r6
+ stw r5,TVAL32_TV_USEC(r10)
+
+ cmpli cr0,r11,0 /* check if tz is NULL */
+ beq 1f
+ lwz r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */
+ lwz r5,CFG_TZ_DSTTIME(r9)
+ stw r4,TZONE_TZ_MINWEST(r11)
+ stw r5,TZONE_TZ_DSTTIME(r11)
+
+1: mtlr r12
+ blr
+
+2: mr r3,r10
+ mr r4,r11
+ li r0,__NR_gettimeofday
+ sc
+ b 1b
+V_FUNCTION_END(_v_gettimeofday)
+
+/*
+ * This is the core of gettimeofday(), it returns the xsec
+ * value in r3 & r4 and expects the datapage ptr (non clobbered)
+ * in r9. clobbers r0,r4,r5,r6,r7,r8
+*/
+__v_do_get_xsec:
+ .cfi_startproc
+ /* Check for update count & load values. We use the low
+ * order 32 bits of the update count
+ */
+1: lwz r8,(CFG_TB_UPDATE_COUNT+4)(r9)
+ andi. r0,r8,1 /* pending update ? loop */
+ bne- 1b
+ xor r0,r8,r8 /* create dependency */
+ add r9,r9,r0
+
+ /* Load orig stamp (offset to TB) */
+ lwz r5,CFG_TB_ORIG_STAMP(r9)
+ lwz r6,(CFG_TB_ORIG_STAMP+4)(r9)
+
+ /* Get a stable TB value */
+2: mftbu r3
+ mftbl r4
+ mftbu r0
+ cmpl cr0,r3,r0
+ bne- 2b
+
+ /* Substract tb orig stamp. If the high part is non-zero, we jump to the
+ * slow path which call the syscall. If it's ok, then we have our 32 bits
+ * tb_ticks value in r7
+ */
+ subfc r7,r6,r4
+ subfe. r0,r5,r3
+ bne- 3f
+
+ /* Load scale factor & do multiplication */
+ lwz r5,CFG_TB_TO_XS(r9) /* load values */
+ lwz r6,(CFG_TB_TO_XS+4)(r9)
+ mulhwu r4,r7,r5
+ mulhwu r6,r7,r6
+ mullw r6,r7,r5
+ addc r6,r6,r0
+
+ /* At this point, we have the scaled xsec value in r4 + XER:CA
+ * we load & add the stamp since epoch
+ */
+ lwz r5,CFG_STAMP_XSEC(r9)
+ lwz r6,(CFG_STAMP_XSEC+4)(r9)
+ adde r4,r4,r6
+ addze r3,r5
+
+ /* We now have our result in r3,r4. We create a fake dependency
+ * on that result and re-check the counter
+ */
+ xor r0,r4,r4
+ add r9,r9,r0
+ lwz r0,(CFG_TB_UPDATE_COUNT+4)(r9)
+ cmpl cr0,r8,r0 /* check if updated */
+ bne- 1b
+
+ /* Warning ! The caller expects CR:EQ to be set to indicate a
+ * successful calculation (so it won't fallback to the syscall
+ * method). We have overriden that CR bit in the counter check,
+ * but fortunately, the loop exit condition _is_ CR:EQ set, so
+ * we can exit safely here. If you change this code, be careful
+ * of that side effect.
+ */
+3: blr
+ .cfi_endproc
diff -urN linux-2.5/arch/ppc64/kernel/vdso32/sigtramp.S linux-vdso/arch/ppc64/kernel/vdso32/sigtramp.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso32/sigtramp.S 2004-09-07 18:25:44.000000000 +1000
@@ -0,0 +1,18 @@
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+
+ .text
+
+V_FUNCTION_BEGIN(__v_sigtramp32)
+ li r0,__NR_sigreturn
+ sc
+V_FUNCTION_END(__v_sigtramp32)
+
+V_FUNCTION_BEGIN(__v_sigtramp_rt32)
+ li r0,__NR_rt_sigreturn
+ sc
+V_FUNCTION_END(__v_sigtramp_rt32)
+
diff -urN linux-2.5/arch/ppc64/kernel/vdso32/vdso32.lds.S linux-vdso/arch/ppc64/kernel/vdso32/vdso32.lds.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso32/vdso32.lds.S 2004-09-09 13:13:18.000000000 +1000
@@ -0,0 +1,91 @@
+
+/*
+ * This is the infamous ld script for the 32 bits vdso
+ * library
+ */
+#include <asm/vdso.h>
+
+/* Default link addresses for the vDSOs */
+OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc")
+OUTPUT_ARCH(powerpc:common)
+ENTRY(_start)
+
+SECTIONS
+{
+ . = VDSO32_BASE + SIZEOF_HEADERS;
+ .hash : { *(.hash) } :text
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ . = ALIGN (16);
+ .text :
+ {
+ *(.text .stub .text.* .gnu.linkonce.t.*)
+ }
+ PROVIDE (__etext = .);
+ PROVIDE (_etext = .);
+ PROVIDE (etext = .);
+
+ /* Other stuff is appended to the text segment: */
+ .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+ .rodata1 : { *(.rodata1) }
+ .eh_frame_hdr : { *(.eh_frame_hdr) }
+ .eh_frame : { KEEP (*(.eh_frame)) }
+ .gcc_except_table : { *(.gcc_except_table) }
+ .fixup : { *(.fixup) }
+
+ .got ALIGN(4) : { *(.got.plt) *(.got) }
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ _end = .;
+ __end = .;
+ PROVIDE (end = .);
+
+
+ /* Stabs debugging sections are here too
+ */
+ .stab 0 : { *(.stab) }
+ .stabstr 0 : { *(.stabstr) }
+ .stab.excl 0 : { *(.stab.excl) }
+ .stab.exclstr 0 : { *(.stab.exclstr) }
+ .stab.index 0 : { *(.stab.index) }
+ .stab.indexstr 0 : { *(.stab.indexstr) }
+ .comment 0 : { *(.comment) }
+ .debug 0 : { *(.debug) }
+ .line 0 : { *(.line) }
+
+ .debug_srcinfo 0 : { *(.debug_srcinfo) }
+ .debug_sfnames 0 : { *(.debug_sfnames) }
+
+ .debug_aranges 0 : { *(.debug_aranges) }
+ .debug_pubnames 0 : { *(.debug_pubnames) }
+
+ .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
+ .debug_abbrev 0 : { *(.debug_abbrev) }
+ .debug_line 0 : { *(.debug_line) }
+ .debug_frame 0 : { *(.debug_frame) }
+ .debug_str 0 : { *(.debug_str) }
+ .debug_loc 0 : { *(.debug_loc) }
+ .debug_macinfo 0 : { *(.debug_macinfo) }
+
+ .debug_weaknames 0 : { *(.debug_weaknames) }
+ .debug_funcnames 0 : { *(.debug_funcnames) }
+ .debug_typenames 0 : { *(.debug_typenames) }
+ .debug_varnames 0 : { *(.debug_varnames) }
+
+ /DISCARD/ : { *(.note.GNU-stack) }
+ /DISCARD/ : { *(.data .data.* .gnu.linkonce.d.* .sdata*) }
+ /DISCARD/ : { *(.bss .sbss .dynbss .dynsbss) }
+}
+
+
+PHDRS
+{
+ text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
diff -urN linux-2.5/arch/ppc64/kernel/vdso32/vdso32_wrapper.S linux-vdso/arch/ppc64/kernel/vdso32/vdso32_wrapper.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso32/vdso32_wrapper.S 2004-09-07 18:25:44.000000000 +1000
@@ -0,0 +1,12 @@
+#include <linux/init.h>
+
+ .section ".data"
+
+ .globl vdso32_start, vdso32_end
+ .balign 4096
+vdso32_start:
+ .incbin "arch/ppc64/kernel/vdso32/vdso32.so"
+ .balign 4096
+vdso32_end:
+
+ .previous
diff -urN linux-2.5/arch/ppc64/kernel/vdso64/Makefile linux-vdso/arch/ppc64/kernel/vdso64/Makefile
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso64/Makefile 2004-09-07 18:25:46.000000000 +1000
@@ -0,0 +1,38 @@
+# List of files in the vdso, has to be asm only for now
+
+src-vdso64 = sigtramp.S gettimeofday.S datapage.S
+
+# Build rules
+
+obj-vdso64 := $(addsuffix .o, $(basename $(src-vdso64)))
+obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64))
+src-vdso64 := $(addprefix $(src)/, $(src-vdso64))
+
+VDSO64_CFLAGS := -shared -s -fno-common -Iinclude -fno-builtin -nostdlib
+VDSO64_CFLAGS += -Wl,-soname=linux-vdso64.so.1
+VDSO64_AFLAGS := -D__ASSEMBLY__ -D__KERNEL__ -D__VDSO64__ -s -nostdinc -Iinclude
+
+obj-y += vdso64_wrapper.o
+extra-y += vdso64.lds
+CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
+
+# Force dependency (incbin is bad)
+$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so
+
+# link rule for the .so file, .lds has to be first
+$(obj)/vdso64.so: $(src)/vdso64.lds $(obj-vdso64)
+ $(call if_changed,vdso64ld)
+
+# assembly rules for the .S files
+# This is probably wrong with split src & obj trees
+$(obj-vdso64): %.o: %.S
+ $(call if_changed_dep,vdso64as)
+
+# actual build commands
+quiet_cmd_vdso64ld = VDSO64L $@
+ cmd_vdso64ld = $(CC) -Wp,-MD,$(depfile) $(VDSO64_CFLAGS) \
+ -Wl,-T $^ -o $@
+quiet_cmd_vdso64as = VDSO64A $@
+ cmd_vdso64as = $(CC) -Wp,-MD,$(depfile) $(VDSO64_AFLAGS) -c -o $@ $^
+
+targets += vdso64.so
diff -urN linux-2.5/arch/ppc64/kernel/vdso64/datapage.S linux-vdso/arch/ppc64/kernel/vdso64/datapage.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso64/datapage.S 2004-09-09 16:43:13.000000000 +1000
@@ -0,0 +1,28 @@
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+
+/* HIDE THIS ONE FROM USERSPACE */
+
+ .text
+V_FUNCTION_BEGIN(__v_get_datapage)
+ /* We don't want that exposed or overridable as we want other objects
+ * to be able to bl directly to here
+ */
+ .protected __v_get_datapage
+
+ mflr r0
+ .cfi_register lr,r0
+
+ bcl 20,31,1f
+ .global __v_datapage_offset;
+__v_datapage_offset:
+ .long 0
+1:
+ mflr r4
+ lwz r3,0(r4)
+ add r3,r3,r4
+ mtlr r0
+ blr
+V_FUNCTION_END(__v_get_datapage)
diff -urN linux-2.5/arch/ppc64/kernel/vdso64/gettimeofday.S linux-vdso/arch/ppc64/kernel/vdso64/gettimeofday.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso64/gettimeofday.S 2004-09-09 16:43:13.000000000 +1000
@@ -0,0 +1,88 @@
+/*
+ * Userland implementation of gettimeofday() for 64 bits processes in a
+ * ppc64 kernel for use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@xxxxxxxxxxxxxxxxxxx),
+ * IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/offsets.h>
+
+ .text
+/*
+ * Exact prototype of gettimeofday
+ *
+ * int _v_gettimeofday(struct timeval *tv, struct timezone *tz);
+ *
+ */
+V_FUNCTION_BEGIN(_v_gettimeofday)
+ mflr r12
+ .cfi_register lr,r12
+
+ mr r11,r3 /* r11 holds tv */
+ mr r10,r4 /* r10 holds tz */
+ bl .__v_get_datapage /* get data page */
+ bl .__v_do_get_xsec /* get xsec from tb & kernel */
+ lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */
+ ori r7,r7,16960
+ rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */
+ rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */
+ std r5,TVAL64_TV_SEC(r11) /* store sec in tv */
+ subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */
+ mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / XSEC_PER_SEC */
+ rldicl r0,r0,44,20
+ cmpldi cr0,r10,0 /* check if tz is NULL */
+ std r0,TVAL64_TV_USEC(r11) /* store usec in tv */
+ beq 1f
+ lwz r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */
+ lwz r5,CFG_TZ_DSTTIME(r3)
+ stw r4,TZONE_TZ_MINWEST(r10)
+ stw r5,TZONE_TZ_DSTTIME(r10)
+1: mtlr r12
+ li r3,0 /* always success */
+ blr
+V_FUNCTION_END(_v_gettimeofday)
+
+
+/*
+ * This is the core of gettimeofday(), it returns the xsec
+ * value in r4 and expects the datapage ptr (non clobbered)
+ * in r3. clobbers r0,r4,r5,r6,r7,r8
+*/
+.__v_do_get_xsec:
+ .cfi_startproc
+ /* check for update count & load values */
+1: ld r7,CFG_TB_UPDATE_COUNT(r3)
+ andi. r0,r4,1 /* pending update ? loop */
+ bne- 1b
+ xor r0,r4,r4 /* create dependency */
+ add r3,r3,r0
+
+ /* Get TB & offset it */
+ mftb r8
+ ld r9,CFG_TB_ORIG_STAMP(r3)
+ subf r8,r9,r8
+
+ /* Scale result */
+ ld r5,CFG_TB_TO_XS(r3)
+ mulhdu r8,r8,r5
+
+ /* Add stamp since epoch */
+ ld r6,CFG_STAMP_XSEC(r3)
+ add r4,r6,r8
+
+ xor r0,r4,r4
+ add r3,r3,r0
+ ld r0,CFG_TB_UPDATE_COUNT(r3)
+ cmpld cr0,r0,r7 /* check if updated */
+ bne- 1b
+ blr
+ .cfi_endproc
diff -urN linux-2.5/arch/ppc64/kernel/vdso64/sigtramp.S linux-vdso/arch/ppc64/kernel/vdso64/sigtramp.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso64/sigtramp.S 2004-09-07 18:25:46.000000000 +1000
@@ -0,0 +1,14 @@
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+
+ .text
+
+V_FUNCTION_BEGIN(__v_sigtramp_rt64)
+ addi r1, r1, __SIGNAL_FRAMESIZE
+ li r0,__NR_rt_sigreturn
+ sc
+V_FUNCTION_END(__v_sigtramp_rt64)
+
diff -urN linux-2.5/arch/ppc64/kernel/vdso64/vdso64.lds.S linux-vdso/arch/ppc64/kernel/vdso64/vdso64.lds.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso64/vdso64.lds.S 2004-09-09 13:13:18.000000000 +1000
@@ -0,0 +1,93 @@
+/*
+ * This is the infamous ld script for the 64 bits vdso
+ * library
+ */
+#include <asm/vdso.h>
+
+OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", "elf64-powerpc")
+OUTPUT_ARCH(powerpc:common64)
+ENTRY(_start)
+
+SECTIONS
+{
+ . = VDSO64_BASE + SIZEOF_HEADERS;
+ .hash : { *(.hash) } :text
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ . = ALIGN (16);
+ .text :
+ {
+ *(.text .stub .text.* .gnu.linkonce.t.*)
+ *(.sfpr .glink)
+ }
+ PROVIDE (__etext = .);
+ PROVIDE (_etext = .);
+ PROVIDE (etext = .);
+
+ /* Other stuff is appended to the text segment: */
+ .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+ .rodata1 : { *(.rodata1) }
+ .eh_frame_hdr : { *(.eh_frame_hdr) }
+ .eh_frame : { KEEP (*(.eh_frame)) }
+ .gcc_except_table : { *(.gcc_except_table) }
+
+ .opd ALIGN(8) : { KEEP (*(.opd)) }
+ .got ALIGN(8) : { *(.got .toc) }
+ .rela.dyn ALIGN(8) : { *(.rela.dyn) }
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ _end = .;
+ PROVIDE (end = .);
+
+ /* Stabs debugging sections are here too
+ */
+ .stab 0 : { *(.stab) }
+ .stabstr 0 : { *(.stabstr) }
+ .stab.excl 0 : { *(.stab.excl) }
+ .stab.exclstr 0 : { *(.stab.exclstr) }
+ .stab.index 0 : { *(.stab.index) }
+ .stab.indexstr 0 : { *(.stab.indexstr) }
+ .comment 0 : { *(.comment) }
+ /* DWARF debug sectio/ns.
+ Symbols in the DWARF debugging sections are relative to the beginning
+ of the section so we begin them at 0. */
+ /* DWARF 1 */
+ .debug 0 : { *(.debug) }
+ .line 0 : { *(.line) }
+ /* GNU DWARF 1 extensions */
+ .debug_srcinfo 0 : { *(.debug_srcinfo) }
+ .debug_sfnames 0 : { *(.debug_sfnames) }
+ /* DWARF 1.1 and DWARF 2 */
+ .debug_aranges 0 : { *(.debug_aranges) }
+ .debug_pubnames 0 : { *(.debug_pubnames) }
+ /* DWARF 2 */
+ .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
+ .debug_abbrev 0 : { *(.debug_abbrev) }
+ .debug_line 0 : { *(.debug_line) }
+ .debug_frame 0 : { *(.debug_frame) }
+ .debug_str 0 : { *(.debug_str) }
+ .debug_loc 0 : { *(.debug_loc) }
+ .debug_macinfo 0 : { *(.debug_macinfo) }
+ /* SGI/MIPS DWARF 2 extensions */
+ .debug_weaknames 0 : { *(.debug_weaknames) }
+ .debug_funcnames 0 : { *(.debug_funcnames) }
+ .debug_typenames 0 : { *(.debug_typenames) }
+ .debug_varnames 0 : { *(.debug_varnames) }
+
+ /DISCARD/ : { *(.note.GNU-stack) }
+ /DISCARD/ : { *(.branch_lt) }
+ /DISCARD/ : { *(.data .data.* .gnu.linkonce.d.*) }
+ /DISCARD/ : { *(.bss .sbss .dynbss .dynsbss) }
+}
+
+PHDRS
+{
+ text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
diff -urN linux-2.5/arch/ppc64/kernel/vdso64/vdso64_wrapper.S linux-vdso/arch/ppc64/kernel/vdso64/vdso64_wrapper.S
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/arch/ppc64/kernel/vdso64/vdso64_wrapper.S 2004-09-07 18:25:46.000000000 +1000
@@ -0,0 +1,12 @@
+#include <linux/init.h>
+
+ .section ".data"
+
+ .globl vdso64_start, vdso64_end
+ .balign 4096
+vdso64_start:
+ .incbin "arch/ppc64/kernel/vdso64/vdso64.so"
+ .balign 4096
+vdso64_end:
+
+ .previous
diff -urN linux-2.5/arch/ppc64/mm/init.c linux-vdso/arch/ppc64/mm/init.c
--- linux-2.5/arch/ppc64/mm/init.c 2004-09-09 17:33:52.000000000 +1000
+++ linux-vdso/arch/ppc64/mm/init.c 2004-09-09 17:56:36.000000000 +1000
@@ -61,6 +61,7 @@
#include <asm/system.h>
#include <asm/iommu.h>
#include <asm/abs_addr.h>
+#include <asm/vdso.h>


struct mmu_context_queue_t mmu_context_queue;
@@ -706,6 +707,8 @@
#ifdef CONFIG_PPC_ISERIES
iommu_vio_init();
#endif
+ /* Initialize the vDSO */
+ vdso_init();
}

/*
diff -urN linux-2.5/fs/binfmt_elf.c linux-vdso/fs/binfmt_elf.c
--- linux-2.5/fs/binfmt_elf.c 2004-09-01 11:02:35.000000000 +1000
+++ linux-vdso/fs/binfmt_elf.c 2004-09-03 14:55:56.000000000 +1000
@@ -716,6 +716,14 @@
goto out_free_dentry;
}

+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+ retval = arch_setup_additional_pages(bprm, executable_stack);
+ if (retval < 0) {
+ send_sig(SIGKILL, current, 0);
+ goto out_free_dentry;
+ }
+#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
+
current->mm->start_stack = bprm->p;

/* Now we do a little grungy work by mmaping the ELF image into
diff -urN linux-2.5/include/asm-ppc64/a.out.h linux-vdso/include/asm-ppc64/a.out.h
--- linux-2.5/include/asm-ppc64/a.out.h 2004-08-10 10:22:36.000000000 +1000
+++ linux-vdso/include/asm-ppc64/a.out.h 2004-09-07 18:24:53.000000000 +1000
@@ -30,14 +30,11 @@

#ifdef __KERNEL__

-#define STACK_TOP_USER64 (TASK_SIZE_USER64)
+#define STACK_TOP_USER64 TASK_SIZE_USER64
+#define STACK_TOP_USER32 TASK_SIZE_USER32

-/* Give 32-bit user space a full 4G address space to live in. */
-#define STACK_TOP_USER32 (TASK_SIZE_USER32)
-
-#define STACK_TOP ((test_thread_flag(TIF_32BIT) || \
- (ppcdebugset(PPCDBG_BINFMT_32ADDR))) ? \
- STACK_TOP_USER32 : STACK_TOP_USER64)
+#define STACK_TOP (test_thread_flag(TIF_32BIT) ? \
+ STACK_TOP_USER32 : STACK_TOP_USER64)

#endif /* __KERNEL__ */

diff -urN linux-2.5/include/asm-ppc64/elf.h linux-vdso/include/asm-ppc64/elf.h
--- linux-2.5/include/asm-ppc64/elf.h 2004-08-10 10:22:37.000000000 +1000
+++ linux-vdso/include/asm-ppc64/elf.h 2004-09-07 18:25:28.000000000 +1000
@@ -238,10 +238,20 @@
/* A special ignored type value for PPC, for glibc compatibility. */
#define AT_IGNOREPPC 22

+/* The vDSO location. We have to use the same value as x86 for glibc's
+ * sake :-)
+ */
+#define AT_SYSINFO_EHDR 33
+
extern int dcache_bsize;
extern int icache_bsize;
extern int ucache_bsize;

+/* We do have an arch_setup_additional_pages for vDSO matters */
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack);
+
/*
* The requirements here are:
* - keep the final alignment of sp (sp & 0xf)
@@ -260,6 +270,8 @@
NEW_AUX_ENT(AT_DCACHEBSIZE, dcache_bsize); \
NEW_AUX_ENT(AT_ICACHEBSIZE, icache_bsize); \
NEW_AUX_ENT(AT_UCACHEBSIZE, ucache_bsize); \
+ /* vDSO base */ \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, current->thread.vdso_base); \
} while (0)

/* PowerPC64 relocations defined by the ABIs */
diff -urN linux-2.5/include/asm-ppc64/page.h linux-vdso/include/asm-ppc64/page.h
--- linux-2.5/include/asm-ppc64/page.h 2004-09-06 16:43:57.000000000 +1000
+++ linux-vdso/include/asm-ppc64/page.h 2004-09-06 17:01:15.000000000 +1000
@@ -183,6 +183,11 @@

extern int page_is_ram(unsigned long pfn);

+/* We do define AT_SYSINFO_EHDR but don't use the gate mecanism */
+#define CONFIG_ARCH_GATE_AREA 1
+#define get_gate_vma(tsk) (NULL)
+#define in_gate_area(task, addr) (0)
+
#endif /* __ASSEMBLY__ */

#ifdef MODULE
diff -urN linux-2.5/include/asm-ppc64/processor.h linux-vdso/include/asm-ppc64/processor.h
--- linux-2.5/include/asm-ppc64/processor.h 2004-09-06 16:43:57.000000000 +1000
+++ linux-vdso/include/asm-ppc64/processor.h 2004-09-06 17:01:15.000000000 +1000
@@ -518,8 +518,8 @@
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
-#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(STACK_TOP_USER32 / 4))
-#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(STACK_TOP_USER64 / 4))
+#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4))
+#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_USER64 / 4))

#define TASK_UNMAPPED_BASE ((test_thread_flag(TIF_32BIT)||(ppcdebugset(PPCDBG_BINFMT_32ADDR))) ? \
TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 )
@@ -536,7 +536,8 @@
double fpr[32]; /* Complete floating point set */
unsigned long fpscr; /* Floating point status (plus pad) */
unsigned long fpexc_mode; /* Floating-point exception mode */
- unsigned long pad[3]; /* was saved_msr, saved_softe */
+ unsigned long pad[2]; /* was saved_msr, saved_softe */
+ unsigned long vdso_base; /* base of the vDSO library */
#ifdef CONFIG_ALTIVEC
/* Complete AltiVec register set */
vector128 vr[32] __attribute((aligned(16)));
diff -urN linux-2.5/include/asm-ppc64/time.h linux-vdso/include/asm-ppc64/time.h
--- linux-2.5/include/asm-ppc64/time.h 2004-08-10 10:22:38.000000000 +1000
+++ linux-vdso/include/asm-ppc64/time.h 2004-09-09 16:43:16.000000000 +1000
@@ -43,10 +43,10 @@
struct gettimeofday_vars {
unsigned long tb_to_xs;
unsigned long stamp_xsec;
+ unsigned long tb_orig_stamp;
};

struct gettimeofday_struct {
- unsigned long tb_orig_stamp;
unsigned long tb_ticks_per_sec;
struct gettimeofday_vars vars[2];
struct gettimeofday_vars * volatile varp;
diff -urN linux-2.5/include/asm-ppc64/vdso.h linux-vdso/include/asm-ppc64/vdso.h
--- /dev/null 2004-09-01 15:26:22.000000000 +1000
+++ linux-vdso/include/asm-ppc64/vdso.h 2004-09-09 16:43:13.000000000 +1000
@@ -0,0 +1,60 @@
+#ifndef __PPC64_VDSO_H__
+#define __PPC64_VDSO_H__
+
+#ifdef __KERNEL__
+
+/* Default link addresses for the vDSOs */
+#define VDSO32_BASE 0x100000
+#define VDSO64_BASE 0x100000
+
+#ifndef __ASSEMBLY__
+
+extern unsigned int vdso64_pages;
+extern unsigned int vdso32_pages;
+
+/* Offsets relative to thread->vdso_base */
+extern unsigned long vdso64_rt_sigtramp;
+extern unsigned long vdso32_sigtramp;
+extern unsigned long vdso32_rt_sigtramp;
+
+extern void vdso_init(void);
+
+#else /* __ASSEMBLY__ */
+
+#ifdef __VDSO64__
+#define V_FUNCTION_BEGIN(name) \
+ .globl name; \
+ .section ".opd","a"; \
+ .align 3; \
+ name: \
+ .quad .name,.TOC.@tocbase,0; \
+ .previous; \
+ .globl .name; \
+ .type .name,@function; \
+ .name: \
+ .cfi_startproc
+
+#define V_FUNCTION_END(name) \
+ .cfi_endproc \
+ .size .name,.-.name;
+#endif /* __VDSO64__ */
+
+#ifdef __VDSO32__
+
+#define V_FUNCTION_BEGIN(name) \
+ .globl name; \
+ .type name,@function; \
+ name: \
+ .cfi_startproc
+
+#define V_FUNCTION_END(name) \
+ .cfi_endproc \
+ .size name,.-name;
+
+#endif /* __VDSO32__ */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __KERNEL__ */
+
+#endif /* __PPC64_VDSO_H__ */


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/