Re: [PATCH] x86: Run checksumming in parallel across multiple ALUs

From: Eric Dumazet
Date: Fri Oct 18 2013 - 17:16:02 EST


On Fri, 2013-10-18 at 16:11 -0400, Neil Horman wrote:

> #define BUFSIZ_ORDER 4
> #define BUFSIZ ((2 << BUFSIZ_ORDER) * (1024*1024*2))
> static int __init csum_init_module(void)
> {
> int i;
> __wsum sum = 0;
> struct timespec start, end;
> u64 time;
> struct page *page;
> u32 offset = 0;
>
> page = alloc_pages((GFP_TRANSHUGE & ~__GFP_MOVABLE), BUFSIZ_ORDER);

Not sure what you are doing here, but it's not correct.

You have a lot of variation in your results; I suspect a NUMA affinity
problem.

You can try the following code, and use taskset to make sure you run
it on a CPU on node 0:

#define BUFSIZ 2*1024*1024
#define NBPAGES 16

/*
 * Module init: micro-benchmark for csum_partial_opt().
 *
 * Allocates NBPAGES buffers of BUFSIZ bytes each with kmalloc_node() on
 * node 0.  Then, with bottom halves disabled, it checksums 1500 bytes at a
 * pseudo-random even offset inside a pseudo-randomly chosen buffer, 100000
 * times, and logs the accumulated checksum and the elapsed time in
 * nanoseconds.  Every buffer is freed before returning.
 *
 * Returns 0 once the benchmark has run, or -ENOMEM if any buffer could not
 * be allocated (so a failed run is no longer reported as a successful load).
 */
static int __init csum_init_module(void)
{
	int i;
	int ret = -ENOMEM;
	__wsum sum = 0;
	u64 start, end;
	void *base, *addrs[NBPAGES];
	u32 rnd, offset;

	/* Start all-NULL so the cleanup loop may kfree() every slot safely. */
	memset(addrs, 0, sizeof(addrs));
	for (i = 0; i < NBPAGES; i++) {
		addrs[i] = kmalloc_node(BUFSIZ, GFP_KERNEL, 0);
		if (!addrs[i])
			goto out;
	}

	local_bh_disable();
	pr_err("STARTING ITERATIONS on cpu %d\n", smp_processor_id());
	start = ktime_to_ns(ktime_get());

	for (i = 0; i < 100000; i++) {
		rnd = prandom_u32();
		/* Low part of the random value selects the buffer ... */
		base = addrs[rnd % NBPAGES];
		/* ... and the remaining part selects the offset within it. */
		rnd /= NBPAGES;
		/* Keep offset + 1500 inside the BUFSIZ-byte buffer. */
		offset = rnd % (BUFSIZ - 1500);
		/* Clear bit 0 so the start offset is always even. */
		offset &= ~1U;
		sum = csum_partial_opt(base + offset, 1500, sum);
	}
	end = ktime_to_ns(ktime_get());
	local_bh_enable();

	pr_err("COMPLETED 100000 iterations of csum %x in %llu nanosec\n", sum, end - start);
	ret = 0;

out:
	/* kfree(NULL) is a no-op, so unallocated slots are harmless here. */
	for (i = 0; i < NBPAGES; i++)
		kfree(addrs[i]);

	return ret;
}

/*
 * Module exit hook.  csum_init_module() frees every buffer it allocates
 * before returning, so there is nothing left to release here.
 */
static void __exit csum_cleanup_module(void)
{
	/* Intentionally empty. */
}



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/