Re: [PATCH v3] powerpc: Implement csum_ipv6_magic in assembly

From: Christophe Leroy
Date: Thu May 24 2018 - 05:35:52 EST




On 05/24/2018 06:20 AM, Christophe LEROY wrote:


Le 23/05/2018 à 20:34, Segher Boessenkool a écrit :
On Tue, May 22, 2018 at 08:57:01AM +0200, Christophe Leroy wrote:
The generic csum_ipv6_magic() generates a pretty bad result

<snip>

Please try with a more recent compiler, what you used is pretty ancient.
It's not like recent compilers do great on this either, but it's not
*that* bad anymore ;-)


Here is what I get with GCC 8.1
It doesn't look much better, does it ?


net/ipv6/ip6_checksum.o: file format elf32-powerpc


Disassembly of section .text:

00000000 <csum_ipv6_magic>:
0: 94 21 ff f0 stwu r1,-16(r1)
4: 80 04 00 00 lwz r0,0(r4)
8: 81 64 00 04 lwz r11,4(r4)
c: 81 04 00 08 lwz r8,8(r4)
10: 93 e1 00 0c stw r31,12(r1)
14: 81 43 00 00 lwz r10,0(r3)
18: 83 e3 00 04 lwz r31,4(r3)
1c: 81 23 00 08 lwz r9,8(r3)
20: 81 83 00 0c lwz r12,12(r3)
24: 7c ea 3a 14 add r7,r10,r7
28: 7d 4a 38 10 subfc r10,r10,r7
2c: 7c ff 3a 14 add r7,r31,r7
30: 81 44 00 0c lwz r10,12(r4)
34: 7c 63 19 10 subfe r3,r3,r3
38: 7c 63 38 50 subf r3,r3,r7
3c: 7f ff 18 10 subfc r31,r31,r3
40: 7c e9 1a 14 add r7,r9,r3
44: 83 e1 00 0c lwz r31,12(r1)
48: 7c 63 19 10 subfe r3,r3,r3
4c: 38 21 00 10 addi r1,r1,16
50: 7c 63 38 50 subf r3,r3,r7
54: 7d 29 18 10 subfc r9,r9,r3
58: 7d 2c 1a 14 add r9,r12,r3
5c: 7c 63 19 10 subfe r3,r3,r3
60: 7c 63 48 50 subf r3,r3,r9
64: 7d 8c 18 10 subfc r12,r12,r3
68: 7d 20 1a 14 add r9,r0,r3
6c: 7c 63 19 10 subfe r3,r3,r3
70: 7c 63 48 50 subf r3,r3,r9
74: 7c 00 18 10 subfc r0,r0,r3
78: 7d 2b 1a 14 add r9,r11,r3
7c: 7c 63 19 10 subfe r3,r3,r3
80: 7c 63 48 50 subf r3,r3,r9
84: 7d 6b 18 10 subfc r11,r11,r3
88: 7d 28 1a 14 add r9,r8,r3
8c: 7c 63 19 10 subfe r3,r3,r3
90: 7c 63 48 50 subf r3,r3,r9
94: 7d 08 18 10 subfc r8,r8,r3
98: 7d 2a 1a 14 add r9,r10,r3
9c: 7c 63 19 10 subfe r3,r3,r3
a0: 7c 63 48 50 subf r3,r3,r9
a4: 7d 4a 18 10 subfc r10,r10,r3
a8: 7d 23 2a 14 add r9,r3,r5
ac: 7c 63 19 10 subfe r3,r3,r3
b0: 7c 63 48 50 subf r3,r3,r9
b4: 7c a5 18 10 subfc r5,r5,r3
b8: 7c 63 32 14 add r3,r3,r6
bc: 7d 29 49 10 subfe r9,r9,r9
c0: 7d 29 18 50 subf r9,r9,r3
c4: 7c c6 48 10 subfc r6,r6,r9
c8: 7c 63 19 10 subfe r3,r3,r3
cc: 7c 63 48 50 subf r3,r3,r9
d0: 54 69 80 3e rotlwi r9,r3,16
d4: 7c 63 4a 14 add r3,r3,r9
d8: 7c 63 18 f8 not r3,r3
dc: 54 63 84 3e rlwinm r3,r3,16,16,31
e0: 4e 80 00 20 blr

net/ipv6/ip6_checksum.o: file format elf64-powerpc


Disassembly of section .text:

0000000000000000 <.csum_ipv6_magic>:
0: fb e1 ff f8 std r31,-8(r1)
4: 81 43 00 00 lwz r10,0(r3)
8: 81 83 00 04 lwz r12,4(r3)
c: 81 23 00 08 lwz r9,8(r3)
10: 80 03 00 0c lwz r0,12(r3)
14: 7c e7 52 14 add r7,r7,r10
18: 80 64 00 08 lwz r3,8(r4)
1c: 81 04 00 00 lwz r8,0(r4)
20: 78 ff 00 20 clrldi r31,r7,32
24: 7c ec 3a 14 add r7,r12,r7
28: 81 64 00 04 lwz r11,4(r4)
2c: 7f ea f8 50 subf r31,r10,r31
30: 81 44 00 0c lwz r10,12(r4)
34: 7b ff 0f e0 rldicl r31,r31,1,63
38: 7c ff 3a 14 add r7,r31,r7
3c: eb e1 ff f8 ld r31,-8(r1)
40: 78 e4 00 20 clrldi r4,r7,32
44: 7c e9 3a 14 add r7,r9,r7
48: 7d 8c 20 50 subf r12,r12,r4
4c: 79 8c 0f e0 rldicl r12,r12,1,63
50: 7d 8c 3a 14 add r12,r12,r7
54: 79 87 00 20 clrldi r7,r12,32
58: 7d 80 62 14 add r12,r0,r12
5c: 7d 29 38 50 subf r9,r9,r7
60: 79 29 0f e0 rldicl r9,r9,1,63
64: 7d 29 62 14 add r9,r9,r12
68: 79 27 00 20 clrldi r7,r9,32
6c: 7d 28 4a 14 add r9,r8,r9
70: 7c 00 38 50 subf r0,r0,r7
74: 78 00 0f e0 rldicl r0,r0,1,63
78: 7c 00 4a 14 add r0,r0,r9
7c: 78 09 00 20 clrldi r9,r0,32
80: 7c 0b 02 14 add r0,r11,r0
84: 7d 08 48 50 subf r8,r8,r9
88: 79 08 0f e0 rldicl r8,r8,1,63
8c: 7d 08 02 14 add r8,r8,r0
90: 79 09 00 20 clrldi r9,r8,32
94: 7d 03 42 14 add r8,r3,r8
98: 7d 2b 48 50 subf r9,r11,r9
9c: 79 29 0f e0 rldicl r9,r9,1,63
a0: 7d 29 42 14 add r9,r9,r8
a4: 79 28 00 20 clrldi r8,r9,32
a8: 7d 2a 4a 14 add r9,r10,r9
ac: 7d 03 40 50 subf r8,r3,r8
b0: 79 08 0f e0 rldicl r8,r8,1,63
b4: 7d 08 4a 14 add r8,r8,r9
b8: 79 09 00 20 clrldi r9,r8,32
bc: 7d 08 2a 14 add r8,r8,r5
c0: 7d 2a 48 50 subf r9,r10,r9
c4: 79 29 0f e0 rldicl r9,r9,1,63
c8: 7d 29 42 14 add r9,r9,r8
cc: 79 2a 00 20 clrldi r10,r9,32
d0: 7d 29 32 14 add r9,r9,r6
d4: 7c a5 50 50 subf r5,r5,r10
d8: 78 a5 0f e0 rldicl r5,r5,1,63
dc: 7d 25 4a 14 add r9,r5,r9
e0: 79 2a 00 20 clrldi r10,r9,32
e4: 7c c6 50 50 subf r6,r6,r10
e8: 78 c6 0f e0 rldicl r6,r6,1,63
ec: 7c c6 4a 14 add r6,r6,r9
f0: 54 c3 80 3e rotlwi r3,r6,16
f4: 7c c6 1a 14 add r6,r6,r3
f8: 7c c3 30 f8 not r3,r6
fc: 78 63 84 22 rldicl r3,r3,48,48
100: 4e 80 00 20 blr

Christophe


--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -293,3 +293,36 @@ dst_error:
      EX_TABLE(51b, dst_error);
 EXPORT_SYMBOL(csum_partial_copy_generic)
+
+/*
+ * static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ *                      const struct in6_addr *daddr,
+ *                      __u32 len, __u8 proto, __wsum sum)
+ */
+
+_GLOBAL(csum_ipv6_magic)
+    lwz    r8, 0(r3)
+    lwz    r9, 4(r3)
+    lwz    r10, 8(r3)
+    lwz    r11, 12(r3)
+    addc    r0, r5, r6
+    adde    r0, r0, r7
+    adde    r0, r0, r8
+    adde    r0, r0, r9
+    adde    r0, r0, r10
+    adde    r0, r0, r11
+    lwz    r8, 0(r4)
+    lwz    r9, 4(r4)
+    lwz    r10, 8(r4)
+    lwz    r11, 12(r4)
+    adde    r0, r0, r8
+    adde    r0, r0, r9
+    adde    r0, r0, r10
+    adde    r0, r0, r11
+    addze    r0, r0
+    rotlwi    r3, r0, 16
+    add    r3, r0, r3
+    not    r3, r3
+    rlwinm    r3, r3, 16, 16, 31
+    blr
+EXPORT_SYMBOL(csum_ipv6_magic)

Clustering the loads and carry insns together is pretty much the worst you
can do on most 32-bit CPUs.

Oh, really ? __csum_partial is written that way too.

Right, now I tried interleaving the lwz and adde. I get no improvement at all on an 885, but I get a 15% improvement on an 8321.

Christophe



Segher