Re: [PATCH v10 2/5] powerpc/vdso: Prepare for switching VDSO to generic C implementation.

From: Christophe Leroy
Date: Wed Aug 05 2020 - 16:00:38 EST


Hi,

On 08/05/2020 02:03 PM, Segher Boessenkool wrote:
Hi!

On Wed, Aug 05, 2020 at 07:09:23AM +0000, Christophe Leroy wrote:
+/*
+ * powerpc specific delta calculation.
+ *
+ * This variant removes the masking of the subtraction because the
+ * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
+ * which would result in a pointless operation. The compiler cannot
+ * optimize it away as the mask comes from the vdso data and is not compile
+ * time constant.
+ */

It cannot optimise it because it does not know shift < 32. The code
below is incorrect for shift equal to 32, fwiw.

Is there a way to tell it?


+static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+{
+ return (cycles - last) * mult;
+}
+#define vdso_calc_delta vdso_calc_delta
+
+#ifndef __powerpc64__
+static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
+{
+ u32 hi = ns >> 32;
+ u32 lo = ns;
+
+ lo >>= shift;
+ lo |= hi << (32 - shift);
+ hi >>= shift;


+ if (likely(hi == 0))
+ return lo;

Removing these two lines shouldn't change generated object code? Or not
make it worse, at least.

I remember it made a noticeable difference, although I can't remember the details. See below with GCC 10.1. At least we see that with those two lines, GCC only sets up a 16-byte stack frame. Without those lines it sets up a 32-byte stack frame and seems to save some values for no reason.

With the two lines:

000006ac <__c_kernel_clock_gettime>:
6ac: 28 03 00 0f cmplwi r3,15
6b0: 41 81 01 04 bgt 7b4 <__c_kernel_clock_gettime+0x108>
6b4: 39 40 00 01 li r10,1
6b8: 7d 4a 18 30 slw r10,r10,r3
6bc: 71 47 08 83 andi. r7,r10,2179
6c0: 41 82 01 2c beq 7ec <__c_kernel_clock_gettime+0x140>
6c4: 94 21 ff f0 stwu r1,-16(r1)
6c8: 54 63 20 36 rlwinm r3,r3,4,0,27
6cc: 93 e1 00 0c stw r31,12(r1)
6d0: 7d 85 1a 14 add r12,r5,r3
6d4: 80 05 00 00 lwz r0,0(r5)
6d8: 70 06 00 01 andi. r6,r0,1
6dc: 40 82 00 d4 bne 7b0 <__c_kernel_clock_gettime+0x104>
6e0: 7d 4d 42 e6 mftbu r10
6e4: 7d 6c 42 e6 mftb r11
6e8: 7c ed 42 e6 mftbu r7
6ec: 7c 0a 38 40 cmplw r10,r7
6f0: 40 82 ff f0 bne 6e0 <__c_kernel_clock_gettime+0x34>
6f4: 80 e5 00 0c lwz r7,12(r5)
6f8: 80 65 00 08 lwz r3,8(r5)
6fc: 7c e7 58 10 subfc r7,r7,r11
700: 81 65 00 18 lwz r11,24(r5)
704: 7d 43 51 10 subfe r10,r3,r10
708: 7f e7 58 16 mulhwu r31,r7,r11
70c: 7d 4a 59 d6 mullw r10,r10,r11
710: 7c e7 59 d6 mullw r7,r7,r11
714: 80 6c 00 2c lwz r3,44(r12)
718: 81 6c 00 28 lwz r11,40(r12)
71c: 7c e7 18 14 addc r7,r7,r3
720: 7d 4a fa 14 add r10,r10,r31
724: 80 65 00 1c lwz r3,28(r5)
728: 7d 4a 59 14 adde r10,r10,r11
72c: 7c e7 1c 30 srw r7,r7,r3
730: 21 63 00 20 subfic r11,r3,32
734: 7d 43 1c 31 srw. r3,r10,r3
738: 7d 4a 58 30 slw r10,r10,r11
73c: 7d 49 3b 78 or r9,r10,r7
740: 39 00 00 00 li r8,0
744: 40 82 00 84 bne 7c8 <__c_kernel_clock_gettime+0x11c>
748: 80 6c 00 24 lwz r3,36(r12)
74c: 81 45 00 00 lwz r10,0(r5)
750: 7c 00 50 40 cmplw r0,r10
754: 40 a2 ff 80 bne 6d4 <__c_kernel_clock_gettime+0x28>
758: 2c 08 00 00 cmpwi r8,0
75c: 41 82 00 7c beq 7d8 <__c_kernel_clock_gettime+0x12c>
760: 3c e0 c4 65 lis r7,-15259
764: 3c 00 3b 9a lis r0,15258
768: 60 e7 36 00 ori r7,r7,13824
76c: 60 00 c9 ff ori r0,r0,51711
770: 7c a9 38 14 addc r5,r9,r7
774: 7d 48 01 d4 addme r10,r8
778: 2c 0a 00 00 cmpwi r10,0
77c: 7d 48 53 78 mr r8,r10
780: 7c a9 2b 78 mr r9,r5
784: 38 c6 00 01 addi r6,r6,1
788: 40 82 ff e8 bne 770 <__c_kernel_clock_gettime+0xc4>
78c: 7c 05 00 40 cmplw r5,r0
790: 41 81 ff e0 bgt 770 <__c_kernel_clock_gettime+0xc4>
794: 7c 66 18 14 addc r3,r6,r3
798: 90 64 00 00 stw r3,0(r4)
79c: 91 24 00 04 stw r9,4(r4)
7a0: 38 60 00 00 li r3,0
7a4: 83 e1 00 0c lwz r31,12(r1)
7a8: 38 21 00 10 addi r1,r1,16
7ac: 4e 80 00 20 blr
7b0: 4b ff ff 24 b 6d4 <__c_kernel_clock_gettime+0x28>
7b4: 38 00 00 f6 li r0,246
7b8: 44 00 00 02 sc
7bc: 40 a3 00 08 bns 7c4 <__c_kernel_clock_gettime+0x118>
7c0: 7c 63 00 d0 neg r3,r3
7c4: 4e 80 00 20 blr
7c8: 7d 2a 4b 78 mr r10,r9
7cc: 7c 68 1b 78 mr r8,r3
7d0: 7d 49 53 78 mr r9,r10
7d4: 4b ff ff 74 b 748 <__c_kernel_clock_gettime+0x9c>
7d8: 3d 40 3b 9a lis r10,15258
7dc: 61 4a c9 ff ori r10,r10,51711
7e0: 7c 09 50 40 cmplw r9,r10
7e4: 41 81 ff 7c bgt 760 <__c_kernel_clock_gettime+0xb4>
7e8: 4b ff ff b0 b 798 <__c_kernel_clock_gettime+0xec>
7ec: 71 47 00 60 andi. r7,r10,96
7f0: 54 69 20 36 rlwinm r9,r3,4,0,27
7f4: 7d 25 4a 14 add r9,r5,r9
7f8: 40 82 00 14 bne 80c <__c_kernel_clock_gettime+0x160>
7fc: 71 4a 00 10 andi. r10,r10,16
800: 41 a2 ff b4 beq 7b4 <__c_kernel_clock_gettime+0x108>
804: 38 a5 00 f0 addi r5,r5,240
808: 4b ff fe bc b 6c4 <__c_kernel_clock_gettime+0x18>
80c: 81 05 00 00 lwz r8,0(r5)
810: 71 0a 00 01 andi. r10,r8,1
814: 40 a2 ff f8 bne 80c <__c_kernel_clock_gettime+0x160>
818: 80 69 00 24 lwz r3,36(r9)
81c: 81 49 00 2c lwz r10,44(r9)
820: 80 e5 00 00 lwz r7,0(r5)
824: 7c 08 38 40 cmplw r8,r7
828: 40 a2 ff e4 bne 80c <__c_kernel_clock_gettime+0x160>
82c: 90 64 00 00 stw r3,0(r4)
830: 91 44 00 04 stw r10,4(r4)
834: 38 60 00 00 li r3,0
838: 4e 80 00 20 blr


Without the two lines:

000006ac <__c_kernel_clock_gettime>:
6ac: 28 03 00 0f cmplwi r3,15
6b0: 41 81 01 14 bgt 7c4 <__c_kernel_clock_gettime+0x118>
6b4: 39 20 00 01 li r9,1
6b8: 7d 29 18 30 slw r9,r9,r3
6bc: 71 2a 08 83 andi. r10,r9,2179
6c0: 41 82 01 2c beq 7ec <__c_kernel_clock_gettime+0x140>
6c4: 94 21 ff e0 stwu r1,-32(r1)
6c8: 54 63 20 36 rlwinm r3,r3,4,0,27
6cc: 93 81 00 10 stw r28,16(r1)
6d0: 93 a1 00 14 stw r29,20(r1)
6d4: 93 c1 00 18 stw r30,24(r1)
6d8: 93 e1 00 1c stw r31,28(r1)
6dc: 7c 65 1a 14 add r3,r5,r3
6e0: 81 85 00 00 lwz r12,0(r5)
6e4: 71 87 00 01 andi. r7,r12,1
6e8: 40 82 00 d8 bne 7c0 <__c_kernel_clock_gettime+0x114>
6ec: 7d 2d 42 e6 mftbu r9
6f0: 7c cc 42 e6 mftb r6
6f4: 7d 4d 42 e6 mftbu r10
6f8: 7c 09 50 40 cmplw r9,r10
6fc: 40 82 ff f0 bne 6ec <__c_kernel_clock_gettime+0x40>
700: 83 83 00 28 lwz r28,40(r3)
704: 83 a3 00 2c lwz r29,44(r3)
708: 81 65 00 08 lwz r11,8(r5)
70c: 81 05 00 0c lwz r8,12(r5)
710: 83 c5 00 18 lwz r30,24(r5)
714: 83 e5 00 1c lwz r31,28(r5)
718: 80 03 00 24 lwz r0,36(r3)
71c: 81 45 00 00 lwz r10,0(r5)
720: 7c 0c 50 40 cmplw r12,r10
724: 40 a2 ff bc bne 6e0 <__c_kernel_clock_gettime+0x34>
728: 7d 48 30 10 subfc r10,r8,r6
72c: 7c cb 49 10 subfe r6,r11,r9
730: 7c c6 f1 d6 mullw r6,r6,r30
734: 7d 2a f0 16 mulhwu r9,r10,r30
738: 7d 4a f1 d6 mullw r10,r10,r30
73c: 7c c6 4a 14 add r6,r6,r9
740: 7d 4a e8 14 addc r10,r10,r29
744: 7c c6 e1 14 adde r6,r6,r28
748: 7c c8 fc 30 srw r8,r6,r31
74c: 2c 08 00 00 cmpwi r8,0
750: 20 bf 00 20 subfic r5,r31,32
754: 7d 4a fc 30 srw r10,r10,r31
758: 7c c5 28 30 slw r5,r6,r5
75c: 7c a9 53 78 or r9,r5,r10
760: 41 82 00 78 beq 7d8 <__c_kernel_clock_gettime+0x12c>
764: 3c c0 c4 65 lis r6,-15259
768: 3c 60 3b 9a lis r3,15258
76c: 60 c6 36 00 ori r6,r6,13824
770: 60 63 c9 ff ori r3,r3,51711
774: 7c a9 30 14 addc r5,r9,r6
778: 7d 48 01 d4 addme r10,r8
77c: 2c 0a 00 00 cmpwi r10,0
780: 7d 48 53 78 mr r8,r10
784: 7c a9 2b 78 mr r9,r5
788: 38 e7 00 01 addi r7,r7,1
78c: 40 82 ff e8 bne 774 <__c_kernel_clock_gettime+0xc8>
790: 7c 05 18 40 cmplw r5,r3
794: 41 81 ff e0 bgt 774 <__c_kernel_clock_gettime+0xc8>
798: 7c 07 00 14 addc r0,r7,r0
79c: 90 04 00 00 stw r0,0(r4)
7a0: 91 24 00 04 stw r9,4(r4)
7a4: 38 60 00 00 li r3,0
7a8: 83 81 00 10 lwz r28,16(r1)
7ac: 83 a1 00 14 lwz r29,20(r1)
7b0: 83 c1 00 18 lwz r30,24(r1)
7b4: 83 e1 00 1c lwz r31,28(r1)
7b8: 38 21 00 20 addi r1,r1,32
7bc: 4e 80 00 20 blr
7c0: 4b ff ff 20 b 6e0 <__c_kernel_clock_gettime+0x34>
7c4: 38 00 00 f6 li r0,246
7c8: 44 00 00 02 sc
7cc: 40 a3 00 08 bns 7d4 <__c_kernel_clock_gettime+0x128>
7d0: 7c 63 00 d0 neg r3,r3
7d4: 4e 80 00 20 blr
7d8: 3d 40 3b 9a lis r10,15258
7dc: 61 4a c9 ff ori r10,r10,51711
7e0: 7c 09 50 40 cmplw r9,r10
7e4: 41 81 ff 80 bgt 764 <__c_kernel_clock_gettime+0xb8>
7e8: 4b ff ff b4 b 79c <__c_kernel_clock_gettime+0xf0>
7ec: 71 2a 00 60 andi. r10,r9,96
7f0: 40 82 00 14 bne 804 <__c_kernel_clock_gettime+0x158>
7f4: 71 29 00 10 andi. r9,r9,16
7f8: 41 a2 ff cc beq 7c4 <__c_kernel_clock_gettime+0x118>
7fc: 38 a5 00 f0 addi r5,r5,240
800: 4b ff fe c4 b 6c4 <__c_kernel_clock_gettime+0x18>
804: 54 69 20 36 rlwinm r9,r3,4,0,27
808: 7d 25 4a 14 add r9,r5,r9
80c: 81 05 00 00 lwz r8,0(r5)
810: 71 0a 00 01 andi. r10,r8,1
814: 40 82 00 28 bne 83c <__c_kernel_clock_gettime+0x190>
818: 80 09 00 24 lwz r0,36(r9)
81c: 81 49 00 2c lwz r10,44(r9)
820: 80 e5 00 00 lwz r7,0(r5)
824: 7c 08 38 40 cmplw r8,r7
828: 40 a2 ff e4 bne 80c <__c_kernel_clock_gettime+0x160>
82c: 90 04 00 00 stw r0,0(r4)
830: 91 44 00 04 stw r10,4(r4)
834: 38 60 00 00 li r3,0
838: 4e 80 00 20 blr
83c: 4b ff ff d0 b 80c <__c_kernel_clock_gettime+0x160>



+ return ((u64)hi << 32) | lo;
+}


What does the compiler do for just

static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
{
return ns >> (shift & 31);
}


Worse:

000006ac <__c_kernel_clock_gettime>:
6ac: 28 03 00 0f cmplwi r3,15
6b0: 41 81 01 30 bgt 7e0 <__c_kernel_clock_gettime+0x134>
6b4: 39 20 00 01 li r9,1
6b8: 7d 29 18 30 slw r9,r9,r3
6bc: 71 2a 08 83 andi. r10,r9,2179
6c0: 41 82 01 48 beq 808 <__c_kernel_clock_gettime+0x15c>
6c4: 94 21 ff e0 stwu r1,-32(r1)
6c8: 54 63 20 36 rlwinm r3,r3,4,0,27
6cc: 93 81 00 10 stw r28,16(r1)
6d0: 93 a1 00 14 stw r29,20(r1)
6d4: 93 c1 00 18 stw r30,24(r1)
6d8: 93 e1 00 1c stw r31,28(r1)
6dc: 7c 65 1a 14 add r3,r5,r3
6e0: 80 c5 00 00 lwz r6,0(r5)
6e4: 70 c7 00 01 andi. r7,r6,1
6e8: 40 82 00 f4 bne 7dc <__c_kernel_clock_gettime+0x130>
6ec: 7d 2d 42 e6 mftbu r9
6f0: 7d 0c 42 e6 mftb r8
6f4: 7d 4d 42 e6 mftbu r10
6f8: 7c 09 50 40 cmplw r9,r10
6fc: 40 82 ff f0 bne 6ec <__c_kernel_clock_gettime+0x40>
700: 83 83 00 28 lwz r28,40(r3)
704: 83 c3 00 2c lwz r30,44(r3)
708: 81 65 00 08 lwz r11,8(r5)
70c: 81 45 00 0c lwz r10,12(r5)
710: 83 e5 00 18 lwz r31,24(r5)
714: 81 85 00 1c lwz r12,28(r5)
718: 80 03 00 24 lwz r0,36(r3)
71c: 83 a5 00 00 lwz r29,0(r5)
720: 7c 06 e8 40 cmplw r6,r29
724: 40 a2 ff bc bne 6e0 <__c_kernel_clock_gettime+0x34>
728: 7d 0a 40 10 subfc r8,r10,r8
72c: 7c cb 49 10 subfe r6,r11,r9
730: 7c c6 f9 d6 mullw r6,r6,r31
734: 7d 28 f8 16 mulhwu r9,r8,r31
738: 7d 08 f9 d6 mullw r8,r8,r31
73c: 55 8c 06 fe clrlwi r12,r12,27
740: 7f c8 f0 14 addc r30,r8,r30
744: 7c c6 4a 14 add r6,r6,r9
748: 7c c6 e1 14 adde r6,r6,r28
74c: 34 6c ff e0 addic. r3,r12,-32
750: 41 80 00 70 blt 7c0 <__c_kernel_clock_gettime+0x114>
754: 7c c9 1c 30 srw r9,r6,r3
758: 39 00 00 00 li r8,0
75c: 2c 08 00 00 cmpwi r8,0
760: 41 82 00 94 beq 7f4 <__c_kernel_clock_gettime+0x148>
764: 3c c0 c4 65 lis r6,-15259
768: 3c 60 3b 9a lis r3,15258
76c: 60 c6 36 00 ori r6,r6,13824
770: 60 63 c9 ff ori r3,r3,51711
774: 7c a9 30 14 addc r5,r9,r6
778: 7d 48 01 d4 addme r10,r8
77c: 2c 0a 00 00 cmpwi r10,0
780: 7d 48 53 78 mr r8,r10
784: 7c a9 2b 78 mr r9,r5
788: 38 e7 00 01 addi r7,r7,1
78c: 40 82 ff e8 bne 774 <__c_kernel_clock_gettime+0xc8>
790: 7c 05 18 40 cmplw r5,r3
794: 41 81 ff e0 bgt 774 <__c_kernel_clock_gettime+0xc8>
798: 7c 07 00 14 addc r0,r7,r0
79c: 90 04 00 00 stw r0,0(r4)
7a0: 91 24 00 04 stw r9,4(r4)
7a4: 38 60 00 00 li r3,0
7a8: 83 81 00 10 lwz r28,16(r1)
7ac: 83 a1 00 14 lwz r29,20(r1)
7b0: 83 c1 00 18 lwz r30,24(r1)
7b4: 83 e1 00 1c lwz r31,28(r1)
7b8: 38 21 00 20 addi r1,r1,32
7bc: 4e 80 00 20 blr
7c0: 54 c3 08 3c rlwinm r3,r6,1,0,30
7c4: 21 6c 00 1f subfic r11,r12,31
7c8: 7c 63 58 30 slw r3,r3,r11
7cc: 7f c9 64 30 srw r9,r30,r12
7d0: 7c 69 4b 78 or r9,r3,r9
7d4: 7c c8 64 30 srw r8,r6,r12
7d8: 4b ff ff 84 b 75c <__c_kernel_clock_gettime+0xb0>
7dc: 4b ff ff 04 b 6e0 <__c_kernel_clock_gettime+0x34>
7e0: 38 00 00 f6 li r0,246
7e4: 44 00 00 02 sc
7e8: 40 a3 00 08 bns 7f0 <__c_kernel_clock_gettime+0x144>
7ec: 7c 63 00 d0 neg r3,r3
7f0: 4e 80 00 20 blr
7f4: 3d 40 3b 9a lis r10,15258
7f8: 61 4a c9 ff ori r10,r10,51711
7fc: 7c 09 50 40 cmplw r9,r10
800: 41 81 ff 64 bgt 764 <__c_kernel_clock_gettime+0xb8>
804: 4b ff ff 98 b 79c <__c_kernel_clock_gettime+0xf0>
808: 71 2a 00 60 andi. r10,r9,96
80c: 40 82 00 14 bne 820 <__c_kernel_clock_gettime+0x174>
810: 71 29 00 10 andi. r9,r9,16
814: 41 a2 ff cc beq 7e0 <__c_kernel_clock_gettime+0x134>
818: 38 a5 00 f0 addi r5,r5,240
81c: 4b ff fe a8 b 6c4 <__c_kernel_clock_gettime+0x18>
820: 54 69 20 36 rlwinm r9,r3,4,0,27
824: 7d 25 4a 14 add r9,r5,r9
828: 81 05 00 00 lwz r8,0(r5)
82c: 71 0a 00 01 andi. r10,r8,1
830: 40 82 00 28 bne 858 <__c_kernel_clock_gettime+0x1ac>
834: 80 09 00 24 lwz r0,36(r9)
838: 81 49 00 2c lwz r10,44(r9)
83c: 80 e5 00 00 lwz r7,0(r5)
840: 7c 08 38 40 cmplw r8,r7
844: 40 a2 ff e4 bne 828 <__c_kernel_clock_gettime+0x17c>
848: 90 04 00 00 stw r0,0(r4)
84c: 91 44 00 04 stw r10,4(r4)
850: 38 60 00 00 li r3,0
854: 4e 80 00 20 blr
858: 4b ff ff d0 b 828 <__c_kernel_clock_gettime+0x17c>

Christophe