Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more

From: Peter Zijlstra
Date: Mon May 25 2020 - 07:01:08 EST


On Mon, May 25, 2020 at 12:40:38PM +0200, Peter Zijlstra wrote:
> On Mon, May 25, 2020 at 12:02:48PM +0200, Rasmus Villemoes wrote:
>
> > Naive question: did you check disassembly to see whether gcc threw your
> > native_get_debugreg() away, given that the asm isn't volatile and the
> > result is not used for anything? Testing here only shows a "mov
> > %r9,%db7", but the read did seem to get thrown away.
>
> Argh.. no I did not. Writing it all in asm gets me:
>
> [ 1.627405] XXX: 3900 8304 22632
>
> which is a lot worse...

+ u64 empty = 0, read = 0, write = 0, cpu = 0, cpu1 = 0;
+ unsigned long dr7;
+
+ for (i=0; i<100; i++) {
+ u64 s;
+
+ s = rdtsc();
+ asm volatile ("lfence; lfence;");
+ empty += rdtsc() - s;
+
+ s = rdtsc();
+ asm volatile ("lfence; mov %%db7, %0; lfence;" : "=r" (dr7));
+ read += rdtsc() - s;
+
+ s = rdtsc();
+ asm volatile ("lfence; mov %0, %%db7; lfence;" :: "r" (dr7));
+ write += rdtsc() - s;
+
+ s = rdtsc();
+ asm volatile ("lfence; mov %0, %%db7; lfence;" :: "r" (dr7));
+ write += rdtsc() - s;
+
+ clflush(this_cpu_ptr(&cpu_dr7));
+
+ s = rdtsc();
+ asm volatile ("lfence;");
+ dr7 = this_cpu_read(cpu_dr7);
+ asm volatile ("lfence;");
+ cpu += rdtsc() - s;
+
+ s = rdtsc();
+ asm volatile ("lfence;");
+ dr7 = this_cpu_read(cpu_dr7);
+ asm volatile ("lfence;");
+ cpu1 += rdtsc() - s;
+ }
+
+ printk("XXX: %ld %ld %ld %ld %ld\n", empty, read, write, cpu, cpu1);

[ 1.628252] XXX: 3820 8224 45516 35560 4800

Which still seems to suggest using DR7 directly is probably a good
thing. It's slower than a L1 hit, but massively faster than a full miss.

---

11f: 0f 31 rdtsc
121: 48 89 d1 mov %rdx,%rcx
124: 48 89 c6 mov %rax,%rsi
127: 0f ae e8 lfence
12a: 0f ae e8 lfence
12d: 0f 31 rdtsc
12f: 48 c1 e2 20 shl $0x20,%rdx
133: 48 c1 e1 20 shl $0x20,%rcx
137: 48 09 c2 or %rax,%rdx
13a: 48 09 f1 or %rsi,%rcx
13d: 48 29 ca sub %rcx,%rdx
140: 48 01 d3 add %rdx,%rbx
143: 0f 31 rdtsc
145: 48 89 d1 mov %rdx,%rcx
148: 48 89 c6 mov %rax,%rsi
14b: 0f ae e8 lfence
14e: 41 0f 21 fb mov %db7,%r11
152: 0f ae e8 lfence
155: 0f 31 rdtsc
157: 48 c1 e2 20 shl $0x20,%rdx
15b: 48 c1 e1 20 shl $0x20,%rcx
15f: 48 09 c2 or %rax,%rdx
162: 48 09 f1 or %rsi,%rcx
165: 48 29 ca sub %rcx,%rdx
168: 48 01 d5 add %rdx,%rbp
16b: 0f 31 rdtsc
16d: 48 89 d6 mov %rdx,%rsi
170: 49 89 c1 mov %rax,%r9
173: 0f ae e8 lfence
176: 41 0f 23 fb mov %r11,%db7
17a: 0f ae e8 lfence
17d: 0f 31 rdtsc
17f: 48 89 d7 mov %rdx,%rdi
182: 49 89 c2 mov %rax,%r10
185: 0f 31 rdtsc
187: 48 89 d1 mov %rdx,%rcx
18a: 49 89 c0 mov %rax,%r8
18d: 0f ae e8 lfence
190: 41 0f 23 fb mov %r11,%db7
194: 0f ae e8 lfence
197: 0f 31 rdtsc
199: 48 c1 e2 20 shl $0x20,%rdx
19d: 48 c1 e6 20 shl $0x20,%rsi
1a1: 48 09 c2 or %rax,%rdx
1a4: 48 89 f8 mov %rdi,%rax
1a7: 48 c1 e1 20 shl $0x20,%rcx
1ab: 48 c1 e0 20 shl $0x20,%rax
1af: 49 09 f1 or %rsi,%r9
1b2: 49 09 c8 or %rcx,%r8
1b5: 49 09 c2 or %rax,%r10
1b8: 4a 8d 04 12 lea (%rdx,%r10,1),%rax
1bc: 48 c7 c2 00 00 00 00 mov $0x0,%rdx
1bf: R_X86_64_32S cpu_dr7
1c3: 4c 29 c8 sub %r9,%rax
1c6: 4c 29 c0 sub %r8,%rax
1c9: 49 01 c4 add %rax,%r12
1cc: 48 89 14 24 mov %rdx,(%rsp)
1d0: 48 89 54 24 08 mov %rdx,0x8(%rsp)
1d5: e8 00 00 00 00 callq 1da <sched_init+0xe1>
1d6: R_X86_64_PLT32 debug_smp_processor_id-0x4
1da: 48 c7 c1 00 00 00 00 mov $0x0,%rcx
1dd: R_X86_64_32S __per_cpu_offset
1e1: 48 8b 14 24 mov (%rsp),%rdx
1e5: 89 c0 mov %eax,%eax
1e7: 48 03 14 c1 add (%rcx,%rax,8),%rdx
1eb: 0f ae 3a clflush (%rdx)
1ee: 0f 31 rdtsc
1f0: 48 89 d1 mov %rdx,%rcx
1f3: 48 89 c6 mov %rax,%rsi
1f6: 0f ae e8 lfence
1f9: 65 48 8b 05 00 00 00 mov %gs:0x0(%rip),%rax # 201 <sched_init+0x108>
200: 00
1fd: R_X86_64_PC32 cpu_dr7-0x4
201: 0f ae e8 lfence
204: 0f 31 rdtsc
206: 48 c1 e2 20 shl $0x20,%rdx
20a: 48 c1 e1 20 shl $0x20,%rcx
20e: 48 09 c2 or %rax,%rdx
211: 48 09 f1 or %rsi,%rcx
214: 48 29 ca sub %rcx,%rdx
217: 49 01 d5 add %rdx,%r13
21a: 0f 31 rdtsc
21c: 48 89 d1 mov %rdx,%rcx
21f: 48 89 c6 mov %rax,%rsi
222: 0f ae e8 lfence
225: 65 48 8b 05 00 00 00 mov %gs:0x0(%rip),%rax # 22d <sched_init+0x134>
22c: 00
229: R_X86_64_PC32 cpu_dr7-0x4
22d: 0f ae e8 lfence
230: 0f 31 rdtsc
232: 48 c1 e2 20 shl $0x20,%rdx
236: 48 c1 e1 20 shl $0x20,%rcx
23a: 48 09 c2 or %rax,%rdx
23d: 48 09 f1 or %rsi,%rcx
240: 48 29 ca sub %rcx,%rdx
243: 49 01 d6 add %rdx,%r14
246: 41 ff cf dec %r15d
249: 0f 85 d0 fe ff ff jne 11f <sched_init+0x26>