Re: [RFC PATCH V1 1/1] sched/numa: Enhance vma scanning logic

From: Raghavendra K T
Date: Tue Jan 24 2023 - 14:18:36 EST


On 1/17/2023 11:15 PM, Raghavendra K T wrote:
On 1/17/2023 8:29 PM, Mel Gorman wrote:
Note that the cc list is excessive for the topic.


Thank you, Mel, for the review. Sorry for the long list (it was generated
by get_maintainer). I will trim the list for V2.

(trimming the list early)
[...]

Nice idea. Thanks again. I will take this as a base patch for expansion.

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f3f196e4d66d..3cebda5cc8a7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -620,6 +620,9 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
      vma->vm_mm = mm;
      vma->vm_ops = &dummy_vm_ops;
      INIT_LIST_HEAD(&vma->anon_vma_chain);
+#ifdef CONFIG_NUMA_BALANCING
+    vma->numab = NULL;
+#endif
  }
  static inline void vma_set_anonymous(struct vm_area_struct *vma)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3b8475007734..3c0cfdde33e0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -526,6 +526,10 @@ struct anon_vma_name {
      char name[];
  };
+struct vma_numab {
+    unsigned long next_scan;
+};
+
  /*
   * This struct describes a virtual memory area. There is one of these
   * per VM-area/task. A VM area is any part of the process virtual memory
@@ -593,6 +597,9 @@ struct vm_area_struct {
  #endif
  #ifdef CONFIG_NUMA
      struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+    struct vma_numab *numab;    /* NUMA Balancing state */
  #endif
      struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
  } __randomize_layout;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..2d34c484553d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -481,6 +481,9 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
  void vm_area_free(struct vm_area_struct *vma)
  {
+#ifdef CONFIG_NUMA_BALANCING
+    kfree(vma->numab);
+#endif
      free_anon_vma_name(vma);
      kmem_cache_free(vm_area_cachep, vma);
  }

While running mmtests kernbench on a 256-pcpu machine, I hit the BUG()
below (it is not reproducible in the normal boot flow otherwise):

[ 716.825398] kernel BUG at mm/slub.c:419!
[ 716.825736] invalid opcode: 0000 [#146] PREEMPT SMP NOPTI
[ 716.826042] CPU: 232 PID: 364844 Comm: cc1 Tainted: G D W 6.1.0-test-snp-host-a7065246cf78+ #44
[ 716.826345] Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.6.6 01/13/2022
[ 716.826645] RIP: 0010:__kmem_cache_free+0x2a4/0x2c0
[ 716.826941] Code: ff e9 32 ff ff ff 49 8b 47 08 f0 48 83 28 01 0f 85 9b fe ff ff 49 8b 47 08 4c 89 ff 48 8b 40 08 e8 a1 c5 cc 00 e9 86 fe ff ff <0f> 0b 48 8b 15 63 d6 4d 01 e9 85 fd ff ff 66 66 2e 0f 1f 84 00 00
[ 716.827550] RSP: 0018:ffffb0b070547c28 EFLAGS: 00010246
[ 716.827865] RAX: ffff990fa6bf1310 RBX: ffff990fa6bf1310 RCX: ffff990fa6bf1310
[ 716.828180] RDX: 00000000001000e8 RSI: 0000000000000000 RDI: ffff98d000044200
[ 716.828503] RBP: ffffb0b070547c50 R08: ffff98d030f222e0 R09: 0000000000000001
[ 716.828821] R10: ffff990ff6d298b0 R11: ffff98d030f226a0 R12: ffff98d000044200
[ 716.829139] R13: ffffd605c29afc40 R14: ffffffff9e89c20f R15: ffffb0b070547d58
[ 716.829458] FS: 00007f05f4cebac0(0000) GS:ffff994e00800000(0000) knlGS:0000000000000000
[ 716.829781] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 716.830105] CR2: 00007f05e9cbc002 CR3: 00000040eea7c005 CR4: 0000000000770ee0
[ 716.830432] PKRU: 55555554
[ 716.830749] Call Trace:
[ 716.831057] <TASK>
[ 716.831360] kfree+0x79/0x120
[ 716.831664] vm_area_free+0x1f/0x50
[ 716.831970] vma_expand+0x311/0x3e0
[ 716.832274] mmap_region+0x772/0x900
[ 716.832571] do_mmap+0x3c0/0x5e0
[ 716.832866] ? __this_cpu_preempt_check+0x13/0x20
[ 716.833165] ? security_mmap_file+0xa1/0xc0
[ 716.833458] vm_mmap_pgoff+0xd5/0x170
[ 716.833745] ksys_mmap_pgoff+0x46/0x210
[ 716.834022] __x64_sys_mmap+0x33/0x50
[ 716.834291] do_syscall_64+0x3b/0x90
[ 716.834549] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 716.834806] RIP: 0033:0x7f05f471ebd7
[ 716.835054] Code: 00 00 00 89 ef e8 59 ae ff ff eb e4 e8 62 7b 01 00 66 90 f3 0f 1e fa 41 89 ca 41 f7 c1 ff 0f 00 00 75 10 b8 09 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 21 c3 48 8b 05 29 a2 0f 00 64 c7 00 16 00 00
[ 716.835567] RSP: 002b:00007fff24c27ae8 EFLAGS: 00000246 ORIG_RAX: 0000000000000009
[ 716.835826] RAX: ffffffffffffffda RBX: 0000000000200000 RCX: 00007f05f471ebd7
[ 716.836077] RDX: 0000000000000003 RSI: 0000000000200000 RDI: 0000000000000000
[ 716.836323] RBP: 0000000000000000 R08: 00000000ffffffff R09: 0000000000000000
[ 716.836567] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000000038
[ 716.836808] R13: 0000000000001fff R14: 0000000000000044 R15: 0000000000000048
[ 716.837049] </TASK>
[ 716.837285] Modules linked in: tls ipmi_ssif binfmt_misc nls_iso8859_1 joydev input_leds intel_rapl_msr intel_rapl_common amd64_edac edac_mce_amd hid_generic kvm_amd dell_smbios dcdbas wmi_bmof dell_wmi_descriptor kvm usbhid hid ccp k10temp wmi ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter mac_hid sch_fq_codel dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr efi_pstore ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops crct10dif_pclmul i2c_algo_bit crc32_pclmul drm_shmem_helper ghash_clmulni_intel nvme aesni_intel crypto_simd cryptd tg3 drm nvme_core megaraid_sas ahci xhci_pci i2c_piix4 xhci_pci_renesas libahci
[ 716.839185] ---[ end trace 0000000000000000 ]---

It looks like we additionally have to handle numab initialization in the
vm_area_dup() code path. Something like the change below fixed it
(copy-pasted from a tty):

diff --git a/kernel/fork.c b/kernel/fork.c
index 08969f5aa38d..f5b2e41296c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -475,12 +475,18 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
dup_anon_vma_name(orig, new);
+#ifdef CONFIG_NUMA_BALANCING
+ new->numab = NULL;
+#endif
}
return new;
}

Does this look okay? If so, I will fold it into the V2 spin (in the
vma_scan_delay patch), hoping you are okay with this change and do not
see any other changes required.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c36aa54ae071..6a1cffdfc76b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3027,6 +3027,23 @@ static void task_numa_work(struct callback_head *work)
          if (!vma_is_accessible(vma))
              continue;
+        /* Initialise new per-VMA NUMAB state. */
+        if (!vma->numab) {
+            vma->numab = kzalloc(sizeof(struct vma_numab), GFP_KERNEL);
+            if (!vma->numab)
+                continue;
+
+            vma->numab->next_scan = now +
+                msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+        }
+
+        /*
+         * After the first scan is complete, delay the balancing scan
+         * for new VMAs.
+         */
+        if (mm->numa_scan_seq && time_before(jiffies, vma->numab->next_scan))
+            continue;
+
          do {
              start = max(start, vma->vm_start);
              end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);