[RFC V2 31/37] dmem: introduce mce handler

From: yulei . kernel
Date: Mon Dec 07 2020 - 06:36:06 EST


From: Yulei Zhang <yuleixzhang@xxxxxxxxxxx>

dmem handles the MCE if the failing pfn belongs to dmem when an MCE occurs:
1. check whether the pfn belongs to dmem; if it does, dmem handles the error and memory_failure() returns early.
2. mark the pfn in a new error bitmap defined at page granularity.
3. add a series of mechanisms to ensure that the MCE pfn is never allocated.

Signed-off-by: Haiwei Li <lihaiwei@xxxxxxxxxxx>
Signed-off-by: Yulei Zhang <yuleixzhang@xxxxxxxxxxx>
---
include/linux/dmem.h | 6 +++
include/trace/events/dmem.h | 17 ++++++++
mm/dmem.c | 103 +++++++++++++++++++++++++++++++-------------
mm/memory-failure.c | 6 +++
4 files changed, 102 insertions(+), 30 deletions(-)

diff --git a/include/linux/dmem.h b/include/linux/dmem.h
index 59d3ef14..cd17a91 100644
--- a/include/linux/dmem.h
+++ b/include/linux/dmem.h
@@ -21,6 +21,8 @@
void dmem_free_pages(phys_addr_t addr, unsigned int dpages_nr);
bool is_dmem_pfn(unsigned long pfn);
#define dmem_free_page(addr) dmem_free_pages(addr, 1)
+
+bool dmem_memory_failure(unsigned long pfn, int flags);
#else
static inline int dmem_reserve_init(void)
{
@@ -32,5 +34,9 @@ static inline bool is_dmem_pfn(unsigned long pfn)
return 0;
}

+static inline bool dmem_memory_failure(unsigned long pfn, int flags)
+{
+ return false;
+}
#endif
#endif /* _LINUX_DMEM_H */
diff --git a/include/trace/events/dmem.h b/include/trace/events/dmem.h
index 10d1b90..f8eeb3c 100644
--- a/include/trace/events/dmem.h
+++ b/include/trace/events/dmem.h
@@ -62,6 +62,23 @@
TP_printk("addr %#lx dpages_nr %d", (unsigned long)__entry->addr,
__entry->dpages_nr)
);
+
+TRACE_EVENT(dmem_memory_failure,
+ TP_PROTO(unsigned long pfn, bool used),
+ TP_ARGS(pfn, used),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(bool, used)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = pfn;
+ __entry->used = used;
+ ),
+
+ TP_printk("pfn=%#lx used=%d", __entry->pfn, __entry->used)
+);
#endif

/* This part must be outside protection */
diff --git a/mm/dmem.c b/mm/dmem.c
index 50cdff9..16438db 100644
--- a/mm/dmem.c
+++ b/mm/dmem.c
@@ -431,6 +431,41 @@ static void __init dmem_uinit(void)
dmem_pool.registered_pages = 0;
}

+/* set or clear corresponding bit on allocation bitmap based on error bitmap */
+static unsigned long dregion_alloc_bitmap_set_clear(struct dmem_region *dregion,
+ bool set)
+{
+ unsigned long pos_pfn, pos_offset;
+ unsigned long valid_pages, mce_dpages = 0;
+ phys_addr_t dpage, reserved_start_pfn;
+
+ reserved_start_pfn = __phys_to_pfn(dregion->reserved_start_addr);
+
+ valid_pages = dpage_to_pfn(dregion->dpage_end_pfn) - reserved_start_pfn;
+ pos_offset = dpage_to_pfn(dregion->dpage_start_pfn)
+ - reserved_start_pfn;
+try_set:
+ pos_pfn = find_next_bit(dregion->error_bitmap, valid_pages, pos_offset);
+
+ if (pos_pfn >= valid_pages)
+ return mce_dpages;
+ mce_dpages++;
+ dpage = pfn_to_dpage(pos_pfn + reserved_start_pfn);
+ if (set)
+ WARN_ON(__test_and_set_bit(dpage - dregion->dpage_start_pfn,
+ dregion->bitmap));
+ else
+ WARN_ON(!__test_and_clear_bit(dpage - dregion->dpage_start_pfn,
+ dregion->bitmap));
+ pos_offset = dpage_to_pfn(dpage + 1) - reserved_start_pfn;
+ goto try_set;
+}
+
+static unsigned long dmem_region_mark_mce_dpages(struct dmem_region *dregion)
+{
+ return dregion_alloc_bitmap_set_clear(dregion, true);
+}
+
static int __init dmem_region_init(struct dmem_region *dregion)
{
unsigned long *bitmap, nr_pages;
@@ -514,6 +549,8 @@ static int dmem_alloc_region_init(struct dmem_region *dregion,
dregion->dpage_start_pfn = start;
dregion->dpage_end_pfn = end;

+ *dpages -= dmem_region_mark_mce_dpages(dregion);
+
dmem_pool.unaligned_pages += __phys_to_pfn((dpage_to_phys(start)
- dregion->reserved_start_addr));
dmem_pool.unaligned_pages += __phys_to_pfn(dregion->reserved_end_addr
@@ -558,36 +595,6 @@ static bool dmem_dpage_is_error(struct dmem_region *dregion, phys_addr_t dpage)
return err_num;
}

-/* set or clear corresponding bit on allocation bitmap based on error bitmap */
-static unsigned long dregion_alloc_bitmap_set_clear(struct dmem_region *dregion,
- bool set)
-{
- unsigned long pos_pfn, pos_offset;
- unsigned long valid_pages, mce_dpages = 0;
- phys_addr_t dpage, reserved_start_pfn;
-
- reserved_start_pfn = __phys_to_pfn(dregion->reserved_start_addr);
-
- valid_pages = dpage_to_pfn(dregion->dpage_end_pfn) - reserved_start_pfn;
- pos_offset = dpage_to_pfn(dregion->dpage_start_pfn)
- - reserved_start_pfn;
-try_set:
- pos_pfn = find_next_bit(dregion->error_bitmap, valid_pages, pos_offset);
-
- if (pos_pfn >= valid_pages)
- return mce_dpages;
- mce_dpages++;
- dpage = pfn_to_dpage(pos_pfn + reserved_start_pfn);
- if (set)
- WARN_ON(__test_and_set_bit(dpage - dregion->dpage_start_pfn,
- dregion->bitmap));
- else
- WARN_ON(!__test_and_clear_bit(dpage - dregion->dpage_start_pfn,
- dregion->bitmap));
- pos_offset = dpage_to_pfn(dpage + 1) - reserved_start_pfn;
- goto try_set;
-}
-
static void dmem_uinit_check_alloc_bitmap(struct dmem_region *dregion)
{
unsigned long dpages, size;
@@ -989,6 +996,42 @@ void dmem_free_pages(phys_addr_t addr, unsigned int dpages_nr)
}
EXPORT_SYMBOL(dmem_free_pages);

+bool dmem_memory_failure(unsigned long pfn, int flags)
+{
+ struct dmem_region *dregion;
+ struct dmem_node *pdnode = NULL;
+ u64 pos;
+ phys_addr_t addr = __pfn_to_phys(pfn);
+ bool used = false;
+
+ dregion = find_dmem_region(addr, &pdnode);
+ if (!dregion)
+ return false;
+
+ WARN_ON(!pdnode || !dregion->error_bitmap);
+
+ mutex_lock(&dmem_pool.lock);
+ pos = pfn - __phys_to_pfn(dregion->reserved_start_addr);
+ if (__test_and_set_bit(pos, dregion->error_bitmap))
+ goto out;
+
+ if (!dregion->bitmap || pfn < dpage_to_pfn(dregion->dpage_start_pfn) ||
+ pfn >= dpage_to_pfn(dregion->dpage_end_pfn))
+ goto out;
+
+ pos = phys_to_dpage(addr) - dregion->dpage_start_pfn;
+ if (__test_and_set_bit(pos, dregion->bitmap)) {
+ used = true;
+ } else {
+ pr_info("MCE: free dpage, mark %#lx disabled in dmem\n", pfn);
+ dnode_count_free_dpages(pdnode, -1);
+ }
+out:
+ trace_dmem_memory_failure(pfn, used);
+ mutex_unlock(&dmem_pool.lock);
+ return true;
+}
+
bool is_dmem_pfn(unsigned long pfn)
{
struct dmem_node *dnode;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5d880d4..dda45d2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -35,6 +35,7 @@
*/
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/dmem.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
@@ -1323,6 +1324,11 @@ int memory_failure(unsigned long pfn, int flags)
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);

+ if (dmem_memory_failure(pfn, flags)) {
+ pr_info("MCE %#lx: handled by dmem\n", pfn);
+ return 0;
+ }
+
p = pfn_to_online_page(pfn);
if (!p) {
if (pfn_valid(pfn)) {
--
1.8.3.1