[PATCH] kdump, vmcore: create linear direct mapping for old memory

From: HATAYAMA Daisuke
Date: Sat Dec 15 2012 - 08:49:31 EST


[Warning: This is a very experimental, buggy patch!]

In the current implementation of /proc/vmcore, the crashed kernel's
memory, called old memory in the surrounding code, is read from the
2nd kernel through ioremap one page at a time, which causes a big
performance impact on terabyte-class machines.
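
For reference, every page read currently takes a full ioremap/iounmap
round trip. Below is a simplified sketch of the x86 per-page path,
modeled on copy_oldmem_page() in arch/x86/kernel/crash_dump_64.c
(paraphrased from memory, not quoted verbatim):

ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
			 unsigned long offset, int userbuf)
{
	void *vaddr;

	if (!csize)
		return 0;

	/* Map exactly one page, copy, unmap -- repeated for every
	 * page of the dump. */
	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
	if (!vaddr)
		return -ENOMEM;

	if (userbuf) {
		if (copy_to_user(buf, vaddr + offset, csize)) {
			iounmap(vaddr);
			return -EFAULT;
		}
	} else
		memcpy(buf, vaddr + offset, csize);

	iounmap(vaddr);
	return csize;
}

Setting up and tearing down a 4KB mapping for every single page of a
terabyte image is where the overhead comes from.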

To address the issue, it's not enough to change the ioremap unit size
from one page to something larger, since ioremap never uses 1GB or 2MB
pages for its mappings --- I guess the vast majority of ioremap users
don't need to map such large regions. On a terabyte-class machine,
mapping everything with 4KB pages leads to gigabytes of page tables.
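
To put rough numbers on the page-table cost (back-of-the-envelope for
x86_64, 8 bytes per page-table entry):

  1TB / 4KB pages = 256M PTEs * 8 bytes = ~2GB of last-level tables
  1TB / 2MB pages = 512K PMDs * 8 bytes = ~4MB
  1TB / 1GB pages = 1024 PUDs * 8 bytes =  8KB

So 2MB or 1GB pages shrink the page-table footprint by three or more
orders of magnitude.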

Instead, this patch creates a linear direct mapping for the old
memory, which does use 1GB and 2MB pages. This carries much less risk
of page-table memory consumption than ioremap does.

Note:

So far I have confirmed this patch works well only on a small
1GB-memory KVM guest. On a 32GB-memory machine, I encountered some
kind of scheduler BUG during boot of the 2nd kernel; my guess is that
my code is doing something wrong around init_memory_mapping().

Signed-off-by: HATAYAMA Daisuke <d.hatayama@xxxxxxxxxxxxxx>
---
fs/proc/vmcore.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0d5071d..739bd04 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -116,6 +116,49 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
 	return read;
 }
 
+/* Read from the old memory through the linear direct mapping rather
+ * than through a per-page ioremap. */
+static ssize_t read_from_oldmem_noioremap(char *buf, size_t count,
+					  u64 *ppos, int userbuf)
+{
+	unsigned long pfn, offset;
+	size_t nr_bytes;
+	ssize_t read = 0;
+
+	if (!count)
+		return 0;
+
+	offset = (unsigned long)(*ppos % PAGE_SIZE);
+	pfn = (unsigned long)(*ppos / PAGE_SIZE);
+
+	do {
+		if (count > (PAGE_SIZE - offset))
+			nr_bytes = PAGE_SIZE - offset;
+		else
+			nr_bytes = count;
+
+		/* If pfn is not ram, return zeros for sparse dump files */
+		if (pfn_is_ram(pfn) == 0)
+			memset(buf, 0, nr_bytes);
+		else {
+			void *vaddr = pfn_to_kaddr(pfn);
+
+			if (userbuf) {
+				if (copy_to_user(buf, vaddr + offset, nr_bytes))
+					return -EFAULT;
+			} else
+				memcpy(buf, vaddr + offset, nr_bytes);
+		}
+		*ppos += nr_bytes;
+		count -= nr_bytes;
+		buf += nr_bytes;
+		read += nr_bytes;
+		++pfn;
+		offset = 0;
+	} while (count);
+
+	return read;
+}
+
 /* Maps vmcore file offset to respective physical address in memory. */
 static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
 						struct vmcore **m_ptr)
@@ -137,6 +180,22 @@ static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
 	return 0;
 }
 
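+/* Extend the kernel's linear direct mapping to cover every old memory
+ * region in vc_list, so that reads can go through pfn_to_kaddr()
+ * instead of a per-page ioremap. */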
+static void init_memory_mapping_oldmem(struct list_head *vc_list)
+{
+	struct vmcore *m;
+
+	list_for_each_entry(m, vc_list, list) {
+		unsigned long last_mapped_pfn;
+
+		last_mapped_pfn = init_memory_mapping(m->paddr,
+						      m->paddr + m->size);
+		if (last_mapped_pfn > max_pfn_mapped)
+			max_pfn_mapped = last_mapped_pfn;
+		printk(KERN_INFO "vmcore: map %016llx-%016llx\n",
+		       m->paddr, m->paddr + m->size - 1);
+	}
+}
+
 /* Read from the ELF header and then the crash dump. On error, negative value is
  * returned otherwise number of bytes read are returned.
  */
@@ -184,9 +243,11 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 		tsz = nr_bytes;
 
 	while (buflen) {
-		tmp = read_from_oldmem(buffer, tsz, &start, 1);
-		if (tmp < 0)
+		tmp = read_from_oldmem_noioremap(buffer, tsz, &start, 1);
+		if (tmp < 0) {
+			printk(KERN_ERR "vmcore: failed to read oldmem: %016llx\n", start);
 			return tmp;
+		}
 		buflen -= tsz;
 		*fpos += tsz;
 		buffer += tsz;
@@ -677,6 +738,9 @@ static int __init parse_crash_elf_headers(void)
 		       " sane\n");
 		return -EINVAL;
 	}
+
+	init_memory_mapping_oldmem(&vmcore_list);
+
 	return 0;
 }

--
1.7.7.6

