Re: [PATCH 06/19] x86, mm: setup page table in top-down

From: Konrad Rzeszutek Wilk
Date: Mon Oct 22 2012 - 11:18:24 EST


On Thu, Oct 18, 2012 at 01:50:17PM -0700, Yinghai Lu wrote:
> Get pgt_buf early from BRK, and use it to map PMD_SIZE from top at first.
> then use mapped pages to map more range below, and keep looping until
^Then ^-s
> all pages get mapped.
>
> alloc_low_page will use page from BRK at first, after that buff is used up,
^^^^ buffer
> will use memblock to find and reserve page for page table usage.
^^^^ - pages

You might want to mention how 'memblock' searches for regions.
Presumably it is from top to bottom.


And also explain the granularity of the sizes you are mapping
_after_ you are done with the PMD_SIZE.


>
> At last we could get rid of calculation and find early pgt related code.
^^^^^ - can

>
> -v2: update to after fix_xen change,
> also use MACRO for initial pgt_buf size and add comments with it.
> -v3: skip big reserved range in memblock.reserved near end.
> -v4: don't need fix_xen change now.
>
> Suggested-by: "H. Peter Anvin" <hpa@xxxxxxxxx>
> Signed-off-by: Yinghai Lu <yinghai@xxxxxxxxxx>
> ---
> arch/x86/include/asm/page_types.h | 1 +
> arch/x86/include/asm/pgtable.h | 1 +
> arch/x86/kernel/setup.c | 3 +
> arch/x86/mm/init.c | 207 ++++++++++--------------------------
> arch/x86/mm/init_32.c | 17 +++-
> arch/x86/mm/init_64.c | 17 +++-
> 6 files changed, 91 insertions(+), 155 deletions(-)
>
> diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
> index 54c9787..9f6f3e6 100644
> --- a/arch/x86/include/asm/page_types.h
> +++ b/arch/x86/include/asm/page_types.h
> @@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr);
>
> extern unsigned long max_low_pfn_mapped;
> extern unsigned long max_pfn_mapped;
> +extern unsigned long min_pfn_mapped;

Why not call it 'last_min_pfn_mapped'? It looks to be keyed
off the last 'start' of memory we mapped and keeps on decreasing.
>
> static inline phys_addr_t get_max_mapped(void)
> {
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index dd1a888..6991a3e 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd)
>
> extern int direct_gbpages;
> void init_mem_mapping(void);
> +void early_alloc_pgt_buf(void);
>
> /* local pte updates need not use xchg for locking */
> static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index e72e4c6..73cb7ba 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -124,6 +124,7 @@
> */
> unsigned long max_low_pfn_mapped;
> unsigned long max_pfn_mapped;
> +unsigned long min_pfn_mapped;
>
> #ifdef CONFIG_DMI
> RESERVE_BRK(dmi_alloc, 65536);
> @@ -897,6 +898,8 @@ void __init setup_arch(char **cmdline_p)
>
> reserve_ibft_region();
>
> + early_alloc_pgt_buf();
> +
> /*
> * Need to conclude brk, before memblock_x86_fill()
> * it could use memblock_find_in_range, could overlap with
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index dbb2916..9ff29c1 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start;
> unsigned long __meminitdata pgt_buf_end;
> unsigned long __meminitdata pgt_buf_top;
>
> +/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
> +#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE)
> +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
> +void __init early_alloc_pgt_buf(void)
> +{
> + unsigned long tables = INIT_PGT_BUF_SIZE;
> + phys_addr_t base;
> +
> + base = __pa(extend_brk(tables, PAGE_SIZE));
> +
> + pgt_buf_start = base >> PAGE_SHIFT;
> + pgt_buf_end = pgt_buf_start;
> + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
> +}
> +
> int after_bootmem;
>
> int direct_gbpages
> @@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
> return nr_range;
> }
>
> -static unsigned long __init calculate_table_space_size(unsigned long start,
> - unsigned long end)
> -{
> - unsigned long puds = 0, pmds = 0, ptes = 0, tables;
> - struct map_range mr[NR_RANGE_MR];
> - int nr_range, i;
> -
> - pr_info("calculate_table_space_size: [mem %#010lx-%#010lx]\n",
> - start, end - 1);
> -
> - memset(mr, 0, sizeof(mr));
> - nr_range = 0;
> - nr_range = split_mem_range(mr, nr_range, start, end);
> -
> - for (i = 0; i < nr_range; i++) {
> - unsigned long range, extra;
> -
> - range = mr[i].end - mr[i].start;
> - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
> -
> - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
> - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
> - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
> - } else
> - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
> -
> - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
> - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
> -#ifdef CONFIG_X86_32
> - extra += PMD_SIZE;
> -#endif
> - /* The first 2/4M doesn't use large pages. */
> - if (mr[i].start < PMD_SIZE)
> - extra += PMD_SIZE - mr[i].start;
> -
> - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
> - } else
> - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
> - }
> -
> - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
> - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
> - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
> -
> -#ifdef CONFIG_X86_32
> - /* for fixmap */
> - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
> -#endif
> -
> - return tables;
> -}
> -
> -static unsigned long __init calculate_all_table_space_size(void)
> -{
> - unsigned long start_pfn, end_pfn;
> - unsigned long tables;
> - int i;
> -
> - /* the ISA range is always mapped regardless of memory holes */
> - tables = calculate_table_space_size(0, ISA_END_ADDRESS);
> -
> - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
> - u64 start = start_pfn << PAGE_SHIFT;
> - u64 end = end_pfn << PAGE_SHIFT;
> -
> - if (end <= ISA_END_ADDRESS)
> - continue;
> -
> - if (start < ISA_END_ADDRESS)
> - start = ISA_END_ADDRESS;
> -#ifdef CONFIG_X86_32
> - /* on 32 bit, we only map up to max_low_pfn */
> - if ((start >> PAGE_SHIFT) >= max_low_pfn)
> - continue;
> -
> - if ((end >> PAGE_SHIFT) > max_low_pfn)
> - end = max_low_pfn << PAGE_SHIFT;
> -#endif
> - tables += calculate_table_space_size(start, end);
> - }
> -
> - return tables;
> -}
> -
> -static void __init find_early_table_space(unsigned long start,
> - unsigned long good_end,
> - unsigned long tables)
> -{
> - phys_addr_t base;
> -
> - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
> - if (!base)
> - panic("Cannot find space for the kernel page tables");
> -
> - pgt_buf_start = base >> PAGE_SHIFT;
> - pgt_buf_end = pgt_buf_start;
> - pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
> -}
> -
> static struct range pfn_mapped[E820_X_MAX];
> static int nr_pfn_mapped;
>
> @@ -391,17 +307,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
> }
>
> /*
> - * Iterate through E820 memory map and create direct mappings for only E820_RAM
> - * regions. We cannot simply create direct mappings for all pfns from
> - * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in
> - * high addresses that cannot be marked as UC by fixed/variable range MTRRs.
> - * Depending on the alignment of E820 ranges, this may possibly result in using
> - * smaller size (i.e. 4K instead of 2M or 1G) page tables.
> + * this one could take range with hole in it

You forgot a period at the end.

> */
> -static void __init init_range_memory_mapping(unsigned long range_start,
> +static unsigned long __init init_range_memory_mapping(
> + unsigned long range_start,
> unsigned long range_end)
> {
> unsigned long start_pfn, end_pfn;
> + unsigned long mapped_ram_size = 0;
> int i;
>
> for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
> @@ -421,71 +334,67 @@ static void __init init_range_memory_mapping(unsigned long range_start,
> end = range_end;
>
> init_memory_mapping(start, end);
> +
> + mapped_ram_size += end - start;
> }
> +
> + return mapped_ram_size;
> }
>
> void __init init_mem_mapping(void)
> {
> - unsigned long tables, good_end, end;
> + unsigned long end, real_end, start, last_start;
> + unsigned long step_size;
> + unsigned long addr;
> + unsigned long mapped_ram_size = 0;
> + unsigned long new_mapped_ram_size;
>
> probe_page_size_mask();
>
> - /*
> - * Find space for the kernel direct mapping tables.
> - *
> - * Later we should allocate these tables in the local node of the
> - * memory mapped. Unfortunately this is done currently before the
> - * nodes are discovered.
> - */
> #ifdef CONFIG_X86_64
> end = max_pfn << PAGE_SHIFT;
> - good_end = end;
> #else
> end = max_low_pfn << PAGE_SHIFT;
> - good_end = max_pfn_mapped << PAGE_SHIFT;
> #endif
> - tables = calculate_all_table_space_size();
> - find_early_table_space(0, good_end, tables);
> - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n",
> - end - 1, pgt_buf_start << PAGE_SHIFT,
> - (pgt_buf_top << PAGE_SHIFT) - 1);
>
> - max_pfn_mapped = 0; /* will get exact value next */
> /* the ISA range is always mapped regardless of memory holes */
> init_memory_mapping(0, ISA_END_ADDRESS);
> - init_range_memory_mapping(ISA_END_ADDRESS, end);
> +
> + /* xen has big range in reserved near end of ram, skip it at first */

How does feeding ISA_END_ADDRESS into memblock_find_in_range() skip it?
> + addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
> + PAGE_SIZE);
> + real_end = addr + PMD_SIZE;
> +
> + /* step_size need to be small so pgt_buf from BRK could cover it */
> + step_size = PMD_SIZE;
> + max_pfn_mapped = 0; /* will get exact value next */

'next' meaning in init_range_memory_mapping()? Might want to spell that out.

> + min_pfn_mapped = real_end >> PAGE_SHIFT;
> + last_start = start = real_end;

Might want to add a comment here saying:

"We are looping from the top to the bottom."

> + while (last_start > ISA_END_ADDRESS) {
> + if (last_start > step_size) {
> + start = round_down(last_start - 1, step_size);
> + if (start < ISA_END_ADDRESS)
> + start = ISA_END_ADDRESS;
> + } else
> + start = ISA_END_ADDRESS;
> + new_mapped_ram_size = init_range_memory_mapping(start,
> + last_start);
> + last_start = start;
> + min_pfn_mapped = last_start >> PAGE_SHIFT;
> + if (new_mapped_ram_size > mapped_ram_size)
> + step_size <<= 5;

Should '5' have a #define value?

> + mapped_ram_size += new_mapped_ram_size;
> + }

It looks like the step_size would keep on increasing on every loop.
First it would be 2MB, then 64MB, then 2GB, and so on - until the amount
of memory that has been mapped is greater than the amount still unmapped.
Is that right?

I am basing that assumption on "new_mapped_ram_size" being the size
of the newly mapped region (start, last_start) in bytes, and
'mapped_ram_size' being the size of the previously mapped region
plus all the other ones.

The logic being that at the start of execution you start with a 2MB step,
compare the mapped size to 0, and increase step_size up to 64MB. Then you map
real_end-2MB-step_size -> real_end-2MB-1. That gets you a 64MB chunk.

Since new_mapped_ram_size (64MB) > mapped_ram_size (2MB)
you increase step_size once more.

If so, you should also explain that in the git commit description and
in a comment next to the loop logic.

> +
> + if (real_end < end)
> + init_range_memory_mapping(real_end, end);
> +
> #ifdef CONFIG_X86_64
> if (max_pfn > max_low_pfn) {
> /* can we preseve max_low_pfn ?*/
> max_low_pfn = max_pfn;
> }
> #endif
> - /*
> - * Reserve the kernel pagetable pages we used (pgt_buf_start -
> - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
> - * so that they can be reused for other purposes.
> - *
> - * On native it just means calling memblock_reserve, on Xen it also
> - * means marking RW the pagetable pages that we allocated before
> - * but that haven't been used.
> - *
> - * In fact on xen we mark RO the whole range pgt_buf_start -
> - * pgt_buf_top, because we have to make sure that when
> - * init_memory_mapping reaches the pagetable pages area, it maps
> - * RO all the pagetable pages, including the ones that are beyond
> - * pgt_buf_end at that time.
> - */
> - if (pgt_buf_end > pgt_buf_start) {
> - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n",
> - end - 1, pgt_buf_start << PAGE_SHIFT,
> - (pgt_buf_end << PAGE_SHIFT) - 1);
> - x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
> - PFN_PHYS(pgt_buf_end));
> - }
> -
> - /* stop the wrong using */
> - pgt_buf_top = 0;
> -
> early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
> }
>
> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
> index 27f7fc6..7bb1106 100644
> --- a/arch/x86/mm/init_32.c
> +++ b/arch/x86/mm/init_32.c
> @@ -61,11 +61,22 @@ bool __read_mostly __vmalloc_start_set = false;
>
> static __init void *alloc_low_page(void)
> {
> - unsigned long pfn = pgt_buf_end++;
> + unsigned long pfn;
> void *adr;
>
> - if (pfn >= pgt_buf_top)
> - panic("alloc_low_page: ran out of memory");
> + if ((pgt_buf_end + 1) >= pgt_buf_top) {
> + unsigned long ret;
> + if (min_pfn_mapped >= max_pfn_mapped)
> + panic("alloc_low_page: ran out of memory");
> + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
> + max_pfn_mapped << PAGE_SHIFT,
> + PAGE_SIZE, PAGE_SIZE);
> + if (!ret)
> + panic("alloc_low_page: can not alloc memory");
> + memblock_reserve(ret, PAGE_SIZE);
> + pfn = ret >> PAGE_SHIFT;
> + } else
> + pfn = pgt_buf_end++;
>
> adr = __va(pfn * PAGE_SIZE);
> clear_page(adr);
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 4898e80..7dfa69b 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -316,7 +316,7 @@ void __init cleanup_highmap(void)
>
> static __ref void *alloc_low_page(unsigned long *phys)
> {
> - unsigned long pfn = pgt_buf_end++;
> + unsigned long pfn;
> void *adr;
>
> if (after_bootmem) {
> @@ -326,8 +326,19 @@ static __ref void *alloc_low_page(unsigned long *phys)
> return adr;
> }
>
> - if (pfn >= pgt_buf_top)
> - panic("alloc_low_page: ran out of memory");
> + if ((pgt_buf_end + 1) >= pgt_buf_top) {
> + unsigned long ret;
> + if (min_pfn_mapped >= max_pfn_mapped)
> + panic("alloc_low_page: ran out of memory");
> + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
> + max_pfn_mapped << PAGE_SHIFT,
> + PAGE_SIZE, PAGE_SIZE);
> + if (!ret)
> + panic("alloc_low_page: can not alloc memory");
> + memblock_reserve(ret, PAGE_SIZE);
> + pfn = ret >> PAGE_SHIFT;
> + } else
> + pfn = pgt_buf_end++;
>
> adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
> clear_page(adr);
> --
> 1.7.7
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/