Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size [try #3]

From: David Howells
Date: Mon Jun 14 2004 - 05:50:54 EST



Here's version 3 of the patch, incorporating comments from Andrew Morton.

Andrew Morton <akpm@xxxxxxxx> wrote:
> We have an opportunity to make this loop less baroque.

That seems reasonable. gcc's optimiser should be able to render them the same.

> whereas in other places you _have_ made the definition of __init functions
> include the "__init". I'm not sure which is best, really, but it seems
> better to include the __init in the declaration.

You're probably right. This should probably apply to vfs_caches_init() and
inode_init() too.

> What's the attribute((pure)) for?

It tells gcc that the function's result only depends on its arguments... it
makes no references at all to external data. It's like __attribute__((const))
but stronger. This means that gcc can generate code that caches the result.

See the gcc manual.

> It generates a warning on older gcc - please use __attribute_pure__.

We still support gcc's that old?

> The other four or five implementations of log2() use ffx(~n).

Yes... but ffs() and ffz() take int args, not long args. I suspect that
shouldn't matter (that would require the hash table to be calculated at 4Gig
buckets in size or greater to be a problem), but why take the chance when we
can avoid it easily?

David

Signed-off-by: Paul Mackerras <paulus@xxxxxxxxx>
Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
diff -uNrp linux-2.6.7-rc3/fs/dcache.c linux-2.6.7-rc3-hash/fs/dcache.c
--- linux-2.6.7-rc3/fs/dcache.c 2004-06-11 09:36:25.000000000 +0100
+++ linux-2.6.7-rc3-hash/fs/dcache.c 2004-06-14 10:53:11.297499326 +0100
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/swap.h>
+#include <linux/bootmem.h>

#define DCACHE_PARANOIA 1
/* #define DCACHE_DEBUG 1 */
@@ -1561,13 +1562,25 @@ static int __init set_dhash_entries(char
}
__setup("dhash_entries=", set_dhash_entries);

-static void __init dcache_init(unsigned long mempages)
+static void __init dcache_init_early(void)
{
- struct hlist_head *d;
- unsigned long order;
- unsigned int nr_hash;
- int i;
+ int loop;
+
+ dentry_hashtable =
+ alloc_large_system_hash("Dentry cache",
+ sizeof(struct hlist_head),
+ dhash_entries,
+ 13,
+ 0,
+ &d_hash_shift,
+ &d_hash_mask);

+ for (loop = 0; loop < (1 << d_hash_shift); loop++)
+ INIT_HLIST_HEAD(&dentry_hashtable[loop]);
+}
+
+static void __init dcache_init(unsigned long mempages)
+{
/*
* A constructor could be added for stable state like the lists,
* but it is probably not worth it because of the cache nature
@@ -1580,45 +1593,6 @@ static void __init dcache_init(unsigned
NULL, NULL);

set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
-
- if (!dhash_entries)
- dhash_entries = PAGE_SHIFT < 13 ?
- mempages >> (13 - PAGE_SHIFT) :
- mempages << (PAGE_SHIFT - 13);
-
- dhash_entries *= sizeof(struct hlist_head);
- for (order = 0; ((1UL << order) << PAGE_SHIFT) < dhash_entries; order++)
- ;
-
- do {
- unsigned long tmp;
-
- nr_hash = (1UL << order) * PAGE_SIZE /
- sizeof(struct hlist_head);
- d_hash_mask = (nr_hash - 1);
-
- tmp = nr_hash;
- d_hash_shift = 0;
- while ((tmp >>= 1UL) != 0UL)
- d_hash_shift++;
-
- dentry_hashtable = (struct hlist_head *)
- __get_free_pages(GFP_ATOMIC, order);
- } while (dentry_hashtable == NULL && --order >= 0);
-
- printk(KERN_INFO "Dentry cache hash table entries: %d (order: %ld, %ld bytes)\n",
- nr_hash, order, (PAGE_SIZE << order));
-
- if (!dentry_hashtable)
- panic("Failed to allocate dcache hash table\n");
-
- d = dentry_hashtable;
- i = nr_hash;
- do {
- INIT_HLIST_HEAD(d);
- d++;
- i--;
- } while (i);
}

/* SLAB cache for __getname() consumers */
@@ -1632,6 +1606,12 @@ EXPORT_SYMBOL(d_genocide);
extern void bdev_cache_init(void);
extern void chrdev_init(void);

+void __init vfs_caches_init_early(void)
+{
+ dcache_init_early();
+ inode_init_early();
+}
+
void __init vfs_caches_init(unsigned long mempages)
{
unsigned long reserve;
diff -uNrp linux-2.6.7-rc3/fs/inode.c linux-2.6.7-rc3-hash/fs/inode.c
--- linux-2.6.7-rc3/fs/inode.c 2004-06-11 09:36:27.000000000 +0100
+++ linux-2.6.7-rc3-hash/fs/inode.c 2004-06-14 10:53:16.278082113 +0100
@@ -20,6 +20,7 @@
#include <linux/security.h>
#include <linux/pagemap.h>
#include <linux/cdev.h>
+#include <linux/bootmem.h>

/*
* This is needed for the following functions:
@@ -1345,55 +1346,30 @@ __setup("ihash_entries=", set_ihash_entr
/*
* Initialize the waitqueues and inode hash table.
*/
+void __init inode_init_early(void)
+{
+ int loop;
+
+ inode_hashtable =
+ alloc_large_system_hash("Inode-cache",
+ sizeof(struct hlist_head),
+ ihash_entries,
+ 14,
+ 0,
+ &i_hash_shift,
+ &i_hash_mask);
+
+ for (loop = 0; loop < (1 << i_hash_shift); loop++)
+ INIT_HLIST_HEAD(&inode_hashtable[loop]);
+}
+
void __init inode_init(unsigned long mempages)
{
- struct hlist_head *head;
- unsigned long order;
- unsigned int nr_hash;
int i;

for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
init_waitqueue_head(&i_wait_queue_heads[i].wqh);

- if (!ihash_entries)
- ihash_entries = PAGE_SHIFT < 14 ?
- mempages >> (14 - PAGE_SHIFT) :
- mempages << (PAGE_SHIFT - 14);
-
- ihash_entries *= sizeof(struct hlist_head);
- for (order = 0; ((1UL << order) << PAGE_SHIFT) < ihash_entries; order++)
- ;
-
- do {
- unsigned long tmp;
-
- nr_hash = (1UL << order) * PAGE_SIZE /
- sizeof(struct hlist_head);
- i_hash_mask = (nr_hash - 1);
-
- tmp = nr_hash;
- i_hash_shift = 0;
- while ((tmp >>= 1UL) != 0UL)
- i_hash_shift++;
-
- inode_hashtable = (struct hlist_head *)
- __get_free_pages(GFP_ATOMIC, order);
- } while (inode_hashtable == NULL && --order >= 0);
-
- printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
- nr_hash, order, (PAGE_SIZE << order));
-
- if (!inode_hashtable)
- panic("Failed to allocate inode hash table\n");
-
- head = inode_hashtable;
- i = nr_hash;
- do {
- INIT_HLIST_HEAD(head);
- head++;
- i--;
- } while (i);
-
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, init_once,
diff -uNrp linux-2.6.7-rc3/include/linux/bootmem.h linux-2.6.7-rc3-hash/include/linux/bootmem.h
--- linux-2.6.7-rc3/include/linux/bootmem.h 2004-06-11 09:25:03.000000000 +0100
+++ linux-2.6.7-rc3-hash/include/linux/bootmem.h 2004-06-11 10:38:23.000000000 +0100
@@ -67,4 +67,12 @@ extern void * __init __alloc_bootmem_nod
__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */

+extern void *__init alloc_large_system_hash(const char *tablename,
+ unsigned long bucketsize,
+ unsigned long numentries,
+ int scale,
+ int consider_highmem,
+ unsigned int *_hash_shift,
+ unsigned int *_hash_mask);
+
#endif /* _LINUX_BOOTMEM_H */
diff -uNrp linux-2.6.7-rc3/include/linux/fs.h linux-2.6.7-rc3-hash/include/linux/fs.h
--- linux-2.6.7-rc3/include/linux/fs.h 2004-06-11 09:36:36.000000000 +0100
+++ linux-2.6.7-rc3-hash/include/linux/fs.h 2004-06-14 10:50:52.147156861 +0100
@@ -214,15 +214,17 @@ extern int leases_enable, dir_notify_ena
#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/audit.h>
+#include <linux/init.h>
#include <asm/semaphore.h>
#include <asm/byteorder.h>

/* Used to be a macro which just called the function, now just a function */
extern void update_atime (struct inode *);

-extern void inode_init(unsigned long);
-extern void mnt_init(unsigned long);
-extern void files_init(unsigned long);
+extern void __init inode_init(unsigned long);
+extern void __init inode_init_early(void);
+extern void __init mnt_init(unsigned long);
+extern void __init files_init(unsigned long);

struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
@@ -1199,7 +1201,8 @@ extern int filp_close(struct file *, fl_
extern char * getname(const char __user *);

/* fs/dcache.c */
-extern void vfs_caches_init(unsigned long);
+extern void __init vfs_caches_init_early(void);
+extern void __init vfs_caches_init(unsigned long);

#define __getname() kmem_cache_alloc(names_cachep, SLAB_KERNEL)
#define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
diff -uNrp linux-2.6.7-rc3/include/linux/kernel.h linux-2.6.7-rc3-hash/include/linux/kernel.h
--- linux-2.6.7-rc3/include/linux/kernel.h 2004-06-11 09:36:36.000000000 +0100
+++ linux-2.6.7-rc3-hash/include/linux/kernel.h 2004-06-14 10:48:45.130799964 +0100
@@ -92,6 +92,15 @@ asmlinkage int printk(const char * fmt,

unsigned long int_sqrt(unsigned long);

+static inline int __attribute_pure__ long_log2(unsigned long x)
+{
+ int r = 0;
+ for (x >>= 1; x > 0; x >>= 1)
+ r++;
+ return r;
+}
+
+
extern int printk_ratelimit(void);
extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst);

diff -uNrp linux-2.6.7-rc3/include/linux/mmzone.h linux-2.6.7-rc3-hash/include/linux/mmzone.h
--- linux-2.6.7-rc3/include/linux/mmzone.h 2004-06-11 09:36:36.000000000 +0100
+++ linux-2.6.7-rc3-hash/include/linux/mmzone.h 2004-06-11 09:44:41.000000000 +0100
@@ -20,6 +20,18 @@
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif

+/*
+ * system hash table size limits
+ * - on large memory machines, we may want to allocate a bigger hash than that
+ * permitted by MAX_ORDER, so we allocate with the bootmem allocator, and are
+ * limited to this size
+ */
+#if MAX_ORDER > 14
+#define MAX_SYS_HASH_TABLE_ORDER MAX_ORDER
+#else
+#define MAX_SYS_HASH_TABLE_ORDER 14
+#endif
+
struct free_area {
struct list_head free_list;
unsigned long *map;
diff -uNrp linux-2.6.7-rc3/init/main.c linux-2.6.7-rc3-hash/init/main.c
--- linux-2.6.7-rc3/init/main.c 2004-06-11 09:36:36.000000000 +0100
+++ linux-2.6.7-rc3-hash/init/main.c 2004-06-11 10:48:37.000000000 +0100
@@ -454,6 +454,7 @@ asmlinkage void __init start_kernel(void
initrd_start = 0;
}
#endif
+ vfs_caches_init_early();
mem_init();
kmem_cache_init();
if (late_time_init)
diff -uNrp linux-2.6.7-rc3/mm/page_alloc.c linux-2.6.7-rc3-hash/mm/page_alloc.c
--- linux-2.6.7-rc3/mm/page_alloc.c 2004-06-11 09:36:36.000000000 +0100
+++ linux-2.6.7-rc3-hash/mm/page_alloc.c 2004-06-14 10:48:58.902645878 +0100
@@ -55,6 +55,9 @@ EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;

+static unsigned long __initdata nr_kernel_pages;
+static unsigned long __initdata nr_all_pages;
+
/*
* Temporary debugging check for pages not lying within a given zone.
*/
@@ -1454,6 +1457,10 @@ static void __init free_area_init_core(s
if (zholes_size)
realsize -= zholes_size[j];

+ if (j == ZONE_DMA || j == ZONE_NORMAL)
+ nr_kernel_pages += realsize;
+ nr_all_pages += realsize;
+
zone->spanned_pages = size;
zone->present_pages = realsize;
zone->name = zone_names[j];
@@ -1994,3 +2001,69 @@ int lower_zone_protection_sysctl_handler
setup_per_zone_protection();
return 0;
}
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ * quantity of entries
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+ unsigned long bucketsize,
+ unsigned long numentries,
+ int scale,
+ int consider_highmem,
+ unsigned int *_hash_shift,
+ unsigned int *_hash_mask)
+{
+ unsigned long mem, max, log2qty, size;
+ void *table;
+
+ /* round applicable memory size up to nearest megabyte */
+ mem = consider_highmem ? nr_all_pages : nr_kernel_pages;
+ mem += (1UL << (20 - PAGE_SHIFT)) - 1;
+ mem >>= 20 - PAGE_SHIFT;
+ mem <<= 20 - PAGE_SHIFT;
+
+ /* limit to 1 bucket per 2^scale bytes of low memory (rounded up to
+ * nearest power of 2 in size) */
+ if (scale > PAGE_SHIFT)
+ mem >>= (scale - PAGE_SHIFT);
+ else
+ mem <<= (PAGE_SHIFT - scale);
+
+ mem = 1UL << (long_log2(mem) + 1);
+
+ /* limit allocation size */
+ max = (1UL << (PAGE_SHIFT + MAX_SYS_HASH_TABLE_ORDER)) / bucketsize;
+ if (max > mem)
+ max = mem;
+
+ /* allow the kernel cmdline to have a say */
+ if (!numentries || numentries > max)
+ numentries = max;
+
+ log2qty = long_log2(numentries);
+
+ do {
+ size = bucketsize << log2qty;
+
+ table = (void *) alloc_bootmem(size);
+
+ } while (!table && size > PAGE_SIZE);
+
+ if (!table)
+ panic("Failed to allocate %s hash table\n", tablename);
+
+ printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+ tablename,
+ (1U << log2qty),
+ long_log2(size) - PAGE_SHIFT,
+ size);
+
+ if (_hash_shift)
+ *_hash_shift = log2qty;
+ if (_hash_mask)
+ *_hash_mask = (1 << log2qty) - 1;
+
+ return table;
+}