select() support for NR_OPEN>256

Andi Kleen (andi@mlm.extern.lrz-muenchen.de)
Wed, 05 Feb 1997 14:00:13 +0100


Hi,

The appended patch adds support in sys_select() for using more than
256 fds per process. With this, the user can safely compile the kernel
with NR_OPEN up to 1024.

The default NR_OPEN of 256 wastes memory with
the current power-of-two-based kmalloc(), so I changed it to 383
for 32-bit machines (2K files_struct) and 479 for 64-bit machines
(4K files_struct). When a slab cache is used for files_struct, the
values could change again.

I was also looking into making files_struct completely dynamic
to get rid of the fixed NR_OPEN constant. The idea is to replace all these
ugly

if (fd < NR_OPEN && (file = current->files->fd[fd]))

that are scattered all over the kernel source with calls
to files_from_fd(). This function makes sure that current->files is treated
as volatile, so it should be safe to reallocate files_struct when
the process runs out of fds. This would also remove the reference
to NR_OPEN.

AFAICS only fs/exec.c:flush_old_files() has to
be changed to make sure that current->files isn't held in a register
when the process sleeps. The SMP people have to add a writer lock
anyway so it shouldn't affect them. I would add a sysctl to tune
the number of fds that are allocated at fork() time. The maximum
number of fds per process is limited by rlimits.

If you agree to this scheme, I'll implement it.

Comments?

-Andi

--- ./fs/select.c Sun Jan 26 20:09:42 1997
+++ ./fs/select.c Sun Jan 26 20:18:34 1997
@@ -8,6 +8,10 @@
* COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
* flag set in its personality we do *not* modify the given timeout
* parameter to reflect time remaining.
+ * 26 January 1997
+ * Removed all fd_set size limits from sys_select().
+ * Removed all verify_area() calls.
+ * - Andi Kleen <andi@mlm.extern.lrz-muenchen.de>
*/

#include <linux/types.h>
@@ -24,6 +28,7 @@
#include <linux/malloc.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
+#include <linux/malloc.h>

#include <asm/uaccess.h>
#include <asm/system.h>
@@ -32,6 +37,9 @@
#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)

+#define CUR_NR_OPEN NR_OPEN
+#define SMALL_NR_OPEN 256
+
/*
* Ok, Peter made a complicated, but straightforward multiple_wait() function.
* I have rewritten this, taking some shortcuts: This code may not be easy to
@@ -61,30 +69,21 @@
}
}

-/*
- * Due to kernel stack usage, we use a _limited_ fd_set type here, and once
- * we really start supporting >256 file descriptors we'll probably have to
- * allocate the kernel fd_set copies dynamically.. (The kernel select routines
- * are careful to touch only the defined low bits of any fd_set pointer, this
- * is important for performance too).
- */
-typedef unsigned long limited_fd_set[NR_OPEN/(8*(sizeof(unsigned long)))];
+typedef unsigned long * fds_ptr;
+
+/* assumes sizeof(type) is a power of 2 */
+#define roundbit(n, type) (((n) + sizeof(type)*8 - 1) & ~(sizeof(type)*8-1))

-typedef struct {
- limited_fd_set in, out, ex;
- limited_fd_set res_in, res_out, res_ex;
-} fd_set_buffer;
-
-#define __IN(in) (in)
-#define __OUT(in) (in + sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __EX(in) (in + 2*sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __RES_IN(in) (in + 3*sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __RES_OUT(in) (in + 4*sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __RES_EX(in) (in + 5*sizeof(limited_fd_set)/sizeof(unsigned long))
+#define __IN(fds,sz) (fds)
+#define __OUT(fds,sz) ((unsigned long *) ((char *)(fds) + sz))
+#define __EX(fds,sz) ((unsigned long *) ((char *)(fds) + 2*sz))
+#define __RES_IN(fds,sz) ((unsigned long *) ((char *)(fds) + 3*sz))
+#define __RES_OUT(fds,sz) ((unsigned long *) ((char *)(fds) + 4*sz))
+#define __RES_EX(fds,sz) ((unsigned long *) ((char *)(fds) + 5*sz))

-#define BITS(in) (*__IN(in)|*__OUT(in)|*__EX(in))
+#define BITS(fds,sz) (*__IN(fds,sz)|*__OUT(fds,sz)|*__EX(fds,sz))

-static int max_select_fd(unsigned long n, fd_set_buffer *fds)
+static int max_select_fd(int sz,unsigned long n,fds_ptr fds)
{
unsigned long *open_fds, *in;
unsigned long set;
@@ -94,10 +93,10 @@
set = ~(~0UL << (n & (__NFDBITS-1)));
n /= __NFDBITS;
open_fds = current->files->open_fds.fds_bits+n;
- in = fds->in+n;
+ in = fds+n;
max = 0;
if (set) {
- set &= BITS(in);
+ set &= BITS(in, sz);
if (set) {
if (!(set & ~*open_fds))
goto get_max;
@@ -108,7 +107,7 @@
in--;
open_fds--;
n--;
- set = BITS(in);
+ set = BITS(in,sz);
if (!set)
continue;
if (set & ~*open_fds)
@@ -135,14 +134,14 @@
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

-static int do_select(int n, fd_set_buffer *fds)
+static int do_select(int sz, int n, fds_ptr fds)
{
int retval;
poll_table wait_table, *wait;
struct poll_table_entry *entry;
int i;

- retval = max_select_fd(n, fds);
+ retval = max_select_fd(sz, n, fds);
if (retval < 0)
goto out;
n = retval;
@@ -159,9 +158,9 @@
current->state = TASK_INTERRUPTIBLE;
for (i = 0 ; i < n ; i++,fd++) {
unsigned long bit = BIT(i);
- unsigned long *in = MEM(i,fds->in);
+ unsigned long *in = MEM(i,fds);

- if (bit & BITS(in)) {
+ if (bit & BITS(in, sz)) {
struct file * file = *fd;
unsigned int mask = POLLNVAL;
if (file) {
@@ -169,18 +168,18 @@
if (file->f_op && file->f_op->poll)
mask = file->f_op->poll(file, wait);
}
- if ((mask & POLLIN_SET) && ISSET(bit, __IN(in))) {
- SET(bit, __RES_IN(in));
+ if ((mask & POLLIN_SET) && ISSET(bit, __IN(in,sz))) {
+ SET(bit, __RES_IN(in,sz));
retval++;
wait = NULL;
}
- if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(in))) {
- SET(bit, __RES_OUT(in));
+ if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(in,sz))) {
+ SET(bit, __RES_OUT(in,sz));
retval++;
wait = NULL;
}
- if ((mask & POLLEX_SET) && ISSET(bit, __EX(in))) {
- SET(bit, __RES_EX(in));
+ if ((mask & POLLEX_SET) && ISSET(bit, __EX(in,sz))) {
+ SET(bit, __RES_EX(in,sz));
retval++;
wait = NULL;
}
@@ -199,9 +198,6 @@
}

/*
- * We do a VERIFY_WRITE here even though we are only reading this time:
- * we'll write to it eventually..
- *
* Use "int" accesses to let user-mode fd_set's be int-aligned.
*/
static int __get_fd_set(unsigned long nr, int * fs_pointer, int * fdset)
@@ -209,16 +205,14 @@
/* round up nr to nearest "int" */
nr = (nr + 8*sizeof(int)-1) / (8*sizeof(int));
if (fs_pointer) {
- int error = verify_area(VERIFY_WRITE,fs_pointer,nr*sizeof(int));
- if (!error) {
- while (nr) {
- get_user(*fdset, fs_pointer);
- nr--;
- fs_pointer++;
- fdset++;
- }
+ while (nr) {
+ if (get_user(*fdset, fs_pointer))
+ return -EFAULT;
+ nr--;
+ fs_pointer++;
+ fdset++;
}
- return error;
+ return 0;
}
while (nr) {
*fdset = 0;
@@ -228,16 +222,20 @@
return 0;
}

-static void __set_fd_set(long nr, int * fs_pointer, int * fdset)
+static int __set_fd_set(long nr, int * fs_pointer, int * fdset)
{
+ int err = 0;
if (!fs_pointer)
- return;
+ return 0;
while (nr >= 0) {
- put_user(*fdset, fs_pointer);
+ err = __put_user(*fdset, fs_pointer);
+ if (err)
+ break;
nr -= 8 * sizeof(int);
fdset++;
fs_pointer++;
}
+ return err;
}

/* We can do long accesses here, kernel fdsets are always long-aligned */
@@ -277,63 +275,83 @@
asmlinkage int sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
{
int error = -EINVAL;
- fd_set_buffer fds;
+ fds_ptr fds;
unsigned long timeout;
+ unsigned int sz;

lock_kernel();
if (n < 0)
- goto out;
- if (n > NR_OPEN)
- n = NR_OPEN;
- if ((error = get_fd_set(n, inp, &fds.in)) ||
- (error = get_fd_set(n, outp, &fds.out)) ||
- (error = get_fd_set(n, exp, &fds.ex))) goto out;
+ goto finalout;
+ if (n > CUR_NR_OPEN)
+ n = CUR_NR_OPEN; /* why not return -EINVAL here? */
+
+ sz = roundbit(n, unsigned long)/8;
+ if (n <= SMALL_NR_OPEN)
+ fds = __builtin_alloca(6*sz);
+ else {
+ fds = kmalloc(6*sz, GFP_KERNEL);
+ if (!fds)
+ goto finalout;
+ }
+
+ if ((error = get_fd_set(n, inp, __IN(fds,sz))) ||
+ (error = get_fd_set(n, outp, __OUT(fds,sz))) ||
+ (error = get_fd_set(n, exp, __EX(fds,sz)))) goto out;
+
timeout = ~0UL;
if (tvp) {
- error = verify_area(VERIFY_WRITE, tvp, sizeof(*tvp));
- if (error)
- goto out;
- get_user(timeout, &tvp->tv_usec);
+ unsigned long tmp;
+
+ if ((error = get_user(timeout, &tvp->tv_usec)) ||
+ (error = get_user(tmp, &tvp->tv_sec)))
+ goto out;
timeout = ROUND_UP(timeout,(1000000/HZ));
- {
- unsigned long tmp;
- get_user(tmp, &tvp->tv_sec);
- timeout += tmp * (unsigned long) HZ;
- }
+ timeout += tmp * (unsigned long) HZ;
+ /* XXX Note that timeout might still be too
+ small because of rounding errors to 1000000/HZ.
+ I'm not sure about the right fix though. */
if (timeout)
timeout += jiffies + 1;
}
- zero_fd_set(n, &fds.res_in);
- zero_fd_set(n, &fds.res_out);
- zero_fd_set(n, &fds.res_ex);
+ zero_fd_set(n, __RES_IN(fds,sz));
+ zero_fd_set(n, __RES_OUT(fds,sz));
+ zero_fd_set(n, __RES_EX(fds,sz));
current->timeout = timeout;
- error = do_select(n, &fds);
+ error = do_select(sz, n, fds);
timeout = current->timeout - jiffies - 1;
current->timeout = 0;
if ((long) timeout < 0)
timeout = 0;
if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
- put_user(timeout/HZ, &tvp->tv_sec);
- timeout %= HZ;
- timeout *= (1000000/HZ);
- put_user(timeout, &tvp->tv_usec);
+ if (__put_user(timeout/HZ, &tvp->tv_sec) ||
+ __put_user((timeout%HZ)*(1000000/HZ), &tvp->tv_usec)) {
+ error = -EFAULT;
+ goto out;
+ }
}
- if (error < 0)
- goto out;
+ if (error < 0)
+ goto out;
if (!error) {
error = -ERESTARTNOHAND;
if (current->signal & ~current->blocked)
goto out;
error = 0;
}
- set_fd_set(n, inp, &fds.res_in);
- set_fd_set(n, outp, &fds.res_out);
- set_fd_set(n, exp, &fds.res_ex);
+
+ if (set_fd_set(n, inp, __RES_IN(fds,sz)) ||
+ set_fd_set(n, outp, __RES_OUT(fds,sz)) ||
+ set_fd_set(n, exp, __RES_EX(fds,sz))) {
+ error = -EFAULT;
+ }
out:
- unlock_kernel();
+ if (n > SMALL_NR_OPEN)
+ kfree(fds);
+finalout:
+ unlock_kernel();
return error;
}

+
static int do_poll(unsigned int nfds, struct pollfd *fds, poll_table *wait)
{
int count;
@@ -376,13 +394,13 @@

asmlinkage int sys_poll(struct pollfd * ufds, unsigned int nfds, int timeout)
{
- int i, count, fdcount, err = -EINVAL;
+ int i, count, fdcount, err = -EINVAL;
struct pollfd * fds, *fds1;
poll_table wait_table;
struct poll_table_entry *entry;

lock_kernel();
- if (nfds > NR_OPEN)
+ if (nfds > CUR_NR_OPEN)
goto out;

err = -ENOMEM;
--- ./include/linux/limits.h Sat Nov 9 18:32:22 1996
+++ ./include/linux/limits.h Sun Jan 26 20:30:51 1997
@@ -1,12 +1,12 @@
#ifndef _LINUX_LIMITS_H
#define _LINUX_LIMITS_H

-#define NR_OPEN 256
+#define NR_OPEN 959

#define NGROUPS_MAX 32 /* supplemental group IDs are available */
#define ARG_MAX 131072 /* # bytes of args + environ for exec() */
#define CHILD_MAX 999 /* no limit :-) */
-#define OPEN_MAX 256 /* # open files a process may have */
+#define OPEN_MAX 959 /* # open files a process may have */
#define LINK_MAX 127 /* # links a file may have */
#define MAX_CANON 255 /* size of the canonical input queue */
#define MAX_INPUT 255 /* size of the type-ahead buffer */
--- ./include/linux/fs.h Sun Jan 26 20:09:47 1997
+++ ./include/linux/fs.h Sun Jan 26 20:50:27 1997
@@ -25,9 +25,30 @@
* recompiled to take full advantage of the new limits..
*/

+/* Good values for NR_OPEN with the current kmalloc() (that always
+ rounds the size to the next power of 2). The header of struct
+ files_struct is 2*sizeof(fd_set)+sizeof(int).
+
+ allocated size 2K 4K 8K 16K 32K
+ -------------------------------------------------------
+ 32bit architecture
+ NR_OPEN 383 959 1919 3839 7935
+ fd_set size 128 128 256 512 512
+ -------------------------------------------------------
+ 64bit architecture
+ NR_OPEN 223 479 991 1983 3967
+ fd_set size 128 128 128 256 512
+
+ When we use slab_cache_alloc() to allocate struct files_struct
+ at fork time the optimal NR_OPEN value is different I think.
+
+ Note that when you change the fd_set size you have to change
+ posix_limits.h, the libc include files and recompile the applications.
+ */
+
/* Fixed constants first: */
#undef NR_OPEN
-#define NR_OPEN 256
+#define NR_OPEN 959

#define NR_SUPER 64
#define BLOCK_SIZE 1024
--- ./fs/select.c-o Sun Feb 2 12:14:50 1997
+++ ./fs/select.c Sun Feb 2 12:17:35 1997
@@ -37,13 +37,13 @@
#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)

-/*
- * It must be kept true that NR_OPEN > SMALL_NR_OPEN or the
- * select code springs a leak.
- */
-
#define CUR_NR_OPEN NR_OPEN
-#define SMALL_NR_OPEN 64
+
+/* This needs 192 bytes on the kernel stack. This should be acceptable.
+ Most programs currently are compiled with NR_OPEN=256 so we mostly
+ use the fast path.
+ On an Alpha with an 8K kernel stack this value could be increased. */
+#define SMALL_NR_OPEN 256

/*
* Ok, Peter made a complicated, but straightforward multiple_wait() function.
--- ./include/linux/fs.h-o Sun Feb 2 12:17:57 1997
+++ ./include/linux/fs.h Sun Feb 2 12:22:32 1997
@@ -48,7 +48,13 @@

/* Fixed constants first: */
#undef NR_OPEN
-#define NR_OPEN 768
+
+/* move this to asm/ ? */
+#if defined(__alpha__) || defined(__sparc64__)
+#define NR_OPEN 479 /* 64bit architecture. Uses 4K for files_struct */
+#else
+#define NR_OPEN 383 /* 32bit architecture. Uses 2K for files_struct */
+#endif

#define NR_SUPER 64
#define BLOCK_SIZE 1024
--- ./include/linux/limits.h-o Sun Feb 2 12:22:42 1997
+++ ./include/linux/limits.h Sun Feb 2 12:24:26 1997
@@ -1,17 +1,18 @@
#ifndef _LINUX_LIMITS_H
#define _LINUX_LIMITS_H

-/*
- * NR_OPEN must always be higher than SMALL_NR_OPEN defined in
- * linux/fs/select.c or we will spring a memory leak
- */
-
-#define NR_OPEN 768
+/* move this to asm/ ? */
+#if defined(__alpha__) || defined(__sparc64__)
+#define NR_OPEN 479
+#define OPEN_MAX 479
+#else
+#define NR_OPEN 383
+#define OPEN_MAX 383
+#endif

#define NGROUPS_MAX 32 /* supplemental group IDs are available */
#define ARG_MAX 131072 /* # bytes of args + environ for exec() */
#define CHILD_MAX 999 /* no limit :-) */
-#define OPEN_MAX 768 /* # open files a process may have */
#define LINK_MAX 127 /* # links a file may have */
#define MAX_CANON 255 /* size of the canonical input queue */
#define MAX_INPUT 255 /* size of the type-ahead buffer */
--- include/linux/posix_types.h-o Sun Feb 2 12:52:37 1997
+++ include/linux/posix_types.h Sun Feb 2 12:47:24 1997
@@ -12,11 +12,9 @@
#endif

/*
- * This allows for 1024 file descriptors: if NR_OPEN is ever grown
- * beyond that you'll have to change this too. But 1024 fd's seem to be
- * enough even for such "real" unices like OSF/1, so hopefully this is
- * one limit that doesn't have to be changed [again].
- *
+ * If you want to use more than 1024 file descriptors with select()
+ * recompile your program with -D__USER_FD_SETSIZE=fds_i_want
+ *
* Note that POSIX wants the FD_CLEAR(fd,fdsetp) defines to be in
* <sys/time.h> (and thus <linux/time.h>) - but this is a more logical
* place for them. Solved by having dummy defines in <sys/time.h>.
@@ -29,8 +27,16 @@
#undef __NFDBITS
#define __NFDBITS (8 * sizeof(unsigned long))

+/* For compatibility with other systems (BSD4.4, OSF/1) the libc
+ might do a #define __USER_FD_SETSIZE FD_SETSIZE. We can't do this
+ here because we have to keep the namespace clean.
+ glibc doesn't use this kernel include so glibc has to be changed. */
#undef __FD_SETSIZE
+#ifdef __USER_FD_SETSIZE
+#define __FD_SETSIZE __USER_FD_SETSIZE
+#else
#define __FD_SETSIZE 1024
+#endif

#undef __FDSET_LONGS
#define __FDSET_LONGS (__FD_SETSIZE/__NFDBITS)

--
|andi@mlm.extern.lrz-muenchen.de     Nonsense is better than no sense at all.
|                                        -NoMeansNo,0-1=2