Re: >256 fd patch...

Matthias Urlichs (smurf@noris.de)
21 Mar 1997 14:34:29 +0100


In linux.dev.kernel, article <199703210420.MAA15152@metal.iinet.net.au>,
"Michael O'Reilly" <michael@metal.iinet.net.au> writes:
>
> In message <9703201627.AA20113@dcl.MIT.EDU>, "Theodore Y. Ts'o" writes:
> > Yes, but how many of the selects were using more than 32 file
> > descriptors? If most of them are using less than 32, then using the
> > statck for those cases *would* be a major memory win.
>
> Ahh yes. It would be a major memory win (very few of them were using
> more than 32 fd's), but from my point of view it would complicate the
> patch, and slow it down. I belive it's more suited to be a kernel
> compile option, than a permenant place in the kernel.
>
That would _not_ slow the patch down. One single comparison is all you'd
have to add.

Here's my version of the fs/select.c part of your patch, relative to
your patch. Completely untested (so far -- my test system is broken at
the moment) but believed to be correct.

Note that I dropped the caching. kmalloc does its own caching (more or
less, for small blocks), and I didn't want all processes with more than
32/64 files in their select() call to require a two- or four-page buffer,
depending on how high you set the absolute maximum.

I also use an "unsigned long long" for the fd_set; that prevents a kmalloc
for up to 64 bytes, which should be enough most of the time.

Since the buffer size depends on what the process is doing, it might make
sense to store a pointer to the kmalloced buffer in the process table.
I'll probably look into this over the weekend.

Index: NEWselect.c
===================================================================
RCS file: /usr/src/cvs/kernel/linux/fs/select.c,v
retrieving revision 1.1.1.3.2.3
diff -u -r1.1.1.3.2.3 select.c
--- OLDselect.c 1997/03/20 12:27:40 1.1.1.3.2.3
+++ NEWselect.c 1997/03/21 13:16:03
@@ -206,20 +206,16 @@
}

/*
- * Due to kernel stack usage, we use a _limited_ fd_set type here, and once
- * we really start supporting >256 file descriptors we'll probably have to
- * allocate the kernel fd_set copies dynamically.. (The kernel select routines
- * are careful to touch only the defined low bits of any fd_set pointer, this
- * is important for performance too).
+ * Due to kernel stack usage, we now allocate the kernel fd_set copies
+ * dynamically.. (The kernel select routines are careful to touch only the
+ * defined low bits of any fd_set pointer, this is important for
+ * performance too).
*
* Note a few subtleties: we use "long" for the dummy, not int, and we do a
* subtract by 1 on the nr of file descriptors. The former is better for
* machines with long > int, and the latter allows us to test the bit count
* against "zero or positive", which can mostly be just a sign bit test..
*/
-typedef struct {
- unsigned long dummy[NR_OPEN/(8*(sizeof(unsigned long)))];
-} limited_fd_set;

#define get_fd_set(nr,fsp,fdp) \
__get_fd_set(nr, (int *) (fsp), (int *) (fdp))
@@ -241,16 +237,14 @@

#define roundbit(n, type) (((n) + sizeof(type)*8 - 1) & ~(sizeof(type)*8-1))

-static unsigned long * save_fds[10] = {NULL, };
-static int fds_index = 0;
-
asmlinkage int sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
{
- unsigned long * fds = 0;
+ unsigned long * fds = NULL;
+ unsigned long long fd1,fd2,fd3,fd4,fd5,fd6;
int error;
- limited_fd_set *res_in, *in;
- limited_fd_set *res_out, *out;
- limited_fd_set *res_ex, *ex;
+ fd_set *res_in, *in;
+ fd_set *res_out, *out;
+ fd_set *res_ex, *ex;
unsigned long timeout;
int size;

@@ -260,25 +254,28 @@
if (n > NR_OPEN)
n = NR_OPEN;

- size = roundbit(NR_OPEN, unsigned long) / 8;
- if (save_fds[fds_index]) {
- fds = save_fds[fds_index];
- save_fds[fds_index] = NULL;
- if (fds_index > 0)
- --fds_index;
+ size = roundbit(n, unsigned long) / 8;
+ if(size <= sizeof(fd1)) {
+ in = (fd_set *) fd1;
+ out = (fd_set *) fd2;
+ ex = (fd_set *) fd3;
+ res_in = (fd_set *) fd4;
+ res_out = (fd_set *) fd5;
+ res_ex = (fd_set *) fd6;
} else {
fds = kmalloc(6 * size, GFP_KERNEL);
+
+ if (!fds) {
+ error = -ENOMEM;
+ goto out;
+ }
+ in = (fd_set *) fds;
+ out = (fd_set *) (((char*)fds) + size);
+ ex = (fd_set *) (((char*)fds) + size*2);
+ res_in = (fd_set *) (((char*)fds) + size*3);
+ res_out = (fd_set *) (((char*)fds) + size*4);
+ res_ex = (fd_set *) (((char*)fds) + size*5);
}
- if (!fds) {
- error = -ENOMEM;
- goto out;
- }
- in = (limited_fd_set *) fds;
- out = (limited_fd_set *) (((char*)fds) + size);
- ex = (limited_fd_set *) (((char*)fds) + size*2);
- res_in = (limited_fd_set *) (((char*)fds) + size*3);
- res_out = (limited_fd_set *) (((char*)fds) + size*4);
- res_ex = (limited_fd_set *) (((char*)fds) + size*5);

if ((error = get_fd_set(n, inp, in)) ||
(error = get_fd_set(n, outp, out)) ||
@@ -297,13 +294,7 @@
zero_fd_set(n, res_out);
zero_fd_set(n, res_ex);
current->timeout = timeout;
- error = do_select(n,
- (fd_set *) in,
- (fd_set *) out,
- (fd_set *) ex,
- (fd_set *) res_in,
- (fd_set *) res_out,
- (fd_set *) res_ex);
+ error = do_select(n, in, out, ex, res_in, res_out, res_ex);
timeout = current->timeout - jiffies - 1;
current->timeout = 0;
if ((long) timeout < 0)
@@ -326,14 +317,7 @@
set_fd_set(n, outp, res_out);
set_fd_set(n, exp, res_ex);
out:
- if (fds) {
- if (fds_index < 8) {
- if (save_fds[fds_index])
- ++fds_index;
- save_fds[fds_index] = fds;
- } else {
- kfree(fds);
- }
- }
+ if (fds)
+ kfree(fds);
return error;
}

Here's the same, relative to 2.0.29-ISS.

Index: fs/select.c
===================================================================
RCS file: /usr/src/cvs/kernel/linux/fs/select.c,v
retrieving revision 1.1.1.3
retrieving revision 1.1.1.3.2.5
diff -u -r1.1.1.3 -r1.1.1.3.2.5
--- select.c 1997/01/16 07:21:46 1.1.1.3
+++ select.c 1997/03/20 23:01:50 1.1.1.3.2.5
@@ -21,6 +21,7 @@
#include <linux/errno.h>
#include <linux/personality.h>
#include <linux/mm.h>
+#include <linux/malloc.h>

#include <asm/segment.h>
#include <asm/system.h>
@@ -205,20 +206,16 @@
}

/*
- * Due to kernel stack usage, we use a _limited_ fd_set type here, and once
- * we really start supporting >256 file descriptors we'll probably have to
- * allocate the kernel fd_set copies dynamically.. (The kernel select routines
- * are careful to touch only the defined low bits of any fd_set pointer, this
- * is important for performance too).
+ * Due to kernel stack usage, we now allocate the kernel fd_set copies
+ * dynamically. (The kernel select routines are careful to touch only the
+ * defined low bits of any fd_set pointer, this is important for
+ * performance too).
*
* Note a few subtleties: we use "long" for the dummy, not int, and we do a
* subtract by 1 on the nr of file descriptors. The former is better for
* machines with long > int, and the latter allows us to test the bit count
* against "zero or positive", which can mostly be just a sign bit test..
*/
-typedef struct {
- unsigned long dummy[NR_OPEN/(8*(sizeof(unsigned long)))];
-} limited_fd_set;

#define get_fd_set(nr,fsp,fdp) \
__get_fd_set(nr, (int *) (fsp), (int *) (fdp))
@@ -237,22 +234,52 @@
* Update: ERESTARTSYS breaks at least the xview clock binary, so
* I'm trying ERESTARTNOHAND which restart only when you want to.
*/
+
+#define roundbit(n, type) (((n) + sizeof(type)*8 - 1) & ~(sizeof(type)*8-1))
+
asmlinkage int sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
{
+ unsigned long * fds = NULL;
+ unsigned long long fd1,fd2,fd3,fd4,fd5,fd6;
int error;
- limited_fd_set res_in, in;
- limited_fd_set res_out, out;
- limited_fd_set res_ex, ex;
+ fd_set *res_in, *in;
+ fd_set *res_out, *out;
+ fd_set *res_ex, *ex;
unsigned long timeout;
+ int size;

error = -EINVAL;
if (n < 0)
goto out;
if (n > NR_OPEN)
n = NR_OPEN;
- if ((error = get_fd_set(n, inp, &in)) ||
- (error = get_fd_set(n, outp, &out)) ||
- (error = get_fd_set(n, exp, &ex))) goto out;
+
+ size = roundbit(n, unsigned long) / 8;
+ if(size <= sizeof(fd1)) {
+ in = (fd_set *) fd1;
+ out = (fd_set *) fd2;
+ ex = (fd_set *) fd3;
+ res_in = (fd_set *) fd4;
+ res_out = (fd_set *) fd5;
+ res_ex = (fd_set *) fd6;
+ } else {
+ fds = kmalloc(6 * size, GFP_KERNEL);
+
+ if (!fds) {
+ error = -ENOMEM;
+ goto out;
+ }
+ in = (fd_set *) fds;
+ out = (fd_set *) (((char*)fds) + size);
+ ex = (fd_set *) (((char*)fds) + size*2);
+ res_in = (fd_set *) (((char*)fds) + size*3);
+ res_out = (fd_set *) (((char*)fds) + size*4);
+ res_ex = (fd_set *) (((char*)fds) + size*5);
+ }
+
+ if ((error = get_fd_set(n, inp, in)) ||
+ (error = get_fd_set(n, outp, out)) ||
+ (error = get_fd_set(n, exp, ex))) goto out;
timeout = ~0UL;
if (tvp) {
error = verify_area(VERIFY_WRITE, tvp, sizeof(*tvp));
@@ -263,17 +294,11 @@
if (timeout)
timeout += jiffies + 1;
}
- zero_fd_set(n, &res_in);
- zero_fd_set(n, &res_out);
- zero_fd_set(n, &res_ex);
+ zero_fd_set(n, res_in);
+ zero_fd_set(n, res_out);
+ zero_fd_set(n, res_ex);
current->timeout = timeout;
- error = do_select(n,
- (fd_set *) &in,
- (fd_set *) &out,
- (fd_set *) &ex,
- (fd_set *) &res_in,
- (fd_set *) &res_out,
- (fd_set *) &res_ex);
+ error = do_select(n, in, out, ex, res_in, res_out, res_ex);
timeout = current->timeout - jiffies - 1;
current->timeout = 0;
if ((long) timeout < 0)
@@ -292,9 +317,11 @@
goto out;
error = 0;
}
- set_fd_set(n, inp, &res_in);
- set_fd_set(n, outp, &res_out);
- set_fd_set(n, exp, &res_ex);
+ set_fd_set(n, inp, res_in);
+ set_fd_set(n, outp, res_out);
+ set_fd_set(n, exp, res_ex);
out:
+ if (fds)
+ kfree(fds);
return error;
}

-- 
Matthias Urlichs         \  noris network GmbH  /  Xlink-POP Nürnberg 
Schleiermacherstraße 12   \   Linux+Internet   /   EMail: urlichs@noris.de
90491 Nürnberg (Germany)   \    Consulting+Programming+Networking+etc'ing
   PGP: 1024/4F578875   1B 89 E2 1C 43 EA 80 44  15 D2 29 CF C6 C7 E0 DE
       Click <A HREF="http://info.noris.de/~smurf/finger">here</A>.    42