Re: /proc/sys/kernel/pid_max issues

From: William Lee Irwin III
Date: Sun Sep 12 2004 - 12:16:46 EST


* William Lee Irwin III <wli@xxxxxxxxxxxxxx> wrote:
>> Forgot to check map->page in the first spin:
>> last_pid is not honored because next_free_map(map - 1, ...) may return
>> the same map and so restart with a lesser offset.

On Sun, Sep 12, 2004 at 01:20:26PM +0200, Ingo Molnar wrote:
> it's getting quite spaghetti ... do we really want to handle
> RESERVED_PID? There's no guarantee that any root daemon wont stray out
> of the 1...300 PID range anyway, so if it has an exploitable PID race
> bug then it's probably exploitable even without the RESERVED_PID
> protection.

I presumed it was merely cosmetic, so daemons around system startup
will get low pid numbers recognizable by sysadmins. Maybe filtering
process listings for pids < 300 is/was used to find daemons that may
have crashed? I'm not particularly attached to the feature, and have
never used it myself, but merely noticed its implementation was off.

Spaghetti may mean it's time to rewrite things. The following is a
goto-less alloc_pidmap() vs. 2.6.9-rc1-mm4 addressing all stated
concerns, but otherwise changing none of the semantics. Untested.


-- wli

Index: mm4-2.6.9-rc1/kernel/pid.c
===================================================================
--- mm4-2.6.9-rc1.orig/kernel/pid.c 2004-09-08 05:46:18.000000000 -0700
+++ mm4-2.6.9-rc1/kernel/pid.c 2004-09-12 09:44:52.586896544 -0700
@@ -38,6 +38,9 @@
#define PIDMAP_ENTRIES (PID_MAX_LIMIT/PAGE_SIZE/8)
#define BITS_PER_PAGE (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
+#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off))
+#define find_next_offset(map, off) \
+ find_next_zero_bit((map)->page, BITS_PER_PAGE, off)

/*
* PID-map pages start out as NULL, they get allocated upon
@@ -53,8 +56,6 @@
static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
{ [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };

-static pidmap_t *map_limit = pidmap_array + PIDMAP_ENTRIES;
-
static spinlock_t pidmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;

fastcall void free_pidmap(int pid)
@@ -66,15 +67,16 @@
atomic_inc(&map->nr_free);
}

-/*
- * Here we search for the next map that has free bits left.
- * Normally the next map has free PIDs.
- */
-static inline pidmap_t *next_free_map(pidmap_t *map, int *max_steps)
+int alloc_pidmap(void)
{
- while (--*max_steps) {
- if (++map == map_limit)
- map = pidmap_array;
+ int i, offset, pid = last_pid + 1;
+ pidmap_t *map;
+
+ if (pid >= pid_max)
+ pid = RESERVED_PIDS;
+ offset = pid & BITS_PER_PAGE_MASK;
+ map = &pidmap_array[pid/BITS_PER_PAGE];
+ for (i = 0; i < PIDMAP_ENTRIES; ++i) {
if (unlikely(!map->page)) {
unsigned long page = get_zeroed_page(GFP_KERNEL);
/*
@@ -87,64 +89,29 @@
else
map->page = (void *)page;
spin_unlock(&pidmap_lock);
-
- if (!map->page)
+ if (unlikely(!map->page))
break;
}
- if (atomic_read(&map->nr_free))
- return map;
- }
- return NULL;
-}
-
-int alloc_pidmap(void)
-{
- int pid, offset, max_steps = PIDMAP_ENTRIES + 1;
- pidmap_t *map;
-
- pid = last_pid + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
-
- offset = pid & BITS_PER_PAGE_MASK;
- map = pidmap_array + pid / BITS_PER_PAGE;
-
- if (likely(map->page && !test_and_set_bit(offset, map->page))) {
- /*
- * There is a small window for last_pid updates to race,
- * but in that case the next allocation will go into the
- * slowpath and that fixes things up.
- */
-return_pid:
- atomic_dec(&map->nr_free);
- last_pid = pid;
- return pid;
- }
-
- if (!offset || !atomic_read(&map->nr_free)) {
- if (!offset)
- map--;
-next_map:
- map = next_free_map(map, &max_steps);
- if (!map)
- goto failure;
- offset = 0;
+ if (likely(atomic_read(&map->nr_free))) {
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+ atomic_dec(&map->nr_free);
+ last_pid = pid;
+ return pid;
+ }
+ offset = find_next_offset(map, offset);
+ pid = mk_pid(map, offset);
+ } while (offset < BITS_PER_PAGE && pid < pid_max);
+ }
+ if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {
+ ++map;
+ offset = 0;
+ } else {
+ map = &pidmap_array[0];
+ offset = RESERVED_PIDS;
+ }
+ pid = mk_pid(map, offset);
}
- /*
- * Find the next zero bit:
- */
-scan_more:
- offset = find_next_zero_bit(map->page, BITS_PER_PAGE, offset);
- if (offset >= BITS_PER_PAGE)
- goto next_map;
- if (test_and_set_bit(offset, map->page))
- goto scan_more;
-
- /* we got the PID: */
- pid = (map - pidmap_array) * BITS_PER_PAGE + offset;
- goto return_pid;
-
-failure:
return -1;
}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/