Re: predictable IP ID

Savochkin Andrey Vladimirovich (saw@msu.ru)
Sat, 2 Oct 1999 22:41:11 +0400


Dave,

On Sun, Sep 26, 1999 at 06:18:24PM +0400, A.N.Kuznetsov wrote:
> I'll apply it to my tree for hour, break it a bit and bounce
> back to you with comments. OK?

I've finished the implementation of hard-to-predict IP IDs, with a lot of
help from Alexey (thanks to him).

The main changes since the initial implementation are the following
(a rough sketch of the resulting ID selection logic follows the list).
- A constant ID is used for packets which will never be defragmented (DF set
and not fragmented at the source). This reduces the usual size of the
information storage.
- The possibility of any serious DoS has finally been eliminated. Packets go
out under all circumstances, and in the case of overload old entries are
thrown away regardless of their age. All these changes make the chances of a
successful DoS attack even lower than with the current kernels.
Both implementations (current kernels and with the patch) may generate the
same IDs if more than 65536 packets are output in a short time, but nothing
more serious (like OOM) happens. The new implementation behaves even better
because 1) TCP traffic uses PMTU discovery and therefore constant IP IDs; and
2) an IP ID sequence wrap requires the packets to be destined to different
hosts.
- IP ID generation for raw packets was fixed.
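
To summarise, here is a user-space paraphrase of the resulting ID selection
(only an illustration of the logic in ip_select_ident()/__ip_select_ident()
in the patch below; "fake_peer", select_ident() and the rand()-based fallback
are stand-ins, not the kernel code itself):

/*
 * Sketch only: "fake_peer" stands in for struct inet_peer and rand()
 * stands in for the keyed hashing done by secure_ip_id().
 */
#include <stdint.h>
#include <stdlib.h>

struct fake_peer {
	uint32_t daddr;		/* peer's address */
	uint16_t ip_id_count;	/* per-destination counter, seeded from
				 * secure_ip_id(daddr) */
};

uint16_t select_ident(int df_set, struct fake_peer *peer, uint32_t daddr)
{
	/* Atomic packets (DF set, not fragmented locally) never reach
	 * reassembly, so a constant ID is sufficient. */
	if (df_set)
		return 1;
	/* Normal case: per-destination counter, one step per packet. */
	if (peer != NULL)
		return peer->ip_id_count++;
	/* Fallback when no peer entry could be allocated (OOM): a
	 * hard-to-predict value is better than no packet at all. */
	return (uint16_t)((unsigned)rand() ^ daddr);
}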

What do you think about the patch?

Best wishes
Andrey

diff -ruN linux-cvs.ipid.orig/Documentation/networking/ip-sysctl.txt linux-cvs.ipid/Documentation/networking/ip-sysctl.txt
--- linux-cvs.ipid.orig/Documentation/networking/ip-sysctl.txt Sun May 16 14:35:39 1999
+++ linux-cvs.ipid/Documentation/networking/ip-sysctl.txt Sat Oct 2 21:59:16 1999
@@ -51,6 +51,36 @@
ipfrag_time - INTEGER
Time in seconds to keep an IP fragment in memory.

+INET peer storage:
+
+inet_peer_threshold - INTEGER
+ The approximate size of the storage. Starting from this threshold
+ entries will be thrown aggressively. This threshold also determines
+ entries' time-to-live and time intervals between garbage collection
+ passes. More entries, less time-to-live, less GC interval.
+
+inet_peer_minttl - INTEGER
+ Minimum time-to-live of entries. Should be enough to cover fragment
+ time-to-live on the reassembling side. This minimum time-to-live is
+ guaranteed if the pool size is less than inet_peer_threshold.
+ Measured in jiffies.
+
+inet_peer_maxttl - INTEGER
+ Maximum time-to-live of entries. Unused entries will expire after
+ this period of time if there is no memory pressure on the pool (i.e.
+ when the number of entries in the pool is very small).
+ Measured in jiffies.
+
+inet_peer_gc_mintime - INTEGER
+ Minimum interval between garbage collection passes. This interval is
+ in effect under high memory pressure on the pool.
+ Measured in jiffies.
+
+inet_peer_gc_maxtime - INTEGER
+ Minimum interval between garbage collection passes. This interval is
+ in effect under low (or absent) memory pressure on the pool.
+ Measured in jiffies.
+
TCP variables:

tcp_syn_retries - INTEGER
diff -ruN linux-cvs.ipid.orig/drivers/char/random.c linux-cvs.ipid/drivers/char/random.c
--- linux-cvs.ipid.orig/drivers/char/random.c Wed Sep 1 21:50:37 1999
+++ linux-cvs.ipid/drivers/char/random.c Sat Oct 2 20:18:23 1999
@@ -2046,6 +2046,38 @@
saddr, daddr, sport, dport, seq);
#endif
return seq;
+}
+
+/* The code below is shamelessly stolen from secure_tcp_sequence_number().
+ * All blames to Andrey V. Savochkin <saw@msu.ru>.
+ */
+__u32 secure_ip_id(__u32 daddr)
+{
+ static time_t rekey_time = 0;
+ static __u32 secret[12];
+ time_t t;
+
+ /*
+ * Pick a random secret every REKEY_INTERVAL seconds.
+ */
+ t = CURRENT_TIME;
+ if (!rekey_time || (t - rekey_time) > REKEY_INTERVAL) {
+ rekey_time = t;
+ /* First word is overwritten below. */
+ get_random_bytes(secret+1, sizeof(secret)-4);
+ }
+
+ /*
+ * Pick a unique starting offset for each IP destination.
+ * Note that the words are placed into the first words to be
+ * mixed in with the halfMD4. This is because the starting
+ * vector is also a random secret (at secret+8), and further
+ * hashing fixed data into it isn't going to improve anything,
+ * so we should get started with the variable data.
+ */
+ secret[0]=daddr;
+
+ return halfMD4Transform(secret+8, secret);
}

#ifdef CONFIG_SYN_COOKIES
diff -ruN linux-cvs.ipid.orig/include/linux/random.h linux-cvs.ipid/include/linux/random.h
--- linux-cvs.ipid.orig/include/linux/random.h Wed Sep 1 22:14:14 1999
+++ linux-cvs.ipid/include/linux/random.h Mon Sep 27 16:17:42 1999
@@ -54,6 +54,7 @@
extern void get_random_bytes(void *buf, int nbytes);
void generate_random_uuid(unsigned char uuid_out[16]);

+extern __u32 secure_ip_id(__u32 daddr);
extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
__u16 sport, __u16 dport);
extern __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr,
diff -ruN linux-cvs.ipid.orig/include/linux/sysctl.h linux-cvs.ipid/include/linux/sysctl.h
--- linux-cvs.ipid.orig/include/linux/sysctl.h Fri Sep 10 19:22:52 1999
+++ linux-cvs.ipid/include/linux/sysctl.h Tue Sep 21 17:19:11 1999
@@ -240,6 +240,11 @@
NET_TCP_TW_RECYCLE=66,
NET_IPV4_ALWAYS_DEFRAG=67,
NET_IPV4_TCP_KEEPALIVE_INTVL=68,
+ NET_IPV4_INET_PEER_THRESHOLD=69,
+ NET_IPV4_INET_PEER_MINTTL=70,
+ NET_IPV4_INET_PEER_MAXTTL=71,
+ NET_IPV4_INET_PEER_GC_MINTIME=72,
+ NET_IPV4_INET_PEER_GC_MAXTIME=73,
};

enum {
diff -ruN linux-cvs.ipid.orig/include/net/dst.h linux-cvs.ipid/include/net/dst.h
--- linux-cvs.ipid.orig/include/net/dst.h Sun Aug 22 19:31:21 1999
+++ linux-cvs.ipid/include/net/dst.h Sun Sep 12 18:06:28 1999
@@ -23,6 +23,7 @@
#define DST_GC_MAX (120*HZ)

struct sk_buff;
+struct inet_peer;

struct dst_entry
{
diff -ruN linux-cvs.ipid.orig/include/net/inetpeer.h linux-cvs.ipid/include/net/inetpeer.h
--- linux-cvs.ipid.orig/include/net/inetpeer.h Thu Jan 1 03:00:00 1970
+++ linux-cvs.ipid/include/net/inetpeer.h Mon Sep 27 16:30:23 1999
@@ -0,0 +1,63 @@
+/*
+ * INETPEER - A storage for permanent information about peers
+ *
+ * Version: $Id$
+ *
+ * Authors: Andrey V. Savochkin <saw@msu.ru>
+ */
+
+#ifndef _NET_INETPEER_H
+#define _NET_INETPEER_H
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+struct inet_peer {
+ struct inet_peer *avl_left, *avl_right;
+ struct inet_peer *unused_next, **unused_prevp;
+ atomic_t refcnt;
+ unsigned long dtime; /* the time of last use of not
+ * referenced entries */
+ __u32 v4daddr; /* peer's address */
+ __u16 avl_height;
+ __u16 ip_id_count; /* IP ID for the next packet */
+};
+
+void inet_initpeers(void) __init;
+
+/* can be called with or without local BH being disabled */
+struct inet_peer *inet_getpeer(__u32 daddr);
+
+extern spinlock_t inet_peer_unused_lock;
+extern struct inet_peer *inet_peer_unused_head;
+extern struct inet_peer **inet_peer_unused_tailp;
+/* can be called from BH context or outside */
+extern inline void inet_putpeer(struct inet_peer *p)
+{
+ spin_lock_bh(&inet_peer_unused_lock);
+ if (atomic_dec_and_test(&p->refcnt)) {
+ p->unused_prevp = inet_peer_unused_tailp;
+ p->unused_next = NULL;
+ *inet_peer_unused_tailp = p;
+ inet_peer_unused_tailp = &p->unused_next;
+ p->dtime = jiffies;
+ }
+ spin_unlock_bh(&inet_peer_unused_lock);
+}
+
+extern spinlock_t inet_peer_idlock;
+/* can be called with or without local BH being disabled */
+extern inline __u16 inet_getid(struct inet_peer *p)
+{
+ __u16 id;
+
+ spin_lock_bh(&inet_peer_idlock);
+ id = p->ip_id_count++;
+ spin_unlock_bh(&inet_peer_idlock);
+ return id;
+}
+
+#endif /* _NET_INETPEER_H */
diff -ruN linux-cvs.ipid.orig/include/net/ip.h linux-cvs.ipid/include/net/ip.h
--- linux-cvs.ipid.orig/include/net/ip.h Sun Aug 22 19:31:21 1999
+++ linux-cvs.ipid/include/net/ip.h Sat Oct 2 21:00:08 1999
@@ -96,7 +96,6 @@
extern int ip_fragment(struct sk_buff *skb, int (*out)(struct sk_buff*));
extern int ip_do_nat(struct sk_buff *skb);
extern void ip_send_check(struct iphdr *ip);
-extern int ip_id_count;
extern int ip_queue_xmit(struct sk_buff *skb);
extern void ip_init(void);
extern int ip_build_xmit(struct sock *sk,
@@ -162,6 +161,16 @@
return (sk->protinfo.af_inet.pmtudisc == IP_PMTUDISC_DO ||
(sk->protinfo.af_inet.pmtudisc == IP_PMTUDISC_WANT &&
!(dst->mxlock&(1<<RTAX_MTU))));
+}
+
+extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst);
+
+extern __inline__ void ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
+{
+ if (iph->frag_off&__constant_htons(IP_DF))
+ iph->id = __constant_htons(1);
+ else
+ __ip_select_ident(iph, dst);
}

/*
diff -ruN linux-cvs.ipid.orig/include/net/route.h linux-cvs.ipid/include/net/route.h
--- linux-cvs.ipid.orig/include/net/route.h Sun Aug 22 19:31:27 1999
+++ linux-cvs.ipid/include/net/route.h Fri Sep 17 19:00:39 1999
@@ -53,6 +53,7 @@
__u8 scope;
};

+struct inet_peer;
struct rtable
{
union
@@ -76,6 +77,7 @@

/* Miscellaneous cached information */
__u32 rt_spec_dst; /* RFC1122 specific destination */
+ struct inet_peer *peer; /* long-living peer info */

#ifdef CONFIG_IP_ROUTE_NAT
__u32 rt_src_map;
diff -ruN linux-cvs.ipid.orig/net/ipv4/Makefile linux-cvs.ipid/net/ipv4/Makefile
--- linux-cvs.ipid.orig/net/ipv4/Makefile Sun Aug 22 19:32:50 1999
+++ linux-cvs.ipid/net/ipv4/Makefile Sun Sep 12 18:03:03 1999
@@ -8,7 +8,7 @@
# Note 2! The CFLAGS definition is now in the main makefile...

O_TARGET := ipv4.o
-IPV4_OBJS := utils.o route.o proc.o protocol.o \
+IPV4_OBJS := utils.o route.o inetpeer.o proc.o protocol.o \
ip_input.o ip_fragment.o ip_forward.o ip_options.o \
ip_output.o ip_sockglue.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o\
diff -ruN linux-cvs.ipid.orig/net/ipv4/igmp.c linux-cvs.ipid/net/ipv4/igmp.c
--- linux-cvs.ipid.orig/net/ipv4/igmp.c Sun Aug 22 19:32:57 1999
+++ linux-cvs.ipid/net/ipv4/igmp.c Mon Sep 27 16:17:42 1999
@@ -224,13 +224,13 @@
iph->version = 4;
iph->ihl = (sizeof(struct iphdr)+4)>>2;
iph->tos = 0;
- iph->frag_off = 0;
+ iph->frag_off = __constant_htons(IP_DF);
iph->ttl = 1;
iph->daddr = dst;
iph->saddr = rt->rt_src;
iph->protocol = IPPROTO_IGMP;
iph->tot_len = htons(IGMP_SIZE);
- iph->id = htons(ip_id_count++);
+ ip_select_ident(iph, &rt->u.dst);
((u8*)&iph[1])[0] = IPOPT_RA;
((u8*)&iph[1])[1] = 4;
((u8*)&iph[1])[2] = 0;
diff -ruN linux-cvs.ipid.orig/net/ipv4/inetpeer.c linux-cvs.ipid/net/ipv4/inetpeer.c
--- linux-cvs.ipid.orig/net/ipv4/inetpeer.c Thu Jan 1 03:00:00 1970
+++ linux-cvs.ipid/net/ipv4/inetpeer.c Sat Oct 2 22:01:49 1999
@@ -0,0 +1,443 @@
+/*
+ * INETPEER - A storage for permanent information about peers
+ *
+ * This source is covered by the GNU GPL, the same as all kernel sources.
+ *
+ * Version: $Id$
+ *
+ * Authors: Andrey V. Savochkin <saw@msu.ru>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <net/inetpeer.h>
+
+/*
+ * Theory of operations.
+ * We keep one entry for each peer IP address. The nodes contain long-living
+ * information about the peer which doesn't depend on routes.
+ * At this moment this information consists only of ID field for the next
+ * outgoing IP packet. This field is incremented with each packet as encoded
+ * in inet_getid() function (include/net/inetpeer.h).
+ * At the moment of writing these notes, an unpredictable identifier is
+ * generated with this code only for packets subjected (actually or
+ * potentially) to defragmentation. I.e. DF packets smaller than the
+ * PMTU use a constant ID and do not use this code (see
+ * ip_select_ident() in include/net/ip.h).
+ *
+ * Route cache entries hold references to our nodes.
+ * New cache entries get references via lookup by destination IP address in
+ * the avl tree. The reference is grabbed only when it's needed i.e. only
+ * when we try to output IP packet which needs an unpredictable ID (see
+ * __ip_select_ident() in net/ipv4/route.c).
+ * Nodes are removed only when the reference counter goes to 0.
+ * Once that has happened, the node may be removed after a sufficient amount
+ * of time has passed since its last use. The least-recently-used entry can
+ * also be removed if the pool is overloaded, i.e. if the total number of
+ * entries is greater than or equal to the threshold.
+ *
+ * Node pool is organised as an AVL tree.
+ * Such an implementation has been chosen not just for fun. It's a way to
+ * prevent easy and efficient DoS attacks by creating hash collisions. A huge
+ * amount of long living nodes in a single hash slot would significantly delay
+ * lookups performed with disabled BHs.
+ *
+ * Serialisation issues.
+ * 1. Nodes may appear in the tree only with the pool write lock held.
+ * 2. Nodes may disappear from the tree only with the pool write lock held
+ * AND reference count being 0.
+ * 3. Nodes appear on and disappear from the unused node list only under
+ * "inet_peer_unused_lock".
+ * 4. Global variable peer_total is modified under the pool lock.
+ * 5. struct inet_peer fields modification:
+ * avl_left, avl_right, avl_parent, avl_height: pool lock
+ * unused_next, unused_prevp: unused node list lock
+ * refcnt: atomically against modifications on other CPU;
+ * usually under some other lock to prevent node disappearing
+ * dtime: unused node list lock
+ * v4daddr: unchangeable
+ * ip_id_count: idlock
+ */
+
+spinlock_t inet_peer_idlock = SPIN_LOCK_UNLOCKED;
+
+static kmem_cache_t *peer_cachep;
+
+#define node_height(x) x->avl_height
+static struct inet_peer peer_fake_node = {
+ avl_left : &peer_fake_node,
+ avl_right : &peer_fake_node,
+ avl_height : 0
+};
+#define peer_avl_empty (&peer_fake_node)
+static struct inet_peer *peer_root = peer_avl_empty;
+static rwlock_t peer_pool_lock = RW_LOCK_UNLOCKED;
+#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+
+static volatile int peer_total = 0;
+int inet_peer_threshold = 65536 + 128; /* start to throw entries more
+ * aggressively at this stage */
+int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */
+int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */
+struct inet_peer *inet_peer_unused_head = NULL,
+ **inet_peer_unused_tailp = &inet_peer_unused_head;
+spinlock_t inet_peer_unused_lock = SPIN_LOCK_UNLOCKED;
+#define PEER_MAX_CLEANUP_WORK 30
+
+static void peer_check_expire(unsigned long dummy);
+static struct timer_list peer_periodic_timer =
+ { NULL, NULL, 0, 0, &peer_check_expire };
+int inet_peer_gc_mintime = 10 * HZ,
+ inet_peer_gc_maxtime = 120 * HZ;
+
+
+void __init inet_initpeers(void)
+{
+ struct sysinfo si;
+
+ /* Use the straight interface to information about memory. */
+ si_meminfo(&si);
+ /* The values below were suggested by Alexey Kuznetsov
+ * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
+ * myself. --SAW
+ */
+ if (si.totalram <= 32768*1024)
+ inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
+ if (si.totalram <= 16384*1024)
+ inet_peer_threshold >>= 1; /* about 512KB */
+ if (si.totalram <= 8192*1024)
+ inet_peer_threshold >>= 2; /* about 128KB */
+
+ peer_cachep = kmem_cache_create("inet_peer_cache",
+ sizeof(struct inet_peer),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
+ /* All the timers, started at system startup tend
+ to synchronize. Perturb it a bit.
+ */
+ peer_periodic_timer.expires = jiffies
+ + net_random() % inet_peer_gc_maxtime
+ + inet_peer_gc_maxtime;
+ add_timer(&peer_periodic_timer);
+}
+
+/* Called with local BH disabled. */
+static void unlink_from_unused(struct inet_peer *p)
+{
+ spin_lock(&inet_peer_unused_lock);
+ if (p->unused_prevp != NULL) {
+ /* On unused list. */
+ *p->unused_prevp = p->unused_next;
+ if (p->unused_next != NULL)
+ p->unused_next->unused_prevp = p->unused_prevp;
+ else
+ inet_peer_unused_tailp = p->unused_prevp;
+ p->unused_prevp = NULL; /* mark it as removed */
+ }
+ spin_unlock(&inet_peer_unused_lock);
+}
+
+/* Called with local BH disabled and the pool lock held. */
+#define lookup(daddr) \
+({ \
+ struct inet_peer *u, **v; \
+ stackptr = stack; \
+ *stackptr++ = &peer_root; \
+ for (u = peer_root; u != peer_avl_empty; ) { \
+ if (daddr == u->v4daddr) \
+ break; \
+ if (daddr < u->v4daddr) \
+ v = &u->avl_left; \
+ else \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = *v; \
+ } \
+ u; \
+})
+
+/* Called with local BH disabled and the pool write lock held. */
+#define lookup_rightempty(start) \
+({ \
+ struct inet_peer *u, **v; \
+ *stackptr++ = &start->avl_left; \
+ v = &start->avl_left; \
+ for (u = *v; u->avl_right != peer_avl_empty; ) { \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = *v; \
+ } \
+ u; \
+})
+
+/* Called with local BH disabled and the pool write lock held.
+ * Variable names are the proof of operation correctness.
+ * Look into mm/map_avl.c for a more detailed description of the ideas. */
+static void peer_avl_rebalance(struct inet_peer **stack[],
+ struct inet_peer ***stackend)
+{
+ struct inet_peer **nodep, *node, *l, *r;
+ int lh, rh;
+
+ while (stackend > stack) {
+ nodep = *--stackend;
+ node = *nodep;
+ l = node->avl_left;
+ r = node->avl_right;
+ lh = node_height(l);
+ rh = node_height(r);
+ if (lh > rh + 1) { /* l: RH+2 */
+ struct inet_peer *ll, *lr, *lrl, *lrr;
+ int lrh;
+ ll = l->avl_left;
+ lr = l->avl_right;
+ lrh = node_height(lr);
+ if (lrh <= node_height(ll)) { /* ll: RH+1 */
+ node->avl_left = lr; /* lr: RH or RH+1 */
+ node->avl_right = r; /* r: RH */
+ node->avl_height = lrh + 1; /* RH+1 or RH+2 */
+ l->avl_left = ll; /* ll: RH+1 */
+ l->avl_right = node; /* node: RH+1 or RH+2 */
+ l->avl_height = node->avl_height + 1;
+ *nodep = l;
+ } else { /* ll: RH, lr: RH+1 */
+ lrl = lr->avl_left; /* lrl: RH or RH-1 */
+ lrr = lr->avl_right; /* lrr: RH or RH-1 */
+ node->avl_left = lrr; /* lrr: RH or RH-1 */
+ node->avl_right = r; /* r: RH */
+ node->avl_height = rh + 1; /* node: RH+1 */
+ l->avl_left = ll; /* ll: RH */
+ l->avl_right = lrl; /* lrl: RH or RH-1 */
+ l->avl_height = rh + 1; /* l: RH+1 */
+ lr->avl_left = l; /* l: RH+1 */
+ lr->avl_right = node; /* node: RH+1 */
+ lr->avl_height = rh + 2;
+ *nodep = lr;
+ }
+ } else if (rh > lh + 1) { /* r: LH+2 */
+ struct inet_peer *rr, *rl, *rlr, *rll;
+ int rlh;
+ rr = r->avl_right;
+ rl = r->avl_left;
+ rlh = node_height(rl);
+ if (rlh <= node_height(rr)) { /* rr: LH+1 */
+ node->avl_right = rl; /* rl: LH or LH+1 */
+ node->avl_left = l; /* l: LH */
+ node->avl_height = rlh + 1; /* LH+1 or LH+2 */
+ r->avl_right = rr; /* rr: LH+1 */
+ r->avl_left = node; /* node: LH+1 or LH+2 */
+ r->avl_height = node->avl_height + 1;
+ *nodep = r;
+ } else { /* rr: LH, rl: LH+1 */
+ rlr = rl->avl_right; /* rlr: LH or LH-1 */
+ rll = rl->avl_left; /* rll: LH or LH-1 */
+ node->avl_right = rll; /* rll: LH or LH-1 */
+ node->avl_left = l; /* l: LH */
+ node->avl_height = lh + 1; /* node: LH+1 */
+ r->avl_right = rr; /* rr: LH */
+ r->avl_left = rlr; /* rlr: LH or LH-1 */
+ r->avl_height = lh + 1; /* r: LH+1 */
+ rl->avl_right = r; /* r: LH+1 */
+ rl->avl_left = node; /* node: LH+1 */
+ rl->avl_height = lh + 2;
+ *nodep = rl;
+ }
+ } else {
+ node->avl_height = (lh > rh ? lh : rh) + 1;
+ }
+ }
+}
+
+/* Called with local BH disabled and the pool write lock held. */
+#define link_to_pool(n) \
+do { \
+ n->avl_height = 1; \
+ n->avl_left = peer_avl_empty; \
+ n->avl_right = peer_avl_empty; \
+ **--stackptr = n; \
+ peer_avl_rebalance(stack, stackptr); \
+} while(0)
+
+/* May be called with local BH enabled. */
+static void unlink_from_pool(struct inet_peer *p)
+{
+ int do_free;
+
+ do_free = 0;
+
+ write_lock_bh(&peer_pool_lock);
+ /* Check the reference counter. It was artificially incremented by 1
+ * in cleanup() function to prevent sudden disappearing. If the
+ * reference count is still 1 then the node is referenced only as `p'
+ * here and from the pool. So under the exclusive pool lock it's safe
+ * to remove the node and free it later. */
+ if (atomic_read(&p->refcnt) == 1) {
+ struct inet_peer **stack[PEER_MAXDEPTH];
+ struct inet_peer ***stackptr, ***delp;
+ if (lookup(p->v4daddr) != p)
+ BUG();
+ delp = stackptr - 1; /* *delp[0] == p */
+ if (p->avl_left == peer_avl_empty) {
+ *delp[0] = p->avl_right;
+ --stackptr;
+ } else {
+ /* look for a node to insert instead of p */
+ struct inet_peer *t;
+ t = lookup_rightempty(p);
+ if (*stackptr[-1] != t)
+ BUG();
+ **--stackptr = t->avl_left;
+ /* t is removed, t->v4daddr > x->v4daddr for any
+ * x in p->avl_left subtree.
+ * Put t in the old place of p. */
+ *delp[0] = t;
+ t->avl_left = p->avl_left;
+ t->avl_right = p->avl_right;
+ t->avl_height = p->avl_height;
+ if (delp[1] != &p->avl_left)
+ BUG();
+ delp[1] = &t->avl_left; /* was &p->avl_left */
+ }
+ peer_avl_rebalance(stack, stackptr);
+ peer_total--;
+ do_free = 1;
+ }
+ write_unlock_bh(&peer_pool_lock);
+
+ if (do_free)
+ kmem_cache_free(peer_cachep, p);
+ else
+ /* The node is used again. Decrease the reference counter
+ * back. The loop "cleanup -> unlink_from_unused
+ * -> unlink_from_pool -> putpeer -> link_to_unused
+ * -> cleanup (for the same node)"
+ * doesn't really exist because the entry will have a
+ * recent deletion time and will not be cleaned again soon. */
+ inet_putpeer(p);
+}
+
+/* May be called with local BH enabled. */
+static int cleanup_once(unsigned long ttl)
+{
+ struct inet_peer *p;
+
+ /* Remove the first entry from the list of unused nodes. */
+ spin_lock_bh(&inet_peer_unused_lock);
+ p = inet_peer_unused_head;
+ if (p != NULL) {
+ if (time_before(p->dtime + ttl, jiffies)) {
+ /* Do not prune fresh entries. */
+ spin_unlock(&inet_peer_unused_lock);
+ return -1;
+ }
+ inet_peer_unused_head = p->unused_next;
+ if (p->unused_next != NULL)
+ p->unused_next->unused_prevp = p->unused_prevp;
+ else
+ inet_peer_unused_tailp = p->unused_prevp;
+ p->unused_prevp = NULL; /* mark as not on the list */
+ /* Grab an extra reference to prevent node disappearing
+ * before unlink_from_pool() call. */
+ atomic_inc(&p->refcnt);
+ }
+ spin_unlock_bh(&inet_peer_unused_lock);
+
+ if (p == NULL)
+ /* It means that the total number of USED entries has
+ * grown over inet_peer_threshold. It shouldn't really
+ * happen because of entry limits in route cache. */
+ return -1;
+
+ unlink_from_pool(p);
+ return 0;
+}
+
+/* Called with or without local BH being disabled. */
+struct inet_peer *inet_getpeer(__u32 daddr)
+{
+ struct inet_peer *p, *n;
+ struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
+
+ /* Look up for the address quickly. */
+ read_lock_bh(&peer_pool_lock);
+ p = lookup(daddr);
+ if (p != peer_avl_empty)
+ atomic_inc(&p->refcnt);
+ read_unlock_bh(&peer_pool_lock);
+
+ if (p != peer_avl_empty) {
+ /* The existing node has been found. */
+ /* Remove the entry from unused list if it was there. */
+ unlink_from_unused(p);
+ return p;
+ }
+
+ /* Allocate the space outside the locked region. */
+ n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
+ if (n == NULL)
+ return NULL;
+ n->v4daddr = daddr;
+ atomic_set(&n->refcnt, 1);
+ n->ip_id_count = secure_ip_id(daddr);
+
+ write_lock_bh(&peer_pool_lock);
+ /* Check if an entry has suddenly appeared. */
+ p = lookup(daddr);
+ if (p != peer_avl_empty)
+ goto out_free;
+
+ /* Link the node. */
+ link_to_pool(n);
+ n->unused_prevp = NULL; /* not on the list */
+ peer_total++;
+ write_unlock_bh(&peer_pool_lock);
+
+ if (peer_total >= inet_peer_threshold)
+ /* Remove one less-recently-used entry. */
+ cleanup_once(0);
+
+ return n;
+
+out_free:
+ /* The appropriate node is already in the pool. */
+ atomic_inc(&p->refcnt);
+ write_unlock_bh(&peer_pool_lock);
+ /* Remove the entry from unused list if it was there. */
+ unlink_from_unused(p);
+ /* Free the preallocated node. */
+ kmem_cache_free(peer_cachep, n);
+ return p;
+}
+
+/* Called with local BH disabled. */
+static void peer_check_expire(unsigned long dummy)
+{
+ int i;
+ int ttl;
+
+ if (peer_total >= inet_peer_threshold)
+ ttl = inet_peer_minttl;
+ else
+ ttl = inet_peer_maxttl
+ - (inet_peer_maxttl - inet_peer_minttl) / HZ *
+ peer_total / inet_peer_threshold * HZ;
+ for (i = 0; i < PEER_MAX_CLEANUP_WORK && cleanup_once(ttl); i++);
+
+ /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
+ * interval depending on the total number of entries (more entries,
+ * less interval). */
+ peer_periodic_timer.expires = jiffies
+ + inet_peer_gc_maxtime
+ - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
+ peer_total / inet_peer_threshold * HZ;
+ add_timer(&peer_periodic_timer);
+}
diff -ruN linux-cvs.ipid.orig/net/ipv4/ip_gre.c linux-cvs.ipid/net/ipv4/ip_gre.c
--- linux-cvs.ipid.orig/net/ipv4/ip_gre.c Tue Aug 31 21:59:58 1999
+++ linux-cvs.ipid/net/ipv4/ip_gre.c Mon Sep 27 16:17:42 1999
@@ -815,7 +815,7 @@
}

iph->tot_len = htons(skb->len);
- iph->id = htons(ip_id_count++);
+ ip_select_ident(iph, &rt->u.dst);
ip_send_check(iph);

stats->tx_bytes += skb->len;
diff -ruN linux-cvs.ipid.orig/net/ipv4/ip_output.c linux-cvs.ipid/net/ipv4/ip_output.c
--- linux-cvs.ipid.orig/net/ipv4/ip_output.c Tue Sep 7 21:22:04 1999
+++ linux-cvs.ipid/net/ipv4/ip_output.c Sat Oct 2 21:00:08 1999
@@ -71,6 +71,7 @@
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
+#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mroute.h>
@@ -82,8 +83,6 @@

int sysctl_ip_dynaddr = 0;

-int ip_id_count = 0;
-
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
@@ -172,7 +171,7 @@
iph->saddr = rt->rt_src;
iph->protocol = sk->protocol;
iph->tot_len = htons(skb->len);
- iph->id = htons(ip_id_count++);
+ ip_select_ident(iph, &rt->u.dst);
skb->nh.iph = iph;

if (opt && opt->optlen) {
@@ -356,6 +355,8 @@
if (ip_dont_fragment(sk, &rt->u.dst))
iph->frag_off |= __constant_htons(IP_DF);

+ ip_select_ident(iph, &rt->u.dst);
+
/* Add an IP checksum. */
ip_send_check(iph);

@@ -375,6 +376,7 @@
kfree_skb(skb);
return -EMSGSIZE;
}
+ ip_select_ident(iph, &rt->u.dst);
return ip_fragment(skb, skb->dst->output);
}

@@ -429,7 +431,6 @@
}

iph->tot_len = htons(skb->len);
- iph->id = htons(ip_id_count++);

return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
ip_queue_xmit2);
@@ -475,7 +476,7 @@
int err;
int offset, mf;
int mtu;
- unsigned short id;
+ u16 id = __constant_htons(1);

int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
int nfrags=0;
@@ -510,15 +511,14 @@
/*
* Start at the end of the frame by handling the remainder.
*/
-
offset = length - (length % (maxfraglen - fragheaderlen));
-
+
/*
* Amount of memory to allocate for final fragment.
*/
-
+
fraglen = length - offset + fragheaderlen;
-
+
if (length-offset==0) {
fraglen = maxfraglen;
offset -= maxfraglen-fragheaderlen;
@@ -527,14 +527,12 @@
/*
* The last fragment will not have MF (more fragments) set.
*/
-
mf = 0;

/*
* Don't fragment packets for path mtu discovery.
*/
-
- if (offset > 0 && df) {
+ if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
return -EMSGSIZE;
}
@@ -542,15 +540,9 @@
goto out;

/*
- * Get an identifier
- */
-
- id = htons(ip_id_count++);
-
- /*
* Begin outputting the bytes.
*/
-
+
do {
char *data;
struct sk_buff * skb;
@@ -566,7 +558,7 @@
/*
* Fill in the control structures
*/
-
+
skb->priority = sk->priority;
skb->dst = dst_clone(&rt->u.dst);
skb_reserve(skb, hh_len);
@@ -574,14 +566,14 @@
/*
* Find where to start putting bytes.
*/
-
+
data = skb_put(skb, fraglen);
skb->nh.iph = (struct iphdr *)data;

/*
* Only write IP header onto non-raw packets
*/
-
+
{
struct iphdr *iph = (struct iphdr *)data;

@@ -594,9 +586,23 @@
}
iph->tos = sk->protinfo.af_inet.tos;
iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
+ iph->frag_off = htons(offset>>3)|mf|df;
iph->id = id;
- iph->frag_off = htons(offset>>3);
- iph->frag_off |= mf|df;
+ if (!mf) {
+ if (offset || !df) {
+ /* Select an unpredictable ident only
+ * for packets without DF or having
+ * been fragmented.
+ */
+ __ip_select_ident(iph, &rt->u.dst);
+ id = iph->id;
+ }
+
+ /*
+ * Any further fragments will have MF set.
+ */
+ mf = htons(IP_MF);
+ }
if (rt->rt_type == RTN_MULTICAST)
iph->ttl = sk->protinfo.af_inet.mc_ttl;
else
@@ -607,14 +613,8 @@
iph->daddr = rt->rt_dst;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
data += iph->ihl*4;
-
- /*
- * Any further fragments will have MF set.
- */
-
- mf = htons(IP_MF);
}
-
+
/*
* User data callback
*/
@@ -712,20 +712,20 @@
goto error;
skb_reserve(skb, hh_len);
}
-
+
skb->priority = sk->priority;
skb->dst = dst_clone(&rt->u.dst);

skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
-
+
if(!sk->protinfo.af_inet.hdrincl) {
iph->version=4;
iph->ihl=5;
iph->tos=sk->protinfo.af_inet.tos;
iph->tot_len = htons(length);
- iph->id=htons(ip_id_count++);
iph->frag_off = df;
iph->ttl=sk->protinfo.af_inet.mc_ttl;
+ ip_select_ident(iph, &rt->u.dst);
if (rt->rt_type != RTN_MULTICAST)
iph->ttl=sk->protinfo.af_inet.ttl;
iph->protocol=sk->protocol;
@@ -1027,6 +1027,7 @@
dev_add_pack(&ip_packet_type);

ip_rt_init();
+ inet_initpeers();

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
diff -ruN linux-cvs.ipid.orig/net/ipv4/ipip.c linux-cvs.ipid/net/ipv4/ipip.c
--- linux-cvs.ipid.orig/net/ipv4/ipip.c Tue Aug 31 21:59:58 1999
+++ linux-cvs.ipid/net/ipv4/ipip.c Mon Sep 27 16:17:42 1999
@@ -616,7 +616,7 @@
iph->ttl = old_iph->ttl;

iph->tot_len = htons(skb->len);
- iph->id = htons(ip_id_count++);
+ ip_select_ident(iph, &rt->u.dst);
ip_send_check(iph);

stats->tx_bytes += skb->len;
diff -ruN linux-cvs.ipid.orig/net/ipv4/ipmr.c linux-cvs.ipid/net/ipv4/ipmr.c
--- linux-cvs.ipid.orig/net/ipv4/ipmr.c Tue Aug 31 21:59:58 1999
+++ linux-cvs.ipid/net/ipv4/ipmr.c Mon Sep 27 16:17:42 1999
@@ -1095,7 +1095,7 @@
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
iph->tot_len = htons(skb->len);
- iph->id = htons(ip_id_count++);
+ ip_select_ident(iph, skb->dst);
ip_send_check(iph);

skb->h.ipiph = skb->nh.iph;
diff -ruN linux-cvs.ipid.orig/net/ipv4/raw.c linux-cvs.ipid/net/ipv4/raw.c
--- linux-cvs.ipid.orig/net/ipv4/raw.c Sun Aug 22 19:33:10 1999
+++ linux-cvs.ipid/net/ipv4/raw.c Mon Sep 27 16:17:42 1999
@@ -214,7 +214,7 @@

if (sk->protinfo.af_inet.recverr)
ip_icmp_error(sk, skb, err, 0, info, (u8 *)(skb->h.icmph + 1));
-
+
if (sk->protinfo.af_inet.recverr || harderr) {
sk->err = err;
sk->error_report(sk);
@@ -257,6 +257,7 @@
{
struct iovec *iov;
u32 saddr;
+ struct dst_entry *dst;
};

/*
@@ -296,7 +297,7 @@
* ip_build_xmit clean (well less messy).
*/
if (!iph->id)
- iph->id = htons(ip_id_count++);
+ ip_select_ident(iph, rfh->dst);
iph->check=ip_fast_csum((unsigned char *)iph, iph->ihl);
}
return 0;
@@ -372,6 +373,7 @@
}

rfh.saddr = ipc.addr;
+ rfh.dst = &rt->u.dst;
ipc.addr = daddr;

if (!ipc.opt)
diff -ruN linux-cvs.ipid.orig/net/ipv4/route.c linux-cvs.ipid/net/ipv4/route.c
--- linux-cvs.ipid.orig/net/ipv4/route.c Tue Aug 31 21:59:58 1999
+++ linux-cvs.ipid/net/ipv4/route.c Sat Oct 2 20:54:24 1999
@@ -83,9 +83,11 @@
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
+#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
@@ -134,6 +136,7 @@
static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32);
static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
struct sk_buff *);
+static void ipv4_dst_destroy(struct dst_entry * dst);
static struct dst_entry * ipv4_negative_advice(struct dst_entry *);
static void ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);
@@ -148,7 +151,7 @@
rt_garbage_collect,
ipv4_dst_check,
ipv4_dst_reroute,
- NULL,
+ ipv4_dst_destroy,
ipv4_negative_advice,
ipv4_link_failure,
sizeof(struct rtable),
@@ -625,6 +628,65 @@
return 0;
}

+static void rt_bind_peer(struct rtable *rt)
+{
+ static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
+ struct inet_peer *peer;
+
+ peer = inet_getpeer(rt->rt_dst);
+
+ spin_lock_bh(&rt_peer_lock);
+ if (rt->peer == NULL) {
+ rt->peer = peer;
+ peer = NULL;
+ }
+ spin_unlock_bh(&rt_peer_lock);
+ if (peer)
+ inet_putpeer(peer);
+}
+
+/*
+ * Peer allocation may fail only in serious out-of-memory conditions. However
+ * we can still generate some output.
+ * Random ID selection looks a bit dangerous because we have no chance of
+ * selecting an ID that is unique within a reasonable period of time.
+ * But a broken packet identifier may be better than no packet at all.
+ */
+static void ip_select_fb_ident(struct iphdr *iph)
+{
+ static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
+ static u32 ip_fallback_id;
+ u32 salt;
+
+ spin_lock_bh(&ip_fb_id_lock);
+ salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
+ iph->id = salt & 0xFFFF;
+ ip_fallback_id = salt;
+ spin_unlock_bh(&ip_fb_id_lock);
+}
+
+void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (rt) {
+ if (rt->peer == NULL)
+ rt_bind_peer(rt);
+
+ /* If peer is attached to destination, it is never detached,
+ so we do not need to grab a lock to dereference it.
+ */
+ if (rt->peer) {
+ iph->id = htons(inet_getid(rt->peer));
+ return;
+ }
+ } else {
+ printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
+ }
+
+ ip_select_fb_ident(iph);
+}
+
static void rt_del(unsigned hash, struct rtable *rt)
{
struct rtable **rthp;
@@ -726,6 +788,9 @@
/* Redirect received -> path was valid */
dst_confirm(&rth->u.dst);

+ if (rt->peer)
+ atomic_inc(&rt->peer->refcnt);
+
if (!arp_bind_neighbour(&rt->u.dst) ||
!(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
if (rt->u.dst.neighbour)
@@ -970,6 +1035,17 @@
struct sk_buff *skb)
{
return NULL;
+}
+
+static void ipv4_dst_destroy(struct dst_entry * dst)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ struct inet_peer *peer = rt->peer;
+
+ if (peer) {
+ rt->peer = NULL;
+ inet_putpeer(peer);
+ }
}

static void ipv4_link_failure(struct sk_buff *skb)
diff -ruN linux-cvs.ipid.orig/net/ipv4/sysctl_net_ipv4.c linux-cvs.ipid/net/ipv4/sysctl_net_ipv4.c
--- linux-cvs.ipid.orig/net/ipv4/sysctl_net_ipv4.c Tue Sep 7 21:22:04 1999
+++ linux-cvs.ipid/net/ipv4/sysctl_net_ipv4.c Tue Sep 21 18:21:21 1999
@@ -41,9 +41,6 @@
/* From ip_output.c */
extern int sysctl_ip_dynaddr;

-/* From ip_masq.c */
-extern int sysctl_ip_masq_debug;
-
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;
@@ -70,6 +67,13 @@
/* From igmp.c */
extern int sysctl_igmp_max_memberships;

+/* From inetpeer.c */
+extern int inet_peer_threshold;
+extern int inet_peer_minttl;
+extern int inet_peer_maxttl;
+extern int inet_peer_gc_mintime;
+extern int inet_peer_gc_maxtime;
+
int tcp_retr1_max = 255;

struct ipv4_config ipv4_config;
@@ -200,6 +204,20 @@
{NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships",
&sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec},
#endif
+ {NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold",
+ &inet_peer_threshold, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl",
+ &inet_peer_minttl, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl",
+ &inet_peer_maxttl, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime",
+ &inet_peer_gc_mintime, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime",
+ &inet_peer_gc_maxtime, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
{0}
};

diff -ruN linux-cvs.ipid.orig/net/ipv6/sit.c linux-cvs.ipid/net/ipv6/sit.c
--- linux-cvs.ipid.orig/net/ipv6/sit.c Tue Aug 31 21:59:58 1999
+++ linux-cvs.ipid/net/ipv6/sit.c Mon Sep 27 16:17:42 1999
@@ -544,7 +544,7 @@
iph->ttl = iph6->hop_limit;

iph->tot_len = htons(skb->len);
- iph->id = htons(ip_id_count++);
+ ip_select_ident(iph, &rt->u.dst);
ip_send_check(iph);

stats->tx_bytes += skb->len;
diff -ruN linux-cvs.ipid.orig/net/netsyms.c linux-cvs.ipid/net/netsyms.c
--- linux-cvs.ipid.orig/net/netsyms.c Tue Aug 31 21:59:55 1999
+++ linux-cvs.ipid/net/netsyms.c Fri Sep 17 19:43:15 1999
@@ -231,7 +231,6 @@
EXPORT_SYMBOL(ip_options_undo);
EXPORT_SYMBOL(arp_send);
EXPORT_SYMBOL(arp_broken_ops);
-EXPORT_SYMBOL(ip_id_count);
EXPORT_SYMBOL(ip_send_check);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(inet_family_ops);
