patch for 2.1.123 RPC

Bill Hawes (whawes@transmeta.com)
Tue, 29 Sep 1998 13:09:52 -0700


This is a multi-part message in MIME format.
--------------67BC36E57CB67B1F6F789788
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

G. Allen Morris III wrote:

> I tried running `dd' the other day and it did hang. `ps' said that
> it was hung in `end'. The input Q was close to 64k and the output
> Q was 8k less.
>
> Now that I have a few seconds I have tried it again, but can't get it
> to fail. Has anyone else tried this?

I've put together a patch for the RPC layer that appears to fix the problem with
knfsd getting stuck.

The problem was that knfsd was waiting in the svc_recv routine, and was not being
awakened when the client-side RPC retransmitted the RPC requests. This seems to
have been caused by a race condition in testing for whether a socket had been
assigned after waking up. The patch protects the test with a bh-exclusion, and
this made the problem go away.

(I should note that I don't quite understand _why_ this was happening -- it
appears that the wake-ups under the conditions I was testing should always have
set rq_sock and dequeued the thread. But removing the thread from the server queue
did need to be protected from bh activity.)

I tested this by running a script looping over 100 dd commands writing 40- or
80-megabyte files; before the changes the knfsd server would get stuck pretty
quickly, but the script now runs to completion with no apparent problems.

The patch revises the use of the rq_wait queue in svc_recv, as the previous usage
was decidedly non-standard -- it was doing an add_wait_queue, but no
remove_wait_queue, and then was zeroing the wait queue each time.

I've also changed the code that reassigns a freed request slot; it now just wakes
up the backlog queue if the transport is not congested. The previous code doesn't
look quite safe to me ...

If anyone else has seen the problems with knfsd and large dd commands, please give
this a try and let me know of any remaining problems.

Regards,
Bill

--------------67BC36E57CB67B1F6F789788
Content-Type: text/plain; charset=us-ascii; name="sunrpc_svc123-patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="sunrpc_svc123-patch"

--- linux-2.1.123/net/sunrpc/clnt.c.old Mon Sep 28 07:56:39 1998
+++ linux-2.1.123/net/sunrpc/clnt.c Tue Sep 29 12:11:54 1998
@@ -58,6 +58,7 @@
static u32 * call_header(struct rpc_task *task);
static u32 * call_verify(struct rpc_task *task);

+
/*
* Create an RPC client
* FIXME: This should also take a flags argument (as in task->tk_flags).
@@ -500,13 +501,12 @@
call_receive(struct rpc_task *task)
{
dprintk("RPC: %4d call_receive (status %d)\n",
- task->tk_pid, task->tk_status);
+ task->tk_pid, task->tk_status);

+ task->tk_action = call_status;
/* In case of error, evaluate status */
- if (task->tk_status < 0) {
- task->tk_action = call_status;
+ if (task->tk_status < 0)
return;
- }

/* If we have no decode function, this means we're performing
* a void call (a la lockd message passing). */
@@ -516,7 +516,6 @@
return;
}

- task->tk_action = call_status;
xprt_receive(task);
}

@@ -579,7 +578,8 @@
task->tk_pid);
goto minor_timeout;
}
- if ((to->to_initval <<= 1) > to->to_maxval)
+ to->to_initval <<= 1;
+ if (to->to_initval > to->to_maxval)
to->to_initval = to->to_maxval;
}

@@ -592,9 +592,13 @@
return;
}
if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) {
- printk("%s: server %s not responding, still trying\n",
- clnt->cl_protname, clnt->cl_server);
task->tk_flags |= RPC_CALL_MAJORSEEN;
+ if (req)
+ printk("%s: server %s not responding, still trying\n",
+ clnt->cl_protname, clnt->cl_server);
+ else
+ printk("%s: task %d can't get a request slot\n",
+ clnt->cl_protname, task->tk_pid);
}
if (clnt->cl_autobind)
clnt->cl_port = 0;
--- linux-2.1.123/net/sunrpc/svcsock.c.old Wed Sep 23 11:42:44 1998
+++ linux-2.1.123/net/sunrpc/svcsock.c Tue Sep 29 12:17:01 1998
@@ -76,43 +76,52 @@
static inline void
svc_release_skb(struct svc_rqst *rqstp)
{
- if (!rqstp->rq_skbuff)
- return;
+ struct sk_buff *skb = rqstp->rq_skbuff;

- dprintk("svc: releasing skb %p\n", rqstp->rq_skbuff);
- skb_free_datagram(rqstp->rq_sock->sk_sk, rqstp->rq_skbuff);
+ if (!skb)
+ return;
rqstp->rq_skbuff = NULL;
+
+ dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+ skb_free_datagram(rqstp->rq_sock->sk_sk, skb);
}

/*
* Queue up a socket with data pending. If there are idle nfsd
- * processes, wake'em up.
+ * processes, wake 'em up.
* When calling this function, you should make sure it can't be interrupted
* by the network bottom half.
*/
-static inline void
+static void
svc_sock_enqueue(struct svc_sock *svsk)
{
+ struct svc_serv *serv = svsk->sk_server;
struct svc_rqst *rqstp;
- struct svc_serv *serv;
+
+ if (serv->sv_threads && serv->sv_sockets)
+ printk(KERN_ERR
+ "svc_sock_enqueue: threads and sockets both waiting??\n");

if (svsk->sk_busy) {
/* Don't enqueue socket while daemon is receiving */
- dprintk("svc: socket %p not enqueued: busy\n", svsk->sk_sk);
+ dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
return;
}

/* Mark socket as busy. It will remain in this state until the
* server has processed all pending data and put the socket back
- * on the idle list
+ * on the idle list.
*/
svsk->sk_busy = 1;

- serv = svsk->sk_server;
if ((rqstp = serv->sv_threads) != NULL) {
dprintk("svc: socket %p served by daemon %p\n",
- svsk->sk_sk, rqstp);
+ svsk->sk_sk, rqstp);
svc_serv_dequeue(serv, rqstp);
+ if (rqstp->rq_sock)
+ printk(KERN_ERR
+ "svc_sock_enqueue: server %p, rq_sock=%p!\n",
+ rqstp, rqstp->rq_sock);
rqstp->rq_sock = svsk;
svsk->sk_inuse++;
wake_up(&rqstp->rq_wait);
@@ -137,7 +146,8 @@
end_bh_atomic();

if (svsk) {
- dprintk("svc: socket %p dequeued\n", svsk->sk_sk);
+ dprintk("svc: socket %p dequeued, inuse=%d\n",
+ svsk->sk_sk, svsk->sk_inuse);
svsk->sk_qued = 0;
}

@@ -325,13 +335,12 @@
static void
svc_udp_data_ready(struct sock *sk, int count)
{
- struct svc_sock *svsk;
-
- dprintk("svc: socket %p data ready (inet %p)\n", sk->user_data, sk);
+ struct svc_sock *svsk = (struct svc_sock *)(sk->user_data);

- svsk = (struct svc_sock *)(sk->user_data);
if (!svsk)
return;
+ dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
+ svsk, sk, count, svsk->sk_busy);
svsk->sk_data = 1;
svc_sock_enqueue(svsk);
}
@@ -724,12 +733,21 @@
int
svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp)
{
- struct wait_queue wait = { current, NULL };
struct svc_sock *svsk;
int len;
+ struct wait_queue wait = { current, NULL };

dprintk("svc: server %p waiting for data (to = %ld)\n",
- rqstp, current->timeout);
+ rqstp, current->timeout);
+
+ if (rqstp->rq_sock)
+ printk(KERN_ERR
+ "svc_recv: service %p, socket not NULL!\n",
+ rqstp);
+ if (waitqueue_active(&rqstp->rq_wait))
+ printk(KERN_ERR
+ "svc_recv: service %p, wait queue active!\n",
+ rqstp);

again:
/* Initialize the buffers */
@@ -741,13 +759,10 @@

start_bh_atomic();
if ((svsk = svc_sock_dequeue(serv)) != NULL) {
- end_bh_atomic();
rqstp->rq_sock = svsk;
- svsk->sk_inuse++; /* N.B. where is this decremented? */
+ svsk->sk_inuse++;
} else {
/* No data pending. Go to sleep */
- rqstp->rq_sock = NULL;
- rqstp->rq_wait = NULL;
svc_serv_enqueue(serv, rqstp);

/*
@@ -759,15 +774,22 @@
end_bh_atomic();
schedule();

+ remove_wait_queue(&rqstp->rq_wait, &wait);
+
+ start_bh_atomic();
if (!(svsk = rqstp->rq_sock)) {
svc_serv_dequeue(serv, rqstp);
- if (!(svsk = rqstp->rq_sock))
- return signalled()? -EINTR : -EAGAIN;
+ end_bh_atomic();
+ dprintk("svc: server %p, no data yet\n", rqstp);
+ return signalled()? -EINTR : -EAGAIN;
}
}
+ end_bh_atomic();

- dprintk("svc: server %p servicing socket %p\n", rqstp, svsk);
+ dprintk("svc: server %p, socket %p, inuse=%d\n",
+ rqstp, svsk, svsk->sk_inuse);
len = svsk->sk_recvfrom(rqstp);
+ dprintk("svc: got len=%d\n", len);

/* No data, incomplete (TCP) read, or accept() */
if (len == 0 || len == -EAGAIN) {
--- linux-2.1.123/net/sunrpc/xprt.c.old Mon Sep 28 07:56:40 1998
+++ linux-2.1.123/net/sunrpc/xprt.c Tue Sep 29 12:05:34 1998
@@ -310,7 +310,7 @@
if ((cwnd >>= 1) < RPC_CWNDSCALE)
cwnd = RPC_CWNDSCALE;
xprt->congtime = jiffies + ((cwnd * HZ) << 3) / RPC_CWNDSCALE;
- dprintk("RPC: cong %08lx, cwnd was %08lx, now %08lx, "
+ dprintk("RPC: cong %ld, cwnd was %ld, now %ld, "
"time %ld ms\n", xprt->cong, xprt->cwnd, cwnd,
(xprt->congtime-jiffies)*1000/HZ);
pprintk("RPC: %lu %ld cwnd\n", jiffies, cwnd);
@@ -884,11 +884,14 @@
static void
xprt_timer(struct rpc_task *task)
{
- if (task->tk_rqstp)
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ if (req) {
xprt_adjust_cwnd(task->tk_xprt, -ETIMEDOUT);
+ }

- dprintk("RPC: %4d xprt_timer (%s request)\n", task->tk_pid,
- task->tk_rqstp? "pending" : "backlogged");
+ dprintk("RPC: %4d xprt_timer (%s request)\n",
+ task->tk_pid, req ? "pending" : "backlogged");

task->tk_status = -ETIMEDOUT;
task->tk_timeout = 0;
@@ -1155,12 +1158,13 @@
return;

bad_list:
- printk("RPC: %4d inconsistent free list (cong %ld cwnd %ld)\n",
+ printk(KERN_ERR
+ "RPC: %4d inconsistent free list (cong %ld cwnd %ld)\n",
task->tk_pid, xprt->cong, xprt->cwnd);
rpc_debug = ~0;
goto bummer;
bad_used:
- printk("RPC: used rqst slot %p on free list!\n", req);
+ printk(KERN_ERR "RPC: used rqst slot %p on free list!\n", req);
bummer:
task->tk_status = -EIO;
xprt->free = NULL;
@@ -1216,12 +1220,16 @@
}
end_bh_atomic();

- /* Decrease congestion value. If congestion threshold is not yet
- * reached, pass on the request slot.
+ /* Decrease congestion value. */
+ xprt->cong -= RPC_CWNDSCALE;
+
+#if 0
+ /* If congestion threshold is not yet reached, pass on the request slot.
* This looks kind of kludgy, but it guarantees backlogged requests
* are served in order.
+ * N.B. This doesn't look completely safe, as the task is still
+ * on the backlog list after wake-up.
*/
- xprt->cong -= RPC_CWNDSCALE;
if (!RPCXPRT_CONGESTED(xprt)) {
struct rpc_task *next = rpc_wake_up_next(&xprt->backlog);

@@ -1232,9 +1240,14 @@
return;
}
}
+#endif

req->rq_next = xprt->free;
xprt->free = req;
+
+ /* If not congested, wake up the next backlogged process */
+ if (!RPCXPRT_CONGESTED(xprt))
+ rpc_wake_up_next(&xprt->backlog);
}

/*

--------------67BC36E57CB67B1F6F789788--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/