Re: block: make blktrace use per-cpu buffers for message notes

From: Jens Axboe
Date: Thu May 29 2008 - 03:09:27 EST


On Thu, May 29 2008, Jens Axboe wrote:
> On Wed, May 28 2008, Andrew Morton wrote:
> > On Thu, 29 May 2008 08:22:15 +0200 Jens Axboe <jens.axboe@xxxxxxxxxx> wrote:
> >
> > > On Wed, May 28 2008, Andrew Morton wrote:
> > > > On Wed, 28 May 2008 15:59:07 GMT Linux Kernel Mailing List <linux-kernel@xxxxxxxxxxxxxxx> wrote:
> > > >
> > > > > Gitweb: http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=64565911cdb57c2f512a9715b985b5617402cc67
> > > > > Commit: 64565911cdb57c2f512a9715b985b5617402cc67
> > > > > Parent: 4722dc52a891ab6cb2d637ddb87233e0ce277827
> > > > > Author: Jens Axboe <jens.axboe@xxxxxxxxxx>
> > > > > AuthorDate: Wed May 28 14:45:33 2008 +0200
> > > > > Committer: Jens Axboe <jens.axboe@xxxxxxxxxx>
> > > > > CommitDate: Wed May 28 14:49:27 2008 +0200
> > > >
> > > > please try to avoid merging unreviewed changes.
> > >
> > > Just because you didn't review it doesn't mean it's unreviewed :-)
> > >
> > > It's not unreviewed, it was posted on lkml and a few version were
> > > bounced back and forth.
> >
> > OK. The Subject: swizzling confounded me.
> >
> > > > > if (unlikely(bt)) \
> > > > > __trace_note_message(bt, fmt, ##__VA_ARGS__); \
> > > > > } while (0)
> > > > > -#define BLK_TN_MAX_MSG 1024
> > > > > +#define BLK_TN_MAX_MSG 128
> > > >
> > > > It seems a bit strange to do this right when we've taken this _off_ the
> > > > stack. But I suppose nothing will break.
> > >
> > > It was never on the stack, it was a global static char array. We are
> > > still allocating memory for this, per-cpu. So I think it still makes
> > > sense to shrink the size. It's really meant for small trace messages,
> > > 128 bytes is plenty. It's an in-kernel property, the userland app
> > > doesn't care. So we could easily grow this in the future, should the
> > > need arise.
> >
> > yup.
> >
> > It's a bit sad to stage the data in a local per-cpu buffer and then
> > copy it into relay's per-cpu buffer. I guess this is because the
> > length of the output isn't known beforehand. Could be fixed by doing
> > what kvasprintf() does, but that might well be slower.
>
> I agree, this is what we debated. My reasoning is that it's better
> to minimize usage of the relay buffer, so the stage-and-copy doesn't
> matter a whole lot.
>
> I seem to recall a relay_unreserve() patch from Tom back in the day,
> if we had something like that we could do the optimal approach of:

So I dug back into the old archives, and the patch was actually done
by me back in feb 2006. Not sure I particularly like it or if it
even works in this forward ported version, but it should get the
point across. blktrace is the sole user of relay_reserve(), so I
think it would be OK to change the interface.

This is clearly NOT 2.6.26 material though, but it's food for
thought (I hope).

diff --git a/block/blktrace.c b/block/blktrace.c
index 7ae87cc..8583bfc 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -27,6 +27,18 @@

static unsigned int blktrace_seq __read_mostly = 1;

+static void note_header(struct blk_io_trace *t, struct blk_trace *bt,
+ pid_t pid, int action, size_t len)
+{
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+ t->time = ktime_to_ns(ktime_get());
+ t->device = bt->dev;
+ t->action = action;
+ t->pid = pid;
+ t->cpu = smp_processor_id();
+ t->pdu_len = len;
+}
+
/*
* Send out a notify message.
*/
@@ -37,16 +49,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,

t = relay_reserve(bt->rchan, sizeof(*t) + len);
if (t) {
- const int cpu = smp_processor_id();
-
- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->time = ktime_to_ns(ktime_get());
- t->device = bt->dev;
- t->action = action;
- t->pid = pid;
- t->cpu = cpu;
- t->pdu_len = len;
+ note_header(t, bt, pid, action, len);
memcpy((void *) t + sizeof(*t), data, len);
+ relay_commit_reserve(bt->rchan);
}
}

@@ -77,17 +82,24 @@ static void trace_note_time(struct blk_trace *bt)

void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
{
- int n;
- va_list args;
- char *buf;
+ struct blk_io_trace *t;

preempt_disable();
- buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
- va_start(args, fmt);
- n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
- va_end(args);
+ t = relay_reserve(bt->rchan, sizeof(*t) + BLK_TN_MAX_MSG);
+ if (t) {
+ va_list args;
+ char *buf = (void *) t + sizeof(*t);
+ int len;

- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+ va_start(args, fmt);
+ len = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
+ va_end(args);
+
+ if (BLK_TN_MAX_MSG - len)
+ relay_unreserve(bt->rchan, BLK_TN_MAX_MSG - len);
+
+ relay_commit_reserve(bt->rchan);
+ }
preempt_enable();
}
EXPORT_SYMBOL_GPL(__trace_note_message);
@@ -187,6 +199,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,

if (pdu_len)
memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+ relay_commit_reserve(bt->rchan);
}

local_irq_restore(flags);
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 6cd8c44..593f27f 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -35,6 +35,7 @@ struct rchan_buf
void *start; /* start of channel buffer */
void *data; /* start of current sub-buffer */
size_t offset; /* current offset into sub-buffer */
+ size_t reserve_offset;
size_t subbufs_produced; /* count of sub-buffers produced */
size_t subbufs_consumed; /* count of sub-buffers consumed */
struct rchan *chan; /* associated channel */
@@ -206,6 +207,7 @@ static inline void relay_write(struct rchan *chan,
length = relay_switch_subbuf(buf, length);
memcpy(buf->data + buf->offset, data, length);
buf->offset += length;
+ buf->reserve_offset = buf->offset;
local_irq_restore(flags);
}

@@ -232,6 +234,7 @@ static inline void __relay_write(struct rchan *chan,
length = relay_switch_subbuf(buf, length);
memcpy(buf->data + buf->offset, data, length);
buf->offset += length;
+ buf->reserve_offset = buf->offset;
put_cpu();
}

@@ -244,7 +247,8 @@ static inline void __relay_write(struct rchan *chan,
*
* Reserves a slot in the current cpu's channel buffer.
* Does not protect the buffer at all - caller must provide
- * appropriate synchronization.
+ * appropriate synchronization. Must be paired directly with
+ * a call to @relay_commit_reserve or @relay_unreserve.
*/
static inline void *relay_reserve(struct rchan *chan, size_t length)
{
@@ -257,11 +261,25 @@ static inline void *relay_reserve(struct rchan *chan, size_t length)
return NULL;
}
reserved = buf->data + buf->offset;
- buf->offset += length;
+ buf->reserve_offset += length;

return reserved;
}

+static inline void relay_unreserve(struct rchan *chan, size_t length)
+{
+ struct rchan_buf *buf = chan->buf[smp_processor_id()];
+
+ buf->reserve_offset -= length;
+}
+
+static inline void relay_commit_reserve(struct rchan *chan)
+{
+ struct rchan_buf *buf = chan->buf[smp_processor_id()];
+
+ buf->offset = buf->reserve_offset;
+}
+
/**
* subbuf_start_reserve - reserve bytes at the start of a sub-buffer
* @buf: relay channel buffer
diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644c..9208bd9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -369,6 +369,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
buf->finalized = 0;
buf->data = buf->start;
buf->offset = 0;
+ buf->reserve_offset = 0;

for (i = 0; i < buf->chan->n_subbufs; i++)
buf->padding[i] = 0;
@@ -644,8 +645,10 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
new = buf->start + new_subbuf * buf->chan->subbuf_size;
buf->offset = 0;
+ buf->reserve_offset = 0;
if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
buf->offset = buf->chan->subbuf_size + 1;
+ buf->reserve_offset = buf->offset;
return 0;
}
buf->data = new;

--
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/