- Previous thread: [RFC PATCH 06/12 v3] ptrace: mv task_struct->ptrace ptrace_ctx->flags, kill ptrace_link()
- Next thread: Re:[PATCH] Fix Warnining in arch/x86/kernel/signal.c
- Threads sorted by date: kernel 200905
TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
these new capabilities to this tracepoint:
- zero-copy and per-cpu splice() tracing
- binary tracing without printf overhead
- structured logging records exposed under /debug/tracing/events
- trace events embedded in function tracer output and other plugins
- user-defined, per tracepoint filter expressions
...
Cons and problems:
- no dev_t info for the output of plug, unplug_timer and unplug_io events.
no dev_t info for getrq and sleeprq events if bio == NULL.
no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
- for large packet commands, only 16 bytes of the command will be output.
Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
supports dynamic-sized strings.
- a packet command is converted to a string in TP_assign, not TP_print.
While blktrace do the convertion just before output.
- in blktrace, an event can have 2 different print formats, but a TRACE_EVENT
has a unique format. (see the output of getrq and rq_insert)
Following are some comparisons between TRACE_EVENT and blktrace:
plug:
kjournald-480 [000] 303.084981: block_plug: [kjournald]
kjournald-480 [000] 303.084981: 8,0 P N [kjournald]
unplug_io:
kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1
kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1
remap:
kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 nr_sectors)
#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
-DEFINE_TRACE(block_rq_insert);
-DEFINE_TRACE(block_rq_issue);
-
/*
* Query io scheduler to see if the current process issuing bio may be
* merged with rq.
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e2ee4a7..3fd8b1e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,7 +20,8 @@
#include
#include
#include
-#include
+
+#include
#define DM_MSG_PREFIX "core"
@@ -53,8 +54,6 @@ struct dm_target_io {
union map_info info;
};
-DEFINE_TRACE(block_bio_complete);
-
/*
* For request-based dm.
* One of these is allocated per request.
diff --git a/fs/bio.c b/fs/bio.c
index 9871164..740699c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,10 +26,9 @@
#include
#include
#include
-#include
#include /* for struct sg_iovec */
-DEFINE_TRACE(block_split);
+#include
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 82b4636..c14fb5b 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -218,5 +218,13 @@ static inline int blk_trace_init_sysfs(struct device *dev)
#endif /* CONFIG_BLK_DEV_IO_TRACE */
+#ifdef CONFIG_EVENT_TRACING
+
+extern void __dump_pdu(char *buf, struct request *rq);
+extern void __fill_rwbs(char *rwbs, u32 rw, int bytes);
+extern void __fill_rwbs_rq(char *rwbs, struct request *rq);
+
+#endif
+
#endif /* __KERNEL__ */
#endif
diff --git a/include/trace/block.h b/include/trace/block.h
deleted file mode 100644
index 5b12efa..0000000
--- a/include/trace/block.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef _TRACE_BLOCK_H
-#define _TRACE_BLOCK_H
-
-#include
-#include
-
-DECLARE_TRACE(block_rq_abort,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_insert,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_issue,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_requeue,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_complete,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_bio_bounce,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_complete,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_backmerge,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_frontmerge,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_queue,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_getrq,
- TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
- TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_sleeprq,
- TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
- TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_plug,
- TP_PROTO(struct request_queue *q),
- TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_timer,
- TP_PROTO(struct request_queue *q),
- TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_io,
- TP_PROTO(struct request_queue *q),
- TP_ARGS(q));
-
-DECLARE_TRACE(block_split,
- TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
- TP_ARGS(q, bio, pdu));
-
-DECLARE_TRACE(block_remap,
- TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
- sector_t from),
- TP_ARGS(q, bio, dev, from));
-
-#endif
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
new file mode 100644
index 0000000..e9aa2b5
--- /dev/null
+++ b/include/trace/events/block.h
@@ -0,0 +1,476 @@
+#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BLOCK_H
+
+#include
+#include
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM block
+
+TRACE_EVENT(block_rq_abort,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( int, errors )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->errors = rq->errors;
+ __fill_rwbs_rq(__entry->rwbs, rq);
+ __dump_pdu(__entry->cmd, rq);
+ ),
+
+ TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_insert,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( unsigned int, bytes )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0;
+ __fill_rwbs_rq(__entry->rwbs, rq);
+ __dump_pdu(__entry->cmd, rq);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->bytes, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_issue,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( unsigned int, bytes )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0;
+ __fill_rwbs_rq(__entry->rwbs, rq);
+ __dump_pdu(__entry->cmd, rq);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->bytes, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_requeue,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( int, errors )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->errors = rq->errors;
+ __dump_pdu(__entry->cmd, rq);
+ ),
+
+ TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_complete,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( int, errors )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->errors = rq->errors;
+ __fill_rwbs_rq(__entry->rwbs, rq);
+ __dump_pdu(__entry->cmd, rq);
+ ),
+
+ TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->errors)
+);
+TRACE_EVENT(block_bio_bounce,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_complete,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned, nr_sector )
+ __field( int, error )
+ __array( char, rwbs, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%d]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->error)
+);
+
+TRACE_EVENT(block_bio_backmerge,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_frontmerge,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_queue,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_getrq,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+ TP_ARGS(q, bio, rw),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
+ __entry->sector = bio ? bio->bi_sector : 0;
+ __entry->nr_sector = bio ? bio->bi_size >> 9 : 0;
+ __fill_rwbs(__entry->rwbs,
+ bio ? bio->bi_rw : 0, __entry->nr_sector);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_sleeprq,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+ TP_ARGS(q, bio, rw),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
+ __entry->sector = bio ? bio->bi_sector : 0;
+ __entry->nr_sector = bio ? bio->bi_size >> 9 : 0;
+ __fill_rwbs(__entry->rwbs,
+ bio ? bio->bi_rw : 0, __entry->nr_sector);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_plug,
+
+ TP_PROTO(struct request_queue *q),
+
+ TP_ARGS(q),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("[%s]", __entry->comm)
+);
+
+TRACE_EVENT(block_unplug_timer,
+
+ TP_PROTO(struct request_queue *q),
+
+ TP_ARGS(q),
+
+ TP_STRUCT__entry(
+ __field( int, nr_rq )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_unplug_io,
+
+ TP_PROTO(struct request_queue *q),
+
+ TP_ARGS(q),
+
+ TP_STRUCT__entry(
+ __field( int, nr_rq )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_split,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio,
+ unsigned int new_sector),
+
+ TP_ARGS(q, bio, new_sector),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( sector_t, new_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->new_sector = new_sector;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu / %llu [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->new_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_remap,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+ sector_t from),
+
+ TP_ARGS(q, bio, dev, from),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( dev_t, old_dev )
+ __field( sector_t, old_sector )
+ __array( char, rwbs, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __entry->old_dev = dev;
+ __entry->old_sector = from;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u
these new capabilities to this tracepoint:
- zero-copy and per-cpu splice() tracing
- binary tracing without printf overhead
- structured logging records exposed under /debug/tracing/events
- trace events embedded in function tracer output and other plugins
- user-defined, per tracepoint filter expressions
...
Cons and problems:
- no dev_t info for the output of plug, unplug_timer and unplug_io events.
no dev_t info for getrq and sleeprq events if bio == NULL.
no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
- for large packet commands, only 16 bytes of the command will be output.
Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
supports dynamic-sized strings.
- a packet command is converted to a string in TP_assign, not TP_print.
While blktrace do the convertion just before output.
- in blktrace, an event can have 2 different print formats, but a TRACE_EVENT
has a unique format. (see the output of getrq and rq_insert)
Following are some comparisons between TRACE_EVENT and blktrace:
plug:
kjournald-480 [000] 303.084981: block_plug: [kjournald]
kjournald-480 [000] 303.084981: 8,0 P N [kjournald]
unplug_io:
kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1
kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1
remap:
kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 nr_sectors)
#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
-DEFINE_TRACE(block_rq_insert);
-DEFINE_TRACE(block_rq_issue);
-
/*
* Query io scheduler to see if the current process issuing bio may be
* merged with rq.
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e2ee4a7..3fd8b1e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,7 +20,8 @@
#include
#include
#include
-#include
+
+#include
#define DM_MSG_PREFIX "core"
@@ -53,8 +54,6 @@ struct dm_target_io {
union map_info info;
};
-DEFINE_TRACE(block_bio_complete);
-
/*
* For request-based dm.
* One of these is allocated per request.
diff --git a/fs/bio.c b/fs/bio.c
index 9871164..740699c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,10 +26,9 @@
#include
#include
#include
-#include
#include /* for struct sg_iovec */
-DEFINE_TRACE(block_split);
+#include
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 82b4636..c14fb5b 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -218,5 +218,13 @@ static inline int blk_trace_init_sysfs(struct device *dev)
#endif /* CONFIG_BLK_DEV_IO_TRACE */
+#ifdef CONFIG_EVENT_TRACING
+
+extern void __dump_pdu(char *buf, struct request *rq);
+extern void __fill_rwbs(char *rwbs, u32 rw, int bytes);
+extern void __fill_rwbs_rq(char *rwbs, struct request *rq);
+
+#endif
+
#endif /* __KERNEL__ */
#endif
diff --git a/include/trace/block.h b/include/trace/block.h
deleted file mode 100644
index 5b12efa..0000000
--- a/include/trace/block.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef _TRACE_BLOCK_H
-#define _TRACE_BLOCK_H
-
-#include
-#include
-
-DECLARE_TRACE(block_rq_abort,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_insert,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_issue,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_requeue,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_complete,
- TP_PROTO(struct request_queue *q, struct request *rq),
- TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_bio_bounce,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_complete,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_backmerge,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_frontmerge,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_queue,
- TP_PROTO(struct request_queue *q, struct bio *bio),
- TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_getrq,
- TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
- TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_sleeprq,
- TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
- TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_plug,
- TP_PROTO(struct request_queue *q),
- TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_timer,
- TP_PROTO(struct request_queue *q),
- TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_io,
- TP_PROTO(struct request_queue *q),
- TP_ARGS(q));
-
-DECLARE_TRACE(block_split,
- TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
- TP_ARGS(q, bio, pdu));
-
-DECLARE_TRACE(block_remap,
- TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
- sector_t from),
- TP_ARGS(q, bio, dev, from));
-
-#endif
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
new file mode 100644
index 0000000..e9aa2b5
--- /dev/null
+++ b/include/trace/events/block.h
@@ -0,0 +1,476 @@
+#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BLOCK_H
+
+#include
+#include
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM block
+
+TRACE_EVENT(block_rq_abort,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( int, errors )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->errors = rq->errors;
+ __fill_rwbs_rq(__entry->rwbs, rq);
+ __dump_pdu(__entry->cmd, rq);
+ ),
+
+ TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_insert,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( unsigned int, bytes )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0;
+ __fill_rwbs_rq(__entry->rwbs, rq);
+ __dump_pdu(__entry->cmd, rq);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->bytes, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_issue,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( unsigned int, bytes )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0;
+ __fill_rwbs_rq(__entry->rwbs, rq);
+ __dump_pdu(__entry->cmd, rq);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->bytes, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_requeue,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( int, errors )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->errors = rq->errors;
+ __dump_pdu(__entry->cmd, rq);
+ ),
+
+ TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_complete,
+
+ TP_PROTO(struct request_queue *q, struct request *rq),
+
+ TP_ARGS(q, rq),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( int, errors )
+ __array( char, rwbs, 6 )
+ __array( char, cmd, BLK_MAX_CDB * 3 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+ __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
+ __entry->nr_sector = blk_pc_request(rq) ?
+ 0 : rq->hard_nr_sectors;
+ __entry->errors = rq->errors;
+ __fill_rwbs_rq(__entry->rwbs, rq);
+ __dump_pdu(__entry->cmd, rq);
+ ),
+
+ TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs, __entry->cmd,
+ __entry->sector, __entry->nr_sector, __entry->errors)
+);
+TRACE_EVENT(block_bio_bounce,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_complete,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned, nr_sector )
+ __field( int, error )
+ __array( char, rwbs, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%d]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->error)
+);
+
+TRACE_EVENT(block_bio_backmerge,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_frontmerge,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_queue,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio),
+
+ TP_ARGS(q, bio),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_getrq,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+ TP_ARGS(q, bio, rw),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
+ __entry->sector = bio ? bio->bi_sector : 0;
+ __entry->nr_sector = bio ? bio->bi_size >> 9 : 0;
+ __fill_rwbs(__entry->rwbs,
+ bio ? bio->bi_rw : 0, __entry->nr_sector);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_sleeprq,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+ TP_ARGS(q, bio, rw),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
+ __entry->sector = bio ? bio->bi_sector : 0;
+ __entry->nr_sector = bio ? bio->bi_size >> 9 : 0;
+ __fill_rwbs(__entry->rwbs,
+ bio ? bio->bi_rw : 0, __entry->nr_sector);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_plug,
+
+ TP_PROTO(struct request_queue *q),
+
+ TP_ARGS(q),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("[%s]", __entry->comm)
+);
+
+TRACE_EVENT(block_unplug_timer,
+
+ TP_PROTO(struct request_queue *q),
+
+ TP_ARGS(q),
+
+ TP_STRUCT__entry(
+ __field( int, nr_rq )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_unplug_io,
+
+ TP_PROTO(struct request_queue *q),
+
+ TP_ARGS(q),
+
+ TP_STRUCT__entry(
+ __field( int, nr_rq )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_split,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio,
+ unsigned int new_sector),
+
+ TP_ARGS(q, bio, new_sector),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( sector_t, new_sector )
+ __array( char, rwbs, 6 )
+ __array( char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->new_sector = new_sector;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu / %llu [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ __entry->sector, __entry->new_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_remap,
+
+ TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+ sector_t from),
+
+ TP_ARGS(q, bio, dev, from),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( sector_t, sector )
+ __field( unsigned int, nr_sector )
+ __field( dev_t, old_dev )
+ __field( sector_t, old_sector )
+ __array( char, rwbs, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ __entry->old_dev = dev;
+ __entry->old_sector = from;
+ __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u
* Li Zefan wrote:
> TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
> these new capabilities to this tracepoint:
>
> - zero-copy and per-cpu splice() tracing
> - binary tracing without printf overhead
> - structured logging records exposed under /debug/tracing/events
> - trace events embedded in function tracer output and other plugins
> - user-defined, per tracepoint filter expressions
> ...
Nice!
> Cons and problems:
>
> - no dev_t info for the output of plug, unplug_timer and unplug_io events.
> no dev_t info for getrq and sleeprq events if bio == NULL.
> no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
Cannot we output the numeric major:minor pairs?
> - for large packet commands, only 16 bytes of the command will be output.
> Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
> supports dynamic-sized strings.
>
> - a packet command is converted to a string in TP_assign, not TP_print.
> While blktrace do the convertion just before output.
Couldnt we do a memcpy instead of the snprintf() in __dump_pdu()? We
dont actually interpret the bytes there. We could extend the
in-kernel printk format with a 'dump raw memory in hex' type of
format specifier.
OTOH, packet requests are rather rare, right? So going to ASCII
there results in a simpler interface. In the !blk_pc_request(rq)
common case we just return early without any snprintf overhead.
> - in blktrace, an event can have 2 different print formats, but
> a TRACE_EVENT has a unique format. (see the output of getrq
> and rq_insert)
Is this a problem?
I think a good way forward would be to benchmark the ioctl versus
the splice based TRACE_EVENT tracing (via some artificially high
rate event, to push things), and see where we are right now in terms
of overhead.
Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
On Thu, May 14, 2009 at 02:20:03PM +0800, Li Zefan wrote:
> TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
> these new capabilities to this tracepoint:
>
> - zero-copy and per-cpu splice() tracing
> - binary tracing without printf overhead
> - structured logging records exposed under /debug/tracing/events
> - trace events embedded in function tracer output and other plugins
> - user-defined, per tracepoint filter expressions
> ...
>
> Cons and problems:
>
> - no dev_t info for the output of plug, unplug_timer and unplug_io events.
> no dev_t info for getrq and sleeprq events if bio == NULL.
> no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
>
> - for large packet commands, only 16 bytes of the command will be output.
> Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
> supports dynamic-sized strings.
>
> - a packet command is converted to a string in TP_assign, not TP_print.
> While blktrace do the convertion just before output.
>
> - in blktrace, an event can have 2 different print formats, but a TRACE_EVENT
> has a unique format. (see the output of getrq and rq_insert)
I'm starting to think it would be nice to choose between several outputs
in a trace event.
Ie: perhaps we need a kind of per event flag, something simple, just to
choose between several TP_printk output. Not sure how much it would
(non) trivial to implement though...
Frederic.
>
> Following are some comparisons between TRACE_EVENT and blktrace:
>
> plug:
> kjournald-480 [000] 303.084981: block_plug: [kjournald]
> kjournald-480 [000] 303.084981: 8,0 P N [kjournald]
>
> unplug_io:
> kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1
> kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1
>
> remap:
> kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8
> bio_backmerge:
> kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald]
> kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald]
>
> getrq:
> kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald]
> kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald]
>
> bash-2066 [001] 1072.953770: 8,0 G N [bash]
> bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash]
>
> rq_complete:
> konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0]
> konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0]
>
> rq_insert:
> kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald]
> kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 [kjournald]
>
> ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0]
> ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0]
>
> Signed-off-by: Li Zefan
> ---
> block/blk-core.c | 16 +-
> block/elevator.c | 8 +-
> drivers/md/dm.c | 5 +-
> fs/bio.c | 3 +-
> include/linux/blktrace_api.h | 8 +
> include/trace/block.h | 76 -------
> include/trace/events/block.h | 476 ++++++++++++++++++++++++++++++++++++++++++
> kernel/trace/Makefile | 5 +-
> kernel/trace/blktrace.c | 78 +++++++-
> mm/bounce.c | 5 +-
> 10 files changed, 576 insertions(+), 104 deletions(-)
> delete mode 100644 include/trace/block.h
> create mode 100644 include/trace/events/block.h
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index d028baf..ff50f9d 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -28,22 +28,14 @@
> #include
> #include
> #include
> -#include
> +
> +#define CREATE_TRACE_POINTS
> +#include
>
> #include "blk.h"
>
> -DEFINE_TRACE(block_plug);
> -DEFINE_TRACE(block_unplug_io);
> -DEFINE_TRACE(block_unplug_timer);
> -DEFINE_TRACE(block_getrq);
> -DEFINE_TRACE(block_sleeprq);
> -DEFINE_TRACE(block_rq_requeue);
> -DEFINE_TRACE(block_bio_backmerge);
> -DEFINE_TRACE(block_bio_frontmerge);
> -DEFINE_TRACE(block_bio_queue);
> -DEFINE_TRACE(block_rq_complete);
> -DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */
> EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
>
> static int __make_request(struct request_queue *q, struct bio *bio);
>
> diff --git a/block/elevator.c b/block/elevator.c
> index 7073a90..e220f0c 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -33,17 +33,16 @@
> #include
> #include
> #include
> -#include
> #include
> #include
>
> +#include
> +
> #include "blk.h"
>
> static DEFINE_SPINLOCK(elv_list_lock);
> static LIST_HEAD(elv_list);
>
> -DEFINE_TRACE(block_rq_abort);
> -
> /*
> * Merge hash stuff.
> */
> @@ -55,9 +54,6 @@ static const int elv_hash_shift = 6;
> #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
> #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
>
> -DEFINE_TRACE(block_rq_insert);
> -DEFINE_TRACE(block_rq_issue);
> -
> /*
> * Query io scheduler to see if the current process issuing bio may be
> * merged with rq.
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index e2ee4a7..3fd8b1e 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -20,7 +20,8 @@
> #include
> #include
> #include
> -#include
> +
> +#include
>
> #define DM_MSG_PREFIX "core"
>
> @@ -53,8 +54,6 @@ struct dm_target_io {
> union map_info info;
> };
>
> -DEFINE_TRACE(block_bio_complete);
> -
> /*
> * For request-based dm.
> * One of these is allocated per request.
> diff --git a/fs/bio.c b/fs/bio.c
> index 9871164..740699c 100644
> --- a/fs/bio.c
> +++ b/fs/bio.c
> @@ -26,10 +26,9 @@
> #include
> #include
> #include
> -#include
> #include /* for struct sg_iovec */
>
> -DEFINE_TRACE(block_split);
> +#include
>
> /*
> * Test patch to inline a certain number of bi_io_vec's inside the bio
> diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
> index 82b4636..c14fb5b 100644
> --- a/include/linux/blktrace_api.h
> +++ b/include/linux/blktrace_api.h
> @@ -218,5 +218,13 @@ static inline int blk_trace_init_sysfs(struct device *dev)
>
> #endif /* CONFIG_BLK_DEV_IO_TRACE */
>
> +#ifdef CONFIG_EVENT_TRACING
> +
> +extern void __dump_pdu(char *buf, struct request *rq);
> +extern void __fill_rwbs(char *rwbs, u32 rw, int bytes);
> +extern void __fill_rwbs_rq(char *rwbs, struct request *rq);
> +
> +#endif
> +
> #endif /* __KERNEL__ */
> #endif
> diff --git a/include/trace/block.h b/include/trace/block.h
> deleted file mode 100644
> index 5b12efa..0000000
> --- a/include/trace/block.h
> +++ /dev/null
> @@ -1,76 +0,0 @@
> -#ifndef _TRACE_BLOCK_H
> -#define _TRACE_BLOCK_H
> -
> -#include
> -#include
> -
> -DECLARE_TRACE(block_rq_abort,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_rq_insert,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_rq_issue,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_rq_requeue,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_rq_complete,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_bio_bounce,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_bio_complete,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_bio_backmerge,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_bio_frontmerge,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_bio_queue,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_getrq,
> - TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
> - TP_ARGS(q, bio, rw));
> -
> -DECLARE_TRACE(block_sleeprq,
> - TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
> - TP_ARGS(q, bio, rw));
> -
> -DECLARE_TRACE(block_plug,
> - TP_PROTO(struct request_queue *q),
> - TP_ARGS(q));
> -
> -DECLARE_TRACE(block_unplug_timer,
> - TP_PROTO(struct request_queue *q),
> - TP_ARGS(q));
> -
> -DECLARE_TRACE(block_unplug_io,
> - TP_PROTO(struct request_queue *q),
> - TP_ARGS(q));
> -
> -DECLARE_TRACE(block_split,
> - TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
> - TP_ARGS(q, bio, pdu));
> -
> -DECLARE_TRACE(block_remap,
> - TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
> - sector_t from),
> - TP_ARGS(q, bio, dev, from));
> -
> -#endif
> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> new file mode 100644
> index 0000000..e9aa2b5
> --- /dev/null
> +++ b/include/trace/events/block.h
> @@ -0,0 +1,476 @@
> +#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_BLOCK_H
> +
> +#include
> +#include
> +
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM block
> +
> +TRACE_EVENT(block_rq_abort,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( int, errors )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->errors = rq->errors;
> + __fill_rwbs_rq(__entry->rwbs, rq);
> + __dump_pdu(__entry->cmd, rq);
> + ),
> +
> + TP_printk("%d,%d %s (%s) %llu + %u [%d]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->errors)
> +);
> +
> +TRACE_EVENT(block_rq_insert,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( unsigned int, bytes )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0;
> + __fill_rwbs_rq(__entry->rwbs, rq);
> + __dump_pdu(__entry->cmd, rq);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->bytes, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_rq_issue,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( unsigned int, bytes )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0;
> + __fill_rwbs_rq(__entry->rwbs, rq);
> + __dump_pdu(__entry->cmd, rq);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->bytes, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_rq_requeue,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( int, errors )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->errors = rq->errors;
> + __dump_pdu(__entry->cmd, rq);
> + ),
> +
> + TP_printk("%d,%d %s (%s) %llu + %u [%d]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->errors)
> +);
> +
> +TRACE_EVENT(block_rq_complete,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( int, errors )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->errors = rq->errors;
> + __fill_rwbs_rq(__entry->rwbs, rq);
> + __dump_pdu(__entry->cmd, rq);
> + ),
> +
> + TP_printk("%d,%d %s (%s) %llu + %u [%d]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->errors)
> +);
> +TRACE_EVENT(block_bio_bounce,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_bio_complete,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned, nr_sector )
> + __field( int, error )
> + __array( char, rwbs, 6 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%d]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->error)
> +);
> +
> +TRACE_EVENT(block_bio_backmerge,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_bio_frontmerge,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_bio_queue,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_getrq,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
> +
> + TP_ARGS(q, bio, rw),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
> + __entry->sector = bio ? bio->bi_sector : 0;
> + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0;
> + __fill_rwbs(__entry->rwbs,
> + bio ? bio->bi_rw : 0, __entry->nr_sector);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_sleeprq,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
> +
> + TP_ARGS(q, bio, rw),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
> + __entry->sector = bio ? bio->bi_sector : 0;
> + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0;
> + __fill_rwbs(__entry->rwbs,
> + bio ? bio->bi_rw : 0, __entry->nr_sector);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_plug,
> +
> + TP_PROTO(struct request_queue *q),
> +
> + TP_ARGS(q),
> +
> + TP_STRUCT__entry(
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("[%s]", __entry->comm)
> +);
> +
> +TRACE_EVENT(block_unplug_timer,
> +
> + TP_PROTO(struct request_queue *q),
> +
> + TP_ARGS(q),
> +
> + TP_STRUCT__entry(
> + __field( int, nr_rq )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
> +);
> +
> +TRACE_EVENT(block_unplug_io,
> +
> + TP_PROTO(struct request_queue *q),
> +
> + TP_ARGS(q),
> +
> + TP_STRUCT__entry(
> + __field( int, nr_rq )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
> +);
> +
> +TRACE_EVENT(block_split,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio,
> + unsigned int new_sector),
> +
> + TP_ARGS(q, bio, new_sector),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( sector_t, new_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->new_sector = new_sector;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu / %llu [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->new_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_remap,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
> + sector_t from),
> +
> + TP_ARGS(q, bio, dev, from),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( dev_t, old_dev )
> + __field( sector_t, old_sector )
> + __array( char, rwbs, 6 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __entry->old_dev = dev;
> + __entry->old_sector = from;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u
> TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
> these new capabilities to this tracepoint:
>
> - zero-copy and per-cpu splice() tracing
> - binary tracing without printf overhead
> - structured logging records exposed under /debug/tracing/events
> - trace events embedded in function tracer output and other plugins
> - user-defined, per tracepoint filter expressions
> ...
>
> Cons and problems:
>
> - no dev_t info for the output of plug, unplug_timer and unplug_io events.
> no dev_t info for getrq and sleeprq events if bio == NULL.
> no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
>
> - for large packet commands, only 16 bytes of the command will be output.
> Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
> supports dynamic-sized strings.
>
> - a packet command is converted to a string in TP_assign, not TP_print.
> While blktrace do the convertion just before output.
>
> - in blktrace, an event can have 2 different print formats, but a TRACE_EVENT
> has a unique format. (see the output of getrq and rq_insert)
I'm starting to think it would be nice to choose between several outputs
in a trace event.
Ie: perhaps we need a kind of per event flag, something simple, just to
choose between several TP_printk output. Not sure how much it would
(non) trivial to implement though...
Frederic.
>
> Following are some comparisons between TRACE_EVENT and blktrace:
>
> plug:
> kjournald-480 [000] 303.084981: block_plug: [kjournald]
> kjournald-480 [000] 303.084981: 8,0 P N [kjournald]
>
> unplug_io:
> kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1
> kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1
>
> remap:
> kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8
> bio_backmerge:
> kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald]
> kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald]
>
> getrq:
> kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald]
> kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald]
>
> bash-2066 [001] 1072.953770: 8,0 G N [bash]
> bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash]
>
> rq_complete:
> konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0]
> konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0]
>
> rq_insert:
> kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald]
> kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 [kjournald]
>
> ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0]
> ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0]
>
> Signed-off-by: Li Zefan
> ---
> block/blk-core.c | 16 +-
> block/elevator.c | 8 +-
> drivers/md/dm.c | 5 +-
> fs/bio.c | 3 +-
> include/linux/blktrace_api.h | 8 +
> include/trace/block.h | 76 -------
> include/trace/events/block.h | 476 ++++++++++++++++++++++++++++++++++++++++++
> kernel/trace/Makefile | 5 +-
> kernel/trace/blktrace.c | 78 +++++++-
> mm/bounce.c | 5 +-
> 10 files changed, 576 insertions(+), 104 deletions(-)
> delete mode 100644 include/trace/block.h
> create mode 100644 include/trace/events/block.h
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index d028baf..ff50f9d 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -28,22 +28,14 @@
> #include
> #include
> #include
> -#include
> +
> +#define CREATE_TRACE_POINTS
> +#include
>
> #include "blk.h"
>
> -DEFINE_TRACE(block_plug);
> -DEFINE_TRACE(block_unplug_io);
> -DEFINE_TRACE(block_unplug_timer);
> -DEFINE_TRACE(block_getrq);
> -DEFINE_TRACE(block_sleeprq);
> -DEFINE_TRACE(block_rq_requeue);
> -DEFINE_TRACE(block_bio_backmerge);
> -DEFINE_TRACE(block_bio_frontmerge);
> -DEFINE_TRACE(block_bio_queue);
> -DEFINE_TRACE(block_rq_complete);
> -DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */
> EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
>
> static int __make_request(struct request_queue *q, struct bio *bio);
>
> diff --git a/block/elevator.c b/block/elevator.c
> index 7073a90..e220f0c 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -33,17 +33,16 @@
> #include
> #include
> #include
> -#include
> #include
> #include
>
> +#include
> +
> #include "blk.h"
>
> static DEFINE_SPINLOCK(elv_list_lock);
> static LIST_HEAD(elv_list);
>
> -DEFINE_TRACE(block_rq_abort);
> -
> /*
> * Merge hash stuff.
> */
> @@ -55,9 +54,6 @@ static const int elv_hash_shift = 6;
> #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
> #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
>
> -DEFINE_TRACE(block_rq_insert);
> -DEFINE_TRACE(block_rq_issue);
> -
> /*
> * Query io scheduler to see if the current process issuing bio may be
> * merged with rq.
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index e2ee4a7..3fd8b1e 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -20,7 +20,8 @@
> #include
> #include
> #include
> -#include
> +
> +#include
>
> #define DM_MSG_PREFIX "core"
>
> @@ -53,8 +54,6 @@ struct dm_target_io {
> union map_info info;
> };
>
> -DEFINE_TRACE(block_bio_complete);
> -
> /*
> * For request-based dm.
> * One of these is allocated per request.
> diff --git a/fs/bio.c b/fs/bio.c
> index 9871164..740699c 100644
> --- a/fs/bio.c
> +++ b/fs/bio.c
> @@ -26,10 +26,9 @@
> #include
> #include
> #include
> -#include
> #include /* for struct sg_iovec */
>
> -DEFINE_TRACE(block_split);
> +#include
>
> /*
> * Test patch to inline a certain number of bi_io_vec's inside the bio
> diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
> index 82b4636..c14fb5b 100644
> --- a/include/linux/blktrace_api.h
> +++ b/include/linux/blktrace_api.h
> @@ -218,5 +218,13 @@ static inline int blk_trace_init_sysfs(struct device *dev)
>
> #endif /* CONFIG_BLK_DEV_IO_TRACE */
>
> +#ifdef CONFIG_EVENT_TRACING
> +
> +extern void __dump_pdu(char *buf, struct request *rq);
> +extern void __fill_rwbs(char *rwbs, u32 rw, int bytes);
> +extern void __fill_rwbs_rq(char *rwbs, struct request *rq);
> +
> +#endif
> +
> #endif /* __KERNEL__ */
> #endif
> diff --git a/include/trace/block.h b/include/trace/block.h
> deleted file mode 100644
> index 5b12efa..0000000
> --- a/include/trace/block.h
> +++ /dev/null
> @@ -1,76 +0,0 @@
> -#ifndef _TRACE_BLOCK_H
> -#define _TRACE_BLOCK_H
> -
> -#include
> -#include
> -
> -DECLARE_TRACE(block_rq_abort,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_rq_insert,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_rq_issue,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_rq_requeue,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_rq_complete,
> - TP_PROTO(struct request_queue *q, struct request *rq),
> - TP_ARGS(q, rq));
> -
> -DECLARE_TRACE(block_bio_bounce,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_bio_complete,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_bio_backmerge,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_bio_frontmerge,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_bio_queue,
> - TP_PROTO(struct request_queue *q, struct bio *bio),
> - TP_ARGS(q, bio));
> -
> -DECLARE_TRACE(block_getrq,
> - TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
> - TP_ARGS(q, bio, rw));
> -
> -DECLARE_TRACE(block_sleeprq,
> - TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
> - TP_ARGS(q, bio, rw));
> -
> -DECLARE_TRACE(block_plug,
> - TP_PROTO(struct request_queue *q),
> - TP_ARGS(q));
> -
> -DECLARE_TRACE(block_unplug_timer,
> - TP_PROTO(struct request_queue *q),
> - TP_ARGS(q));
> -
> -DECLARE_TRACE(block_unplug_io,
> - TP_PROTO(struct request_queue *q),
> - TP_ARGS(q));
> -
> -DECLARE_TRACE(block_split,
> - TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
> - TP_ARGS(q, bio, pdu));
> -
> -DECLARE_TRACE(block_remap,
> - TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
> - sector_t from),
> - TP_ARGS(q, bio, dev, from));
> -
> -#endif
> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> new file mode 100644
> index 0000000..e9aa2b5
> --- /dev/null
> +++ b/include/trace/events/block.h
> @@ -0,0 +1,476 @@
> +#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_BLOCK_H
> +
> +#include
> +#include
> +
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM block
> +
> +TRACE_EVENT(block_rq_abort,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( int, errors )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->errors = rq->errors;
> + __fill_rwbs_rq(__entry->rwbs, rq);
> + __dump_pdu(__entry->cmd, rq);
> + ),
> +
> + TP_printk("%d,%d %s (%s) %llu + %u [%d]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->errors)
> +);
> +
> +TRACE_EVENT(block_rq_insert,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( unsigned int, bytes )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0;
> + __fill_rwbs_rq(__entry->rwbs, rq);
> + __dump_pdu(__entry->cmd, rq);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->bytes, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_rq_issue,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( unsigned int, bytes )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0;
> + __fill_rwbs_rq(__entry->rwbs, rq);
> + __dump_pdu(__entry->cmd, rq);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->bytes, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_rq_requeue,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( int, errors )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->errors = rq->errors;
> + __dump_pdu(__entry->cmd, rq);
> + ),
> +
> + TP_printk("%d,%d %s (%s) %llu + %u [%d]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->errors)
> +);
> +
> +TRACE_EVENT(block_rq_complete,
> +
> + TP_PROTO(struct request_queue *q, struct request *rq),
> +
> + TP_ARGS(q, rq),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( int, errors )
> + __array( char, rwbs, 6 )
> + __array( char, cmd, BLK_MAX_CDB * 3 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
> + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector;
> + __entry->nr_sector = blk_pc_request(rq) ?
> + 0 : rq->hard_nr_sectors;
> + __entry->errors = rq->errors;
> + __fill_rwbs_rq(__entry->rwbs, rq);
> + __dump_pdu(__entry->cmd, rq);
> + ),
> +
> + TP_printk("%d,%d %s (%s) %llu + %u [%d]",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rwbs, __entry->cmd,
> + __entry->sector, __entry->nr_sector, __entry->errors)
> +);
> +TRACE_EVENT(block_bio_bounce,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_bio_complete,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned, nr_sector )
> + __field( int, error )
> + __array( char, rwbs, 6 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%d]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->error)
> +);
> +
> +TRACE_EVENT(block_bio_backmerge,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_bio_frontmerge,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_bio_queue,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio),
> +
> + TP_ARGS(q, bio),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_getrq,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
> +
> + TP_ARGS(q, bio, rw),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
> + __entry->sector = bio ? bio->bi_sector : 0;
> + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0;
> + __fill_rwbs(__entry->rwbs,
> + bio ? bio->bi_rw : 0, __entry->nr_sector);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_sleeprq,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
> +
> + TP_ARGS(q, bio, rw),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
> + __entry->sector = bio ? bio->bi_sector : 0;
> + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0;
> + __fill_rwbs(__entry->rwbs,
> + bio ? bio->bi_rw : 0, __entry->nr_sector);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->nr_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_plug,
> +
> + TP_PROTO(struct request_queue *q),
> +
> + TP_ARGS(q),
> +
> + TP_STRUCT__entry(
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("[%s]", __entry->comm)
> +);
> +
> +TRACE_EVENT(block_unplug_timer,
> +
> + TP_PROTO(struct request_queue *q),
> +
> + TP_ARGS(q),
> +
> + TP_STRUCT__entry(
> + __field( int, nr_rq )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
> +);
> +
> +TRACE_EVENT(block_unplug_io,
> +
> + TP_PROTO(struct request_queue *q),
> +
> + TP_ARGS(q),
> +
> + TP_STRUCT__entry(
> + __field( int, nr_rq )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
> +);
> +
> +TRACE_EVENT(block_split,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio,
> + unsigned int new_sector),
> +
> + TP_ARGS(q, bio, new_sector),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( sector_t, new_sector )
> + __array( char, rwbs, 6 )
> + __array( char, comm, TASK_COMM_LEN )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->new_sector = new_sector;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> + ),
> +
> + TP_printk("%d,%d %s %llu / %llu [%s]",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
> + __entry->sector, __entry->new_sector, __entry->comm)
> +);
> +
> +TRACE_EVENT(block_remap,
> +
> + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
> + sector_t from),
> +
> + TP_ARGS(q, bio, dev, from),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( sector_t, sector )
> + __field( unsigned int, nr_sector )
> + __field( dev_t, old_dev )
> + __field( sector_t, old_sector )
> + __array( char, rwbs, 6 )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = bio->bi_bdev->bd_dev;
> + __entry->sector = bio->bi_sector;
> + __entry->nr_sector = bio->bi_size >> 9;
> + __entry->old_dev = dev;
> + __entry->old_sector = from;
> + __fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
> + ),
> +
> + TP_printk("%d,%d %s %llu + %u
Ingo Molnar wrote:
> * Li Zefan wrote:
>
>> TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
>> these new capabilities to this tracepoint:
>>
>> - zero-copy and per-cpu splice() tracing
>> - binary tracing without printf overhead
>> - structured logging records exposed under /debug/tracing/events
>> - trace events embedded in function tracer output and other plugins
>> - user-defined, per tracepoint filter expressions
>> ...
>
> Nice!
>
>> Cons and problems:
>>
>> - no dev_t info for the output of plug, unplug_timer and unplug_io events.
>> no dev_t info for getrq and sleeprq events if bio == NULL.
>> no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
>
> Cannot we output the numeric major:minor pairs?
>
No, we can't.
Take plug tracepoint for example, the only argument is a struct request_queue,
but we can't map from a queue to a device, since there is no 1:1 mapping.
That's why blktrace adds dev_t info in struct blk_trace, which is associated
to a queue.
>> - for large packet commands, only 16 bytes of the command will be output.
>> Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
>> supports dynamic-sized strings.
>>
>> - a packet command is converted to a string in TP_assign, not TP_print.
>> While blktrace do the convertion just before output.
>
> Couldnt we do a memcpy instead of the snprintf() in __dump_pdu()? We
> dont actually interpret the bytes there. We could extend the
> in-kernel printk format with a 'dump raw memory in hex' type of
> format specifier.
>
Sure, it's do-able. The disavantage is then we can't do filtering
on __entry->cmd, because now it's unsigned char[], not a string.
> OTOH, packet requests are rather rare, right? So going to ASCII
> there results in a simpler interface. In the !blk_pc_request(rq)
> common case we just return early without any snprintf overhead.
>
Right. :)
>> - in blktrace, an event can have 2 different print formats, but
>> a TRACE_EVENT has a unique format. (see the output of getrq
>> and rq_insert)
>
> Is this a problem?
>
One of the defect is, we have __entry->cmd[] even it's not used
if !blk_pc_request(rq).
This can be avoided though, by using __string() (needs small modification)
instead of __array().
> I think a good way forward would be to benchmark the ioctl versus
> the splice based TRACE_EVENT tracing (via some artificially high
> rate event, to push things), and see where we are right now in terms
> of overhead.
>
I'll try.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> * Li Zefan wrote:
>
>> TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
>> these new capabilities to this tracepoint:
>>
>> - zero-copy and per-cpu splice() tracing
>> - binary tracing without printf overhead
>> - structured logging records exposed under /debug/tracing/events
>> - trace events embedded in function tracer output and other plugins
>> - user-defined, per tracepoint filter expressions
>> ...
>
> Nice!
>
>> Cons and problems:
>>
>> - no dev_t info for the output of plug, unplug_timer and unplug_io events.
>> no dev_t info for getrq and sleeprq events if bio == NULL.
>> no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
>
> Cannot we output the numeric major:minor pairs?
>
No, we can't.
Take plug tracepoint for example, the only argument is a struct request_queue,
but we can't map from a queue to a device, since there is no 1:1 mapping.
That's why blktrace adds dev_t info in struct blk_trace, which is associated
to a queue.
>> - for large packet commands, only 16 bytes of the command will be output.
>> Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
>> supports dynamic-sized strings.
>>
>> - a packet command is converted to a string in TP_assign, not TP_print.
>> While blktrace do the convertion just before output.
>
> Couldnt we do a memcpy instead of the snprintf() in __dump_pdu()? We
> dont actually interpret the bytes there. We could extend the
> in-kernel printk format with a 'dump raw memory in hex' type of
> format specifier.
>
Sure, it's do-able. The disavantage is then we can't do filtering
on __entry->cmd, because now it's unsigned char[], not a string.
> OTOH, packet requests are rather rare, right? So going to ASCII
> there results in a simpler interface. In the !blk_pc_request(rq)
> common case we just return early without any snprintf overhead.
>
Right. :)
>> - in blktrace, an event can have 2 different print formats, but
>> a TRACE_EVENT has a unique format. (see the output of getrq
>> and rq_insert)
>
> Is this a problem?
>
One of the defect is, we have __entry->cmd[] even it's not used
if !blk_pc_request(rq).
This can be avoided though, by using __string() (needs small modification)
instead of __array().
> I think a good way forward would be to benchmark the ioctl versus
> the splice based TRACE_EVENT tracing (via some artificially high
> rate event, to push things), and see where we are right now in terms
> of overhead.
>
I'll try.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Frederic Weisbecker wrote:
> On Thu, May 14, 2009 at 02:20:03PM +0800, Li Zefan wrote:
>> TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
>> these new capabilities to this tracepoint:
>>
>> - zero-copy and per-cpu splice() tracing
>> - binary tracing without printf overhead
>> - structured logging records exposed under /debug/tracing/events
>> - trace events embedded in function tracer output and other plugins
>> - user-defined, per tracepoint filter expressions
>> ...
>>
>> Cons and problems:
>>
>> - no dev_t info for the output of plug, unplug_timer and unplug_io events.
>> no dev_t info for getrq and sleeprq events if bio == NULL.
>> no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
>>
>> - for large packet commands, only 16 bytes of the command will be output.
>> Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
>> supports dynamic-sized strings.
>>
>> - a packet command is converted to a string in TP_assign, not TP_print.
>> While blktrace do the convertion just before output.
>>
>> - in blktrace, an event can have 2 different print formats, but a TRACE_EVENT
>> has a unique format. (see the output of getrq and rq_insert)
>
> I'm starting to think it would be nice to choose between several outputs
> in a trace event.
> Ie: perhaps we need a kind of per event flag, something simple, just to
> choose between several TP_printk output. Not sure how much it would
> (non) trivial to implement though...
>
If a trace event wants several TP_printk output, probably it wants different
structs of trace entry, otherwise we'll be wasting memory.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> On Thu, May 14, 2009 at 02:20:03PM +0800, Li Zefan wrote:
>> TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
>> these new capabilities to this tracepoint:
>>
>> - zero-copy and per-cpu splice() tracing
>> - binary tracing without printf overhead
>> - structured logging records exposed under /debug/tracing/events
>> - trace events embedded in function tracer output and other plugins
>> - user-defined, per tracepoint filter expressions
>> ...
>>
>> Cons and problems:
>>
>> - no dev_t info for the output of plug, unplug_timer and unplug_io events.
>> no dev_t info for getrq and sleeprq events if bio == NULL.
>> no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
>>
>> - for large packet commands, only 16 bytes of the command will be output.
>> Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
>> supports dynamic-sized strings.
>>
>> - a packet command is converted to a string in TP_assign, not TP_print.
>> While blktrace do the convertion just before output.
>>
>> - in blktrace, an event can have 2 different print formats, but a TRACE_EVENT
>> has a unique format. (see the output of getrq and rq_insert)
>
> I'm starting to think it would be nice to choose between several outputs
> in a trace event.
> Ie: perhaps we need a kind of per event flag, something simple, just to
> choose between several TP_printk output. Not sure how much it would
> (non) trivial to implement though...
>
If a trace event wants several TP_printk output, probably it wants different
structs of trace entry, otherwise we'll be wasting memory.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Li Zefan writes:
> TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
> these new capabilities to this tracepoint:
> - zero-copy and per-cpu splice() tracing
> - binary tracing without printf overhead
> - structured logging records exposed under /debug/tracing/events
> - trace events embedded in function tracer output and other plugins
> - user-defined, per tracepoint filter expressions
> ...
>
> Cons and problems:
>
> - no dev_t info for the output of plug, unplug_timer and unplug_io events.
I'm not sure I'm comfortable with this. Having had to diagnose
performance problems using blktrace, I find the dev_t information for
plugs and unplugs especially interesting. Maybe we can still figure
this out given the context in the trace, though. Can you comment on
that?
> no dev_t info for getrq and sleeprq events if bio == NULL.
> no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
>
> - for large packet commands, only 16 bytes of the command will be output.
> Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
> supports dynamic-sized strings.
>
> - a packet command is converted to a string in TP_assign, not TP_print.
> While blktrace do the convertion just before output.
>
> - in blktrace, an event can have 2 different print formats, but a TRACE_EVENT
> has a unique format. (see the output of getrq and rq_insert)
>
>
> Following are some comparisons between TRACE_EVENT and blktrace:
>
> plug:
> kjournald-480 [000] 303.084981: block_plug: [kjournald]
> kjournald-480 [000] 303.084981: 8,0 P N [kjournald]
I'm not a big fan of having to change my scripts! Why did you switch
from the single letter output to the verbose strings? Am I just being a
curmudgeon?
I have one further question, which will expose my ignorance on this
implementation. Would the proposed patch change the information
available to the blktrace utility, or is this just for users such as
systemtap?
Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
> these new capabilities to this tracepoint:
> - zero-copy and per-cpu splice() tracing
> - binary tracing without printf overhead
> - structured logging records exposed under /debug/tracing/events
> - trace events embedded in function tracer output and other plugins
> - user-defined, per tracepoint filter expressions
> ...
>
> Cons and problems:
>
> - no dev_t info for the output of plug, unplug_timer and unplug_io events.
I'm not sure I'm comfortable with this. Having had to diagnose
performance problems using blktrace, I find the dev_t information for
plugs and unplugs especially interesting. Maybe we can still figure
this out given the context in the trace, though. Can you comment on
that?
> no dev_t info for getrq and sleeprq events if bio == NULL.
> no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.
>
> - for large packet commands, only 16 bytes of the command will be output.
> Because TRACE_EVENT doesn't support dynamic-sized arrays, though it
> supports dynamic-sized strings.
>
> - a packet command is converted to a string in TP_assign, not TP_print.
> While blktrace do the convertion just before output.
>
> - in blktrace, an event can have 2 different print formats, but a TRACE_EVENT
> has a unique format. (see the output of getrq and rq_insert)
>
>
> Following are some comparisons between TRACE_EVENT and blktrace:
>
> plug:
> kjournald-480 [000] 303.084981: block_plug: [kjournald]
> kjournald-480 [000] 303.084981: 8,0 P N [kjournald]
I'm not a big fan of having to change my scripts! Why did you switch
from the single letter output to the verbose strings? Am I just being a
curmudgeon?
I have one further question, which will expose my ignorance on this
implementation. Would the proposed patch change the information
available to the blktrace utility, or is this just for users such as
systemtap?
Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
On Tue, May 19, 2009 at 08:59:29AM -0400, Jeff Moyer wrote:
> I'm not sure I'm comfortable with this. Having had to diagnose
> performance problems using blktrace, I find the dev_t information for
> plugs and unplugs especially interesting. Maybe we can still figure
> this out given the context in the trace, though. Can you comment on
> that?
The problem is that right now we still allow request queues shared by
multiple gendisks, so we can't get at the gendisk. The blktrace ioctl
interface gets around that by doing the setup with an ioctl on the
block device node and storing the dev_t at that point.
But I don't think we actually have any drivers left sharing a queue
that way, and I'd love to get rid of that for various reasons.
> from the single letter output to the verbose strings? Am I just being a
> curmudgeon?
>
> I have one further question, which will expose my ignorance on this
> implementation. Would the proposed patch change the information
> available to the blktrace utility,
That's the output from /sys/kernel/debug/tracing/trace when the
block events are enabled. The blktrace utility should continue to
work without changes, although it could be ported to use the ftrace
binary ring buffer directly in the future.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> I'm not sure I'm comfortable with this. Having had to diagnose
> performance problems using blktrace, I find the dev_t information for
> plugs and unplugs especially interesting. Maybe we can still figure
> this out given the context in the trace, though. Can you comment on
> that?
The problem is that right now we still allow request queues shared by
multiple gendisks, so we can't get at the gendisk. The blktrace ioctl
interface gets around that by doing the setup with an ioctl on the
block device node and storing the dev_t at that point.
But I don't think we actually have any drivers left sharing a queue
that way, and I'd love to get rid of that for various reasons.
> from the single letter output to the verbose strings? Am I just being a
> curmudgeon?
>
> I have one further question, which will expose my ignorance on this
> implementation. Would the proposed patch change the information
> available to the blktrace utility,
That's the output from /sys/kernel/debug/tracing/trace when the
block events are enabled. The blktrace utility should continue to
work without changes, although it could be ported to use the ftrace
binary ring buffer directly in the future.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
On Tue, 19 May 2009 09:08:41 -0400
Christoph Hellwig wrote:
> On Tue, May 19, 2009 at 08:59:29AM -0400, Jeff Moyer wrote:
> > I'm not sure I'm comfortable with this. Having had to diagnose
> > performance problems using blktrace, I find the dev_t information for
> > plugs and unplugs especially interesting. Maybe we can still figure
> > this out given the context in the trace, though. Can you comment on
> > that?
>
> The problem is that right now we still allow request queues shared by
> multiple gendisks, so we can't get at the gendisk. The blktrace ioctl
> interface gets around that by doing the setup with an ioctl on the
> block device node and storing the dev_t at that point.
>
> But I don't think we actually have any drivers left sharing a queue
> that way, and I'd love to get rid of that for various reasons.
There are still some (e.g. drivers/block/floppy.c)? I though that we
hit this when we tried to add the command filter feature though I
might be wrong (I can't recall).
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Christoph Hellwig wrote:
> On Tue, May 19, 2009 at 08:59:29AM -0400, Jeff Moyer wrote:
> > I'm not sure I'm comfortable with this. Having had to diagnose
> > performance problems using blktrace, I find the dev_t information for
> > plugs and unplugs especially interesting. Maybe we can still figure
> > this out given the context in the trace, though. Can you comment on
> > that?
>
> The problem is that right now we still allow request queues shared by
> multiple gendisks, so we can't get at the gendisk. The blktrace ioctl
> interface gets around that by doing the setup with an ioctl on the
> block device node and storing the dev_t at that point.
>
> But I don't think we actually have any drivers left sharing a queue
> that way, and I'd love to get rid of that for various reasons.
There are still some (e.g. drivers/block/floppy.c)? I though that we
hit this when we tried to add the command filter feature though I
might be wrong (I can't recall).
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
On Wed, May 20 2009, FUJITA Tomonori wrote:
> On Tue, 19 May 2009 09:08:41 -0400
> Christoph Hellwig wrote:
>
> > On Tue, May 19, 2009 at 08:59:29AM -0400, Jeff Moyer wrote:
> > > I'm not sure I'm comfortable with this. Having had to diagnose
> > > performance problems using blktrace, I find the dev_t information for
> > > plugs and unplugs especially interesting. Maybe we can still figure
> > > this out given the context in the trace, though. Can you comment on
> > > that?
> >
> > The problem is that right now we still allow request queues shared by
> > multiple gendisks, so we can't get at the gendisk. The blktrace ioctl
> > interface gets around that by doing the setup with an ioctl on the
> > block device node and storing the dev_t at that point.
> >
> > But I don't think we actually have any drivers left sharing a queue
> > that way, and I'd love to get rid of that for various reasons.
>
> There are still some (e.g. drivers/block/floppy.c)? I though that we
> hit this when we tried to add the command filter feature though I
> might be wrong (I can't recall).
And mtd, iirc. But only a few. I'll fiddle up a patch to finally get rid
of this, it has been pending for... years.
--
Jens Axboe
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> On Tue, 19 May 2009 09:08:41 -0400
> Christoph Hellwig wrote:
>
> > On Tue, May 19, 2009 at 08:59:29AM -0400, Jeff Moyer wrote:
> > > I'm not sure I'm comfortable with this. Having had to diagnose
> > > performance problems using blktrace, I find the dev_t information for
> > > plugs and unplugs especially interesting. Maybe we can still figure
> > > this out given the context in the trace, though. Can you comment on
> > > that?
> >
> > The problem is that right now we still allow request queues shared by
> > multiple gendisks, so we can't get at the gendisk. The blktrace ioctl
> > interface gets around that by doing the setup with an ioctl on the
> > block device node and storing the dev_t at that point.
> >
> > But I don't think we actually have any drivers left sharing a queue
> > that way, and I'd love to get rid of that for various reasons.
>
> There are still some (e.g. drivers/block/floppy.c)? I though that we
> hit this when we tried to add the command filter feature though I
> might be wrong (I can't recall).
And mtd, iirc. But only a few. I'll fiddle up a patch to finally get rid
of this, it has been pending for... years.
--
Jens Axboe
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Christoph Hellwig wrote:
> On Tue, May 19, 2009 at 08:59:29AM -0400, Jeff Moyer wrote:
>> I'm not sure I'm comfortable with this. Having had to diagnose
>> performance problems using blktrace, I find the dev_t information for
>> plugs and unplugs especially interesting. Maybe we can still figure
>> this out given the context in the trace, though. Can you comment on
>> that?
>
> The problem is that right now we still allow request queues shared by
> multiple gendisks, so we can't get at the gendisk. The blktrace ioctl
> interface gets around that by doing the setup with an ioctl on the
> block device node and storing the dev_t at that point.
>
> But I don't think we actually have any drivers left sharing a queue
> that way, and I'd love to get rid of that for various reasons.
>
It will be great if this can be done.
>> from the single letter output to the verbose strings? Am I just being a
>> curmudgeon?
>>
>> I have one further question, which will expose my ignorance on this
>> implementation. Would the proposed patch change the information
>> available to the blktrace utility,
>
> That's the output from /sys/kernel/debug/tracing/trace when the
> block events are enabled. The blktrace utility should continue to
> work without changes, although it could be ported to use the ftrace
> binary ring buffer directly in the future.
>
Yes, this patch doesn't affect ioctl-based blktrace or ftrace-plugin
blktrace at all.
To use blktracer:
# echo 1 > /sys/block/sda/trace/enable
# echo blk > /debug/tracing/current_tracer
# cat /debug/tracing/trace_pipe
To use block trace events:
# echo 'block:*' > /debug/tracing/set_event
# cat /trace_pipe
You can mixed-use blktracer and trace events, to see how they differ
in the output.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> On Tue, May 19, 2009 at 08:59:29AM -0400, Jeff Moyer wrote:
>> I'm not sure I'm comfortable with this. Having had to diagnose
>> performance problems using blktrace, I find the dev_t information for
>> plugs and unplugs especially interesting. Maybe we can still figure
>> this out given the context in the trace, though. Can you comment on
>> that?
>
> The problem is that right now we still allow request queues shared by
> multiple gendisks, so we can't get at the gendisk. The blktrace ioctl
> interface gets around that by doing the setup with an ioctl on the
> block device node and storing the dev_t at that point.
>
> But I don't think we actually have any drivers left sharing a queue
> that way, and I'd love to get rid of that for various reasons.
>
It will be great if this can be done.
>> from the single letter output to the verbose strings? Am I just being a
>> curmudgeon?
>>
>> I have one further question, which will expose my ignorance on this
>> implementation. Would the proposed patch change the information
>> available to the blktrace utility,
>
> That's the output from /sys/kernel/debug/tracing/trace when the
> block events are enabled. The blktrace utility should continue to
> work without changes, although it could be ported to use the ftrace
> binary ring buffer directly in the future.
>
Yes, this patch doesn't affect ioctl-based blktrace or ftrace-plugin
blktrace at all.
To use blktracer:
# echo 1 > /sys/block/sda/trace/enable
# echo blk > /debug/tracing/current_tracer
# cat /debug/tracing/trace_pipe
To use block trace events:
# echo 'block:*' > /debug/tracing/set_event
# cat /trace_pipe
You can mixed-use blktracer and trace events, to see how they differ
in the output.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
On Wed, May 20, 2009 at 04:38:54PM +0800, Li Zefan wrote:
> Yes, this patch doesn't affect ioctl-based blktrace or ftrace-plugin
> blktrace at all.
Is there any good reason for keeping the ftrace plugin once we have
the fully function event tracer support for the block events?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> Yes, this patch doesn't affect ioctl-based blktrace or ftrace-plugin
> blktrace at all.
Is there any good reason for keeping the ftrace plugin once we have
the fully function event tracer support for the block events?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> On Wed, May 20, 2009 at 04:38:54PM +0800, Li Zefan wrote:
> > Yes, this patch doesn't affect ioctl-based blktrace or ftrace-plugin
> > blktrace at all.
>
> Is there any good reason for keeping the ftrace plugin once we have
> the fully function event tracer support for the block events?
waiting userland blktrace tool convert using event tracer?
just guess :-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> > Yes, this patch doesn't affect ioctl-based blktrace or ftrace-plugin
> > blktrace at all.
>
> Is there any good reason for keeping the ftrace plugin once we have
> the fully function event tracer support for the block events?
waiting userland blktrace tool convert using event tracer?
just guess :-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
On Sun, May 24, 2009 at 02:15:05PM +0900, KOSAKI Motohiro wrote:
> > Is there any good reason for keeping the ftrace plugin once we have
> > the fully function event tracer support for the block events?
>
> waiting userland blktrace tool convert using event tracer?
blktrace(8) uses the ioctl interface, which we should keep for a long
time for backwards compatiblity.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> > Is there any good reason for keeping the ftrace plugin once we have
> > the fully function event tracer support for the block events?
>
> waiting userland blktrace tool convert using event tracer?
blktrace(8) uses the ioctl interface, which we should keep for a long
time for backwards compatiblity.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> On Sun, May 24, 2009 at 02:15:05PM +0900, KOSAKI Motohiro wrote:
> > > Is there any good reason for keeping the ftrace plugin once we have
> > > the fully function event tracer support for the block events?
> >
> > waiting userland blktrace tool convert using event tracer?
>
> blktrace(8) uses the ioctl interface, which we should keep for a long
> time for backwards compatiblity.
Ah, I misunderstand your point. sorry.
I though you suggested to remove ioctl. but it's reverse. ok. :)
So, I guess same formatting make easy understanding for administrator, maybe.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
> > > Is there any good reason for keeping the ftrace plugin once we have
> > > the fully function event tracer support for the block events?
> >
> > waiting userland blktrace tool convert using event tracer?
>
> blktrace(8) uses the ioctl interface, which we should keep for a long
> time for backwards compatiblity.
Ah, I misunderstand your point. sorry.
I though you suggested to remove ioctl. but it's reverse. ok. :)
So, I guess same formatting make easy understanding for administrator, maybe.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Related Threads
- [Python-Dev] __length_hint__ - python
- C++ PATCH for c++/35146 - gcc
- Getting "5 Related" Books via multiple Genres (HABTM ) from within Book view() and without requestAction - cakephp
- [users] is open office free to use in internet cafe? - openoffice
- problems updating lxde and gnome, both installed - ubuntu
- [gentoo-user] Genlop -f not quite right. - gentoo
- x86: more than 8 32-bit CPUs requires X86_BIGSMP - kernel
- Re: /etc/crontab won't run my script - freebsd
- [patch] enum cast fix (svn revision 146752) - gcc
- rake variables - ruby
- [gentoo-user] AUTO: Martin Schrodi ist außer Haus. - gentoo
- [gentoo-user] AUTO: Martin Schrodi ist außer Haus. - gentoo