From 41f2df62894bfcd3bf868af916b32b90aa7168dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Jun 2010 08:54:16 +0200 Subject: block: BARRIER request should imply SYNC A barrier request should by definition have priority in get_request and let the queue be unplugged immediately as it's blocking all forward progress due to the queue draining. Most filesystems already get this implicitly through the way submit_bh treats the buffer_ordered flag, and gfs2 sets it explicitly. But btrfs and XFS are still forgetting to set the flag, as is blkdev_issue_flush and some places in DM/MD. For XFS on metadata heavy workloads this gives a consistent speedup in the 2-3% range. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/gfs2/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 6a857e24f94..efc3539ac5a 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -595,7 +595,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) goto skip_barrier; get_bh(bh); - submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); + submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); -- cgit v1.2.3 From 7b6d91daee5cac6402186ff224c3af39d79f4a0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 7 Aug 2010 18:20:39 +0200 Subject: block: unify flags for struct bio and struct request Remove the current bio flags and reuse the request flags for the bio, too. This makes it easier to trace the type of I/O from the filesystem down to the block driver. There were two flags in the bio that were missing in the requests: BIO_RW_UNPLUG and BIO_RW_AHEAD. Also I've renamed two request flags that had a superfluous RW in them. 
Note that the flags are in bio.h despite having the REQ_ name - as blkdev.h includes bio.h that is the only way to go for now. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-barrier.c | 2 +- block/blk-core.c | 37 +++-------- block/blk-map.c | 2 +- block/blk-merge.c | 2 +- block/cfq-iosched.c | 14 ++--- block/elevator.c | 3 +- drivers/ata/libata-scsi.c | 2 +- drivers/block/aoe/aoeblk.c | 2 +- drivers/block/brd.c | 2 +- drivers/block/drbd/drbd_actlog.c | 8 +-- drivers/block/drbd/drbd_main.c | 6 +- drivers/block/drbd/drbd_receiver.c | 22 +++---- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/loop.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/umem.c | 2 +- drivers/ide/ide-cd_ioctl.c | 2 +- drivers/ide/ide-floppy.c | 2 +- drivers/md/dm-io.c | 12 ++-- drivers/md/dm-kcopyd.c | 2 +- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-stripe.c | 2 +- drivers/md/dm.c | 14 ++--- drivers/md/linear.c | 2 +- drivers/md/md.c | 10 +-- drivers/md/md.h | 4 +- drivers/md/multipath.c | 8 +-- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 22 +++---- drivers/md/raid10.c | 12 ++-- drivers/md/raid5.c | 2 +- drivers/scsi/osd/osd_initiator.c | 8 +-- fs/bio.c | 5 +- fs/btrfs/disk-io.c | 8 +-- fs/btrfs/inode.c | 6 +- fs/btrfs/volumes.c | 18 +++--- fs/exofs/ios.c | 2 +- fs/gfs2/log.c | 4 +- fs/gfs2/meta_io.c | 8 +-- fs/gfs2/ops_fstype.c | 2 +- fs/nilfs2/segbuf.c | 2 +- include/linux/bio.h | 125 +++++++++++++++++++++++-------------- include/linux/blkdev.h | 66 +------------------- include/linux/fs.h | 38 +++++------ kernel/power/block_io.c | 2 +- kernel/trace/blktrace.c | 27 ++++---- mm/page_io.c | 2 +- 47 files changed, 242 insertions(+), 289 deletions(-) (limited to 'fs') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 74e40439317..7c6f4a71468 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -203,7 +203,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) /* initialize proxy request and queue it */ 
blk_rq_init(q, rq); if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_RW; + rq->cmd_flags |= REQ_WRITE; if (q->ordered & QUEUE_ORDERED_DO_FUA) rq->cmd_flags |= REQ_FUA; init_request_from_bio(rq, q->orig_bar_rq->bio); diff --git a/block/blk-core.c b/block/blk-core.c index dca43a31e72..66c3cfe94d0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1140,25 +1140,9 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cpu = bio->bi_comp_cpu; req->cmd_type = REQ_TYPE_FS; - /* - * Inherit FAILFAST from bio (for read-ahead, and explicit - * FAILFAST). FAILFAST flags are identical for req and bio. - */ - if (bio_rw_flagged(bio, BIO_RW_AHEAD)) + req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; + if (bio->bi_rw & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; - else - req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK; - - if (bio_rw_flagged(bio, BIO_RW_DISCARD)) - req->cmd_flags |= REQ_DISCARD; - if (bio_rw_flagged(bio, BIO_RW_BARRIER)) - req->cmd_flags |= REQ_HARDBARRIER; - if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) - req->cmd_flags |= REQ_RW_SYNC; - if (bio_rw_flagged(bio, BIO_RW_META)) - req->cmd_flags |= REQ_RW_META; - if (bio_rw_flagged(bio, BIO_RW_NOIDLE)) - req->cmd_flags |= REQ_NOIDLE; req->errors = 0; req->__sector = bio->bi_sector; @@ -1181,12 +1165,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); - const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); - const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG); + const bool sync = (bio->bi_rw & REQ_SYNC); + const bool unplug = (bio->bi_rw & REQ_UNPLUG); const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; - if (bio_rw_flagged(bio, BIO_RW_BARRIER) && + if ((bio->bi_rw & REQ_HARDBARRIER) && (q->next_ordered == QUEUE_ORDERED_NONE)) { bio_endio(bio, -EOPNOTSUPP); return 0; @@ -1200,7 +1184,7 @@ static int __make_request(struct request_queue *q, 
struct bio *bio) spin_lock_irq(q->queue_lock); - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q)) + if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1275,7 +1259,7 @@ get_rq: */ rw_flags = bio_data_dir(bio); if (sync) - rw_flags |= REQ_RW_SYNC; + rw_flags |= REQ_SYNC; /* * Grab a free request. This is might sleep but can not fail. @@ -1464,7 +1448,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) && + if (unlikely(!(bio->bi_rw & REQ_DISCARD) && nr_sectors > queue_max_hw_sectors(q))) { printk(KERN_ERR "bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), @@ -1497,8 +1481,7 @@ static inline void __generic_make_request(struct bio *bio) if (bio_check_eod(bio, nr_sectors)) goto end_io; - if (bio_rw_flagged(bio, BIO_RW_DISCARD) && - !blk_queue_discard(q)) { + if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(q)) { err = -EOPNOTSUPP; goto end_io; } @@ -2365,7 +2348,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ - rq->cmd_flags |= bio->bi_rw & REQ_RW; + rq->cmd_flags |= bio->bi_rw & REQ_WRITE; if (bio_has_data(bio)) { rq->nr_phys_segments = bio_phys_segments(q, bio); diff --git a/block/blk-map.c b/block/blk-map.c index 9083cf0180c..c65d7593f7f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -307,7 +307,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, return PTR_ERR(bio); if (rq_data_dir(rq) == WRITE) - bio->bi_rw |= (1 << BIO_RW); + bio->bi_rw |= (1 << REQ_WRITE); if (do_copy) rq->cmd_flags |= REQ_COPY_USER; diff --git a/block/blk-merge.c b/block/blk-merge.c index 87e4fb7d0e9..4852475521e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -180,7 +180,7 @@ new_segment: } if (q->dma_drain_size && q->dma_drain_needed(rq)) { - if 
(rq->cmd_flags & REQ_RW) + if (rq->cmd_flags & REQ_WRITE) memset(q->dma_drain_buffer, 0, q->dma_drain_size); sg->page_link &= ~0x02; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d4edeb8fceb..eb4086f7dfe 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -458,7 +458,7 @@ static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) */ static inline bool cfq_bio_sync(struct bio *bio) { - return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO); + return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC); } /* @@ -646,10 +646,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, return rq1; else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) return rq2; - if ((rq1->cmd_flags & REQ_RW_META) && !(rq2->cmd_flags & REQ_RW_META)) + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) return rq1; - else if ((rq2->cmd_flags & REQ_RW_META) && - !(rq1->cmd_flags & REQ_RW_META)) + else if ((rq2->cmd_flags & REQ_META) && + !(rq1->cmd_flags & REQ_META)) return rq2; s1 = blk_rq_pos(rq1); @@ -1485,7 +1485,7 @@ static void cfq_remove_request(struct request *rq) cfqq->cfqd->rq_queued--; cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), rq_is_sync(rq)); - if (rq->cmd_flags & REQ_RW_META) { + if (rq->cmd_flags & REQ_META) { WARN_ON(!cfqq->meta_pending); cfqq->meta_pending--; } @@ -3177,7 +3177,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. 
*/ - if ((rq->cmd_flags & REQ_RW_META) && !cfqq->meta_pending) + if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) return true; /* @@ -3231,7 +3231,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic = RQ_CIC(rq); cfqd->rq_queued++; - if (rq->cmd_flags & REQ_RW_META) + if (rq->cmd_flags & REQ_META) cfqq->meta_pending++; cfq_update_io_thinktime(cfqd, cic); diff --git a/block/elevator.c b/block/elevator.c index aa99b59c03d..816a7c8d639 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -79,8 +79,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) /* * Don't merge file system requests and discard requests */ - if (bio_rw_flagged(bio, BIO_RW_DISCARD) != - bio_rw_flagged(rq->bio, BIO_RW_DISCARD)) + if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD)) return 0; /* diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index a5c08b082ed..0a8cd348479 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1114,7 +1114,7 @@ static int atapi_drain_needed(struct request *rq) if (likely(rq->cmd_type != REQ_TYPE_BLOCK_PC)) return 0; - if (!blk_rq_bytes(rq) || (rq->cmd_flags & REQ_RW)) + if (!blk_rq_bytes(rq) || (rq->cmd_flags & REQ_WRITE)) return 0; return atapi_cmd_type(rq->cmd[0]) == ATAPI_MISC; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 035cefe4045..65deffde60a 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -173,7 +173,7 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio) BUG(); bio_endio(bio, -ENXIO); return 0; - } else if (bio_rw_flagged(bio, BIO_RW_BARRIER)) { + } else if (bio->bi_rw & REQ_HARDBARRIER) { bio_endio(bio, -EOPNOTSUPP); return 0; } else if (bio->bi_io_vec == NULL) { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index f1bf79d9bc0..1b218c6b682 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -340,7 +340,7 @@ static int brd_make_request(struct request_queue *q, 
struct bio *bio) get_capacity(bdev->bd_disk)) goto out; - if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { + if (unlikely(bio->bi_rw & REQ_DISCARD)) { err = 0; discard_from_brd(brd, sector, bio->bi_size); goto out; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index df018990c42..9400845d602 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -79,8 +79,8 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, md_io.error = 0; if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) - rw |= (1 << BIO_RW_BARRIER); - rw |= ((1<bi_rw & REQ_HARDBARRIER) && !ok)) { /* Try again with no barrier */ dev_warn(DEV, "Barriers not supported on meta data device - disabling\n"); set_bit(MD_NO_BARRIER, &mdev->flags); - rw &= ~(1 << BIO_RW_BARRIER); + rw &= ~REQ_HARDBARRIER; bio_put(bio); goto retry; } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 7258c95e895..e2ab13d99d6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2425,15 +2425,15 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) /* NOTE: no need to check if barriers supported here as we would * not pass the test in make_request_common in that case */ - if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) { + if (req->master_bio->bi_rw & REQ_HARDBARRIER) { dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); /* dp_flags |= DP_HARDBARRIER; */ } - if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO)) + if (req->master_bio->bi_rw & REQ_SYNC) dp_flags |= DP_RW_SYNC; /* for now handle SYNCIO and UNPLUG * as if they still were one and the same flag */ - if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG)) + if (req->master_bio->bi_rw & REQ_UNPLUG) dp_flags |= DP_RW_SYNC; if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn <= C_PAUSED_SYNC_T) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 
dff48701b84..cba1deb7b27 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1180,7 +1180,7 @@ next_bio: bio->bi_sector = sector; bio->bi_bdev = mdev->ldev->backing_bdev; /* we special case some flags in the multi-bio case, see below - * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */ + * (REQ_UNPLUG, REQ_HARDBARRIER) */ bio->bi_rw = rw; bio->bi_private = e; bio->bi_end_io = drbd_endio_sec; @@ -1209,16 +1209,16 @@ next_bio: bios = bios->bi_next; bio->bi_next = NULL; - /* strip off BIO_RW_UNPLUG unless it is the last bio */ + /* strip off REQ_UNPLUG unless it is the last bio */ if (bios) - bio->bi_rw &= ~(1<bi_rw &= ~REQ_UNPLUG; drbd_generic_make_request(mdev, fault_type, bio); - /* strip off BIO_RW_BARRIER, + /* strip off REQ_HARDBARRIER, * unless it is the first or last bio */ if (bios && bios->bi_next) - bios->bi_rw &= ~(1<bi_rw &= ~REQ_HARDBARRIER; } while (bios); maybe_kick_lo(mdev); return 0; @@ -1233,7 +1233,7 @@ fail: } /** - * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set + * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set * @mdev: DRBD device. * @w: work object. * @cancel: The connection will be closed anyways (unused in this callback) @@ -1245,7 +1245,7 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) so that we can finish that epoch in drbd_may_finish_epoch(). That is necessary if we already have a long chain of Epochs, before - we realize that BIO_RW_BARRIER is actually not supported */ + we realize that REQ_HARDBARRIER is actually not supported */ /* As long as the -ENOTSUPP on the barrier is reported immediately that will never trigger. 
If it is reported late, we will just @@ -1824,14 +1824,14 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); if (epoch == e->epoch) { set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); - rw |= (1<flags |= EE_IS_BARRIER; } else { if (atomic_read(&epoch->epoch_size) > 1 || !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); - rw |= (1<flags |= EE_IS_BARRIER; } } @@ -1841,10 +1841,10 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) dp_flags = be32_to_cpu(p->dp_flags); if (dp_flags & DP_HARDBARRIER) { dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); - /* rw |= (1<flags |= EE_MAY_SET_IN_SYNC; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 654f1ef5cbb..f761d98a4e9 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -997,7 +997,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio) * because of those XXX, this is not yet enabled, * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. 
*/ - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) { + if (unlikely(bio->bi_rw & REQ_HARDBARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags)) { /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ bio_endio(bio, -EOPNOTSUPP); return 0; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6120922f459..fedfdb7d3cd 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -476,7 +476,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; if (bio_rw(bio) == WRITE) { - bool barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); + bool barrier = (bio->bi_rw & REQ_HARDBARRIER); struct file *file = lo->lo_backing_file; if (barrier) { diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 8a549db2aa7..9f3e4454274 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1221,7 +1221,7 @@ static int pkt_start_recovery(struct packet_data *pkt) pkt->bio->bi_flags = 1 << BIO_UPTODATE; pkt->bio->bi_idx = 0; - BUG_ON(pkt->bio->bi_rw != (1 << BIO_RW)); + BUG_ON(pkt->bio->bi_rw != REQ_WRITE); BUG_ON(pkt->bio->bi_vcnt != pkt->frames); BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE); BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 2f9470ff8f7..8be57151f5d 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -478,7 +478,7 @@ static void process_page(unsigned long data) le32_to_cpu(desc->local_addr)>>9, le32_to_cpu(desc->transfer_size)); dump_dmastat(card, control); - } else if (test_bit(BIO_RW, &bio->bi_rw) && + } else if ((bio->bi_rw & REQ_WRITE) && le32_to_cpu(desc->local_addr) >> 9 == card->init_size) { card->init_size += le32_to_cpu(desc->transfer_size) >> 9; diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 02712bf045c..766b3deeb23 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ 
b/drivers/ide/ide-cd_ioctl.c @@ -454,7 +454,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi, touch it at all. */ if (cgc->data_direction == CGC_DATA_WRITE) - flags |= REQ_RW; + flags |= REQ_WRITE; if (cgc->sense) memset(cgc->sense, 0, sizeof(struct request_sense)); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index c7d0737bb18..5406b6ea3ad 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -207,7 +207,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive, memcpy(rq->cmd, pc->c, 12); pc->rq = rq; - if (rq->cmd_flags & REQ_RW) + if (rq->cmd_flags & REQ_WRITE) pc->flags |= PC_FLAG_WRITING; pc->flags |= PC_FLAG_DMA_OK; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 10f457ca6af..0590c75b0ab 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -356,7 +356,7 @@ static void dispatch_io(int rw, unsigned int num_regions, BUG_ON(num_regions > DM_IO_MAX_REGIONS); if (sync) - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; /* * For multiple regions we need to be careful to rewind @@ -364,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions, */ for (i = 0; i < num_regions; i++) { *dp = old_pages; - if (where[i].count || (rw & (1 << BIO_RW_BARRIER))) + if (where[i].count || (rw & REQ_HARDBARRIER)) do_region(rw, i, where + i, dp, io); } @@ -412,8 +412,8 @@ retry: } set_current_state(TASK_RUNNING); - if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { - rw &= ~(1 << BIO_RW_BARRIER); + if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) { + rw &= ~REQ_HARDBARRIER; goto retry; } @@ -479,8 +479,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp) * New collapsed (a)synchronous interface. * * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug - * the queue with blk_unplug() some time later or set the BIO_RW_SYNC bit in - * io_req->bi_rw. 
If you fail to do one of these, the IO will be submitted to + * the queue with blk_unplug() some time later or set REQ_SYNC in + * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. */ int dm_io(struct dm_io_request *io_req, unsigned num_regions, diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index addf8347504..d8587bac568 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -345,7 +345,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG), + .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ddda531723d..74136262d65 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1211,7 +1211,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, if (error == -EOPNOTSUPP) goto out; - if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD)) + if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) goto out; if (unlikely(error)) { diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index e610725db76..d6e28d732b4 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -284,7 +284,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, if (!error) return 0; /* I/O complete */ - if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD)) + if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) return error; if (error == -EOPNOTSUPP) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1e0e6dd5150..d6f77baeafd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -614,7 +614,7 @@ static void dec_pending(struct dm_io *io, int error) */ spin_lock_irqsave(&md->deferred_lock, flags); if (__noflush_suspending(md)) { - if (!bio_rw_flagged(io->bio, 
BIO_RW_BARRIER)) + if (!(io->bio->bi_rw & REQ_HARDBARRIER)) bio_list_add_head(&md->deferred, io->bio); } else @@ -626,7 +626,7 @@ static void dec_pending(struct dm_io *io, int error) io_error = io->error; bio = io->bio; - if (bio_rw_flagged(bio, BIO_RW_BARRIER)) { + if (bio->bi_rw & REQ_HARDBARRIER) { /* * There can be just one barrier request so we use * a per-device variable for error reporting. @@ -1106,7 +1106,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, clone->bi_sector = sector; clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER); + clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; clone->bi_vcnt = 1; clone->bi_size = to_bytes(len); clone->bi_io_vec->bv_offset = offset; @@ -1133,7 +1133,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); __bio_clone(clone, bio); - clone->bi_rw &= ~(1 << BIO_RW_BARRIER); + clone->bi_rw &= ~REQ_HARDBARRIER; clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; @@ -1301,7 +1301,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.map = dm_get_live_table(md); if (unlikely(!ci.map)) { - if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) + if (!(bio->bi_rw & REQ_HARDBARRIER)) bio_io_error(bio); else if (!md->barrier_error) @@ -1414,7 +1414,7 @@ static int _dm_request(struct request_queue *q, struct bio *bio) * we have to queue this io for later. 
*/ if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || - unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { + unlikely(bio->bi_rw & REQ_HARDBARRIER)) { up_read(&md->io_lock); if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && @@ -2296,7 +2296,7 @@ static void dm_wq_work(struct work_struct *work) if (dm_request_based(md)) generic_make_request(c); else { - if (bio_rw_flagged(c, BIO_RW_BARRIER)) + if (c->bi_rw & REQ_HARDBARRIER) process_barrier(md, c); else __split_and_process_bio(md, c); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 7e0e057db9a..ba19060bcf3 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -294,7 +294,7 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio) dev_info_t *tmp_dev; sector_t start_sector; - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { + if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { md_barrier_request(mddev, bio); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index cb20d0b0555..1893af67877 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -353,7 +353,7 @@ static void md_submit_barrier(struct work_struct *ws) /* an empty barrier - all done */ bio_endio(bio, 0); else { - bio->bi_rw &= ~(1<bi_rw &= ~REQ_HARDBARRIER; if (mddev->pers->make_request(mddev, bio)) generic_make_request(bio); mddev->barrier = POST_REQUEST_BARRIER; @@ -675,11 +675,11 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * if zero is reached. * If an error occurred, call md_error * - * As we might need to resubmit the request if BIO_RW_BARRIER + * As we might need to resubmit the request if REQ_HARDBARRIER * causes ENOTSUPP, we allocate a spare bio... 
*/ struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = (1<bi_bdev = rdev->bdev; bio->bi_sector = sector; @@ -691,7 +691,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, atomic_inc(&mddev->pending_writes); if (!test_bit(BarriersNotsupp, &rdev->flags)) { struct bio *rbio; - rw |= (1<bi_private = bio; rbio->bi_end_io = super_written_barrier; @@ -736,7 +736,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct completion event; int ret; - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; bio->bi_bdev = bdev; bio->bi_sector = sector; diff --git a/drivers/md/md.h b/drivers/md/md.h index 10597bfec00..fc56e0f21c8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -67,7 +67,7 @@ struct mdk_rdev_s #define Faulty 1 /* device is known to have a fault */ #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ -#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ +#define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */ #define AllReserved 6 /* If whole device is reserved for * one array */ #define AutoDetected 7 /* added by auto-detect */ @@ -254,7 +254,7 @@ struct mddev_s * fails. 
Only supported */ struct bio *biolist; /* bios that need to be retried - * because BIO_RW_BARRIER is not supported + * because REQ_HARDBARRIER is not supported */ atomic_t recovery_active; /* blocks scheduled, but not written */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 410fb60699a..0307d217e7a 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -91,7 +91,7 @@ static void multipath_end_request(struct bio *bio, int error) if (uptodate) multipath_end_bh_io(mp_bh, 0); - else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) { + else if (!(bio->bi_rw & REQ_RAHEAD)) { /* * oops, IO error: */ @@ -142,7 +142,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { + if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { md_barrier_request(mddev, bio); return 0; } @@ -163,7 +163,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio) mp_bh->bio = *bio; mp_bh->bio.bi_sector += multipath->rdev->data_offset; mp_bh->bio.bi_bdev = multipath->rdev->bdev; - mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); + mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; generic_make_request(&mp_bh->bio); @@ -398,7 +398,7 @@ static void multipathd (mddev_t *mddev) *bio = *(mp_bh->master_bio); bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; - bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); + bio->bi_rw |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; generic_make_request(bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 563abed5a2c..6f7af46d623 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -483,7 +483,7 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio) struct strip_zone *zone; mdk_rdev_t 
*tmp_dev; - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { + if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { md_barrier_request(mddev, bio); return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a948da8012d..73cc74ffc26 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -787,7 +787,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) struct bio_list bl; struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); - const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); + const bool do_sync = (bio->bi_rw & REQ_SYNC); bool do_barriers; mdk_rdev_t *blocked_rdev; @@ -822,7 +822,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) finish_wait(&conf->wait_barrier, &w); } if (unlikely(!mddev->barriers_work && - bio_rw_flagged(bio, BIO_RW_BARRIER))) { + (bio->bi_rw & REQ_HARDBARRIER))) { if (rw == WRITE) md_write_end(mddev); bio_endio(bio, -EOPNOTSUPP); @@ -877,7 +877,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r1_bio; generic_make_request(read_bio); @@ -959,7 +959,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) atomic_set(&r1_bio->remaining, 0); atomic_set(&r1_bio->behind_remaining, 0); - do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER); + do_barriers = bio->bi_rw & REQ_HARDBARRIER; if (do_barriers) set_bit(R1BIO_Barrier, &r1_bio->state); @@ -975,8 +975,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) | - (do_sync << BIO_RW_SYNCIO); + mbio->bi_rw = WRITE | do_barriers | do_sync; 
mbio->bi_private = r1_bio; if (behind_pages) { @@ -1633,7 +1632,7 @@ static void raid1d(mddev_t *mddev) sync_request_write(mddev, r1_bio); unplug = 1; } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { - /* some requests in the r1bio were BIO_RW_BARRIER + /* some requests in the r1bio were REQ_HARDBARRIER * requests which failed with -EOPNOTSUPP. Hohumm.. * Better resubmit without the barrier. * We know which devices to resubmit for, because @@ -1641,7 +1640,7 @@ static void raid1d(mddev_t *mddev) * We already have a nr_pending reference on these rdevs. */ int i; - const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); + const bool do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC); clear_bit(R1BIO_BarrierRetry, &r1_bio->state); clear_bit(R1BIO_Barrier, &r1_bio->state); for (i=0; i < conf->raid_disks; i++) @@ -1662,8 +1661,7 @@ static void raid1d(mddev_t *mddev) conf->mirrors[i].rdev->data_offset; bio->bi_bdev = conf->mirrors[i].rdev->bdev; bio->bi_end_io = raid1_end_write_request; - bio->bi_rw = WRITE | - (do_sync << BIO_RW_SYNCIO); + bio->bi_rw = WRITE | do_sync; bio->bi_private = r1_bio; r1_bio->bios[i] = bio; generic_make_request(bio); @@ -1698,7 +1696,7 @@ static void raid1d(mddev_t *mddev) (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); } else { - const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); + const bool do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC; r1_bio->bios[r1_bio->read_disk] = mddev->ro ? 
IO_BLOCKED : NULL; r1_bio->read_disk = disk; @@ -1715,7 +1713,7 @@ static void raid1d(mddev_t *mddev) bio->bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + bio->bi_rw = READ | do_sync; bio->bi_private = r1_bio; unplug = 1; generic_make_request(bio); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 42e64e4e5e2..62ecb6650fd 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -799,12 +799,12 @@ static int make_request(mddev_t *mddev, struct bio * bio) int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); - const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); + const bool do_sync = (bio->bi_rw & REQ_SYNC); struct bio_list bl; unsigned long flags; mdk_rdev_t *blocked_rdev; - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { + if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { md_barrier_request(mddev, bio); return 0; } @@ -879,7 +879,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; generic_make_request(read_bio); @@ -947,7 +947,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO); + mbio->bi_rw = WRITE | do_sync; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1716,7 +1716,7 @@ static void raid10d(mddev_t *mddev) raid_end_bio_io(r10_bio); bio_put(bio); } else { - const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO); + const bool do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); bio_put(bio); rdev = conf->mirrors[mirror].rdev; if 
(printk_ratelimit()) @@ -1730,7 +1730,7 @@ static void raid10d(mddev_t *mddev) bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + bio->bi_rw = READ | do_sync; bio->bi_private = r10_bio; bio->bi_end_io = raid10_end_read_request; unplug = 1; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 96c690279fc..20ac2f14376 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3958,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) const int rw = bio_data_dir(bi); int remaining; - if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { + if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { /* Drain all pending writes. We only really need * to ensure they have been submitted, but this is * easier. diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index ee4b6914667..fda4de3440c 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -716,7 +716,7 @@ static int _osd_req_list_objects(struct osd_request *or, return PTR_ERR(bio); } - bio->bi_rw &= ~(1 << BIO_RW); + bio->bi_rw &= ~REQ_WRITE; or->in.bio = bio; or->in.total_bytes = bio->bi_size; return 0; @@ -814,7 +814,7 @@ void osd_req_write(struct osd_request *or, { _osd_req_encode_common(or, OSD_ACT_WRITE, obj, offset, len); WARN_ON(or->out.bio || or->out.total_bytes); - WARN_ON(0 == bio_rw_flagged(bio, BIO_RW)); + WARN_ON(0 == (bio->bi_rw & REQ_WRITE)); or->out.bio = bio; or->out.total_bytes = len; } @@ -829,7 +829,7 @@ int osd_req_write_kern(struct osd_request *or, if (IS_ERR(bio)) return PTR_ERR(bio); - bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ + bio->bi_rw |= REQ_WRITE; /* FIXME: bio_set_dir() */ osd_req_write(or, obj, offset, bio, len); return 0; } @@ -865,7 +865,7 @@ void osd_req_read(struct osd_request *or, { _osd_req_encode_common(or, OSD_ACT_READ, obj, offset, len); WARN_ON(or->in.bio || or->in.total_bytes); - WARN_ON(1 
== bio_rw_flagged(bio, BIO_RW)); + WARN_ON(1 == (bio->bi_rw & REQ_WRITE)); or->in.bio = bio; or->in.total_bytes = len; } diff --git a/fs/bio.c b/fs/bio.c index e7bf6ca64dc..8abb2dfb2e7 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -843,7 +843,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (!bio) goto out_bmd; - bio->bi_rw |= (!write_to_vm << BIO_RW); + if (!write_to_vm) + bio->bi_rw |= REQ_WRITE; ret = 0; @@ -1024,7 +1025,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, * set data direction, and check if mapped pages need bouncing */ if (!write_to_vm) - bio->bi_rw |= (1 << BIO_RW); + bio->bi_rw |= REQ_WRITE; bio->bi_bdev = bdev; bio->bi_flags |= (1 << BIO_USER_MAPPED); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 34f7c375567..64f10082f04 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -480,7 +480,7 @@ static void end_workqueue_bio(struct bio *bio, int err) end_io_wq->work.func = end_workqueue_fn; end_io_wq->work.flags = 0; - if (bio->bi_rw & (1 << BIO_RW)) { + if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata) btrfs_queue_worker(&fs_info->endio_meta_write_workers, &end_io_wq->work); @@ -604,7 +604,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (rw & (1 << BIO_RW_SYNCIO)) + if (rw & REQ_SYNC) btrfs_set_work_high_prio(&async->work); btrfs_queue_worker(&fs_info->workers, &async->work); @@ -668,7 +668,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, bio, 1); BUG_ON(ret); - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -1427,7 +1427,7 @@ static void end_workqueue_fn(struct btrfs_work *work) * ram and up to date before trying to verify things. 
For * blocksize <= pagesize, it is basically a noop */ - if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && + if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && !bio_ready_for_csum(bio)) { btrfs_queue_worker(&fs_info->endio_meta_workers, &end_io_wq->work); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1bff92ad474..e975d7180a8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1429,7 +1429,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { if (bio_flags & EXTENT_BIO_COMPRESSED) { return btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); @@ -1841,7 +1841,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, bio->bi_size = 0; bio_add_page(bio, page, failrec->len, start - page_offset(page)); - if (failed_bio->bi_rw & (1 << BIO_RW)) + if (failed_bio->bi_rw & REQ_WRITE) rw = WRITE; else rw = READ; @@ -5642,7 +5642,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, struct bio_vec *bvec = bio->bi_io_vec; u64 start; int skip_sum; - int write = rw & (1 << BIO_RW); + int write = rw & REQ_WRITE; int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d6e3af8be95..dd318ff280b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -258,7 +258,7 @@ loop_lock: BUG_ON(atomic_read(&cur->bi_cnt) == 0); - if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) + if (cur->bi_rw & REQ_SYNC) num_sync_run++; submit_bio(cur->bi_rw, cur); @@ -2651,7 +2651,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int max_errors = 0; struct btrfs_multi_bio *multi = NULL; - if (multi_ret && !(rw & (1 << BIO_RW))) + if (multi_ret && !(rw & REQ_WRITE)) stripes_allocated = 1; again: if (multi_ret) { @@ -2687,7 +2687,7 @@ again: mirror_num = 0; /* if our multi bio struct is too 
small, back off and try again */ - if (rw & (1 << BIO_RW)) { + if (rw & REQ_WRITE) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { stripes_required = map->num_stripes; @@ -2697,7 +2697,7 @@ again: max_errors = 1; } } - if (multi_ret && (rw & (1 << BIO_RW)) && + if (multi_ret && (rw & REQ_WRITE) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2733,7 +2733,7 @@ again: num_stripes = 1; stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (unplug_page || (rw & (1 << BIO_RW))) + if (unplug_page || (rw & REQ_WRITE)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -2744,7 +2744,7 @@ again: } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (1 << BIO_RW)) + if (rw & REQ_WRITE) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -2755,7 +2755,7 @@ again: stripe_index = do_div(stripe_nr, factor); stripe_index *= map->sub_stripes; - if (unplug_page || (rw & (1 << BIO_RW))) + if (unplug_page || (rw & REQ_WRITE)) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -2945,7 +2945,7 @@ static noinline int schedule_bio(struct btrfs_root *root, struct btrfs_pending_bios *pending_bios; /* don't bother with additional async steps for reads, right now */ - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { bio_get(bio); submit_bio(rw, bio); bio_put(bio); @@ -2964,7 +2964,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); - if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) + if (bio->bi_rw & REQ_SYNC) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 4337cad7777..e2732203fa9 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -599,7 +599,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) } else { bio = 
master_dev->bio; /* FIXME: bio_set_dir() */ - bio->bi_rw |= (1 << BIO_RW); + bio->bi_rw |= REQ_WRITE; } osd_req_write(or, &ios->obj, per_dev->offset, bio, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index efc3539ac5a..cde1248a622 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -595,7 +595,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) goto skip_barrier; get_bh(bh); - submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); + submit_bh(WRITE_BARRIER | REQ_META, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -605,7 +605,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) lock_buffer(bh); skip_barrier: get_bh(bh); - submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh); + submit_bh(WRITE_SYNC | REQ_META, bh); wait_on_buffer(bh); } if (!buffer_uptodate(bh)) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 18176d0b75d..f3b071f921a 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -36,8 +36,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE)); + int write_op = REQ_META | + (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC_PLUG : WRITE); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); + submit_bh(READ_SYNC | REQ_META, bh); if (!(flags & DIO_WAIT)) return 0; @@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); + ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3593b3a7290..fd4f8946abf 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -275,7 +275,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); + submit_bio(READ_SYNC | REQ_META, bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 2e6a2723b8f..4588fb9e93d 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -508,7 +508,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * Last BIO is always sent through the following * submission. */ - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 7fc5606e6ea..4d379c8250a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -138,55 +138,83 @@ struct bio { #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) /* - * bio bi_rw flags - * - * bit 0 -- data direction - * If not set, bio is a read from device. If set, it's a write to device. 
- * bit 1 -- fail fast device errors - * bit 2 -- fail fast transport errors - * bit 3 -- fail fast driver errors - * bit 4 -- rw-ahead when set - * bit 5 -- barrier - * Insert a serialization point in the IO queue, forcing previously - * submitted IO to be completed before this one is issued. - * bit 6 -- synchronous I/O hint. - * bit 7 -- Unplug the device immediately after submitting this bio. - * bit 8 -- metadata request - * Used for tracing to differentiate metadata and data IO. May also - * get some preferential treatment in the IO scheduler - * bit 9 -- discard sectors - * Informs the lower level device that this range of sectors is no longer - * used by the file system and may thus be freed by the device. Used - * for flash based storage. - * Don't want driver retries for any fast fail whatever the reason. - * bit 10 -- Tell the IO scheduler not to wait for more requests after this - one has been submitted, even if it is a SYNC request. + * Request flags. For use in the cmd_flags field of struct request, and in + * bi_rw of struct bio. Note that some flags are only valid in either one. */ -enum bio_rw_flags { - BIO_RW, - BIO_RW_FAILFAST_DEV, - BIO_RW_FAILFAST_TRANSPORT, - BIO_RW_FAILFAST_DRIVER, - /* above flags must match REQ_* */ - BIO_RW_AHEAD, - BIO_RW_BARRIER, - BIO_RW_SYNCIO, - BIO_RW_UNPLUG, - BIO_RW_META, - BIO_RW_DISCARD, - BIO_RW_NOIDLE, +enum rq_flag_bits { + /* common flags */ + __REQ_WRITE, /* not set, read. 
set, write */ + __REQ_FAILFAST_DEV, /* no driver retries of device errors */ + __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ + __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ + + __REQ_HARDBARRIER, /* may not be passed by drive either */ + __REQ_SYNC, /* request is sync (sync write or read) */ + __REQ_META, /* metadata io request */ + __REQ_DISCARD, /* request to discard sectors */ + __REQ_NOIDLE, /* don't anticipate more IO after this one */ + + /* bio only flags */ + __REQ_UNPLUG, /* unplug the immediately after submission */ + __REQ_RAHEAD, /* read ahead, can fail anytime */ + + /* request only flags */ + __REQ_SORTED, /* elevator knows about this request */ + __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ + __REQ_FUA, /* forced unit access */ + __REQ_NOMERGE, /* don't touch this for merging */ + __REQ_STARTED, /* drive already may have started this one */ + __REQ_DONTPREP, /* don't call prep for this one */ + __REQ_QUEUED, /* uses queueing */ + __REQ_ELVPRIV, /* elevator private data attached */ + __REQ_FAILED, /* set if the request failed */ + __REQ_QUIET, /* don't worry about errors */ + __REQ_PREEMPT, /* set for "ide_preempt" requests */ + __REQ_ORDERED_COLOR, /* is before or after barrier */ + __REQ_ALLOCED, /* request came from our alloc pool */ + __REQ_COPY_USER, /* contains copies of user pages */ + __REQ_INTEGRITY, /* integrity metadata has been remapped */ + __REQ_IO_STAT, /* account I/O stat */ + __REQ_MIXED_MERGE, /* merge of different types, fail separately */ + __REQ_NR_BITS, /* stops here */ }; -/* - * First four bits must match between bio->bi_rw and rq->cmd_flags, make - * that explicit here. 
- */ -#define BIO_RW_RQ_MASK 0xf - -static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag) -{ - return (bio->bi_rw & (1 << flag)) != 0; -} +#define REQ_WRITE (1 << __REQ_WRITE) +#define REQ_FAILFAST_DEV (1 << __REQ_FAILFAST_DEV) +#define REQ_FAILFAST_TRANSPORT (1 << __REQ_FAILFAST_TRANSPORT) +#define REQ_FAILFAST_DRIVER (1 << __REQ_FAILFAST_DRIVER) +#define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER) +#define REQ_SYNC (1 << __REQ_SYNC) +#define REQ_META (1 << __REQ_META) +#define REQ_DISCARD (1 << __REQ_DISCARD) +#define REQ_NOIDLE (1 << __REQ_NOIDLE) + +#define REQ_FAILFAST_MASK \ + (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) +#define REQ_COMMON_MASK \ + (REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \ + REQ_META| REQ_DISCARD | REQ_NOIDLE) + +#define REQ_UNPLUG (1 << __REQ_UNPLUG) +#define REQ_RAHEAD (1 << __REQ_RAHEAD) + +#define REQ_SORTED (1 << __REQ_SORTED) +#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) +#define REQ_FUA (1 << __REQ_FUA) +#define REQ_NOMERGE (1 << __REQ_NOMERGE) +#define REQ_STARTED (1 << __REQ_STARTED) +#define REQ_DONTPREP (1 << __REQ_DONTPREP) +#define REQ_QUEUED (1 << __REQ_QUEUED) +#define REQ_ELVPRIV (1 << __REQ_ELVPRIV) +#define REQ_FAILED (1 << __REQ_FAILED) +#define REQ_QUIET (1 << __REQ_QUIET) +#define REQ_PREEMPT (1 << __REQ_PREEMPT) +#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) +#define REQ_ALLOCED (1 << __REQ_ALLOCED) +#define REQ_COPY_USER (1 << __REQ_COPY_USER) +#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) +#define REQ_IO_STAT (1 << __REQ_IO_STAT) +#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) /* * upper 16 bits of bi_rw define the io priority of this bio @@ -211,7 +239,10 @@ static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag) #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) -#define bio_empty_barrier(bio) (bio_rw_flagged(bio, 
BIO_RW_BARRIER) && !bio_has_data(bio) && !bio_rw_flagged(bio, BIO_RW_DISCARD)) +#define bio_empty_barrier(bio) \ + ((bio->bi_rw & REQ_HARDBARRIER) && \ + !bio_has_data(bio) && \ + !(bio->bi_rw & REQ_DISCARD)) static inline unsigned int bio_cur_bytes(struct bio *bio) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3ecd28ef9ba..3fc0f590861 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -84,70 +84,6 @@ enum { REQ_LB_OP_FLUSH = 0x41, /* flush request */ }; -/* - * request type modified bits. first four bits match BIO_RW* bits, important - */ -enum rq_flag_bits { - __REQ_RW, /* not set, read. set, write */ - __REQ_FAILFAST_DEV, /* no driver retries of device errors */ - __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ - __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ - /* above flags must match BIO_RW_* */ - __REQ_DISCARD, /* request to discard sectors */ - __REQ_SORTED, /* elevator knows about this request */ - __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ - __REQ_HARDBARRIER, /* may not be passed by drive either */ - __REQ_FUA, /* forced unit access */ - __REQ_NOMERGE, /* don't touch this for merging */ - __REQ_STARTED, /* drive already may have started this one */ - __REQ_DONTPREP, /* don't call prep for this one */ - __REQ_QUEUED, /* uses queueing */ - __REQ_ELVPRIV, /* elevator private data attached */ - __REQ_FAILED, /* set if the request failed */ - __REQ_QUIET, /* don't worry about errors */ - __REQ_PREEMPT, /* set for "ide_preempt" requests */ - __REQ_ORDERED_COLOR, /* is before or after barrier */ - __REQ_RW_SYNC, /* request is sync (sync write or read) */ - __REQ_ALLOCED, /* request came from our alloc pool */ - __REQ_RW_META, /* metadata io request */ - __REQ_COPY_USER, /* contains copies of user pages */ - __REQ_INTEGRITY, /* integrity metadata has been remapped */ - __REQ_NOIDLE, /* Don't anticipate more IO after this one */ - __REQ_IO_STAT, /* account I/O stat */ - 
__REQ_MIXED_MERGE, /* merge of different types, fail separately */ - __REQ_NR_BITS, /* stops here */ -}; - -#define REQ_RW (1 << __REQ_RW) -#define REQ_FAILFAST_DEV (1 << __REQ_FAILFAST_DEV) -#define REQ_FAILFAST_TRANSPORT (1 << __REQ_FAILFAST_TRANSPORT) -#define REQ_FAILFAST_DRIVER (1 << __REQ_FAILFAST_DRIVER) -#define REQ_DISCARD (1 << __REQ_DISCARD) -#define REQ_SORTED (1 << __REQ_SORTED) -#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) -#define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER) -#define REQ_FUA (1 << __REQ_FUA) -#define REQ_NOMERGE (1 << __REQ_NOMERGE) -#define REQ_STARTED (1 << __REQ_STARTED) -#define REQ_DONTPREP (1 << __REQ_DONTPREP) -#define REQ_QUEUED (1 << __REQ_QUEUED) -#define REQ_ELVPRIV (1 << __REQ_ELVPRIV) -#define REQ_FAILED (1 << __REQ_FAILED) -#define REQ_QUIET (1 << __REQ_QUIET) -#define REQ_PREEMPT (1 << __REQ_PREEMPT) -#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) -#define REQ_RW_SYNC (1 << __REQ_RW_SYNC) -#define REQ_ALLOCED (1 << __REQ_ALLOCED) -#define REQ_RW_META (1 << __REQ_RW_META) -#define REQ_COPY_USER (1 << __REQ_COPY_USER) -#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) -#define REQ_NOIDLE (1 << __REQ_NOIDLE) -#define REQ_IO_STAT (1 << __REQ_IO_STAT) -#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) - -#define REQ_FAILFAST_MASK (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | \ - REQ_FAILFAST_DRIVER) - #define BLK_MAX_CDB 16 /* @@ -631,7 +567,7 @@ enum { */ static inline bool rw_is_sync(unsigned int rw_flags) { - return !(rw_flags & REQ_RW) || (rw_flags & REQ_RW_SYNC); + return !(rw_flags & REQ_WRITE) || (rw_flags & REQ_SYNC); } static inline bool rq_is_sync(struct request *rq) diff --git a/include/linux/fs.h b/include/linux/fs.h index 59887883149..c5c92943c76 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -144,29 +144,31 @@ struct inodes_stat_t { * of this IO. 
* */ -#define RW_MASK 1 -#define RWA_MASK 2 -#define READ 0 -#define WRITE 1 -#define READA 2 /* read-ahead - don't block if no resources */ -#define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ -#define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) -#define READ_META (READ | (1 << BIO_RW_META)) -#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) -#define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -#define WRITE_ODIRECT_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) -#define WRITE_META (WRITE | (1 << BIO_RW_META)) -#define SWRITE_SYNC_PLUG \ - (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) -#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -#define WRITE_BARRIER (WRITE_SYNC | (1 << BIO_RW_BARRIER)) +#define RW_MASK 1 +#define RWA_MASK 2 + +#define READ 0 +#define WRITE 1 +#define READA 2 /* readahead - don't block if no resources */ +#define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ + +#define READ_SYNC (READ | REQ_SYNC | REQ_UNPLUG) +#define READ_META (READ | REQ_META) +#define WRITE_SYNC_PLUG (WRITE | REQ_SYNC | REQ_NOIDLE) +#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG) +#define WRITE_ODIRECT_PLUG (WRITE | REQ_SYNC) +#define WRITE_META (WRITE | REQ_META) +#define WRITE_BARRIER (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ + REQ_HARDBARRIER) +#define SWRITE_SYNC_PLUG (SWRITE | REQ_SYNC | REQ_NOIDLE) +#define SWRITE_SYNC (SWRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG) /* * These aren't really reads or writes, they pass down information about * parts of device that are now unused by the file system. 
*/ -#define DISCARD_NOBARRIER (WRITE | (1 << BIO_RW_DISCARD)) -#define DISCARD_BARRIER (DISCARD_NOBARRIER | (1 << BIO_RW_BARRIER)) +#define DISCARD_NOBARRIER (WRITE | REQ_DISCARD) +#define DISCARD_BARRIER (WRITE | REQ_DISCARD | REQ_HARDBARRIER) #define SEL_IN 1 #define SEL_OUT 2 diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 97024fd40cd..83bbc7c02df 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4f149944cb8..3b4a695051b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; +#define BLK_TC_HARDBARRIER BLK_TC_BARRIER +#define BLK_TC_RAHEAD BLK_TC_AHEAD + /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) +#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* * The worker for the various blk_add_trace*() types. 
Fills out a @@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNCIO); - what |= MASK_TC_BIT(rw, AHEAD); + what |= MASK_TC_BIT(rw, HARDBARRIER); + what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); @@ -662,7 +665,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, return; if (rq->cmd_flags & REQ_DISCARD) - rw |= (1 << BIO_RW_DISCARD); + rw |= REQ_DISCARD; if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); @@ -1755,20 +1758,20 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & WRITE) rwbs[i++] = 'W'; - else if (rw & 1 << BIO_RW_DISCARD) + else if (rw & REQ_DISCARD) rwbs[i++] = 'D'; else if (bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; - if (rw & 1 << BIO_RW_AHEAD) + if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & 1 << BIO_RW_BARRIER) + if (rw & REQ_HARDBARRIER) rwbs[i++] = 'B'; - if (rw & 1 << BIO_RW_SYNCIO) + if (rw & REQ_SYNC) rwbs[i++] = 'S'; - if (rw & 1 << BIO_RW_META) + if (rw & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; @@ -1780,7 +1783,7 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq) int bytes; if (rq->cmd_flags & REQ_DISCARD) - rw |= (1 << BIO_RW_DISCARD); + rw |= REQ_DISCARD; bytes = blk_rq_bytes(rq); diff --git a/mm/page_io.c b/mm/page_io.c index 31a3b962230..2dee975bf46 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); -- cgit v1.2.3 From c1955ce32fdb0877b7a1b22feb2669358f65be76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 19 Jun 2010 23:08:06 +0200 Subject: 
writeback: remove wb_list The wb_list member of struct backing_dev_info always has exactly one element. Just use the direct bdi->wb pointer instead and simplify some code. Also remove bdi_task_init which is now trivial to prepare for the next patch. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 +-- include/linux/backing-dev.h | 5 +-- mm/backing-dev.c | 83 ++++++++++++++++----------------------------- 3 files changed, 32 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d5be1693ac9..d67989b8ba4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -73,9 +73,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * If the default thread isn't there, make sure we add it. When * it gets created and wakes up, we'll run this work. */ - if (unlikely(list_empty_careful(&bdi->wb_list))) + if (unlikely(!bdi->wb.task)) { wake_up_process(default_backing_dev_info.wb.task); - else { + } else { struct bdi_writeback *wb = &bdi->wb; if (wb->task) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e9aec0d099d..50f14614616 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -45,8 +45,6 @@ enum bdi_stat_item { #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) struct bdi_writeback { - struct list_head list; /* hangs off the bdi */ - struct backing_dev_info *bdi; /* our parent bdi */ unsigned int nr; @@ -80,8 +78,7 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ - spinlock_t wb_lock; /* protects update side of wb_list */ - struct list_head wb_list; /* the flusher threads hanging off this bdi */ + spinlock_t wb_lock; /* protects work_list */ struct list_head work_list; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 123bcef13e5..6c2a09c8922 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -65,28 +65,21 @@ static void
bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; - struct bdi_writeback *wb; + struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; struct inode *inode; - /* - * inode lock is enough here, the bdi->wb_list is protected by - * RCU on the reader side - */ nr_wb = nr_dirty = nr_io = nr_more_io = 0; spin_lock(&inode_lock); - list_for_each_entry(wb, &bdi->wb_list, list) { - nr_wb++; - list_for_each_entry(inode, &wb->b_dirty, i_list) - nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_list) - nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_list) - nr_more_io++; - } + list_for_each_entry(inode, &wb->b_dirty, i_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_list) + nr_more_io++; spin_unlock(&inode_lock); get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -98,19 +91,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" - "WritebackThreads: %8lu\n" "b_dirty: %8lu\n" "b_io: %8lu\n" "b_more_io: %8lu\n" "bdi_list: %8u\n" - "state: %8lx\n" - "wb_list: %8u\n", + "state: %8lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, - !list_empty(&bdi->bdi_list), bdi->state, - !list_empty(&bdi->wb_list)); + K(background_thresh), nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state); #undef K return 0; @@ -270,24 +260,6 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_more_io); } -static void bdi_task_init(struct backing_dev_info *bdi, - struct bdi_writeback *wb) -{ - 
struct task_struct *tsk = current; - - spin_lock(&bdi->wb_lock); - list_add_tail_rcu(&wb->list, &bdi->wb_list); - spin_unlock(&bdi->wb_lock); - - tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(tsk, 0); -} - static int bdi_start_fn(void *ptr) { struct bdi_writeback *wb = ptr; @@ -301,7 +273,13 @@ static int bdi_start_fn(void *ptr) list_add_rcu(&bdi->bdi_list, &bdi_list); spin_unlock_bh(&bdi_lock); - bdi_task_init(bdi, wb); + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); /* * Clear pending bit and wakeup anybody waiting to tear us down @@ -312,12 +290,7 @@ static int bdi_start_fn(void *ptr) ret = bdi_writeback_task(wb); - /* - * Remove us from the list - */ - spin_lock(&bdi->wb_lock); - list_del_rcu(&wb->list); - spin_unlock(&bdi->wb_lock); + wb->task = NULL; /* * Flush any work that raced with us exiting. No new work @@ -326,7 +299,6 @@ static int bdi_start_fn(void *ptr) if (!list_empty(&bdi->work_list)) wb_do_writeback(wb, 1); - wb->task = NULL; return ret; } @@ -391,7 +363,13 @@ static int bdi_forker_task(void *ptr) { struct bdi_writeback *me = ptr; - bdi_task_init(me->bdi, me); + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); for (;;) { struct backing_dev_info *bdi, *tmp; @@ -598,8 +576,6 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct bdi_writeback *wb; - if (!bdi_cap_writeback_dirty(bdi)) return; @@ -615,14 +591,14 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) bdi_remove_from_list(bdi); /* - * Finally, kill the kernel threads. We don't need to be RCU + * Finally, kill the kernel thread. 
We don't need to be RCU * safe anymore, since the bdi is gone from visibility. Force * unfreeze of the thread before calling kthread_stop(), otherwise * it would never exet if it is currently stuck in the refrigerator. */ - list_for_each_entry(wb, &bdi->wb_list, list) { - thaw_process(wb->task); - kthread_stop(wb->task); + if (bdi->wb.task) { + thaw_process(bdi->wb.task); + kthread_stop(bdi->wb.task); } } @@ -667,7 +643,6 @@ int bdi_init(struct backing_dev_info *bdi) spin_lock_init(&bdi->wb_lock); INIT_RCU_HEAD(&bdi->rcu_head); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->wb_list); INIT_LIST_HEAD(&bdi->work_list); bdi_wb_init(&bdi->wb, bdi); -- cgit v1.2.3 From 082439004b31adc146e96e5f1c574dd2b57dcd93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 19 Jun 2010 23:08:22 +0200 Subject: writeback: merge bdi_writeback_task and bdi_start_fn Move all code for the writeback thread into fs/fs-writeback.c instead of splitting it over two functions in two files. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 35 ++++++++++++++++++++++++++++++++++- include/linux/backing-dev.h | 2 +- mm/backing-dev.c | 44 +------------------------------------------- 3 files changed, 36 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d67989b8ba4..c8471b3ddcc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -775,12 +775,36 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Handle writeback of dirty data for the device backed by this bdi. Also * wakes up periodically and does kupdated style flushing. 
*/ -int bdi_writeback_task(struct bdi_writeback *wb) +int bdi_writeback_thread(void *data) { + struct bdi_writeback *wb = data; + struct backing_dev_info *bdi = wb->bdi; unsigned long last_active = jiffies; unsigned long wait_jiffies = -1UL; long pages_written; + /* + * Add us to the active bdi_list + */ + spin_lock_bh(&bdi_lock); + list_add_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); + + /* + * Clear pending bit and wakeup anybody waiting to tear us down + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); + while (!kthread_should_stop()) { pages_written = wb_do_writeback(wb, 0); @@ -813,9 +837,18 @@ int bdi_writeback_task(struct bdi_writeback *wb) try_to_freeze(); } + wb->task = NULL; + + /* + * Flush any work that raced with us exiting. No new work + * will be added, since this bdi isn't discoverable anymore. + */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); return 0; } + /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 50f14614616..e536f3a74e6 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -102,7 +102,7 @@ void bdi_unregister(struct backing_dev_info *bdi); int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); void bdi_start_background_writeback(struct backing_dev_info *bdi); -int bdi_writeback_task(struct bdi_writeback *wb); +int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_arm_supers_timer(void); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6c2a09c8922..bceac647e4d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -260,48 +260,6 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_more_io); } -static int bdi_start_fn(void *ptr) -{ - struct bdi_writeback *wb = ptr; - struct backing_dev_info *bdi = wb->bdi; - int ret; - - /* - * Add us to the active bdi_list - */ - spin_lock_bh(&bdi_lock); - list_add_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - - current->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - /* - * Clear pending bit and wakeup anybody waiting to tear us down - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); - - ret = bdi_writeback_task(wb); - - wb->task = NULL; - - /* - * Flush any work that raced with us exiting. No new work - * will be added, since this bdi isn't discoverable anymore. 
- */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); - - return ret; -} - int bdi_has_dirty_io(struct backing_dev_info *bdi) { return wb_has_dirty_io(&bdi->wb); @@ -425,7 +383,7 @@ static int bdi_forker_task(void *ptr) spin_unlock_bh(&bdi_lock); wb = &bdi->wb; - wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", + wb->task = kthread_run(bdi_writeback_thread, wb, "flush-%s", dev_name(bdi->dev)); /* * If task creation fails, then readd the bdi to -- cgit v1.2.3 From 1676effca4cd2a6b32e6e8e0ecaa91522dfda6fa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 21 Jun 2010 11:02:48 +0200 Subject: gcc-4.6: fs: fix unused but set warnings No real bugs I believe, just some dead code, and some shut up code. Signed-off-by: Andi Kleen Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 2 -- include/linux/audit.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index efdbfece993..ec11c52d646 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -597,7 +597,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; - pgoff_t index; ssize_t res; size_t this_len; int error; @@ -621,7 +620,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, goto shrink_ret; } - index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; diff --git a/include/linux/audit.h b/include/linux/audit.h index f391d45c8ae..e24afabc548 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -544,7 +544,7 @@ extern int audit_signals; #define audit_putname(n) do { ; } while (0) #define __audit_inode(n,d) do { ; } while (0) #define __audit_inode_child(i,p) do { ; } while (0) -#define audit_inode(n,d) do { ; } while (0) +#define audit_inode(n,d) do { (void)(d); } 
while (0) #define audit_inode_child(i,p) do { ; } while (0) #define audit_core_dumps(i) do { ; } while (0) #define auditsc_get_stamp(c,t,s) (0) -- cgit v1.2.3 From 455b2864686d3591b3b2f39eb46290c95f76471f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 7 Jul 2010 13:24:06 +1000 Subject: writeback: Initial tracing support Trace queue/sched/exec parts of the writeback loop. This provides insight into when and why flusher threads are scheduled to run. e.g a sync invocation leaves traces like: sync-[...]: writeback_queue: bdi 8:0: sb_dev 8:1 nr_pages=7712 sync_mode=0 kupdate=0 range_cyclic=0 background=0 flush-8:0-[...]: writeback_exec: bdi 8:0: sb_dev 8:1 nr_pages=7712 sync_mode=0 kupdate=0 range_cyclic=0 background=0 This also lays the foundation for adding more writeback tracing to provide deeper insight into the whole writeback path. The original tracing code is from Jens Axboe, though this version is a rewrite as a result of the code being traced changing significantly. Signed-off-by: Dave Chinner Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 38 +++++++++++++---- include/trace/events/writeback.h | 91 ++++++++++++++++++++++++++++++++++++++++ mm/backing-dev.c | 3 ++ 3 files changed, 124 insertions(+), 8 deletions(-) create mode 100644 include/trace/events/writeback.h (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c8471b3ddcc..73acab4dc2b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,15 +26,9 @@ #include #include #include +#include #include "internal.h" -#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) - -/* - * We don't actually have pdflush, but this one is exported though /proc... 
- */ -int nr_pdflush_threads; - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -50,6 +44,21 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure so that the definition remains local to this + * file. + */ +#define CREATE_TRACE_POINTS +#include + +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) + +/* + * We don't actually have pdflush, but this one is exported though /proc... + */ +int nr_pdflush_threads; + /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -65,6 +74,8 @@ int writeback_in_progress(struct backing_dev_info *bdi) static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { + trace_writeback_queue(bdi, work); + spin_lock(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); spin_unlock(&bdi->wb_lock); @@ -74,6 +85,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * it gets created and wakes up, we'll run this work. 
*/ if (unlikely(!bdi->wb.task)) { + trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } else { struct bdi_writeback *wb = &bdi->wb; @@ -95,8 +107,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) + if (bdi->wb.task) { + trace_writeback_nowork(bdi); wake_up_process(bdi->wb.task); + } return; } @@ -751,6 +765,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->sync_mode = WB_SYNC_ALL; + trace_writeback_exec(bdi, work); + wrote += wb_writeback(wb, work); /* @@ -805,9 +821,13 @@ int bdi_writeback_thread(void *data) smp_mb__after_clear_bit(); wake_up_bit(&bdi->state, BDI_pending); + trace_writeback_thread_start(bdi); + while (!kthread_should_stop()) { pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + if (pages_written) last_active = jiffies; else if (wait_jiffies != -1UL) { @@ -845,6 +865,8 @@ int bdi_writeback_thread(void *data) */ if (!list_empty(&bdi->work_list)) wb_do_writeback(wb, 1); + + trace_writeback_thread_stop(bdi); return 0; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h new file mode 100644 index 00000000000..562fcae10d9 --- /dev/null +++ b/include/trace/events/writeback.h @@ -0,0 +1,91 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM writeback + +#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WRITEBACK_H + +#include +#include + +struct wb_writeback_work; + +DECLARE_EVENT_CLASS(writeback_work_class, + TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), + TP_ARGS(bdi, work), + TP_STRUCT__entry( + __array(char, name, 32) + __field(long, nr_pages) + __field(dev_t, sb_dev) + __field(int, sync_mode) + __field(int, for_kupdate) + __field(int, range_cyclic) + __field(int, for_background) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + 
__entry->nr_pages = work->nr_pages; + __entry->sb_dev = work->sb ? work->sb->s_dev : 0; + __entry->sync_mode = work->sync_mode; + __entry->for_kupdate = work->for_kupdate; + __entry->range_cyclic = work->range_cyclic; + __entry->for_background = work->for_background; + ), + TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " + "kupdate=%d range_cyclic=%d background=%d", + __entry->name, + MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), + __entry->nr_pages, + __entry->sync_mode, + __entry->for_kupdate, + __entry->range_cyclic, + __entry->for_background + ) +); +#define DEFINE_WRITEBACK_WORK_EVENT(name) \ +DEFINE_EVENT(writeback_work_class, name, \ + TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ + TP_ARGS(bdi, work)) +DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); +DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); +DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); + +TRACE_EVENT(writeback_pages_written, + TP_PROTO(long pages_written), + TP_ARGS(pages_written), + TP_STRUCT__entry( + __field(long, pages) + ), + TP_fast_assign( + __entry->pages = pages_written; + ), + TP_printk("%ld", __entry->pages) +); + +DECLARE_EVENT_CLASS(writeback_class, + TP_PROTO(struct backing_dev_info *bdi), + TP_ARGS(bdi), + TP_STRUCT__entry( + __array(char, name, 32) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + ), + TP_printk("bdi %s", + __entry->name + ) +); +#define DEFINE_WRITEBACK_EVENT(name) \ +DEFINE_EVENT(writeback_class, name, \ + TP_PROTO(struct backing_dev_info *bdi), \ + TP_ARGS(bdi)) + +DEFINE_WRITEBACK_EVENT(writeback_nowork); +DEFINE_WRITEBACK_EVENT(writeback_bdi_register); +DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); +DEFINE_WRITEBACK_EVENT(writeback_thread_start); +DEFINE_WRITEBACK_EVENT(writeback_thread_stop); + +#endif /* _TRACE_WRITEBACK_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/backing-dev.c b/mm/backing-dev.c index bceac647e4d..ac78a333618 100644 --- a/mm/backing-dev.c +++ 
b/mm/backing-dev.c @@ -10,6 +10,7 @@ #include #include #include +#include static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); @@ -518,6 +519,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); + trace_writeback_bdi_register(bdi); exit: return ret; } @@ -578,6 +580,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); if (!bdi_cap_flush_forker(bdi)) -- cgit v1.2.3 From 028c2dd184c097809986684f2f0627eea5529fea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 7 Jul 2010 13:24:07 +1000 Subject: writeback: Add tracing to balance_dirty_pages Tracing high level background writeback events is good, but it doesn't give the entire picture. Add visibility into write throttling to catch IO dispatched by foreground throttling of processes dirtying lots of pages. Signed-off-by: Dave Chinner Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 5 ++++ include/trace/events/writeback.h | 64 ++++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 4 +++ 3 files changed, 73 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 73acab4dc2b..bf10cbf379d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -656,10 +656,14 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.more_io = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; + + trace_wbc_writeback_start(&wbc, wb->bdi); if (work->sb) __writeback_inodes_sb(work->sb, wb, &wbc); else writeback_inodes_wb(wb, &wbc); + trace_wbc_writeback_written(&wbc, wb->bdi); + work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; @@ -687,6 +691,7 @@ static long wb_writeback(struct bdi_writeback *wb, if (!list_empty(&wb->b_more_io)) { inode = list_entry(wb->b_more_io.prev, struct inode, i_list); +
trace_wbc_writeback_wait(&wbc, wb->bdi); inode_wait_for_writeback(inode); } spin_unlock(&inode_lock); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 562fcae10d9..0be26acae06 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -85,6 +85,70 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DEFINE_WRITEBACK_EVENT(writeback_thread_start); DEFINE_WRITEBACK_EVENT(writeback_thread_stop); +DECLARE_EVENT_CLASS(wbc_class, + TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), + TP_ARGS(wbc, bdi), + TP_STRUCT__entry( + __array(char, name, 32) + __field(long, nr_to_write) + __field(long, pages_skipped) + __field(int, sync_mode) + __field(int, nonblocking) + __field(int, encountered_congestion) + __field(int, for_kupdate) + __field(int, for_background) + __field(int, for_reclaim) + __field(int, range_cyclic) + __field(int, more_io) + __field(unsigned long, older_than_this) + __field(long, range_start) + __field(long, range_end) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->nr_to_write = wbc->nr_to_write; + __entry->pages_skipped = wbc->pages_skipped; + __entry->sync_mode = wbc->sync_mode; + __entry->for_kupdate = wbc->for_kupdate; + __entry->for_background = wbc->for_background; + __entry->for_reclaim = wbc->for_reclaim; + __entry->range_cyclic = wbc->range_cyclic; + __entry->more_io = wbc->more_io; + __entry->older_than_this = wbc->older_than_this ? 
+ *wbc->older_than_this : 0; + __entry->range_start = (long)wbc->range_start; + __entry->range_end = (long)wbc->range_end; + ), + + TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " + "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " + "start=0x%lx end=0x%lx", + __entry->name, + __entry->nr_to_write, + __entry->pages_skipped, + __entry->sync_mode, + __entry->for_kupdate, + __entry->for_background, + __entry->for_reclaim, + __entry->range_cyclic, + __entry->more_io, + __entry->older_than_this, + __entry->range_start, + __entry->range_end) +) + +#define DEFINE_WBC_EVENT(name) \ +DEFINE_EVENT(wbc_class, name, \ + TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ + TP_ARGS(wbc, bdi)) +DEFINE_WBC_EVENT(wbc_writeback_start); +DEFINE_WBC_EVENT(wbc_writeback_written); +DEFINE_WBC_EVENT(wbc_writeback_wait); +DEFINE_WBC_EVENT(wbc_balance_dirty_start); +DEFINE_WBC_EVENT(wbc_balance_dirty_written); +DEFINE_WBC_EVENT(wbc_balance_dirty_wait); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 37498ef6154..d556cd829af 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,6 +34,7 @@ #include #include #include +#include /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited @@ -535,11 +536,13 @@ static void balance_dirty_pages(struct address_space *mapping, * threshold otherwise wait until the disk writes catch * up. 
*/ + trace_wbc_balance_dirty_start(&wbc, bdi); if (bdi_nr_reclaimable > bdi_thresh) { writeback_inodes_wb(&bdi->wb, &wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + trace_wbc_balance_dirty_written(&wbc, bdi); } /* @@ -565,6 +568,7 @@ static void balance_dirty_pages(struct address_space *mapping, if (pages_written >= write_chunk) break; /* We've done our duty */ + trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_INTERRUPTIBLE); io_schedule_timeout(pause); -- cgit v1.2.3 From 6e9624b8caec290d28b4c6d9ec75749df6372b87 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 7 Aug 2010 18:25:34 +0200 Subject: block: push down BKL into .open and .release The open and release block_device_operations are currently called with the BKL held. In order to change that, we must first make sure that all drivers that currently rely on this have no regressions. This blindly pushes the BKL into all .open and .release operations for all block drivers to prepare for the next step. The drivers can subsequently replace the BKL with their own locks or remove it completely when it can be shown that it is not needed. The functions blkdev_get and blkdev_put are the only remaining users of the big kernel lock in the block layer, besides a few uses in the ioctl code, none of which need to serialize with blkdev_{get,put}. Most of these two functions is also under the protection of bdev->bd_mutex, including the actual calls to ->open and ->release, and the common code does not access any global data structures that need the BKL. 
Signed-off-by: Arnd Bergmann Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 7 ++++++- drivers/block/DAC960.c | 13 +++++++++---- drivers/block/amiflop.c | 12 ++++++++++-- drivers/block/aoe/aoeblk.c | 4 ++++ drivers/block/ataflop.c | 14 +++++++++++++- drivers/block/cciss.c | 23 ++++++++++++++++++++--- drivers/block/cpqarray.c | 22 +++++++++++++++++++--- drivers/block/drbd/drbd_main.c | 4 ++++ drivers/block/floppy.c | 5 +++++ drivers/block/loop.c | 5 +++++ drivers/block/paride/pcd.c | 10 +++++++++- drivers/block/paride/pd.c | 4 ++++ drivers/block/paride/pf.c | 20 +++++++++++++++----- drivers/block/pktcdvd.c | 5 +++++ drivers/block/swim.c | 15 ++++++++++++++- drivers/block/swim3.c | 15 ++++++++++++++- drivers/block/ub.c | 17 ++++++++++++++++- drivers/block/viodasd.c | 19 ++++++++++++++++++- drivers/block/xen-blkfront.c | 7 +++++++ drivers/block/xsysace.c | 6 ++++++ drivers/block/z2ram.c | 13 ++++++++++--- drivers/cdrom/gdrom.c | 8 +++++++- drivers/cdrom/viocd.c | 10 +++++++++- drivers/ide/ide-cd.c | 14 +++++++++----- drivers/ide/ide-gd.c | 17 ++++++++++++++++- drivers/ide/ide-tape.c | 9 ++++++++- drivers/md/dm.c | 7 +++++++ drivers/md/md.c | 6 ++++++ drivers/memstick/core/mspro_block.c | 9 ++++++++- drivers/message/i2o/i2o_block.c | 5 +++++ drivers/mmc/card/block.c | 5 +++++ drivers/mtd/mtd_blkdevs.c | 6 +++++- drivers/s390/block/dasd.c | 6 ++++++ drivers/s390/block/dcssblk.c | 5 +++++ drivers/s390/char/tape_block.c | 8 +++++++- drivers/scsi/sd.c | 5 +++++ drivers/scsi/sr.c | 7 ++++++- drivers/staging/hv/blkvsc_drv.c | 5 +++++ fs/block_dev.c | 10 ++-------- 39 files changed, 334 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index da992a3ad6b..1bcd208c459 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -33,6 +33,7 @@ #include "linux/mm.h" #include "linux/slab.h" #include "linux/vmalloc.h" +#include "linux/smp_lock.h" #include 
"linux/blkpg.h" #include "linux/genhd.h" #include "linux/spinlock.h" @@ -1098,6 +1099,7 @@ static int ubd_open(struct block_device *bdev, fmode_t mode) struct ubd *ubd_dev = disk->private_data; int err = 0; + lock_kernel(); if(ubd_dev->count == 0){ err = ubd_open_dev(ubd_dev); if(err){ @@ -1115,7 +1117,8 @@ static int ubd_open(struct block_device *bdev, fmode_t mode) if(--ubd_dev->count == 0) ubd_close_dev(ubd_dev); err = -EROFS; }*/ - out: +out: + unlock_kernel(); return err; } @@ -1123,8 +1126,10 @@ static int ubd_release(struct gendisk *disk, fmode_t mode) { struct ubd *ubd_dev = disk->private_data; + lock_kernel(); if(--ubd_dev->count == 0) ubd_close_dev(ubd_dev); + unlock_kernel(); return 0; } diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index c5f22bb0a48..4e2c367fec1 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -79,23 +79,28 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode) struct gendisk *disk = bdev->bd_disk; DAC960_Controller_T *p = disk->queue->queuedata; int drive_nr = (long)disk->private_data; + int ret = -ENXIO; + lock_kernel(); if (p->FirmwareType == DAC960_V1_Controller) { if (p->V1.LogicalDriveInformation[drive_nr]. 
LogicalDriveState == DAC960_V1_LogicalDrive_Offline) - return -ENXIO; + goto out; } else { DAC960_V2_LogicalDeviceInfo_T *i = p->V2.LogicalDeviceInformation[drive_nr]; if (!i || i->LogicalDeviceState == DAC960_V2_LogicalDevice_Offline) - return -ENXIO; + goto out; } check_disk_change(bdev); if (!get_capacity(p->disks[drive_nr])) - return -ENXIO; - return 0; + goto out; + ret = 0; +out: + unlock_kernel(); + return ret; } static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 0fa26359304..76f114f0bba 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1555,10 +1555,13 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) int old_dev; unsigned long flags; + lock_kernel(); old_dev = fd_device[drive]; - if (fd_ref[drive] && old_dev != system) + if (fd_ref[drive] && old_dev != system) { + unlock_kernel(); return -EBUSY; + } if (mode & (FMODE_READ|FMODE_WRITE)) { check_disk_change(bdev); @@ -1571,8 +1574,10 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) fd_deselect (drive); rel_fdc(); - if (wrprot) + if (wrprot) { + unlock_kernel(); return -EROFS; + } } } @@ -1589,6 +1594,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive, unit[drive].type->name, data_types[system].name); + unlock_kernel(); return 0; } @@ -1597,6 +1603,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) struct amiga_floppy_struct *p = disk->private_data; int drive = p - unit; + lock_kernel(); if (unit[drive].dirty == 1) { del_timer (flush_track_timer + drive); non_int_flush_track (drive); @@ -1610,6 +1617,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) /* the mod_use counter is handled this way */ floppy_off (drive | 0x40000000); #endif + unlock_kernel(); return 0; } diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 
65deffde60a..a946929735a 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "aoe.h" static struct kmem_cache *buf_pool_cache; @@ -124,13 +125,16 @@ aoeblk_open(struct block_device *bdev, fmode_t mode) struct aoedev *d = bdev->bd_disk->private_data; ulong flags; + lock_kernel(); spin_lock_irqsave(&d->lock, flags); if (d->flags & DEVFL_UP) { d->nopen++; spin_unlock_irqrestore(&d->lock, flags); + unlock_kernel(); return 0; } spin_unlock_irqrestore(&d->lock, flags); + unlock_kernel(); return -ENODEV; } diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 1bb8bfcfdbd..aceb9647652 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1850,22 +1850,34 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) return 0; } +static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + lock_kernel(); + ret = floppy_open(bdev, mode); + unlock_kernel(); + + return ret; +} static int floppy_release(struct gendisk *disk, fmode_t mode) { struct atari_floppy_struct *p = disk->private_data; + lock_kernel(); if (p->ref < 0) p->ref = 0; else if (!p->ref--) { printk(KERN_ERR "floppy_release with fd_ref == 0"); p->ref = 0; } + unlock_kernel(); return 0; } static const struct block_device_operations floppy_fops = { .owner = THIS_MODULE, - .open = floppy_open, + .open = floppy_unlocked_open, .release = floppy_release, .ioctl = fd_ioctl, .media_changed = check_floppy_change, diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index a6c0494dd05..665a470310a 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -178,6 +178,7 @@ static void do_cciss_request(struct request_queue *q); static irqreturn_t do_cciss_intx(int irq, void *dev_id); static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id); static int cciss_open(struct block_device *bdev, fmode_t mode); +static int cciss_unlocked_open(struct block_device *bdev, 
fmode_t mode); static int cciss_release(struct gendisk *disk, fmode_t mode); static int do_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); @@ -237,7 +238,7 @@ static int cciss_compat_ioctl(struct block_device *, fmode_t, static const struct block_device_operations cciss_fops = { .owner = THIS_MODULE, - .open = cciss_open, + .open = cciss_unlocked_open, .release = cciss_release, .ioctl = do_ioctl, .getgeo = cciss_getgeo, @@ -1042,13 +1043,28 @@ static int cciss_open(struct block_device *bdev, fmode_t mode) return 0; } +static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + lock_kernel(); + ret = cciss_open(bdev, mode); + unlock_kernel(); + + return ret; +} + /* * Close. Sync first. */ static int cciss_release(struct gendisk *disk, fmode_t mode) { - ctlr_info_t *host = get_host(disk); - drive_info_struct *drv = get_drv(disk); + ctlr_info_t *host; + drive_info_struct *drv; + + lock_kernel(); + host = get_host(disk); + drv = get_drv(disk); #ifdef CCISS_DEBUG printk(KERN_DEBUG "cciss_release %s\n", disk->disk_name); @@ -1056,6 +1072,7 @@ static int cciss_release(struct gendisk *disk, fmode_t mode) drv->usage_count--; host->usage_count--; + unlock_kernel(); return 0; } diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index c459aeea3c0..28937b66156 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -158,7 +158,7 @@ static int sendcmd( unsigned int blkcnt, unsigned int log_unit ); -static int ida_open(struct block_device *bdev, fmode_t mode); +static int ida_unlocked_open(struct block_device *bdev, fmode_t mode); static int ida_release(struct gendisk *disk, fmode_t mode); static int ida_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo); @@ -196,7 +196,7 @@ static inline ctlr_info_t *get_host(struct gendisk *disk) static const struct block_device_operations 
ida_fops = { .owner = THIS_MODULE, - .open = ida_open, + .open = ida_unlocked_open, .release = ida_release, .ioctl = ida_ioctl, .getgeo = ida_getgeo, @@ -841,13 +841,29 @@ static int ida_open(struct block_device *bdev, fmode_t mode) return 0; } +static int ida_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + lock_kernel(); + ret = ida_open(bdev, mode); + unlock_kernel(); + + return ret; +} + /* * Close. Sync first. */ static int ida_release(struct gendisk *disk, fmode_t mode) { - ctlr_info_t *host = get_host(disk); + ctlr_info_t *host; + + lock_kernel(); + host = get_host(disk); host->usage_count--; + unlock_kernel(); + return 0; } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e2ab13d99d6..d2b6764a7b1 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2604,6 +2604,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) unsigned long flags; int rv = 0; + lock_kernel(); spin_lock_irqsave(&mdev->req_lock, flags); /* to have a stable mdev->state.role * and no race with updating open_cnt */ @@ -2618,6 +2619,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) if (!rv) mdev->open_cnt++; spin_unlock_irqrestore(&mdev->req_lock, flags); + unlock_kernel(); return rv; } @@ -2625,7 +2627,9 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) static int drbd_release(struct gendisk *gd, fmode_t mode) { struct drbd_conf *mdev = gd->private_data; + lock_kernel(); mdev->open_cnt--; + unlock_kernel(); return 0; } diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 40419b066aa..3126d5122b2 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3616,6 +3616,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) { int drive = (long)disk->private_data; + lock_kernel(); mutex_lock(&open_lock); if (UDRS->fd_ref < 0) UDRS->fd_ref = 0; @@ -3626,6 +3627,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) if 
(!UDRS->fd_ref) opened_bdev[drive] = NULL; mutex_unlock(&open_lock); + unlock_kernel(); return 0; } @@ -3643,6 +3645,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) int res = -EBUSY; char *tmp; + lock_kernel(); mutex_lock(&open_lock); old_dev = UDRS->fd_device; if (opened_bdev[drive] && opened_bdev[drive] != bdev) @@ -3719,6 +3722,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) goto out; } mutex_unlock(&open_lock); + unlock_kernel(); return 0; out: if (UDRS->fd_ref < 0) @@ -3729,6 +3733,7 @@ out: opened_bdev[drive] = NULL; out2: mutex_unlock(&open_lock); + unlock_kernel(); return res; } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d285a548196..f3c636d2371 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include /* for invalidate_bdev() */ #include @@ -1408,9 +1409,11 @@ static int lo_open(struct block_device *bdev, fmode_t mode) { struct loop_device *lo = bdev->bd_disk->private_data; + lock_kernel(); mutex_lock(&lo->lo_ctl_mutex); lo->lo_refcnt++; mutex_unlock(&lo->lo_ctl_mutex); + unlock_kernel(); return 0; } @@ -1420,6 +1423,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode) struct loop_device *lo = disk->private_data; int err; + lock_kernel(); mutex_lock(&lo->lo_ctl_mutex); if (--lo->lo_refcnt) @@ -1444,6 +1448,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode) out: mutex_unlock(&lo->lo_ctl_mutex); out_unlocked: + unlock_kernel(); return 0; } diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index daba7a62a66..76f8565e1e8 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -225,13 +225,21 @@ static char *pcd_buf; /* buffer for request in progress */ static int pcd_block_open(struct block_device *bdev, fmode_t mode) { struct pcd_unit *cd = bdev->bd_disk->private_data; - return cdrom_open(&cd->info, bdev, mode); + int ret; + + lock_kernel(); + ret =
cdrom_open(&cd->info, bdev, mode); + unlock_kernel(); + + return ret; } static int pcd_block_release(struct gendisk *disk, fmode_t mode) { struct pcd_unit *cd = disk->private_data; + lock_kernel(); cdrom_release(&cd->info, mode); + unlock_kernel(); return 0; } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index c4d6ed9846c..985f0d4f1d1 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -736,12 +736,14 @@ static int pd_open(struct block_device *bdev, fmode_t mode) { struct pd_unit *disk = bdev->bd_disk->private_data; + lock_kernel(); disk->access++; if (disk->removable) { pd_special_command(disk, pd_media_check); pd_special_command(disk, pd_door_lock); } + unlock_kernel(); return 0; } @@ -783,8 +785,10 @@ static int pd_release(struct gendisk *p, fmode_t mode) { struct pd_unit *disk = p->private_data; + lock_kernel(); if (!--disk->access && disk->removable) pd_special_command(disk, pd_door_unlock); + unlock_kernel(); return 0; } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 38b4d566b81..4457b494882 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -300,20 +300,26 @@ static void __init pf_init_units(void) static int pf_open(struct block_device *bdev, fmode_t mode) { struct pf_unit *pf = bdev->bd_disk->private_data; + int ret; + lock_kernel(); pf_identify(pf); + ret = -ENODEV; if (pf->media_status == PF_NM) - return -ENODEV; + goto out; + ret = -EROFS; if ((pf->media_status == PF_RO) && (mode & FMODE_WRITE)) - return -EROFS; + goto out; + ret = 0; pf->access++; if (pf->removable) pf_lock(pf, 1); - - return 0; +out: + unlock_kernel(); + return ret; } static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo) @@ -354,14 +360,18 @@ static int pf_release(struct gendisk *disk, fmode_t mode) { struct pf_unit *pf = disk->private_data; - if (pf->access <= 0) + lock_kernel(); + if (pf->access <= 0) { + unlock_kernel(); return -EINVAL; + } pf->access--; if (!pf->access && 
pf->removable) pf_lock(pf, 0); + unlock_kernel(); return 0; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 40f1e31f42c..b1cbeb59bb7 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2383,6 +2383,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) VPRINTK(DRIVER_NAME": entering open\n"); + lock_kernel(); mutex_lock(&ctl_mutex); pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); if (!pd) { @@ -2410,6 +2411,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) } mutex_unlock(&ctl_mutex); + unlock_kernel(); return 0; out_dec: @@ -2417,6 +2419,7 @@ out_dec: out: VPRINTK(DRIVER_NAME": failed open (%d)\n", ret); mutex_unlock(&ctl_mutex); + unlock_kernel(); return ret; } @@ -2425,6 +2428,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode) struct pktcdvd_device *pd = disk->private_data; int ret = 0; + lock_kernel(); mutex_lock(&ctl_mutex); pd->refcnt--; BUG_ON(pd->refcnt < 0); @@ -2433,6 +2437,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode) pkt_release_dev(pd, flush); } mutex_unlock(&ctl_mutex); + unlock_kernel(); return ret; } diff --git a/drivers/block/swim.c b/drivers/block/swim.c index f04f74e3758..2e46815876d 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -662,11 +662,23 @@ out: return err; } +static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + lock_kernel(); + ret = floppy_open(bdev, mode); + unlock_kernel(); + + return ret; +} + static int floppy_release(struct gendisk *disk, fmode_t mode) { struct floppy_state *fs = disk->private_data; struct swim __iomem *base = fs->swd->base; + lock_kernel(); if (fs->ref_count < 0) fs->ref_count = 0; else if (fs->ref_count > 0) @@ -674,6 +686,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) if (fs->ref_count == 0) swim_motor(base, OFF); + unlock_kernel(); return 0; } @@ -754,7 +767,7 @@ static int floppy_revalidate(struct gendisk *disk) static const struct 
block_device_operations floppy_fops = { .owner = THIS_MODULE, - .open = floppy_open, + .open = floppy_unlocked_open, .release = floppy_release, .ioctl = floppy_ioctl, .getgeo = floppy_getgeo, diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index f3657b2a538..cc6a3864822 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -949,15 +949,28 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) return 0; } +static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + lock_kernel(); + ret = floppy_open(bdev, mode); + unlock_kernel(); + + return ret; +} + static int floppy_release(struct gendisk *disk, fmode_t mode) { struct floppy_state *fs = disk->private_data; struct swim3 __iomem *sw = fs->swim3; + lock_kernel(); if (fs->ref_count > 0 && --fs->ref_count == 0) { swim3_action(fs, MOTOR_OFF); out_8(&sw->control_bic, 0xff); swim3_select(fs, RELAX); } + unlock_kernel(); return 0; } @@ -1008,7 +1021,7 @@ static int floppy_revalidate(struct gendisk *disk) } static const struct block_device_operations floppy_fops = { - .open = floppy_open, + .open = floppy_unlocked_open, .release = floppy_release, .ioctl = floppy_ioctl, .media_changed = floppy_check_change, diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 102ed52d0e0..c48e1487858 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -1711,6 +1711,18 @@ err_open: return rc; } +static int ub_bd_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + lock_kernel(); + ret = ub_bd_open(bdev, mode); + unlock_kernel(); + + return ret; +} + + /* */ static int ub_bd_release(struct gendisk *disk, fmode_t mode) @@ -1718,7 +1730,10 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode) struct ub_lun *lun = disk->private_data; struct ub_dev *sc = lun->udev; + lock_kernel(); ub_put(sc); + unlock_kernel(); + return 0; } @@ -1798,7 +1813,7 @@ static int ub_bd_media_changed(struct gendisk *disk) static const struct 
block_device_operations ub_bd_fops = { .owner = THIS_MODULE, - .open = ub_bd_open, + .open = ub_bd_unlocked_open, .release = ub_bd_release, .ioctl = ub_bd_ioctl, .media_changed = ub_bd_media_changed, diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index 5663d3c284c..f651e51a331 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -175,6 +176,18 @@ static int viodasd_open(struct block_device *bdev, fmode_t mode) return 0; } +static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + lock_kernel(); + ret = viodasd_open(bdev, mode); + unlock_kernel(); + + return ret; +} + + /* * External release entry point. */ @@ -183,6 +196,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode) struct viodasd_device *d = disk->private_data; HvLpEvent_Rc hvrc; + lock_kernel(); /* Send the event to OS/400. We DON'T expect a response */ hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, HvLpEvent_Type_VirtualIo, @@ -195,6 +209,9 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode) 0, 0, 0); if (hvrc != 0) pr_warning("HV close call failed %d\n", (int)hvrc); + + unlock_kernel(); + return 0; } @@ -219,7 +236,7 @@ static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) */ static const struct block_device_operations viodasd_fops = { .owner = THIS_MODULE, - .open = viodasd_open, + .open = viodasd_unlocked_open, .release = viodasd_release, .getgeo = viodasd_getgeo, }; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9119cd3d56a..91374282755 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -1018,13 +1019,18 @@ static int blkfront_is_ready(struct xenbus_device *dev) static int blkif_open(struct block_device *bdev, fmode_t mode) { struct blkfront_info *info = 
bdev->bd_disk->private_data; + + lock_kernel(); info->users++; + unlock_kernel(); + return 0; } static int blkif_release(struct gendisk *disk, fmode_t mode) { struct blkfront_info *info = disk->private_data; + lock_kernel(); info->users--; if (info->users == 0) { /* Check whether we have been instructed to close. We will @@ -1036,6 +1042,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode) if (state == XenbusStateClosing && info->is_ready) blkfront_closing(dev); } + unlock_kernel(); return 0; } diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index ac278ac908d..b71888b909a 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include @@ -901,11 +902,14 @@ static int ace_open(struct block_device *bdev, fmode_t mode) dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1); + lock_kernel(); spin_lock_irqsave(&ace->lock, flags); ace->users++; spin_unlock_irqrestore(&ace->lock, flags); check_disk_change(bdev); + unlock_kernel(); + return 0; } @@ -917,6 +921,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode) dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1); + lock_kernel(); spin_lock_irqsave(&ace->lock, flags); ace->users--; if (ace->users == 0) { @@ -924,6 +929,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode) ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ); } spin_unlock_irqrestore(&ace->lock, flags); + unlock_kernel(); return 0; } diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 9114654b54d..d75b2bb601a 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -153,6 +154,7 @@ static int z2_open(struct block_device *bdev, fmode_t mode) device = MINOR(bdev->bd_dev); + lock_kernel(); if ( current_device != -1 && current_device != device ) { rc = -EBUSY; @@ -294,20 +296,25 @@ static int z2_open(struct block_device *bdev, 
fmode_t mode) set_capacity(z2ram_gendisk, z2ram_size >> 9); } + unlock_kernel(); return 0; err_out_kfree: kfree(z2ram_map); err_out: + unlock_kernel(); return rc; } static int z2_release(struct gendisk *disk, fmode_t mode) { - if ( current_device == -1 ) - return 0; - + lock_kernel(); + if ( current_device == -1 ) { + unlock_kernel(); + return 0; + } + unlock_kernel(); /* * FIXME: unmap memory */ diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 1772fd914fb..261107d1457 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -493,12 +493,18 @@ static struct cdrom_device_ops gdrom_ops = { static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode) { - return cdrom_open(gd.cd_info, bdev, mode); + int ret; + lock_kernel(); + ret = cdrom_open(gd.cd_info, bdev, mode); + unlock_kernel(); + return ret; } static int gdrom_bdops_release(struct gendisk *disk, fmode_t mode) { + lock_kernel(); cdrom_release(gd.cd_info, mode); + unlock_kernel(); return 0; } diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index 16dada0627e..56bf9f44700 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -154,13 +154,21 @@ static const struct file_operations proc_viocd_operations = { static int viocd_blk_open(struct block_device *bdev, fmode_t mode) { struct disk_info *di = bdev->bd_disk->private_data; - return cdrom_open(&di->viocd_info, bdev, mode); + int ret; + + lock_kernel(); + ret = cdrom_open(&di->viocd_info, bdev, mode); + unlock_kernel(); + + return ret; } static int viocd_blk_release(struct gendisk *disk, fmode_t mode) { struct disk_info *di = disk->private_data; + lock_kernel(); cdrom_release(&di->viocd_info, mode); + unlock_kernel(); return 0; } diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index bf9f61a5c2f..5108e9739c9 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1591,17 +1591,19 @@ static struct ide_driver ide_cdrom_driver = { static int idecd_open(struct block_device *bdev, fmode_t mode) { - 
struct cdrom_info *info = ide_cd_get(bdev->bd_disk); - int rc = -ENOMEM; + struct cdrom_info *info; + int rc = -ENXIO; + lock_kernel(); + info = ide_cd_get(bdev->bd_disk); if (!info) - return -ENXIO; + goto out; rc = cdrom_open(&info->devinfo, bdev, mode); - if (rc < 0) ide_cd_put(info); - +out: + unlock_kernel(); return rc; } @@ -1609,9 +1611,11 @@ static int idecd_release(struct gendisk *disk, fmode_t mode) { struct cdrom_info *info = ide_drv_g(disk, cdrom_info); + lock_kernel(); cdrom_release(&info->devinfo, mode); ide_cd_put(info); + unlock_kernel(); return 0; } diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index 883f0c979c9..137337a795a 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -237,6 +238,18 @@ out_put_idkp: return ret; } +static int ide_gd_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + lock_kernel(); + ret = ide_gd_open(bdev, mode); + unlock_kernel(); + + return ret; +} + + static int ide_gd_release(struct gendisk *disk, fmode_t mode) { struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); @@ -244,6 +257,7 @@ static int ide_gd_release(struct gendisk *disk, fmode_t mode) ide_debug_log(IDE_DBG_FUNC, "enter"); + lock_kernel(); if (idkp->openers == 1) drive->disk_ops->flush(drive); @@ -255,6 +269,7 @@ static int ide_gd_release(struct gendisk *disk, fmode_t mode) idkp->openers--; ide_disk_put(idkp); + unlock_kernel(); return 0; } @@ -321,7 +336,7 @@ static int ide_gd_ioctl(struct block_device *bdev, fmode_t mode, static const struct block_device_operations ide_gd_ops = { .owner = THIS_MODULE, - .open = ide_gd_open, + .open = ide_gd_unlocked_open, .release = ide_gd_release, .ioctl = ide_gd_ioctl, .getgeo = ide_gd_getgeo, diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 39b0a5c45f0..6d622cb5ac8 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1907,7 +1907,11 @@ static const struct file_operations idetape_fops = { 
static int idetape_open(struct block_device *bdev, fmode_t mode) { - struct ide_tape_obj *tape = ide_tape_get(bdev->bd_disk, false, 0); + struct ide_tape_obj *tape; + + lock_kernel(); + tape = ide_tape_get(bdev->bd_disk, false, 0); + unlock_kernel(); if (!tape) return -ENXIO; @@ -1919,7 +1923,10 @@ static int idetape_release(struct gendisk *disk, fmode_t mode) { struct ide_tape_obj *tape = ide_drv_g(disk, ide_tape_obj); + lock_kernel(); ide_tape_put(tape); + unlock_kernel(); + return 0; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d505a96845c..a3f21dc02bd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -338,6 +339,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; + lock_kernel(); spin_lock(&_minor_lock); md = bdev->bd_disk->private_data; @@ -355,6 +357,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) out: spin_unlock(&_minor_lock); + unlock_kernel(); return md ? 0 : -ENXIO; } @@ -362,8 +365,12 @@ out: static int dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; + + lock_kernel(); atomic_dec(&md->open_count); dm_put(md); + unlock_kernel(); + return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 1893af67877..700c96edf9b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -36,6 +36,7 @@ #include #include #include +#include #include /* for invalidate_bdev */ #include #include @@ -5902,6 +5903,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) mddev_t *mddev = mddev_find(bdev->bd_dev); int err; + lock_kernel(); if (mddev->gendisk != bdev->bd_disk) { /* we are racing with mddev_put which is discarding this * bd_disk. 
@@ -5910,6 +5912,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) /* Wait until bdev->bd_disk is definitely gone */ flush_scheduled_work(); /* Then retry the open from the top */ + unlock_kernel(); return -ERESTARTSYS; } BUG_ON(mddev != bdev->bd_disk->private_data); @@ -5923,6 +5926,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) check_disk_size_change(mddev->gendisk, bdev); out: + unlock_kernel(); return err; } @@ -5931,8 +5935,10 @@ static int md_release(struct gendisk *disk, fmode_t mode) mddev_t *mddev = disk->private_data; BUG_ON(!mddev); + lock_kernel(); atomic_dec(&mddev->openers); mddev_put(mddev); + unlock_kernel(); return 0; } diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 56645408d22..eef78a068fd 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #define DRIVER_NAME "mspro_block" @@ -179,6 +180,7 @@ static int mspro_block_bd_open(struct block_device *bdev, fmode_t mode) struct mspro_block_data *msb = disk->private_data; int rc = -ENXIO; + lock_kernel(); mutex_lock(&mspro_block_disk_lock); if (msb && msb->card) { @@ -190,6 +192,7 @@ static int mspro_block_bd_open(struct block_device *bdev, fmode_t mode) } mutex_unlock(&mspro_block_disk_lock); + unlock_kernel(); return rc; } @@ -221,7 +224,11 @@ static int mspro_block_disk_release(struct gendisk *disk) static int mspro_block_bd_release(struct gendisk *disk, fmode_t mode) { - return mspro_block_disk_release(disk); + int ret; + lock_kernel(); + ret = mspro_block_disk_release(disk); + unlock_kernel(); + return ret; } static int mspro_block_bd_getgeo(struct block_device *bdev, diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index d1bdf8abe5d..a5bc3ee0d93 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -53,6 +53,7 @@ #include #include #include +#include #include 
@@ -577,6 +578,7 @@ static int i2o_block_open(struct block_device *bdev, fmode_t mode) if (!dev->i2o_dev) return -ENODEV; + lock_kernel(); if (dev->power > 0x1f) i2o_block_device_power(dev, 0x02); @@ -585,6 +587,7 @@ static int i2o_block_open(struct block_device *bdev, fmode_t mode) i2o_block_device_lock(dev->i2o_dev, -1); osm_debug("Ready.\n"); + unlock_kernel(); return 0; }; @@ -615,6 +618,7 @@ static int i2o_block_release(struct gendisk *disk, fmode_t mode) if (!dev->i2o_dev) return 0; + lock_kernel(); i2o_block_device_flush(dev->i2o_dev); i2o_block_device_unlock(dev->i2o_dev, -1); @@ -625,6 +629,7 @@ static int i2o_block_release(struct gendisk *disk, fmode_t mode) operation = 0x24; i2o_block_device_power(dev, operation); + unlock_kernel(); return 0; } diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index cb9fbc83b09..8433cde29c8 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -107,6 +108,7 @@ static int mmc_blk_open(struct block_device *bdev, fmode_t mode) struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk); int ret = -ENXIO; + lock_kernel(); if (md) { if (md->usage == 2) check_disk_change(bdev); @@ -117,6 +119,7 @@ static int mmc_blk_open(struct block_device *bdev, fmode_t mode) ret = -EROFS; } } + unlock_kernel(); return ret; } @@ -125,7 +128,9 @@ static int mmc_blk_release(struct gendisk *disk, fmode_t mode) { struct mmc_blk_data *md = disk->private_data; + lock_kernel(); mmc_blk_put(md); + unlock_kernel(); return 0; } diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 8c83b11a77d..5ca80aee2ed 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -165,8 +165,9 @@ static int blktrans_open(struct block_device *bdev, fmode_t mode) int ret; if (!dev) - return -ERESTARTSYS; + return -ERESTARTSYS; /* FIXME: busy loop! 
-arnd*/ + lock_kernel(); mutex_lock(&dev->lock); if (!dev->mtd) { @@ -183,6 +184,7 @@ static int blktrans_open(struct block_device *bdev, fmode_t mode) unlock: mutex_unlock(&dev->lock); blktrans_dev_put(dev); + unlock_kernel(); return ret; } @@ -194,6 +196,7 @@ static int blktrans_release(struct gendisk *disk, fmode_t mode) if (!dev) return ret; + lock_kernel(); mutex_lock(&dev->lock); /* Release one reference, we sure its not the last one here*/ @@ -206,6 +209,7 @@ static int blktrans_release(struct gendisk *disk, fmode_t mode) unlock: mutex_unlock(&dev->lock); blktrans_dev_put(dev); + unlock_kernel(); return ret; } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 17b033d0e05..1a84fae155e 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -2235,6 +2236,7 @@ static int dasd_open(struct block_device *bdev, fmode_t mode) if (!block) return -ENODEV; + lock_kernel(); base = block->base; atomic_inc(&block->open_count); if (test_bit(DASD_FLAG_OFFLINE, &base->flags)) { @@ -2269,12 +2271,14 @@ static int dasd_open(struct block_device *bdev, fmode_t mode) goto out; } + unlock_kernel(); return 0; out: module_put(base->discipline->owner); unlock: atomic_dec(&block->open_count); + unlock_kernel(); return rc; } @@ -2282,8 +2286,10 @@ static int dasd_release(struct gendisk *disk, fmode_t mode) { struct dasd_block *block = disk->private_data; + lock_kernel(); atomic_dec(&block->open_count); module_put(block->base->discipline->owner); + unlock_kernel(); return 0; } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 9b43ae94beb..2bd72aa34c5 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -775,6 +776,7 @@ dcssblk_open(struct block_device *bdev, fmode_t mode) struct dcssblk_dev_info *dev_info; int rc; + lock_kernel(); dev_info 
= bdev->bd_disk->private_data; if (NULL == dev_info) { rc = -ENODEV; @@ -784,6 +786,7 @@ dcssblk_open(struct block_device *bdev, fmode_t mode) bdev->bd_block_size = 4096; rc = 0; out: + unlock_kernel(); return rc; } @@ -794,6 +797,7 @@ dcssblk_release(struct gendisk *disk, fmode_t mode) struct segment_info *entry; int rc; + lock_kernel(); if (!dev_info) { rc = -ENODEV; goto out; @@ -811,6 +815,7 @@ dcssblk_release(struct gendisk *disk, fmode_t mode) up_write(&dcssblk_devices_sem); rc = 0; out: + unlock_kernel(); return rc; } diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 097da8ce6be..b7de02525ec 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -361,6 +362,7 @@ tapeblock_open(struct block_device *bdev, fmode_t mode) struct tape_device * device; int rc; + lock_kernel(); device = tape_get_device(disk->private_data); if (device->required_tapemarks) { @@ -384,12 +386,14 @@ tapeblock_open(struct block_device *bdev, fmode_t mode) * is called. 
*/ tape_state_set(device, TS_BLKUSE); + unlock_kernel(); return 0; release: tape_release(device); put_device: tape_put_device(device); + unlock_kernel(); return rc; } @@ -403,10 +407,12 @@ static int tapeblock_release(struct gendisk *disk, fmode_t mode) { struct tape_device *device = disk->private_data; - + + lock_kernel(); tape_state_set(device, TS_IN_USE); tape_release(device); tape_put_device(device); + unlock_kernel(); return 0; } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 633ac32b25c..01680c7c850 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -795,6 +795,7 @@ static int sd_open(struct block_device *bdev, fmode_t mode) SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n")); + lock_kernel(); sdev = sdkp->device; /* @@ -838,10 +839,12 @@ static int sd_open(struct block_device *bdev, fmode_t mode) scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); } + unlock_kernel(); return 0; error_out: scsi_disk_put(sdkp); + unlock_kernel(); return retval; } @@ -863,6 +866,7 @@ static int sd_release(struct gendisk *disk, fmode_t mode) SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n")); + lock_kernel(); if (!--sdkp->openers && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); @@ -873,6 +877,7 @@ static int sd_release(struct gendisk *disk, fmode_t mode) * XXX is followed by a "rmmod sd_mod"? 
*/ scsi_disk_put(sdkp); + unlock_kernel(); return 0; } diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index d42fa6468f4..ba9c3e0387c 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -467,22 +467,27 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) static int sr_block_open(struct block_device *bdev, fmode_t mode) { - struct scsi_cd *cd = scsi_cd_get(bdev->bd_disk); + struct scsi_cd *cd; int ret = -ENXIO; + lock_kernel(); + cd = scsi_cd_get(bdev->bd_disk); if (cd) { ret = cdrom_open(&cd->cdi, bdev, mode); if (ret) scsi_cd_put(cd); } + unlock_kernel(); return ret; } static int sr_block_release(struct gendisk *disk, fmode_t mode) { struct scsi_cd *cd = scsi_cd(disk); + lock_kernel(); cdrom_release(&cd->cdi, mode); scsi_cd_put(cd); + unlock_kernel(); return 0; } diff --git a/drivers/staging/hv/blkvsc_drv.c b/drivers/staging/hv/blkvsc_drv.c index a9aff90e58e..87a11c9293e 100644 --- a/drivers/staging/hv/blkvsc_drv.c +++ b/drivers/staging/hv/blkvsc_drv.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1326,6 +1327,7 @@ static int blkvsc_open(struct block_device *bdev, fmode_t mode) DPRINT_DBG(BLKVSC_DRV, "- users %d disk %s\n", blkdev->users, blkdev->gd->disk_name); + lock_kernel(); spin_lock(&blkdev->lock); if (!blkdev->users && blkdev->device_type == DVD_TYPE) { @@ -1337,6 +1339,7 @@ static int blkvsc_open(struct block_device *bdev, fmode_t mode) blkdev->users++; spin_unlock(&blkdev->lock); + unlock_kernel(); return 0; } @@ -1347,6 +1350,7 @@ static int blkvsc_release(struct gendisk *disk, fmode_t mode) DPRINT_DBG(BLKVSC_DRV, "- users %d disk %s\n", blkdev->users, blkdev->gd->disk_name); + lock_kernel(); spin_lock(&blkdev->lock); if (blkdev->users == 1) { spin_unlock(&blkdev->lock); @@ -1357,6 +1361,7 @@ static int blkvsc_release(struct gendisk *disk, fmode_t mode) blkdev->users--; spin_unlock(&blkdev->lock); + unlock_kernel(); return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 
99d6af81174..693c2bf5d65 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1345,13 +1345,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) return ret; } - lock_kernel(); restart: ret = -ENXIO; disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) - goto out_unlock_kernel; + goto out; mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { @@ -1431,7 +1430,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (for_part) bdev->bd_part_count++; mutex_unlock(&bdev->bd_mutex); - unlock_kernel(); return 0; out_clear: @@ -1444,9 +1442,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_contains = NULL; out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); - out_unlock_kernel: - unlock_kernel(); - + out: if (disk) module_put(disk->fops->owner); put_disk(disk); @@ -1515,7 +1511,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) struct block_device *victim = NULL; mutex_lock_nested(&bdev->bd_mutex, for_part); - lock_kernel(); if (for_part) bdev->bd_part_count--; @@ -1540,7 +1535,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) victim = bdev->bd_contains; bdev->bd_contains = NULL; } - unlock_kernel(); mutex_unlock(&bdev->bd_mutex); bdput(bdev); if (victim) -- cgit v1.2.3 From 6965031d331a642e31278fa1b5bd47f372ffdd5d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 3 Aug 2010 12:48:50 +0200 Subject: splice: fix misuse of SPLICE_F_NONBLOCK SPLICE_F_NONBLOCK is clearly documented to only affect blocking on the pipe. In __generic_file_splice_read(), however, it causes an EAGAIN if the page is currently being read. This makes it impossible to write an application that only wants failure if the pipe is full. For example if the same process is handling both ends of a pipe and isn't otherwise able to determine whether a splice to the pipe will fill it or not. 
We could make the read non-blocking on O_NONBLOCK or some other splice flag, but for now this is the simplest fix. Signed-off-by: Miklos Szeredi CC: stable@kernel.org Signed-off-by: Jens Axboe --- fs/splice.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index ec11c52d646..8f1dfaecc8f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -399,17 +399,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { - /* - * If in nonblock mode then dont block on waiting - * for an in-flight io page - */ - if (flags & SPLICE_F_NONBLOCK) { - if (!trylock_page(page)) { - error = -EAGAIN; - break; - } - } else - lock_page(page); + lock_page(page); /* * Page was truncated, or invalidated by the -- cgit v1.2.3 From 08852b6d6c40f387f2b75e199e2ca1df68970f4c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 3 Aug 2010 12:51:16 +0200 Subject: writeback: remove wb in get_next_work_item 83ba7b07 cleans up the writeback. So we don't use wb any more in get_next_work_item. Let's remove unnecessary argument. CC: Christoph Hellwig Signed-off-by: Minchan Kim Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index bf10cbf379d..261570deb22 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -704,7 +704,7 @@ static long wb_writeback(struct bdi_writeback *wb, * Return the next wb_writeback_work struct that hasn't been processed yet. 
*/ static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) +get_next_work_item(struct backing_dev_info *bdi) { struct wb_writeback_work *work = NULL; @@ -762,7 +762,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) struct wb_writeback_work *work; long wrote = 0; - while ((work = get_next_work_item(bdi, wb)) != NULL) { + while ((work = get_next_work_item(bdi)) != NULL) { /* * Override sync mode, in case we must wait for completion * because this thread is exiting now. -- cgit v1.2.3 From 4aeefdc69f7b6f3f287e6fd8d4b213953b9e92d8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Aug 2010 13:22:51 +0200 Subject: coda: fixup clash with block layer REQ_* defines CODA should not be using defines in the global name space of that nature, prefix them with CODA_. Signed-off-by: Jens Axboe --- fs/coda/psdev.c | 12 ++++++------ fs/coda/upcall.c | 12 ++++++------ include/linux/coda_psdev.h | 8 ++++---- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 66b9cf79c5b..de89645777c 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -177,7 +177,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, nbytes = req->uc_outSize; /* don't have more space! 
*/ } if (copy_from_user(req->uc_data, buf, nbytes)) { - req->uc_flags |= REQ_ABORT; + req->uc_flags |= CODA_REQ_ABORT; wake_up(&req->uc_sleep); retval = -EFAULT; goto out; @@ -254,8 +254,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, retval = -EFAULT; /* If request was not a signal, enqueue and don't free */ - if (!(req->uc_flags & REQ_ASYNC)) { - req->uc_flags |= REQ_READ; + if (!(req->uc_flags & CODA_REQ_ASYNC)) { + req->uc_flags |= CODA_REQ_READ; list_add_tail(&(req->uc_chain), &vcp->vc_processing); goto out; } @@ -315,19 +315,19 @@ static int coda_psdev_release(struct inode * inode, struct file * file) list_del(&req->uc_chain); /* Async requests need to be freed here */ - if (req->uc_flags & REQ_ASYNC) { + if (req->uc_flags & CODA_REQ_ASYNC) { CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); kfree(req); continue; } - req->uc_flags |= REQ_ABORT; + req->uc_flags |= CODA_REQ_ABORT; wake_up(&req->uc_sleep); } list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) { list_del(&req->uc_chain); - req->uc_flags |= REQ_ABORT; + req->uc_flags |= CODA_REQ_ABORT; wake_up(&req->uc_sleep); } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index f09c5ed76f6..b8893ab6f9e 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -604,7 +604,7 @@ static void coda_unblock_signals(sigset_t *old) (((r)->uc_opcode != CODA_CLOSE && \ (r)->uc_opcode != CODA_STORE && \ (r)->uc_opcode != CODA_RELEASE) || \ - (r)->uc_flags & REQ_READ)) + (r)->uc_flags & CODA_REQ_READ)) static inline void coda_waitfor_upcall(struct upc_req *req) { @@ -624,7 +624,7 @@ static inline void coda_waitfor_upcall(struct upc_req *req) set_current_state(TASK_UNINTERRUPTIBLE); /* got a reply */ - if (req->uc_flags & (REQ_WRITE | REQ_ABORT)) + if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT)) break; if (blocked && time_after(jiffies, timeout) && @@ -708,7 +708,7 @@ static int coda_upcall(struct venus_comm *vcp, coda_waitfor_upcall(req); /* Op went through, interrupt or 
not... */ - if (req->uc_flags & REQ_WRITE) { + if (req->uc_flags & CODA_REQ_WRITE) { out = (union outputArgs *)req->uc_data; /* here we map positive Venus errors to kernel errors */ error = -out->oh.result; @@ -717,13 +717,13 @@ static int coda_upcall(struct venus_comm *vcp, } error = -EINTR; - if ((req->uc_flags & REQ_ABORT) || !signal_pending(current)) { + if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) { printk(KERN_WARNING "coda: Unexpected interruption.\n"); goto exit; } /* Interrupted before venus read it. */ - if (!(req->uc_flags & REQ_READ)) + if (!(req->uc_flags & CODA_REQ_READ)) goto exit; /* Venus saw the upcall, make sure we can send interrupt signal */ @@ -747,7 +747,7 @@ static int coda_upcall(struct venus_comm *vcp, sig_inputArgs->ih.opcode = CODA_SIGNAL; sig_inputArgs->ih.unique = req->uc_unique; - sig_req->uc_flags = REQ_ASYNC; + sig_req->uc_flags = CODA_REQ_ASYNC; sig_req->uc_opcode = sig_inputArgs->ih.opcode; sig_req->uc_unique = sig_inputArgs->ih.unique; sig_req->uc_inSize = sizeof(struct coda_in_hdr); diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 8859e2ede9f..284b520934a 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -86,9 +86,9 @@ struct upc_req { wait_queue_head_t uc_sleep; /* process' wait queue */ }; -#define REQ_ASYNC 0x1 -#define REQ_READ 0x2 -#define REQ_WRITE 0x4 -#define REQ_ABORT 0x8 +#define CODA_REQ_ASYNC 0x1 +#define CODA_REQ_READ 0x2 +#define CODA_REQ_WRITE 0x4 +#define CODA_REQ_ABORT 0x8 #endif -- cgit v1.2.3 From 6f904ff0e39ea88f81eb77e8dfb4e1238492f0a8 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:11 +0300 Subject: writeback: harmonize writeback threads naming The write-back code mixes words "thread" and "task" for the same things. This is not a big deal, but still an inconsistency. 
hch: a convention I tend to use and I've seen in various places is to always use _task for the storage of the task_struct pointer, and thread everywhere else. This especially helps with having foo_thread for the actual thread and foo_task for a global variable keeping the task_struct pointer This patch renames: * 'bdi_add_default_flusher_task()' -> 'bdi_add_default_flusher_thread()' * 'bdi_forker_task()' -> 'bdi_forker_thread()' because bdi threads are 'bdi_writeback_thread()', so these names are more consistent. This patch also amends commentaries and makes them refer the forker and bdi threads as "thread", not "task". Also, while on it, make 'bdi_add_default_flusher_thread()' declaration use 'static void' instead of 'void static' and make checkpatch.pl happy. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- include/linux/backing-dev.h | 2 +- mm/backing-dev.c | 26 +++++++++++++------------- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 261570deb22..002be0ff2ab 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -840,7 +840,7 @@ int bdi_writeback_thread(void *data) /* * Longest period of inactivity that we tolerate. If we - * see dirty data again later, the task will get + * see dirty data again later, the thread will get * recreated automatically. 
*/ max_idle = max(5UL * 60 * HZ, wait_jiffies); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e536f3a74e6..f0936f5f85d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -50,7 +50,7 @@ struct bdi_writeback { unsigned long last_old_flush; /* last old data flush */ - struct task_struct *task; /* writeback task */ + struct task_struct *task; /* writeback thread */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ac78a333618..4e9ed2a8521 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -50,7 +50,7 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); -static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); +static void bdi_add_default_flusher_thread(struct backing_dev_info *bdi); #ifdef CONFIG_DEBUG_FS #include @@ -279,10 +279,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi) } /* - * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be * to implement sync_supers_bdi() or similar and simply do it from the - * bdi writeback tasks individually. + * bdi writeback thread individually. 
*/ static int bdi_sync_supers(void *unused) { @@ -318,7 +318,7 @@ static void sync_supers_timer_fn(unsigned long unused) bdi_arm_supers_timer(); } -static int bdi_forker_task(void *ptr) +static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; @@ -354,7 +354,7 @@ static int bdi_forker_task(void *ptr) !bdi_has_dirty_io(bdi)) continue; - bdi_add_default_flusher_task(bdi); + bdi_add_default_flusher_thread(bdi); } set_current_state(TASK_INTERRUPTIBLE); @@ -376,7 +376,7 @@ static int bdi_forker_task(void *ptr) /* * This is our real job - check for pending entries in - * bdi_pending_list, and create the tasks that got added + * bdi_pending_list, and create the threads that got added */ bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, bdi_list); @@ -387,7 +387,7 @@ static int bdi_forker_task(void *ptr) wb->task = kthread_run(bdi_writeback_thread, wb, "flush-%s", dev_name(bdi->dev)); /* - * If task creation fails, then readd the bdi to + * If thread creation fails, then readd the bdi to * the pending list and force writeout of the bdi * from this forker thread. That will free some memory * and we can try again. @@ -430,10 +430,10 @@ static void bdi_add_to_pending(struct rcu_head *head) } /* - * Add the default flusher task that gets created for any bdi + * Add the default flusher thread that gets created for any bdi * that has dirty data pending writeout */ -void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) +static void bdi_add_default_flusher_thread(struct backing_dev_info *bdi) { if (!bdi_cap_writeback_dirty(bdi)) return; @@ -445,10 +445,10 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) } /* - * Check with the helper whether to proceed adding a task. Will only + * Check with the helper whether to proceed adding a thread. 
Will only * abort if we two or more simultanous calls to - * bdi_add_default_flusher_task() occured, further additions will block - * waiting for previous additions to finish. + * bdi_add_default_flusher_thread() occured, further additions will + * block waiting for previous additions to finish. */ if (!test_and_set_bit(BDI_pending, &bdi->state)) { list_del_rcu(&bdi->bdi_list); @@ -506,7 +506,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, if (bdi_cap_flush_forker(bdi)) { struct bdi_writeback *wb = &bdi->wb; - wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", dev_name(dev)); if (IS_ERR(wb->task)) { wb->task = NULL; -- cgit v1.2.3 From 297252c81de8043ca6c36e5984c24fdb5aab9013 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:15 +0300 Subject: writeback: do not lose wake-ups in bdi threads Currently, bdi threads ('bdi_writeback_thread()') can lose wake-ups. For example, if 'bdi_queue_work()' is executed after the bdi thread have had finished 'wb_do_writeback()' but before it called 'schedule_timeout_interruptible()'. To fix this issue, we have to check whether we have works to process after we have changed the task state to 'TASK_INTERRUPTIBLE'. This patch also clean-ups handling of the cases when 'dirty_writeback_interval' is zero or non-zero. Additionally, this patch also removes unneeded 'list_empty_careful()' call. 
Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 002be0ff2ab..05444eaa3f3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -848,17 +848,18 @@ int bdi_writeback_thread(void *data) break; } - if (dirty_writeback_interval) { - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); - } else { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty_careful(&wb->bdi->work_list) && - !kthread_should_stop()) - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&bdi->work_list)) { __set_current_state(TASK_RUNNING); + continue; } + if (dirty_writeback_interval) { + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout(wait_jiffies); + } else + schedule(); + try_to_freeze(); } -- cgit v1.2.3 From 78c40cb6581a74adc48821f3de6b864a54d4c34d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:17 +0300 Subject: writeback: do not remove bdi from bdi_list The forker thread removes bdis from 'bdi_list' before forking the bdi thread. But this is wrong for at least 2 reasons. Reason #1: if we temporary remove a bdi from the list, we may miss works which would otherwise be given to us. Reason #2: this is racy; indeed, 'bdi_wb_shutdown()' expects that bdis are always in the 'bdi_list' (see 'bdi_remove_from_list()'), and when it races with the forker thread, it can shut down the bdi thread at the same time as the forker creates it. This patch makes sure the forker thread never removes bdis from 'bdi_list' (which was suggested by Christoph Hellwig). In order to make sure that we do not race with 'bdi_wb_shutdown()', we have to hold the 'bdi_lock' while walking the 'bdi_list' and setting the 'BDI_pending' flag. NOTE! The error path is interesting. 
Currently, when we fail to create a bdi thread, we move the bdi to the tail of 'bdi_list'. But if we never remove the bdi from the list, we cannot move it to the tail either, because then we can mess up the RCU readers which walk the list. And also, we'll have the race described above in "Reason #2". But I do not think that adding to the tail is important, so I just do not do that. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 7 ------- mm/backing-dev.c | 31 ++++++++++--------------------- 2 files changed, 10 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05444eaa3f3..57fbfd0ebc5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -804,13 +804,6 @@ int bdi_writeback_thread(void *data) unsigned long wait_jiffies = -1UL; long pages_written; - /* - * Add us to the active bdi_list - */ - spin_lock_bh(&bdi_lock); - list_add_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - current->flags |= PF_FLUSHER | PF_SWAPWRITE; set_freezable(); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dbc66815a0f..672c17bb32d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -331,7 +331,7 @@ static int bdi_forker_thread(void *ptr) for (;;) { bool fork = false; struct task_struct *task; - struct backing_dev_info *bdi, *tmp; + struct backing_dev_info *bdi; /* * Temporary measure, we want to make sure we don't see @@ -347,7 +347,7 @@ static int bdi_forker_thread(void *ptr) * Check if any existing bdi's have dirty data without * a thread registered. If so, set that up. 
*/ - list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { + list_for_each_entry(bdi, &bdi_list, bdi_list) { if (!bdi_cap_writeback_dirty(bdi)) continue; if (bdi->wb.task) @@ -359,8 +359,13 @@ static int bdi_forker_thread(void *ptr) WARN(!test_bit(BDI_registered, &bdi->state), "bdi %p/%s is not registered!\n", bdi, bdi->name); - list_del_rcu(&bdi->bdi_list); fork = true; + + /* + * Set the pending bit - if someone will try to + * unregister this bdi - it'll wait on this bit. + */ + set_bit(BDI_pending, &bdi->state); break; } spin_unlock_bh(&bdi_lock); @@ -383,29 +388,13 @@ static int bdi_forker_thread(void *ptr) __set_current_state(TASK_RUNNING); - /* - * Set the pending bit - if someone will try to unregister this - * bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - - /* Make sure no one uses the picked bdi */ - synchronize_rcu(); - task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s", dev_name(bdi->dev)); if (IS_ERR(task)) { /* - * If thread creation fails, then readd the bdi back to - * the list and force writeout of the bdi from this - * forker thread. That will free some memory and we can - * try again. Add it to the tail so we get a chance to - * flush other bdi's to free memory. + * If thread creation fails, force writeout of the bdi + * from the thread. */ - spin_lock_bh(&bdi_lock); - list_add_tail_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - bdi_flush_io(bdi); } else bdi->wb.task = task; -- cgit v1.2.3 From ecd584030da67ede1bf17955746a6ce834d9fc6b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:18 +0300 Subject: writeback: move last_active to bdi Currently bdi threads use local variable 'last_active' which stores last time when the bdi thread did some useful work. Move this local variable to 'struct bdi_writeback'. This is just a preparation for the further patches which will make the forker thread decide when bdi threads should be killed. 
Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +++--- include/linux/backing-dev.h | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 57fbfd0ebc5..9f5cab75c15 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -800,12 +800,12 @@ int bdi_writeback_thread(void *data) { struct bdi_writeback *wb = data; struct backing_dev_info *bdi = wb->bdi; - unsigned long last_active = jiffies; unsigned long wait_jiffies = -1UL; long pages_written; current->flags |= PF_FLUSHER | PF_SWAPWRITE; set_freezable(); + wb->last_active = jiffies; /* * Our parent may run at a different priority, just set us to normal @@ -827,7 +827,7 @@ int bdi_writeback_thread(void *data) trace_writeback_pages_written(pages_written); if (pages_written) - last_active = jiffies; + wb->last_active = jiffies; else if (wait_jiffies != -1UL) { unsigned long max_idle; @@ -837,7 +837,7 @@ int bdi_writeback_thread(void *data) * recreated automatically. 
*/ max_idle = max(5UL * 60 * HZ, wait_jiffies); - if (time_after(jiffies, max_idle + last_active)) + if (time_after(jiffies, max_idle + wb->last_active)) break; } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 95ecb2bebca..71b6223e0a7 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -45,15 +45,16 @@ enum bdi_stat_item { #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) struct bdi_writeback { - struct backing_dev_info *bdi; /* our parent bdi */ + struct backing_dev_info *bdi; /* our parent bdi */ unsigned int nr; - unsigned long last_old_flush; /* last old data flush */ + unsigned long last_old_flush; /* last old data flush */ + unsigned long last_active; /* last time bdi thread was active */ - struct task_struct *task; /* writeback thread */ - struct list_head b_dirty; /* dirty inodes */ - struct list_head b_io; /* parked for writeback */ - struct list_head b_more_io; /* parked for more writeback */ + struct task_struct *task; /* writeback thread */ + struct list_head b_dirty; /* dirty inodes */ + struct list_head b_io; /* parked for writeback */ + struct list_head b_more_io; /* parked for more writeback */ }; struct backing_dev_info { -- cgit v1.2.3 From fff5b85aa4225a7be157f208277a055822039a9e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:20 +0300 Subject: writeback: move bdi threads exiting logic to the forker thread Currently, bdi threads can decide to exit if there were no useful activities for 5 minutes. However, this causes nasty races: we can easily oops in the 'bdi_queue_work()' if the bdi thread decides to exit while we are waking it up. And even if we do not oops, but the bdi thread exits immediately after we wake it up, we'd lose the wake-up event and have an unnecessary delay (up to 5 secs) in the bdi work processing. This patch makes the forker thread to be the central place which not only creates bdi threads, but also kills them if they were inactive long enough. 
This is better design-wise. Another reason why this change was done is to prepare for the further changes which will prevent the bdi threads from waking up every 5 sec and wasting power. Indeed, when the task does not wake up periodically anymore, it won't be able to exit either. This patch also moves the 'wake_up_bit()' call from the bdi thread to the forker thread as well. So now the forker thread sets the BDI_pending bit, then forks the task or kills it, then clears the bit and wakes up the waiting process. The only process which may wait on the bit is 'bdi_wb_shutdown()'. This function was changed as well - now it first removes the bdi from the 'bdi_list', then waits on the 'BDI_pending' bit. Once it wakes up, it is guaranteed that the forker thread won't race with it, because the bdi is not visible. Note, the forker thread sets the 'BDI_pending' bit under the 'bdi->wb_lock' which is essential for proper serialization. And additionally, when we change 'bdi->wb.task', we now take the 'bdi->work_lock', to make sure that we do not lose wake-ups which we otherwise would when raced with, say, 'bdi_queue_work()'. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 54 ++++++++++--------------------------------- mm/backing-dev.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 70 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9f5cab75c15..905f3ea3848 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -78,21 +78,17 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - spin_unlock(&bdi->wb_lock); - - /* - * If the default thread isn't there, make sure we add it. When - * it gets created and wakes up, we'll run this work. 
- */ - if (unlikely(!bdi->wb.task)) { + if (bdi->wb.task) { + wake_up_process(bdi->wb.task); + } else { + /* + * The bdi thread isn't there, wake up the forker thread which + * will create and run it. + */ trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); - } else { - struct bdi_writeback *wb = &bdi->wb; - - if (wb->task) - wake_up_process(wb->task); } + spin_unlock(&bdi->wb_lock); } static void @@ -800,7 +796,6 @@ int bdi_writeback_thread(void *data) { struct bdi_writeback *wb = data; struct backing_dev_info *bdi = wb->bdi; - unsigned long wait_jiffies = -1UL; long pages_written; current->flags |= PF_FLUSHER | PF_SWAPWRITE; @@ -812,13 +807,6 @@ int bdi_writeback_thread(void *data) */ set_user_nice(current, 0); - /* - * Clear pending bit and wakeup anybody waiting to tear us down - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); - trace_writeback_thread_start(bdi); while (!kthread_should_stop()) { @@ -828,18 +816,6 @@ int bdi_writeback_thread(void *data) if (pages_written) wb->last_active = jiffies; - else if (wait_jiffies != -1UL) { - unsigned long max_idle; - - /* - * Longest period of inactivity that we tolerate. If we - * see dirty data again later, the thread will get - * recreated automatically. - */ - max_idle = max(5UL * 60 * HZ, wait_jiffies); - if (time_after(jiffies, max_idle + wb->last_active)) - break; - } set_current_state(TASK_INTERRUPTIBLE); if (!list_empty(&bdi->work_list)) { @@ -847,21 +823,15 @@ int bdi_writeback_thread(void *data) continue; } - if (dirty_writeback_interval) { - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout(wait_jiffies); - } else + if (dirty_writeback_interval) + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + else schedule(); try_to_freeze(); } - wb->task = NULL; - - /* - * Flush any work that raced with us exiting. 
No new work - * will be added, since this bdi isn't discoverable anymore. - */ + /* Flush any work that raced with us exiting */ if (!list_empty(&bdi->work_list)) wb_do_writeback(wb, 1); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e104e32c2ee..9c1c199f88c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -316,6 +316,18 @@ static void sync_supers_timer_fn(unsigned long unused) bdi_arm_supers_timer(); } +/* + * Calculate the longest interval (jiffies) bdi threads are allowed to be + * inactive. + */ +static unsigned long bdi_longest_inactive(void) +{ + unsigned long interval; + + interval = msecs_to_jiffies(dirty_writeback_interval * 10); + return max(5UL * 60 * HZ, interval); +} + static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; @@ -329,11 +341,12 @@ static int bdi_forker_thread(void *ptr) set_user_nice(current, 0); for (;;) { - struct task_struct *task; + struct task_struct *task = NULL; struct backing_dev_info *bdi; enum { NO_ACTION, /* Nothing to do */ FORK_THREAD, /* Fork bdi thread */ + KILL_THREAD, /* Kill inactive bdi thread */ } action = NO_ACTION; /* @@ -346,10 +359,6 @@ static int bdi_forker_thread(void *ptr) spin_lock_bh(&bdi_lock); set_current_state(TASK_INTERRUPTIBLE); - /* - * Check if any existing bdi's have dirty data without - * a thread registered. If so, set that up. - */ list_for_each_entry(bdi, &bdi_list, bdi_list) { bool have_dirty_io; @@ -376,6 +385,25 @@ static int bdi_forker_thread(void *ptr) action = FORK_THREAD; break; } + + spin_lock(&bdi->wb_lock); + /* + * If there is no work to do and the bdi thread was + * inactive long enough - kill it. The wb_lock is taken + * to make sure no-one adds more work to this bdi and + * wakes the bdi thread up. 
+ */ + if (bdi->wb.task && !have_dirty_io && + time_after(jiffies, bdi->wb.last_active + + bdi_longest_inactive())) { + task = bdi->wb.task; + bdi->wb.task = NULL; + spin_unlock(&bdi->wb_lock); + set_bit(BDI_pending, &bdi->state); + action = KILL_THREAD; + break; + } + spin_unlock(&bdi->wb_lock); } spin_unlock_bh(&bdi_lock); @@ -394,8 +422,20 @@ static int bdi_forker_thread(void *ptr) * the bdi from the thread. */ bdi_flush_io(bdi); - } else + } else { + /* + * The spinlock makes sure we do not lose + * wake-ups when racing with 'bdi_queue_work()'. + */ + spin_lock(&bdi->wb_lock); bdi->wb.task = task; + spin_unlock(&bdi->wb_lock); + } + break; + + case KILL_THREAD: + __set_current_state(TASK_RUNNING); + kthread_stop(task); break; case NO_ACTION: @@ -407,6 +447,13 @@ static int bdi_forker_thread(void *ptr) /* Back to the main loop */ continue; } + + /* + * Clear pending bit and wakeup anybody waiting to tear us down. + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); } return 0; @@ -490,15 +537,15 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) return; /* - * If setup is pending, wait for that to complete first + * Make sure nobody finds us on the bdi_list anymore */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + bdi_remove_from_list(bdi); /* - * Make sure nobody finds us on the bdi_list anymore + * If setup is pending, wait for that to complete first */ - bdi_remove_from_list(bdi); + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); /* * Finally, kill the kernel thread. We don't need to be RCU -- cgit v1.2.3 From 253c34e9b10c30d3064be654b5b78fbc1a8b1896 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:21 +0300 Subject: writeback: prevent unnecessary bdi threads wakeups Finally, we can get rid of unnecessary wake-ups in bdi threads, which are very bad for battery-driven devices. 
There are two types of activities bdi threads do: 1. process bdi works from the 'bdi->work_list' 2. periodic write-back So there are 2 sources of wake-up events for bdi threads: 1. 'bdi_queue_work()' - submits bdi works 2. '__mark_inode_dirty()' - adds dirty I/O to bdi's The former already has bdi wake-up code. The latter does not, and this patch adds it. '__mark_inode_dirty()' is a hot-path function, but this patch adds another 'spin_lock(&bdi->wb_lock)' there. However, it is taken only in rare cases when the bdi has no dirty inodes. So adding this spinlock should be fine and should not affect performance. This patch makes sure bdi threads and the forker thread do not wake up if there is nothing to do. The forker thread will nevertheless wake up at least every 5 min. to check whether it has to kill a bdi thread. This can also be optimized, but is not worth it. This patch also tidies up the warning about unregistered bdi, and turns it from an ugly crocodile to a simple 'WARN()' statement. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 59 ++++++++++++++++++++++++++++++++++++++++++++----------- mm/backing-dev.c | 13 +++++++++--- 2 files changed, 58 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 905f3ea3848..55f6e46e06f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -823,10 +823,16 @@ int bdi_writeback_thread(void *data) continue; } - if (dirty_writeback_interval) + if (wb_has_dirty_io(wb) && dirty_writeback_interval) schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else + else { + /* + * We have nothing to do, so can go sleep without any + * timeout and save power. When a work is queued or + * something is made dirty - we will be woken up. 
+ */ schedule(); + } try_to_freeze(); } @@ -862,6 +868,26 @@ void wakeup_flusher_threads(long nr_pages) rcu_read_unlock(); } +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. + */ +static void wakeup_bdi_thread(struct backing_dev_info *bdi) +{ + spin_lock(&bdi->wb_lock); + if (bdi->wb.task) + wake_up_process(bdi->wb.task); + else + /* + * When bdi tasks are inactive for long time, they are killed. + * In this case we have to wake-up the forker thread which + * should create and run the bdi thread. + */ + wake_up_process(default_backing_dev_info.wb.task); + spin_unlock(&bdi->wb_lock); +} + static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -914,6 +940,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; + struct backing_dev_info *bdi = NULL; + bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -967,22 +995,31 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - struct backing_dev_info *bdi = wb->bdi; - - if (bdi_cap_writeback_dirty(bdi) && - !test_bit(BDI_registered, &bdi->state)) { - WARN_ON(1); - printk(KERN_ERR "bdi-%s not registered\n", - bdi->name); + bdi = inode_to_bdi(inode); + + if (bdi_cap_writeback_dirty(bdi)) { + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi-%s not registered\n", bdi->name); + + /* + * If this is the first dirty inode for this + * bdi, we have to wake-up the corresponding + * bdi thread to make sure background + * write-back happens later. 
+ */ + if (!wb_has_dirty_io(&bdi->wb)) + wakeup_bdi = true; } inode->dirtied_when = jiffies; - list_move(&inode->i_list, &wb->b_dirty); + list_move(&inode->i_list, &bdi->wb.b_dirty); } } out: spin_unlock(&inode_lock); + + if (wakeup_bdi) + wakeup_bdi_thread(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 9c1c199f88c..a9a08d88a74 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -439,10 +439,17 @@ static int bdi_forker_thread(void *ptr) break; case NO_ACTION: - if (dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + if (!wb_has_dirty_io(me) || !dirty_writeback_interval) + /* + * There are no dirty data. The only thing we + * should now care about is checking for + * inactive bdi threads and killing them. Thus, + * let's sleep for longer time, save energy and + * be friendly for battery-driven devices. + */ + schedule_timeout(bdi_longest_inactive()); else - schedule(); + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); try_to_freeze(); /* Back to the main loop */ continue; -- cgit v1.2.3 From 6467716a37673e8d47b4984eb19839bdad0a8353 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:22 +0300 Subject: writeback: optimize periodic bdi thread wakeups When the first inode for a bdi is marked dirty, we wake up the bdi thread which should take care of the periodic background write-out. However, the write-out will actually start only 'dirty_writeback_interval' centisecs later, so we can delay the wake-up. This change was requested by Nick Piggin who pointed out that if we delay the wake-up, we weed out 2 unnecessary context switches, which matters because '__mark_inode_dirty()' is a hot-path function. This patch introduces a new function - 'bdi_wakeup_thread_delayed()', which sets up a timer to wake-up the bdi thread and returns. So the wake-up is delayed. We also delete the timer in bdi threads just before writing-back. 
And synchronously delete it when unregistering bdi. At the unregister point the bdi does not have any users, so no one can arm it again. Since now we take 'bdi->wb_lock' in the timer, which can execute in softirq context, we have to use 'spin_lock_bh()' for 'bdi->wb_lock'. This patch makes this change as well. This patch also moves the 'bdi_wb_init()' function down in the file to avoid forward-declaration of 'bdi_wakeup_thread_delayed()'. Signed-off-by: Artem Bityutskiy Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 36 +++++++--------------- include/linux/backing-dev.h | 2 ++ mm/backing-dev.c | 73 +++++++++++++++++++++++++++++++++++---------- 3 files changed, 70 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 55f6e46e06f..bfa2df2c7ce 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -76,7 +76,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, { trace_writeback_queue(bdi, work); - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); if (bdi->wb.task) { wake_up_process(bdi->wb.task); @@ -88,7 +88,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); } static void @@ -704,13 +704,13 @@ get_next_work_item(struct backing_dev_info *bdi) { struct wb_writeback_work *work = NULL; - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); if (!list_empty(&bdi->work_list)) { work = list_entry(bdi->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); return work; } @@ -810,6 +810,12 @@ int bdi_writeback_thread(void *data) trace_writeback_thread_start(bdi); while (!kthread_should_stop()) { + /* + * Remove own delayed wake-up timer, since we are already awake + * and we'll take care of the preriodic 
write-back. + */ + del_timer(&wb->wakeup_timer); + pages_written = wb_do_writeback(wb, 0); trace_writeback_pages_written(pages_written); @@ -868,26 +874,6 @@ void wakeup_flusher_threads(long nr_pages) rcu_read_unlock(); } -/* - * This function is used when the first inode for this bdi is marked dirty. It - * wakes-up the corresponding bdi thread which should then take care of the - * periodic background write-out of dirty inodes. - */ -static void wakeup_bdi_thread(struct backing_dev_info *bdi) -{ - spin_lock(&bdi->wb_lock); - if (bdi->wb.task) - wake_up_process(bdi->wb.task); - else - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - wake_up_process(default_backing_dev_info.wb.task); - spin_unlock(&bdi->wb_lock); -} - static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -1019,7 +1005,7 @@ out: spin_unlock(&inode_lock); if (wakeup_bdi) - wakeup_bdi_thread(bdi); + bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 71b6223e0a7..7628219e538 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -52,6 +52,7 @@ struct bdi_writeback { unsigned long last_active; /* last time bdi thread was active */ struct task_struct *task; /* writeback thread */ + struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -105,6 +106,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi); int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_arm_supers_timer(void); +void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); extern 
spinlock_t bdi_lock; extern struct list_head bdi_list; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a9a08d88a74..cfff7225138 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -248,17 +248,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) -{ - memset(wb, 0, sizeof(*wb)); - - wb->bdi = bdi; - wb->last_old_flush = jiffies; - INIT_LIST_HEAD(&wb->b_dirty); - INIT_LIST_HEAD(&wb->b_io); - INIT_LIST_HEAD(&wb->b_more_io); -} - int bdi_has_dirty_io(struct backing_dev_info *bdi) { return wb_has_dirty_io(&bdi->wb); @@ -316,6 +305,43 @@ static void sync_supers_timer_fn(unsigned long unused) bdi_arm_supers_timer(); } +static void wakeup_timer_fn(unsigned long data) +{ + struct backing_dev_info *bdi = (struct backing_dev_info *)data; + + spin_lock_bh(&bdi->wb_lock); + if (bdi->wb.task) { + wake_up_process(bdi->wb.task); + } else { + /* + * When bdi tasks are inactive for long time, they are killed. + * In this case we have to wake-up the forker thread which + * should create and run the bdi thread. + */ + wake_up_process(default_backing_dev_info.wb.task); + } + spin_unlock_bh(&bdi->wb_lock); +} + +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. 
+ */ +void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); +} + /* * Calculate the longest interval (jiffies) bdi threads are allowed to be * inactive. @@ -353,8 +379,10 @@ static int bdi_forker_thread(void *ptr) * Temporary measure, we want to make sure we don't see * dirty data on the default backing_dev_info */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { + del_timer(&me->wakeup_timer); wb_do_writeback(me, 0); + } spin_lock_bh(&bdi_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -386,7 +414,7 @@ static int bdi_forker_thread(void *ptr) break; } - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); /* * If there is no work to do and the bdi thread was * inactive long enough - kill it. The wb_lock is taken @@ -403,7 +431,7 @@ static int bdi_forker_thread(void *ptr) action = KILL_THREAD; break; } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); } spin_unlock_bh(&bdi_lock); @@ -427,9 +455,9 @@ static int bdi_forker_thread(void *ptr) * The spinlock makes sure we do not lose * wake-ups when racing with 'bdi_queue_work()'. 
*/ - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); bdi->wb.task = task; - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); } break; @@ -586,6 +614,7 @@ void bdi_unregister(struct backing_dev_info *bdi) if (bdi->dev) { trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); + del_timer_sync(&bdi->wb.wakeup_timer); if (!bdi_cap_flush_forker(bdi)) bdi_wb_shutdown(bdi); @@ -596,6 +625,18 @@ void bdi_unregister(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_unregister); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); +} + int bdi_init(struct backing_dev_info *bdi) { int i, err; -- cgit v1.2.3