Attention CentOS Developers,
Here is a back-port of Lars Ellenberg's md raid1/10 patch from 2.6.19+ to 2.6.9. It seems the md raid1 and raid10 drivers were stripping the BIO_RW_SYNC flag from requests, which caused a severe performance penalty for DRBD when writing its meta-data to these volumes.
The patch is attached as well as inlined:
---------- BEGIN ----------
Subject: md: pass down BIO_RW_SYNC in raid{1,10}
From: Lars Ellenberg <[EMAIL PROTECTED]>
md raidX make_request functions strip off the BIO_RW_SYNC flag, thus introducing additional latency.
Fixing this in raid1 and raid10 seems to be straightforward enough.
For our particular usage case in DRBD, passing this flag improved some initialization time from ~5 minutes to ~5 seconds.
---
raid1.c | 11 ++++++++--- raid10.c | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-)
diff -puN drivers/md/raid1.c~md-pass-down-bio_rw_sync-in-raid1 drivers/md/raid1.c --- a/drivers/md/raid1.c~md-pass-down-bio_rw_sync-in-raid110 2007-02-24 01:51:20.000000000 -0500 +++ b/drivers/md/raid1.c 2007-02-24 01:50:38.000000000 -0500 @@ -521,6 +521,7 @@ static int make_request(request_queue_t struct bio *read_bio; int i, disks; mdk_rdev_t *rdev; + const int do_sync = bio_sync(bio);
if (unlikely(bio_barrier(bio))) { bio_endio(bio, bio->bi_size, -EOPNOTSUPP); @@ -582,7 +583,7 @@ static int make_request(request_queue_t read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_rw = READ; + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r1_bio;
generic_make_request(read_bio); @@ -625,7 +626,7 @@ static int make_request(request_queue_t mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE; + mbio->bi_rw = WRITE | do_sync; mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining); @@ -637,6 +638,9 @@ static int make_request(request_queue_t raid_end_bio_io(r1_bio); }
+ if (do_sync) + md_wakeup_thread(mddev->thread); + return 0; }
@@ -960,6 +964,7 @@ static void raid1d(mddev_t *mddev) (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); } else { + const int do_sync = bio_sync(r1_bio->master_bio); r1_bio->bios[r1_bio->read_disk] = NULL; r1_bio->read_disk = disk; bio_put(bio); @@ -974,7 +979,7 @@ static void raid1d(mddev_t *mddev) bio->bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ; + bio->bi_rw = READ | do_sync; bio->bi_private = r1_bio; unplug = 1; generic_make_request(bio); --- a/drivers/md/raid10.c~md-pass-down-bio_rw_sync-in-raid110 2007-02-24 01:51:20.000000000 -0500 +++ b/drivers/md/raid10.c 2007-02-24 01:50:38.000000000 -0500 @@ -663,6 +663,7 @@ static int make_request(request_queue_t struct bio *read_bio; int i; int chunk_sects = conf->chunk_mask + 1; + const int do_sync = bio_sync(bio);
if (unlikely(bio_barrier(bio))) { bio_endio(bio, bio->bi_size, -EOPNOTSUPP); @@ -747,7 +748,7 @@ static int make_request(request_queue_t mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_rw = READ; + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio;
generic_make_request(read_bio); @@ -789,7 +790,7 @@ static int make_request(request_queue_t conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE; + mbio->bi_rw = WRITE | do_sync; mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining); @@ -801,6 +802,9 @@ static int make_request(request_queue_t raid_end_bio_io(r10_bio); }
+ if (do_sync) + md_wakeup_thread(mddev->thread); + return 0; }
@@ -1247,6 +1251,7 @@ static void raid10d(mddev_t *mddev) (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); } else { + const int do_sync = bio_sync(r10_bio->master_bio); rdev = conf->mirrors[mirror].rdev; if (printk_ratelimit()) printk(KERN_ERR "raid10: %s: redirecting sector %llu to" @@ -1258,7 +1263,7 @@ static void raid10d(mddev_t *mddev) bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ; + bio->bi_rw = READ | do_sync; bio->bi_private = r10_bio; bio->bi_end_io = raid10_end_read_request; unplug = 1; ----------- END -----------
Ross S. W. Walker Information Systems Manager Medallion Financial, Corp. 437 Madison Avenue 38th Floor New York, NY 10022 Tel: (212) 328-2165 Fax: (212) 328-2125 WWW: http://www.medallion.com
______________________________________________________________________ This e-mail, and any attachments thereto, is intended only for use by the addressee(s) named herein and may contain legally privileged and/or confidential information. If you are not the intended recipient of this e-mail, you are hereby notified that any dissemination, distribution or copying of this e-mail, and any attachments thereto, is strictly prohibited. If you have received this e-mail in error, please immediately notify the sender and permanently delete the original and any copy or printout thereof.