[CentOS-devel] Back-port md raid1/10 BIO_RW_SYNC patch for DRBD

Sun Feb 25 17:24:03 UTC 2007
Ross S. W. Walker <rwalker at medallion.com>

Attention CentOS Developers,

Here is a back-port of Lars Ellenberg's md raid1/10 patch from 2.6.19+
to 2.6.9. It seems the md raid1 and raid10 drivers were stripping the
BIO_RW_SYNC flags on requests which caused a severe performance penalty
for DRBD when writing its meta-data to these volumes.

The patch is attached as well as inlined:

---------- BEGIN ----------
Subject: md: pass down BIO_RW_SYNC in raid{1,10}
From: Lars Ellenberg <[EMAIL PROTECTED]>

md raidX make_request functions strip off the BIO_RW_SYNC flag, thus
introducing additional latency.

Fixing this in raid1 and raid10 seems to be straightforward enough.

For our particular usage case in DRBD, passing this flag improved some
initialization time from ~5 minutes to ~5 seconds.

---

 raid1.c  |   11 ++++++++---
 raid10.c |   11 ++++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)

diff -puN drivers/md/raid1.c~md-pass-down-bio_rw_sync-in-raid1 drivers/md/raid1.c
--- a/drivers/md/raid1.c~md-pass-down-bio_rw_sync-in-raid110	2007-02-24 01:51:20.000000000 -0500
+++ b/drivers/md/raid1.c	2007-02-24 01:50:38.000000000 -0500
@@ -521,6 +521,7 @@ static int make_request(request_queue_t 
 	struct bio *read_bio;
 	int i, disks;
 	mdk_rdev_t *rdev;
+	const int do_sync = bio_sync(bio);
 
 	if (unlikely(bio_barrier(bio))) {
 		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -582,7 +583,7 @@ static int make_request(request_queue_t 
 		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid1_end_read_request;
-		read_bio->bi_rw = READ;
+		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r1_bio;
 
 		generic_make_request(read_bio);
@@ -625,7 +626,7 @@ static int make_request(request_queue_t 
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE;
+		mbio->bi_rw = WRITE | do_sync;
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
@@ -637,6 +638,9 @@ static int make_request(request_queue_t 
 		raid_end_bio_io(r1_bio);
 	}
 
+	if (do_sync)
+		md_wakeup_thread(mddev->thread);
+
 	return 0;
 }
 
@@ -960,6 +964,7 @@ static void raid1d(mddev_t *mddev)
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
+				const int do_sync = bio_sync(r1_bio->master_bio);
 				r1_bio->bios[r1_bio->read_disk] = NULL;
 				r1_bio->read_disk = disk;
 				bio_put(bio);
@@ -974,7 +979,7 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_end_io = raid1_end_read_request;
-				bio->bi_rw = READ;
+				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r1_bio;
 				unplug = 1;
 				generic_make_request(bio);
--- a/drivers/md/raid10.c~md-pass-down-bio_rw_sync-in-raid110	2007-02-24 01:51:20.000000000 -0500
+++ b/drivers/md/raid10.c	2007-02-24 01:50:38.000000000 -0500
@@ -663,6 +663,7 @@ static int make_request(request_queue_t 
 	struct bio *read_bio;
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
+	const int do_sync = bio_sync(bio);
 
 	if (unlikely(bio_barrier(bio))) {
 		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -747,7 +748,7 @@ static int make_request(request_queue_t 
 			mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
-		read_bio->bi_rw = READ;
+		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r10_bio;
 
 		generic_make_request(read_bio);
@@ -789,7 +790,7 @@ static int make_request(request_queue_t 
 			conf->mirrors[d].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE;
+		mbio->bi_rw = WRITE | do_sync;
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
@@ -801,6 +802,9 @@ static int make_request(request_queue_t 
 		raid_end_bio_io(r10_bio);
 	}
 
+	if (do_sync)
+		md_wakeup_thread(mddev->thread);
+
 	return 0;
 }
 
@@ -1247,6 +1251,7 @@ static void raid10d(mddev_t *mddev)
 				       (unsigned long long)r10_bio->sector);
 				raid_end_bio_io(r10_bio);
 			} else {
+				const int do_sync = bio_sync(r10_bio->master_bio);
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
 					printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
@@ -1258,7 +1263,7 @@ static void raid10d(mddev_t *mddev)
 				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
 					+ rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
-				bio->bi_rw = READ;
+				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = raid10_end_read_request;
 				unplug = 1;
----------- END -----------

Ross S. W. Walker
Information Systems Manager
Medallion Financial, Corp.
437 Madison Avenue
38th Floor
New York, NY 10022
Tel: (212) 328-2165
Fax: (212) 328-2125
WWW: http://www.medallion.com <http://www.medallion.com> 



______________________________________________________________________
This e-mail, and any attachments thereto, is intended only for use by
the addressee(s) named herein and may contain legally privileged
and/or confidential information. If you are not the intended recipient
of this e-mail, you are hereby notified that any dissemination,
distribution or copying of this e-mail, and any attachments thereto,
is strictly prohibited. If you have received this e-mail in error,
please immediately notify the sender and permanently delete the
original and any copy or printout thereof.

-------------- next part --------------
A non-text attachment was scrubbed...
Name: linux-2.6.9-md-raid1-sync.patch
Type: application/octet-stream
Size: 4383 bytes
Desc: linux-2.6.9-md-raid1-sync.patch
URL: <http://lists.centos.org/pipermail/centos-devel/attachments/20070225/0dbb92cb/attachment-0006.obj>