[dm-devel] [patch] dm-raid1.c patch to fix bug when bio's bi_size exceed the region's size
Zhao Qian
zhaoqian at aaastor.com
Mon Jun 27 04:39:47 UTC 2005
In my testing, the bi_size of bios dispatched to the dm-mirror target is sometimes larger than the dm-mirror target's region size.
Here are some bi_size values observed while running bonnie++ on a dm-mirror target (made with mkfs as ext3); the dm-mirror's region size is 2048 bytes:
.........
*************************bi_size(in bytes) - bi_sector
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1060816
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1070040
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1150960
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1160184
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1176552
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1185768
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1225704
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1234912
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1258464
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1291256
Jun 24 15:09:14 darkstar kernel: Catch it:131072 - 1920960
Jun 24 15:09:15 darkstar kernel: Catch it:131072 - 1921984
......................
the debug code, inserted at the first line of function queue_bio(struct mirror_set *ms, struct bio *bio, int rw):
+ if( (bio->bi_sector>>ms->rh.region_shift)
+ != ((bio->bi_sector+((bio->bi_size-1)>>9))>> ms->rh.region_shift) )
+ {
+ printk("Catch it:%d - %d\n",bio->bi_size,bio->bi_sector);
+ }
The key point of this problem is that a READ/WRITE request to dm-mirror can affect more than one region, but the current implementation assumes
that only one region is affected and locked; the other regions are not locked.
for example,
[__________________________MIRROR_1____________________________]
[__________________________MIRROR_2____________________________]
+---------------------+--------------------+-----------------+----------------------+
| sync region | recovering region| nosync region | nosync region |
+---------------------+--------------------+-----------------+----------------------+
^ recover working here now.
<--------------------------READ bio range------------------------------------------>
In this situation, a READ could read from a not-yet-synced mirror and return corrupted data. WRITE has a similar problem.
Sincerely,
Zhao Qian <zhaoqian at aaastor.com>
Du Jun <dujun at aaastor.com>
The patch to solve this problem is here:
[root at darkstar src]# diff -u orig/linux-2.6.11/drivers/md/dm-raid1.c linux-2.6.11.11/drivers/md/dm-raid1.c
--- orig/linux-2.6.11/drivers/md/dm-raid1.c 2005-03-02 15:38:33.000000000 +0800
+++ linux-2.6.11.11/drivers/md/dm-raid1.c 2005-06-27 11:58:08.000000000 +0800
@@ -76,6 +76,7 @@
/* hash table */
rwlock_t hash_lock;
mempool_t *region_pool;
+ mempool_t *region_pair_pool;
unsigned int mask;
unsigned int nr_buckets;
struct list_head *buckets;
@@ -85,6 +86,9 @@
struct list_head clean_regions;
struct list_head quiesced_regions;
struct list_head recovered_regions;
+
+ spinlock_t region_pair_lock;
+ struct list_head unused_region_pairs;
};
enum {
@@ -132,7 +136,32 @@
kfree(element);
}
+struct region_pair
+{
+ region_t first;
+ region_t last;
+ struct list_head list;
+};
+
+static struct region_pair *bio_to_region_pair(struct region_hash *rh, struct bio *bio,struct region_pair *rp)
+{
+ rp->first = bio->bi_sector >> rh->region_shift;
+ rp->last = (bio->bi_sector+((bio->bi_size-1)>>9)) >> rh->region_shift;
+ return rp;
+}
+
+static void *region_pair_alloc(int gfp_mask, void *pool_data)
+{
+ return kmalloc(sizeof(struct region_pair), gfp_mask);
+}
+
+static void region_pair_free(void *element, void *pool_data)
+{
+ kfree(element);
+}
+
#define MIN_REGIONS 64
+#define MIN_REGION_PAIRS 128
#define MAX_RECOVERY 1
static int rh_init(struct region_hash *rh, struct mirror_set *ms,
struct dirty_log *log, uint32_t region_size,
@@ -173,6 +202,9 @@
INIT_LIST_HEAD(&rh->quiesced_regions);
INIT_LIST_HEAD(&rh->recovered_regions);
+ spin_lock_init(&rh->region_pair_lock);
+ INIT_LIST_HEAD(&rh->unused_region_pairs);
+
rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
region_free, NULL);
if (!rh->region_pool) {
@@ -181,6 +213,15 @@
return -ENOMEM;
}
+ rh->region_pair_pool = mempool_create(MIN_REGION_PAIRS, region_pair_alloc,
+ region_pair_free, NULL);
+ if (!rh->region_pair_pool) {
+ vfree(rh->buckets);
+ rh->buckets = NULL;
+ mempool_destroy(rh->region_pool);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -188,6 +229,7 @@
{
unsigned int h;
struct region *reg, *nreg;
+ struct region_pair *pair,*pair_next;
BUG_ON(!list_empty(&rh->quiesced_regions));
for (h = 0; h < rh->nr_buckets; h++) {
@@ -197,10 +239,15 @@
}
}
+ list_for_each_entry_safe( pair,pair_next,&rh->unused_region_pairs,list)
+ mempool_free(pair,rh->region_pair_pool);
+
if (rh->log)
dm_destroy_dirty_log(rh->log);
if (rh->region_pool)
mempool_destroy(rh->region_pool);
+ if (rh->region_pair_pool)
+ mempool_destroy(rh->region_pair_pool);
vfree(rh->buckets);
}
@@ -320,9 +367,11 @@
static void rh_update_states(struct region_hash *rh)
{
struct region *reg, *next;
+ struct region_pair *pair,*pair_next;
LIST_HEAD(clean);
LIST_HEAD(recovered);
+ LIST_HEAD(unused);
/*
* Quickly grab the lists.
@@ -347,6 +396,12 @@
list_del(&reg->hash_list);
}
spin_unlock(&rh->region_lock);
+
+ spin_lock(&rh->region_pair_lock);
+ list_splice(&rh->unused_region_pairs,&unused);
+ INIT_LIST_HEAD(&rh->unused_region_pairs);
+ spin_unlock(&rh->region_pair_lock);
+
write_unlock_irq(&rh->hash_lock);
/*
@@ -367,6 +422,9 @@
list_for_each_entry_safe (reg, next, &clean, list)
mempool_free(reg, rh->region_pool);
+
+ list_for_each_entry_safe( pair,pair_next,&unused,list)
+ mempool_free(pair,rh->region_pair_pool);
}
static void rh_inc(struct region_hash *rh, region_t region)
@@ -391,9 +449,15 @@
static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
{
struct bio *bio;
+ struct region_pair rp;
+ region_t r;
- for (bio = bios->head; bio; bio = bio->bi_next)
- rh_inc(rh, bio_to_region(rh, bio));
+ for (bio = bios->head; bio; bio = bio->bi_next){
+ bio_to_region_pair(rh,bio,&rp);
+ for( r=rp.first; r<=rp.last;r++ ){
+ rh_inc(rh, r);
+ }
+ }
}
static void rh_dec(struct region_hash *rh, region_t region)
@@ -506,14 +570,14 @@
rh->log->type->flush(rh->log);
}
-static void rh_delay(struct region_hash *rh, struct bio *bio)
+static void rh_delay_on_region(struct region_hash *rh, struct bio *bio,region_t region)
{
- struct region *reg;
+ struct region *reg;
- read_lock(&rh->hash_lock);
- reg = __rh_find(rh, bio_to_region(rh, bio));
- bio_list_add(&reg->delayed_bios, bio);
- read_unlock(&rh->hash_lock);
+ read_lock(&rh->hash_lock);
+ reg = __rh_find(rh, region);
+ bio_list_add(&reg->delayed_bios, bio);
+ read_unlock(&rh->hash_lock);
}
static void rh_stop_recovery(struct region_hash *rh)
@@ -692,17 +756,26 @@
static void do_reads(struct mirror_set *ms, struct bio_list *reads)
{
- region_t region;
struct bio *bio;
struct mirror *m;
+ struct region_pair rp;
+ region_t r;
+ int sync_count;
while ((bio = bio_list_pop(reads))) {
- region = bio_to_region(&ms->rh, bio);
+ bio_to_region_pair(&ms->rh,bio,&rp);
/*
* We can only read balance if the region is in sync.
*/
- if (rh_in_sync(&ms->rh, region, 0))
+
+ sync_count = 0;
+ for(r=rp.first;r<=rp.last;r++){
+ if (rh_in_sync(&ms->rh, r, 0)){
+ sync_count++;
+ }
+ }
+ if( sync_count>(rp.last-rp.first) )
m = choose_mirror(ms, bio->bi_sector);
else
m = ms->mirror + DEFAULT_MIRROR;
@@ -779,6 +852,8 @@
int state;
struct bio *bio;
struct bio_list sync, nosync, recover, *this_list = NULL;
+ struct region_pair rp;
+ region_t r;
if (!writes->head)
return;
@@ -791,23 +866,28 @@
bio_list_init(&recover);
while ((bio = bio_list_pop(writes))) {
- state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
- switch (state) {
- case RH_CLEAN:
- case RH_DIRTY:
- this_list = &sync;
- break;
-
- case RH_NOSYNC:
- this_list = &nosync;
- break;
+ bio_to_region_pair(&ms->rh,bio,&rp);
+ this_list = &sync;
+ for(r=rp.first;r<=rp.last;r++){
+ state = rh_state(&ms->rh, r, 1);
+ switch (state) {
+ case RH_CLEAN:
+ case RH_DIRTY:
+ break;
- case RH_RECOVERING:
- this_list = &recover;
- break;
+ case RH_NOSYNC:
+ this_list = &nosync;
+ break;
+
+ case RH_RECOVERING:
+ rh_delay_on_region(&ms->rh,bio,r);
+ this_list = &recover;
+ break;
+ }
+ if( &recover == this_list )break;
}
- bio_list_add(this_list, bio);
+ if( &recover!=this_list )bio_list_add(this_list, bio);
}
/*
@@ -825,9 +905,6 @@
while ((bio = bio_list_pop(&sync)))
do_write(ms, bio);
- while ((bio = bio_list_pop(&recover)))
- rh_delay(&ms->rh, bio);
-
while ((bio = bio_list_pop(&nosync))) {
map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
generic_make_request(bio);
@@ -1105,9 +1182,10 @@
struct mirror *m;
struct mirror_set *ms = ti->private;
- map_context->ll = bio->bi_sector >> ms->rh.region_shift;
-
if (rw == WRITE) {
+ struct region_pair *rp = mempool_alloc(ms->rh.region_pair_pool, GFP_NOIO);
+ bio_to_region_pair(&ms->rh,bio,rp);
+ map_context->ptr = rp;
queue_bio(ms, bio, rw);
return 0;
}
@@ -1128,7 +1206,7 @@
if (!r && rw == READA)
return -EIO;
- if (!r) {
+ if ( !r || bio_sectors(bio)>1 ) {
/* Pass this io over to the daemon */
queue_bio(ms, bio, rw);
return 0;
@@ -1145,15 +1223,21 @@
static int mirror_end_io(struct dm_target *ti, struct bio *bio,
int error, union map_info *map_context)
{
+ region_t region;
int rw = bio_rw(bio);
struct mirror_set *ms = (struct mirror_set *) ti->private;
- region_t region = map_context->ll;
/*
* We need to dec pending if this was a write.
*/
- if (rw == WRITE)
- rh_dec(&ms->rh, region);
+ if (rw == WRITE){
+ struct region_pair *rp = (struct region_pair *)map_context->ptr;
+ for(region=rp->first;region<=rp->last;region++)
+ rh_dec(&ms->rh, region);
+ spin_lock_irq(&ms->rh.region_pair_lock);
+ list_add(&rp->list,&ms->rh.unused_region_pairs);
+ spin_unlock_irq(&ms->rh.region_pair_lock);
+ }
return 0;
}
More information about the dm-devel
mailing list