[dm-devel] [PATCH] dm-raid1.c: fix bug when a bio's bi_size exceeds the region size

Zhao Qian zhaoqian at aaastor.com
Mon Jun 27 04:39:47 UTC 2005


In my testing, the bi_size of bios dispatched to a dm-mirror target sometimes exceeds the target's region size.
Here are some of the bi_size values captured while running bonnie++ on a dm-mirror target (formatted as ext3) whose region size is 2048 bytes:
.........
bi_size (in bytes) - bi_sector
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1060816
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1070040
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1150960
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1160184
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1176552
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1185768
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1225704
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1234912
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1258464
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1291256
Jun 24 15:09:14 darkstar kernel: Catch it:131072 - 1920960
Jun 24 15:09:15 darkstar kernel: Catch it:131072 - 1921984
......................
The debug code, inserted as the first lines of queue_bio(struct mirror_set *ms, struct bio *bio, int rw):
+       if ((bio->bi_sector >> ms->rh.region_shift) !=
+           ((bio->bi_sector + ((bio->bi_size - 1) >> 9)) >> ms->rh.region_shift)) {
+               printk("Catch it:%u - %llu\n", bio->bi_size,
+                      (unsigned long long) bio->bi_sector);
+       }
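
As a standalone illustration (not kernel code), the same span check reduces to the arithmetic below. The region_shift value of 2 used in main() is an assumption: it corresponds to the 2048-byte region size mentioned above being 4 sectors of 512 bytes. The input values come from the first log line.

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the span check: does a bio starting at bi_sector with
 * bi_size bytes touch more than one region?
 * region_shift = log2(region size in sectors).
 */
static void check_span(uint64_t bi_sector, uint32_t bi_size_bytes,
                       unsigned int region_shift)
{
        /* bi_size is in bytes; >> 9 converts a byte offset to sectors. */
        uint64_t last_sector = bi_sector + ((bi_size_bytes - 1) >> 9);
        uint64_t first = bi_sector >> region_shift;
        uint64_t last = last_sector >> region_shift;

        printf("sector %llu, %u bytes: regions %llu..%llu%s\n",
               (unsigned long long) bi_sector, bi_size_bytes,
               (unsigned long long) first, (unsigned long long) last,
               first == last ? "" : "  <-- spans more than one region");
}

int main(void)
{
        /* First entry of the log above: a 49152-byte bio at sector 1060816. */
        check_span(1060816, 49152, 2);
        return 0;
}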

The key point of this problem is that a single READ/WRITE request to dm-mirror can cover more than one region, while the current
implementation assumes only one region is affected: that region alone is locked, and the others are not.

For example:

[___________________________________MIRROR_1__________________________________]
[___________________________________MIRROR_2__________________________________]
+-------------------+--------------------+------------------+-----------------+
|    sync region    | recovering region  |  nosync region   |  nosync region  |
+-------------------+--------------------+------------------+-----------------+
                             ^ recovery is working here now
 <-----------------------------READ bio range---------------------------------->
In this situation, a READ can be served from the not-yet-synced mirror and return corrupted data. WRITE has a similar problem.
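
The difference between the old and the patched read path can be shown with a small user-space sketch. The region states are hypothetical values taken from the diagram above, and in_sync() is a simplified stand-in for the driver's rh_in_sync():

#include <stdio.h>

enum region_state { RH_CLEAN, RH_DIRTY, RH_NOSYNC, RH_RECOVERING };

/* Region states from the diagram: sync, recovering, nosync, nosync. */
static const enum region_state state[] = {
        RH_CLEAN, RH_RECOVERING, RH_NOSYNC, RH_NOSYNC
};

/* Stand-in for rh_in_sync(): a region may be read from any mirror
 * only once it is clean or dirty, i.e. fully recovered. */
static int in_sync(int r)
{
        return state[r] == RH_CLEAN || state[r] == RH_DIRTY;
}

int main(void)
{
        int first = 0, last = 3, r, all = 1;

        /* Old logic: only the region containing bi_sector is consulted. */
        printf("old check (region %d only): %s\n", first,
               in_sync(first) ? "read balancing allowed -- WRONG" : "refused");

        /* Patched logic: every region the bio touches must be in sync. */
        for (r = first; r <= last; r++)
                if (!in_sync(r))
                        all = 0;
        printf("patched check (regions %d..%d): %s\n", first, last,
               all ? "read balancing allowed" : "read from default mirror");
        return 0;
}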

Sincerely,
                Zhao Qian <zhaoqian at aaastor.com>
                Du Jun <dujun at aaastor.com>

The patch to solve this problem is below:
[root at darkstar src]# diff -u orig/linux-2.6.11/drivers/md/dm-raid1.c linux-2.6.11.11/drivers/md/dm-raid1.c 
--- orig/linux-2.6.11/drivers/md/dm-raid1.c     2005-03-02 15:38:33.000000000 +0800
+++ linux-2.6.11.11/drivers/md/dm-raid1.c       2005-06-27 11:58:08.000000000 +0800
@@ -76,6 +76,7 @@
        /* hash table */
        rwlock_t hash_lock;
        mempool_t *region_pool;
+       mempool_t *region_pair_pool;
        unsigned int mask;
        unsigned int nr_buckets;
        struct list_head *buckets;
@@ -85,6 +86,9 @@
        struct list_head clean_regions;
        struct list_head quiesced_regions;
        struct list_head recovered_regions;
+
+       spinlock_t region_pair_lock;
+       struct list_head unused_region_pairs;
 };
 
 enum {
@@ -132,7 +136,32 @@
        kfree(element);
 }
 
+struct region_pair
+{
+       region_t first;
+       region_t last;
+       struct list_head list;
+};
+
+static struct region_pair *bio_to_region_pair(struct region_hash *rh, struct bio *bio, struct region_pair *rp)
+{
+       rp->first = bio->bi_sector >> rh->region_shift;
+       rp->last = (bio->bi_sector + ((bio->bi_size - 1) >> 9)) >> rh->region_shift;
+       return rp;
+}
+
+static void *region_pair_alloc(int gfp_mask, void *pool_data)
+{
+       return kmalloc(sizeof(struct region_pair), gfp_mask);
+}
+
+static void region_pair_free(void *element, void *pool_data)
+{
+       kfree(element);
+}
+
 #define MIN_REGIONS 64
+#define MIN_REGION_PAIRS 128
 #define MAX_RECOVERY 1
 static int rh_init(struct region_hash *rh, struct mirror_set *ms,
                   struct dirty_log *log, uint32_t region_size,
@@ -173,6 +202,9 @@
        INIT_LIST_HEAD(&rh->quiesced_regions);
        INIT_LIST_HEAD(&rh->recovered_regions);
 
+       spin_lock_init(&rh->region_pair_lock);
+       INIT_LIST_HEAD(&rh->unused_region_pairs);
+
        rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
                                         region_free, NULL);
        if (!rh->region_pool) {
@@ -181,6 +213,15 @@
                return -ENOMEM;
        }
 
+       rh->region_pair_pool = mempool_create(MIN_REGION_PAIRS, region_pair_alloc,
+                                             region_pair_free, NULL);
+       if (!rh->region_pair_pool) {
+               vfree(rh->buckets);
+               rh->buckets = NULL;
+               mempool_destroy(rh->region_pool);
+               return -ENOMEM;
+       }
+
        return 0;
 }
 
@@ -188,6 +229,7 @@
 {
        unsigned int h;
        struct region *reg, *nreg;
+       struct region_pair *pair, *pair_next;
 
        BUG_ON(!list_empty(&rh->quiesced_regions));
        for (h = 0; h < rh->nr_buckets; h++) {
@@ -197,10 +239,15 @@
                }
        }
 
+       list_for_each_entry_safe(pair, pair_next, &rh->unused_region_pairs, list)
+               mempool_free(pair, rh->region_pair_pool);
+
        if (rh->log)
                dm_destroy_dirty_log(rh->log);
        if (rh->region_pool)
                mempool_destroy(rh->region_pool);
+       if (rh->region_pair_pool)
+               mempool_destroy(rh->region_pair_pool);
        vfree(rh->buckets);
 }
 
@@ -320,9 +367,11 @@
 static void rh_update_states(struct region_hash *rh)
 {
        struct region *reg, *next;
+       struct region_pair *pair, *pair_next;
 
        LIST_HEAD(clean);
        LIST_HEAD(recovered);
+       LIST_HEAD(unused);
 
        /*
         * Quickly grab the lists.
@@ -347,6 +396,12 @@
                        list_del(&reg->hash_list);
        }
        spin_unlock(&rh->region_lock);
+
+       spin_lock(&rh->region_pair_lock);
+       list_splice(&rh->unused_region_pairs, &unused);
+       INIT_LIST_HEAD(&rh->unused_region_pairs);
+       spin_unlock(&rh->region_pair_lock);
+
        write_unlock_irq(&rh->hash_lock);
 
        /*
@@ -367,6 +422,9 @@
 
        list_for_each_entry_safe (reg, next, &clean, list)
                mempool_free(reg, rh->region_pool);
+
+       list_for_each_entry_safe(pair, pair_next, &unused, list)
+               mempool_free(pair, rh->region_pair_pool);
 }
 
 static void rh_inc(struct region_hash *rh, region_t region)
@@ -391,9 +449,15 @@
 static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
 {
        struct bio *bio;
+       struct region_pair rp;
+       region_t r;
 
-       for (bio = bios->head; bio; bio = bio->bi_next)
-               rh_inc(rh, bio_to_region(rh, bio));
+       for (bio = bios->head; bio; bio = bio->bi_next) {
+               bio_to_region_pair(rh, bio, &rp);
+               for (r = rp.first; r <= rp.last; r++) {
+                       rh_inc(rh, r);
+               }
+       }
 }
 
 static void rh_dec(struct region_hash *rh, region_t region)
@@ -506,14 +570,14 @@
        rh->log->type->flush(rh->log);
 }
 
-static void rh_delay(struct region_hash *rh, struct bio *bio)
+static void rh_delay_on_region(struct region_hash *rh, struct bio *bio, region_t region)
 {
-       struct region *reg;
+       struct region *reg;
 
-       read_lock(&rh->hash_lock);
-       reg = __rh_find(rh, bio_to_region(rh, bio));
-       bio_list_add(&reg->delayed_bios, bio);
-       read_unlock(&rh->hash_lock);
+       read_lock(&rh->hash_lock);
+       reg = __rh_find(rh, region);
+       bio_list_add(&reg->delayed_bios, bio);
+       read_unlock(&rh->hash_lock);
 }
 
 static void rh_stop_recovery(struct region_hash *rh)
@@ -692,17 +756,26 @@
 
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 {
-       region_t region;
        struct bio *bio;
        struct mirror *m;
+       struct region_pair rp;
+       region_t r;
+       int sync_count;
 
        while ((bio = bio_list_pop(reads))) {
-               region = bio_to_region(&ms->rh, bio);
+               bio_to_region_pair(&ms->rh, bio, &rp);
 
                /*
                 * We can only read balance if the region is in sync.
                 */
-               if (rh_in_sync(&ms->rh, region, 0))
+
+               sync_count = 0;
+               for (r = rp.first; r <= rp.last; r++) {
+                       if (rh_in_sync(&ms->rh, r, 0)) {
+                               sync_count++;
+                       }
+               }
+               if (sync_count > (rp.last - rp.first))
                        m = choose_mirror(ms, bio->bi_sector);
                else
                        m = ms->mirror + DEFAULT_MIRROR;
@@ -779,6 +852,8 @@
        int state;
        struct bio *bio;
        struct bio_list sync, nosync, recover, *this_list = NULL;
+       struct region_pair rp;
+       region_t r;
 
        if (!writes->head)
                return;
@@ -791,23 +866,28 @@
        bio_list_init(&recover);
 
        while ((bio = bio_list_pop(writes))) {
-               state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
-               switch (state) {
-               case RH_CLEAN:
-               case RH_DIRTY:
-                       this_list = &sync;
-                       break;
-
-               case RH_NOSYNC:
-                       this_list = &nosync;
-                       break;
+               bio_to_region_pair(&ms->rh, bio, &rp);
+               this_list = &sync;
+               for (r = rp.first; r <= rp.last; r++) {
+                       state = rh_state(&ms->rh, r, 1);
+                       switch (state) {
+                       case RH_CLEAN:
+                       case RH_DIRTY:
+                               break;
 
-               case RH_RECOVERING:
-                       this_list = &recover;
-                       break;
+                       case RH_NOSYNC:
+                               this_list = &nosync;
+                               break;
+
+                       case RH_RECOVERING:
+                               rh_delay_on_region(&ms->rh, bio, r);
+                               this_list = &recover;
+                               break;
+                       }
+                       if (this_list == &recover) break;
                }
 
-               bio_list_add(this_list, bio);
+               if (this_list != &recover) bio_list_add(this_list, bio);
        }
 
        /*
@@ -825,9 +905,6 @@
        while ((bio = bio_list_pop(&sync)))
                do_write(ms, bio);
 
-       while ((bio = bio_list_pop(&recover)))
-               rh_delay(&ms->rh, bio);
-
        while ((bio = bio_list_pop(&nosync))) {
                map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
                generic_make_request(bio);
@@ -1105,9 +1182,10 @@
        struct mirror *m;
        struct mirror_set *ms = ti->private;
 
-       map_context->ll = bio->bi_sector >> ms->rh.region_shift;
-
        if (rw == WRITE) {
+               struct region_pair *rp = mempool_alloc(ms->rh.region_pair_pool, GFP_NOIO);
+               bio_to_region_pair(&ms->rh, bio, rp);
+               map_context->ptr = rp;
                queue_bio(ms, bio, rw);
                return 0;
        }
@@ -1128,7 +1206,7 @@
        if (!r && rw == READA)
                return -EIO;
 
-       if (!r) {
+       if (!r || bio_sectors(bio) > 1) {
                /* Pass this io over to the daemon */
                queue_bio(ms, bio, rw);
                return 0;
@@ -1145,15 +1223,21 @@
 static int mirror_end_io(struct dm_target *ti, struct bio *bio,
                         int error, union map_info *map_context)
 {
+       region_t region;
        int rw = bio_rw(bio);
        struct mirror_set *ms = (struct mirror_set *) ti->private;
-       region_t region = map_context->ll;
 
        /*
         * We need to dec pending if this was a write.
         */
-       if (rw == WRITE)
-               rh_dec(&ms->rh, region);
+       if (rw == WRITE) {
+               struct region_pair *rp = (struct region_pair *) map_context->ptr;
+               for (region = rp->first; region <= rp->last; region++)
+                       rh_dec(&ms->rh, region);
+               spin_lock_irq(&ms->rh.region_pair_lock);
+               list_add(&rp->list, &ms->rh.unused_region_pairs);
+               spin_unlock_irq(&ms->rh.region_pair_lock);
+       }
 
        return 0;
 }




