[dm-devel] Re: 2.6.28.2 & dm-snapshot or kcopyd Oops

Mikulas Patocka mpatocka at redhat.com
Mon Feb 9 08:13:34 UTC 2009


On Sat, 7 Feb 2009, Jacky Kim wrote:

> Hi
> 
> I created a PV over a RAID set whose stripe size is 64KB.
> The chunk size of the snapshot is 4KB. Is that too small? If so, what size 
> would be better?

It means that there is another bug besides the one I have just fixed :-(

So please try another patch (applied on top of all the previous ones) with even more debug points.

Mikulas

> I tested with kernel 2.6.28.2 and got the following message:
> 
> [  531.209879] ------------[ cut here ]------------
> [  531.209884] kernel BUG at drivers/md/dm-exception-store.c:715!
> [  531.209886] invalid opcode: 0000 [#1] SMP 
> [  531.209888] last sysfs file: /sys/devices/virtual/block/dm-11/dev
> [  531.209890] Modules linked in: iscsi_trgt arcmsr bonding e1000
> [  531.209893] 
> [  531.209896] Pid: 8241, comm: kcopyd Not tainted (2.6.28.2-dm #6) S5000PSL
> [  531.209898] EIP: 0060:[<c03c7dc2>] EFLAGS: 00010246 CPU: 1
> [  531.209903] EIP is at persistent_commit+0x222/0x280
> [  531.209905] EAX: f5385708 EBX: 00000006 ECX: fabab030 EDX: 00000000
> [  531.209906] ESI: 00000000 EDI: ef35f840 EBP: 00000075 ESP: f4fc3f14
> [  531.209908]  DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
> [  531.209910] Process kcopyd (pid: 8241, ti=f4fc2000 task=f4f16800 task.ti=f4fc2000)
> [  531.209911] Stack:
> [  531.209912]  c03c6710 000165da 00000000 f5015348 f482f2c0 00000000 c03c66b0 c03c66e3
> [  531.209916]  f5015348 f487f160 ef52e1c0 c03c13f8 f4f16800 f5015348 00000000 f487f160
> [  531.209919]  f487f164 ef52e1fc ef52e200 c03c119c c03c13a0 ef52e1c0 00000001 ef52e1ec
> [  531.209923] Call Trace:
> [  531.209925]  [<c03c6710>] commit_callback+0x0/0x30
> [  531.209928]  [<c03c66b0>] copy_callback+0x0/0x60
> [  531.209935]  [<c03c66e3>] copy_callback+0x33/0x60
> [  531.209938]  [<c03c13f8>] run_complete_job+0x58/0xa0
> [  531.209945]  [<c03c119c>] process_jobs+0x4c/0xe0
> [  531.209947]  [<c03c13a0>] run_complete_job+0x0/0xa0
> [  531.209950]  [<c03c1230>] do_work+0x0/0x50
> [  531.209951]  [<c03c124e>] do_work+0x1e/0x50
> [  531.209953]  [<c012ef32>] run_workqueue+0x72/0x100
> [  531.209962]  [<c0132570>] prepare_to_wait+0x20/0x60
> [  531.209965]  [<c012f840>] worker_thread+0x0/0xb0
> [  531.209972]  [<c012f8b9>] worker_thread+0x79/0xb0
> [  531.209974]  [<c01323d0>] autoremove_wake_function+0x0/0x50
> [  531.209976]  [<c012f840>] worker_thread+0x0/0xb0
> [  531.209978]  [<c01320d2>] kthread+0x42/0x70
> [  531.209980]  [<c0132090>] kthread+0x0/0x70
> [  531.209982]  [<c0103eff>] kernel_thread_helper+0x7/0x18
> [  531.209984] Code: 0b eb fe 0f 0b eb fe ba 01 00 00 00 89 f8 e8 d6 f8 ff ff 85 c0 0f 84 18 ff ff ff c7 47 08 00 00 00 00 e9 0c ff ff ff 0f 0b eb fe <0f> 0b eb fe 0f 0b eb fe 0f 0b eb fe 0f 0b eb fe 0f 0b eb fe 83 
> [  531.210006] EIP: [<c03c7dc2>] persistent_commit+0x222/0x280 SS:ESP 0068:f4fc3f14
> [  531.210010] ---[ end trace fc1bc1bb8712a6ff ]---
> [  556.042136] iscsi_trgt: Logical Unit Reset (05) issued on tid:1 lun:0 by sid:281475899523136 (Function Complete)
> 
> Jacky
> .

---
 drivers/md/dm-exception-store.c |    7 +++++++
 drivers/md/dm-kcopyd.c          |   34 +++++++++++++++++++++++++++++++++-
 drivers/md/dm-snap.c            |    4 ++++
 3 files changed, 44 insertions(+), 1 deletion(-)

Index: linux-2.6.28-clean/drivers/md/dm-exception-store.c
===================================================================
--- linux-2.6.28-clean.orig/drivers/md/dm-exception-store.c	2009-02-09 08:43:40.000000000 +0100
+++ linux-2.6.28-clean/drivers/md/dm-exception-store.c	2009-02-09 08:43:46.000000000 +0100
@@ -645,6 +645,13 @@ static void persistent_commit(struct exc
 	de.new_chunk = e->new_chunk;
 	write_exception(ps, ps->current_committed++, &de);
 
+	for (i = 0; i < ps->callback_count; i++) {
+		cb = ps->callbacks + i;
+		pe = cb->context;
+		BUG_ON(pe->e.hash_list.next == LIST_POISON1);
+		BUG_ON(pe->e.hash_list.prev == LIST_POISON2);
+		BUG_ON(pe == callback_context);
+	}
 	/*
 	 * Add the callback to the back of the array.  This code
 	 * is the only place where the callback array is
Index: linux-2.6.28-clean/drivers/md/dm-snap.c
===================================================================
--- linux-2.6.28-clean.orig/drivers/md/dm-snap.c	2009-02-09 08:43:40.000000000 +0100
+++ linux-2.6.28-clean/drivers/md/dm-snap.c	2009-02-09 08:43:46.000000000 +0100
@@ -979,6 +979,10 @@ static void start_copy(struct dm_snap_pe
 	struct dm_io_region src, dest;
 	struct block_device *bdev = s->origin->bdev;
 	sector_t dev_size;
+	BUG_ON(!pe->started);
+	BUG_ON(pe->started == 2);
+	BUG_ON(pe->started != 1);
+	pe->started = 2;
 
 	dev_size = get_dev_size(bdev);
 
Index: linux-2.6.28-clean/drivers/md/dm-kcopyd.c
===================================================================
--- linux-2.6.28-clean.orig/drivers/md/dm-kcopyd.c	2009-02-09 08:43:40.000000000 +0100
+++ linux-2.6.28-clean/drivers/md/dm-kcopyd.c	2009-02-09 08:44:47.000000000 +0100
@@ -60,6 +60,7 @@ struct dm_kcopyd_client {
 	struct list_head complete_jobs;
 	struct list_head io_jobs;
 	struct list_head pages_jobs;
+	struct list_head all_jobs;
 };
 
 static void wake(struct dm_kcopyd_client *kc)
@@ -209,6 +210,8 @@ struct kcopyd_job {
 	dm_kcopyd_notify_fn fn;
 	void *context;
 
+	struct list_head list_all;
+
 	/*
 	 * These fields are only used if the job has been split
 	 * into more manageable parts.
@@ -280,6 +283,9 @@ static void push_head(struct list_head *
 	spin_unlock_irqrestore(&kc->job_lock, flags);
 }
 
+static void segment_complete(int read_err, unsigned long write_err,
+			     void *context);
+
 /*
  * These three functions process 1 item from the corresponding
  * job list.
@@ -291,6 +297,8 @@ static void push_head(struct list_head *
  */
 static int run_complete_job(struct kcopyd_job *job)
 {
+	struct kcopyd_job *jobb;
+	unsigned long flags;
 	void *context = job->context;
 	int read_err = job->read_err;
 	unsigned long write_err = job->write_err;
@@ -299,6 +307,18 @@ static int run_complete_job(struct kcopy
 
 	if (job->pages)
 		kcopyd_put_pages(kc, job->pages);
+
+	if (fn != segment_complete) {
+		spin_lock_irqsave(&kc->job_lock, flags);
+		list_del(&job->list_all);
+		spin_unlock_irqrestore(&kc->job_lock, flags);
+
+		spin_lock_irqsave(&kc->job_lock, flags);
+		list_for_each_entry(jobb, &kc->all_jobs, list_all)
+			BUG_ON(jobb->fn == fn && jobb->context == context);
+		spin_unlock_irqrestore(&kc->job_lock, flags);
+	}
+
 	mempool_free(job, kc->job_pool);
 	fn(read_err, write_err, context);
 
@@ -535,7 +555,8 @@ int dm_kcopyd_copy(struct dm_kcopyd_clie
 		   unsigned int num_dests, struct dm_io_region *dests,
 		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
 {
-	struct kcopyd_job *job;
+	struct kcopyd_job *job, *jobb;
+	unsigned long fflags;
 
 	/*
 	 * Allocate a new job.
@@ -563,6 +584,15 @@ int dm_kcopyd_copy(struct dm_kcopyd_clie
 	job->fn = fn;
 	job->context = context;
 
+	spin_lock_irqsave(&kc->job_lock, fflags);
+	list_for_each_entry(jobb, &kc->all_jobs, list_all)
+		BUG_ON(jobb->fn == fn && jobb->context == context);
+	spin_unlock_irqrestore(&kc->job_lock, fflags);
+
+	spin_lock_irqsave(&kc->job_lock, fflags);
+	list_add_tail(&job->list_all, &kc->all_jobs);
+	spin_unlock_irqrestore(&kc->job_lock, fflags);
+
 	if (job->source.count < SUB_JOB_SIZE)
 		dispatch_job(job);
 
@@ -603,6 +633,7 @@ int dm_kcopyd_client_create(unsigned int
 
 	spin_lock_init(&kc->lock);
 	spin_lock_init(&kc->job_lock);
+	INIT_LIST_HEAD(&kc->all_jobs);
 	INIT_LIST_HEAD(&kc->complete_jobs);
 	INIT_LIST_HEAD(&kc->io_jobs);
 	INIT_LIST_HEAD(&kc->pages_jobs);
@@ -652,6 +683,7 @@ void dm_kcopyd_client_destroy(struct dm_
 	/* Wait for completion of all jobs submitted by this client. */
 	wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
 
+	BUG_ON(!list_empty(&kc->all_jobs));
 	BUG_ON(!list_empty(&kc->complete_jobs));
 	BUG_ON(!list_empty(&kc->io_jobs));
 	BUG_ON(!list_empty(&kc->pages_jobs));




More information about the dm-devel mailing list