[dm-devel] [PATCH] md: use preallocated hashed wait queues instead of mddev->sb_wait
Xiao Ni
xni at redhat.com
Tue Jan 31 03:13:05 UTC 2023
Hi Mikulas
Can we fix this with the following change:
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -929,10 +929,10 @@ static void super_written(struct bio *bio)
bio_put(bio);
- rdev_dec_pending(rdev, mddev);
-
if (atomic_dec_and_test(&mddev->pending_writes))
wake_up(&mddev->sb_wait);
+
+ rdev_dec_pending(rdev, mddev);
}
Regards
Xiao
On Tue, Jan 24, 2023 at 10:06 PM Mikulas Patocka <mpatocka at redhat.com> wrote:
>
> There's a theoretical race condition in md.
>
> super_written calls:
> if (atomic_dec_and_test(&mddev->pending_writes))
> wake_up(&mddev->sb_wait);
>
> If the process is rescheduled just after atomic_dec_and_test and before
> wake_up, it may happen that the mddev structure is freed (because
> mddev->pending_writes is zero) and on scheduling back,
> wake_up(&mddev->sb_wait) would access freed memory.
>
> Fix this bug by using an array of preallocated wait queues, so that the
> wait queues still exist even after the mddev has been freed.
>
> Signed-off-by: Mikulas Patocka <mpatocka at redhat.com>
>
> ---
> drivers/md/md.c | 56 ++++++++++++++++++++++++++---------------------
> drivers/md/md.h | 10 +++++++-
> drivers/md/raid10.c | 4 +--
> drivers/md/raid5-cache.c | 6 ++---
> drivers/md/raid5.c | 8 +++---
> 5 files changed, 49 insertions(+), 35 deletions(-)
>
> Index: linux-2.6/drivers/md/md.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/md.c
> +++ linux-2.6/drivers/md/md.c
> @@ -89,6 +89,9 @@ static struct workqueue_struct *md_wq;
> static struct workqueue_struct *md_misc_wq;
> static struct workqueue_struct *md_rdev_misc_wq;
>
> +wait_queue_head_t md_sb_wait_table[MD_SB_WAIT_TABLE_SIZE];
> +EXPORT_SYMBOL(md_sb_wait_table);
> +
> static int remove_and_add_spares(struct mddev *mddev,
> struct md_rdev *this);
> static void mddev_detach(struct mddev *mddev);
> @@ -415,7 +418,7 @@ check_suspended:
> return;
> }
> for (;;) {
> - prepare_to_wait(&mddev->sb_wait, &__wait,
> + prepare_to_wait(md_sb_wait(mddev), &__wait,
> TASK_UNINTERRUPTIBLE);
> if (!is_suspended(mddev, bio))
> break;
> @@ -423,19 +426,19 @@ check_suspended:
> schedule();
> rcu_read_lock();
> }
> - finish_wait(&mddev->sb_wait, &__wait);
> + finish_wait(md_sb_wait(mddev), &__wait);
> }
> atomic_inc(&mddev->active_io);
> rcu_read_unlock();
>
> if (!mddev->pers->make_request(mddev, bio)) {
> atomic_dec(&mddev->active_io);
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> goto check_suspended;
> }
>
> if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> }
> EXPORT_SYMBOL(md_handle_request);
>
> @@ -484,13 +487,13 @@ void mddev_suspend(struct mddev *mddev)
> if (mddev->suspended++)
> return;
> synchronize_rcu();
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
> smp_mb__after_atomic();
> - wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
> + wait_event(*md_sb_wait(mddev), atomic_read(&mddev->active_io) == 0);
> mddev->pers->quiesce(mddev, 1);
> clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
> - wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
> + wait_event(*md_sb_wait(mddev), !test_bit(MD_UPDATING_SB, &mddev->flags));
>
> del_timer_sync(&mddev->safemode_timer);
> /* restrict memory reclaim I/O during raid array is suspend */
> @@ -505,7 +508,7 @@ void mddev_resume(struct mddev *mddev)
> lockdep_assert_held(&mddev->reconfig_mutex);
> if (--mddev->suspended)
> return;
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> mddev->pers->quiesce(mddev, 0);
>
> set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> @@ -585,7 +588,7 @@ static void md_submit_flush_data(struct
> mddev->prev_flush_start = mddev->start_flush;
> mddev->flush_bio = NULL;
> spin_unlock_irq(&mddev->lock);
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
>
> if (bio->bi_iter.bi_size == 0) {
> /* an empty barrier - all done */
> @@ -609,7 +612,7 @@ bool md_flush_request(struct mddev *mdde
> /* flush requests wait until ongoing flush completes,
> * hence coalescing all the pending requests.
> */
> - wait_event_lock_irq(mddev->sb_wait,
> + wait_event_lock_irq(*md_sb_wait(mddev),
> !mddev->flush_bio ||
> ktime_before(req_start, mddev->prev_flush_start),
> mddev->lock);
> @@ -686,7 +689,6 @@ void mddev_init(struct mddev *mddev)
> atomic_set(&mddev->active_io, 0);
> spin_lock_init(&mddev->lock);
> atomic_set(&mddev->flush_pending, 0);
> - init_waitqueue_head(&mddev->sb_wait);
> init_waitqueue_head(&mddev->recovery_wait);
> mddev->reshape_position = MaxSector;
> mddev->reshape_backwards = 0;
> @@ -828,7 +830,7 @@ void mddev_unlock(struct mddev *mddev)
> */
> spin_lock(&pers_lock);
> md_wakeup_thread(mddev->thread);
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> spin_unlock(&pers_lock);
> }
> EXPORT_SYMBOL_GPL(mddev_unlock);
> @@ -933,7 +935,7 @@ static void super_written(struct bio *bi
> rdev_dec_pending(rdev, mddev);
>
> if (atomic_dec_and_test(&mddev->pending_writes))
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> }
>
> void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
> @@ -977,7 +979,7 @@ void md_super_write(struct mddev *mddev,
> int md_super_wait(struct mddev *mddev)
> {
> /* wait for all superblock writes that were scheduled to complete */
> - wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
> + wait_event(*md_sb_wait(mddev), atomic_read(&mddev->pending_writes)==0);
> if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
> return -EAGAIN;
> return 0;
> @@ -2681,7 +2683,7 @@ repeat:
> wake_up(&rdev->blocked_wait);
> }
> }
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> return;
> }
>
> @@ -2791,7 +2793,7 @@ rewrite:
> BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
> /* have to write it out again */
> goto repeat;
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
> sysfs_notify_dirent_safe(mddev->sysfs_completed);
>
> @@ -4383,7 +4385,7 @@ array_state_store(struct mddev *mddev, c
> restart_array(mddev);
> clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
> md_wakeup_thread(mddev->thread);
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> } else /* st == clean */ {
> restart_array(mddev);
> if (!set_in_sync(mddev))
> @@ -4456,7 +4458,7 @@ array_state_store(struct mddev *mddev, c
> if (err)
> break;
> clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> err = 0;
> } else {
> mddev->ro = MD_RDWR;
> @@ -6283,7 +6285,7 @@ static int md_set_readonly(struct mddev
> mddev_unlock(mddev);
> wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
> &mddev->recovery));
> - wait_event(mddev->sb_wait,
> + wait_event(*md_sb_wait(mddev),
> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
> mddev_lock_nointr(mddev);
>
> @@ -7564,7 +7566,7 @@ static int md_ioctl(struct block_device
>
> if (cmd == HOT_REMOVE_DISK)
> /* need to ensure recovery thread has run */
> - wait_event_interruptible_timeout(mddev->sb_wait,
> + wait_event_interruptible_timeout(*md_sb_wait(mddev),
> !test_bit(MD_RECOVERY_NEEDED,
> &mddev->recovery),
> msecs_to_jiffies(5000));
> @@ -7669,7 +7671,7 @@ static int md_ioctl(struct block_device
> */
> if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
> mddev_unlock(mddev);
> - wait_event(mddev->sb_wait,
> + wait_event(*md_sb_wait(mddev),
> !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
> mddev_lock_nointr(mddev);
> @@ -8529,7 +8531,7 @@ bool md_write_start(struct mddev *mddev,
> sysfs_notify_dirent_safe(mddev->sysfs_state);
> if (!mddev->has_superblocks)
> return true;
> - wait_event(mddev->sb_wait,
> + wait_event(*md_sb_wait(mddev),
> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
> mddev->suspended);
> if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
> @@ -8674,7 +8676,7 @@ void md_allow_write(struct mddev *mddev)
> md_update_sb(mddev, 0);
> sysfs_notify_dirent_safe(mddev->sysfs_state);
> /* wait for the dirty state to be recorded in the metadata */
> - wait_event(mddev->sb_wait,
> + wait_event(*md_sb_wait(mddev),
> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
> } else
> spin_unlock(&mddev->lock);
> @@ -9256,7 +9258,7 @@ void md_check_recovery(struct mddev *mdd
> if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
> md_update_sb(mddev, 0);
> clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> }
>
> if (mddev->suspended)
> @@ -9420,7 +9422,7 @@ void md_check_recovery(struct mddev *mdd
> sysfs_notify_dirent_safe(mddev->sysfs_action);
> }
> unlock:
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> mddev_unlock(mddev);
> }
> }
> @@ -9608,6 +9610,10 @@ static void md_geninit(void)
> static int __init md_init(void)
> {
> int ret = -ENOMEM;
> + int i;
> +
> + for (i = 0; i < MD_SB_WAIT_TABLE_SIZE; i++)
> + init_waitqueue_head(md_sb_wait_table + i);
>
> md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
> if (!md_wq)
> Index: linux-2.6/drivers/md/md.h
> ===================================================================
> --- linux-2.6.orig/drivers/md/md.h
> +++ linux-2.6/drivers/md/md.h
> @@ -466,7 +466,6 @@ struct mddev {
> * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
> */
> spinlock_t lock;
> - wait_queue_head_t sb_wait; /* for waiting on superblock updates */
> atomic_t pending_writes; /* number of active superblock writes */
>
> unsigned int safemode; /* if set, update "clean" superblock
> @@ -584,6 +583,15 @@ static inline void md_sync_acct_bio(stru
> md_sync_acct(bio->bi_bdev, nr_sectors);
> }
>
> +#define MD_SB_WAIT_TABLE_BITS 8
> +#define MD_SB_WAIT_TABLE_SIZE (1U << MD_SB_WAIT_TABLE_BITS)
> +extern wait_queue_head_t md_sb_wait_table[MD_SB_WAIT_TABLE_SIZE];
> +
> +static inline wait_queue_head_t *md_sb_wait(struct mddev *md)
> +{
> + return md_sb_wait_table + hash_long((unsigned long)md, MD_SB_WAIT_TABLE_BITS);
> +}
> +
> struct md_personality
> {
> char *name;
> Index: linux-2.6/drivers/md/raid10.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/raid10.c
> +++ linux-2.6/drivers/md/raid10.c
> @@ -1446,7 +1446,7 @@ static void raid10_write_request(struct
> return;
> }
> raid10_log(conf->mddev, "wait reshape metadata");
> - wait_event(mddev->sb_wait,
> + wait_event(*md_sb_wait(mddev),
> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
>
> conf->reshape_safe = mddev->reshape_position;
> @@ -4876,7 +4876,7 @@ static sector_t reshape_request(struct m
> conf->reshape_checkpoint = jiffies;
> set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
> md_wakeup_thread(mddev->thread);
> - wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
> + wait_event(*md_sb_wait(mddev), mddev->sb_flags == 0 ||
> test_bit(MD_RECOVERY_INTR, &mddev->recovery));
> if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
> allow_barrier(conf);
> Index: linux-2.6/drivers/md/raid5-cache.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/raid5-cache.c
> +++ linux-2.6/drivers/md/raid5-cache.c
> @@ -691,7 +691,7 @@ static void r5c_disable_writeback_async(
> mdname(mddev));
>
> /* wait superblock change before suspend */
> - wait_event(mddev->sb_wait,
> + wait_event(*md_sb_wait(mddev),
> conf->log == NULL ||
> (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
> (locked = mddev_trylock(mddev))));
> @@ -1581,7 +1581,7 @@ void r5l_quiesce(struct r5l_log *log, in
> if (quiesce) {
> /* make sure r5l_write_super_and_discard_space exits */
> mddev = log->rdev->mddev;
> - wake_up(&mddev->sb_wait);
> + wake_up_all(md_sb_wait(mddev));
> kthread_park(log->reclaim_thread->tsk);
> r5l_wake_reclaim(log, MaxSector);
> r5l_do_reclaim(log);
> @@ -3165,7 +3165,7 @@ void r5l_exit_log(struct r5conf *conf)
> struct r5l_log *log = conf->log;
>
> /* Ensure disable_writeback_work wakes up and exits */
> - wake_up(&conf->mddev->sb_wait);
> + wake_up_all(md_sb_wait(conf->mddev));
> flush_work(&log->disable_writeback_work);
> md_unregister_thread(&log->reclaim_thread);
>
> Index: linux-2.6/drivers/md/raid5.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/raid5.c
> +++ linux-2.6/drivers/md/raid5.c
> @@ -6346,7 +6346,7 @@ static sector_t reshape_request(struct m
> conf->reshape_checkpoint = jiffies;
> set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
> md_wakeup_thread(mddev->thread);
> - wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
> + wait_event(*md_sb_wait(mddev), mddev->sb_flags == 0 ||
> test_bit(MD_RECOVERY_INTR, &mddev->recovery));
> if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
> return 0;
> @@ -6454,7 +6454,7 @@ finish:
> conf->reshape_checkpoint = jiffies;
> set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
> md_wakeup_thread(mddev->thread);
> - wait_event(mddev->sb_wait,
> + wait_event(*md_sb_wait(mddev),
> !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
> || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
> if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
> @@ -6703,7 +6703,7 @@ static void raid5_do_work(struct work_st
> if (!batch_size && !released)
> break;
> handled += batch_size;
> - wait_event_lock_irq(mddev->sb_wait,
> + wait_event_lock_irq(*md_sb_wait(mddev),
> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
> conf->device_lock);
> }
> @@ -6792,7 +6792,7 @@ static void raid5d(struct md_thread *thr
> continue;
> }
>
> - wait_event_lock_irq(mddev->sb_wait,
> + wait_event_lock_irq(*md_sb_wait(mddev),
> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
> conf->device_lock);
> }
> --
> dm-devel mailing list
> dm-devel at redhat.com
> https://listman.redhat.com/mailman/listinfo/dm-devel
>
More information about the dm-devel
mailing list