diff --git a/ChangeLog b/ChangeLog index 47d535d..eddc3a7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,14 @@ Latest: ------ - For even more detail, use "git log" or visit http://git.drbd.org/. + For even more detail, visit http://git.linbit.com/drbd-8.4.git + + * fix latency regression introduced with 8.4.5 + protocol A was sometimes synchronous, C sometimes double-latency + * avoid potential deadlock during handshake + * avoid potential deadlock in disconnect during resync + with fencing resource-and-stonith + * allow IO during some bitmap bulk IO operations + * fix "endless" transfer log walk in protocol A 8.4.6 (api:genl1/proto:86-101) ------- diff --git a/drbd-kernel.spec b/drbd-kernel.spec index 489b09f..9b70767 100644 --- a/drbd-kernel.spec +++ b/drbd-kernel.spec @@ -1,7 +1,7 @@ Name: drbd-kernel Summary: Kernel driver for DRBD Version: 8.4.6 -Release: 1%{?dist} +Release: 5%{?dist} Source: http://oss.linbit.com/drbd/drbd-%{version}.tar.gz License: GPLv2+ Group: System Environment/Kernel @@ -97,6 +97,12 @@ echo "override drbd * weak-updates" \ rm -rf %{buildroot} %changelog +* Wed Sep 16 2015 Lars Ellenberg - 8.4.6-5 +- New upstream release. + +* Thu Jul 30 2015 Lars Ellenberg - 8.4.6-4 +- New upstream release. + * Fri Apr 3 2015 Philipp Reisner - 8.4.6-1 - New upstream release. diff --git a/drbd-km.spec b/drbd-km.spec index 859aa51..0cb976d 100644 --- a/drbd-km.spec +++ b/drbd-km.spec @@ -10,7 +10,7 @@ Name: drbd-km Summary: DRBD driver for Linux Version: 8.4.6 -Release: 1 +Release: 5 Source: http://oss.linbit.com/%{name}/8.4/drbd-%{version}.tar.gz License: GPLv2+ ExclusiveOS: linux @@ -32,7 +32,7 @@ setting up high availability (HA) clusters. Summary: Kernel driver for DRBD. Group: System Environment/Kernel # always require a suitable userland and depmod. -Requires: drbd-utils = %{version}, /sbin/depmod +Requires: drbd-utils >= 8.9.2, /sbin/depmod # to be able to override from build scripts which flavor of kernel we are building against. Requires: %{expand: %(echo ${DRBD_KMOD_REQUIRES:-kernel})} # TODO: break up this generic .spec file into per distribution ones, @@ -92,6 +92,12 @@ uname -r | grep BOOT || %changelog +* Wed Sep 16 2015 Lars Ellenberg - 8.4.6-5 +- New upstream release. + +* Thu Jul 30 2015 Lars Ellenberg - 8.4.6-4 +- New upstream release. + * Fri Apr 3 2015 Philipp Reisner - 8.4.6-1 - New upstream release. diff --git a/drbd/compat/tests/have_WB_congested_enum.c b/drbd/compat/tests/have_WB_congested_enum.c new file mode 100644 index 0000000..37fb32d --- /dev/null +++ b/drbd/compat/tests/have_WB_congested_enum.c @@ -0,0 +1,13 @@ +#include + + +/* With commit 4452226 (linux v4.2) + BDI_async_congested was renamed to WB_async_congested and + BDI_sync_congested was renamed to WB_sync_congested. + */ + +void foo(void) +{ + int a = WB_async_congested; + int b = WB_sync_congested; +} diff --git a/drbd/compat/tests/have_generic_start_io_acct.c b/drbd/compat/tests/have_generic_start_io_acct.c new file mode 100644 index 0000000..14a18d1 --- /dev/null +++ b/drbd/compat/tests/have_generic_start_io_acct.c @@ -0,0 +1,8 @@ +#include + +/* Introduced by mainline commit 394ffa503b, available since v3.19 */ + +void foo(void) +{ + generic_start_io_acct(WRITE, 0, (struct hd_struct *) NULL); +} diff --git a/drbd/compat/tests/have_simple_positive.c b/drbd/compat/tests/have_simple_positive.c new file mode 100644 index 0000000..410f157 --- /dev/null +++ b/drbd/compat/tests/have_simple_positive.c @@ -0,0 +1,8 @@ +#include + +/* Since dc3f4198e (linux v4.2) simple_positive is accessible for modules */ + +void foo(void) +{ + int r = simple_positive((struct dentry *)NULL); +} diff --git a/drbd/compat/tests/sock_create_kern_has_five_parameters.c b/drbd/compat/tests/sock_create_kern_has_five_parameters.c new file mode 100644 index 0000000..342af09 --- /dev/null +++ b/drbd/compat/tests/sock_create_kern_has_five_parameters.c @@ -0,0 +1,11 @@ +#include + + +/* With commit eeb1bd5 (linux v4.2) a new parameter was inserted in + first position */ + +void foo(void) +{ + int err; + err = sock_create_kern((struct net *)NULL, 0, 0, 0, (struct socket **)NULL); +} diff --git a/drbd/drbd_actlog.c b/drbd/drbd_actlog.c index 00c7956..1a274c5 100644 --- a/drbd/drbd_actlog.c +++ b/drbd/drbd_actlog.c @@ -312,7 +312,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval * return need_transaction; } -static int al_write_transaction(struct drbd_device *device); +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif + +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); +} + +static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) +{ + const unsigned int stripes = device->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return device->ldev->md.md_offset + device->ldev->md.al_offset + t; +} + +static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer) +{ + struct lc_element *e; + sector_t sector; + int i, mx; + unsigned extent_nr; + unsigned crc = 0; + int err = 0; + + memset(buffer, 0, sizeof(*buffer)); + buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->tr_number = cpu_to_be32(device->al_tr_number); + + i = 0; + + /* Even though no one can start to change this list + * once we set the LC_LOCKED -- from drbd_al_begin_io(), + * lc_try_lock_for_transaction() --, someone may still + * be in the process of changing it. */ + spin_lock_irq(&device->al_lock); + list_for_each_entry(e, &device->act_log->to_be_changed, list) { + if (i == AL_UPDATES_PER_TRANSACTION) { + i++; + break; + } + buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); + buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); + if (e->lc_number != LC_FREE) + drbd_bm_mark_for_writeout(device, + al_extent_to_bm_page(e->lc_number)); + i++; + } + spin_unlock_irq(&device->al_lock); + BUG_ON(i > AL_UPDATES_PER_TRANSACTION); + + buffer->n_updates = cpu_to_be16(i); + for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { + buffer->update_slot_nr[i] = cpu_to_be16(-1); + buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); + } + + buffer->context_size = cpu_to_be16(device->act_log->nr_elements); + buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); + + mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + device->act_log->nr_elements - device->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = device->al_tr_cycle + i; + extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; + buffer->context[i] = cpu_to_be32(extent_nr); + } + for (; i < AL_CONTEXT_PER_TRANSACTION; i++) + buffer->context[i] = cpu_to_be32(LC_FREE); + + device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + if (device->al_tr_cycle >= device->act_log->nr_elements) + device->al_tr_cycle = 0; + + sector = al_tr_number_to_on_disk_sector(device); + + crc = crc32c(0, buffer, 4096); + buffer->crc32c = cpu_to_be32(crc); + + if (drbd_bm_write_hinted(device)) + err = -EIO; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } else { + device->al_tr_number++; + device->al_writ_cnt++; + } + } + } + + return err; +} + +static int al_write_transaction(struct drbd_device *device) +{ + struct al_transaction_on_disk *buffer; + int err; + + if (!get_ldev(device)) { + drbd_err(device, "disk is %s, cannot start al transaction\n", + drbd_disk_str(device->state.disk)); + return -EIO; + } + + /* The bitmap write may have failed, causing a state change. */ + if (device->state.disk < D_INCONSISTENT) { + drbd_err(device, + "disk is %s, cannot write al transaction\n", + drbd_disk_str(device->state.disk)); + put_ldev(device); + return -EIO; + } + + /* protects md_io_buffer, al_tr_cycle, ... */ + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) { + drbd_err(device, "disk failed while waiting for md_io buffer\n"); + put_ldev(device); + return -ENODEV; + } + + err = __al_write_transaction(device, buffer); + + drbd_md_put_buffer(device); + put_ldev(device); + + return err; +} + void drbd_al_begin_io_commit(struct drbd_device *device) { @@ -444,153 +599,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) wake_up(&device->al_wait); } -#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) -/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT - * are still coupled, or assume too much about their relation. - * Code below will not work if this is violated. - * Will be cleaned up with some followup patch. - */ -# error FIXME -#endif - -static unsigned int al_extent_to_bm_page(unsigned int al_enr) -{ - return al_enr >> - /* bit to page */ - ((PAGE_SHIFT + 3) - - /* al extent number to bit */ - (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); -} - -static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) -{ - const unsigned int stripes = device->ldev->md.al_stripes; - const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; - - /* transaction number, modulo on-disk ring buffer wrap around */ - unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); - - /* ... to aligned 4k on disk block */ - t = ((t % stripes) * stripe_size_4kB) + t/stripes; - - /* ... to 512 byte sector in activity log */ - t *= 8; - - /* ... plus offset to the on disk position */ - return device->ldev->md.md_offset + device->ldev->md.al_offset + t; -} - -int al_write_transaction(struct drbd_device *device) -{ - struct al_transaction_on_disk *buffer; - struct lc_element *e; - sector_t sector; - int i, mx; - unsigned extent_nr; - unsigned crc = 0; - int err = 0; - - if (!get_ldev(device)) { - drbd_err(device, "disk is %s, cannot start al transaction\n", - drbd_disk_str(device->state.disk)); - return -EIO; - } - - /* The bitmap write may have failed, causing a state change. */ - if (device->state.disk < D_INCONSISTENT) { - drbd_err(device, - "disk is %s, cannot write al transaction\n", - drbd_disk_str(device->state.disk)); - put_ldev(device); - return -EIO; - } - - /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = drbd_md_get_buffer(device, __func__); - if (!buffer) { - drbd_err(device, "disk failed while waiting for md_io buffer\n"); - put_ldev(device); - return -ENODEV; - } - - memset(buffer, 0, sizeof(*buffer)); - buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); - buffer->tr_number = cpu_to_be32(device->al_tr_number); - - i = 0; - - /* Even though no one can start to change this list - * once we set the LC_LOCKED -- from drbd_al_begin_io(), - * lc_try_lock_for_transaction() --, someone may still - * be in the process of changing it. */ - spin_lock_irq(&device->al_lock); - list_for_each_entry(e, &device->act_log->to_be_changed, list) { - if (i == AL_UPDATES_PER_TRANSACTION) { - i++; - break; - } - buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); - buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); - if (e->lc_number != LC_FREE) - drbd_bm_mark_for_writeout(device, - al_extent_to_bm_page(e->lc_number)); - i++; - } - spin_unlock_irq(&device->al_lock); - BUG_ON(i > AL_UPDATES_PER_TRANSACTION); - - buffer->n_updates = cpu_to_be16(i); - for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { - buffer->update_slot_nr[i] = cpu_to_be16(-1); - buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); - } - - buffer->context_size = cpu_to_be16(device->act_log->nr_elements); - buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); - - mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, - device->act_log->nr_elements - device->al_tr_cycle); - for (i = 0; i < mx; i++) { - unsigned idx = device->al_tr_cycle + i; - extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; - buffer->context[i] = cpu_to_be32(extent_nr); - } - for (; i < AL_CONTEXT_PER_TRANSACTION; i++) - buffer->context[i] = cpu_to_be32(LC_FREE); - - device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; - if (device->al_tr_cycle >= device->act_log->nr_elements) - device->al_tr_cycle = 0; - - sector = al_tr_number_to_on_disk_sector(device); - - crc = crc32c(0, buffer, 4096); - buffer->crc32c = cpu_to_be32(crc); - - if (drbd_bm_write_hinted(device)) - err = -EIO; - else { - bool write_al_updates; - rcu_read_lock(); - write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; - rcu_read_unlock(); - if (write_al_updates) { - if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { - err = -EIO; - drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); - } else { - device->al_tr_number++; - device->al_writ_cnt++; - } - } - } - - drbd_md_put_buffer(device); - put_ldev(device); - - return err; -} - static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) { int rv; @@ -630,21 +638,24 @@ void drbd_al_shrink(struct drbd_device *device) wake_up(&device->al_wait); } -int drbd_initialize_al(struct drbd_device *device, void *buffer) +int drbd_al_initialize(struct drbd_device *device, void *buffer) { struct al_transaction_on_disk *al = buffer; struct drbd_md *md = &device->ldev->md; - sector_t al_base = md->md_offset + md->al_offset; int al_size_4k = md->al_stripes * md->al_stripe_size_4k; int i; - memset(al, 0, 4096); - al->magic = cpu_to_be32(DRBD_AL_MAGIC); - al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); - al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); + __al_write_transaction(device, al); + /* There may or may not have been a pending transaction. */ + spin_lock_irq(&device->al_lock); + lc_committed(device->act_log); + spin_unlock_irq(&device->al_lock); - for (i = 0; i < al_size_4k; i++) { - int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); + /* The rest of the transactions will have an empty "updates" list, and + * are written out only to provide the context, and to initialize the + * on-disk ring buffer. */ + for (i = 1; i < al_size_4k; i++) { + int err = __al_write_transaction(device, al); if (err) return err; } diff --git a/drbd/drbd_debugfs.c b/drbd/drbd_debugfs.c index da50b19..df9b4a8 100644 --- a/drbd/drbd_debugfs.c +++ b/drbd/drbd_debugfs.c @@ -429,14 +429,6 @@ static int in_flight_summary_show(struct seq_file *m, void *pos) #endif -/* simple_positive(file->f_path.dentry) respectively debugfs_positive(), - * but neither is "reachable" from here. - * So we have our own inline version of it above. :-( */ -static inline int debugfs_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); -} - /* make sure at *open* time that the respective object won't go away. */ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data, struct kref *kref, @@ -454,7 +446,7 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo /* serialize with d_delete() */ mutex_lock(&parent->d_inode->i_mutex); /* Make sure the object is still alive */ - if (debugfs_positive(file->f_dentry) + if (simple_positive(file->f_dentry) && kref_get_unless_zero(kref)) ret = 0; mutex_unlock(&parent->d_inode->i_mutex); diff --git a/drbd/drbd_int.h b/drbd/drbd_int.h index 08d6648..d1e2bc0 100644 --- a/drbd/drbd_int.h +++ b/drbd/drbd_int.h @@ -36,6 +36,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -98,14 +101,10 @@ extern int fault_devs; extern char usermode_helper[]; -#include #ifndef DRBD_MAJOR # define DRBD_MAJOR 147 #endif -#include -#include - /* This is used to stop/restart our threads. * Cannot use SIGTERM nor SIGKILL, since these * are sent out by init on runlevel changes @@ -593,7 +592,6 @@ enum { MD_NO_BARRIER, /* meta data device does not support barriers, so don't even try */ - SUSPEND_IO, /* suspend application io */ BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ @@ -986,6 +984,7 @@ struct drbd_device { atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ atomic_t unacked_cnt; /* Need to send replies for */ atomic_t local_cnt; /* Waiting for local completion */ + atomic_t suspend_cnt; /* Interval tree of pending local write requests */ struct rb_root read_requests; @@ -1785,7 +1784,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s #define drbd_rs_failed_io(device, sector, size) \ __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); -extern int drbd_initialize_al(struct drbd_device *, void *); +extern int drbd_al_initialize(struct drbd_device *, void *); /* drbd_nl.c */ /* state info broadcast */ @@ -2376,7 +2375,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device) if (drbd_suspended(device)) return false; - if (test_bit(SUSPEND_IO, &device->flags)) + if (atomic_read(&device->suspend_cnt)) return false; /* to avoid potential deadlock or bitmap corruption, diff --git a/drbd/drbd_main.c b/drbd/drbd_main.c index f41aa8a..31bf43f 100644 --- a/drbd/drbd_main.c +++ b/drbd/drbd_main.c @@ -2435,7 +2435,7 @@ static void drbd_cleanup(void) * @congested_data: User data * @bdi_bits: Bits the BDI flusher thread is currently interested in * - * Returns 1<connection->flags)) { - r |= (1 << BDI_async_congested); + r |= (1 << WB_async_congested); /* Without good local data, we would need to read from remote, * and that would need the worker thread as well, which is * currently blocked waiting for that usermode helper to * finish. */ if (!get_ldev_if_state(device, D_UP_TO_DATE)) - r |= (1 << BDI_sync_congested); + r |= (1 << WB_sync_congested); else put_ldev(device); r &= bdi_bits; @@ -2475,9 +2475,9 @@ static int drbd_congested(void *congested_data, int bdi_bits) reason = 'b'; } - if (bdi_bits & (1 << BDI_async_congested) && + if (bdi_bits & (1 << WB_async_congested) && test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) { - r |= (1 << BDI_async_congested); + r |= (1 << WB_async_congested); reason = reason == 'b' ? 'a' : 'n'; } @@ -3601,7 +3601,12 @@ static int w_bitmap_io(struct drbd_work *w, int unused) struct bm_io_work *work = &device->bm_io_work; int rv = -EIO; - D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0); + if (work->flags != BM_LOCKED_CHANGE_ALLOWED) { + int cnt = atomic_read(&device->ap_bio_cnt); + if (cnt) + drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n", + cnt, work->why); + } if (get_ldev(device)) { drbd_bm_lock(device, work->why, work->flags); @@ -3659,7 +3664,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device, spin_lock_irq(&device->resource->req_lock); set_bit(BITMAP_IO, &device->flags); - if (atomic_read(&device->ap_bio_cnt) == 0) { + /* don't wait for pending application IO if the caller indicates that + * application IO does not conflict anyways. */ + if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) drbd_queue_work(&first_peer_device(device)->connection->sender_work, &device->bm_io_work.w); @@ -3679,18 +3686,20 @@ void drbd_queue_bitmap_io(struct drbd_device *device, int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), char *why, enum bm_flag flags) { + /* Only suspend io, if some operation is supposed to be locked out */ + const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST); int rv; D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); - if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + if (do_suspend_io) drbd_suspend_io(device); drbd_bm_lock(device, why, flags); rv = io_fn(device); drbd_bm_unlock(device); - if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + if (do_suspend_io) drbd_resume_io(device); return rv; diff --git a/drbd/drbd_nl.c b/drbd/drbd_nl.c index 9c14cf3..bb7e1b0 100644 --- a/drbd/drbd_nl.c +++ b/drbd/drbd_nl.c @@ -903,9 +903,11 @@ char *ppsize(char *buf, unsigned long long size) * and can be long lived. * This changes an device->flag, is triggered by drbd internals, * and should be short-lived. */ +/* It needs to be a counter, since multiple threads might + independently suspend and resume IO. */ void drbd_suspend_io(struct drbd_device *device) { - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); if (drbd_suspended(device)) return; wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); @@ -913,8 +915,8 @@ void drbd_suspend_io(struct drbd_device *device) void drbd_resume_io(struct drbd_device *device) { - clear_bit(SUSPEND_IO, &device->flags); - wake_up(&device->misc_wait); + if (atomic_dec_and_test(&device->suspend_cnt)) + wake_up(&device->misc_wait); } /** @@ -927,27 +929,32 @@ void drbd_resume_io(struct drbd_device *device) enum determine_dev_size drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) { - sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size_sect, u_size; + struct md_offsets_and_sizes { + u64 last_agreed_sect; + u64 md_offset; + s32 al_offset; + s32 bm_offset; + u32 md_size_sect; + + u32 al_stripes; + u32 al_stripe_size_4k; + } prev; + sector_t u_size, size; struct drbd_md *md = &device->ldev->md; - u32 prev_al_stripe_size_4k; - u32 prev_al_stripes; - sector_t size; char ppb[10]; void *buffer; int md_moved, la_size_changed; enum determine_dev_size rv = DS_UNCHANGED; - /* race: - * application request passes inc_ap_bio, - * but then cannot get an AL-reference. - * this function later may wait on ap_bio_cnt == 0. -> deadlock. + /* We may change the on-disk offsets of our meta data below. Lock out + * anything that may cause meta data IO, to avoid acting on incomplete + * layout changes or scribbling over meta data that is in the process + * of being moved. * - * to avoid that: - * Suspend IO right here. - * still lock the act_log to not trigger ASSERTs there. - */ + * Move is not exactly correct, btw, currently we have all our meta + * data in core memory, to "move" it we just write it all out, there + * are no reads. */ drbd_suspend_io(device); buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ if (!buffer) { @@ -955,19 +962,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct return DS_ERROR; } - /* no wait necessary anymore, actually we could assert that */ - wait_event(device->al_wait, lc_try_lock(device->act_log)); - - prev_first_sect = drbd_md_first_sector(device->ldev); - prev_size = device->ldev->md.md_size_sect; - la_size_sect = device->ldev->md.la_size_sect; + /* remember current offset and sizes */ + prev.last_agreed_sect = md->la_size_sect; + prev.md_offset = md->md_offset; + prev.al_offset = md->al_offset; + prev.bm_offset = md->bm_offset; + prev.md_size_sect = md->md_size_sect; + prev.al_stripes = md->al_stripes; + prev.al_stripe_size_4k = md->al_stripe_size_4k; if (rs) { /* rs is non NULL if we should change the AL layout only */ - - prev_al_stripes = md->al_stripes; - prev_al_stripe_size_4k = md->al_stripe_size_4k; - md->al_stripes = rs->al_stripes; md->al_stripe_size_4k = rs->al_stripe_size / 4; md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; @@ -980,7 +985,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct rcu_read_unlock(); size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); - if (size < la_size_sect) { + if (size < prev.last_agreed_sect) { if (rs && u_size == 0) { /* Remove "rs &&" later. This check should always be active, but right now the receiver expects the permissive behavior */ @@ -1001,30 +1006,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); if (unlikely(err)) { /* currently there is only one error: ENOMEM! */ - size = drbd_bm_capacity(device)>>1; + size = drbd_bm_capacity(device); if (size == 0) { drbd_err(device, "OUT OF MEMORY! " "Could not allocate bitmap!\n"); } else { drbd_err(device, "BM resizing failed. " - "Leaving size unchanged at size = %lu KB\n", - (unsigned long)size); + "Leaving size unchanged\n"); } rv = DS_ERROR; } /* racy, see comments above. */ drbd_set_my_capacity(device, size); - device->ldev->md.la_size_sect = size; + md->la_size_sect = size; drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), (unsigned long long)size>>1); } if (rv <= DS_ERROR) goto err_out; - la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); + la_size_changed = (prev.last_agreed_sect != md->la_size_sect); - md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) - || prev_size != device->ldev->md.md_size_sect; + md_moved = prev.md_offset != md->md_offset + || prev.md_size_sect != md->md_size_sect; if (la_size_changed || md_moved || rs) { u32 prev_flags; @@ -1033,20 +1037,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct * Clear the timer, to avoid scary "timer expired!" messages, * "Superblock" is written out at least twice below, anyways. */ del_timer(&device->md_sync_timer); - drbd_al_shrink(device); /* All extents inactive. */ + /* We won't change the "al-extents" setting, we just may need + * to move the on-disk location of the activity log ringbuffer. + * Lock for transaction is good enough, it may well be "dirty" + * or even "starving". */ + wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log)); + + /* mark current on-disk bitmap and activity log as unreliable */ prev_flags = md->flags; - md->flags &= ~MDF_PRIMARY_IND; + md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED; drbd_md_write(device, buffer); + drbd_al_initialize(device, buffer); + drbd_info(device, "Writing the whole bitmap, %s\n", la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, "size changed", BM_LOCKED_MASK); - drbd_initialize_al(device, buffer); + /* on-disk bitmap and activity log is authoritative again + * (unless there was an IO error meanwhile...) */ md->flags = prev_flags; drbd_md_write(device, buffer); @@ -1055,20 +1068,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct md->al_stripes, md->al_stripe_size_4k * 4); } - if (size > la_size_sect) - rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; - if (size < la_size_sect) + if (size > prev.last_agreed_sect) + rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO; + if (size < prev.last_agreed_sect) rv = DS_SHRUNK; if (0) { err_out: - if (rs) { - md->al_stripes = prev_al_stripes; - md->al_stripe_size_4k = prev_al_stripe_size_4k; - md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; - - drbd_md_set_sector_offsets(device, device->ldev); - } + /* restore previous offset and sizes */ + md->la_size_sect = prev.last_agreed_sect; + md->md_offset = prev.md_offset; + md->al_offset = prev.al_offset; + md->bm_offset = prev.bm_offset; + md->md_size_sect = prev.md_size_sect; + md->al_stripes = prev.al_stripes; + md->al_stripe_size_4k = prev.al_stripe_size_4k; + md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k; } lc_unlock(device->act_log); wake_up(&device->al_wait); @@ -2764,6 +2779,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&device->resource->conf_update); synchronize_rcu(); kfree(old_disk_conf); + new_disk_conf = NULL; } ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); @@ -2797,6 +2813,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) fail_ldev: put_ldev(device); + kfree(new_disk_conf); goto fail; } @@ -3216,8 +3233,8 @@ static void device_to_statistics(struct device_statistics *s, q = bdev_get_queue(device->ldev->backing_bdev); s->dev_lower_blocked = bdi_congested(&q->backing_dev_info, - (1 << BDI_async_congested) | - (1 << BDI_sync_congested)); + (1 << WB_async_congested) | + (1 << WB_sync_congested)); put_ldev(device); } s->dev_size = drbd_get_capacity(device->this_bdev); diff --git a/drbd/drbd_receiver.c b/drbd/drbd_receiver.c index 5e6b149..06e5667 100644 --- a/drbd/drbd_receiver.c +++ b/drbd/drbd_receiver.c @@ -673,7 +673,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection, int u ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ what = "sock_create_kern_in_try_connect"; - err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, + err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family, SOCK_STREAM, IPPROTO_TCP, &sock); if (err < 0) { sock = NULL; @@ -767,7 +767,7 @@ static struct socket *create_listen_socket(struct drbd_connection *connection, rcu_read_unlock(); what = "sock_create_kern"; - err = sock_create_kern(addr->sa_family, SOCK_STREAM, IPPROTO_TCP, &s_listen); + err = sock_create_kern(&init_net, addr->sa_family, SOCK_STREAM, IPPROTO_TCP, &s_listen); if (err) { s_listen = NULL; goto out; @@ -5173,9 +5173,11 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device) drbd_md_sync(device); - /* serialize with bitmap writeout triggered by the state change, - * if any. */ - wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + if (get_ldev(device)) { + drbd_bitmap_io(device, &drbd_bm_write_copy_pages, + "write from disconnected", BM_LOCKED_CHANGE_ALLOWED); + put_ldev(device); + } /* tcp_close and release of sendpage pages can be deferred. I don't * want to use SO_LINGER, because apparently it can be deferred for diff --git a/drbd/drbd_req.c b/drbd/drbd_req.c index 38fe40d..305fe71 100644 --- a/drbd/drbd_req.c +++ b/drbd/drbd_req.c @@ -31,73 +31,41 @@ #include "drbd_req.h" -/* We only support diskstats for 2.6.16 and up. - * see also commit commit a362357b6cd62643d4dda3b152639303d78473da - * Author: Jens Axboe - * Date: Tue Nov 1 09:26:16 2005 +0100 - * [BLOCK] Unify the separate read/write io stat fields into arrays */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) -#define _drbd_start_io_acct(...) do {} while (0) -#define _drbd_end_io_acct(...) do {} while (0) -#else static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size); +#ifndef __disk_stat_inc /* Update disk stats at start of I/O request */ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) { - const int rw = bio_data_dir(req->master_bio); -#ifndef __disk_stat_inc - int cpu; -#endif - -#ifndef COMPAT_HAVE_ATOMIC_IN_FLIGHT - spin_lock_irq(&device->resource->req_lock); -#endif - -#ifdef __disk_stat_inc - __disk_stat_inc(device->vdisk, ios[rw]); - __disk_stat_add(device->vdisk, sectors[rw], req->i.size >> 9); - disk_round_stats(device->vdisk); - device->vdisk->in_flight++; -#else - cpu = part_stat_lock(); - part_round_stats(cpu, &device->vdisk->part0); - part_stat_inc(cpu, &device->vdisk->part0, ios[rw]); - part_stat_add(cpu, &device->vdisk->part0, sectors[rw], req->i.size >> 9); - (void) cpu; /* The macro invocations above want the cpu argument, I do not like - the compiler warning about cpu only assigned but never used... */ - part_inc_in_flight(&device->vdisk->part0, rw); - part_stat_unlock(); -#endif - -#ifndef COMPAT_HAVE_ATOMIC_IN_FLIGHT - spin_unlock_irq(&device->resource->req_lock); -#endif + generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9, + &device->vdisk->part0); } /* Update disk stats when completing request upwards */ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) { - int rw = bio_data_dir(req->master_bio); + generic_end_io_acct(bio_data_dir(req->master_bio), + &device->vdisk->part0, req->start_jif); +} +#else +static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) +{ + const int rw = bio_data_dir(req->master_bio); + BUILD_BUG_ON(sizeof(atomic_t) != sizeof(device->vdisk->in_flight)); + disk_stat_inc(device->vdisk, ios[rw]); + disk_stat_add(device->vdisk, sectors[rw], req->i.size >> 9); + disk_round_stats(device->vdisk); + atomic_inc((atomic_t*)&device->vdisk->in_flight); +} +static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) +{ + const int rw = bio_data_dir(req->master_bio); unsigned long duration = jiffies - req->start_jif; -#ifndef __disk_stat_inc - int cpu; -#endif - -#ifdef __disk_stat_add - __disk_stat_add(device->vdisk, ticks[rw], duration); + disk_stat_add(device->vdisk, ticks[rw], duration); disk_round_stats(device->vdisk); - device->vdisk->in_flight--; -#else - cpu = part_stat_lock(); - part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration); - part_round_stats(cpu, &device->vdisk->part0); - part_dec_in_flight(&device->vdisk->part0, rw); - part_stat_unlock(); -#endif + atomic_dec((atomic_t*)&device->vdisk->in_flight); } - #endif static struct drbd_request *drbd_req_new(struct drbd_device *device, @@ -509,7 +477,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, atomic_add(req->i.size >> 9, &device->ap_in_flight); set_if_null_req_not_net_done(peer_device, req); } - if (s & RQ_NET_PENDING) + if (req->rq_state & RQ_NET_PENDING) set_if_null_req_ack_pending(peer_device, req); } @@ -1028,16 +996,20 @@ static void complete_conflicting_writes(struct drbd_request *req) sector_t sector = req->i.sector; int size = req->i.size; - i = drbd_find_overlap(&device->write_requests, sector, size); - if (!i) - return; - for (;;) { - prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); - i = drbd_find_overlap(&device->write_requests, sector, size); - if (!i) + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + /* Ignore, if already completed to upper layers. */ + if (i->completed) + continue; + /* Handle the first found overlap. After the schedule + * we have to restart the tree walk. */ break; + } + if (!i) /* if any */ + break; + /* Indicate to wake up device->misc_wait on progress. */ + prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); i->waiting = true; spin_unlock_irq(&device->resource->req_lock); schedule(); diff --git a/drbd/drbd_state.c b/drbd/drbd_state.c index a64cf22..4cf2c93 100644 --- a/drbd/drbd_state.c +++ b/drbd/drbd_state.c @@ -1490,7 +1490,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); /* open coded non-blocking drbd_suspend_io(device); */ - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); drbd_bm_lock(device, why, flags); rv = io_fn(device); @@ -1940,12 +1940,17 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk - * failure, or because of connection loss. + * failure, or on transition from resync back to AHEAD/BEHIND. + * + * Connection loss is handled in drbd_disconnected() by the receiver. + * * For resync aborted because of local disk failure, we cannot do * any bitmap writeout anymore. + * * No harm done if some bits change during this phase. */ - if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) { + if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) && + (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) { drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); put_ldev(device); diff --git a/drbd/drbd_wrappers.h b/drbd/drbd_wrappers.h index ea2a1fe..d7a4138 100644 --- a/drbd/drbd_wrappers.h +++ b/drbd/drbd_wrappers.h @@ -1421,4 +1421,57 @@ do { \ } while (0) #endif +#ifndef COMPAT_HAVE_GENERIC_START_IO_ACCT +#ifndef __disk_stat_inc +static inline void generic_start_io_acct(int rw, unsigned long sectors, + struct hd_struct *part) +{ + int cpu; + BUILD_BUG_ON(sizeof(atomic_t) != sizeof(part->in_flight[0])); + + cpu = part_stat_lock(); + part_round_stats(cpu, part); + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, sectors[rw], sectors); + (void) cpu; /* The macro invocations above want the cpu argument, I do not like + the compiler warning about cpu only assigned but never used... */ + /* part_inc_in_flight(part, rw); */ + atomic_inc((atomic_t*)&part->in_flight[rw]); + part_stat_unlock(); +} + +static inline void generic_end_io_acct(int rw, struct hd_struct *part, + unsigned long start_time) +{ + unsigned long duration = jiffies - start_time; + int cpu; + + cpu = part_stat_lock(); + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + /* part_dec_in_flight(part, rw); */ + atomic_dec((atomic_t*)&part->in_flight[rw]); + part_stat_unlock(); +} +#endif /* __disk_stat_inc */ +#endif /* COMPAT_HAVE_GENERIC_START_IO_ACCT */ + + +#ifndef COMPAT_SOCK_CREATE_KERN_HAS_FIVE_PARAMETERS +#define sock_create_kern(N,F,T,P,S) sock_create_kern(F,T,P,S) +#endif + +#ifndef COMPAT_HAVE_WB_CONGESTED_ENUM +#define WB_async_congested BDI_async_congested +#define WB_sync_congested BDI_sync_congested +#endif + +#ifndef COMPAT_HAVE_SIMPLE_POSITIVE +#include +static inline int simple_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} +#endif + #endif diff --git a/drbd/linux/lru_cache.h b/drbd/linux/lru_cache.h index 98e231c..a1347c5 100644 --- a/drbd/linux/lru_cache.h +++ b/drbd/linux/lru_cache.h @@ -300,7 +300,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); extern void lc_committed(struct lru_cache *lc); struct seq_file; -extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); +extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, void (*detail) (struct seq_file *, struct lc_element *)); diff --git a/drbd/lru_cache.c b/drbd/lru_cache.c index 76308df..038c986 100644 --- a/drbd/lru_cache.c +++ b/drbd/lru_cache.c @@ -233,7 +233,7 @@ void lc_reset(struct lru_cache *lc) * @seq: the seq_file to print into * @lc: the lru cache to print statistics of */ -size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) +void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) { /* NOTE: * total calls to lc_get are @@ -242,10 +242,9 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) * progress) and "changed", when this in fact lead to an successful * update of the cache. */ - return seq_printf(seq, "\t%s: used:%u/%u " - "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", - lc->name, lc->used, lc->nr_elements, - lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); + seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", + lc->name, lc->used, lc->nr_elements, + lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); } static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) diff --git a/preamble b/preamble index 603c8ca..a230a24 100644 --- a/preamble +++ b/preamble @@ -1,5 +1,5 @@ # always require a suitable userland -Requires: drbd-utils = 8.4.5 +Requires: drbd-utils >= 8.9.2 %if %{defined suse_kernel_module_package} %if 0%{?sles_version} == 10