Compare commits


490 Commits

Author SHA1 Message Date
Gerd Hoffmann
94ef4f337f vga: add sr_vbe register set
Commit "fd3c136 vga: make sure vga register setup for vbe stays intact
(CVE-2016-3712)." causes a regression.  The win7 installer is unhappy
because it can't freely modify vga registers any more while in vbe mode.

This patch introduces a new sr_vbe register set.  vbe_update_vgaregs()
will fill sr_vbe[] instead of sr[].  Normal vga register reads and
writes go to sr[].  Any sr register read access happens through a new
sr() helper function which reads from sr_vbe[] when vbe is active and
from sr[] otherwise.

This way we can allow guests to update the sr[] registers as they want,
without allowing them to disrupt vbe video modes.

Cc: qemu-stable@nongnu.org
Reported-by: Thomas Lamprecht <thomas@lamprecht.org>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 1463475294-14119-1-git-send-email-kraxel@redhat.com
2016-05-23 14:28:25 +02:00
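
A minimal standalone sketch of the sr() dispatch described in the commit
above; the struct and field names are simplified stand-ins for
illustration, not the actual QEMU vga code:

    /* Sketch only: simplified stand-in types, not QEMU's VGACommonState. */
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct VgaSketch {
        uint8_t sr[8];       /* sequencer registers as written by the guest */
        uint8_t sr_vbe[8];   /* register values required by the active vbe mode */
        bool vbe_active;
    } VgaSketch;

    /* All internal sequencer-register reads go through this helper, so the
     * guest may freely change sr[] without disturbing an active vbe mode. */
    static uint8_t sr(const VgaSketch *s, int idx)
    {
        return s->vbe_active ? s->sr_vbe[idx] : s->sr[idx];
    }
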
Marc-André Lureau
6b860806c0 virtio-gpu: fix ui idx check
Fix off-by-one value check (0 is the first scanout).

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 1463653560-26958-7-git-send-email-marcandre.lureau@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-23 13:30:03 +02:00
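
A hedged sketch of the bug class fixed above: scanout indices are
zero-based, so index == num_scanouts is already out of range. The names
below are illustrative, not the actual virtio-gpu code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Buggy variant: accepts idx == num_scanouts, one past the last scanout. */
    static bool scanout_idx_valid_buggy(uint32_t idx, uint32_t num_scanouts)
    {
        return idx <= num_scanouts;
    }

    /* Fixed variant: 0 is the first scanout, num_scanouts - 1 the last. */
    static bool scanout_idx_valid_fixed(uint32_t idx, uint32_t num_scanouts)
    {
        return idx < num_scanouts;
    }

    int main(void)
    {
        printf("buggy=%d fixed=%d\n",
               scanout_idx_valid_buggy(1, 1), scanout_idx_valid_fixed(1, 1));
        return 0;
    }
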
Marc-André Lureau
acfc484650 virtio-gpu: use VIRTIO_GPU_MAX_SCANOUTS
The value is defined in virtio_gpu.h already (changing from 4 to 16).

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 1463653560-26958-6-git-send-email-marcandre.lureau@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-23 13:30:03 +02:00
Marc-André Lureau
2fe760554e virtio-gpu: check max_outputs only
The scanout id should not be above the configured num_scanouts.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 1463653560-26958-5-git-send-email-marcandre.lureau@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-23 13:30:03 +02:00
Marc-André Lureau
5e3d741c6a virtio-gpu: check max_outputs value
The value must be less than VIRTIO_GPU_MAX_SCANOUTS.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 1463653560-26958-4-git-send-email-marcandre.lureau@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-23 13:30:03 +02:00
Marc-André Lureau
d0f0c8654a virtio-vga: propagate on gpu realized error
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 1463653560-26958-3-git-send-email-marcandre.lureau@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-23 13:30:03 +02:00
Marc-André Lureau
fe89fdebca virtio-gpu: check early scanout id
Check the scanout id before accessing the g->scanout array, in order to
avoid potential out-of-bounds access.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-id: 1463653560-26958-2-git-send-email-marcandre.lureau@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-23 13:30:03 +02:00
Peter Maydell
e081c24d30 Merge remote-tracking branch 'remotes/ehabkost/tags/machine-pull-request' into staging
Machine Core queue, 2016-05-20

# gpg: Signature made Fri 20 May 2016 21:26:49 BST using RSA key ID 984DC5A6
# gpg: Good signature from "Eduardo Habkost <ehabkost@redhat.com>"

* remotes/ehabkost/tags/machine-pull-request: (21 commits)
  Use &error_fatal when initializing crypto on qemu-{img,io,nbd}
  vl: Use &error_fatal when parsing monitor options
  vl: Use &error_fatal when parsing VNC options
  machine: add properties to compat_props incrementally
  vl: Simplify global property registration
  vl: Make display_remote a local variable
  vl: Move DisplayType typedef to vl.c
  vl: Make display_type a local variable
  vl: Replace DT_NOGRAPHIC with machine option
  milkymist: Move DT_NOGRAPHIC check outside milkymist_tmu2_create()
  spice: Initialization stubs on qemu-spice.h
  gtk: Initialization stubs
  cocoa: cocoa_display_init() stub
  sdl: Initialization stubs
  curses: curses_display_init() stub
  vnc: Initialization stubs
  vl: Add DT_COCOA DisplayType value
  vl: Replace *_vga_available() functions with class_names field
  vl: Table-based select_vgahw()
  vl: Use exit(1) when requested VGA interface is unavailable
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-23 10:30:41 +01:00
Eduardo Habkost
e8f2d2722e Use &error_fatal when initializing crypto on qemu-{img,io,nbd}
In addition to making the code simpler, this will replace the
long error messages:
  cannot initialize crypto: Unable to initialize GNUTLS library: [...]
  cannot initialize crypto: Unable to initialize gcrypt
with shorter messages:
  Unable to initialize GNUTLS library: [...]
  Unable to initialize gcrypt

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:55 -03:00
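
For illustration, a self-contained sketch of the &error_fatal pattern the
series above relies on: a sentinel Error ** makes the callee print the
message and exit, so callers can drop their local Error handling and the
extra message prefix. The types and the crypto_init() stub are simplified
stand-ins, not QEMU's error or crypto APIs:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct Error { const char *msg; } Error;

    static Error fatal_marker;
    static Error *error_fatal = &fatal_marker;   /* sentinel object */

    static void error_setg(Error **errp, const char *msg)
    {
        if (errp == &error_fatal) {
            fprintf(stderr, "%s\n", msg);        /* no extra prefix */
            exit(1);
        }
        if (errp) {
            static Error err;
            err.msg = msg;
            *errp = &err;
        }
    }

    static int crypto_init(Error **errp)         /* stand-in for the real init */
    {
        error_setg(errp, "Unable to initialize GNUTLS library: ...");
        return -1;
    }

    int main(void)
    {
        /* Before: a local Error *, a manual "cannot initialize crypto: "
         * report and an exit(1) in every caller.  After: */
        crypto_init(&error_fatal);
        return 0;
    }
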
Eduardo Habkost
822ac12df0 vl: Use &error_fatal when parsing monitor options
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:55 -03:00
Eduardo Habkost
7b1ee0f2b7 vl: Use &error_fatal when parsing VNC options
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:55 -03:00
Igor Mammedov
bacc344c54 machine: add properties to compat_props incrementally
Switch to adding compat properties incrementally instead of
completely overwriting compat_props per machine type.
That removes the data duplication which we have due to nested
[PC|SPAPR]_COMPAT_* macros.

It also allows setting default device properties from the
default foo_machine_options() hook, which will be used
in a following patch for putting the VMGENID device as
a function of the ISA bridge on pc/q35 machines.

Suggested-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
[ehabkost: Fixed CCW_COMPAT_* and PC_COMPAT_0_* defines]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:54 -03:00
Eduardo Habkost
16714b1680 vl: Simplify global property registration
There's no need to use qdev_prop_register_global_list() and an
array, if we are registering a single GlobalProperty struct. Use
qdev_prop_register_global() instead.

Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:54 -03:00
Eduardo Habkost
1f0dfe02d4 vl: Make display_remote a local variable
The variable is used only inside main(), so it can be local.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:54 -03:00
Eduardo Habkost
0cb48c4678 vl: Move DisplayType typedef to vl.c
Now the type is only used inside vl.c and doesn't need to be in a
header file.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:54 -03:00
Eduardo Habkost
d29345d011 vl: Make display_type a local variable
Now display_type is only used inside main() and doesn't need to be a
global variable.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:54 -03:00
Eduardo Habkost
cfc58cf373 vl: Replace DT_NOGRAPHIC with machine option
All DisplayType values are just UI options that don't affect any
hardware emulation code, except for DT_NOGRAPHIC. Replace
DT_NOGRAPHIC with DT_NONE plus a new "-machine graphics=on|off"
option, so hardware emulation code doesn't need to use the
display_type variable.

Cc: Michael Walle <michael@walle.cc>
Cc: Blue Swirl <blauwirbel@gmail.com>
Cc: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:54 -03:00
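
For illustration, the new switch introduced above would be used roughly
like this (the display option, disk image name and other arguments are
arbitrary placeholders):

    qemu-system-x86_64 -display none -machine graphics=off disk.img
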
Eduardo Habkost
cf3dc71eb5 milkymist: Move DT_NOGRAPHIC check outside milkymist_tmu2_create()
DT_NOGRAPHIC handling will be moved to a MachineState field, and
it will be easier to change milkymist_init() to check that field.

Cc: Michael Walle <michael@walle.cc>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:54 -03:00
Eduardo Habkost
6f0c894c25 spice: Initialization stubs on qemu-spice.h
This reduces the number of CONFIG_SPICE #ifdefs in vl.c.

Cc: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:53 -03:00
Eduardo Habkost
19a2c6269f gtk: Initialization stubs
This reduces the number of CONFIG_GTK #ifdefs in vl.c.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:53 -03:00
Eduardo Habkost
e35ee7c1aa cocoa: cocoa_display_init() stub
One less #ifdef in vl.c.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:53 -03:00
Eduardo Habkost
476db0814d sdl: Initialization stubs
This reduces the number of CONFIG_SDL #ifdefs in vl.c.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:53 -03:00
Eduardo Habkost
674ec68693 curses: curses_display_init() stub
One less #ifdef in vl.c.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:53 -03:00
Eduardo Habkost
f8c75b2486 vnc: Initialization stubs
This reduces the number of CONFIG_VNC #ifdefs in the vl.c code.

The only user-visible difference is that this will make QEMU
complain about syntax when using "-display vnc" ("VNC requires a
display argument vnc=<display>") even if CONFIG_VNC is disabled.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:52 -03:00
Eduardo Habkost
7b7d2be50c vl: Add DT_COCOA DisplayType value
Instead of reusing DT_SDL for Cocoa, use DT_COCOA to indicate
that a Cocoa display was requested.

configure already ensures CONFIG_COCOA and CONFIG_SDL are never
set at the same time. The only case where DT_SDL is used outside
a #ifdef CONFIG_SDL block is in the no_frame/alt_grab/ctrl_grab
check. That means the only user-visible change is that we will
start printing a warning if the SDL-specific options are used in
Cocoa mode. This is a bugfix, because no_frame/alt_grab/ctrl_grab
are not used by Cocoa code.

Cc: Andreas Färber <andreas.faerber@web.de>
Cc: Peter Maydell <peter.maydell@linaro.org>
Acked-by: Andreas Färber <andreas.faerber@web.de>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:52 -03:00
Eduardo Habkost
c2c7b22db1 vl: Replace *_vga_available() functions with class_names field
Instead of requiring a separate function for each VGA interface,
just enumerate the corresponding class names on struct
VGAInterfaceInfo.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:52 -03:00
Eduardo Habkost
8c9a2b71de vl: Table-based select_vgahw()
Instead of implementing separate check functions for each vga
interface type, add a table enumerating the possible VGA
interfaces.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:52 -03:00
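
A standalone sketch of the table-driven selection described in the two
commits above: each entry lists the device class names that implement a
VGA interface, and availability becomes a simple lookup instead of a
per-interface helper function. The names and the availability stub are
illustrative, not QEMU's VGAInterfaceInfo:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct VgaInterfaceSketch {
        const char *opt_name;         /* name given on the command line */
        const char *class_names[2];   /* device classes implementing it */
    } VgaInterfaceSketch;

    static const VgaInterfaceSketch vga_interfaces[] = {
        { "std",    { "VGA", "isa-vga" } },
        { "cirrus", { "cirrus-vga", "isa-cirrus-vga" } },
        { "qxl",    { "qxl-vga", NULL } },
    };

    /* Stand-in for "is this device class compiled into the binary?" */
    static int class_available(const char *name)
    {
        return name != NULL && strcmp(name, "qxl-vga") != 0;
    }

    static void select_vgahw_sketch(const char *opt)
    {
        for (size_t i = 0; i < sizeof(vga_interfaces) / sizeof(vga_interfaces[0]); i++) {
            const VgaInterfaceSketch *ti = &vga_interfaces[i];
            if (strcmp(opt, ti->opt_name) != 0) {
                continue;
            }
            for (int j = 0; j < 2 && ti->class_names[j]; j++) {
                if (class_available(ti->class_names[j])) {
                    printf("using %s\n", ti->class_names[j]);
                    return;
                }
            }
            fprintf(stderr, "%s interface not available\n", opt);
            exit(1);   /* an error, so not exit(0) */
        }
        fprintf(stderr, "unknown vga interface '%s'\n", opt);
        exit(1);
    }

    int main(void)
    {
        select_vgahw_sketch("cirrus");
        return 0;
    }
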
Eduardo Habkost
4aeae8768a vl: Use exit(1) when requested VGA interface is unavailable
Instead of using exit(0), use exit(1) when an unavailable VGA
interface is requested on the command line, to indicate that it's an error.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:52 -03:00
Cao jin
07fcd59de6 pc-dimm: correct comment of MemoryHotplugState
Correct the comment and remove an unused macro; commit adcb4ee6
already corrected its type.

Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-05-20 14:28:52 -03:00
Paolo Bonzini
65603e2fc1 tci: do not include exec/exec-all.h
TCI does not need the runtime definition in exec-all.h.  It only needs the
host-side definitions in tcg/tcg.h.  Now that cpu.h is not included
everywhere, this caused a failure because exec-all.h does need cpu.h
but does not include it itself.

Fix by including the intended header.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1463745452-25831-1-git-send-email-pbonzini@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-20 15:07:46 +01:00
Paolo Bonzini
22b31af26f aspeed: include qemu/log.h
This is not visible with the default "log" trace backend.  With other
backends, however, trace.h does not include qemu/log.h, resulting in
build failures.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1463745452-25831-2-git-send-email-pbonzini@redhat.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-20 13:09:22 +01:00
Peter Maydell
6bd8ab6889 Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches

# gpg: Signature made Thu 19 May 2016 16:09:27 BST using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"

* remotes/kevin/tags/for-upstream: (31 commits)
  qemu-iotests: Fix regression in 136 on aio_read invalid
  qemu-iotests: Simplify 109 with unaligned qemu-img compare
  qemu-io: Fix recent UI updates
  block: clarify error message for qmp-eject
  qemu-iotests: Some more write_zeroes tests
  qcow2: Fix write_zeroes with partially allocated backing file cluster
  qcow2: fix condition in is_zero_cluster
  block: Propagate AioContext change to all children
  block: Remove BlockDriverState.blk
  block: Don't return throttling info in query-named-block-nodes
  block: Avoid bs->blk in bdrv_next()
  block: Add bdrv_has_blk()
  block: Remove bdrv_aio_multiwrite()
  blockjob: Don't touch BDS iostatus
  blockjob: Don't set iostatus of target
  block: Use BdrvChild callback for device name
  block: Use BdrvChild callbacks for change_media/resize
  block: Don't check throttled reqs in bdrv_requests_pending()
  Revert "block: Forbid I/O throttling on nodes with multiple parents for 2.6"
  block: Remove bdrv_move_feature_fields()
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-19 16:54:12 +01:00
Kevin Wolf
7753da2351 Merge remote-tracking branch 'mreitz/tags/pull-block-for-kevin-2016-05-19' into queue-block
Block patches

# gpg: Signature made Thu May 19 16:58:53 2016 CEST using RSA key ID E838ACAD
# gpg: Good signature from "Max Reitz <mreitz@redhat.com>"

* mreitz/tags/pull-block-for-kevin-2016-05-19:
  qemu-iotests: Fix regression in 136 on aio_read invalid
  qemu-iotests: Simplify 109 with unaligned qemu-img compare
  qemu-io: Fix recent UI updates

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-19 16:59:46 +02:00
Eric Blake
37546ff28f qemu-iotests: Fix regression in 136 on aio_read invalid
Commit 093ea232 removed the ability for aio_read and aio_write
to artificially inflate the invalid statistics counters for
block devices, since it no longer flags unaligned offset or
length.  Add 'aio_read -i' and 'aio_write -i' to restore
the ability, and update test 136 to use it.

Reported-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 1463416983-28318-4-git-send-email-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:56:58 +02:00
Eric Blake
9e28bb26c2 qemu-iotests: Simplify 109 with unaligned qemu-img compare
For some time now, qemu-img compare has been able to compare
unaligned images.  So we no longer need test 109's hack of
resizing to sector boundaries before invoking compare.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 1463416983-28318-3-git-send-email-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:56:58 +02:00
Eric Blake
4ca1d3401b qemu-io: Fix recent UI updates
Commit 770e0e0e [*] tried to add 'writev -f', but didn't tweak
the getopt() call to actually let it work.  Likewise, commit
c2e001c missed implementing 'aio_write -u -z'.  The latter commit
also introduced a leak of ctx.

[*] does it sound "ech0e" in here? :)

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 1463416983-28318-2-git-send-email-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:56:58 +02:00
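
A self-contained sketch of the getopt() pitfall fixed above: handling a
new option letter in the switch is not enough, the letter also has to be
added to the optstring or getopt() will never return it. The options here
are arbitrary examples, not qemu-io's:

    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        int c;

        /* 'f' must appear in the optstring below as well as in the switch;
         * leaving the optstring at "qz" is exactly the class of bug fixed. */
        while ((c = getopt(argc, argv, "fqz")) != -1) {
            switch (c) {
            case 'f':
            case 'q':
            case 'z':
                printf("-%c seen\n", c);
                break;
            default:
                return 1;
            }
        }
        return 0;
    }
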
Peter Maydell
776efef324 Merge remote-tracking branch 'remotes/bonzini/tags/for-upstream' into staging
NEED_CPU_H cleanups, big enough to deserve their own pull request.

# gpg: Signature made Thu 19 May 2016 15:42:37 BST using RSA key ID 78C7AE83
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>"
# gpg:                 aka "Paolo Bonzini <pbonzini@redhat.com>"

* remotes/bonzini/tags/for-upstream: (52 commits)
  hw: clean up hw/hw.h includes
  hw: remove pio_addr_t
  cpu: move exec-all.h inclusion out of cpu.h
  exec: extract exec/tb-context.h
  hw: explicitly include qemu/log.h
  mips: move CP0 functions out of cpu.h
  arm: move arm_log_exception into .c file
  qemu-common: push cpu.h inclusion out of qemu-common.h
  acpi: do not use TARGET_PAGE_SIZE
  s390x: reorganize CSS bits between cpu.h and other headers
  dma: do not depend on kvm_enabled()
  gdbstub: remove unnecessary includes from gdbstub-xml.c
  qemu-common: stop including qemu/host-utils.h from qemu-common.h
  qemu-common: stop including qemu/bswap.h from qemu-common.h
  cpu: move endian-dependent load/store functions to cpu-all.h
  hw: cannot include hw/hw.h from user emulation
  hw: move CPU state serialization to migration/cpu.h
  hw: do not use VMSTATE_*TL
  include: poison symbols in osdep.h
  apic: move target-dependent definitions to cpu.h
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-19 15:55:08 +01:00
John Snow
3a3086b72a block: clarify error message for qmp-eject
If you use HMP's eject but the CDROM tray is locked, you may get a
confusing error message informing you that the "tray isn't open."

As this is the point of eject, we can do a little better and help
clarify that the tray was locked and that it (might) open up later,
so try again.

It's not ideal, but it makes the semantics of the (legacy) eject
command more understandable to end users when they try to use it.

Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
1ef7d01021 qemu-iotests: Some more write_zeroes tests
This covers some more write_zeroes cases which are relevant for the
recent qcow2 optimisations that check the allocation status of the
backing file for partial cluster write_zeroes requests.

This needs to be separate from 034 because we can only support qcow2 in
this test case for multiple reasons: We check the allocation status
after write_zeroes with 'qemu-img map' and the optimised behaviour that
produces zero clusters is only implemented in qcow2; second, the map
command returns offsets that are qcow2 specific; and finally, we also
use 512 byte clusters which aren't supported for formats like qed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
5efdf53227 qcow2: Fix write_zeroes with partially allocated backing file cluster
In order to correctly check whether a given cluster is read as zero, we
don't only need to check whether bdrv_get_block_status_above() sets
BDRV_BLOCK_ZERO, but also if all sectors for the whole cluster have the
same status.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
2016-05-19 16:45:31 +02:00
Denis V. Lunev
f575f145f4 qcow2: fix condition in is_zero_cluster
We should check for (res & BDRV_BLOCK_ZERO) only. The situation where we
have !(res & BDRV_BLOCK_DATA) but not BDRV_BLOCK_ZERO is not possible
for images with bdi.unallocated_blocks_are_zero == true.

For those images where it's false, however, it can happen, and we must
not consider the data zeroed then or we would corrupt the image.

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-19 16:45:31 +02:00
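
A standalone sketch of the condition change described above, with
stand-in flag names instead of the real BDRV_BLOCK_* bits:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BLOCK_DATA_SKETCH (1u << 0)
    #define BLOCK_ZERO_SKETCH (1u << 1)

    /* Old test: "no data reported" was also treated as "reads as zero",
     * which only holds when unallocated blocks are known to be zero. */
    static bool is_zero_old(uint32_t res)
    {
        return (res & BLOCK_ZERO_SKETCH) || !(res & BLOCK_DATA_SKETCH);
    }

    /* Fixed test: trust only the explicit zero flag. */
    static bool is_zero_fixed(uint32_t res)
    {
        return res & BLOCK_ZERO_SKETCH;
    }

    int main(void)
    {
        uint32_t res = 0;   /* neither DATA nor ZERO reported */
        printf("old=%d fixed=%d\n", is_zero_old(res), is_zero_fixed(res));
        return 0;
    }
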
Max Reitz
b97511c7bc block: Propagate AioContext change to all children
Instead of propagating any change of a BDS's AioContext only to its file
and backing children and letting driver-specific code do the rest, just
propagate it to all and drop the thus superfluous implementations of
bdrv_{at,de}tach_aio_context() in Quorum, blkverify and VMDK.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
1f0c461b82 block: Remove BlockDriverState.blk
This patch removes the remaining users of bs->blk, which will allow us
to have multiple BBs on top of a single BDS. In the meantime, all checks
that are currently in place to prevent the user from creating such
setups can be switched to bdrv_has_blk() instead of accessing BDS.blk.

Future patches can allow them and e.g. enable users to mirror to a block
device that already has a BlockBackend on it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
79c719b755 block: Don't return throttling info in query-named-block-nodes
query-named-block-nodes should not return information that is related
to the attached BlockBackend rather than the node itself, so throttling
information needs to be removed from it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
7c8eece45b block: Avoid bs->blk in bdrv_next()
We need to introduce a separate BdrvNextIterator struct that can keep
more state than just the current BDS in order to avoid using the bs->blk
pointer.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
dde33812a8 block: Add bdrv_has_blk()
In many cases we just want to know whether a BDS has at least one BB
attached, without needing to know the exact BB that is attached. In
contrast to bs->blk, this is still a valid question when more than one
BB can be attached, so just answer it by checking the parents list.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:45:31 +02:00
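
A minimal sketch of the idea behind bdrv_has_blk() as described above:
instead of a single bs->blk back pointer, walk the node's parent list and
check whether any parent is a BlockBackend. The types are simplified
stand-ins for the QEMU structures:

    #include <stdbool.h>
    #include <stddef.h>

    typedef struct ParentSketch {
        bool is_block_backend;
        struct ParentSketch *next;
    } ParentSketch;

    typedef struct NodeSketch {
        ParentSketch *parents;   /* replaces the old single back pointer */
    } NodeSketch;

    static bool node_has_blk(const NodeSketch *bs)
    {
        for (const ParentSketch *p = bs->parents; p != NULL; p = p->next) {
            if (p->is_block_backend) {
                return true;
            }
        }
        return false;
    }
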
Kevin Wolf
91c6e4b7bb block: Remove bdrv_aio_multiwrite()
Since virtio-blk implements request merging itself these days, the only
remaining users are test cases for the function. That doesn't make the
function exactly useful any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
66a0fae438 blockjob: Don't touch BDS iostatus
Block jobs don't actually make use of the iostatus for their BDSes, but
they manage a separate block job iostatus. Still, they require that it
is enabled for the source BDS and they enable it automatically for the
target and set the error handling mode - which ends up never being used
by the job.

This patch removes all of the BDS iostatus handling from the block job,
which removes another few bs->blk accesses.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
81e254dc83 blockjob: Don't set iostatus of target
When block job errors were introduced, we assigned the iostatus of the
target BDS "just in case". The field has never been accessible for the
user because the target isn't listed in query-block.

Before we can allow the user to have a second BlockBackend on the
target, we need to clean this up. If anything, we would want to set the
iostatus for the internal BB of the job (which we can always do later),
but certainly not for a separate BB which the job doesn't even use.

As a nice side effect, this gets us rid of another bs->blk use.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
4c265bf9f4 block: Use BdrvChild callback for device name
In order to get rid of bs->blk for bdrv_get_device_name() and
bdrv_get_device_or_node_name(), ask all parents for their name and
simply pick the first one.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
5c8cab4808 block: Use BdrvChild callbacks for change_media/resize
We want to get rid of BlockDriverState.blk in order to allow multiple
BlockBackends per BDS. Converting the device callbacks in block.c (which
assume a single BlockBackend) to per-child callbacks gets us rid of the
first few instances.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
cbe1beb7a1 block: Don't check throttled reqs in bdrv_requests_pending()
Checking whether there are throttled requests requires going to the
associated BlockBackend, which we want to avoid.

All users of bdrv_requests_pending() in block/io.c already call
bdrv_parent_drained_begin() first, which restarts all throttled
requests, so no throttled requests can be left here and this is removal
of dead code.

The remaining users (assertions during graph manipulation in block.c)
don't care about requests that are still queued in the BlockBackend and
haven't been issued for a BlockDriverState yet.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
b26ded9a7d Revert "block: Forbid I/O throttling on nodes with multiple parents for 2.6"
This reverts commit 76b223200e.

Now that I/O throttling is fully done on the BlockBackend level, there
is no reason any more to block I/O throttling for nodes with multiple
parents as the parents don't influence each other any more.

Conflicts:
	block.c

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
08e83aabe4 block: Remove bdrv_move_feature_fields()
bdrv_move_feature_fields() and swap_feature_fields() are empty now, they
can be removed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:31 +02:00
Kevin Wolf
7ca7f0f6db block: Decouple throttling from BlockDriverState
This moves the throttling related part of the BDS life cycle management
to BlockBackend. The throttling group reference is now kept even when no
medium is inserted.

With this commit, throttling isn't disabled and then re-enabled any more
during graph reconfiguration. This fixes the temporary breakage of I/O
throttling when used with live snapshots or block jobs that manipulate
the graph.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:30 +02:00
Kevin Wolf
bb9aaecaf1 block/io: Quiesce parents between drained_begin/end
So far, bdrv_parent_drained_begin/end() was called for the duration of
the actual bdrv_drain() at the beginning of a drained section, but we
really should keep parents quiesced until the end of the drained
section.

This does not actually change behaviour at this point because the only
user of the .drained_begin/end BdrvChildRole callback is I/O throttling,
which already doesn't send any new requests after flushing its queue in
.drained_begin. The patch merely removes a trap for future users.

Reported-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:30 +02:00
Kevin Wolf
c2066af051 block: Drain throttling queue with BdrvChild callback
This removes the last part of I/O throttling from block/io.c and moves
it to the BlockBackend.

Instead of having knowledge about throttling inside io.c, we can call a
BdrvChild callback .drained_begin/end, which happens to drain the
throttled requests for BlockBackend parents.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:30 +02:00
Kevin Wolf
22aa8b246a block: Introduce BdrvChild.opaque
BlockBackends use it to get a back pointer from BdrvChild to
BlockBackend in any BdrvChildRole callbacks.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:30 +02:00
Kevin Wolf
97148076e8 block: Move I/O throttling configuration functions to BlockBackend
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:30 +02:00
Kevin Wolf
441565b279 block: Move actual I/O throttling to BlockBackend
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:30 +02:00
Kevin Wolf
27ccdd5259 block: Move throttling fields from BDS to BB
This patch changes where the throttling state is stored (used to be the
BlockDriverState, now it is the BlockBackend), but it doesn't actually
make it a BB level feature yet. For example, throttling is still
disabled when the BDS is detached from the BB.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:30 +02:00
Kevin Wolf
49d2165d7d block: Convert throttle_group_get_name() to BlockBackend
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:29 +02:00
Kevin Wolf
31dce3ccca block: throttle-groups: Use BlockBackend pointers internally
As a first step towards moving I/O throttling to the BlockBackend level,
this patch changes all pointers in struct ThrottleGroup from referencing
a BlockDriverState to referencing a BlockBackend.

This change is valid because we made sure that throttling can only be
enabled on BDSes which have a BB attached.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:29 +02:00
Kevin Wolf
f2cd875d54 block: Introduce BlockBackendPublic
Some features, like I/O throttling, are implemented outside
block-backend.c, but still want to keep information in BlockBackend,
e.g. list entries that allow keeping a list of BlockBackends.

In order to avoid exposing the whole struct layout in the public header
file, this patch introduces an embedded public struct where such
information can be added and a pair of functions to convert between
BlockBackend and BlockBackendPublic.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:29 +02:00
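
A self-contained sketch of the embedded-public-struct pattern described
above: outside code only sees the public part, and a container_of-style
conversion recovers the full object. The struct names and fields are
illustrative, not the actual BlockBackend layout:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    typedef struct BackendPublicSketch {
        int throttle_list_entry;          /* fields external code may use */
    } BackendPublicSketch;

    typedef struct BackendSketch {        /* full layout stays private */
        char name[16];
        BackendPublicSketch public_part;
    } BackendSketch;

    static BackendPublicSketch *backend_get_public(BackendSketch *b)
    {
        return &b->public_part;
    }

    static BackendSketch *backend_from_public(BackendPublicSketch *pub)
    {
        return (BackendSketch *)((char *)pub
                                 - offsetof(BackendSketch, public_part));
    }

    int main(void)
    {
        BackendSketch b;
        strcpy(b.name, "blk0");
        BackendPublicSketch *pub = backend_get_public(&b);
        printf("%s\n", backend_from_public(pub)->name);
        return 0;
    }
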
Kevin Wolf
a5614993d7 block: Make sure throttled BDSes always have a BB
It was already true in principle that a throttled BDS always has a BB
attached, except that the order of operations while attaching or
detaching a BDS to/from a BB wasn't careful enough.

This commit breaks graph manipulations while I/O throttling is enabled.
It would have been possible to keep things working with some temporary
hacks, but quite cumbersome, so it's not worth the hassle. We'll fix
things again in a minute.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-19 16:45:29 +02:00
Paolo Bonzini
df43d49cb8 hw: clean up hw/hw.h includes
Include qom/object.h and exec/memory.h instead of exec/ioport.h;
exec/ioport.h was almost everywhere required only for those two
includes, not for the content of the header itself.

Remove block/aio.h, everybody is already including it through
another path.

With this change, include/hw/hw.h is freed from qemu-common.h.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:30 +02:00
Paolo Bonzini
89a80e7400 hw: remove pio_addr_t
pio_addr_t is almost unused, because these days I/O ports are simply
accessed through the address space.  cpu_{in,out}[bwl] themselves are
almost unused; monitor.c and xen-hvm.c could use address_space_read/write
directly, since they have an integer size at hand.  This leaves qtest as
the only user of those functions.

On the other hand even portio_* functions use this type; the only
interesting use of pio_addr_t thus is include/hw/sysbus.h.  I guess I
could move it there, but I don't see much benefit in that either.  Using
uint32_t is enough and avoids the need to include ioport.h everywhere.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:30 +02:00
Paolo Bonzini
63c915526d cpu: move exec-all.h inclusion out of cpu.h
exec-all.h contains TCG-specific definitions.  It is not needed outside
TCG-specific files such as translate.c, exec.c or *helper.c.

One generic function had snuck into include/exec/exec-all.h; move it to
include/qom/cpu.h.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:29 +02:00
Paolo Bonzini
00f6da6a1a exec: extract exec/tb-context.h
TCG backends do not need most of exec-all.h; extract what they actually
need to a separate file or move it directly to tcg.h.  The next patch
will stop including exec-all.h from everywhere.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:29 +02:00
Paolo Bonzini
03dd024ff5 hw: explicitly include qemu/log.h
Move the inclusion out of hw/hw.h, most files do not need it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:29 +02:00
Paolo Bonzini
e6623d88f4 mips: move CP0 functions out of cpu.h
These are here for historical reasons: they are needed from both gdbstub.c
and op_helper.c, and the latter used to be compiled with a fixed AREG0.  That
is not needed anymore, so uninline them.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:29 +02:00
Paolo Bonzini
27a7ea8a1f arm: move arm_log_exception into .c file
Avoid need for qemu/log.h inclusion, and make the function static too.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:29 +02:00
Paolo Bonzini
33c11879fd qemu-common: push cpu.h inclusion out of qemu-common.h
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:29 +02:00
Paolo Bonzini
35c5a52d1d acpi: do not use TARGET_PAGE_SIZE
This is a #define used by the CPU.  NVDIMM can just use 4K
unconditionally.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
bd3f16ac30 s390x: reorganize CSS bits between cpu.h and other headers
Move cpu_inject_* to the only C file where they are used.

Move ioinst.h declarations that need S390CPU to cpu.h, to make
ioinst.h independent of cpu.h.

Move channel declarations that only need SubchDev from cpu.h
to css.h, to make more channel users independent of cpu.h.

Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
77ac58ddc6 dma: do not depend on kvm_enabled()
Memory barriers are also needed by Xen and, once the ioeventfd
bugs are fixed, by TCG as well.

sysemu/kvm.h is no longer needed in sysemu/dma.h; move it to
the actual users.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
da16384560 gdbstub: remove unnecessary includes from gdbstub-xml.c
gdbstub-xml.c defines a bunch of arrays of strings; there is no
need to include anything.  Keep osdep.h for consistency, but remove
the rest.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
87776ab72b qemu-common: stop including qemu/host-utils.h from qemu-common.h
Move it to the actual users.  There are some inclusions of
qemu/host-utils.h in headers, but they are all necessary.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
58369e22cf qemu-common: stop including qemu/bswap.h from qemu-common.h
Move it to the actual users.  There are still a few includes of
qemu/bswap.h in headers; removing them is left for future work.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
a7d6039cb3 cpu: move endian-dependent load/store functions to cpu-all.h
Disentangle cpu-common.h and memory.h from NEED_CPU_H.  Prototypes are
not defined for !NEED_CPU_H, so remove them from poison.h too.  Only
macros need poisoning.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
741da0d38b hw: cannot include hw/hw.h from user emulation
All qdev definitions are available from other headers, user-mode
emulation does not need hw/hw.h.

By considering system emulation only, it is simpler to disentangle
hw/hw.h from NEED_CPU_H.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
1e00b8d57a hw: move CPU state serialization to migration/cpu.h
Remove usage of NEED_CPU_H from hw/hw.h.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
cbd62f8616 hw: do not use VMSTATE_*TL
Reserve these for CPU state serialization.

Luckily, they were only used by sPAPR devices, and those are ppc64
only.  So there is no change to the migration format.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
bdd902277c include: poison symbols in osdep.h
Ensure that all target-independent files ignore poisoned symbols,
and fix the fallout.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
d613f8cc33 apic: move target-dependent definitions to cpu.h
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:28 +02:00
Paolo Bonzini
e81096b1c8 explicitly include linux/kvm.h
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
3b3d264888 explicitly include hw/qdev-core.h
exec/cpu-all.h includes qom/cpu.h, which includes hw/qdev-core.h.
Explicit inclusion will keep things working when cpu.h will not be
included indirectly almost everywhere (either directly or through
qemu-common.h).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
7d0c99a9d8 explicitly include qom/cpu.h
exec/cpu-all.h includes qom/cpu.h.  Explicit inclusion
will keep things working when cpu.h will not be included
indirectly almost everywhere (either directly or through
qemu-common.h).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
8ea952d679 arm: remove useless cpu.h inclusion
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
aa5a9e2484 ppc: use PowerPCCPU instead of CPUPPCState
This changes a cpu.h dependency for hw/ppc/ppc.h into a cpu-qom.h
dependency.  For it to compile we also need to clean up a few unused
definitions.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
5a975d435a mips: use MIPSCPU instead of CPUMIPSState
This changes a cpu.h dependency into a cpu-qom.h dependency.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
0774831d08 alpha: include cpu-qom.h in files that require AlphaCPU
This will keep things working when cpu.h will not be included
indirectly almost everywhere (either directly or through
qemu-common.h).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
b4c1c6fc61 sh4: include cpu-qom.h in files that require SuperHCPU
This will keep things working when cpu.h will not be included
indirectly almost everywhere (either directly or through
qemu-common.h).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
4669fcc7fa m68k: include cpu-qom.h in files that require M68KCPU
This will keep things working when cpu.h will not be included
indirectly almost everywhere (either directly or through
qemu-common.h).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
16fd646182 arm: include cpu-qom.h in files that require ARMCPU
This will keep things working when cpu.h will not be included
indirectly almost everywhere (either directly or through
qemu-common.h).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:27 +02:00
Paolo Bonzini
da37426169 target-xtensa: make cpu-qom.h not target specific
Make XtensaCPU an opaque type within cpu-qom.h, and move all definitions
of private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  Conversely, move all definitions needed to
define a class to cpu-qom.h.  This helps making files independent of
NEED_CPU_H if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:42:26 +02:00
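
The cpu-qom.h change above (and the per-target ones that follow) all use
the same header-split pattern; a hedged sketch, compressed into one file
for brevity, with illustrative names rather than the real xtensa
definitions:

    /* --- "cpu-qom.h" part: target independent ----------------------- */
    typedef struct ExampleCPU ExampleCPU;      /* opaque, layout hidden  */

    typedef struct ExampleCPUClass {
        /* class methods only need the opaque pointer type */
        void (*do_reset)(ExampleCPU *cpu);
    } ExampleCPUClass;

    /* --- "cpu.h" part: target specific ------------------------------ */
    struct ExampleCPU {                        /* full layout lives here */
        unsigned long regs[16];
        int halted;
    };

    /* Code that merely passes ExampleCPU pointers around can include just
     * the first part and never depends on the register layout. */
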
Paolo Bonzini
55b1142259 target-unicore32: make cpu-qom.h not target specific
Make UniCore32CPU an opaque type within cpu-qom.h, and move all
definitions of private methods, as well as all type definitions that
require knowledge of the layout to cpu.h.  This helps making files
independent of NEED_CPU_H if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:41:34 +02:00
Paolo Bonzini
fc111b107a target-tricore: make cpu-qom.h not target specific
Make TriCoreCPU an opaque type within cpu-qom.h, and move all definitions
of private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:41:34 +02:00
Paolo Bonzini
d61d1b2061 target-sparc: make cpu-qom.h not target specific
Make SPARCCPU an opaque type within cpu-qom.h, and move all definitions
of private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:41:34 +02:00
Paolo Bonzini
e6005f66f9 target-sh4: make cpu-qom.h not target specific
Make SuperHCPU an opaque type within cpu-qom.h, and move all definitions
of private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:41:34 +02:00
Paolo Bonzini
a4a02f99ff target-s390x: make cpu-qom.h not target specific
Make S390XCPU an opaque type within cpu-qom.h, and move all definitions
of private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:41:34 +02:00
Paolo Bonzini
2d34fe392c target-ppc: make cpu-qom.h not target specific
Make PowerPCCPU an opaque type within cpu-qom.h, and move all definitions
of private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  Conversely, move all definitions needed to define
a class to cpu-qom.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 16:41:33 +02:00
Paolo Bonzini
c771dabf55 target-ppc: do not make PowerPCCPUClass depend on target-specific symbols
Just leave some members in even if they are unused on e.g.
32-bit PPC or user-mode emulation.  This avoids complications
when using PowerPCCPUClass in code that is compiled just
once (because it applies to both 32-bit and 64-bit PPC
for example) but still needs to peek at PPC-specific members.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:05 +02:00
Paolo Bonzini
b2305601d3 target-ppc: do not use target_ulong in cpu-qom.h
Bring the PowerPCCPUClass handle_mmu_fault method type into line with
the one in CPUClass.

Using vaddr also makes the cpu-qom.h file target independent.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:05 +02:00
Paolo Bonzini
416bf93686 target-mips: make cpu-qom.h not target specific
Make MIPSCPU an opaque type within cpu-qom.h, and move all definitions of
private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:05 +02:00
Paolo Bonzini
ffa3a3c6c1 target-microblaze: make cpu-qom.h not target specific
Make MicroBlazeCPU an opaque type within cpu-qom.h, and move all
definitions of private methods, as well as all type definitions that
require knowledge of the layout to cpu.h.  This helps making files
independent of NEED_CPU_H if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:05 +02:00
Paolo Bonzini
a836b8fa00 target-m68k: make cpu-qom.h not target specific
Make M68KCPU an opaque type within cpu-qom.h, and move all definitions of
private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:05 +02:00
Paolo Bonzini
6adb9c5474 target-lm32: make cpu-qom.h not target specific
Make LM32CPU an opaque type within cpu-qom.h, and move all definitions of
private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:04 +02:00
Paolo Bonzini
4da6f8d954 target-i386: make cpu-qom.h not target specific
Make X86CPU an opaque type within cpu-qom.h, and move all definitions of
private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:04 +02:00
Paolo Bonzini
28618ac652 target-cris: make cpu-qom.h not target specific
Make CRISCPU an opaque type within cpu-qom.h, and move all definitions of
private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:04 +02:00
Paolo Bonzini
74e755647c target-arm: make cpu-qom.h not target specific
Make ARMCPU an opaque type within cpu-qom.h, and move all definitions of
private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:04 +02:00
Paolo Bonzini
1dc8e6b758 target-alpha: make cpu-qom.h not target specific
Make AlphaCPU an opaque type within cpu-qom.h, and move all definitions
of private methods, as well as all type definitions that require knowledge
of the layout to cpu.h.  This helps making files independent of NEED_CPU_H
if they only need to pass around CPU pointers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:04 +02:00
Paolo Bonzini
347b1a5cc6 cpu: make cpu-qom.h only include-able from cpu.h
Make cpu-qom.h so that it is only included from cpu.h.  Then there
is no need for it to include cpu.h again.

Later we will make cpu-qom.h target independent and we will _want_
to include it from elsewhere, but for now reduce the number of cases
to handle.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:04 +02:00
Paolo Bonzini
f2937a33a5 log: do not use CONFIG_USER_ONLY
This decouples logging further from config-target.h

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:04 +02:00
Paolo Bonzini
4b4629d9d2 include: move CPU-related definitions out of qemu-common.h
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:08:04 +02:00
Paolo Bonzini
b01501db18 s390x: move .needed functions for subsections to machine.c
These functions are only used when defining subsections, so move
them there.

Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 13:07:34 +02:00
Paolo Bonzini
f115a19c40 scripts: add script to build QEMU and analyze inclusions
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-05-19 12:09:28 +02:00
Peter Maydell
8ec4fe0a4b Merge remote-tracking branch 'remotes/mjt/tags/pull-trivial-patches-2016-05-18' into staging
trivial patches for 2016-05-18

# gpg: Signature made Wed 18 May 2016 13:04:43 BST using RSA key ID A4C3D7DB
# gpg: Good signature from "Michael Tokarev <mjt@tls.msk.ru>"
# gpg:                 aka "Michael Tokarev <mjt@corpit.ru>"
# gpg:                 aka "Michael Tokarev <mjt@debian.org>"

* remotes/mjt/tags/pull-trivial-patches-2016-05-18:
  Fix some typos found by codespell
  9p: drop unused declaration from coth.h
  smbios: fix typo
  accel: make configure_accelerator return void
  configure: Use uniform description for devel packages
  ipack: Update e-mail address
  util: fix comment typos
  qdict: fix unbounded stack warning for qdict_array_entries
  Fix typo in variable name (found and fixed by codespell)
  vl: fix comment about when parsing cpu definitions
  loader: fix potential memory leak
  remove comment for nonexistent structure member
  s390: remove misleading comment

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-19 09:27:28 +01:00
Stefan Weil
cb8d4c8f54 Fix some typos found by codespell
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:27 +03:00
Greg Kurz
d506dc87b9 9p: drop unused declaration from coth.h
Commit "ebac1202c95a virtio-9p: use QEMU thread pool" dropped function
v9fs_init_worker_threads.

Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:27 +03:00
Cao jin
cc2324d03d smbios: fix typo
The spec says: "on paragraph (16-byte) boundaries"

Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:27 +03:00
Wei Jiangang
bdc3f61dec accel: make configure_accelerator return void
Returning the negated value of accel_initialised is meaningless,
and the caller in vl.c doesn't check it.

Signed-off-by: Wei Jiangang <weijg.fnst@cn.fujitsu.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:27 +03:00
Stefan Weil
3f3b5388d4 configure: Use uniform description for devel packages
As all other devel packages are written in the form "name devel",
use this form for libcap devel and libattr devel, too.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:27 +03:00
Alberto Garcia
b996aed510 ipack: Update e-mail address
I'm not really using the old one anymore.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:27 +03:00
Wei Jiangang
d43eda3d19 util: fix comment typos
Signed-off-by: Wei Jiangang <weijg.fnst@cn.fujitsu.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:27 +03:00
Peter Xu
de4905f4bc qdict: fix unbounded stack warning for qdict_array_entries
Use a single g_strdup_printf() to replace the two stack-allocated
arrays; it is more convenient and safe, and the function is only called
rarely, when a quorum device is opened. This will remove the unbounded
stack warning when compiling with "-Wstack-usage=1000000".

Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:26 +03:00
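
A standalone sketch of the change described above: instead of sizing
key-name buffers on the stack (which triggers -Wstack-usage warnings),
build them on the heap with g_strdup_printf() and free them again. The
key pattern and the lookup stub are illustrative, not the actual qdict
code:

    #include <glib.h>
    #include <stdio.h>

    static gboolean entry_present(const char *key)   /* stand-in lookup */
    {
        return g_str_has_prefix(key, "children.");
    }

    static unsigned count_entries(unsigned max)
    {
        unsigned i;

        for (i = 0; i < max; i++) {
            /* before: two stack-allocated char arrays sized at run time */
            char *key = g_strdup_printf("children.%u", i);
            gboolean present = entry_present(key);

            g_free(key);
            if (!present) {
                break;
            }
        }
        return i;
    }

    int main(void)
    {
        printf("%u\n", count_entries(4));
        return 0;
    }
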
Stefan Weil
1d817db3a0 Fix typo in variable name (found and fixed by codespell)
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:26 +03:00
Wei Jiangang
37a3e630d9 vl: fix comment about when parsing cpu definitions
machine->init() was replaced with machine_class->init()
in 958db90cd5.

Signed-off-by: Wei Jiangang <weijg.fnst@cn.fujitsu.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:26 +03:00
Cao jin
ed2f3bc1fa loader: fix potential memory leak
Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:26 +03:00
Cao jin
ec609656fc remove comment for nonexistent structure member
Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:26 +03:00
Michael Tokarev
f35c1f66ad s390: remove misleading comment
The comment talks about a non-ELF object while the
example gives an ELF object.

Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2016-05-18 15:04:26 +03:00
Peter Maydell
a257c74149 Merge remote-tracking branch 'remotes/cohuck/tags/s390x-20160517' into staging
First batch of s390x patches for 2.7:
- The new machine for 2.7
- Make use of the runtime instrumentation support introduced in
  the kernel
- Enhance our ipl (boot) process: We can now start from devices
  in subchannel sets > 0 as well. As a bonus, the conversion to
  diag308 in the bios allows us to get rid of the gr7 hack.
- Xiaoqiang Zhao's SCLP qomification patches
- Several fixes in the s390x pci implementation

# gpg: Signature made Tue 17 May 2016 15:35:32 BST using RSA key ID C6F02FAF
# gpg: Good signature from "Cornelia Huck <huckc@linux.vnet.ibm.com>"
# gpg:                 aka "Cornelia Huck <cornelia.huck@de.ibm.com>"

* remotes/cohuck/tags/s390x-20160517:
  s390x/pci: remove whitespace
  s390x/pci: add length checking for pci sclp handlers
  s390x/pci: enhance mpcifc_service_call
  s390x/pci: fix s390_pci_sclp_deconfigure
  s390x/pci: introduce S390PCIBusDevice.iommu_enabled
  s390x/pci: export pci_dereg_ioat and pci_dereg_irqs
  s390x/pci: separate s390_pcihost_iommu_configure function
  s390x/pci: separate s390_sclp_configure function
  s390x/pci: fix reg_irqs()
  hw/char: QOM'ify sclpconsole.c
  hw/char: QOM'ify sclpconsole-lm.c
  s390x/ipl: Remove redundant usage of gr7
  s390-ccw.img: rebuild image
  pc-bios/s390-ccw: Get device address via diag 308/6
  s390x/ipl: Add ssid field to IplParameterBlock
  s390x/ipl: Provide ipl parameter block
  s390x/ipl: Add type and length checks for IplParameterBlock values
  s390x/ipl: Extend the IplParameterBlock struct
  s390x: enable runtime instrumentation
  s390x: add compat machine for 2.7

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-17 16:49:11 +01:00
Yi Min Zhao
c26916942a s390x/pci: remove whitespace
Fix indentation of PciCfgSccb struct.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Yi Min Zhao
3b40ea2957 s390x/pci: add length checking for pci sclp handlers
The configure/deconfigure sclp commands need a SCCB with a length of
at least 16. Indicate in the response code if this is not fulfilled.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
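
A self-contained sketch of the length check described above; the struct
and the response-code values are made-up stand-ins, not the real SCLP
definitions:

    #include <stdint.h>
    #include <stdio.h>

    #define RC_OK_SKETCH                  0x0020u   /* illustrative values */
    #define RC_INSUFFICIENT_LENGTH_SKETCH 0x0300u

    typedef struct SccbSketch {
        uint16_t length;
        /* ... command payload ... */
    } SccbSketch;

    static uint16_t configure_pci_sketch(const SccbSketch *sccb)
    {
        if (sccb->length < 16) {
            return RC_INSUFFICIENT_LENGTH_SKETCH;
        }
        /* ... perform the (de)configuration ... */
        return RC_OK_SKETCH;
    }

    int main(void)
    {
        SccbSketch s = { .length = 8 };
        printf("rc=0x%04x\n", (unsigned)configure_pci_sketch(&s));
        return 0;
    }
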
Yi Min Zhao
a6d9d4f26a s390x/pci: enhance mpcifc_service_call
Enhance error handling for mpcifc_service_call() to propagate errors
to guest by setting status codes or triggering program interrupts.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Yi Min Zhao
259a4f0a76 s390x/pci: fix s390_pci_sclp_deconfigure
When deconfiguring a s390 pci device, we should deconfigure the
corresponding IOMMU memory region and the IRQs for the device.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Yi Min Zhao
df6a050c82 s390x/pci: introduce S390PCIBusDevice.iommu_enabled
We introduce iommu_enabled field for S390PCIBusDevice struct to
track whether the iommu has been enabled for the device. This allows
us to stop temporarily changing ->configured while en/disabling the
iommu and to do conditional cleanup later.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Yi Min Zhao
e141dbadfa s390x/pci: export pci_dereg_ioat and pci_dereg_irqs
dereg_irqs and dereg_ioat are needed by external functions. Let's
rename and export both of them in s390-pci-inst.h.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Yi Min Zhao
715838881f s390x/pci: separate s390_pcihost_iommu_configure function
Split s390_pcihost_iommu_configure() into separate functions for
configuring and deconfiguring in order to make the code more readable.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Yi Min Zhao
8f5cb69313 s390x/pci: separate s390_sclp_configure function
Split s390_sclp_configure() into separate functions for sclp
configuring and deconfiguring in order to make the code more readable.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Yi Min Zhao
bac45d5147 s390x/pci: fix reg_irqs()
In reg_irqs(), the present code assumes that map_indicator() always
succeeds. Let's check its result and return the error to the caller in
order to inform the guest.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
xiaoqiang zhao
3f6ec642ae hw/char: QOM'ify sclpconsole.c
Drop the DO_UPCAST macro

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Message-Id: <1459237645-17227-7-git-send-email-zxq_yx_007@163.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
xiaoqiang zhao
e563c59b6a hw/char: QOM'ify sclpconsole-lm.c
Drop the DO_UPCAST macro

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Message-Id: <1459237645-17227-6-git-send-email-zxq_yx_007@163.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Alexander Yarygin
010d45d279 s390x/ipl: Remove redundant usage of gr7
We don't need to pass the device address to pc-bios via gr7 anymore, as
pc-bios completely relies on diag308 now, so we can remove it from
qemu. devno, ssid and cssid are migrated, but the values were never
reused, so we can safely ignore these fields and migrate 0.

Signed-off-by: Alexander Yarygin <yarygin@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Cornelia Huck
a388ac74de s390-ccw.img: rebuild image
Contains the following change:

pc-bios/s390-ccw: Get device address via diag 308/6

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Alexander Yarygin
d046c51dad pc-bios/s390-ccw: Get device address via diag 308/6
To IPL from a device, pc-bios receives a device address from qemu via
general register 7. A better way to do this is to use the diag308/6
instruction, which returns the so-called "IplParameterBlock". The
IplParameterBlock contains the device address for IPL and additional
parameters that pc-bios can use.

This patch makes pc-bios get the device address via diag308/6 and stop
using the boot information passed in gr7.

Signed-off-by: Alexander Yarygin <yarygin@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Alexander Yarygin
3041e3bead s390x/ipl: Add ssid field to IplParameterBlock
Add the ssid field to the ipl parameter block struct and fill it when
necessary so the guest can use it.

Signed-off-by: Alexander Yarygin <yarygin@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Alexander Yarygin
6aed958978 s390x/ipl: Provide ipl parameter block
Right now we return the ipl parameter block only if the guest
specified one. Let's fill in the parameter block when the bootindex
parameter is available and we are not booting from an external kernel.

Signed-off-by: Alexander Yarygin <yarygin@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Alexander Yarygin
9946a9113c s390x/ipl: Add type and length checks for IplParameterBlock values
We can check for a valid type and valid lengths of the IplParameterBlock
fields when receiving the struct from the guest.

The length of the IplParameterBlock can be less than 4K. To play it safe
we read and write only the required amount of data.

Signed-off-by: Alexander Yarygin <yarygin@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Alexander Yarygin
04ca4b92ec s390x/ipl: Extend the IplParameterBlock struct
The IplParameterBlock struct currently has only 200 bytes filled, but it
can be up to 4K.

This patch converts the struct to a union with a fully populated struct
inside it and a second struct with the old values.

For compatibility reasons we disable migration of the extended iplb
field for pre-2.7 machines. Also, a guest can still read/write only the
first 200 bytes of the IPLB for now.

Signed-off-by: Alexander Yarygin <yarygin@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Fan Zhang
9700230b0d s390x: enable runtime instrumentation
Introduce run-time-instrumentation support when running under kvm for
the virtio-ccw 2.7 machine and make sure older machines cannot enable it.

The new ri_allowed field in the s390MachineClass serves as an indicator
of whether the feature can be used by the machine and should therefore
be activated if available.

riccb_needed() is used to check whether the riccb needs to be migrated
during live migration.

Signed-off-by: Fan Zhang <zhangfan@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Cornelia Huck
946e55f3c7 s390x: add compat machine for 2.7
Also add some of the option cascading we were missing.

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
2016-05-17 15:50:29 +02:00
Peter Maydell
5a3fd960f3 Merge remote-tracking branch 'remotes/stefanha/tags/tracing-pull-request' into staging
# gpg: Signature made Tue 17 May 2016 14:06:54 BST using RSA key ID 81AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>"
# gpg:                 aka "Stefan Hajnoczi <stefanha@gmail.com>"

* remotes/stefanha/tags/tracing-pull-request:
  hw/intc/arm_gic: add tracepoints

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-17 14:07:25 +01:00
Peter Maydell
3f5e34a45c Merge remote-tracking branch 'remotes/stefanha/tags/block-pull-request' into staging
# gpg: Signature made Tue 17 May 2016 01:19:39 BST using RSA key ID 81AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>"
# gpg:                 aka "Stefan Hajnoczi <stefanha@gmail.com>"

* remotes/stefanha/tags/block-pull-request:
  rfifolock: no need to get thread identifier when nesting

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-17 10:35:50 +01:00
Peter Maydell
c98e793711 Merge remote-tracking branch 'remotes/thibault/tags/samuel-thibault' into staging
slirp updates

# gpg: Signature made Mon 16 May 2016 20:22:36 BST using RSA key ID FB6B2F1D
# gpg: Good signature from "Samuel Thibault <samuel.thibault@gnu.org>"
# gpg:                 aka "Samuel Thibault <sthibault@debian.org>"
# gpg:                 aka "Samuel Thibault <samuel.thibault@inria.fr>"
# gpg:                 aka "Samuel Thibault <samuel.thibault@labri.fr>"
# gpg:                 aka "Samuel Thibault <samuel.thibault@ens-lyon.org>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 900C B024 B679 31D4 0F82  304B D017 8C76 7D06 9EE6
#      Subkey fingerprint: F632 74CD C630 0873 CB3D  29D9 E3E5 1CE8 FB6B 2F1D

* remotes/thibault/tags/samuel-thibault:
  slirp: Clean up osdep.h related header inclusions
  slirp: Remove some unused code from slirp.h
  slirp: Remove obsolete backward-compatibility cruft
  slirp: Clean up slirp_config.h

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-17 09:16:00 +01:00
Hollis Blanchard
2531088f6c hw/intc/arm_gic: add tracepoints
These are obviously critical to understanding interrupt delivery:
gic_enable_irq
gic_disable_irq
gic_set_irq (inbound irq from device models)
gic_update_set_irq (outbound irq to CPU)
gic_acknowledge_irq

The only one that I think might raise eyebrows is gic_update_bestirq, but I've
(sadly) debugged problems that ended up being caused by unexpected priorities.
Knowing that the GIC has an irq ready, but doesn't deliver to the CPU due to
priority, has also proven important.

Signed-off-by: Hollis Blanchard <hollis_blanchard@mentor.com>
Message-id: 1461252281-22399-1-git-send-email-hollis_blanchard@mentor.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-16 17:20:41 -07:00
Changlong Xie
de3e15a705 rfifolock: no need to get thread identifier when nesting
Signed-off-by: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Message-id: 1462874348-32396-1-git-send-email-xiecl.fnst@cn.fujitsu.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-05-16 15:29:44 -07:00
Thomas Huth
9892663dc4 slirp: Clean up osdep.h related header inclusions
qemu/osdep.h is included in some headers twice - one time
should be sufficient.
Also remove the inclusion of time.h since that is already
done by osdep.h, too (this makes scripts/clean-includes
happy again).

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
2016-05-16 21:01:16 +02:00
Thomas Huth
2cdc848eb5 slirp: Remove some unused code from slirp.h
These hunks are apparently not used anymore, so let's delete them.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
2016-05-16 21:00:31 +02:00
Thomas Huth
5469feadb1 slirp: Remove obsolete backward-compatibility cruft
The slirp code does not use index() and gethostid() anymore,
so these parts can be removed without problems.
memmove() and strerror() should be available on each of the
supported platforms nowadays, too, so these wrappers are also
not needed anymore.
And we certainly do not support Ultrix anymore either, so there is
no need to keep the code for this platform.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
2016-05-16 20:58:47 +02:00
Thomas Huth
cebee21aca slirp: Clean up slirp_config.h
There are a lot of unused #defines / #undefs in slirp_config.h,
which are apparently left-overs from the very early slirp code.
Since there is no more code that uses them, let's simply remove
them from our version of slirp.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
2016-05-16 20:57:00 +02:00
Peter Maydell
70f87e0f0a Merge remote-tracking branch 'remotes/kraxel/tags/pull-ui-20160513-1' into staging
gtk/sdl build tweaks
fix gtk 3.20 warnings
gtk clipboard support
spice-gl monitor config support
fix coverity warnings

# gpg: Signature made Fri 13 May 2016 13:30:39 BST using RSA key ID D3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"

* remotes/kraxel/tags/pull-ui-20160513-1:
  gtk: don't leak the GtkBorder with VTE 0.36
  gtk: update grab code for gtk 3.20
  spice: fix coverity complaints
  egl-helpers: fix possible resource leak
  Changed malloc to g_malloc, free to g_free in ui/shader.c
  spice/gl: add & use qemu_spice_gl_monitor_config
  ui/gtk: copy to clipboard support
  ui: gtk: Fix some deprecation warnings
  ui: gtk: Fix a runtime warning on vte >= 0.37
  configure: support vte-2.91
  configure: report SDL version
  configure: report GTK version
  configure: add echo_version helper
  configure: error on unknown --with-sdlabi value
  configure: build SDL if only SDL2 available
  ui: sdl2: Release grab before opening console window
  ui: gtk: fix crash when terminal inner-border is NULL

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-13 13:39:38 +01:00
Peter Maydell
14fccfa91e Merge remote-tracking branch 'remotes/lalrae/tags/mips-20160513' into staging
MIPS patches 2016-05-13

Changes:
* fix zeroing CP0.WatchLo registers in soft reset
* QOMify Jazz led

# gpg: Signature made Fri 13 May 2016 11:04:04 BST using RSA key ID 0B29DA6B
# gpg: Good signature from "Leon Alrae <leon.alrae@imgtec.com>"

* remotes/lalrae/tags/mips-20160513:
  hw/display: QOM'ify jazz_led.c
  target-mips: fix call to memset in soft reset code

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-13 11:50:42 +01:00
Alberto Garcia
6978dc4adc gtk: don't leak the GtkBorder with VTE 0.36
When gtk_widget_style_get() is used to get the "inner-border" style
property, it returns a copy of the GtkBorder which must be freed by
the caller.

This patch also fixes a warning about the unused 'padding' structure
with VTE 0.36.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 1463127654-5171-1-git-send-email-berto@igalia.com
Cc: Cole Robinson <crobinso@redhat.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>

[ kraxel: adapted to changes in ui patch queue ]

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-13 12:40:12 +02:00
Peter Maydell
20c20318f9 Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20160512' into staging
queued 2.7 patches

# gpg: Signature made Fri 13 May 2016 01:08:20 BST using RSA key ID 4DD0279B
# gpg: Good signature from "Richard Henderson <rth7680@gmail.com>"
# gpg:                 aka "Richard Henderson <rth@redhat.com>"
# gpg:                 aka "Richard Henderson <rth@twiddle.net>"

* remotes/rth/tags/pull-tcg-20160512: (39 commits)
  cpu-exec: Clean up 'interrupt_request' reloading in cpu_handle_interrupt()
  cpu-exec: Remove unused 'x86_cpu' and 'env' from cpu_exec()
  cpu-exec: Move TB execution stuff out of cpu_exec()
  cpu-exec: Move interrupt handling out of cpu_exec()
  cpu-exec: Move exception handling out of cpu_exec()
  cpu-exec: Move halt handling out of cpu_exec()
  cpu-exec: Remove relic orphaned comment
  tcg: Remove needless CPUState::current_tb
  cpu-exec: Move TB chaining into tb_find_fast()
  tcg: Rework tb_invalidated_flag
  tcg: Clean up from 'next_tb'
  cpu-exec: elide more icount code if CONFIG_USER_ONLY
  tcg: reorganize tb_find_physical loop
  tcg: code_bitmap and code_write_count are not used by user-mode emulation
  tcg: Allow goto_tb to any target PC in user mode
  tcg: Clean up direct block chaining safety checks
  tcg: Clean up tb_jmp_unlink()
  tcg: Extract removing of jumps to TB from tb_phys_invalidate()
  tcg: Rename tb_jmp_remove() to tb_remove_from_jmp_list()
  tcg: Clarify thread safety check in tb_add_jump()
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-13 10:42:40 +01:00
xiaoqiang.zhao
7fe91a5b33 hw/display: QOM'ify jazz_led.c
* Drop the old SysBus init function and use instance_init
* Move graphic_console_init into realize stage

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Leon Alrae <leon.alrae@imgtec.com>
2016-05-13 09:33:38 +01:00
Sergey Fedorov
8b1fe3f439 cpu-exec: Clean up 'interrupt_request' reloading in cpu_handle_interrupt()
Suggested-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1463071937-26607-1-git-send-email-sergey.fedorov@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:07:16 -10:00
Sergey Fedorov
ba048a4ae1 cpu-exec: Remove unused 'x86_cpu' and 'env' from cpu_exec()
Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1462962111-32237-6-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Sergey Fedorov
928de9ee14 cpu-exec: Move TB execution stuff out of cpu_exec()
Simplify cpu_exec() by extracting TB execution code outside of
cpu_exec() into a new static inline function cpu_loop_exec_tb().

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1462962111-32237-5-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Sergey Fedorov
c385e6e497 cpu-exec: Move interrupt handling out of cpu_exec()
Simplify cpu_exec() by extracting interrupt handling code outside of
cpu_exec() into a new static inline function cpu_handle_interrupt().

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Richard Henderson  <rth@twiddle.net>
Message-Id: <1462962111-32237-4-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Sergey Fedorov
ea284766ec cpu-exec: Move exception handling out of cpu_exec()
Simplify cpu_exec() by extracting exception handling code out of
cpu_exec() into a new static inline function cpu_handle_exception().
Also make cpu_handle_debug_exception() inline as it is used only once.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1462962111-32237-3-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Sergey Fedorov
8b2d34e997 cpu-exec: Move halt handling out of cpu_exec()
Simplify cpu_exec() by extracting CPU halt state handling code out of
cpu_exec() into a new static inline function cpu_handle_halt().

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1462962111-32237-2-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Sergey Fedorov
c6f0d9f84c cpu-exec: Remove relic orphaned comment
This comment should have been deleted by commit 0ac087f1f3 ("removed
unused code") but somehow it is still here. There's no point to keep it.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1462286050-21778-1-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Sergey Fedorov
3213525f8a tcg: Remove needless CPUState::current_tb
This field was used for telling cpu_interrupt() to unlink a chain of TBs
being executed when it worked that way. Now, cpu_interrupt() doesn't do
this anymore, so we don't need this field.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1462273462-14036-1-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Sergey Fedorov
a0522c7a55 cpu-exec: Move TB chaining into tb_find_fast()
Move the tb_add_jump() call and surrounding code from cpu_exec() into
tb_find_fast(). That simplifies cpu_exec() a little by hiding the direct
chaining optimization details inside tb_find_fast(). It also allows
moving the tb_lock()/tb_unlock() pair into tb_find_fast(), putting it
closer to tb_find_slow(), which also manipulates the lock.

Suggested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
[rth: Fixed rebase typo in nochain test.]
2016-05-12 14:06:42 -10:00
Sergey Fedorov
6f789be56d tcg: Rework tb_invalidated_flag
'tb_invalidated_flag' was meant to catch two events:
 * some TB has been invalidated by tb_phys_invalidate();
 * the whole translation buffer has been flushed by tb_flush().

Then it was checked:
 * in cpu_exec() to ensure that the last executed TB can be safely
   linked to directly call the next one;
 * in cpu_exec_nocache() to decide if the original TB should be provided
   for further possible invalidation along with the temporarily
   generated TB.

It is always safe to patch an invalidated TB since it is not going to be
used anyway. It is also safe to call tb_phys_invalidate() for an already
invalidated TB. Thus, setting this flag in tb_phys_invalidate() is
simply unnecessary. Moreover, it can prevent perfectly proper linking
of TBs if an arbitrary TB has been invalidated. So just don't touch it
in tb_phys_invalidate().

If this flag is only used to catch whether tb_flush() has been called
then rename it to 'tb_flushed'. Declare it as 'bool' and stick to using
only 'true' and 'false' to set its value. Also, instead of setting it in
tb_gen_code(), just after tb_flush() has been called, do it right inside
of tb_flush().

In cpu_exec(), this flag is used to track whether tb_flush() has been
called and has made 'next_tb' (a reference to the last executed TB)
invalid for linking it to directly call the next TB. tb_flush() can be
called during the CPU execution loop from tb_gen_code(), during TB
execution, or by another thread while 'tb_lock' is released. Catch
translation buffer flushes reliably by resetting this flag once before
the first TB lookup and each time we find it set before trying to add a
direct jump. Don't touch it in tb_find_physical().

Each vCPU has its own execution loop in multithreaded mode and thus
should have its own copy of the flag, so it can reset it alongside its own
'next_tb' without affecting any other vCPU execution thread. So make this
flag per-vCPU and move it to CPUState.

In cpu_exec_nocache(), we only need to check if tb_flush() has been
called from tb_gen_code() called by cpu_exec_nocache() itself. To do
this reliably, preserve the old value of the flag, reset it before
calling tb_gen_code(), check afterwards, and combine the saved value
back to the flag.

This patch is based on the patch "tcg: move tb_invalidated_flag to
CPUState" from Paolo Bonzini <pbonzini@redhat.com>.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
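
A minimal sketch of the cpu_exec_nocache() pattern described above
(preserve the flag, reset it, check for a flush, fold the saved value
back in). The types and function names are stand-ins, not QEMU's actual
API; only the flag handling mirrors the commit:

    #include <stdbool.h>
    #include <stddef.h>

    typedef struct CPUStateSketch {
        bool tb_flushed;                 /* set by a tb_flush(), per vCPU */
    } CPUStateSketch;

    typedef struct TBSketch { int id; } TBSketch;

    static TBSketch *tb_gen_code_sketch(CPUStateSketch *cpu)
    {
        static TBSketch tb;
        /* May internally flush the translation buffer and set tb_flushed. */
        (void)cpu;
        return &tb;
    }

    static TBSketch *exec_nocache_sketch(CPUStateSketch *cpu, TBSketch *orig_tb)
    {
        bool old_tb_flushed = cpu->tb_flushed;

        cpu->tb_flushed = false;
        TBSketch *tb = tb_gen_code_sketch(cpu);

        if (cpu->tb_flushed) {
            orig_tb = NULL;          /* original TB may be gone; don't reuse it */
        }
        cpu->tb_flushed |= old_tb_flushed;

        (void)orig_tb;
        return tb;
    }
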
Sergey Fedorov
819af24b9c tcg: Clean up from 'next_tb'
The value returned from tcg_qemu_tb_exec() is the value passed to the
corresponding tcg_gen_exit_tb() at translation time of the last TB
attempted to execute. It is a little confusing to store it in a variable
named 'next_tb'. In fact, it is a combination of 4-byte aligned pointer
and additional information in its two least significant bits. Break it
down right away into two variables named 'last_tb' and 'tb_exit' which
are a pointer to the last TB attempted to execute and the TB exit
reason, respectively. This simplifies the code and improves its
readability.

Correct a misleading documentation comment for tcg_qemu_tb_exec() and
fix logging in cpu_tb_exec(). Also rename a misleading 'next_tb' in
another couple of places.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
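
A sketch of the split described above, assuming QEMU's convention that
the two least significant bits of the returned value carry the exit
reason; the struct type here is a placeholder:

    #include <stdint.h>

    #define TB_EXIT_MASK 3u   /* low two bits carry the exit reason */

    typedef struct TBSketch2 { int id; } TBSketch2;

    /* Split the packed tcg_qemu_tb_exec() return value into the last
     * executed TB and the exit reason ('last_tb' and 'tb_exit'). */
    static void decode_tb_exec_ret(uintptr_t ret,
                                   TBSketch2 **last_tb, int *tb_exit)
    {
        *last_tb = (TBSketch2 *)(ret & ~(uintptr_t)TB_EXIT_MASK);
        *tb_exit = (int)(ret & TB_EXIT_MASK);
    }
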
Paolo Bonzini
7687bf52e5 cpu-exec: elide more icount code if CONFIG_USER_ONLY
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[Alex Bennée: #ifndef replay code to match elided functions]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Alex Bennée
1279f323d6 tcg: reorganize tb_find_physical loop
Add some comments and improve the code structure. This should make the
code easier to read.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
[Sergey Fedorov: provide commit message; bring back resetting of
tb_invalidated_flag]
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Richard Henderson  <rth@twiddle.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Paolo Bonzini
6fad459c91 tcg: code_bitmap and code_write_count are not used by user-mode emulation
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[Sergey Fedorov: eliminate the field entirely in user-mode]
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Richard Henderson  <rth@twiddle.net>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
[rth: merged followup fixup]
Message-Id: <1462982777-4513-1-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
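
A sketch of the conditional-compilation approach: keep the bitmap fields
out of the TB entirely for user-mode builds. Field names follow the
commit title; the rest of the struct is a stub for illustration:

    #include <stdint.h>

    struct TranslationBlockSketch {
        uintptr_t pc;                 /* stand-in for the common fields */
    #ifndef CONFIG_USER_ONLY
        uint8_t  *code_bitmap;        /* only needed with softmmu page tracking */
        unsigned  code_write_count;
    #endif
    };
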
Sergey Fedorov
90aa39a1cc tcg: Allow goto_tb to any target PC in user mode
In user mode, there's only a static address translation, TBs are always
invalidated properly and direct jumps are reset when the mapping changes.
Thus the destination address is always valid for direct jumps and
there's no need to restrict it to the pages the TB resides in.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Cc: Riku Voipio <riku.voipio@iki.fi>
Cc: Blue Swirl <blauwirbel@gmail.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:42 -10:00
Sergey Fedorov
5b053a4a28 tcg: Clean up direct block chaining safety checks
We don't take care of direct jumps when the address mapping changes. Thus
we must be sure to generate direct jumps so that they always stay valid
even if the address mapping changes. Luckily, we only allow a TB to
execute if it was generated from pages which match the current mapping.

Document tcg_gen_goto_tb() declaration and note the reason for
destination PC limitations.

Some targets with variable-length instructions allow a TB to straddle a
page boundary. However, we make sure that both of a TB's pages match the
current address mapping when looking up TBs. So it is safe to do direct
jumps into both pages. Correct the checks for some of those targets.

Given that, we can safely patch a TB which spans two pages. Remove the
unnecessary check in cpu_exec() and allow such TBs to be patched.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
f9c5b66f48 tcg: Clean up tb_jmp_unlink()
Unify the code of this function with tb_jmp_remove_from_list(). Making
these functions similar improves their readability. Also this could be a
step towards making this function thread-safe.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
89bba49632 tcg: Extract removing of jumps to TB from tb_phys_invalidate()
Move the code for removing jumps to a TB out of tb_phys_invalidate() to
a separate static inline function tb_jmp_unlink(). This simplifies
tb_phys_invalidate() and improves code structure.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
133626783a tcg: Rename tb_jmp_remove() to tb_remove_from_jmp_list()
tb_jmp_remove() was only used to remove the TB from a list of all TBs
jumping to the same TB, which is the n-th jump destination of the given TB.
Add a comment briefly describing the function's behavior and rename it to
better reflect its purpose.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
9962c478b1 tcg: Clarify thread safety check in tb_add_jump()
The check is to make sure that another thread hasn't already done the
same while we were outside of tb_lock. Mention this in a comment.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
901bc3deb4 tcg: Init TB's direct jumps before making it visible
Initialize TB's direct jump list data fields and reset the jumps before
tb_link_page() puts it into the physical hash table and the physical
page list. So TB is completely initialized before it becomes visible.

This is pure rearrangement of code to a more suitable place, though it
could be a preparation for relaxing the locking scheme in the future.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
e90d96b158 tcg: Rearrange tb_link_page() to avoid forward declaration
Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
c37e6d7e35 tcg: Use uintptr_t type for jmp_list_{next|first} fields of TB
These fields do not contain pure pointers to a TranslationBlock
structure. So uintptr_t is the most appropriate type for them.
Also add some asserts to ensure that the two least significant bits of
the pointer are always zero before assigning it to jmp_list_first.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
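
A minimal sketch of the tagged-pointer convention the commit describes:
the low two bits of jmp_list_first carry a small tag, so the stored
value is not a pure pointer and uintptr_t makes that explicit. The type
and function names are illustrative only:

    #include <assert.h>
    #include <stdint.h>

    typedef struct TBJmpSketch {
        uintptr_t jmp_list_first;
    } TBJmpSketch;

    static void set_jmp_list_first(TBJmpSketch *tb, TBJmpSketch *dest,
                                   unsigned n)
    {
        assert(((uintptr_t)dest & 3) == 0);  /* low bits must be free for the tag */
        assert(n <= 3);                      /* tag must fit in the two spare bits */
        tb->jmp_list_first = (uintptr_t)dest | n;
    }
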
Sergey Fedorov
f309101c26 tcg: Clean up direct block chaining data fields
Briefly describe in a comment how direct block chaining is done. It
should help in understanding of the following data fields.

Rename some fields in TranslationBlock and TCGContext structures to
better reflect their purpose (dropping excessive 'tb_' prefix in
TranslationBlock but keeping it in TCGContext):
   tb_next_offset  =>  jmp_reset_offset
   tb_jmp_offset   =>  jmp_insn_offset
   tb_next         =>  jmp_target_addr
   jmp_next        =>  jmp_list_next
   jmp_first       =>  jmp_list_first

Avoid using a magic constant as an invalid offset which is used to
indicate that there's no n-th jump generated.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Richard Henderson
7ba6a512ae translate-all: Adjust 256mb testing for mips64
Make sure we preserve the high 32-bits when masking for mips64.

Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Emilio G. Cota
8bdf499782 translate-all: add missing munmap of the code_gen guard page for MIPS
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1461283314-2353-2-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Emilio G. Cota
835154b6e2 translate-all: remove redundant setting of tcg_ctx.code_gen_buffer_size
The setting of tcg_ctx.code_gen_buffer_size is done by the only caller of
size_code_gen_buffer(), which is code_gen_alloc():

  $ git grep size_code_gen_buffer
  translate-all.c:static inline size_t size_code_gen_buffer(size_t tb_size)
  translate-all.c:    tcg_ctx.code_gen_buffer_size = size_code_gen_buffer(tb_size);

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1461283314-2353-1-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
10b4f48555 tcg: Note requirement on atomic direct jump patching
Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <1461341333-19646-12-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
c82460a560 tcg/mips: Make direct jump patching thread-safe
Ensure direct jump patching in MIPS is atomic by using
atomic_read()/atomic_set() for code patching.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1461341333-19646-11-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
[rth: Merged the deposit32 followup.]
[rth: Merged the following followup.]
Message-Id: <1462210518-26522-1-git-send-email-sergey.fedorov@linaro.org>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
84f79fb7c6 tcg/sparc: Make direct jump patching thread-safe
Ensure direct jump patching in SPARC is atomic by using
atomic_read()/atomic_set() for code patching.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <1461341333-19646-10-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
9e26911295 tcg/aarch64: Make direct jump patching thread-safe
Ensure direct jump patching in AArch64 is atomic by using
atomic_read()/atomic_set() for code patching.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1461341333-19646-9-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
7d14e0e2d6 tcg/arm: Make direct jump patching thread-safe
Ensure direct jump patching in ARM is atomic by using
atomic_read()/atomic_set() for code patching.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1461341333-19646-8-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
ed3d51ecd7 tcg/s390: Make direct jump patching thread-safe
Ensure direct jump patching in s390 is atomic by:
 * naturally aligning a location of direct jump address;
 * using atomic_read()/atomic_set() for code patching.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1461341333-19646-7-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
Sergey Fedorov
0d07abf05e tcg/i386: Make direct jump patching thread-safe
Ensure direct jump patching in i386 is atomic by:
 * naturally aligning a location of direct jump address;
 * using atomic_read()/atomic_set() for code patching.

tcg_out_nopn() implementation:
Suggested-by: Richard Henderson <rth@twiddle.net>.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1461341333-19646-6-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:41 -10:00
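
A sketch of the idea behind the patching change: with the 4-byte jump
displacement naturally aligned, one relaxed atomic store suffices, so
concurrent executors see either the old or the new target, never a torn
value. This is illustrative, not the actual tcg/i386 code; it uses the
GCC/Clang __atomic_store_n builtin that QEMU's atomic_set() wraps:

    #include <stdint.h>

    static void patch_direct_jump(uintptr_t jmp_addr, uintptr_t target_addr)
    {
        /* x86 near-jump displacement is relative to the end of the operand. */
        uint32_t disp = (uint32_t)(target_addr - (jmp_addr + 4));

        __atomic_store_n((uint32_t *)jmp_addr, disp, __ATOMIC_RELAXED);
    }
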
Sergey Fedorov
399f164857 tcg/ppc: Make direct jump patching thread-safe
Ensure direct jump patching in PPC is atomic by:
 * limiting translation buffer size in 32-bit mode to be addressable by
   Branch I-form instruction;
 * using atomic_read()/atomic_set() for code patching.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <1461341333-19646-5-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:40 -10:00
Sergey Fedorov
76442a939e tci: Make direct jump patching thread-safe
Ensure direct jump patching in TCI is atomic by:
 * naturally aligning a location of direct jump address;
 * using atomic_read()/atomic_set() to load/store the address.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1461341333-19646-4-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:40 -10:00
Sergey Fedorov
6b587d3cda include/qemu/osdep.h: Add macros for pointer alignment
These macros provide a convenient way to n-byte align pointers up and
down and check if a pointer is n-byte aligned.

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1461341333-19646-3-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:40 -10:00
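
A sketch of what such pointer-alignment helpers look like for a
power-of-two n; the macro names here are placeholders and may differ
from what was actually added to osdep.h:

    #include <stdint.h>

    #define PTR_ALIGN_DOWN(p, n) \
        ((typeof(p))((uintptr_t)(p) & ~((uintptr_t)(n) - 1)))
    #define PTR_ALIGN_UP(p, n) \
        ((typeof(p))(((uintptr_t)(p) + (n) - 1) & ~((uintptr_t)(n) - 1)))
    #define PTR_IS_ALIGNED(p, n) \
        (((uintptr_t)(p) & ((uintptr_t)(n) - 1)) == 0)
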
Sergey Fedorov
18a60a7614 include/qemu/osdep.h: Add a macro to check for alignment
Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Message-Id: <1461341333-19646-2-git-send-email-sergey.fedorov@linaro.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-05-12 14:06:40 -10:00
Emilio G. Cota
89fee74a0f tb: consistently use uint32_t for tb->flags
We are inconsistent with the type of tb->flags: usage varies loosely
between int and uint64_t. Settle on uint32_t everywhere, which is
superior to both: at least one target (aarch64) uses the most significant
bit in the u32, and uint64_t is wasteful.

Compile-tested for all targets.

Suggested-by: Laurent Desnogues <laurent.desnogues@gmail.com>
Suggested-by: Richard Henderson <rth@twiddle.net>
Tested-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Reviewed-by: Laurent Desnogues <laurent.desnogues@gmail.com>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1460049562-23517-1-git-send-email-cota@braap.org>
2016-05-12 14:06:40 -10:00
Peter Maydell
f68419eee9 Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches

# gpg: Signature made Thu 12 May 2016 14:37:05 BST using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"

* remotes/kevin/tags/for-upstream: (69 commits)
  qemu-iotests: iotests: fail hard if not run via "check"
  block: enable testing of LUKS driver with block I/O tests
  block: add support for encryption secrets in block I/O tests
  block: add support for --image-opts in block I/O tests
  qemu-io: Add 'write -z -u' to test MAY_UNMAP flag
  qemu-io: Add 'write -f' to test FUA flag
  qemu-io: Allow unaligned access by default
  qemu-io: Use bool for command line flags
  qemu-io: Make 'open' subcommand more like command line
  qemu-io: Add missing option documentation
  qmp: add monitor command to add/remove a child
  quorum: implement bdrv_add_child() and bdrv_del_child()
  Add new block driver interface to add/delete a BDS's child
  qemu-img: check block status of backing file when converting.
  iotests: fix the redirection order in 083
  block: Inactivate all children
  block: Drop superfluous invalidating bs->file from drivers
  block: Invalidate all children
  nbd: Simplify client FUA handling
  block: Honor BDRV_REQ_FUA during write_zeroes
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 16:33:40 +01:00
Peter Maydell
e4f70d6358 Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20160512' into staging
target-arm queue:
 * blizzard, omap_lcdc: code cleanup to remove DEPTH != 32 dead code
 * QOMify various ARM devices
 * bcm2835_property: use cached values when querying framebuffer
 * hw/arm/nseries: don't allocate large sized array on the stack
 * fix LPAE descriptor address masking (only visible for EL2)
 * fix stage 2 exec permission handling for AArch32
 * first part of supporting syndrome info for data aborts to EL2
 * virt: NUMA support
 * work towards i.MX6 support
 * avoid unnecessary TLB flush on TCR_EL2, TCR_EL3 writes

# gpg: Signature made Thu 12 May 2016 14:29:14 BST using RSA key ID 14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg:                 aka "Peter Maydell <pmaydell@gmail.com>"
# gpg:                 aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"

* remotes/pmaydell/tags/pull-target-arm-20160512: (43 commits)
  hw/arm: QOM'ify versatilepb.c
  hw/arm: QOM'ify strongarm.c
  hw/arm: QOM'ify stellaris.c
  hw/arm: QOM'ify spitz.c
  hw/arm: QOM'ify pxa2xx_pic.c
  hw/arm: QOM'ify pxa2xx.c
  hw/arm: QOM'ify integratorcp.c
  hw/arm: QOM'ify highbank.c
  hw/arm: QOM'ify armv7m.c
  target-arm: Avoid unnecessary TLB flush on TCR_EL2, TCR_EL3 writes
  hw/display/blizzard: Remove blizzard_template.h
  hw/display/blizzard: Expand out macros
  i.MX: Add sabrelite i.MX6 emulation.
  i.MX: Add i.MX6 SOC implementation.
  i.MX: Add the Freescale SPI Controller
  FIFO: Add a FIFO32 implementation
  i.MX: Add i.MX6 System Reset Controller device.
  ARM: Factor out ARM on/off PSCI control functions
  ACPI: Virt: Generate SRAT table
  ACPI: move acpi_build_srat_memory to common place
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 15:55:45 +01:00
Gerd Hoffmann
a69fc693e9 gtk: update grab code for gtk 3.20
Fixes the remaining gtk 3.20 warnings.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Cole Robinson <crobinso@redhat.com>
Tested-by: Cole Robinson <crobinso@redhat.com>
Message-id: 1463038146-13939-1-git-send-email-kraxel@redhat.com
2016-05-12 16:41:46 +02:00
Gonglei
28f4a7083d spice: fix coverity complaints
Remove the unnecessary NULL check.

Signed-off-by: Gonglei <arei.gonglei@huawei.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1463047028-123868-3-git-send-email-arei.gonglei@huawei.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-12 16:41:46 +02:00
Gonglei
f454f49c42 egl-helpers: fix possible resource leak
CID 1352419, using g_strdup_printf instead of asprintf.

Signed-off-by: Gonglei <arei.gonglei@huawei.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1463047028-123868-2-git-send-email-arei.gonglei@huawei.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-12 16:41:46 +02:00
Md Haris Iqbal
42ddb8aa7c Changed malloc to g_malloc, free to g_free in ui/shader.c
Signed-off-by: Md Haris Iqbal <haris.phnx@gmail.com>
Message-id: 1459862499-4768-1-git-send-email-haris.phnx@gmail.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-12 16:41:46 +02:00
Gerd Hoffmann
39414ef4e9 spice/gl: add & use qemu_spice_gl_monitor_config
Cc: qemu-stable@nongnu.org
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2016-05-12 16:41:46 +02:00
Michael S. Tsirkin
44b31e0bc4 ui/gtk: copy to clipboard support
This adds a menu item to copy the current selection to the clipboard.
Seems handy for copying out guest error messages.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-id: 1460924740-24513-1-git-send-email-mst@redhat.com

[ kraxel: fix build with CONFIG_VTE=n ]
[ kraxel: fix build with CONFIG_VTE=n, now for real ]

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-12 16:41:18 +02:00
Peter Maydell
6ddeeffffe Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2016-05-12' into staging
QAPI patches for 2016-05-12

# gpg: Signature made Thu 12 May 2016 08:49:04 BST using RSA key ID EB918653
# gpg: Good signature from "Markus Armbruster <armbru@redhat.com>"
# gpg:                 aka "Markus Armbruster <armbru@pond.sub.org>"

* remotes/armbru/tags/pull-qapi-2016-05-12: (23 commits)
  qapi: Change visit_type_FOO() to no longer return partial objects
  qapi: Simplify semantics of visit_next_list()
  qapi: Fix string input visitor handling of invalid list
  tests/string-input-visitor: Add negative integer tests
  qapi: Split visit_end_struct() into pieces
  qmp: Tighten output visitor rules
  qmp: Don't reuse qmp visitor after grabbing output
  spapr_drc: Expose 'null' in qom-get when there is no fdt
  qmp: Support explicit null during visits
  qapi: Add visit_type_null() visitor
  tests: Add check-qnull
  qapi: Document visitor interfaces, add assertions
  qmp-input: Refactor when list is advanced
  qmp-input: Require struct push to visit members of top dict
  qom: Wrap prop visit in visit_start_struct
  qapi-commands: Wrap argument visit in visit_start_struct
  qmp-input: Don't consume input when checking has_member
  qapi: Use strict QMP input visitor in more places
  qapi: Consolidate QMP input visitor creation
  qmp-input: Clean up stack handling
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 15:06:38 +01:00
Kevin Wolf
efc2645f71 Merge remote-tracking branch 'mreitz/tags/pull-block-for-kevin-2016-05-12' into queue-block
Block patches for 2.7

# gpg: Signature made Thu May 12 15:34:13 2016 CEST using RSA key ID E838ACAD
# gpg: Good signature from "Max Reitz <mreitz@redhat.com>"

* mreitz/tags/pull-block-for-kevin-2016-05-12:
  qemu-iotests: iotests: fail hard if not run via "check"
  block: enable testing of LUKS driver with block I/O tests
  block: add support for encryption secrets in block I/O tests
  block: add support for --image-opts in block I/O tests
  qemu-io: Add 'write -z -u' to test MAY_UNMAP flag
  qemu-io: Add 'write -f' to test FUA flag
  qemu-io: Allow unaligned access by default
  qemu-io: Use bool for command line flags
  qemu-io: Make 'open' subcommand more like command line
  qemu-io: Add missing option documentation
  qmp: add monitor command to add/remove a child
  quorum: implement bdrv_add_child() and bdrv_del_child()
  Add new block driver interface to add/delete a BDS's child
  qemu-img: check block status of backing file when converting.
  iotests: fix the redirection order in 083

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:35:20 +02:00
Peter Maydell
f83b70f701 Merge remote-tracking branch 'remotes/kraxel/tags/pull-usb-20160511-1' into staging
usb: misc fixes

# gpg: Signature made Wed 11 May 2016 12:18:25 BST using RSA key ID D3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"

* remotes/kraxel/tags/pull-usb-20160511-1:
  usb: Support compilation without poll.h
  usb-mtp: fix usb_mtp_get_device_info so that libmtp on the guest doesn't complain
  usb:xhci: no DMA on HC reset

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 14:34:35 +01:00
Sascha Silbe
5a8fabf333 qemu-iotests: iotests: fail hard if not run via "check"
Running an iotests-based Python test directly might appear to work,
but may fail in subtle ways and is insecure:

- It creates files with predictable file names in a world-writable
  location (/var/tmp).

- Tests expect the environment to be set up by check. E.g. 041 and 055
  may take the wrong code paths if QEMU_DEFAULT_MACHINE is not
  set. This can lead to false negatives.

Instead fail hard and tell the user we want to be run via "check".

The actual environment expected by the tests is currently only defined
by the implementation of "check". We use two of the environment
variables set by "check" as indication of whether we're being run via
"check". Anyone writing their own test runner (replacing "check") will
need to replicate the full environment (in a broader sense, not just
environment variables) provided by "check" anyway, including setting
the two environment variables we check. Whereas a regular developer
just trying to invoke the tests usually won't have both of these
defined in their environment, so we can catch their mistake and give
out useful advice.

Signed-off-by: Sascha Silbe <silbe@linux.vnet.ibm.com>
Reviewed-by: Bo Tu <tubo@linux.vnet.ibm.com>
Message-id: 1461094442-16014-1-git-send-email-silbe@linux.vnet.ibm.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:24 +02:00
Daniel P. Berrange
4e9b25fb05 block: enable testing of LUKS driver with block I/O tests
This adds support for testing the LUKS driver with the block
I/O test framework.

   cd tests/qemu-io-tests
   ./check -luks

A handful of test cases are modified to work with luks

 - 004 - whitelist luks format
 - 012 - use TEST_IMG_FILE instead of TEST_IMG for file ops
 - 048 - use TEST_IMG_FILE instead of TEST_IMG for file ops.
         don't assume extended image contents is all zeros,
         explicitly initialize with zeros
         Make file size smaller to avoid having to decrypt
         1 GB of data.
 - 052 - don't assume initial image contents is all zeros,
         explicitly initialize with zeros
 - 100 - don't assume initial image contents is all zeros,
         explicitly initialize with zeros

With this patch applied, the results are as follows:

  Passed: 001 002 003 004 005 008 009 010 011 012 021 032 043
          047 048 049 052 087 100 134 143
  Failed: 033 120 140 145
 Skipped: 007 013 014 015 017 018 019 020 022 023 024 025 026
          027 028 029 030 031 034 035 036 037 038 039 040 041
          042 043 044 045 046 047 049 050 051 053 054 055 056
          057 058 059 060 061 062 063 064 065 066 067 068 069
          070 071 072 073 074 075 076 077 078 079 080 081 082
          083 084 085 086 087 088 089 090 091 092 093 094 095
          096 097 098 099 101 102 103 104 105 107 108 109 110
          111 112 113 114 115 116 117 118 119 121 122 123 124
          128 129 130 131 132 133 134 135 136 137 138 139 141
          142 144 146 148 150 152

The reasons for the failed tests are:

 - 033 - needs adapting to use image opts syntax with blkdebug
         and test image in order to correctly set align property
 - 120 - needs adapting to use correct -drive syntax for luks
 - 140 - needs adapting to use correct -drive syntax for luks
 - 145 - needs adapting to use correct -drive syntax for luks

The vast majority of skipped tests are exercising code that is
qcow2 specific, though a couple could probably be usefully
enabled for luks too.

Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 1462896689-18450-4-git-send-email-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:24 +02:00
Daniel P. Berrange
b7e875b2f9 block: add support for encryption secrets in block I/O tests
The LUKS block driver tests will require the ability to specify
encryption secrets with block devices. This requires using the
--object argument to qemu-img/qemu-io to create a 'secret'
object.

When the IMGKEYSECRET env variable is set, it provides the
password to be associated with a secret called 'keysec0'

The _qemu_img_wrapper function isn't modified as that needs
to cope with differing syntax for subcommands, so it can't be
made to use the image opts syntax unconditionally.

Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 1462896689-18450-3-git-send-email-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:24 +02:00
Daniel P. Berrange
076003f526 block: add support for --image-opts in block I/O tests
Currently all block tests use the traditional syntax for images
just specifying a filename. To support the LUKS driver without
resorting to JSON, the tests need to be able to use the new
--image-opts argument to qemu-img and qemu-io.

This introduces a new env variable IMGOPTSSYNTAX. If this is
set to 'true', then qemu-img/qemu-io should use --image-opts.

Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 1462896689-18450-2-git-send-email-berrange@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:24 +02:00
Eric Blake
c2e001cc82 qemu-io: Add 'write -z -u' to test MAY_UNMAP flag
Make it easier to control whether the BDRV_REQ_MAY_UNMAP flag
can be passed through a write_zeroes command, by adding the '-u'
flag to qemu-io 'write -z' and 'aio_write -z'.  To be useful,
the device has to be opened with BDRV_O_UNMAP (done by default
in qemu-io, but can be made explicit with '-d unmap').

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 1462677405-4752-7-git-send-email-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:24 +02:00
Eric Blake
770e0e0e80 qemu-io: Add 'write -f' to test FUA flag
Make it easier to test block drivers with BDRV_REQ_FUA in
.supported_write_flags, by adding the '-f' flag to qemu-io to
conditionally pass the flag through to specific writes ('write',
'write -z', 'writev', 'aio_write', 'aio_write -z'). You'll want
to use 'qemu-io -t none' to actually make -f useful (as
otherwise, the default writethrough mode automatically sets the
FUA bit on every write).

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 1462677405-4752-6-git-send-email-eblake@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:24 +02:00
Eric Blake
093ea232b0 qemu-io: Allow unaligned access by default
There's no reason to require the user to specify a flag just so
they can pass in unaligned numbers.  Keep 'read -p' and 'write -p'
as no-ops so that I don't have to hunt down and update all users
of qemu-io, but otherwise make their behavior default as 'read' and
'write'.  Also fix 'write -z', 'readv', 'writev', 'writev',
'aio_read', 'aio_write', and 'aio_write -z'.  For now, 'read -b',
'write -b', and 'write -c' still require alignment (and 'multiwrite',
but that's slated to die soon).

qemu-iotest 23 is updated to match, as the only test that was
previously explicitly expecting an error on an unaligned request.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 1462677405-4752-5-git-send-email-eblake@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:24 +02:00
Eric Blake
dc38852aaa qemu-io: Use bool for command line flags
We require a C99 compiler; let's use it to express what we
really mean.

(Yes, we now have an instance of 'if (bool + bool + bool > 1)',
which, although semantically valid C, looks ugly; it gets
cleaned up later.)

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 1462677405-4752-4-git-send-email-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:24 +02:00
Eric Blake
b8d970f1a9 qemu-io: Make 'open' subcommand more like command line
The command line defaults to BDRV_O_UNMAP, but can use
-d to reset it.  Meanwhile, the 'open' subcommand was
defaulting to no discards, with no way to set it.

The command line has both -n and -tMODE to set a variety
of cache modes, but the 'open' subcommand had only -n.

The 'open' subcommand had no way to set BDRV_O_NATIVE_AIO.

Note that the 'reopen' subcommand uses '-c' where the
command line and 'open' use -t.  Making that consistent
would be a separate patch.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 1462677405-4752-3-git-send-email-eblake@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:24 +02:00
Eric Blake
e4e12bb26d qemu-io: Add missing option documentation
The Usage: summary is missing several options, but rather than
having to maintain it, it's simpler to just state [OPTIONS],
since the options are spelled out below.

Commit 499afa2 added --image-opts, but forgot to document it in
--help.  Likewise for commit 9e8f183 and -d/--discard.

Commit e3aff4f6 put "-o/--offset" in the long opts, but it has
never been honored.

Add a note that '-n' is short for '-t none'.

Commit 9a2d77ad killed the -C option, but forgot to undocument
it for the 'open' subcommand.

Finally, commit 10d9d75 removed -g/--growable, but forgot to
cull it from the valid short options.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 1462677405-4752-2-git-send-email-eblake@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:23 +02:00
Wen Congyang
7f82159769 qmp: add monitor command to add/remove a child
The new QMP command name is x-blockdev-change.  For now it is just for
adding/removing a quorum child, and it does not support all kinds of
children, all kinds of operations, or all block drivers, so it is
experimental.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Gonglei <arei.gonglei@huawei.com>
Signed-off-by: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 1462865799-19402-4-git-send-email-xiecl.fnst@cn.fujitsu.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:23 +02:00
Wen Congyang
98292c61bc quorum: implement bdrv_add_child() and bdrv_del_child()
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Gonglei <arei.gonglei@huawei.com>
Signed-off-by: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
Message-id: 1462865799-19402-3-git-send-email-xiecl.fnst@cn.fujitsu.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:23 +02:00
Wen Congyang
e06018ad28 Add new block driver interface to add/delete a BDS's child
In some cases, we want to take a quorum child offline, and take
another child online.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Gonglei <arei.gonglei@huawei.com>
Signed-off-by: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 1462865799-19402-2-git-send-email-xiecl.fnst@cn.fujitsu.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:23 +02:00
Ren Kimura
263a6f4c3a qemu-img: check block status of backing file when converting.
When converting images, check the block status of its backing file chain
to avoid needlessly reading zeros.

Signed-off-by: Ren Kimura <rkx1209dev@gmail.com>
Message-id: 1461773098-20356-1-git-send-email-rkx1209dev@gmail.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:23 +02:00
Wei Jiangang
9036e87c74 iotests: fix the redirection order in 083
It should redirect stdout to /dev/null first,
then redirect stderr to whatever stdout currently points at.
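
In shell terms (illustrative command only):

    $CMD > /dev/null 2>&1    # correct: both streams end up in /dev/null
    $CMD 2>&1 > /dev/null    # wrong: stderr still goes to the old stdout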

Signed-off-by: Wei Jiangang <weijg.fnst@cn.fujitsu.com>
Message-id: 1461665601-14908-1-git-send-email-weijg.fnst@cn.fujitsu.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-05-12 15:33:23 +02:00
Fam Zheng
aad0b7a0bf block: Inactivate all children
Currently we only inactivate the top BDS. Actually bdrv_inactivate
should be the opposite of bdrv_invalidate_cache.

Recurse into the whole subtree instead.

Because a node may have multiple parents, and because once
BDRV_O_INACTIVE is set for a node, further writes are not allowed, we
cannot interleave flag settings and .bdrv_inactivate calls (that may
submit writes to other nodes in the graph) within a single pass. Therefore
two passes are used here.
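
A rough sketch of the two-pass recursion (simplified, not the exact code
added by this commit):

    static int bdrv_inactivate_recurse(BlockDriverState *bs, bool setting_flag)
    {
        BdrvChild *child;
        int ret;

        /* pass 1: let the driver write out its state while writes are
         * still allowed */
        if (!setting_flag && bs->drv->bdrv_inactivate) {
            ret = bs->drv->bdrv_inactivate(bs);
            if (ret < 0) {
                return ret;
            }
        }

        /* pass 2: only now forbid further writes */
        if (setting_flag) {
            bs->open_flags |= BDRV_O_INACTIVE;
        }

        QLIST_FOREACH(child, &bs->children, next) {
            ret = bdrv_inactivate_recurse(child->bs, setting_flag);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }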

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Fam Zheng
c9e9e9c66c block: Drop superfluous invalidating bs->file from drivers
Now they are invalidated by the block layer, so it's not necessary to
do this in block drivers' implementations of .bdrv_invalidate_cache.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Fam Zheng
0d1c5c9160 block: Invalidate all children
Currently we only recurse to bs->file, which will miss the children in quorum
and VMDK.

Recurse into the whole subtree to avoid that.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
52a4650574 nbd: Simplify client FUA handling
Now that the block layer honors per-bds FUA support, we don't
have to duplicate the fallback flush at the NBD layer.  The
static function nbd_co_writev_flags() is no longer needed, and
the driver can just directly use nbd_client_co_writev().

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
465fe887cc block: Honor BDRV_REQ_FUA during write_zeroes
The block layer has a couple of cases where it can lose
Force Unit Access semantics when writing a large block of
zeroes, such that the request returns before the zeroes
have been guaranteed to land on underlying media.

SCSI does not support FUA during WRITESAME(10/16); FUA is only
supported if it falls back to WRITE(10/16).  But where the
underlying device is new enough to not need a fallback, it
means that any upper layer request with FUA semantics was
silently ignoring BDRV_REQ_FUA.

Conversely, NBD has situations where it can support FUA but not
ZERO_WRITE; when that happens, the generic block layer fallback
to bdrv_driver_pwritev() (or the older bdrv_co_writev() in qemu
2.6) was losing the FUA flag.

The problem of losing flags unrelated to ZERO_WRITE has been
latent in bdrv_co_do_write_zeroes() since commit aa7bfbff, but
back then, it did not matter because there was no FUA flag.  It
became observable when commit 93f5e6d8 paved the way for flags
that can impact correctness, when we should have been using
bdrv_co_writev_flags() with modified flags.  Compare to commit
9eeb6dd, which got flag manipulation right in
bdrv_co_do_zero_pwritev().

Symptoms: I tested with qemu-io with default writethrough cache
(which is supposed to use FUA semantics on every write), and
targeted an NBD client connected to a server that intentionally
did not advertise NBD_FLAG_SEND_FUA.  When doing 'write 0 512',
the NBD client sent two operations (NBD_CMD_WRITE then
NBD_CMD_FLUSH) to get the fallback FUA semantics; but when doing
'write -z 0 512', the NBD client sent only NBD_CMD_WRITE.

The fix is to do a cleanup bdrv_co_flush() at the end of the
operation if any step in the middle relied on a BDS that does
not natively support FUA for that step (note that we don't
need to flush after every operation, if the operation is broken
into chunks based on bounce-buffer sizing).  Each BDS gains a
new flag .supported_zero_flags, which parallels the use of
.supported_write_flags but only when accessing a zero write
operation (the flags MUST be different, because of SCSI having
different semantics based on WRITE vs. WRITESAME; and also
because BDRV_REQ_MAY_UNMAP only makes sense on zero writes).

Also fix some documentation to describe -ENOTSUP semantics,
particularly since iscsi depends on those semantics.

Down the road, we may want to add a driver where its
.bdrv_co_pwritev() honors all three of BDRV_REQ_FUA,
BDRV_REQ_ZERO_WRITE, and BDRV_REQ_MAY_UNMAP, and advertise
this via bs->supported_write_flags for blocks opened by that
driver; such a driver should NOT supply .bdrv_co_write_zeroes
nor .supported_zero_flags.  But none of the drivers touched
in this patch want to do that (the act of writing zeroes is
different enough from normal writes to deserve a second
callback).

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
4df863f336 block: Make supported_write_flags a per-bds property
Pre-patch, .supported_write_flags lives at the driver level, which
means we are blindly declaring that all block devices using a
given driver will either equally support FUA, or that we need a
fallback at the block layer.  But there are drivers where FUA
support is a per-block decision: the NBD block driver is dependent
on the remote server advertising NBD_FLAG_SEND_FUA (and has
fallback code to duplicate the flush that the block layer would do
if NBD had not set .supported_write_flags); and the iscsi block
driver is dependent on the mode sense bits advertised by the
underlying device (and is currently silently ignoring FUA requests
if the underlying device does not support FUA).

The fix is to make the supported flags a per-BDS option, set during
.bdrv_open().  This patch moves the variable and fixes NBD and iscsi
to set it only conditionally; later patches will then further
simplify the NBD driver to quit duplicating work done at the block
layer, as well as tackle the fact that SCSI does not support FUA
semantics on WRITESAME(10/16) but only on WRITE(10/16).
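
As a sketch of the idea (simplified, not the actual NBD code; the
negotiation check is a placeholder):

    /* in a driver's .bdrv_open(): advertise FUA only if the backend can do it */
    if (nbd_server_sent_fua_flag) {       /* e.g. NBD_FLAG_SEND_FUA negotiated */
        bs->supported_write_flags = BDRV_REQ_FUA;
    } else {
        bs->supported_write_flags = 0;    /* block layer emulates FUA by flushing */
    }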

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Denis V. Lunev
2928abce6d qcow2: improve qcow2_co_write_zeroes()
There is a possibility that qcow2_co_write_zeroes() will be called
with a partial block. This can be triggered synthetically with
    qemu-io -c "write -z 32k 4k"
and can happen in real life with qemu-nbd. The latter happens under
the following conditions:
    (1) qemu-nbd is started with --detect-zeroes=on and is connected to the
        kernel NBD client
    (2) third party program opens kernel NBD device with O_DIRECT
    (3) third party program performs a write operation with a memory
        buffer that is not page-aligned
In this case qcow2_co_write_zeroes() is unable to perform the operation
and mark the entire cluster as zeroed, so it returns ENOTSUP. The caller
then switches to the non-optimized version and writes real zeroes to the disk.

The patch creates a shortcut. If the block reads as zeroes, e.g. if
it is unallocated, the request is extended to cover the full block.
The user-visible content of the block is not changed. Before the patch
the block was filled in the image with real zeroes; after the patch the
block is marked as zeroed in the metadata, so any subsequent changes in
the backing store chain do not affect it.

Kevin, thank you for a cool suggestion.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
7b1deac84e block: Kill unused sector-based blk_* functions
Now that there are no remaining clients, we can drop the
sector-based blk_read(), blk_write(), blk_aio_readv(), and
blk_aio_writev().  Sadly, there are still remaining
sector-based interfaces, such as blk_*discard(), or
blk_write_compressed(); those will have to wait for another
day.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
7b3f9712e1 qemu-io: Switch to byte-based block access
qemu-io is the last user of several sector-based interfaces.
This patch upgrades to the new interfaces under the hood,
then deletes the resulting dead code.  Note that for maximum
back-compat, while the -p option is no longer required to get
blk_pread(), it is still needed to allow for unaligned access;
this is because qemu-iotest 23 relies on qemu-io rejecting
unaligned accesses without -p.  A later patch may clean up the
interface to be more user-friendly, but it's better to separate
what's done under the hood from what the user sees.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
9166920a0b qemu-img: Switch to byte-based block access
Sector-based blk_write() should die; switch to byte-based
blk_pwrite() instead.  Likewise for blk_read().

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
bd31c214c3 nbd: Switch to byte-based block access
Sector-based blk_read() should die; switch to byte-based
blk_pread() instead.

Add a constant for our magic number 512, to make it obvious
that this size will NOT change even if BDRV_SECTOR_SIZE does,
even though the two happen to be the same for now.  Split
assignments from conditionals to keep checkpatch.pl happy.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
26a122d3d4 atapi: Switch to byte-based block access
Sector-based blk_read() should die; switch to byte-based
blk_pread() instead.

Add new defines ATAPI_SECTOR_BITS and ATAPI_SECTOR_SIZE to
use anywhere we were previously scaling BDRV_SECTOR_* by 4,
for better legibility.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
243e6f69c1 m25p80: Switch to byte-based block access
Sector-based blk_read() should die; switch to byte-based
blk_pread() instead.

Likewise for blk_aio_readv() and blk_aio_writev().

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
12c125cba9 sd: Switch to byte-based block access
Sector-based blk_write() should die; switch to byte-based
blk_pwrite() instead.  Likewise for blk_read().

Greatly simplifies the code, now that we let the block layer
take care of alignment and read-modify-write on our behalf :)
In fact, we no longer need to include 'buf' in the migration
stream (although we do have to ensure that the stream remains
compatible).

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
098e732dbe pflash: Switch to byte-based block access
Sector-based blk_write() should die; switch to byte-based
blk_pwrite() instead.  Likewise for blk_read().

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
441692ddd8 onenand: Switch to byte-based block access
Sector-based blk_write() should die; switch to byte-based
blk_pwrite() instead.  Likewise for blk_read().

This particular device picks its size during onenand_initfn(),
and can be at most 0x80000000 bytes; therefore, shifting an
'int sec' request to get back to a byte offset should never
overflow 32 bits.  But adding assertions to document that point
should not hurt.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
9fc0d361cc nand: Switch to byte-based block access
Sector-based blk_write() should die; switch to byte-based
blk_pwrite() instead.  Likewise for blk_read().

This file is doing some complex computations to map various
flash page sizes (256, 512, and 2048) atop generic uses of
512-byte sector operations.  Perhaps someone will want to tidy
up the file for fewer gymnastics in managing addresses and
offsets, and less wasteful visits of 256-byte pages, but it
was out of scope for this series, where I just went with the
mechanical conversion.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
a7a5b7c0fc fdc: Switch to byte-based block access
Sector-based blk_write() should die; switch to byte-based
blk_pwrite() instead.  Likewise for blk_read().

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
d00000f901 xen_disk: Switch to byte-based aio block access
Sector-based blk_aio_readv() and blk_aio_writev() should die; switch
to byte-based blk_aio_preadv() and blk_aio_pwritev() instead.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
b5772fdde4 virtio: Switch to byte-based aio block access
Sector-based blk_aio_readv() and blk_aio_writev() should die; switch
to byte-based blk_aio_preadv() and blk_aio_pwritev() instead.

The trace is modified at the same time, and nb_sectors is now
unused.  Fix a comment typo while in the vicinity.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
03c90063cc scsi-disk: Switch to byte-based aio block access
Sector-based blk_aio_readv() and blk_aio_writev() should die; switch
to byte-based blk_aio_preadv() and blk_aio_pwritev() instead.

As part of the cleanup, scsi_init_iovec() no longer needs to return
a value, and a comment is reworded.

[ kwolf: Fix read accounting change ]

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:09 +02:00
Eric Blake
d4f510eb3f ide: Switch to byte-based aio block access
Sector-based blk_aio_readv() and blk_aio_writev() should die; switch
to byte-based blk_aio_preadv() and blk_aio_pwritev() instead.

The patch had to touch multiple files at once, because dma_blk_io()
takes pointers to the functions, and ide_issue_trim() piggybacks on
the same interface (while ignoring offset under the hood).

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:08 +02:00
Eric Blake
60cb2fa7eb block: Introduce byte-based aio read/write
blk_aio_readv() and blk_aio_writev() are annoying in that they
can't access sub-sector granularity, and cannot pass flags.
Also, they require the caller to pass redundant information
about the size of the I/O (qiov->size in bytes must match
nb_sectors in sectors).

Add new blk_aio_preadv() and blk_aio_pwritev() functions to fix
the flaws. The next few patches will upgrade callers, then
finally delete the old interfaces.
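
The new prototypes look roughly like this (signatures assumed from this
series, shown here only for illustration):

    BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
                               QEMUIOVector *qiov, BdrvRequestFlags flags,
                               BlockCompletionFunc *cb, void *opaque);
    BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
                                QEMUIOVector *qiov, BdrvRequestFlags flags,
                                BlockCompletionFunc *cb, void *opaque);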

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:08 +02:00
Eric Blake
983a160050 block: Switch blk_*write_zeroes() to byte interface
Sector-based blk_write() should die; convert the one-off
variant blk_write_zeroes() to use an offset/count interface
instead.  Likewise for blk_co_write_zeroes() and
blk_aio_write_zeroes().

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:08 +02:00
Eric Blake
b7d17f9fa4 block: Switch blk_read_unthrottled() to byte interface
Sector-based blk_read() should die; convert the one-off
variant blk_read_unthrottled().

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:08 +02:00
Eric Blake
8341f00dc2 block: Allow BDRV_REQ_FUA through blk_pwrite()
We have several block drivers that understand BDRV_REQ_FUA,
and emulate it in the block layer for the rest by a full flush.
But without a way to actually request BDRV_REQ_FUA during a
pass-through blk_pwrite(), FUA-aware block drivers like NBD are
forced to repeat the emulation logic of a full flush regardless
of whether the backend they are writing to could do it more
efficiently.

This patch just wires up a flags argument; followup patches
will actually make use of it in the NBD driver and in qemu-io.
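
A caller can then request FUA on a single write, roughly like this
(sketch; blk and buf are set up elsewhere):

    /* write 4 KiB at byte offset 0 and make it stable before returning */
    ret = blk_pwrite(blk, 0, buf, 4096, BDRV_REQ_FUA);
    if (ret < 0) {
        /* handle the error */
    }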

Signed-off-by: Eric Blake <eblake@redhat.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
0e01b76e7c qemu-io: Fix memory leak in 'aio_write -z'
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2016-05-12 15:22:08 +02:00
Janne Karhunen
f249924e96 Allow users to specify the vmdk virtual hardware version.
VMDK images have metadata to indicate the VMware virtual hardware
version the image was created/tested to run with.  Allow users to
specify that version via a new 'hwversion' option.
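
For example (image name, size and version value are illustrative):

    qemu-img create -f vmdk -o hwversion=10 test.vmdk 1G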

[ kwolf: Adjust qemu-iotests common.filter ]

Signed-off-by: Janne Karhunen <Janne.Karhunen@gmail.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:08 +02:00
Zhou Jie
ed79f37d9b block: always compile-check debug prints
Files with conditional debug statements should ensure that the printf is
always compiled. This prevents bitrot of the format string of the debug
statement. And switch debug output to stderr.
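
The usual pattern looks roughly like this (DEBUG_FOO and DPRINTF are
illustrative names):

    #define DEBUG_FOO 0

    #define DPRINTF(fmt, ...) \
        do { \
            if (DEBUG_FOO) { \
                fprintf(stderr, "foo: " fmt, ## __VA_ARGS__); \
            } \
        } while (0)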

Signed-off-by: Zhou Jie <zhoujie2011@cn.fujitsu.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:08 +02:00
Wei Jiangang
547cb1574e block: Fix typo in comment
s/imlement/implement/

Signed-off-by: Wei Jiangang <weijg.fnst@cn.fujitsu.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
e3ddef25e9 block: Remove BlockDriver.bdrv_read/write
There are no block drivers left that implement the old .bdrv_read/write
interface, so it can be removed now. This gets us rid of the
corresponding emulation functions, too.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
4575eb496d vvfat: Implement .bdrv_co_preadv/pwritev interfaces
This doesn't really convert any of the actual vvfat logic to use
vectored I/O (and it's doubtful whether that would make sense), but
instead just adapts the wrappers to the modern interface.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
513b0f026b vpc: Implement .bdrv_co_pwritev() interface
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
d46b7cc680 vpc: Implement .bdrv_co_preadv() interface
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
37b1d7d8c9 vmdk: Implement .bdrv_co_pwritev() interface
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
f10cc24359 vmdk: Implement .bdrv_co_preadv() interface
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
a844a2b0d4 vmdk: Add vmdk_find_offset_in_cluster()
This is a byte granularity version of vmdk_find_index_in_cluster().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
fde9d56f5b vdi: Implement .bdrv_co_pwritev() interface
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
0865bb6f04 vdi: Implement .bdrv_co_preadv() interface
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
3edf1e73d5 dmg: Implement .bdrv_co_preadv() interface
This implements .bdrv_co_preadv() for the dmg block driver. While
updating the error paths, change -1 to a valid -errno code.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
5cd230819e cloop: Implement .bdrv_co_preadv() interface
This implements .bdrv_co_preadv() for the cloop block driver. While
updating the error paths, change -1 to a valid -errno code.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
3b8fd33011 bochs: Implement .bdrv_co_preadv() interface
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
3fb06697ae block: Introduce .bdrv_co_preadv/pwritev BlockDriver function
Many parts of the block layer are already byte granularity. The block
driver interface, however, was still missing an interface that allows
making use of this. This patch introduces a new BlockDriver interface,
which is based on coroutines, vectored, has flags and uses a byte
granularity. This is now the preferred interface for new drivers.
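
The new callbacks look roughly like this (signatures assumed from this
series):

    int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
    int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);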

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
cab3a3563c block: Rename bdrv_co_do_preadv/writev to bdrv_co_preadv/writev
It used to be an internal helper function just for implementing
bdrv_co_do_readv/writev(), but now that it's a public interface, it
deserves a name without "do" in it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:08 +02:00
Kevin Wolf
0884447382 block: Support AIO drivers in bdrv_driver_preadv/pwritev()
Instead of registering emulation functions as .bdrv_co_writev, just
directly check whether the function is there or not, and use the AIO
interface if it isn't. This makes the read/write functions more
consistent with how things are done in other places (flush, discard,
etc.)

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:07 +02:00
Kevin Wolf
78a07294d5 block: Introduce bdrv_driver_pwritev()
This is a function that simply calls into the block driver for doing a
write, providing the byte granularity interface we want to eventually
have everywhere, and using whatever interface that driver supports.

This one is a bit more interesting than the version for reads: It adds
support for .bdrv_co_writev_flags() everywhere, so that drivers
implementing this function can drop .bdrv_co_writev() now.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:07 +02:00
Kevin Wolf
166fe96051 block: Introduce bdrv_driver_preadv()
This is a function that simply calls into the block driver for doing a
read, providing the byte granularity interface we want to eventually
have everywhere, and using whatever interface that driver supports.

For now, this is just a wrapper for calling bs->drv->bdrv_co_readv().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
2016-05-12 15:22:07 +02:00
Paolo Bonzini
dd7f7ed104 linux-aio: make it more type safe
Replace void* with an opaque LinuxAioState type.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:07 +02:00
Paolo Bonzini
6b98bd6495 block: plug whole tree at once, introduce bdrv_io_unplugged_begin/end
Extract the handling of io_plug "depth" from linux-aio.c and let the
main bdrv_drain loop do nothing but wait on I/O.

Like the two newly introduced functions, bdrv_io_plug and bdrv_io_unplug
now operate on all children.  The visit order is now symmetrical between
plug and unplug, making it possible for formats to implement plug/unplug.

Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:07 +02:00
Paolo Bonzini
ce0f141259 block: introduce bdrv_no_throttling_begin/end
Extract the handling of throttling from bdrv_flush_io_queue.  These
new functions will soon become BdrvChildRole callbacks, as they can
be generalized to "beginning of drain" and "end of drain".

Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:07 +02:00
Paolo Bonzini
b6e84c97ed block: extract bdrv_drain_poll/bdrv_co_yield_to_drain from bdrv_drain/bdrv_co_drain
Do not call bdrv_drain_recurse twice in bdrv_co_drain.  A small
tweak to the logic in Fam's patch, which is harmless since no
one implements bdrv_drain anyway.  But better get it right.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:07 +02:00
Paolo Bonzini
a72f641407 block: move restarting of throttled reqs to block/throttle-groups.c
We want to remove throttled_reqs from block/io.c.  This is the easy
part---hide the handling of throttled_reqs during disable/enable of
throttling within throttle-groups.c.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:07 +02:00
Paolo Bonzini
733bbc8cea block: make bdrv_start_throttled_reqs return void
The return value is unused and I am not sure why it would be useful.

Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:07 +02:00
Kevin Wolf
90c78624f1 block: Don't disable I/O throttling on sync requests
We had to disable I/O throttling with synchronous requests because we
didn't use to run timers in nested event loops when the code was
introduced. This isn't true any more, and throttling works just fine
even when using the synchronous API.

The removed code is in fact dead code since commit a8823a3b ('block: Use
blk_co_pwritev() for blk_write()') because I/O throttling can only be
set on the top layer, but BlockBackend always uses the coroutine
interface now instead of using the sync API emulation in block.c.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <1458660792-3035-2-git-send-email-kwolf@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-12 15:22:07 +02:00
xiaoqiang.zhao
0bc91ab3bb hw/arm: QOM'ify versatilepb.c
Drop the use of old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:42:12 +01:00
xiaoqiang.zhao
5a67508c7a hw/arm: QOM'ify strongarm.c
Drop the use of old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:42:11 +01:00
xiaoqiang.zhao
15c4fff5d8 hw/arm: QOM'ify stellaris.c
* Drop the use of old SysBus init function and use instance_init
* Use DeviceClass::vmsd instead of 'vmstate_register' function

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:42:10 +01:00
xiaoqiang zhao
f68575c956 hw/arm: QOM'ify spitz.c
Drop the use of old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:42:09 +01:00
xiaoqiang.zhao
08ba3fde1d hw/arm: QOM'ify pxa2xx_pic.c
Remove the empty 'pxa2xx_pic_initfn' and its
setup code in the 'pxa2xx_pic_class_init'

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:42:08 +01:00
xiaoqiang.zhao
16fb31a382 hw/arm: QOM'ify pxa2xx.c
Drop the use of old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:42:07 +01:00
xiaoqiang.zhao
a1f42e0c9a hw/arm: QOM'ify integratorcp.c
* Drop the use of old SysBus init function and use instance_init
* Remove the empty 'icp_pic_class_init' from Typeinfo

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:42:06 +01:00
xiaoqiang.zhao
ff7a27c15a hw/arm: QOM'ify highbank.c
Drop the use of old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Crosthwaite <crosthwaite.peter@gmail.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:42:06 +01:00
xiaoqiang.zhao
3f5ab25490 hw/arm: QOM'ify armv7m.c
Drop the use of old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:40:48 +01:00
Peter Maydell
6459b94c26 target-arm: Avoid unnecessary TLB flush on TCR_EL2, TCR_EL3 writes
The TCR_EL2 and TCR_EL3 regdefs were incorrectly using the
vmsa_tcr_el1_write function for writes. Since these registers don't
have the A1 bit that TCR_EL1 does, we don't need to do a tlb_flush()
when they are written. Remove the unnecessary .writefn and also the
harmless but unneeded .raw_writefn and .resetfn definitions.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>
2016-05-12 13:22:30 +01:00
Peter Maydell
4274d821ff hw/display/blizzard: Remove blizzard_template.h
We no longer need to do the "multiply include this header" trick with
blizzard_template.h, and it is only used in a single .c file, so just
put its contents inline in blizzard.c.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1462371352-21498-3-git-send-email-peter.maydell@linaro.org
2016-05-12 13:22:30 +01:00
Peter Maydell
5c8759087d hw/display/blizzard: Expand out macros
Now that we can assume that only depth 32 is possible, there's no need
for the COPY_PIXEL1 and PIXEL_TYPE macros, and the SKIP_PIXEL, COPY_PIXEL
and SWAP_WORDS macros aren't used at all. Expand out COPY_PIXEL1 and
PIXEL_TYPE where they are used, delete the unused macro definitions, and
expand out the uses of glue(name_prefix, DEPTH).

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1462371352-21498-2-git-send-email-peter.maydell@linaro.org
2016-05-12 13:22:29 +01:00
Jean-Christophe DUBOIS
3a0f31bcb8 i.MX: Add sabrelite i.MX6 emulation.
The sabrelite supports one SPI FLASH memory on SPI1

Signed-off-by: Jean-Christophe Dubois <jcd@tribudubois.net>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:29 +01:00
Jean-Christophe DUBOIS
ec46eaa83a i.MX: Add i.MX6 SOC implementation.
For now we only support the following devices:
* up to 4 Cortex A9 cores
* A9 MPCORE (SCU, GIC, TWD)
* 5 i.MX UARTs
* 2 EPIT timers
* 1 GPT timer
* 3 I2C controllers
* 7 GPIO controllers
* 6 SDHC controllers
* 5 SPI controllers
* 1 CCM device
* 1 SRC device
* various ROM/RAM areas.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Jean-Christophe Dubois <jcd@tribudubois.net>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:29 +01:00
Jean-Christophe DUBOIS
c906a3a015 i.MX: Add the Freescale SPI Controller
Signed-off-by: Jean-Christophe Dubois <jcd@tribudubois.net>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:29 +01:00
Jean-Christophe DUBOIS
53374b16a2 FIFO: Add a FIFO32 implementation
This one is built on top of the existing FIFO8.
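
Typical usage would be along these lines (a sketch; function names are
assumed to mirror the existing FIFO8 API, capacity is arbitrary):

    Fifo32 fifo;
    uint32_t v;

    fifo32_create(&fifo, 32);          /* room for 32 words */
    fifo32_push(&fifo, 0xdeadbeef);
    if (!fifo32_is_empty(&fifo)) {
        v = fifo32_pop(&fifo);
    }
    fifo32_destroy(&fifo);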

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Jean-Christophe Dubois <jcd@tribudubois.net>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:29 +01:00
Jean-Christophe DUBOIS
1983057470 i.MX: Add i.MX6 System Reset Controller device.
This controller is also present in i.MX5X devices but they are not
yet emulated by QEMU.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Jean-Christophe Dubois <jcd@tribudubois.net>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:28 +01:00
Jean-Christophe DUBOIS
825482adde ARM: Factor out ARM on/off PSCI control functions
Split the ARM on/off functions from the PSCI support code.

This will allow these functions to be reused in other code.

Signed-off-by: Jean-Christophe Dubois <jcd@tribudubois.net>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:28 +01:00
Shannon Zhao
2b302e1e3c ACPI: Virt: Generate SRAT table
To support NUMA, we need to generate an SRAT ACPI table.

Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Message-id: 1461667229-9216-6-git-send-email-zhaoshenglong@huawei.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:28 +01:00
Shannon Zhao
64b831367b ACPI: move acpi_build_srat_memory to common place
Move acpi_build_srat_memory to common place so that it could be reused
by ARM. Rename it to build_srat_memory.

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Message-id: 1461667229-9216-5-git-send-email-zhaoshenglong@huawei.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:28 +01:00
Shannon Zhao
ea9fcbd7d0 ACPI: Fix the definition of proximity in AcpiSratMemoryAffinity
ACPI spec says that Proximity Domain is an "Integer that represents
the proximity domain to which the processor belongs". So define it as a
uint32_t.

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Message-id: 1461667229-9216-4-git-send-email-zhaoshenglong@huawei.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:28 +01:00
Shannon Zhao
e6e400d54f ACPI: Add GICC Affinity Structure
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Message-id: 1461667229-9216-3-git-send-email-zhaoshenglong@huawei.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:27 +01:00
Shannon Zhao
9695200ad8 ARM: Virt: Set numa-node-id for cpu and memory nodes
Generate memory nodes according to NUMA topology. Set numa-node-id
property for cpu and memory nodes.

Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Message-id: 1461667229-9216-2-git-send-email-zhaoshenglong@huawei.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:27 +01:00
xiaoqiang zhao
3c09d6caad hw/display: QOM'ify exynos4210_fimd.c
* Drop the old SysBus init function and use instance_init
* Move graphic_console_init into realize stage

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Message-id: 1462417489-28603-2-git-send-email-zxq_yx_007@163.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:27 +01:00
Edgar E. Iglesias
cd694521ca target-arm/translate-a64.c: Unify some of the ldst_reg decoding
The various load/store variants under disas_ldst_reg can all reuse the
same decoding for opc, size, rt and is_vector.

This patch unifies the decoding in preparation for generating
instruction syndromes for data aborts.
This will allow us to reduce the number of places to hook in updates
to the load/store state needed to generate the insn syndromes.

No functional change.

Reviewed-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Message-id: 1461931684-1867-7-git-send-email-edgar.iglesias@gmail.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:27 +01:00
Edgar E. Iglesias
026a19c312 target-arm/translate-a64.c: Use extract32 in disas_ldst_reg_imm9
Use extract32 instead of open coding the bit masking when decoding
is_signed and is_extended. This streamlines the decoding with some
of the other ldst variants.

No functional change.

Reviewed-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Message-id: 1461931684-1867-6-git-send-email-edgar.iglesias@gmail.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:27 +01:00
Peter Maydell
094d028a79 target-arm: Split data abort syndrome generator
Split the data abort syndrome generator into two versions:
One with a valid Instruction Specific Syndrome (ISS) and another without.

The following new flags are supported by the syndrome generator
with ISS:
* isv - Instruction syndrome valid
* sas - Syndrome access size
* sse - Syndrome sign extend
* srt - Syndrome register transfer
* sf  - Sixty-Four bit register width
* ar  - Acquire/Release

These flags are not yet used, so this patch has no functional change
except that we will now correctly set the IL bit in data abort
syndromes without ISS information.

Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Message-id: 1461931684-1867-5-git-send-email-edgar.iglesias@gmail.com
[PMM: squashed in with patch which was just adding the IL bit]
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:26 +01:00
Edgar E. Iglesias
25caa94c4a gen-icount: Use tcg_set_insn_param
Use tcg_set_insn_param() instead of directly accessing internal
tcg data structures to update an insn param.

Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Message-id: 1461931684-1867-3-git-send-email-edgar.iglesias@gmail.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:26 +01:00
Edgar E. Iglesias
1d41478fd4 tcg: Add tcg_set_insn_param
Add tcg_set_insn_param as a mechanism to modify an insn
parameter after emiting the insn. This is useful for icount
and also for embedding fault information for a specific insn.

Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Message-id: 1461931684-1867-2-git-send-email-edgar.iglesias@gmail.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:26 +01:00
Sergey Sorokin
dddb522341 target-arm: Fix descriptor address masking in ARM address translation
There is a bug in the ARM address translation regime with the
long-descriptor format.  When a descriptor is read, its address is formed
from an index which is part of the input address.  On the first iteration
this index is incorrectly masked with the 'grainsize' mask, but according
to the pseudo-code it can be wider.  On iterations other than the first,
the descriptor address is formed from the previous-level descriptor by
masking with the 'descaddrmask' value.  That always clears just the 12
lower bits, but according to the pseudo-code it must clear the 'grainsize'
lower bits instead.  The patch fixes both cases.

Signed-off-by: Sergey Sorokin <afarallax@yandex.ru>
Message-id: 1460996853-22117-1-git-send-email-afarallax@yandex.ru
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:26 +01:00
Sergey Sorokin
dfda68377e target-arm: Stage 2 permission fault was fixed in AArch32 state
As described in AArch32.CheckS2Permission, an instruction fetch fails if
the XN bit is set or there is no read permission for the address.

Signed-off-by: Sergey Sorokin <afarallax@yandex.ru>
Message-id: 1461002400-3187-1-git-send-email-afarallax@yandex.ru
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:26 +01:00
Zhou Jie
0b062eb090 hw/arm/nseries: Allocating Large sized arrays to heap
n8x0_init has a huge stack usage of approximately 65536 bytes.
Move the large arrays to the heap to reduce stack usage.

Signed-off-by: Zhou Jie <zhoujie2011@cn.fujitsu.com>
Message-id: 1461651308-894-1-git-send-email-zhoujie2011@cn.fujitsu.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:25 +01:00
Sylvain Garrigues
27a5dc7be6 bcm2835_property: use cached values when querying framebuffer
As the framebuffer settings are copied into the result message before it is
reconfigured, inconsistent behavior can happen when, for instance, a single
message sets the width, height, and depth, and at the same time asks to
allocate the buffer and query the pitch and size.

In this case, the reported pitch and size would be incorrect as they were
computed with the initial values of width, height and depth, not the ones the
client requested.

Signed-off-by: Sylvain Garrigues <sylvain@sylvaingarrigues.com>
Reviewed-by: Andrew Baumann <Andrew.Baumann@microsoft.com>
Message-id: 1461325343-24995-1-git-send-email-sylvain@sylvaingarrigues.com
[PMM: folded a couple of long lines]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:25 +01:00
xiaoqiang zhao
0a750e2a78 hw/intc: QOM'ify omap_intc.c
* Split the old SysBus init into an instance_init and a
  DeviceClass::realize function
* Drop the old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:25 +01:00
xiaoqiang.zhao
22c70d8a6a hw/intc: QOM'ify grlib_irqmp.c
* Split the old SysBus init into an instance_init and a
  DeviceClass::realize function
* Drop the old SysBus init function

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
[PMM: corrected "can not" to "cannot" in error message]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:25 +01:00
xiaoqiang.zhao
c09008d2d3 hw/intc: QOM'ify slavio_intctl.c
Drop the old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:25 +01:00
xiaoqiang.zhao
e3be8b4f4f hw/intc: QOM'ify pl190.c
Drop the old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:25 +01:00
xiaoqiang.zhao
f777bda60f hw/intc: QOM'ify imx_avic.c
Drop the old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:24 +01:00
xiaoqiang.zhao
68d71616c0 hw/intc: QOM'ify exynos4210_gic.c
* Drop the old SysBus init function and use instance_init
* Split the exynos4210_irq_gate_init into an instance_init
  and a DeviceClass::realize function

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:24 +01:00
xiaoqiang.zhao
d3d5a6febd hw/intc: QOM'ify exynos4210_combiner.c
Drop the old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:24 +01:00
xiaoqiang.zhao
b46818e9e7 hw/intc: QOM'ify etraxfs_pic.c
Drop the old SysBus init function and use instance_init

Signed-off-by: xiaoqiang zhao <zxq_yx_007@163.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Tested-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:24 +01:00
Pooja Dhannawat
ea644cf343 omap_lcdc: Remove support for DEPTH != 32
surface_bits_per_pixel() always returns 32,
so remove the other dead code, which is
based on DEPTH != 32.

Signed-off-by: Pooja Dhannawat <dhannawatpooja1@gmail.com>
Message-id: 1459260142-9144-1-git-send-email-dhannawatpooja1@gmail.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:24 +01:00
Pooja Dhannawat
5c87c4089a blizzard: Remove support for DEPTH != 32
Remove support for DEPTH != 32 from the blizzard template header
and the file that includes it, as only DEPTH == 32 is used.

Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Pooja Dhannawat <dhannawatpooja1@gmail.com>
Message-id: 1458971873-2768-1-git-send-email-dhannawatpooja1@gmail.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 13:22:24 +01:00
Peter Maydell
26617924e9 Open 2.7 development tree
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-12 12:35:25 +01:00
Aurelien Jarno
9d989c732b target-mips: fix call to memset in soft reset code
Recent versions of GCC report the following error when compiling
target-mips/helper.c:

  qemu/target-mips/helper.c:542:9: warning: ‘memset’ used with length
  equal to number of elements without multiplication by element size
  [-Wmemset-elt-size]

This is indeed correct and due to a wrong usage of sizeof(). Fix that.

Cc: Stefan Weil <sw@weilnetz.de>
Cc: Leon Alrae <leon.alrae@imgtec.com>
Cc: qemu-stable@nongnu.org
LP: https://bugs.launchpad.net/qemu/+bug/1577841
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Leon Alrae <leon.alrae@imgtec.com>
Signed-off-by: Leon Alrae <leon.alrae@imgtec.com>
2016-05-12 11:01:05 +01:00
Eric Blake
68ab47e4b4 qapi: Change visit_type_FOO() to no longer return partial objects
Returning a partial object on error is an invitation for a careless
caller to leak memory.  We already fixed things in an earlier
patch to guarantee NULL if visit_start fails ("qapi: Guarantee
NULL obj on input visitor callback error"), but that does not
help the case where visit_start succeeds but some other failure
happens before visit_end, such that we leak a partially constructed
object outside visit_type_FOO(). As no one outside the testsuite
was actually relying on these semantics, it is cleaner to just
document and guarantee that ALL pointer-based visit_type_FOO()
functions always leave a safe value in *obj during an input visitor
(either the new object on success, or NULL if an error is
encountered), so callers can now unconditionally use
qapi_free_FOO() to clean up regardless of whether an error occurred.

The decision is done by adding visit_is_input(), then updating the
generated code to check if additional cleanup is needed based on
the type of visitor in use.

Note that we still leave *obj unchanged after a scalar-based
visit_type_FOO(); I did not feel like auditing all uses of
visit_type_Enum() to see if the callers would tolerate a specific
sentinel value (not to mention having to decide whether it would
be better to use 0 or ENUM__MAX as that sentinel).
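
In caller terms the guarantee means code like the following is now safe
('Foo' is a placeholder QAPI type):

    Foo *foo = NULL;
    Error *err = NULL;

    visit_type_Foo(v, "foo", &foo, &err);
    /* on input-visitor error, foo is guaranteed to be NULL here */
    qapi_free_Foo(foo);        /* unconditional cleanup is fine */
    error_propagate(errp, err);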

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-25-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:55 +02:00
Eric Blake
d9f62dde13 qapi: Simplify semantics of visit_next_list()
The semantics of the list visit are somewhat baroque, with the
following pseudocode when FooList is used:

start()
for (prev = head; cur = next(prev); prev = &cur) {
    visit(&cur->value)
}

Note that these semantics (advance before visit) requires that
the first call to next() return the list head, while all other
calls return the next element of the list; that is, every visitor
implementation is required to track extra state to decide whether
to return the input as-is, or to advance.  It also requires an
argument of 'GenericList **' to next(), solely because the first
iteration might need to modify the caller's GenericList head, so
that all other calls have to do a layer of dereferencing.

Thankfully, we only have two uses of list visits in the entire
code base: one in spapr_drc (which completely avoids
visit_next_list(), feeding in integers from a different source
than uint8List), and one in qapi-visit.py.  That is, all other
list visitors are generated in qapi-visit.c, and share the same
paradigm based on a qapi FooList type, so we can refactor how
lists are laid out with minimal churn among clients.

We can greatly simplify things by hoisting the special case
into the start() routine, and flipping the order in the loop
to visit before advance:

start(head)
for (tail = *head; tail; tail = next(tail)) {
    visit(&tail->value)
}

With the simpler semantics, visitors have less state to track,
the argument to next() is reduced to 'GenericList *', and it
also becomes obvious whether an input visitor is allocating a
FooList during visit_start_list() (rather than the old way of
not knowing if an allocation happened until the first
visit_next_list()).  As a minor drawback, we now allocate in
two functions instead of one, and have to pass the size to
both functions (unless we were to tweak the input visitors to
cache the size to start_list for reuse during next_list, but
that defeats the goal of less visitor state).

The signature of visit_start_list() is chosen to match
visit_start_struct(), with the new parameters after 'name'.

The spapr_drc case is a virtual visit, done by passing NULL for
list, similarly to how NULL is passed to visit_start_struct()
when a qapi type is not used in those visits.  It was easy to
provide these semantics for qmp-output and dealloc visitors,
and a bit harder for qmp-input (several prerequisite patches
refactored things to make this patch straightforward).  But it
turned out that the string and opts visitors munge enough other
state during visit_next_list() to make it easier to just
document and require a GenericList visit for now; an assertion
will remind us to adjust things if we need the semantics in the
future.

Several pre-requisite cleanup patches made the reshuffling of
the various visitors easier; particularly the qmp input visitor.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-24-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:55 +02:00
Eric Blake
74f24cb630 qapi: Fix string input visitor handling of invalid list
As shown in the previous commit, the string input visitor was
treating bogus input as an empty list rather than an error.
Fix parse_str() to set errp, then the callers to exit early if
an error was reported.

Meanwhile, fix the testsuite to use the generated
qapi_free_int16List() instead of rolling our own, and to
validate the fixed behavior, while at the same time documenting
one more change that we'd like to make in a later patch (a
failed visit_start_list should guarantee a NULL pointer,
regardless of what things were on input).

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-23-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:55 +02:00
Markus Armbruster
7337468385 tests/string-input-visitor: Add negative integer tests
Add two negative tests, one for int and one for int16List.  The latter
exposes a bug: nonsensical input results in an empty list instead of
an error.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1461325048-14122-1-git-send-email-armbru@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-22-git-send-email-eblake@redhat.com>
2016-05-12 09:47:55 +02:00
Eric Blake
15c2f669e3 qapi: Split visit_end_struct() into pieces
As mentioned in previous patches, we want to call visit_end_struct()
functions unconditionally, so that visitors can release resources
tied up since the matching visit_start_struct() without also having
to worry about error priority if more than one error occurs.

Even though error_propagate() can be safely used to ignore a second
error during cleanup caused by a first error, it is simpler if the
cleanup cannot set an error.  So, split out the error checking
portion (basically, input visitors checking for unvisited keys) into
a new function visit_check_struct(), which can be safely skipped if
any earlier errors are encountered, and leave the cleanup portion
(which never fails, but must be called unconditionally if
visit_start_struct() succeeded) in visit_end_struct().

Generated code in qapi-visit.c has diffs resembling:

|@@ -59,10 +59,12 @@ void visit_type_ACPIOSTInfo(Visitor *v,
|         goto out_obj;
|     }
|     visit_type_ACPIOSTInfo_members(v, obj, &err);
|-    error_propagate(errp, err);
|-    err = NULL;
|+    if (err) {
|+        goto out_obj;
|+    }
|+    visit_check_struct(v, &err);
| out_obj:
|-    visit_end_struct(v, &err);
|+    visit_end_struct(v);
| out:

and in qapi-event.c:

|@@ -47,7 +47,10 @@ void qapi_event_send_acpi_device_ost(ACP
|         goto out;
|     }
|     visit_type_q_obj_ACPI_DEVICE_OST_arg_members(v, &param, &err);
|-    visit_end_struct(v, err ? NULL : &err);
|+    if (!err) {
|+        visit_check_struct(v, &err);
|+    }
|+    visit_end_struct(v);
|     if (err) {
|         goto out;

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-20-git-send-email-eblake@redhat.com>
[Conflict with a doc fixup resolved]
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:55 +02:00
Eric Blake
56a6f02b8c qmp: Tighten output visitor rules
Tighten assertions in the QMP output visitor, so that:

- qmp_output_get_qobject() can only be called after pairing a
visit_end_* for every visit_start_* (rather than allowing it on
a partially built object)

- qmp_output_get_qobject() cannot be called unless at least one
visit_type_* or visit_start/visit_end pair has occurred since
creation/reset (the accidental return of NULL fixed by commit
ab8bf1d7 would have been much easier to diagnose)

- ensure that we are encountering the expected object or list
type, to provide protection against mismatched push(struct)/
pop(list) or push(list)/pop(struct), similar to the qmp-input
protection added in commit bdd8e6b5.

- ensure that except for the root, 'name' is non-null inside a
dict, and NULL inside a list (this may need changing later if
we add "name.0" support for better error messages for a list,
but for now it makes sure all users are at least consistent)

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-19-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:55 +02:00
Eric Blake
f2ff429bfa qmp: Don't reuse qmp visitor after grabbing output
The testsuite was the only client that attempted to reuse a
QmpOutputVisitor for a second visit after encountering an
error and/or calling qmp_output_get_qobject() on a first
visit.  The next patch is about to tighten the semantics to
be one-shot usage of the visitor, like all other visitors
(which will enable further simplifications down the road).

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1462854006-24658-1-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:55 +02:00
Eric Blake
a543a554cf spapr_drc: Expose 'null' in qom-get when there is no fdt
Now that the QMP output visitor supports an explicit null
output, we should utilize it to make it easier to diagnose
the difference between a missing fdt ('null') vs. a
present-but-empty one ('{}').

(Note that this reverts the behavior of commit ab8bf1d, taking
us back to the behavior of commit 6c2f9a1 [which in turn
stemmed from a crash fix in 1d10b44]; but that this time,
the change is intentional and not an accidental side-effect.)

Signed-off-by: Eric Blake <eblake@redhat.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Message-Id: <1461879932-9020-17-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
3df016f185 qmp: Support explicit null during visits
Implement the new type_null() callback for the qmp input and
output visitors. While we don't yet have a use for this in QAPI
input (the generator will need some tweaks first), some
potential usages have already been discussed on the list.
Meanwhile, the output visitor could already output explicit null
via type_any, but this gives us finer control.

At any rate, it's easy to test that we can round-trip an explicit
null through manual use of visit_type_null() wrapped by a virtual
visit_start_struct() walk, even if we can't do the visit in a
QAPI type.  Repurpose the test_visitor_out_empty test,
particularly since a future patch will tighten semantics to
forbid use of qmp_output_get_qobject() without at least one
intervening visit_type_*.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-16-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
3bc97fd592 qapi: Add visit_type_null() visitor
Right now, qmp-output-visitor happens to produce a QNull result
if nothing is actually visited between the creation of the visitor
and the request for the resulting QObject.  A stronger protocol
would require that a QMP output visit MUST visit something.  But
to still be able to produce a JSON 'null' output, we need a new
visitor function that states our intentions.  Yes, we could say
that such a visit must go through visit_type_any(), but that
feels clunky.

So this patch introduces the new visit_type_null() interface and
its no-op interface in the dealloc visitor, and stubs in the
qmp visitors (the next patch will finish the implementation).
For the visitors that will not implement the callback, document
the situation. The code in qapi-visit-core unconditionally
dereferences the callback pointer, so that a segfault will inform
a developer if they need to implement the callback for their
choice of visitor.
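
For reference, the addition is small; roughly (per this description, exact
prototypes in the tree may differ slightly):

    /* visitor.h: the public wrapper */
    void visit_type_null(Visitor *v, const char *name, Error **errp);

    /* visitor-impl.h: the optional callback; visitors that leave it
     * unset will crash in the wrapper, by design, if it is ever used */
    void (*type_null)(Visitor *v, const char *name, Error **errp);

    /* the dealloc visitor's implementation can simply be a no-op */
    static void qapi_dealloc_type_null(Visitor *v, const char *name,
                                       Error **errp)
    {
    }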

Note that JSON has a primitive null type, with the single value
null; likewise with the QNull type for QObject; but for QAPI,
we just have the 'null' value without a null type.  We may
eventually want to add more support in QAPI for null (most likely,
we'd use it via an alternate type that permits 'null' or an
object); but we'll create that usage when we need it.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-15-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
7d7a337ec3 tests: Add check-qnull
Add a new test, for checking reference counting of qnull(). As
part of the new file, move a previous reference counting change
added in commit a861564 to a more logical place.

Note that while most of the check-q*.c leave visitor stuff to
the test-qmp-*-visitor.c, in this case we actually want the
visitor tests in our new file because we are validating the
reference count of qnull_, which is an internal detail that
test-qmp-*-visitor should not be peeking into (or put another
way, qnull() is the only special case where we don't have
independent allocation of a QObject, so none of the other
visitor tests require the layering violation present in this
test).

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-14-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
adfb264c9e qapi: Document visitor interfaces, add assertions
The visitor interface for mapping between QObject/QemuOpts/string
and QAPI is scandalously under-documented, making changes to visitor
core, individual visitors, and users of visitors difficult to
coordinate.  Among other questions: when is it safe to pass NULL,
vs. when a string must be provided; which visitors implement which
callbacks; the difference between concrete and virtual visits.

Correct this by retrofitting proper contracts, and document where some
of the interface warts remain (for example, we may want to modify
visit_end_* to require the same 'obj' as the visit_start counterpart,
so the dealloc visitor can be simplified).  Later patches in this
series will tackle some, but not all, of these warts.

Add assertions to (partially) enforce the contract.  Some of these
were only made possible by recent cleanup commits.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-13-git-send-email-eblake@redhat.com>
[Doc fix from Eric squashed in]
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
fcf3cb2178 qmp-input: Refactor when list is advanced
In the QMP input visitor, visiting a list traverses two objects:
the QAPI GenericList of the caller (which gets advanced in
visit_next_list() regardless of this patch), and the QList input
that we are converting to QAPI.  For consistency with QDict
visits, we want to consume elements from the input QList during
the visit_type_FOO() for the list element; that is, we want ALL
the code for consuming an input to live in qmp_input_get_object(),
rather than having it split according to whether we are visiting
a dict or a list.  Making qmp_input_get_object() the common point
of consumption will make it easier for a later patch to refactor
visit_start_list() to cover the GenericList * head of a QAPI list,
and in turn will get rid of the 'first' flag (which lived in
qmp_input_next_list() pre-patch, and is hoisted to StackObject
by this patch).

This patch is therefore altering the post-condition use of 'entry',
while keeping what gets visited unchanged, from:

        start_list next_list type_ELT ... next_list type_ELT next_list end_list
 visits                      1st elt                last elt
 entry  NULL       1st elt   1st elt      last elt  last elt NULL      gone

where type_ELT() returns (entry ? entry : 1st elt) and next_list() steps
entry

to this usage:

        start_list next_list type_ELT ... next_list type_ELT next_list end_list
 visits                      1st elt                last elt
 entry  1st elt    1st elt   2nd elt      last elt  NULL     NULL      gone

where type_ELT() steps entry and returns the old entry, and next_list()
leaves entry alone.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-12-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
ce140b1769 qmp-input: Require struct push to visit members of top dict
Don't embed the root of the visit into the stack of current
containers being visited.  That way, we no longer get confused
on whether the first visit of a dictionary is to the dictionary
itself or to one of the members of the dictionary, based on
whether the caller passed name=NULL; and makes the QMP Input
visitor like other visitors where the value of 'name' is now
ignored on the root visit.  (We may someday want to revisit
the rules on what 'name' should be on a top-level visit,
rather than just ignoring it; but that would be the topic of
another patch).

An audit of all qmp_input_visitor_new() call sites shows that
there were only two places where callers had previously been
visiting to a QDict with a non-NULL name to bypass a call to
visit_start_struct(), and those were fixed in prior patches.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-11-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
ad739706bb qom: Wrap prop visit in visit_start_struct
The qmp-input visitor was allowing callers to play rather fast
and loose: when visiting a QDict, you could grab members of the
root dictionary without first pushing into the dict; the final
such culprit was the QOM code for converting to and from object
properties.  But we are about to tighten the input visitor, at
which point user_creatable_add_type() as called with a QMP input
visitor via qmp_object_add() MUST follow the same paradigms as
everyone else, of pushing into the struct before grabbing its
keys.

The use of 'err ? NULL : &err' is temporary; a later patch will
clean that up when it splits visit_end_struct().

Furthermore, note that both callers always pass qdict, so we can
convert the conditional into an assert and reduce indentation.

The change has no impact to the testsuite now, but is required to
avoid a failure in tests/test-netfilter once qmp-input is made
stricter to detect inconsistent 'name' arguments on the root visit.

Since user_creatable_add_type() is also called with OptsVisitor
through user_creatable_add_opts(), we must also check that there
is no negative impact there; both pre- and post-patch, we see:

$ ./x86_64-softmmu/qemu-system-x86_64 -nographic -nodefaults -qmp stdio -object secret,id=sec0,data=letmein,format=raw,foo=bar
qemu-system-x86_64: -object secret,id=sec0,data=letmein,format=raw,foo=bar: Property '.foo' not found

That is, the only new checking that the new visit_end_struct() can
perform is for excess input, but we already catch excess input
earlier in object_property_set().

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-10-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
ed84153519 qapi-commands: Wrap argument visit in visit_start_struct
The qmp-input visitor was allowing callers to play rather fast
and loose: when visiting a QDict, you could grab members of the
root dictionary without first pushing into the dict; among the
culprit callers was the generated marshal code on the 'arguments'
dictionary of a QMP command.  But we are about to tighten the
input visitor, at which point the generated marshal code MUST
follow the same paradigms as everyone else, of pushing into the
struct before grabbing its keys.

Generated code grows as follows:

|@@ -515,7 +641,12 @@ void qmp_marshal_blockdev_backup(QDict *
|     BlockdevBackup arg = {0};
|
|     v = qmp_input_get_visitor(qiv);
|+    visit_start_struct(v, NULL, NULL, 0, &err);
|+    if (err) {
|+        goto out;
|+    }
|     visit_type_BlockdevBackup_members(v, &arg, &err);
|+    visit_end_struct(v, err ? NULL : &err);
|     if (err) {
|         goto out;
|     }
|@@ -527,7 +715,9 @@ out:
|     qmp_input_visitor_cleanup(qiv);
|     qdv = qapi_dealloc_visitor_new();
|     v = qapi_dealloc_get_visitor(qdv);
|+    visit_start_struct(v, NULL, NULL, 0, NULL);
|     visit_type_BlockdevBackup_members(v, &arg, NULL);
|+    visit_end_struct(v, NULL);
|     qapi_dealloc_visitor_cleanup(qdv);
| }

The use of 'err ? NULL : &err' is temporary; a later patch will
clean that up when it splits visit_end_struct().

Prior to this patch, the fact that there was no final
visit_end_struct() meant that even though we are using a strict
input visit, the marshalling code was not detecting excess input
at the top level (only in nested levels).  Fortunately, we have
code in monitor.c:qmp_check_client_args() that also checks for
no excess arguments at the top level.  But as the generated code
is more compact than the manual check, a later patch will clean
up monitor.c to drop the redundancy added here.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-9-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
e5826a2fd7 qmp-input: Don't consume input when checking has_member
Commit e8316d7 mistakenly passed consume=true within
qmp_input_optional() when checking if an optional member was
present, but the mistake was silently ignored since the code
happily let us extract a member more than once.  Fix
qmp_input_optional() to not consume anything, then tighten up
the input visitor to ensure that a member is consumed exactly
once (all generated code follows this pattern; and the new
assert will catch any hand-written code that tries to visit
the same key more than once).
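
The fixed helper then looks roughly like this (sketch based on the
description above; to_qiv() and qmp_input_get_object() are the visitor's
existing internals):

    static void qmp_input_optional(Visitor *v, const char *name, bool *present)
    {
        QmpInputVisitor *qiv = to_qiv(v);
        /* consume=false: just peek; the actual visit consumes the member,
         * exactly once, later on */
        QObject *qobj = qmp_input_get_object(qiv, name, false);

        *present = (qobj != NULL);
    }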

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-8-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
240f64b6dc qapi: Use strict QMP input visitor in more places
The following uses of a QMP input visitor should be strict
(that is, excess keys in QDict input should be flagged if not
converted to QAPI):

- Testsuite code unrelated to explicitly testing non-strict
mode (test-qmp-commands, test-visitor-serialization); since
we want more code to be strict by default, having more tests
of strict mode doesn't hurt

- Code used for cloning QAPI objects (replay-input.c,
qemu-sockets.c); we are reparsing a QObject just barely
produced by the qmp output visitor and which therefore should
not have any garbage, so while it is extra work to be strict,
it validates that our clone is correct [note that a later patch
series will simplify these two uses by creating an actual
clone visitor that is much more efficient than a
generate/reparse cycle]

- qmp_object_add(), which calls into user_creatable_add_type().
Since command line parsing for '-object' uses the same
user_creatable_add_type() through the OptsVisitor, and that is
always strict, we want to ensure that any nested dictionaries
would be treated the same in QMP and from the command line (I
don't actually know if such nested dictionaries exist).  Note
that on this code change, strictness only matters for nested
dictionaries (if even possible), since we already flag excess
input at the top level during an earlier object_property_set()
on an unknown key, whether from QemuOpts:

$ ./x86_64-softmmu/qemu-system-x86_64 -nographic -nodefaults -qmp stdio -object secret,id=sec0,data=letmein,format=raw,foo=bar
qemu-system-x86_64: -object secret,id=sec0,data=letmein,format=raw,foo=bar: Property '.foo' not found

or from QMP:

$ ./x86_64-softmmu/qemu-system-x86_64 -nographic -nodefaults -qmp stdio
{"QMP": {"version": {"qemu": {"micro": 93, "minor": 5, "major": 2}, "package": ""}, "capabilities": []}}
{"execute":"qmp_capabilities"}
{"return": {}}
{"execute":"object-add","arguments":{"qom-type":"secret","id":"sec0","props":{"format":"raw","data":"letmein","foo":"bar"}}}
{"error": {"class": "GenericError", "desc": "Property '.foo' not found"}}

The only remaining uses of non-strict input visits are:

- QMP 'qom-set' (which eventually executes
object_property_set_qobject()) - mark it as something to revisit
in the future (I didn't want to spend any more time on this patch
auditing if we have any QOM dictionary properties that might be
impacted, and couldn't easily prove whether this code path is
shared with anything else).

- test-qmp-input-visitor: explicit tests of non-strict mode. If
we later get rid of users that don't need strictness, then this
test should be merged with test-qmp-input-strict

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-7-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
fc471c18d5 qapi: Consolidate QMP input visitor creation
Rather than having two separate ways to create a QMP input
visitor, where the safer approach has the more verbose name,
it is better to consolidate things into a single function
where the caller must explicitly choose whether to be strict
or to ignore excess input.  This patch is the strictly
mechanical conversion; the next patch will then audit which
uses can be made stricter.
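
After the conversion, creating an input visitor looks roughly like this
(sketch; 'request' stands for whatever QObject is being parsed):

    QmpInputVisitor *qiv = qmp_input_visitor_new(request, true /* strict */);
    Visitor *v = qmp_input_get_visitor(qiv);
    /* ... perform the visit ... */
    qmp_input_visitor_cleanup(qiv);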

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-6-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
b471d012e5 qmp-input: Clean up stack handling
Management of the top of stack was a bit verbose; creating a
temporary variable and adding some comments makes the existing
code more legible before the next few patches improve things.
No semantic changes other than asserting that we are always
visiting a QObject, and not a NULL value.  In particular, the
check for 'name && qobject_type(qobj) == QTYPE_QDICT)' is a
bit overkill (a dict visit should always have a name); a later
patch revisits that, while this patch is only changing one
layer of indentation due to dropping 'if (qobj)'.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-5-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
42a502a7a6 qmp: Drop dead command->type
Ever since QMP was first added back in commit 43c20a43, we have
never had any QmpCommandType other than QCT_NORMAL.  It's
pointless to carry around the cruft.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-4-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
e58d695e6c qapi: Guarantee NULL obj on input visitor callback error
Our existing input visitors were not very consistent on errors in a
function taking 'TYPE **obj'.  These are start_struct(),
start_alternate(), type_str(), and type_any().  next_list() is
similar, but can't fail (see commit 08f9541).  While all of them set
'*obj' to allocated storage on success, it was not obvious whether
'*obj' was guaranteed safe on failure, or whether it was left
uninitialized.  But a future patch wants to guarantee that
visit_type_FOO() does not leak a partially-constructed obj back to
the caller; it is easier to implement this if we can reliably state
that input visitors assign '*obj' regardless of success or failure,
and that on failure *obj is NULL.  Add assertions to enforce
consistency in the final setting of err vs. *obj.
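
The enforced invariant is roughly the following (sketch; the real assertions
live in qapi-visit-core.c and rely on the visitor classification added
earlier in this series):

    void visit_start_struct(Visitor *v, const char *name, void **obj,
                            size_t size, Error **errp)
    {
        Error *err = NULL;

        v->start_struct(v, name, obj, size, &err);
        if (obj && v->type == VISITOR_INPUT) {
            /* exactly one of: an error was set, or *obj was allocated */
            assert(!err != !*obj);
        }
        error_propagate(errp, err);
    }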

The opts-visitor start_struct() doesn't set an error, but it
also was doing a weird check for 0 size; all callers pass in
non-zero size if obj is non-NULL.

The testsuite has at least one spot where we no longer need
to pre-initialize a variable prior to a visit; valgrind confirms
that the test is still fine with the cleanup.

A later patch will document the design constraint implemented
here.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-3-git-send-email-eblake@redhat.com>
[visit_start_alternate()'s assertion tightened, commit message tweaked]
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Eric Blake
983f52d4b3 qapi-visit: Add visitor.type classification
We have three classes of QAPI visitors: input, output, and dealloc.
Currently, all implementations of these visitors have one thing in
common based on their visitor type: the implementation used for the
visit_type_enum() callback.  But since we plan to add more such
common behavior, in relation to documenting and further refining
the semantics, it makes more sense to have the visitor
implementations advertise which class they belong to, so the common
qapi-visit-core code can use that information in multiple places.

A later patch will better document the types of visitors directly
in visitor.h.

For this patch, knowing the class of a visitor implementation lets
us make input_type_enum() and output_type_enum() become static
functions, by replacing the callback function Visitor.type_enum()
with the simpler enum member Visitor.type.  Share a common
assertion in qapi-visit-core as part of the refactoring.

Move comments in opts-visitor.c to match the refactored layout.
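
Concretely, the classification is just a small enum stored in the Visitor
struct (sketch; spelling follows this description, the header may differ in
detail):

    typedef enum VisitorType {
        VISITOR_INPUT,
        VISITOR_OUTPUT,
        VISITOR_DEALLOC,
    } VisitorType;

    struct Visitor {
        /* ... callbacks ... */

        /* replaces the per-visitor type_enum() callback */
        VisitorType type;
    };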

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-2-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-05-12 09:47:54 +02:00
Peter Maydell
bfc766d38e Update version for v2.6.0 release
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-11 16:44:26 +01:00
Stefan Weil
a277c3e094 usb: Support compilation without poll.h
This is a hack to support compilation with Mingw-w64 which provides
a libusb-1.0 package, but no poll.h.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Message-id: 1458630800-10088-1-git-send-email-sw@weilnetz.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 10:37:39 +02:00
Isaac Lozano
1f66fe5778 usb-mtp: fix usb_mtp_get_device_info so that libmtp on the guest doesn't complain
If an application uses libmtp on the guest system,
it will complain with the warning message:
LIBMTP WARNING: VendorExtensionID: ffffffff
LIBMTP WARNING: VendorExtensionDesc: (null)
LIBMTP WARNING: this typically means the device is PTP (i.e. a camera) but
not a MTP device at all. Trying to continue anyway.

This is because libmtp expects an MTP Vendor Extension ID of 0x00000006 and an
MTP Version of 0x0064. These numbers are taken from Microsoft's MTP Vendor
Extension Identification Message page and are what most physical devices
show.

Signed-off-by: Isaac Lozano <109lozanoi@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1460892593-5908-1-git-send-email-109lozanoi@gmail.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 10:33:49 +02:00
Roman Kagan
491d68d938 usb:xhci: no DMA on HC reset
This patch is a rough fix to a memory corruption we are observing when
running VMs with xhci USB controller and OVMF firmware.

Specifically, on the following call chain

xhci_reset
  xhci_disable_slot
    xhci_disable_ep
      xhci_set_ep_state

QEMU overwrites guest memory using stale guest addresses.

This doesn't happen when the guest (firmware) driver sets up xhci for
the first time, as there are no slots configured yet.  However, when the
firmware hands over control to the OS, some slots and endpoints are
already set up with their context in guest RAM.  Now the OS' driver
resets the controller again, and xhci_set_ep_state then reads and writes
that memory, which is now owned by the OS.

As a quick fix, skip calling xhci_set_ep_state in xhci_disable_ep if the
device context base address array pointer is zero (indicating we're in
the HC reset and no DMA is possible).
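
The guard boils down to something like this (sketch; field and function
names assumed from hw/usb/hcd-xhci.c, epctx stands for the endpoint context
looked up earlier in the function):

    static void xhci_disable_ep(XHCIState *xhci, unsigned int slotid,
                                unsigned int epid)
    {
        /* ... look up and free the endpoint context (epctx) ... */

        /* during HC reset the DCBAA pointer is zero: skip the DMA write */
        if (xhci->dcbaap_low || xhci->dcbaap_high) {
            xhci_set_ep_state(xhci, epctx, NULL, EP_DISABLED);
        }
    }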

Cc: qemu-stable@nongnu.org
Signed-off-by: Roman Kagan <rkagan@virtuozzo.com>
Message-id: 1462384435-1034-1-git-send-email-rkagan@virtuozzo.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 10:29:28 +02:00
Cole Robinson
bb732ee78c ui: gtk: Fix some deprecation warnings
All device manager APIs are deprecated now. Much of our usage is
just to get the current pointer, so centralize that logic and use
the new seat APIs.
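
The centralized lookup can then be as small as this (sketch; the helper name
is hypothetical, gdk_display_get_default_seat()/gdk_seat_get_pointer() are
the GTK 3.20 seat APIs):

    static GdkDevice *gd_get_pointer(GdkDisplay *dpy)
    {
        return gdk_seat_get_pointer(gdk_display_get_default_seat(dpy));
    }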

Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: d6dec24220a4e1449a0172119c10c48e145c0f6f.1462557436.git.crobinso@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:41 +02:00
Cole Robinson
84e2dc4bf3 ui: gtk: Fix a runtime warning on vte >= 0.37
inner-border was dropped in vte API 2.91, in favor of the standard
padding style

Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: 60a6cdc337d611d902f53907e66a8f37ea374d65.1462557436.git.crobinso@redhat.com

[ kraxel: Fix warning with old vte version. ]

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:41 +02:00
Cole Robinson
c6feff9e09 configure: support vte-2.91
vte >= 0.37 exposes API version 2.91, which is where all the active
development is. qemu builds and runs fine with that version, so use it
if it's available.

Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: b4f0375647f7b368d3dbd3834aee58cb0253566a.1462557436.git.crobinso@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:40 +02:00
Cole Robinson
d6a6dba359 configure: report SDL version
Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: 98e4a3b98dc824bfaff96db43b172272c780c15f.1462557436.git.crobinso@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:40 +02:00
Cole Robinson
f2a4e54828 configure: report GTK version
Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: 4c464e20d69fdcf21927ceed31a8d749b4af0c49.1462557436.git.crobinso@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:40 +02:00
Cole Robinson
02d34f62fd configure: add echo_version helper
Simplifies printing library versions, depending on whether the library
was even found.

Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: 3c9ab16123e06bb4109771ef6ee8acd82d449ba0.1462557436.git.crobinso@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:40 +02:00
Cole Robinson
e07047cfd7 configure: error on unknown --with-sdlabi value
I accidentally tried --with-sdlabi="1.0", and it failed much later in
a weird way. Instead, throw an error if the value isn't in our
whitelist.

Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: 60e4822e17697d257a914df03bdb9fff4b4c0490.1462557436.git.crobinso@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:40 +02:00
Cole Robinson
ee8466d0ea configure: build SDL if only SDL2 available
Right now if SDL2 is installed but not SDL1, default configure will
entirely disable SDL. Check upfront for SDL2 using pkg-config, but
still prefer SDL1 if both versions are installed.

Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: c9e570b5964d128a3595efe3170129a3da459776.1462557436.git.crobinso@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:40 +02:00
Cole Robinson
56f289f383 ui: sdl2: Release grab before opening console window
sdl 2.0.4 currently has a bug which causes our UI shortcuts to fire
rapidly in succession:

  https://bugzilla.libsdl.org/show_bug.cgi?id=3287

It's a toss up whether ctrl+alt+f or ctrl+alt+2 will fire an
odd or even number of times, thus determining whether the action
succeeds or fails.

Opening monitor/serial windows is doubly broken, since it will often
lock the UI trying to grab the pointer:

  0x00007fffef3720a5 in SDL_Delay_REAL () at /lib64/libSDL2-2.0.so.0
  0x00007fffef3688ba in X11_SetWindowGrab () at /lib64/libSDL2-2.0.so.0
  0x00007fffef2f2da7 in SDL_SendWindowEvent () at /lib64/libSDL2-2.0.so.0
  0x00007fffef2f080b in SDL_SetKeyboardFocus () at /lib64/libSDL2-2.0.so.0
  0x00007fffef35d784 in X11_DispatchFocusIn.isra.8 () at /lib64/libSDL2-2.0.so.0
  0x00007fffef35dbce in X11_DispatchEvent () at /lib64/libSDL2-2.0.so.0
  0x00007fffef35ee4a in X11_PumpEvents () at /lib64/libSDL2-2.0.so.0
  0x00007fffef2eea6a in SDL_PumpEvents_REAL () at /lib64/libSDL2-2.0.so.0
  0x00007fffef2eeab5 in SDL_WaitEventTimeout_REAL () at /lib64/libSDL2-2.0.so.0
  0x000055555597eed0 in sdl2_poll_events (scon=0x55555876f928) at ui/sdl2.c:593

We can work around that hang by ungrabbing the pointer before launching
a new window. This roughly matches what our sdl1 code does

Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: 31c9ab6540b031f7a614c59edcecea9877685612.1462557436.git.crobinso@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:40 +02:00
Cole Robinson
4fd811a6bd ui: gtk: fix crash when terminal inner-border is NULL
The VTE terminal inner-border can be NULL. The vte-0.36 (API 2.90)
code checks for this condition too, so I assume it's not just a bug.

Fixes a crash on Fedora 24 with gtk 3.20

Signed-off-by: Cole Robinson <crobinso@redhat.com>
Message-id: 2b2e85d403e8760ea53afd735a170500d5c17716.1462557436.git.crobinso@redhat.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-11 08:02:40 +02:00
Peter Maydell
860a3b3485 Update version for v2.6.0-rc5 release
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-09 14:08:12 +01:00
Peter Maydell
53db932604 Merge remote-tracking branch 'remotes/kraxel/tags/pull-vga-20160509-1' into staging
vga security fixes (CVE-2016-3710, CVE-2016-3712)

# gpg: Signature made Mon 09 May 2016 13:39:30 BST using RSA key ID D3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"

* remotes/kraxel/tags/pull-vga-20160509-1:
  vga: make sure vga register setup for vbe stays intact (CVE-2016-3712).
  vga: update vga register setup on vbe changes
  vga: factor out vga register setup
  vga: add vbe_enabled() helper
  vga: fix banked access bounds checking (CVE-2016-3710)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-09 13:42:25 +01:00
Peter Maydell
975eb6a547 Update version for v2.6.0-rc4 release
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-02 17:27:01 +01:00
Gerd Hoffmann
1beb99f787 Revert "acpi: mark PMTIMER as unlocked"
This reverts commit 7070e085d4.

The commit message claims locking is not needed, but that appears
not to be true: the seabios ehci driver runs into timekeeping problems
with this, see
	https://bugzilla.redhat.com/show_bug.cgi?id=1322713

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 1460702609-25971-1-git-send-email-kraxel@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-02 17:19:13 +01:00
Gerd Hoffmann
fd3c136b3e vga: make sure vga register setup for vbe stays intact (CVE-2016-3712).
Call vbe_update_vgaregs() when the guest touches GFX, SEQ or CRT
registers, to make sure the vga registers will always have the
values needed by vbe mode.  This makes sure the sanity checks
applied by vbe_fixup_regs() are effective.

Without this, guests can muck with shift_control, turn on planar
vga modes or enable text mode emulation while VBE is active, making qemu
take code paths meant for CGA compatibility, but with the very
large display widths and heights settable using VBE registers.

Which is good for one or another buffer overflow.  These are not that
critical, as they are typically read overflows happening somewhere
in the display code.  So guests can DoS by crashing qemu with a
segfault, but it is probably not possible to break out of the VM.

Fixes: CVE-2016-3712
Reported-by: Zuozhi Fzz <zuozhi.fzz@alibaba-inc.com>
Reported-by: P J P <ppandit@redhat.com>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-02 16:02:59 +02:00
Gerd Hoffmann
2068192dcc vga: update vga register setup on vbe changes
Call the new vbe_update_vgaregs() function on vbe configuration
changes, to make sure vga registers are up-to-date.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-02 16:02:59 +02:00
Gerd Hoffmann
7fa5c2c5dc vga: factor out vga register setup
When enabling vbe mode qemu will set up a bunch of vga registers to make
sure the vga emulation operates in the correct mode for a linear
framebuffer.  Move that code to a separate function so we can call it
from other places too.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-02 16:02:59 +02:00
Gerd Hoffmann
bfa0f151a5 vga: add vbe_enabled() helper
Makes code a bit easier to read.
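
The helper itself is a one-liner (sketch; register and macro names as used
elsewhere in the vga code):

    static bool vbe_enabled(VGACommonState *s)
    {
        return s->vbe_regs[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED;
    }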

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-02 16:02:59 +02:00
Gerd Hoffmann
3bf1817079 vga: fix banked access bounds checking (CVE-2016-3710)
vga allows banked access to video memory using the window at 0xa0000,
and it supports different access modes with different address
calculations.

The VBE bochs extensions support banked access too, using the
VBE_DISPI_INDEX_BANK register.  The code tries to take the different
address calculations into account and applies different limits to
VBE_DISPI_INDEX_BANK depending on the current access mode.

Which is probably effective in stopping misprogramming by accident.
But from a security point of view completely useless as an attacker
can easily change access modes after setting the bank register.

Drop the bogus check, add range checks to vga_mem_{readb,writeb}
instead.

Fixes: CVE-2016-3710
Reported-by: Qinghao Tang <luodalongde@gmail.com>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-05-02 16:02:59 +02:00
Jan Vesely
277abf15a6 configure: Check if struct fsxattr is available from linux header
Fixes build failure with --enable-xfsctl and
new linux headers (>=4.5) and older xfsprogs(<4.5):
In file included from /usr/include/xfs/xfs.h:38:0,
                 from /var/tmp/portage/app-emulation/qemu-2.5.0-r1/work/qemu-2.5.0/block/raw-posix.c:97:
/usr/include/xfs/xfs_fs.h:42:8: error: redefinition of ‘struct fsxattr’
 struct fsxattr {
        ^
In file included from /var/tmp/portage/app-emulation/qemu-2.5.0-r1/work/qemu-2.5.0/block/raw-posix.c:60:0:
/usr/include/linux/fs.h:155:8: note: originally defined here
 struct fsxattr {

This is really a bug in the system headers, but we can work around it
by defining HAVE_FSXATTR in the QEMU headers if linux/fs.h provides
the struct, so that xfs_fs.h doesn't try to define it as well.

CC: qemu-trivial@nongnu.org
CC: Markus Armbruster <armbru@redhat.com>
CC: Peter Maydell <peter.maydell@linaro.org>
CC: Stefan Weil <sw@weilnetz.de>
Tested-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Jan Vesely <jano.vesely@gmail.com>
[PMM: adjusted commit message, comments]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-02 13:04:26 +01:00
Peter Maydell
20b0f5fef6 Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
acpi: last minute fix for 2.6

Minor, obvious fix only affecting BE hosts.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# gpg: Signature made Sun 01 May 2016 13:43:28 BST using RSA key ID D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg:                 aka "Michael S. Tsirkin <mst@redhat.com>"

* remotes/mst/tags/for_upstream:
  acpi: fix bios linker loadder COMMAND_ALLOCATE on bigendian host

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-05-01 22:52:47 +01:00
Igor Mammedov
1dbfd7892b acpi: fix bios linker loadder COMMAND_ALLOCATE on bigendian host
'make check' fails with:

ERROR:tests/bios-tables-test.c:493:load_expected_aml:
   assertion failed: (g_file_test(aml_file, G_FILE_TEST_EXISTS))

since commit:
caf50c7166
tests: pc: acpi: drop not needed 'expected SSDT' blobs

The assert happens because qemu-system-x86_64 generates an
SSDT table and the test looks for a corresponding expected
table to compare with.

However, there is no expected SSDT blob anymore, since
QEMU shouldn't generate one.  As it happens, the BIOS is not
able to read the ACPI tables from QEMU and falls back to the
embedded legacy ACPI codepath, which generates an SSDT.
That happens due to a wrongly sized endianness conversion,
which makes the field
 uint8_t BiosLinkerLoaderEntry.alloc.zone
end up as 0, because the 32 bit value (1 or 2 on the host)
gets truncated.

Fix it by dropping the invalid cpu_to_le32(), as uint8_t
doesn't require any conversion.
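
In other words, the change is of this shape (sketch; 'entry' stands for the
BiosLinkerLoaderEntry being filled in, 'zone' for the value stored into the
field named above):

    /* before: byte-swap a 32 bit value, then truncate it into a uint8_t,
     * which yields 0 on big-endian hosts */
    entry.alloc.zone = cpu_to_le32(zone);

    /* after: a single byte needs no endianness conversion */
    entry.alloc.zone = zone;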

RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1330174

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
2016-05-01 15:42:13 +03:00
Peter Maydell
47dac82d8b Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
vvfat fixes for 2.6.0-rc4

# gpg: Signature made Fri 29 Apr 2016 10:52:13 BST using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"

* remotes/kevin/tags/for-upstream:
  vvfat: Fix default volume label
  vvfat: Fix volume name assertion

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-29 12:12:33 +01:00
Peter Maydell
849880978e Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2016-04-29' into staging
QAPI patches for 2016-04-29

# gpg: Signature made Fri 29 Apr 2016 10:13:08 BST using RSA key ID EB918653
# gpg: Good signature from "Markus Armbruster <armbru@redhat.com>"
# gpg:                 aka "Markus Armbruster <armbru@pond.sub.org>"

* remotes/armbru/tags/pull-qapi-2016-04-29:
  qapi: Don't pass NULL to printf in string input visitor

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-29 11:26:10 +01:00
Kevin Wolf
d208c50d9d vvfat: Fix default volume label
Commit d5941dd documented that it leaves the default volume name as it
was ("QEMU VVFAT"), but it doesn't actually implement this. You get an
empty name (eleven space characters) instead.

This fixes the implementation to apply the advertised default.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-04-29 11:14:13 +02:00
Kevin Wolf
ebb72c9f06 vvfat: Fix volume name assertion
Commit d5941dd made the volume name configurable, but it didn't consider
that the rw code compares the volume name string to assert that the
first directory entry is the volume name. This made vvfat crash in rw
mode.

This fixes the assertion to compare with the configured volume name
instead of a literal string.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2016-04-29 11:14:08 +02:00
Eric Blake
0a40bdab0d qapi: Don't pass NULL to printf in string input visitor
Make sure the error message for visit_type_uint64() gracefully
handles a NULL 'name' when called from the top level or a list
context, as not all the world behaves like glibc in allowing
NULL through a printf-family %s.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1461879932-9020-21-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-04-29 11:11:36 +02:00
Samuel Thibault
0d48dfedc5 slirp: fix guest network access with darwin host
On Darwin, connect, sendto and friends want the exact size of the sockaddr,
not more (and in particular, not sizeof(struct sockaddr_storage)).

This commit adds a sockaddr_size helper to be used when passing a sockaddr
size to such functions, and makes use of it in the sendto and connect calls.
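
The helper is along these lines (sketch; the real one lives in the slirp
code):

    static socklen_t sockaddr_size(const struct sockaddr_storage *a)
    {
        switch (a->ss_family) {
        case AF_INET:
            return sizeof(struct sockaddr_in);
        case AF_INET6:
            return sizeof(struct sockaddr_in6);
        default:
            g_assert_not_reached();
        }
    }

and is then used as, e.g., connect(fd, (struct sockaddr *)&addr, sockaddr_size(&addr)).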

Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Reviewed-by: John Arbuckle <programmingkidx@gmail.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-28 18:12:08 +01:00
Peter Maydell
8c4bf97580 Merge remote-tracking branch 'remotes/lalrae/tags/mips-20160428' into staging
MIPS patches 2016-04-28

Changes:
* fixed RDHWR exception host PC

# gpg: Signature made Thu 28 Apr 2016 10:11:18 BST using RSA key ID 0B29DA6B
# gpg: Good signature from "Leon Alrae <leon.alrae@imgtec.com>"

* remotes/lalrae/tags/mips-20160428:
  target-mips: Fix RDHWR exception host PC

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-28 11:48:12 +01:00
Peter Maydell
736f85d5db Merge remote-tracking branch 'remotes/armbru/tags/pull-error-2016-04-28' into staging
Fix dangling pointers and error message regressions

# gpg: Signature made Thu 28 Apr 2016 07:25:51 BST using RSA key ID EB918653
# gpg: Good signature from "Markus Armbruster <armbru@redhat.com>"
# gpg:                 aka "Markus Armbruster <armbru@pond.sub.org>"

* remotes/armbru/tags/pull-error-2016-04-28:
  qom: -object error messages lost location, restore it
  replay: Fix dangling location bug in replay_configure()
  QemuOpts: Fix qemu_opts_foreach() dangling location regression

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-28 11:05:37 +01:00
Peter Maydell
61861eff69 Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.6-20160426' into staging
ppc patch queue for 2016-04-26 (last minute qemu-2.6 fix)

This just has one, last-minute, fix for a serious regression of memory
hotplug.

Patch author's comment:
    Really sorry for the way last-minute fix, but without this memory
    hotplug is totally broken :( Hoping to get this in for Wednesday's
    RC4, which I think will be the final before release.

# gpg: Signature made Tue 26 Apr 2016 03:52:20 BST using RSA key ID 20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg:                 aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg:                 aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E  87DC 6C38 CACA 20D9 B392

* remotes/dgibson/tags/ppc-for-2.6-20160426:
  spapr_drc: fix aborts during DRC-count based hotplug

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-28 10:25:26 +01:00
James Hogan
d96391c1ff target-mips: Fix RDHWR exception host PC
Commit b00c72180c ("target-mips: add PC, XNP reg numbers to RDHWR")
changed the rdhwr helpers to use check_hwrena() to check the register
being accessed is enabled in CP0_HWREna when used from user mode. If
that check fails an EXCP_RI exception is raised at the host PC
calculated with GETPC().

However check_hwrena() may not be fully inlined as the
do_raise_exception() part of it is common regardless of the arguments.
This causes GETPC() to calculate the address in the call in the helper
instead of the generated code calling the helper. No TB will be found
and the EPC reported with the resulting guest RI exception points to the
beginning of the TB instead of the RDHWR instruction.

We can't reliably force check_hwrena() to be inlined, and converting it
to a macro would be ugly, so instead pass the host PC in as an argument,
with each rdhwr helper passing GETPC(). This should avoid any dependence
on compiler behaviour, and in practice seems to ensure the full inlining
of check_hwrena() on x86_64.
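
The shape of the fix is roughly (sketch; names follow the description above,
the actual counter read is elided):

    static void check_hwrena(CPUMIPSState *env, int reg, uintptr_t pc)
    {
        if ((env->hflags & MIPS_HFLAG_CP0) ||
            (env->CP0_HWREna & (1 << reg))) {
            return;
        }
        /* raise RI at the caller's host PC, not at ours */
        do_raise_exception(env, EXCP_RI, pc);
    }

    target_ulong helper_rdhwr_cc(CPUMIPSState *env)
    {
        check_hwrena(env, 2, GETPC());  /* each helper passes its own GETPC() */
        /* ... read and return the cycle counter ... */
    }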

This issue causes failures when running a MIPS KVM (trap & emulate)
guest in a MIPS QEMU TCG guest, as the inner guest kernel will do a
RDHWR of counter, which is disabled in the outer guest's CP0_HWREna by
KVM so it can emulate the inner guest's counter. The emulation fails and
the RI exception is passed to the inner guest.

Fixes: b00c72180c ("target-mips: add PC, XNP reg numbers to RDHWR")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Leon Alrae <leon.alrae@imgtec.com>
Cc: Yongbok Kim <yongbok.kim@imgtec.com>
Cc: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Leon Alrae <leon.alrae@imgtec.com>
Signed-off-by: Leon Alrae <leon.alrae@imgtec.com>
2016-04-28 10:03:24 +01:00
Markus Armbruster
51b9b478cc qom: -object error messages lost location, restore it
qemu_opts_foreach() runs its callback with the error location set to
the option's location.  Any errors the callback reports use the
option's location automatically.

Commit 90998d5 moved the actual error reporting from "inside"
qemu_opts_foreach() to after it.  Here's a typical hunk:

	 if (qemu_opts_foreach(qemu_find_opts("object"),
    -                          object_create,
    -                          object_create_initial, NULL)) {
    +                          user_creatable_add_opts_foreach,
    +                          object_create_initial, &err)) {
    +        error_report_err(err);
	     exit(1);
	 }

Before, object_create() reports from within qemu_opts_foreach(), using
the option's location.  Afterwards, we do it after
qemu_opts_foreach(), using whatever location happens to be current
there.  Commonly a "none" location.

This is because Error objects don't have location information.
Problematic.

Reproducer:

    $ qemu-system-x86_64 -nodefaults -display none -object secret,id=foo,foo=bar
    qemu-system-x86_64: Property '.foo' not found

Note no location.  This commit restores it:

    qemu-system-x86_64: -object secret,id=foo,foo=bar: Property '.foo' not found

Note that the qemu_opts_foreach() bug just fixed could mask the bug
here: if the location it leaves dangling hasn't been clobbered, yet,
it's the correct one.

Reported-by: Eric Blake <eblake@redhat.com>
Cc: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1461767349-15329-4-git-send-email-armbru@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[Paragraph on Error added to commit message]
2016-04-28 08:19:36 +02:00
Markus Armbruster
d9d3aaea0b replay: Fix dangling location bug in replay_configure()
replay_configure() pushes and pops a Location with automatic storage
duration.  Except it fails to pop when -icount parameter "rr" isn't
given.  cur_loc then points to unused stack space, and will most
likely get clobbered in short order.

Clobbered cur_loc can make loc_pop() and error_print_loc() crash or
report bogus locations.

Broken in commit 890ad55.

I didn't take the time to find a reproducer.

Cc: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1461767349-15329-3-git-send-email-armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
2016-04-28 08:19:20 +02:00
Markus Armbruster
37f32349ea QemuOpts: Fix qemu_opts_foreach() dangling location regression
qemu_opts_foreach() pushes and pops a Location with automatic storage
duration.  Except it fails to pop when @func() returns non-zero.
cur_loc then points to unused stack space, and will most likely get
clobbered in short order.

Clobbered cur_loc can make loc_pop() and error_print_loc() crash or
report bogus locations.

Affects several qemu command line options as well as qemu-img,
qemu-io, qemu-nbd -object, and blkdebug's configuration file.

Broken in commit a4c7367, v2.4.0.

Reproducer:
    $ qemu-system-x86_64 -nodefaults -display none -object secret,id=foo,foo=bar

main() reports "Property '.foo' not found" like this:

    if (qemu_opts_foreach(qemu_find_opts("object"),
                          user_creatable_add_opts_foreach,
                          object_create_delayed, &err)) {
        error_report_err(err);
        exit(1);
    }

cur_loc then points to where qemu_opts_foreach()'s Location used to
be, i.e. unused stack space.  With optimization, this Location doesn't
get clobbered for me, and also happens to be the correct location.
Without optimization, it does get clobbered in a way that makes
error_report_err() report no location.
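
The fix boils down to making sure loc_pop() runs on every exit path of
qemu_opts_foreach() (sketch of the loop, not the verbatim patch):

    int qemu_opts_foreach(QemuOptsList *list, qemu_opts_loopfunc func,
                          void *opaque, Error **errp)
    {
        Location loc;
        QemuOpts *opts;
        int rc = 0;

        loc_push_none(&loc);
        QTAILQ_FOREACH(opts, &list->head, next) {
            loc_restore(&opts->loc);
            rc = func(opaque, opts, errp);
            if (rc) {
                break;          /* was: return rc, skipping loc_pop() */
            }
        }
        loc_pop(&loc);
        return rc;
    }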

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1461767349-15329-2-git-send-email-armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2016-04-28 08:18:56 +02:00
Michael Roth
df18b2db69 spapr_drc: fix aborts during DRC-count based hotplug
CPU/memory resources can be signalled en masse via
spapr_hotplug_req_add_by_count(), and doing so actually changes
the meaning of the 'drc' parameter passed to
spapr_hotplug_req_event() to be a count rather than an index.

f40eb92 added a hook in spapr_hotplug_req_event() to record when a
device had been 'signalled' to the guest, but that code assumes that
drc is always an index. In cases where it's a count, such as memory
hotplug, the DRC lookup will fail, leading to an assert.

Fix this by only explicitly setting the signalled state for cases where
we are doing PCI hotplug.

For other resources types, since we cannot selectively track whether a
resource has been signalled in cases where we signal attach as a count,
set the 'signalled' state to true immediately upon making the
resource available via drck->attach().

Reported-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: david@gibson.dropbear.id.au
Cc: qemu-ppc@nongnu.org
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-04-26 11:16:08 +10:00
Gerd Hoffmann
f419a626c7 usb/uhci: move pid check
commit "5f77e06 usb: add pid check at the first of uhci_handle_td()"
moved the pid verification to the start of the uhci_handle_td function,
to simplify the error handling (we don't have to free stuff which we
didn't allocate in the first place ...).

The problem is that now the check fires too often: it raises error IRQs
even for TDs which we are not going to process because they are not set
active.

So, let's move the check down a bit, so it is done only for active TDs,
but still before we allocate anything to process the requested
transfer.

Reported-by: Joe Clifford <joe@thunderbug.co.uk>
Tested-by: Joe Clifford <joe@thunderbug.co.uk>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-id: 1461321893-15811-1-git-send-email-kraxel@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-25 12:05:05 +01:00
Peter Maydell
3123bd8ebf Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.6-20160423' into staging
ppc patch queue for 2016-03-23

A single fix for a bug in parameter handling for the spapr PCI host
bridge.

# gpg: Signature made Sat 23 Apr 2016 07:55:29 BST using RSA key ID 20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg:                 aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg:                 aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E  87DC 6C38 CACA 20D9 B392

* remotes/dgibson/tags/ppc-for-2.6-20160423:
  hw/ppc/spapr: Fix crash when specifying bad parameters to spapr-pci-host-bridge

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-25 11:15:53 +01:00
Thomas Huth
da34fed707 hw/ppc/spapr: Fix crash when specifying bad parameters to spapr-pci-host-bridge
QEMU currently crashes when using bad parameters for the
spapr-pci-host-bridge device:

$ qemu-system-ppc64 -device spapr-pci-host-bridge,buid=0x123,liobn=0x321,mem_win_addr=0x1,io_win_addr=0x10
Segmentation fault

The problem is that spapr_tce_find_by_liobn() might return NULL, but
the code in spapr_populate_pci_dt() does not check for this condition
and then tries to dereference this NULL pointer.
Apart from that, the return value of spapr_populate_pci_dt() also
has to be checked for all PCI buses, not only for the last one, to
make sure we catch all errors.
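
The fix amounts to checking the lookup result before using it, plus
propagating the per-bus return value (sketch; spapr_tce_find_by_liobn() is
the function named above, the phb field name is an assumption):

    tcet = spapr_tce_find_by_liobn(phb->dma_liobn);
    if (!tcet) {
        error_report("spapr-pci-host-bridge: no TCE table for LIOBN 0x%x",
                     phb->dma_liobn);
        return -1;
    }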

Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-04-23 16:52:20 +10:00
Peter Maydell
53343338a6 Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Mirror block job fixes for 2.6.0-rc4

# gpg: Signature made Fri 22 Apr 2016 15:46:41 BST using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"

* remotes/kevin/tags/for-upstream:
  mirror: Workaround for unexpected iohandler events during completion
  aio-posix: Skip external nodes in aio_dispatch
  virtio: Mark host notifiers as external
  event-notifier: Add "is_external" parameter
  iohandler: Introduce iohandler_get_aio_context

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-22 16:17:12 +01:00
Fam Zheng
ab27c3b5e7 mirror: Workaround for unexpected iohandler events during completion
Commit 5a7e7a0ba moved mirror_exit to a BH handler but didn't add any
protection against new requests that could sneak in just before the
BH is dispatched. For example (assuming a code base at that commit):

        main_loop_wait # 1
          os_host_main_loop_wait
            g_main_context_dispatch
              aio_ctx_dispatch
                aio_dispatch
                  ...
                    mirror_run
                      bdrv_drain
    (a)               block_job_defer_to_main_loop
          qemu_iohandler_poll
            virtio_queue_host_notifier_read
              ...
                virtio_submit_multiwrite
    (b)           blk_aio_multiwrite

        main_loop_wait # 2
          <snip>
                aio_dispatch
                  aio_bh_poll
    (c)             mirror_exit

At (a) we know the BDS has no pending request. However, the same
main_loop_wait call is going to dispatch iohandlers (EventNotifier
events), which may lead to a new I/O from guest. So the invariant is
already broken at (c). Data loss.

Commit f3926945c8 made iohandler use the aio API.  The order of
virtio_queue_host_notifier_read and block_job_defer_to_main_loop within
a main_loop_wait becomes unpredictable, and even worse, if the host
notifier event arrives at the next main_loop_wait call, the
unpredictable order between mirror_exit and
virtio_queue_host_notifier_read is also a problem. As shown below, this
commit made the bug easier to trigger:

    - Bug case 1:

        main_loop_wait # 1
          os_host_main_loop_wait
            g_main_context_dispatch
              aio_ctx_dispatch (qemu_aio_context)
                ...
                  mirror_run
                    bdrv_drain
    (a)             block_job_defer_to_main_loop
              aio_ctx_dispatch (iohandler_ctx)
                virtio_queue_host_notifier_read
                  ...
                    virtio_submit_multiwrite
    (b)               blk_aio_multiwrite

        main_loop_wait # 2
          ...
                aio_dispatch
                  aio_bh_poll
    (c)             mirror_exit

    - Bug case 2:

        main_loop_wait # 1
          os_host_main_loop_wait
            g_main_context_dispatch
              aio_ctx_dispatch (qemu_aio_context)
                ...
                  mirror_run
                    bdrv_drain
    (a)             block_job_defer_to_main_loop

        main_loop_wait # 2
          ...
            aio_ctx_dispatch (iohandler_ctx)
              virtio_queue_host_notifier_read
                ...
                  virtio_submit_multiwrite
    (b)             blk_aio_multiwrite
              aio_dispatch
                aio_bh_poll
    (c)           mirror_exit

In both cases, (b) breaks the invariant wanted by (a) and (c).

Until then, the request loss had been silent. Later, 3f09bfbc7b added
asserts at (c) to check the invariant (in
bdrv_replace_in_backing_chain), and Max reported an assertion failure
first visible there, triggered by doing an active commit while the guest
was running bonnie++.

2.5 added bdrv_drained_begin at (a) to protect the dataplane case from
similar problems, but we never realized the main loop bug until now.

As a bandage, this patch disables iohandler's external events
temporarily together with bs->ctx.

Launchpad Bug: 1570134

Cc: qemu-stable@nongnu.org
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-22 16:44:09 +02:00
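
As a rough sketch of the bandage (placement simplified, not the literal diff;
aio_disable_external()/aio_enable_external() already exist, and
iohandler_get_aio_context() is introduced by a later patch in this series):

    /* before deferring completion to the main loop, shut out iohandler
     * (EventNotifier) events so no new guest I/O can sneak in ... */
    aio_disable_external(iohandler_get_aio_context());
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);

    /* ... and re-enable them in mirror_exit() once the switch is done */
    aio_enable_external(iohandler_get_aio_context());
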
Fam Zheng
37989ced44 aio-posix: Skip external nodes in aio_dispatch
aio_poll doesn't poll the external nodes, so this should never be true,
but aio_ctx_dispatch may get notified by events from GSource. To
make bdrv_drained_begin effective in the main loop, we should check the
is_external flag here too.

Also do the check in aio_pending so that aio_dispatch is not called
superfluously when there are no events other than external ones.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-22 16:43:59 +02:00
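
A sketch of the extra condition in the aio-posix dispatch loop (field and
helper names, including aio_node_check(), follow that era's aio-posix.c and
should be treated as assumptions rather than the exact diff):

    QLIST_FOREACH_SAFE(node, &ctx->aio_handlers, node, tmp) {
        int revents = node->pfd.revents & node->pfd.events;

        if (!node->deleted &&
            (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
            aio_node_check(ctx, node->is_external) &&  /* new: skip external
                                                          nodes while drained */
            node->io_read) {
            node->io_read(node->opaque);
        }
    }
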
Fam Zheng
14560d69e7 virtio: Mark host notifiers as external
The effect of this change is that the block layer drained section can
work, for example when a mirror job is being completed.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-22 16:43:58 +02:00
Fam Zheng
54e18d35e4 event-notifier: Add "is_external" parameter
All callers pass "false", keeping the old semantics. The Windows
implementation doesn't distinguish the flag yet. On POSIX, it is passed
down to the underlying aio context.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-22 16:43:56 +02:00
Fam Zheng
bcd82a968f iohandler: Introduce iohandler_get_aio_context
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-22 16:43:42 +02:00
Christoffer Dall
ee1e0f8e5d util: align memory allocations to 2M on AArch64
For KVM to use Transparent Huge Pages (THP) we have to ensure that the
alignment of the userspace address of the KVM memory slot and the IPA
that the guest sees for a memory region have the same offset from the 2M
huge page size boundary.

One way to achieve this is to always align the IPA region at a 2M
boundary and ensure that the mmap alignment is also at 2M.

Unfortunately, we were only doing this for __arm__, not for __aarch64__,
so add this simple condition.

This fixes a performance regression using KVM/ARM on AArch64 platforms
that showed a performance penalty of more than 50%, introduced by the
following commit:

9fac18f (oslib: allocate PROT_NONE pages on top of RAM, 2015-09-10)

We were only lucky before the above commit, because we were allocating
large regions and naturally getting a 2M alignment on those allocations
then.

Cc: qemu-stable@nongnu.org
Reported-by: Shih-Wei Li <shihwei@cs.columbia.edu>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
[PMM: wrapped long line]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-22 12:26:01 +01:00
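
The shape of the fix, sketched from util/oslib-posix.c of that era (the macro
name and the 512 * 4096 value are assumptions based on the message; simplified):

    /* a 2 MiB allocation alignment lets KVM back guest RAM with THP */
    #if defined(__linux__) && (defined(__x86_64__) || defined(__arm__) || \
                               defined(__aarch64__))
    #  define QEMU_VMALLOC_ALIGN (512 * 4096)      /* 2 MiB */
    #else
    #  define QEMU_VMALLOC_ALIGN getpagesize()
    #endif
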
Eric Blake
df7b97ff89 nbd: Don't mishandle unaligned client requests
The NBD protocol does not (yet) force any alignment constraints
on clients.  Even though qemu NBD clients always send requests
that are aligned to 512 bytes, we must be prepared for non-qemu
clients that don't care about alignment (even if it means they
are less efficient).  Our use of blk_read() and blk_write() was
silently operating on the wrong file offsets when the client
made an unaligned request, corrupting the client's data (but
as the client already has control over the file we are serving,
I don't think it is a security hole, per se, just a data
corruption bug).

Note that in the case of NBD_CMD_READ, an unaligned length could
cause us to return up to 511 bytes of uninitialized trailing
garbage from blk_try_blockalign() - hopefully nothing sensitive
from the heap's prior usage is ever leaked in that manner.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Tested-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 1461249750-31928-1-git-send-email-eblake@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-22 11:55:35 +01:00
Peter Maydell
8d0d9b9f67 Update version for v2.6.0-rc3 release
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-21 17:46:50 +01:00
Aurelien Jarno
8d8fdbae01 tcg: check for CONFIG_DEBUG_TCG instead of NDEBUG
Check for CONFIG_DEBUG_TCG instead of NDEBUG, drop now useless code.

Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Message-id: 1461228530-14852-2-git-send-email-aurelien@aurel32.net
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-21 15:43:20 +01:00
Aurelien Jarno
eabb7b91b3 tcg: use tcg_debug_assert instead of assert (fix performance regression)
The TCG code is quite performance sensitive, but at the same time can
also be quite tricky. That is why it has asserts that can be enabled with
the --enable-debug-tcg configure option.

This used to work the following way:

| #include "config.h"
|
| ...
|
| #if !defined(CONFIG_DEBUG_TCG) && !defined(NDEBUG)
| /* define it to suppress various consistency checks (faster) */
| #define NDEBUG
| #endif
|
| ...
|
| #include <assert.h>

Since commit 757e725b (tcg: Clean up includes) "config.h" has been
replaced by "qemu/osdep.h", which itself includes <assert.h>. As a
consequence the assertions are always enabled, even when using
--disable-debug-tcg, causing a performance regression, especially on
targets with many registers. For instance on qemu-system-ppc the
speed difference is about 15%.

tcg_debug_assert is controlled directly by CONFIG_DEBUG_TCG and is
already used in some places. This patch replaces all the calls to assert
with calls to tcg_debug_assert.

Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Message-id: 1461228530-14852-1-git-send-email-aurelien@aurel32.net
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-21 15:41:47 +01:00
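
For context, the helper being switched to looks roughly like this (see
tcg/tcg.h for the authoritative definition):

    #ifdef CONFIG_DEBUG_TCG
    #define tcg_debug_assert(X) do { assert(X); } while (0)
    #else
    /* without --enable-debug-tcg the check compiles away entirely */
    #define tcg_debug_assert(X) do { (void)(X); } while (0)
    #endif
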
Sylvain Garrigues
b4850e5ae9 hw/arm/boot: always clear r0 when booting kernels
The 32-bit ARM Linux kernel booting ABI requires that r0 is 0
when calling the kernel image. A bug in commit 10b8ec73e6
meant that for boards which use the write_board_setup hook (which
means "highbank", "midway", "raspi2" and "xilinx-zynq-a9") we
were incorrectly skipping the "clear r0" instruction in the
mini-bootloader. Use the right offset in the "add lr, pc, #n"
instruction so that we return from the board-setup code to the
correct place.

Signed-off-by: Sylvain Garrigues <sylvain@sylvaingarrigues.com>
[PMM: Expanded commit message]
Cc: qemu-stable@nongnu.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-21 12:10:17 +01:00
Eduardo Habkost
81d9d1867f MAINTAINERS: Avoid using K: for NUMA section
When using K: in MAINTAINERS, false positives make
get_maintainer.pl not use git history to find contributors. As
those patterns cause lots of false positives, they are causing
more harm than good, so remove them.

Reported-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-id: 1461164130-3847-1-git-send-email-ehabkost@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-21 11:40:57 +01:00
Peter Maydell
befbaf51ce Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Mirror block job fixes for 2.6.0-rc3

# gpg: Signature made Wed 20 Apr 2016 15:56:43 BST using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"

* remotes/kevin/tags/for-upstream:
  iotests: Test case for drive-mirror with unaligned image size
  iotests: Add iotests.image_size
  mirror: Don't extend the last sub-chunk
  block/mirror: Refresh stale bitmap iterator cache
  block/mirror: Revive dead yielding code

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-20 16:43:53 +01:00
Peter Maydell
fa59dd9582 Merge remote-tracking branch 'remotes/sstabellini/tags/xen-2016-04-20' into staging
Xen 2016/04/20

# gpg: Signature made Wed 20 Apr 2016 12:08:56 BST using RSA key ID 70E1AE90
# gpg: Good signature from "Stefano Stabellini <stefano.stabellini@eu.citrix.com>"

* remotes/sstabellini/tags/xen-2016-04-20:
  xenfb: use the correct condition to avoid excessive looping

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-20 16:16:55 +01:00
Fam Zheng
8ca92f3c06 iotests: Test case for drive-mirror with unaligned image size
This is the regression test for the virtual size mismatch issue between
target and source images.

[ kwolf: Added test_unaligned_with_update ]

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
2016-04-20 16:52:55 +02:00
Fam Zheng
74f69050fe iotests: Add iotests.image_size
This retrieves the virtual size of the image out of qemu-img info.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-20 16:52:55 +02:00
Fam Zheng
4150ae60eb mirror: Don't extend the last sub-chunk
The last sub-chunk is rounded up to the copy granularity in the target
image, resulting in a larger size than the source.

Add a function to clip the copied sectors to the end.

This undoes the "wrong" changes to tests/qemu-iotests/109.out in
e5b43573e2. The remaining two offset changes are okay.

[ kwolf: Use DIV_ROUND_UP to calculate nb_chunks now ]

Reported-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
2016-04-20 16:52:55 +02:00
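
A sketch of the clipping helper described above (the function name and body
are illustrative; the real change lives in block/mirror.c):

    /* never let a request extend past the end of the source device */
    static int mirror_clip_sectors(MirrorBlockJob *s,
                                   int64_t sector_num, int nb_sectors)
    {
        return MIN(nb_sectors,
                   s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
    }
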
Max Reitz
f27a274259 block/mirror: Refresh stale bitmap iterator cache
If the drive's dirty bitmap is dirtied while the mirror operation is
running, the cache of the iterator used by the mirror code may become
stale and not contain all dirty bits.

This only becomes an issue if we are looking for contiguously dirty
chunks on the drive. In that case, we can easily detect the discrepancy
and just refresh the iterator if one occurs.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-20 16:52:55 +02:00
Max Reitz
9c83625bdd block/mirror: Revive dead yielding code
mirror_iteration() is supposed to wait if the current chunk is subject
to a still in-flight mirroring operation. However, it mixed checking
this conflict situation with checking the dirty status of a chunk. A
simplification for the latter condition (the first chunk encountered is
always dirty) led to neglecting the former: We just skip the first chunk
and thus never test whether it conflicts with an in-flight operation.

To fix this, pull out the code which waits for in-flight operations on
the first chunk of the range to be mirrored to settle.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-20 16:52:55 +02:00
Peter Maydell
4113b0532d Merge remote-tracking branch 'remotes/mdroth/tags/qga-pull-2016-04-19-tag' into staging
qemu-ga patch queue for 2.6

* fixes an inadvertent change that unconditionally disabled the qemu-ga unit test
* fixes make check failures when building with --disable-guest-agent that
  were visible before the unit test was inadvertently disabled.

# gpg: Signature made Tue 19 Apr 2016 23:30:09 BST using RSA key ID F108B584
# gpg: Good signature from "Michael Roth <flukshun@gmail.com>"
# gpg:                 aka "Michael Roth <mdroth@utexas.edu>"
# gpg:                 aka "Michael Roth <mdroth@linux.vnet.ibm.com>"

* remotes/mdroth/tags/qga-pull-2016-04-19-tag:
  qemu-ga: do not run qga test when guest agent disabled

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-20 15:05:19 +01:00
Peter Maydell
fe98b18b6f Merge remote-tracking branch 'remotes/cody/tags/block-pull-request' into staging
# gpg: Signature made Tue 19 Apr 2016 17:28:01 BST using RSA key ID C0DE3057
# gpg: Good signature from "Jeffrey Cody <jcody@redhat.com>"
# gpg:                 aka "Jeffrey Cody <jeff@codyprime.org>"
# gpg:                 aka "Jeffrey Cody <codyprime@gmail.com>"

* remotes/cody/tags/block-pull-request:
  block/gluster: prevent data loss after i/o error
  block/gluster: code movement of qemu_gluster_close()
  block/gluster: return correct error value

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-20 14:42:09 +01:00
Yang Hongyang
fb91f30bb9 qemu-ga: do not run qga test when guest agent disabled
When configure with --disable-guest-agent, make check will fail with:
ERROR:tests/test-qga.c:74:fixture_setup: assertion failed (error == NULL):
 Failed to execute child process "/home/xx/qemu/qemu-ga" (No such file or
directory) (g-exec-error-quark, 8)
make: *** [check-tests/test-qga] Error 1

This check was commented out by bab47d9a75. I think that was by
mistake, because the commit message of that commit didn't mention
this change.

Signed-off-by: Yang Hongyang <hongyang.yang@easystack.cn>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Michael Roth <mdroth@linux.vnet.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Cc: qemu-stable@nongnu.org
2016-04-19 16:51:15 -05:00
Peter Maydell
1f7685fafa Update language files for QEMU 2.6.0
Update translation files (change created via 'make -C po update').

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1461059023-14470-1-git-send-email-peter.maydell@linaro.org
Reviewed-by: Stefan Weil <sw@weilnetz.de>
2016-04-19 18:41:25 +01:00
Jeff Cody
d85fa9eb87 block/gluster: prevent data loss after i/o error
Upon receiving an I/O error after an fsync, by default gluster will
dump its cache.  However, QEMU will retry the fsync, which is especially
useful when encountering errors such as ENOSPC when using the werror=stop
option.  When using caching with gluster, however, the last written data
will be lost upon encountering ENOSPC.  Using the write-behind-cache
xlator option of 'resync-failed-syncs-after-fsync' should cause gluster
to retain the cached data after a failed fsync, so that ENOSPC and other
transient errors are recoverable.

Unfortunately, we have no way of knowing if the
'resync-failed-syncs-after-fsync' xlator option is supported, so for now
close the fd and set the BDS driver to NULL upon fsync error.

Signed-off-by: Jeff Cody <jcody@redhat.com>
2016-04-19 12:24:59 -04:00
Jeff Cody
5d4343e6c2 block/gluster: code movement of qemu_gluster_close()
Move qemu_gluster_close() further up in the file, in preparation
for the next patch, to avoid a forward declaration.

Signed-off-by: Jeff Cody <jcody@redhat.com>
2016-04-19 12:24:59 -04:00
Jeff Cody
a882745356 block/gluster: return correct error value
Upon error, gluster will call the aio callback function with a
ret value of -1, with errno set to the proper error value.  If
we set the acb->ret value to the return value in the callback,
that results in every error being EPERM (i.e. 1).  Instead, set
it to the proper error result.

Reviewed-by: Niels de Vos <ndevos@redhat.com>
Signed-off-by: Jeff Cody <jcody@redhat.com>
2016-04-19 12:24:59 -04:00
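
A simplified sketch of the callback fix (the callback signature matches
libgfapi's glfs_io_cbk; the struct field names and error handling are
condensed assumptions):

    static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
    {
        GlusterAIOCB *acb = arg;

        if (!ret || ret == acb->size) {
            acb->ret = 0;          /* success */
        } else if (ret < 0) {
            acb->ret = -errno;     /* use the real error, not -1 (EPERM) */
        } else {
            acb->ret = -EIO;       /* partial read/write, should not happen */
        }
        /* completion is then scheduled back on the QEMU AioContext */
    }
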
Peter Maydell
d4dffa4a3f Merge remote-tracking branch 'remotes/armbru/tags/pull-fw_cfg-2016-04-19' into staging
fw_cfg: Adopt /opt/RFQDN convention

# gpg: Signature made Tue 19 Apr 2016 15:14:20 BST using RSA key ID EB918653
# gpg: Good signature from "Markus Armbruster <armbru@redhat.com>"
# gpg:                 aka "Markus Armbruster <armbru@pond.sub.org>"

* remotes/armbru/tags/pull-fw_cfg-2016-04-19:
  fw_cfg: Adopt /opt/RFQDN convention

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-19 15:25:20 +01:00
Markus Armbruster
63d3145aad fw_cfg: Adopt /opt/RFQDN convention
FW CFG's primary user is QEMU, which uses it to expose configuration
information (in the widest sense) to Firmware.  Thus the name FW CFG.

FW CFG can also be used by others for their own purposes.  QEMU is
merely acting as transport then.  Names starting with opt/ are
reserved for such uses.  There is no provision, however, to guide safe
sharing among different such users.

Fix that, loosely following QMP precedent: names should start with
opt/RFQDN/, where RFQDN is a reverse fully qualified domain name you
control.

Based on a more ambitious patch from Michael Tsirkin.

Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Gabriel L. Somlo <somlo@cmu.edu>
Cc: Laszlo Ersek <lersek@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Gabriel Somlo <somlo@cmu.edu>
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
2016-04-19 16:09:50 +02:00
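
For example, a blob inserted under this convention would be passed on the
command line like this (the RFQDN and file name are placeholders):

    qemu-system-x86_64 \
        -fw_cfg name=opt/org.example.app/config,file=/path/to/blob.bin
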
Peter Maydell
ef5d5641f5 Merge remote-tracking branch 'remotes/kraxel/tags/pull-usb-20160419-1' into staging
ehci: fix (s)iTD looping issue (CVE-2015-8558) in a different way.

# gpg: Signature made Tue 19 Apr 2016 07:22:22 BST using RSA key ID D3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"

* remotes/kraxel/tags/pull-usb-20160419-1:
  Revert "ehci: make idt processing more robust"
  ehci: apply limit to iTD/sidt descriptors

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-19 12:10:30 +01:00
Peter Maydell
bb97bfd901 Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.6-20160419' into staging
ppc patch queue for 2016-04-19

A single fix for a regression since 2.5.  This should be the last ppc
pull request for 2.6.

# gpg: Signature made Tue 19 Apr 2016 02:48:30 BST using RSA key ID 20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg:                 aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg:                 aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E  87DC 6C38 CACA 20D9 B392

* remotes/dgibson/tags/ppc-for-2.6-20160419:
  cuda: fix off-by-one error in SET_TIME command

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-19 11:15:32 +01:00
Michael S. Tsirkin
5eb0b194e9 cadence_uart: bounds check write offset
cadence_uart_init() initializes an I/O memory region of size 0x1000
bytes.  However, in uart_write(), the 'offset' parameter (offset within
the region) is divided by 4 and then used to index the array 'r' of size
CADENCE_UART_R_MAX, which is much smaller: (0x48/4).  If 'offset>>=2'
equals or exceeds CADENCE_UART_R_MAX, this will cause an out-of-bounds
memory write where the offset and the value are controlled by the guest.

This will corrupt QEMU memory, in most situations this causes the vm to
crash.

Fix by checking the offset against the array size.

Cc: qemu-stable@nongnu.org
Reported-by: 李强 <liqiang6-s@360.cn>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@xilinx.com>
Message-id: 20160418100735.GA517@redhat.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-19 11:13:59 +01:00
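
A sketch of the guard described above, in uart_write() (type and field names
follow hw/char/cadence_uart.c of that era and are assumptions; simplified):

    static void uart_write(void *opaque, hwaddr offset,
                           uint64_t value, unsigned size)
    {
        CadenceUARTState *s = opaque;

        offset >>= 2;
        if (offset >= CADENCE_UART_R_MAX) {
            return;               /* ignore out-of-range guest writes */
        }
        /* existing code continues to use s->r[offset] safely */
    }
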
Peter Maydell
a087cc589d Merge remote-tracking branch 'remotes/ehabkost/tags/x86-pull-request' into staging
X86 fix for 2.6.0-rc3

# gpg: Signature made Mon 18 Apr 2016 20:02:15 BST using RSA key ID 984DC5A6
# gpg: Good signature from "Eduardo Habkost <ehabkost@redhat.com>"

* remotes/ehabkost/tags/x86-pull-request:
  target-i386: Set AMD alias bits after filtering CPUID data

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-19 10:11:17 +01:00
Gerd Hoffmann
a49923d283 Revert "ehci: make idt processing more robust"
This reverts commit 156a2e4dbf.

Breaks FreeBSD.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-04-19 08:20:56 +02:00
Gerd Hoffmann
1ae3f2f178 ehci: apply limit to iTD/sidt descriptors
Commit "156a2e4 ehci: make idt processing more robust" tries to avoid a
DoS by the guest (create a circular iTD queue and let qemu ehci
emulation run in circles forever).  Unfortunately this has two problems:
First it misses the case of siTDs, and second it reportedly breaks
FreeBSD.

So let's go for a different approach: just count the number of iTDs and
siTDs we have seen per frame and apply a limit.  That should really
catch all cases now.

Reported-by: 杜少博 <dushaobo@360.cn>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
2016-04-19 08:18:27 +02:00
Aurelien Jarno
ed3d807b0a cuda: fix off-by-one error in SET_TIME command
With the new framework the cuda_cmd_set_time command directly receives
the data, without the command byte. Therefore the time is stored at
in_data[0], not at in_data[1].

This fixes the "hwclock --systohc" command in a guest.

Cc: Hervé Poussineau <hpoussin@reactos.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Hervé Poussineau <hpoussin@reactos.org>
[this fixes a regression introduced by e647317 "cuda: port SET_TIME
 command to new framework"]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-04-19 11:39:23 +10:00
Eduardo Habkost
9997cf7bda target-i386: Set AMD alias bits after filtering CPUID data
QEMU complains about -cpu host on an AMD machine:
  warning: host doesn't support requested feature: CPUID.80000001H:EDX [bit 0]
The same warning is printed for bits 0, 1, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 23 and 24.

KVM_GET_SUPPORTED_CPUID and x86_cpu_get_migratable_flags()
don't handle the AMD CPUID alias bits, making
x86_cpu_filter_features() print warnings and clear those CPUID
bits incorrectly.

To avoid hacking x86_cpu_get_migratable_flags() to handle
CPUID_EXT2_AMD_ALIASES (just like the existing hack inside
kvm_arch_get_supported_cpuid()), simply move the
CPUID_EXT2_AMD_ALIASES code in x86_cpu_realizefn() after the
x86_cpu_filter_features() call.

This will probably make the CPUID_EXT2_AMD_ALIASES hack in
kvm_arch_get_supported_cpuid() unnecessary, too. The hack will be
removed in a follow-up patch after v2.6.0.

Reported-by: Radim Krčmář <rkrcmar@redhat.com>
Tested-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2016-04-18 15:49:17 -03:00
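
The reordering described above, sketched (x86_cpu_realizefn() context
simplified; the field and macro names follow target-i386/cpu.c of that era and
are assumptions):

    /* filter against host/KVM capabilities first ... */
    x86_cpu_filter_features(cpu);

    /* ... and only then derive the AMD alias bits from CPUID[1].EDX */
    if (IS_AMD_CPU(env)) {
        env->features[FEAT_8000_0001_EDX] &= ~CPUID_EXT2_AMD_ALIASES;
        env->features[FEAT_8000_0001_EDX] |=
            (env->features[FEAT_1_EDX] & CPUID_EXT2_AMD_ALIASES);
    }
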
Peter Maydell
92b674b62a Merge remote-tracking branch 'remotes/afaerber/tags/qom-cpu-for-peter' into staging
QOM CPUState and X86CPU

* MAINTAINERS cleanup

# gpg: Signature made Mon 18 Apr 2016 17:23:16 BST using RSA key ID 3E7E013F
# gpg: Good signature from "Andreas Färber <afaerber@suse.de>"
# gpg:                 aka "Andreas Färber <afaerber@suse.com>"

* remotes/afaerber/tags/qom-cpu-for-peter:
  MAINTAINERS: Drop target-i386 from CPU subsystem

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-18 17:42:59 +01:00
Andreas Färber
2e4cad2833 MAINTAINERS: Drop target-i386 from CPU subsystem
X86CPU QOM type is in good hands and actively maintained these days, so
drop it from the generic QOM CPU subsystem.

Some refactorings and design questions will still intersect, but review
and discussions of individual series can still take place while opting out
of general X86CPU patch review.

Acked-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Andreas Färber <afaerber@suse.de>
2016-04-18 18:14:52 +02:00
Peter Maydell
6a6fa68ae2 Merge remote-tracking branch 'remotes/mcayland/tags/qemu-openbios-signed' into staging
Update OpenBIOS images

# gpg: Signature made Mon 18 Apr 2016 09:39:31 BST using RSA key ID AE0F321F
# gpg: Good signature from "Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>"

* remotes/mcayland/tags/qemu-openbios-signed:
  Update OpenBIOS images

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-18 11:55:10 +01:00
Peter Maydell
ba3899507a Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.6-20160418' into staging
ppc patch queue for 2016-04-18

Three bugfix patches for 2.6 here.
* Two for bad implementations of some of the string load/store
  instructions

* One for bad migration of the XER register.  This is a regression
  from 2.5, caused by a change in the way we represent the XER during
  runtime.

# gpg: Signature made Mon 18 Apr 2016 06:17:03 BST using RSA key ID 20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg:                 aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg:                 aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E  87DC 6C38 CACA 20D9 B392

* remotes/dgibson/tags/ppc-for-2.6-20160418:
  ppc: Fix migration of the XER register
  ppc: Fix the bad exception NIP value and the range check in LSWX
  ppc: Fix the range check in the LSWI instruction

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-18 11:11:45 +01:00
Peter Maydell
adde0204e4 Merge remote-tracking branch 'remotes/otubo/tags/pull-seccomp-20160416' into staging
seccomp branch queue

# gpg: Signature made Sat 16 Apr 2016 19:58:46 BST using RSA key ID 12F8BD2F
# gpg: Good signature from "Eduardo Otubo (Software Engineer @ ProfitBricks) <eduardo.otubo@profitbricks.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 1C96 46B6 E1D1 C38A F2EC  3FDE FD0C FF5B 12F8 BD2F

* remotes/otubo/tags/pull-seccomp-20160416:
  seccomp: adding sysinfo system call to whitelist
  seccomp: Whitelist cacheflush since 2.2.0 not 2.2.3
  configure: Enable seccomp sandbox for MIPS

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-18 10:22:44 +01:00
Peter Maydell
c6c598ca5f Merge remote-tracking branch 'remotes/weil/tags/pull-wxx-20160415' into staging
wxx patch queue

# gpg: Signature made Fri 15 Apr 2016 18:36:41 BST using RSA key ID 677450AD
# gpg: Good signature from "Stefan Weil <sw@weilnetz.de>"
# gpg:                 aka "Stefan Weil <stefan.weil@weilnetz.de>"
# gpg:                 aka "Stefan Weil <stefan.weil@bib.uni-mannheim.de>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 4923 6FEA 75C9 5D69 8EC2  B78A E08C 21D5 6774 50AD

* remotes/weil/tags/pull-wxx-20160415:
  wxx: Fix broken TCP networking (regression)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-18 09:55:16 +01:00
Mark Cave-Ayland
afc474863f Update OpenBIOS images
Update OpenBIOS images to SVN r1395 built from submodule.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
2016-04-18 09:38:55 +01:00
Thomas Huth
aa378598fe ppc: Fix migration of the XER register
env->xer only holds the lower bits of the XER register nowadays, the
SO, OV and CA bits are stored in separate variables (see the function
cpu_write_xer() for details). Since the migration code currently only
reads the "xer" variable, the upper bits are lost during migration.
Fix it by using cpu_read_xer() instead.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-04-18 15:14:38 +10:00
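
The essence of the fix, sketched (the exact location is target-ppc/machine.c;
variable handling simplified):

    /* saving: read the full register, not just env->xer's lower bits */
    target_ulong saved_xer = cpu_read_xer(env);   /* includes SO, OV, CA */

    /* loading: put the bits back into their separate fields */
    cpu_write_xer(env, saved_xer);
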
Thomas Huth
537d3e8e6b ppc: Fix the bad exception NIP value and the range check in LSWX
The range checks in the LSWX instruction are completely insufficient:
They do not take the wrap-around case into account, and the check
"reg < rx" should be "reg <= rx" instead. Fix it by using the new
lsw_reg_in_range() helper function that is already used for LSWI, too.

Then there is a second problem: In case the INVAL exception is generated,
the NIP value is wrong, it currently points to the instruction before
the LSWX instruction. This is because gen_lswx() already decreases the
NIP value by 4 (to be prepared for page fault exceptions), and
powerpc_excp() later decreases it again by 4 while handling the program
exception. So to get this right, we've got to undo the "- 4" from
gen_lswx() here before calling helper_raise_exception_err().

Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-04-18 15:14:38 +10:00
Thomas Huth
afbee7128c ppc: Fix the range check in the LSWI instruction
There are two issues: First, the number of registers that are used has
to be calculated with "(nb + 3) / 4" (i.e. round always up, not down).
Second, the "start <= ra && (start + nr - 32) > ra" condition for the
wrap-around case is wrong: It has to be tested with "||" instead of "&&".
Since we can reuse this check later for the LSWX instruction, let's
place the fixed code into a helper function, too.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-04-18 15:14:38 +10:00
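
A sketch of the helper described above, matching the wording of the message
(the real one is in target-ppc/translate.c; treat the exact form as an
assumption):

    /* true if register rx falls inside the nregs-long range starting at
     * start, taking the wrap-around past r31 back to r0 into account */
    static bool lsw_reg_in_range(int start, int nregs, int rx)
    {
        return (start + nregs <= 32 && rx >= start && rx < start + nregs) ||
               (start + nregs > 32 && (rx >= start || rx < start + nregs - 32));
    }

    /* callers round the byte count nb up to whole registers: */
    int nregs = (nb + 3) / 4;
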
Miroslav Rezanina
8e08f8a4a7 seccomp: adding sysinfo system call to whitelist
Newer versions of the nss-softokn libraries (> 3.16.2.3) use the sysinfo
call, so qemu using an rbd image hangs after start when run in sandbox mode.

To allow using rbd images in sandbox mode we have to whitelist it.

Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
Acked-by: Eduardo Otubo <eduardo.otubo@profitbricks.com>
2016-04-16 20:27:44 +02:00
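
For reference, whitelisting a syscall in QEMU's seccomp code boils down to
adding a table entry that becomes a libseccomp ALLOW rule roughly like this
(error handling is illustrative):

    /* each whitelisted syscall becomes an ALLOW rule in the filter */
    if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(sysinfo), 0) < 0) {
        /* handle the error, e.g. refuse to enable the sandbox */
    }
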
James Hogan
81bed73b53 seccomp: Whitelist cacheflush since 2.2.0 not 2.2.3
The cacheflush system call (found on MIPS and ARM) has been included in
the libseccomp header since 2.2.0, so include it back to that version.
Previously it was only enabled since 2.2.3 since that is when it was
enabled properly for ARM.

This will allow seccomp support to be enabled for MIPS back to
libseccomp 2.2.0.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Reviewed-By: Andrew Jones <drjones@redhat.com>
Acked-by: Eduardo Otubo <eduardo.otubo@profitbricks.com>
2016-04-16 20:27:41 +02:00
James Hogan
5ce4397281 configure: Enable seccomp sandbox for MIPS
Enable seccomp on MIPS since libseccomp version 2.2.0 when MIPS support
was first added.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Acked-by: Eduardo Otubo <eduardo.otubo@profitbricks.com>
2016-04-16 20:27:37 +02:00
Stefan Weil
3424c8a9c8 wxx: Fix broken TCP networking (regression)
It has been broken since commit c619644067.

Reported-by: Michael Fritscher <michael@fritscher.net>
Tested-by: Michael Fritscher <michael@fritscher.net>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2016-04-15 19:35:17 +02:00
Peter Maydell
072035eba1 Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches for 2.6.0-rc3

# gpg: Signature made Fri 15 Apr 2016 17:02:23 BST using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"

* remotes/kevin/tags/for-upstream:
  nbd: Don't kill server on client that doesn't request TLS
  nbd: fix assert() on qemu-nbd stop
  nbd: Don't fail handshake on NBD_OPT_LIST descriptions
  qemu-iotests: 041: More robust assertion on quorum node
  qemu-iotests: place valgrind log file in scratch dir
  qemu-iotests: tests: do not set unused tmp variable
  qemu-iotests: common.rc: drop unused _do()
  qemu-iotests: drop unused _within_tolerance() filter
  Fix pflash migration
  block: Don't ignore flags in blk_{,co,aio}_write_zeroes()
  block/vpc: update comments to be compliant w/coding guidelines
  block/vpc: set errp in vpc_open
  block/vpc: make checks on max table size a bit more lax
  block/vpc: Use the correct max sector count for VHD images
  block/vpc: use current_size field for XenConverter VHD images
  vpc: use current_size field for XenServer VHD images
  block/vpc: set errp in vpc_create
  block: Fix blk_aio_write_zeroes()
  qemu-io: Support 'aio_write -z'

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-15 18:26:49 +01:00
Peter Maydell
c7b45f1282 Merge remote-tracking branch 'remotes/armbru/tags/pull-backends-2016-04-15' into staging
hostmem-file: plug a small leak

# gpg: Signature made Fri 15 Apr 2016 17:30:42 BST using RSA key ID EB918653
# gpg: Good signature from "Markus Armbruster <armbru@redhat.com>"
# gpg:                 aka "Markus Armbruster <armbru@pond.sub.org>"

* remotes/armbru/tags/pull-backends-2016-04-15:
  hostmem-file: plug a small leak

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-15 17:43:34 +01:00
Kevin Wolf
cdc8845331 Merge remote-tracking branch 'mreitz/tags/pull-block-for-kevin-2016-04-15' into queue-block
Block patches for 2.6.0-rc3.

# gpg: Signature made Fri Apr 15 17:57:30 2016 CEST using RSA key ID E838ACAD
# gpg: Good signature from "Max Reitz <mreitz@redhat.com>"

* mreitz/tags/pull-block-for-kevin-2016-04-15:
  nbd: Don't kill server on client that doesn't request TLS
  nbd: fix assert() on qemu-nbd stop
  nbd: Don't fail handshake on NBD_OPT_LIST descriptions
  qemu-iotests: 041: More robust assertion on quorum node
  qemu-iotests: place valgrind log file in scratch dir
  qemu-iotests: tests: do not set unused tmp variable
  qemu-iotests: common.rc: drop unused _do()
  qemu-iotests: drop unused _within_tolerance() filter

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-15 17:59:42 +02:00
Eric Blake
d1129a8ad9 nbd: Don't kill server on client that doesn't request TLS
Upstream NBD documents (as of commit 4feebc95) that servers MAY
choose to operate in a conditional mode, where it is up to the
client whether to use TLS.  For qemu's case, we want to always be
in FORCEDTLS mode, because of the risk of man-in-the-middle
attacks, and since we never export more than one device; likewise,
the qemu client will ALWAYS send NBD_OPT_STARTTLS as its first
option.  But now that SELECTIVETLS servers exist, it is feasible
to encounter a (non-qemu) client that is programmed to talk to
such a server, and does not do NBD_OPT_STARTTLS first, but rather
wants to probe if it can use a non-encrypted export.

The NBD protocol documents that we should let such a client
continue trying, on the grounds that maybe the client will get the
hint to send NBD_OPT_STARTTLS, rather than immediately dropping
the connection.

Note that NBD_OPT_EXPORT_NAME is a special case: since it is the
only option request that can't have an error return, we have to
(continue to) drop the connection on that one; rather, what we are
fixing here is that all other replies prior to TLS initiation tell
the client NBD_REP_ERR_TLS_REQD, but keep the connection alive.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 1460671343-18485-1-git-send-email-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:56:56 +02:00
Pavel Butsykin
23994a5f52 nbd: fix assert() on qemu-nbd stop
From time to time qemu-nbd is crashing on the following assert:
    assert(state == TERMINATING);
    nbd_export_closed
    nbd_export_put
    main
and the state at the moment of the crash evaluates to TERMINATE.

During the client shutdown process the nbd_client_thread thread sends a
SIGTERM signal and the main thread calls the nbd_client_closed callback.
If the SIGTERM callback is executed after the state has been changed to
TERMINATING, the state will once again be TERMINATE.

To solve the issue, we must change the state to TERMINATE only if the
state is RUNNING. Otherwise we are shutting down already.

Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1460629215-11567-1-git-send-email-den@openvz.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:56:56 +02:00
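
The guarded transition, sketched with the state names used in the message:

    /* only a running server should be moved to TERMINATE here;
     * if we are already TERMINATING/TERMINATED, leave the state alone */
    if (state == RUNNING) {
        state = TERMINATE;
    }
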
Eric Blake
200650d49f nbd: Don't fail handshake on NBD_OPT_LIST descriptions
The NBD Protocol states that NBD_REP_SERVER may set
'length > sizeof(namelen) + namelen'; in which case the rest
of the packet is a UTF-8 description of the export.  While we
don't know of any NBD servers that send this description yet,
we had better consume the data so we don't choke when we start
to talk to such a server.

Also, a (buggy/malicious) server that replies with length <
sizeof(namelen) would cause us to block waiting for bytes that
the server is not sending, and one that replies with super-huge
lengths could cause us to temporarily allocate up to 4G memory.
Sanity check things before blindly reading incorrectly.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 1460077777-31004-1-git-send-email-eblake@redhat.com
Reviewed-by: Alex Bligh <alex@alex.org.uk>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:56:56 +02:00
Fam Zheng
e71fc0bae7 qemu-iotests: 041: More robust assertion on quorum node
Block nodes are now assigned names automatically, therefore the test
case is fragile in using fixed indices in result. Introduce a method in
iotests.py and do the matching more sensibly.

Signed-off-by: Fam Zheng <famz@redhat.com>
Message-id: 1460518995-1338-1-git-send-email-famz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:56:56 +02:00
Sascha Silbe
5f1525a685 qemu-iotests: place valgrind log file in scratch dir
Do not place the valgrind log file at a predictable path in a
world-writable location. Use the common scratch directory (${TEST_DIR})
instead.

Signed-off-by: Sascha Silbe <silbe@linux.vnet.ibm.com>
Reviewed-by: Bo Tu <tubo@linux.vnet.ibm.com>
Message-id: 1460472980-26319-5-git-send-email-silbe@linux.vnet.ibm.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:56:56 +02:00
Sascha Silbe
339f06a3bc qemu-iotests: tests: do not set unused tmp variable
The previous commit removed the last usage of ${tmp} inside the tests
themselves; the only remaining users are sourced by check. So we can now
drop this variable from the tests.

Signed-off-by: Sascha Silbe <silbe@linux.vnet.ibm.com>
Reviewed-by: Bo Tu <tubo@linux.vnet.ibm.com>
Message-id: 1460472980-26319-4-git-send-email-silbe@linux.vnet.ibm.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:56:56 +02:00
Sascha Silbe
6bb6f6cd9e qemu-iotests: common.rc: drop unused _do()
_do() was never used and possibly creates temporary files at
predictable, world-writable locations. Get rid of it.

Signed-off-by: Sascha Silbe <silbe@linux.vnet.ibm.com>
Reviewed-by: Bo Tu <tubo@linux.vnet.ibm.com>
Message-id: 1460472980-26319-3-git-send-email-silbe@linux.vnet.ibm.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:56:55 +02:00
Sascha Silbe
242fbc19ef qemu-iotests: drop unused _within_tolerance() filter
_within_tolerance() isn't used anymore and possibly creates temporary
files at predictable, world-writable locations. Get rid of it.

If it's needed again in the future it can be revived easily and fixed up
to use TEST_DIR and / or safely created temporary files.

Signed-off-by: Sascha Silbe <silbe@linux.vnet.ibm.com>
Reviewed-by: Bo Tu <tubo@linux.vnet.ibm.com>
Message-id: 1460472980-26319-2-git-send-email-silbe@linux.vnet.ibm.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:56:55 +02:00
Marc-André Lureau
bc78a01319 hostmem-file: plug a small leak
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <1460566660-19241-1-git-send-email-marcandre.lureau@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-04-15 17:56:06 +02:00
Dr. David Alan Gilbert
90c647db8d Fix pflash migration
Pflash migration (e.g. q35 + EFI variable storage) fails
with the assert:

bdrv_co_do_pwritev: Assertion `!(bs->open_flags & 0x0800)' failed.

This avoids the problem by delaying the pflash update until after
the device loads complete.

Tested by:
  Migrating Q35/EFI vm.
  Changing efi variable content (with efiboot in the guest)
  md5sum'ing the variable file before migration and after.

This is a fix that Paolo posted in the message
  570244B3.4070105@redhat.com

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Acked-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-15 17:27:34 +02:00
Kevin Wolf
16aaf975ee block: Don't ignore flags in blk_{,co,aio}_write_zeroes()
Commit 57d6a428 neglected to pass the given flags to blk_aio_prwv(),
which broke discard by WRITE SAME for scsi-disk (the UNMAP bit would be
ignored).

Commit fc1453cd introduced the same bug for blk_write_zeroes(). This is
used for 'qemu-img convert' without has_zero_init (e.g. on a block
device) and for preallocation=falloc in parallels.

Commit 8896e088 is the version for blk_co_write_zeroes(). This function
is only used in qemu-io.

Reported-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
2016-04-15 17:22:12 +02:00
Jeff Cody
9c057d0b68 block/vpc: update comments to be compliant w/coding guidelines
Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-15 17:22:12 +02:00
Jeff Cody
32f6439cf7 block/vpc: set errp in vpc_open
Add more useful error information to failure paths in vpc_open

Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-15 17:22:12 +02:00
Jeff Cody
66176fc6a7 block/vpc: make checks on max table size a bit more lax
The check on the max_table_size field not being larger than required is
valid, and in accordance with the VHD spec.  However, there have been
VHD images encountered in the wild that have an out-of-spec max table
size that is technically too large.

There is no issue in allowing this larger table size, as we also
later verify that the computed size (used for the pagetable) is
large enough to fit all sectors.  In addition, max_table_entries
is bounds checked against SIZE_MAX and INT_MAX.

Remove the strict check, so that we can accommodate these sorts of
images that are benignly out of spec.

Reported-by: Stefan Hajnoczi <stefanha@redhat.com>
Reported-by: Grant Wu <grantwwu@gmail.com>
Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-15 17:22:12 +02:00
Jeff Cody
c23fb11bbb block/vpc: Use the correct max sector count for VHD images
The old VHD_MAX_SECTORS value is incorrect, and is a throwback
to the CHS calculations.  The VHD specification allows images up to 2040
GiB, which (using 512 byte sectors) corresponds to a maximum number of
sectors of 0xff000000, rather than the old value of 0xfe0001ff.

Update VHD_MAX_SECTORS to reflect the correct value.

Also, update comment references to the actual size limit, and correct
one compare so that we can have sizes up to the limit.

Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-15 17:22:12 +02:00
Jeff Cody
bab246db1d block/vpc: use current_size field for XenConverter VHD images
XenConverter VHD images are another VHD image where current_size is
different from the CHS values in the format header.  Use
current_size as the default, by looking at the creator_app signature
field.

Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-15 17:22:12 +02:00
Stefan Hajnoczi
9bdfb9e8ac vpc: use current_size field for XenServer VHD images
The vpc driver has two methods of determining virtual disk size.  The
correct one to use depends on the software that generated the image
file.  Add the XenServer creator_app signature so that image size is
correctly detected for those images.

Reported-by: Grant Wu <grantwwu@gmail.com>
Reported-by: Spencer Baugh <sbaugh@catern.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-15 17:22:12 +02:00
Jeff Cody
0211b9becc block/vpc: set errp in vpc_create
Add more useful error information to failure paths in vpc_create().

Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-04-15 17:22:11 +02:00
Kevin Wolf
7fa84cd8d4 block: Fix blk_aio_write_zeroes()
Commit 57d6a428 broke blk_aio_write_zeroes() because some write
functions in the call path don't have an explicit length argument but
reuse qiov->size instead. Which is great, except that write_zeroes
doesn't have a qiov, which this commit interprets as 0 bytes.
Consequently, blk_aio_write_zeroes() didn't effectively do anything.

This patch introduces an explicit acb->bytes in BlkAioEmAIOCB and uses
that instead of acb->rwco.size.

The synchronous version of the function is okay because it does pass a
qiov (with the right size and a NULL pointer as its base).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:22:11 +02:00
Kevin Wolf
5ceb77652e qemu-io: Support 'aio_write -z'
This allows testing blk_aio_write_zeroes().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
2016-04-15 17:22:11 +02:00
Peter Maydell
538a467329 Merge remote-tracking branch 'remotes/mcayland/tags/qemu-sparc-signed' into staging
qemu-sparc update

# gpg: Signature made Fri 15 Apr 2016 09:30:58 BST using RSA key ID AE0F321F
# gpg: Good signature from "Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>"

* remotes/mcayland/tags/qemu-sparc-signed:
  target-sparc: fix Trap Based Address Register behavior for sparc64
  target-sparc: fix Nucleus quad LDD 128 bit access for windowed registers

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-15 10:49:04 +01:00
Artyom Tarasenko
de5f107744 target-sparc: fix Trap Based Address Register behavior for sparc64
According to chapter 7.6 Trap Processing of the SPARC Architecture Manual v9,
the Trap Based Address Register is not modified as a trap is taken.

This fix allows booting FreeBSD-10.3-RELEASE-sparc64.

Signed-off-by: Artyom Tarasenko <atar4qemu@gmail.com>
Reviewed-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
2016-04-15 09:30:40 +01:00
Artyom Tarasenko
01a780d51a target-sparc: fix Nucleus quad LDD 128 bit access for windowed registers
Fix register offset calculation when regwptr is used.

Signed-off-by: Artyom Tarasenko <atar4qemu@gmail.com>
Reviewed-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
2016-04-15 09:30:39 +01:00
Peter Maydell
bc8995cafa Update version for v2.6.0-rc2 release
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-14 17:30:28 +01:00
Peter Maydell
3e7cac31d6 Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
tpm, vhost, virtio: fixes for 2.6

Minor fixes all over the place.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# gpg: Signature made Thu 14 Apr 2016 14:45:55 BST using RSA key ID D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg:                 aka "Michael S. Tsirkin <mst@redhat.com>"

* remotes/mst/tags/for_upstream:
  hw/virtio/balloon: Replace TARGET_PAGE_SIZE with BALLOON_PAGE_SIZE
  tpm: Fix write to file descriptor function
  tpm: acpi: remove IRQ from TPM's CRS to make Windows not see conflict
  pc: acpi: tpm: add missing MMIO resource to PCI0._CRS
  specs/vhost-user: spelling fix
  specs/vhost-user: improve VHOST_SET_VRING_NUM documentation

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-14 14:55:25 +01:00
Thomas Huth
01310e2aa7 hw/virtio/balloon: Replace TARGET_PAGE_SIZE with BALLOON_PAGE_SIZE
The balloon code currently calls madvise() with TARGET_PAGE_SIZE as
length parameter. Since the virtio-balloon protocol is always based
on 4k pages, no matter what the host and guest are using as page size,
this could cause problems: If TARGET_PAGE_SIZE is bigger than 4k, the
madvise call also destroys the 4k areas after the current one - which
might be wrong since the guest did not want to free those areas yet (in
case the guest used a smaller MMU page size than the hard-coded
TARGET_PAGE_SIZE). So to fix this issue, introduce a proper define
called BALLOON_PAGE_SIZE (which is 4096) and use it as the size
parameter for the madvise() call instead.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2016-04-14 16:44:42 +03:00
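
A sketch of the change (hw/virtio/virtio-balloon.c, heavily simplified;
VIRTIO_BALLOON_PFN_SHIFT is 12, i.e. 4 KiB balloon pages):

    #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)

    /* was: qemu_madvise(addr, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED); */
    qemu_madvise(addr, BALLOON_PAGE_SIZE, QEMU_MADV_DONTNEED);
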
Peter Maydell
33e5702889 Merge remote-tracking branch 'remotes/kraxel/tags/pull-input-20160413-1' into staging
virtio-input; live migration support, various bugfixes.

# gpg: Signature made Wed 13 Apr 2016 16:41:27 BST using RSA key ID D3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"

* remotes/kraxel/tags/pull-input-20160413-1:
  virtio-input: support absolute axis config in pass-through
  input-linux: refine mouse detection
  virtio-input: fix emulated tablet axis ranges
  virtio-input: add live migration support
  virtio-input: implement pass-through evdev writes
  virtio-input: retrieve EV_LED host config bits
  virtio-input: add missing key mappings
  move const_le{16, 32} to qemu/bswap.h, add comment
  virtio-input: add parenthesis to const_le{16, 32}

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-13 20:35:23 +01:00
Peter Maydell
8b4aaba736 Merge remote-tracking branch 'remotes/elmarco/tags/ivshmem-fix-pull-request' into staging
# gpg: Signature made Wed 13 Apr 2016 11:04:51 BST using RSA key ID 75969CE5
# gpg: Good signature from "Marc-André Lureau <marcandre.lureau@redhat.com>"
# gpg:                 aka "Marc-André Lureau <marcandre.lureau@gmail.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 87A9 BD93 3F87 C606 D276  F62D DAE8 E109 7596 9CE5

* remotes/elmarco/tags/ivshmem-fix-pull-request:
  ivshmem: fix ivshmem-{plain,doorbell} crash without arg

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-13 18:48:28 +01:00
Stefan Berger
e7658fcc4c tpm: Fix write to file descriptor function
Fix a bug introduced in commit 46f296c while moving send_all to the
tpm_passthrough code. Fix the name of the variable used in the loop.

Signed-off-by: Stefan Berger <stefanb@linux.vnet.ibm.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2016-04-13 19:52:34 +03:00
Igor Mammedov
52e38eb051 tpm: acpi: remove IRQ from TPM's CRS to make Windows not see conflict
IRQ 5 used by the TPM conflicts with PNP0C0F IRQs,
as a result Windows fails driver initialization with the reason
  'device cannot find enough free resources'
But if the TPM._CRS.IRQ entry is commented out, Windows
seems to initialize the driver without errors, as it doesn't
notice the possible conflict, and it seems to work,
probably because the link with IRQ 5 is unused/disabled.

So temporarily comment out TPM._CRS.IRQ to 'fix' the
regression in TPM, with the intent to fix it correctly
later, i.e.:
  1. pick an unused IRQ as the default one for the TPM
  2. fetch the IRQ value from the device model so that the user
     could override the default one if it conflicts with
     some other device.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2016-04-13 19:52:34 +03:00
Igor Mammedov
2b1c2e8e5f pc: acpi: tpm: add missing MMIO resource to PCI0._CRS
Windows will fail to initialize the TPM driver with the reason:
  'device cannot find enough free resources'
That happens because the parent bus doesn't describe the
MMIO resources used by the TPM child device.
Fix it by describing them in the top-most parent bus scope, PCI0.

It was regressed by commit
  5cb18b3d TPM2 ACPI table support
with the following fixup
  9e472263 acpi: add missing ssdt
which did the right thing by moving the TPM to the bus
it belongs to, but lacked a proper resource declaration.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2016-04-13 19:52:34 +03:00
Marc-André Lureau
c954f09ee5 specs/vhost-user: spelling fix
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2016-04-13 19:52:34 +03:00
Marc-André Lureau
09230cb867 specs/vhost-user: improve VHOST_SET_VRING_NUM documentation
"number of vrings" doesn't help me understand the purpose of this
message. My understanding is that it is rather the size of the queue (in
modern terms).

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2016-04-13 19:52:34 +03:00
Peter Maydell
c0bc0fa352 Merge remote-tracking branch 'remotes/jnsnow/tags/ide-pull-request' into staging
# gpg: Signature made Wed 13 Apr 2016 00:32:22 BST using RSA key ID AAFC390E
# gpg: Good signature from "John Snow (John Huston) <jsnow@redhat.com>"

* remotes/jnsnow/tags/ide-pull-request:
  ide: really restart pending and in-flight atapi dma
  ide: restart atapi dma by re-evaluating command packet
  ide: don't lose pending dma state
  xen: Fix IDE unplug

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-04-13 16:32:29 +01:00
Marc-André Lureau
6dc64780c2 ivshmem: fix ivshmem-{plain,doorbell} crash without arg
"qemu -device ivshmem-{plain,doorbell}" will crash, because the device
doesn't check that the required argument is provided. (screwed up in
commit 5400c02)

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
2016-04-13 12:01:47 +02:00
Pavel Butsykin
502356eeeb ide: really restart pending and in-flight atapi dma
Restart of ATAPI DMA used to be unreachable, because the request to do
so wasn't indicated in bus->error_status due to the lack of spare bits, and
ide_restart_bh() would return early doing nothing.

This patch makes use of the observation that not all bit combinations were
possible in ->error_status. In particular, IDE_RETRY_READ only made sense
together with IDE_RETRY_DMA or IDE_RETRY_PIO. This allows re-using
IDE_RETRY_READ alone as an indicator of an ATAPI DMA restart request.

To make things more uniform, ATAPI DMA gets its own value for ->dma_cmd.
To guard against confusion, macros are added to test the state of
->error_status.

The patch fixes the restart of both in-flight and pending ATAPI DMA,
following the scheme similar to that of IDE DMA.

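Schematically, the state-testing helpers could look like this; the macro
names are assumed for illustration and need not match the patch exactly:

    #define IS_IDE_RETRY_DMA(status)   ((status) & IDE_RETRY_DMA)
    #define IS_IDE_RETRY_PIO(status)   ((status) & IDE_RETRY_PIO)
    /* IDE_RETRY_READ without DMA or PIO now means "restart ATAPI DMA". */
    #define IS_IDE_RETRY_ATAPI(status) \
        (((status) & (IDE_RETRY_READ | IDE_RETRY_DMA | IDE_RETRY_PIO)) == \
         IDE_RETRY_READ)
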
[Including a fixup patch:
Message-id: 1460465594-15777-1-git-send-email-pbutsykin@virtuozzo.com
--js]

Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1459924806-306-4-git-send-email-den@openvz.org
Signed-off-by: John Snow <jsnow@redhat.com>
2016-04-12 18:48:15 -04:00
Pavel Butsykin
9a41826f38 ide: restart atapi dma by re-evaluating command packet
ide_atapi_dma_restart() used to just complete the DMA with an error,
under the assumption that there isn't enough information to restart it.

However, as the contents of ->io_buffer are preserved, it looks safe to
just re-evaluate it and dispatch the ATAPI command again.

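Conceptually, the restart path reduces to something like the sketch below;
treat it as an illustration of the idea rather than the exact code:

    static void ide_atapi_dma_restart(IDEState *s)
    {
        /* The packet command survived in s->io_buffer, so instead of
         * completing the request with an error, just evaluate it again;
         * the command handler will set the DMA transfer up once more. */
        ide_atapi_cmd(s);
    }
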
Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1459924806-306-3-git-send-email-den@openvz.org
Signed-off-by: John Snow <jsnow@redhat.com>
2016-04-12 16:47:52 -04:00
Pavel Butsykin
218fd37c68 ide: don't lose pending dma state
If the migration occurs after the IDE DMA has been set up but before it
has been initiated, the state gets lost upon save/restore. Specifically,
the ->dma_cb callback gets cleared, so when the guest eventually starts bus
mastering, the DMA never completes, causing the guest to time out the
operation.

OTOH all the infrastructure is already in place to restart the DMA if
the migration happens while the DMA is in progress.

So reuse that infrastructure by setting bus->error_status based on
->dma_cmd in pre_save if the ->dma_cb callback is already set but DMAING is
clear. This indicates the need for a restart and makes sure ->dma_cb
is restored in ide_restart_bh(); however, since DMAING is clear, the state
upon restore will be exactly "ready for DMA", as before the save.

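A conceptual sketch of the pre_save idea; the two predicate helpers are
placeholders for whatever indicates "callback armed" and "DMAING set", and
the exact flag mapping is illustrative only:

    static void idebus_pre_save_sketch(IDEBus *bus, IDEState *s)
    {
        /* dma_cb_is_armed() and dmaing_bit_set() stand in for the real
         * indicators; the point is to reuse the in-flight restart flags. */
        if (dma_cb_is_armed(s) && !dmaing_bit_set(bus)) {
            bus->error_status = IDE_RETRY_DMA |
                (s->dma_cmd == IDE_DMA_READ ? IDE_RETRY_READ : 0);
        }
    }
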
Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1459924806-306-2-git-send-email-den@openvz.org
Signed-off-by: John Snow <jsnow@redhat.com>
2016-04-12 16:47:52 -04:00
Anthony PERARD
d1fc684f36 xen: Fix IDE unplug
After commit e5e7855 (blockdev: Separate BB name management), starting a
guest with PVHVM support results in this assertion failure:
qemu-system-i386: block/block-backend.c:173: blk_delete: Assertion `!blk->name' failed.

A backtrace shows that the caller is pci_piix3_xen_ide_unplug().

This patch fixes it.

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Message-id: 1460382666-29885-1-git-send-email-anthony.perard@citrix.com
Signed-off-by: John Snow <jsnow@redhat.com>
2016-04-12 16:47:52 -04:00
Wei Liu
4df26e88ee xenfb: use the correct condition to avoid excessive looping
In commit ac0487e1 ("xenfb.c: avoid expensive loops when prod <=
out_cons"), ">=" was used. In fact, a full ring is a legit state.
Correct the test to use ">".

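Schematically, the corrected check looks as below (the ring-length macro
name is taken from the Xen fbif interface and is assumed here):

    if (prod - out_cons > XENFB_OUT_RING_LEN) {
        return;  /* inconsistent indexes: more than a full ring ahead */
    }
    /* prod - out_cons == XENFB_OUT_RING_LEN is a full ring and is valid. */
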
Reported-by: "Hao, Xudong" <xudong.hao@intel.com>
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Tested-by: "Hao, Xudong" <xudong.hao@intel.com>
Acked-by: Anthony Perard <anthony.perard@citrix.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
2016-04-12 10:16:08 -07:00
853 changed files with 14140 additions and 8530 deletions

View File

@@ -1050,7 +1050,6 @@ M: Andreas Färber <afaerber@suse.de>
S: Supported
F: qom/cpu.c
F: include/qom/cpu.h
F: target-i386/cpu.c
ICC Bus
M: Igor Mammedov <imammedo@redhat.com>
@@ -1156,8 +1155,6 @@ M: Eduardo Habkost <ehabkost@redhat.com>
S: Maintained
F: numa.c
F: include/sysemu/numa.h
K: numa|NUMA
K: srat|SRAT
T: git git://github.com/ehabkost/qemu.git numa
QAPI

View File

@@ -1 +1 @@
2.5.91
2.6.50

View File

@@ -77,7 +77,7 @@ static int accel_init_machine(AccelClass *acc, MachineState *ms)
return ret;
}
int configure_accelerator(MachineState *ms)
void configure_accelerator(MachineState *ms)
{
const char *p;
char buf[10];
@@ -128,8 +128,6 @@ int configure_accelerator(MachineState *ms)
if (init_failed) {
fprintf(stderr, "Back to %s accelerator.\n", acc->name);
}
return !accel_initialised;
}

View File

@@ -282,10 +282,12 @@ bool aio_pending(AioContext *ctx)
int revents;
revents = node->pfd.revents & node->pfd.events;
if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
aio_node_check(ctx, node->is_external)) {
return true;
}
if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
aio_node_check(ctx, node->is_external)) {
return true;
}
}
@@ -323,6 +325,7 @@ bool aio_dispatch(AioContext *ctx)
if (!node->deleted &&
(revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
aio_node_check(ctx, node->is_external) &&
node->io_read) {
node->io_read(node->opaque);
@@ -333,6 +336,7 @@ bool aio_dispatch(AioContext *ctx)
}
if (!node->deleted &&
(revents & (G_IO_OUT | G_IO_ERR)) &&
aio_node_check(ctx, node->is_external) &&
node->io_write) {
node->io_write(node->opaque);
progress = true;

View File

@@ -22,6 +22,8 @@
* THE SOFTWARE.
*/
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include "sysemu/sysemu.h"
#include "sysemu/arch_init.h"
#include "hw/pci/pci.h"

View File

@@ -24,6 +24,7 @@
*/
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/bswap.h"
#include "audio.h"
#define AUDIO_CAP "mixeng"
@@ -270,7 +271,7 @@ f_sample *mixeng_clip[2][2][2][3] = {
* August 21, 1998
* Copyright 1998 Fabrice Bellard.
*
* [Rewrote completly the code of Lance Norskog And Sundry
* [Rewrote completely the code of Lance Norskog And Sundry
* Contributors with a more efficient algorithm.]
*
* This source code is freely redistributable and may be used for

View File

@@ -23,6 +23,7 @@
*/
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/host-utils.h"
#include "audio.h"
#include "qemu/timer.h"

View File

@@ -898,7 +898,7 @@ static struct audio_option oss_options[] = {
.name = "EXCLUSIVE",
.tag = AUD_OPT_BOOL,
.valp = &glob_conf.exclusive,
.descr = "Open device in exclusive mode (vmix wont work)"
.descr = "Open device in exclusive mode (vmix won't work)"
},
#ifdef USE_DSP_POLICY
{

View File

@@ -19,6 +19,7 @@
#include "qemu/osdep.h"
#include "hw/hw.h"
#include "qemu/host-utils.h"
#include "qemu/error-report.h"
#include "qemu/timer.h"
#include "ui/qemu-spice.h"

View File

@@ -22,7 +22,7 @@
* THE SOFTWARE.
*/
#include "qemu/osdep.h"
#include "hw/hw.h"
#include "qemu/host-utils.h"
#include "qemu/timer.h"
#include "audio.h"

View File

@@ -121,11 +121,19 @@ file_backend_instance_init(Object *o)
set_mem_path, NULL);
}
static void file_backend_instance_finalize(Object *o)
{
HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
g_free(fb->mem_path);
}
static const TypeInfo file_backend_info = {
.name = TYPE_MEMORY_BACKEND_FILE,
.parent = TYPE_MEMORY_BACKEND,
.class_init = file_backend_class_init,
.instance_init = file_backend_instance_init,
.instance_finalize = file_backend_instance_finalize,
.instance_size = sizeof(HostMemoryBackendFile),
};

302
block.c
View File

@@ -38,7 +38,6 @@
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"
#include "block/throttle-groups.h"
#include "qemu/cutils.h"
#include "qemu/id.h"
@@ -218,8 +217,6 @@ void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
void bdrv_register(BlockDriver *bdrv)
{
bdrv_setup_io_funcs(bdrv);
QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
@@ -239,8 +236,6 @@ BlockDriverState *bdrv_new(void)
QLIST_INIT(&bs->op_blockers[i]);
}
notifier_with_return_list_init(&bs->before_write_notifiers);
qemu_co_queue_init(&bs->throttled_reqs[0]);
qemu_co_queue_init(&bs->throttled_reqs[1]);
bs->refcnt = 1;
bs->aio_context = qemu_get_aio_context();
@@ -1176,10 +1171,10 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
return child;
}
static BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
BlockDriverState *child_bs,
const char *child_name,
const BdrvChildRole *child_role)
BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
BlockDriverState *child_bs,
const char *child_name,
const BdrvChildRole *child_role)
{
BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role);
QLIST_INSERT_HEAD(&parent_bs->children, child, next);
@@ -1219,6 +1214,27 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
bdrv_root_unref_child(child);
}
static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
{
BdrvChild *c;
QLIST_FOREACH(c, &bs->parents, next_parent) {
if (c->role->change_media) {
c->role->change_media(c, load);
}
}
}
static void bdrv_parent_cb_resize(BlockDriverState *bs)
{
BdrvChild *c;
QLIST_FOREACH(c, &bs->parents, next_parent) {
if (c->role->resize) {
c->role->resize(c);
}
}
}
/*
* Sets the backing file link of a BDS. A new reference is created; callers
* which don't need their own reference any more must call bdrv_unref().
@@ -1527,12 +1543,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
return -ENODEV;
}
if (bs->throttle_state) {
error_setg(errp, "Cannot reference an existing block device for "
"which I/O throttling is enabled");
return -EINVAL;
}
bdrv_ref(bs);
*pbs = bs;
return 0;
@@ -1684,9 +1694,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
}
if (!bdrv_key_required(bs)) {
if (bs->blk) {
blk_dev_change_media_cb(bs->blk, true);
}
bdrv_parent_cb_change_media(bs, true);
} else if (!runstate_check(RUN_STATE_PRELAUNCH)
&& !runstate_check(RUN_STATE_INMIGRATE)
&& !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
@@ -2125,11 +2133,6 @@ static void bdrv_close(BlockDriverState *bs)
assert(!bs->job);
/* Disable I/O limits and drain all pending throttled requests */
if (bs->throttle_state) {
bdrv_io_limits_disable(bs);
}
bdrv_drained_begin(bs); /* complete I/O */
bdrv_flush(bs);
bdrv_drain(bs); /* in case flush left pending I/O */
@@ -2137,9 +2140,7 @@ static void bdrv_close(BlockDriverState *bs)
bdrv_release_named_dirty_bitmaps(bs);
assert(QLIST_EMPTY(&bs->dirty_bitmaps));
if (bs->blk) {
blk_dev_change_media_cb(bs->blk, false);
}
bdrv_parent_cb_change_media(bs, false);
if (bs->drv) {
BdrvChild *child, *next;
@@ -2220,26 +2221,11 @@ void bdrv_close_all(void)
}
}
/* Fields that need to stay with the top-level BDS */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
BlockDriverState *bs_src)
{
/* move some fields that need to stay attached to the device */
}
static void change_parent_backing_link(BlockDriverState *from,
BlockDriverState *to)
{
BdrvChild *c, *next;
if (from->blk) {
/* FIXME We bypass blk_set_bs(), so we need to make these updates
* manually. The root problem is not in this change function, but the
* existence of BlockDriverState.blk. */
to->blk = from->blk;
from->blk = NULL;
}
QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
assert(c->role != &child_backing);
c->bs = to;
@@ -2250,23 +2236,6 @@ static void change_parent_backing_link(BlockDriverState *from,
}
}
static void swap_feature_fields(BlockDriverState *bs_top,
BlockDriverState *bs_new)
{
BlockDriverState tmp;
bdrv_move_feature_fields(&tmp, bs_top);
bdrv_move_feature_fields(bs_top, bs_new);
bdrv_move_feature_fields(bs_new, &tmp);
assert(!bs_new->throttle_state);
if (bs_top->throttle_state) {
assert(bs_top->io_limits_enabled);
bdrv_io_limits_enable(bs_new, throttle_group_get_name(bs_top));
bdrv_io_limits_disable(bs_top);
}
}
/*
* Add new bs contents at the top of an image chain while the chain is
* live, while keeping required fields on the top layer.
@@ -2289,11 +2258,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
assert(!bdrv_requests_pending(bs_new));
bdrv_ref(bs_top);
change_parent_backing_link(bs_top, bs_new);
/* Some fields always stay on top of the backing file chain */
swap_feature_fields(bs_top, bs_new);
bdrv_set_backing_hd(bs_new, bs_top);
bdrv_unref(bs_top);
@@ -2309,16 +2275,6 @@ void bdrv_replace_in_backing_chain(BlockDriverState *old, BlockDriverState *new)
bdrv_ref(old);
if (old->blk) {
/* As long as these fields aren't in BlockBackend, but in the top-level
* BlockDriverState, it's not possible for a BDS to have two BBs.
*
* We really want to copy the fields from old to new, but we go for a
* swap instead so that pointers aren't duplicated and cause trouble.
* (Also, bdrv_swap() used to do the same.) */
assert(!new->blk);
swap_feature_fields(old, new);
}
change_parent_backing_link(old, new);
/* Change backing files if a previously independent node is added to the
@@ -2627,9 +2583,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset)
if (ret == 0) {
ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
bdrv_dirty_bitmap_truncate(bs);
if (bs->blk) {
blk_dev_resize_cb(bs->blk);
}
bdrv_parent_cb_resize(bs);
}
return ret;
}
@@ -2739,11 +2693,9 @@ int bdrv_set_key(BlockDriverState *bs, const char *key)
if (ret < 0) {
bs->valid_key = 0;
} else if (!bs->valid_key) {
/* call the change callback now, we skipped it on open */
bs->valid_key = 1;
if (bs->blk) {
/* call the change callback now, we skipped it on open */
blk_dev_change_media_cb(bs->blk, true);
}
bdrv_parent_cb_change_media(bs, true);
}
return ret;
}
@@ -2910,34 +2862,33 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
return QTAILQ_NEXT(bs, node_list);
}
/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
* the monitor or attached to a BlockBackend */
BlockDriverState *bdrv_next(BlockDriverState *bs)
{
if (!bs || bs->blk) {
bs = blk_next_root_bs(bs);
if (bs) {
return bs;
}
}
/* Ignore all BDSs that are attached to a BlockBackend here; they have been
* handled by the above block already */
do {
bs = bdrv_next_monitor_owned(bs);
} while (bs && bs->blk);
return bs;
}
const char *bdrv_get_node_name(const BlockDriverState *bs)
{
return bs->node_name;
}
const char *bdrv_get_parent_name(const BlockDriverState *bs)
{
BdrvChild *c;
const char *name;
/* If multiple parents have a name, just pick the first one. */
QLIST_FOREACH(c, &bs->parents, next_parent) {
if (c->role->get_name) {
name = c->role->get_name(c);
if (name && *name) {
return name;
}
}
}
return NULL;
}
/* TODO check what callers really want: bs->node_name or blk_name() */
const char *bdrv_get_device_name(const BlockDriverState *bs)
{
return bs->blk ? blk_name(bs->blk) : "";
return bdrv_get_parent_name(bs) ?: "";
}
/* This can be used to identify nodes that might not have a device
@@ -2946,7 +2897,7 @@ const char *bdrv_get_device_name(const BlockDriverState *bs)
* absent, then this returns an empty (non-null) string. */
const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
{
return bs->blk ? blk_name(bs->blk) : bs->node_name;
return bdrv_get_parent_name(bs) ?: bs->node_name;
}
int bdrv_get_flags(BlockDriverState *bs)
@@ -3201,6 +3152,7 @@ void bdrv_init_with_whitelist(void)
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
BdrvChild *child;
Error *local_err = NULL;
int ret;
@@ -3215,13 +3167,20 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
if (bs->drv->bdrv_invalidate_cache) {
bs->drv->bdrv_invalidate_cache(bs, &local_err);
} else if (bs->file) {
bdrv_invalidate_cache(bs->file->bs, &local_err);
if (local_err) {
bs->open_flags |= BDRV_O_INACTIVE;
error_propagate(errp, local_err);
return;
}
}
if (local_err) {
bs->open_flags |= BDRV_O_INACTIVE;
error_propagate(errp, local_err);
return;
QLIST_FOREACH(child, &bs->children, next) {
bdrv_invalidate_cache(child->bs, &local_err);
if (local_err) {
bs->open_flags |= BDRV_O_INACTIVE;
error_propagate(errp, local_err);
return;
}
}
ret = refresh_total_sectors(bs, bs->total_sectors);
@@ -3234,10 +3193,11 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
void bdrv_invalidate_cache_all(Error **errp)
{
BlockDriverState *bs = NULL;
BlockDriverState *bs;
Error *local_err = NULL;
BdrvNextIterator *it = NULL;
while ((bs = bdrv_next(bs)) != NULL) {
while ((it = bdrv_next(it, &bs)) != NULL) {
AioContext *aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
@@ -3250,38 +3210,64 @@ void bdrv_invalidate_cache_all(Error **errp)
}
}
static int bdrv_inactivate(BlockDriverState *bs)
static int bdrv_inactivate_recurse(BlockDriverState *bs,
bool setting_flag)
{
BdrvChild *child;
int ret;
if (bs->drv->bdrv_inactivate) {
if (!setting_flag && bs->drv->bdrv_inactivate) {
ret = bs->drv->bdrv_inactivate(bs);
if (ret < 0) {
return ret;
}
}
bs->open_flags |= BDRV_O_INACTIVE;
QLIST_FOREACH(child, &bs->children, next) {
ret = bdrv_inactivate_recurse(child->bs, setting_flag);
if (ret < 0) {
return ret;
}
}
if (setting_flag) {
bs->open_flags |= BDRV_O_INACTIVE;
}
return 0;
}
int bdrv_inactivate_all(void)
{
BlockDriverState *bs = NULL;
int ret;
BdrvNextIterator *it = NULL;
int ret = 0;
int pass;
while ((bs = bdrv_next(bs)) != NULL) {
AioContext *aio_context = bdrv_get_aio_context(bs);
while ((it = bdrv_next(it, &bs)) != NULL) {
aio_context_acquire(bdrv_get_aio_context(bs));
}
aio_context_acquire(aio_context);
ret = bdrv_inactivate(bs);
aio_context_release(aio_context);
if (ret < 0) {
return ret;
/* We do two passes of inactivation. The first pass calls to drivers'
* .bdrv_inactivate callbacks recursively so all cache is flushed to disk;
* the second pass sets the BDRV_O_INACTIVE flag so that no further write
* is allowed. */
for (pass = 0; pass < 2; pass++) {
it = NULL;
while ((it = bdrv_next(it, &bs)) != NULL) {
ret = bdrv_inactivate_recurse(bs, pass);
if (ret < 0) {
goto out;
}
}
}
return 0;
out:
it = NULL;
while ((it = bdrv_next(it, &bs)) != NULL) {
aio_context_release(bdrv_get_aio_context(bs));
}
return ret;
}
/**************************************************************/
@@ -3623,6 +3609,7 @@ AioContext *bdrv_get_aio_context(BlockDriverState *bs)
void bdrv_detach_aio_context(BlockDriverState *bs)
{
BdrvAioNotifier *baf;
BdrvChild *child;
if (!bs->drv) {
return;
@@ -3632,17 +3619,11 @@ void bdrv_detach_aio_context(BlockDriverState *bs)
baf->detach_aio_context(baf->opaque);
}
if (bs->throttle_state) {
throttle_timers_detach_aio_context(&bs->throttle_timers);
}
if (bs->drv->bdrv_detach_aio_context) {
bs->drv->bdrv_detach_aio_context(bs);
}
if (bs->file) {
bdrv_detach_aio_context(bs->file->bs);
}
if (bs->backing) {
bdrv_detach_aio_context(bs->backing->bs);
QLIST_FOREACH(child, &bs->children, next) {
bdrv_detach_aio_context(child->bs);
}
bs->aio_context = NULL;
@@ -3652,6 +3633,7 @@ void bdrv_attach_aio_context(BlockDriverState *bs,
AioContext *new_context)
{
BdrvAioNotifier *ban;
BdrvChild *child;
if (!bs->drv) {
return;
@@ -3659,18 +3641,12 @@ void bdrv_attach_aio_context(BlockDriverState *bs,
bs->aio_context = new_context;
if (bs->backing) {
bdrv_attach_aio_context(bs->backing->bs, new_context);
}
if (bs->file) {
bdrv_attach_aio_context(bs->file->bs, new_context);
QLIST_FOREACH(child, &bs->children, next) {
bdrv_attach_aio_context(child->bs, new_context);
}
if (bs->drv->bdrv_attach_aio_context) {
bs->drv->bdrv_attach_aio_context(bs, new_context);
}
if (bs->throttle_state) {
throttle_timers_attach_aio_context(&bs->throttle_timers, new_context);
}
QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
ban->attached_aio_context(new_context, ban->opaque);
@@ -3776,10 +3752,11 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
*/
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
BlockDriverState *bs = NULL;
BlockDriverState *bs;
BdrvNextIterator *it = NULL;
/* walk down the bs forest recursively */
while ((bs = bdrv_next(bs)) != NULL) {
while ((it = bdrv_next(it, &bs)) != NULL) {
bool perm;
/* try to recurse in this top level bs */
@@ -3981,3 +3958,52 @@ void bdrv_refresh_filename(BlockDriverState *bs)
QDECREF(json);
}
}
/*
* Hot add/remove a BDS's child. So the user can take a child offline when
* it is broken and take a new child online
*/
void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
Error **errp)
{
if (!parent_bs->drv || !parent_bs->drv->bdrv_add_child) {
error_setg(errp, "The node %s does not support adding a child",
bdrv_get_device_or_node_name(parent_bs));
return;
}
if (!QLIST_EMPTY(&child_bs->parents)) {
error_setg(errp, "The node %s already has a parent",
child_bs->node_name);
return;
}
parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp);
}
void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp)
{
BdrvChild *tmp;
if (!parent_bs->drv || !parent_bs->drv->bdrv_del_child) {
error_setg(errp, "The node %s does not support removing a child",
bdrv_get_device_or_node_name(parent_bs));
return;
}
QLIST_FOREACH(tmp, &parent_bs->children, next) {
if (tmp == child) {
break;
}
}
if (!tmp) {
error_setg(errp, "The node %s does not have a child named %s",
bdrv_get_device_or_node_name(parent_bs),
bdrv_get_device_or_node_name(child->bs));
return;
}
parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
}

View File

@@ -218,15 +218,6 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}
static void backup_iostatus_reset(BlockJob *job)
{
BackupBlockJob *s = container_of(job, BackupBlockJob, common);
if (s->target->blk) {
blk_iostatus_reset(s->target->blk);
}
}
static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
{
BdrvDirtyBitmap *bm;
@@ -263,7 +254,6 @@ static const BlockJobDriver backup_job_driver = {
.instance_size = sizeof(BackupBlockJob),
.job_type = BLOCK_JOB_TYPE_BACKUP,
.set_speed = backup_set_speed,
.iostatus_reset = backup_iostatus_reset,
.commit = backup_commit,
.abort = backup_abort,
};
@@ -272,11 +262,11 @@ static BlockErrorAction backup_error_action(BackupBlockJob *job,
bool read, int error)
{
if (read) {
return block_job_error_action(&job->common, job->common.bs,
job->on_source_error, true, error);
return block_job_error_action(&job->common, job->on_source_error,
true, error);
} else {
return block_job_error_action(&job->common, job->target,
job->on_target_error, false, error);
return block_job_error_action(&job->common, job->on_target_error,
false, error);
}
}
@@ -388,7 +378,6 @@ static void coroutine_fn backup_run(void *opaque)
BackupCompleteData *data;
BlockDriverState *bs = job->common.bs;
BlockDriverState *target = job->target;
BlockdevOnError on_target_error = job->on_target_error;
NotifierWithReturn before_write = {
.notify = backup_before_write_notify,
};
@@ -404,11 +393,6 @@ static void coroutine_fn backup_run(void *opaque)
job->done_bitmap = bitmap_new(end);
if (target->blk) {
blk_set_on_error(target->blk, on_target_error, on_target_error);
blk_iostatus_enable(target->blk);
}
bdrv_add_before_write_notifier(bs, &before_write);
if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
@@ -484,9 +468,6 @@ static void coroutine_fn backup_run(void *opaque)
qemu_co_rwlock_unlock(&job->flush_rwlock);
g_free(job->done_bitmap);
if (target->blk) {
blk_iostatus_disable(target->blk);
}
bdrv_op_unblock_all(target, job->common.blocker);
data = g_malloc(sizeof(*data));
@@ -515,13 +496,6 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
return;
}
if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
(!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
return;
}
if (!bdrv_is_inserted(bs)) {
error_setg(errp, "Device is not inserted: %s",
bdrv_get_device_name(bs));

View File

@@ -293,22 +293,6 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs,
return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate);
}
/* Propagate AioContext changes to ->test_file */
static void blkverify_detach_aio_context(BlockDriverState *bs)
{
BDRVBlkverifyState *s = bs->opaque;
bdrv_detach_aio_context(s->test_file->bs);
}
static void blkverify_attach_aio_context(BlockDriverState *bs,
AioContext *new_context)
{
BDRVBlkverifyState *s = bs->opaque;
bdrv_attach_aio_context(s->test_file->bs, new_context);
}
static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options)
{
BDRVBlkverifyState *s = bs->opaque;
@@ -356,9 +340,6 @@ static BlockDriver bdrv_blkverify = {
.bdrv_aio_writev = blkverify_aio_writev,
.bdrv_aio_flush = blkverify_aio_flush,
.bdrv_attach_aio_context = blkverify_attach_aio_context,
.bdrv_detach_aio_context = blkverify_detach_aio_context,
.is_filter = true,
.bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter,
};

View File

@@ -1,7 +1,7 @@
/*
* QEMU Block backends
*
* Copyright (C) 2014 Red Hat, Inc.
* Copyright (C) 2014-2016 Red Hat, Inc.
*
* Authors:
* Markus Armbruster <armbru@redhat.com>,
@@ -34,6 +34,7 @@ struct BlockBackend {
DriveInfo *legacy_dinfo; /* null unless created by drive_new() */
QTAILQ_ENTRY(BlockBackend) link; /* for block_backends */
QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
BlockBackendPublic public;
void *dev; /* attached device model, if any */
/* TODO change to DeviceState when all users are qdevified */
@@ -74,6 +75,7 @@ static const AIOCBInfo block_backend_aiocb_info = {
};
static void drive_info_del(DriveInfo *dinfo);
static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
/* All BlockBackends */
static QTAILQ_HEAD(, BlockBackend) block_backends =
@@ -90,9 +92,26 @@ static void blk_root_inherit_options(int *child_flags, QDict *child_options,
/* We're not supposed to call this function for root nodes */
abort();
}
static void blk_root_drained_begin(BdrvChild *child);
static void blk_root_drained_end(BdrvChild *child);
static void blk_root_change_media(BdrvChild *child, bool load);
static void blk_root_resize(BdrvChild *child);
static const char *blk_root_get_name(BdrvChild *child)
{
return blk_name(child->opaque);
}
static const BdrvChildRole child_root = {
.inherit_options = blk_root_inherit_options,
.inherit_options = blk_root_inherit_options,
.change_media = blk_root_change_media,
.resize = blk_root_resize,
.get_name = blk_root_get_name,
.drained_begin = blk_root_drained_begin,
.drained_end = blk_root_drained_end,
};
/*
@@ -106,8 +125,12 @@ BlockBackend *blk_new(Error **errp)
blk = g_new0(BlockBackend, 1);
blk->refcnt = 1;
qemu_co_queue_init(&blk->public.throttled_reqs[0]);
qemu_co_queue_init(&blk->public.throttled_reqs[1]);
notifier_list_init(&blk->remove_bs_notifiers);
notifier_list_init(&blk->insert_bs_notifiers);
QTAILQ_INSERT_TAIL(&block_backends, blk, link);
return blk;
}
@@ -128,7 +151,7 @@ BlockBackend *blk_new_with_bs(Error **errp)
bs = bdrv_new_root();
blk->root = bdrv_root_attach_child(bs, "root", &child_root);
bs->blk = blk;
blk->root->opaque = blk;
return blk;
}
@@ -177,10 +200,6 @@ static void blk_delete(BlockBackend *blk)
}
assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
if (blk->root_state.throttle_state) {
g_free(blk->root_state.throttle_group);
throttle_group_unref(blk->root_state.throttle_state);
}
QTAILQ_REMOVE(&block_backends, blk, link);
drive_info_del(blk->legacy_dinfo);
block_acct_cleanup(&blk->stats);
@@ -267,28 +286,50 @@ BlockBackend *blk_next(BlockBackend *blk)
: QTAILQ_FIRST(&monitor_block_backends);
}
/*
* Iterates over all BlockDriverStates which are attached to a BlockBackend.
* This function is for use by bdrv_next().
*
* @bs must be NULL or a BDS that is attached to a BB.
*/
BlockDriverState *blk_next_root_bs(BlockDriverState *bs)
{
struct BdrvNextIterator {
enum {
BDRV_NEXT_BACKEND_ROOTS,
BDRV_NEXT_MONITOR_OWNED,
} phase;
BlockBackend *blk;
BlockDriverState *bs;
};
if (bs) {
assert(bs->blk);
blk = bs->blk;
} else {
blk = NULL;
/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
* the monitor or attached to a BlockBackend */
BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
{
if (!it) {
it = g_new(BdrvNextIterator, 1);
*it = (BdrvNextIterator) {
.phase = BDRV_NEXT_BACKEND_ROOTS,
};
}
do {
blk = blk_all_next(blk);
} while (blk && !blk->root);
/* First, return all root nodes of BlockBackends. In order to avoid
* returning a BDS twice when multiple BBs refer to it, we only return it
* if the BB is the first one in the parent list of the BDS. */
if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
do {
it->blk = blk_all_next(it->blk);
*bs = it->blk ? blk_bs(it->blk) : NULL;
} while (it->blk && (*bs == NULL || bdrv_first_blk(*bs) != it->blk));
return blk ? blk->root->bs : NULL;
if (*bs) {
return it;
}
it->phase = BDRV_NEXT_MONITOR_OWNED;
}
/* Then return the monitor-owned BDSes without a BB attached. Ignore all
* BDSes that are attached to a BlockBackend here; they have been handled
* by the above block already */
do {
it->bs = bdrv_next_monitor_owned(it->bs);
*bs = it->bs;
} while (*bs && bdrv_has_blk(*bs));
return *bs ? it : NULL;
}
/*
@@ -375,6 +416,26 @@ BlockDriverState *blk_bs(BlockBackend *blk)
return blk->root ? blk->root->bs : NULL;
}
static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
{
BdrvChild *child;
QLIST_FOREACH(child, &bs->parents, next_parent) {
if (child->role == &child_root) {
return child->opaque;
}
}
return NULL;
}
/*
* Returns true if @bs has an associated BlockBackend.
*/
bool bdrv_has_blk(BlockDriverState *bs)
{
return bdrv_first_blk(bs) != NULL;
}
/*
* Return @blk's DriveInfo if any, else null.
*/
@@ -410,18 +471,34 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
abort();
}
/*
* Returns a pointer to the publicly accessible fields of @blk.
*/
BlockBackendPublic *blk_get_public(BlockBackend *blk)
{
return &blk->public;
}
/*
* Returns a BlockBackend given the associated @public fields.
*/
BlockBackend *blk_by_public(BlockBackendPublic *public)
{
return container_of(public, BlockBackend, public);
}
/*
* Disassociates the currently associated BlockDriverState from @blk.
*/
void blk_remove_bs(BlockBackend *blk)
{
assert(blk->root->bs->blk == blk);
notifier_list_notify(&blk->remove_bs_notifiers, blk);
if (blk->public.throttle_state) {
throttle_timers_detach_aio_context(&blk->public.throttle_timers);
}
blk_update_root_state(blk);
blk->root->bs->blk = NULL;
bdrv_root_unref_child(blk->root);
blk->root = NULL;
}
@@ -431,12 +508,15 @@ void blk_remove_bs(BlockBackend *blk)
*/
void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
{
assert(!blk->root && !bs->blk);
bdrv_ref(bs);
blk->root = bdrv_root_attach_child(bs, "root", &child_root);
bs->blk = blk;
blk->root->opaque = blk;
notifier_list_notify(&blk->insert_bs_notifiers, blk);
if (blk->public.throttle_state) {
throttle_timers_attach_aio_context(
&blk->public.throttle_timers, bdrv_get_aio_context(bs));
}
}
/*
@@ -525,6 +605,11 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load)
}
}
static void blk_root_change_media(BdrvChild *child, bool load)
{
blk_dev_change_media_cb(child->opaque, load);
}
/*
* Does @blk's attached device model have removable media?
* %true if no device model is attached.
@@ -579,8 +664,10 @@ bool blk_dev_is_medium_locked(BlockBackend *blk)
/*
* Notify @blk's attached device model of a backend size change.
*/
void blk_dev_resize_cb(BlockBackend *blk)
static void blk_root_resize(BdrvChild *child)
{
BlockBackend *blk = child->opaque;
if (blk->dev_ops && blk->dev_ops->resize_cb) {
blk->dev_ops->resize_cb(blk->dev_opaque);
}
@@ -692,7 +779,12 @@ static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
return ret;
}
return bdrv_co_do_preadv(blk_bs(blk), offset, bytes, qiov, flags);
/* throttling disk I/O */
if (blk->public.throttle_state) {
throttle_group_co_io_limits_intercept(blk, bytes, false);
}
return bdrv_co_preadv(blk_bs(blk), offset, bytes, qiov, flags);
}
static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
@@ -706,11 +798,16 @@ static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
return ret;
}
/* throttling disk I/O */
if (blk->public.throttle_state) {
throttle_group_co_io_limits_intercept(blk, bytes, true);
}
if (!blk->enable_write_cache) {
flags |= BDRV_REQ_FUA;
}
return bdrv_co_do_pwritev(blk_bs(blk), offset, bytes, qiov, flags);
return bdrv_co_pwritev(blk_bs(blk), offset, bytes, qiov, flags);
}
typedef struct BlkRwCo {
@@ -772,55 +869,27 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
return rwco.ret;
}
static int blk_rw(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
int nb_sectors, CoroutineEntry co_entry,
BdrvRequestFlags flags)
int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
int count)
{
if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
return -EINVAL;
}
return blk_prw(blk, sector_num << BDRV_SECTOR_BITS, buf,
nb_sectors << BDRV_SECTOR_BITS, co_entry, flags);
}
int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
int nb_sectors)
{
return blk_rw(blk, sector_num, buf, nb_sectors, blk_read_entry, 0);
}
int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
int nb_sectors)
{
BlockDriverState *bs = blk_bs(blk);
bool enabled;
int ret;
ret = blk_check_request(blk, sector_num, nb_sectors);
ret = blk_check_byte_request(blk, offset, count);
if (ret < 0) {
return ret;
}
enabled = bs->io_limits_enabled;
bs->io_limits_enabled = false;
ret = blk_read(blk, sector_num, buf, nb_sectors);
bs->io_limits_enabled = enabled;
blk_root_drained_begin(blk->root);
ret = blk_pread(blk, offset, buf, count);
blk_root_drained_end(blk->root);
return ret;
}
int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
int nb_sectors)
int blk_write_zeroes(BlockBackend *blk, int64_t offset,
int count, BdrvRequestFlags flags)
{
return blk_rw(blk, sector_num, (uint8_t*) buf, nb_sectors,
blk_write_entry, 0);
}
int blk_write_zeroes(BlockBackend *blk, int64_t sector_num,
int nb_sectors, BdrvRequestFlags flags)
{
return blk_rw(blk, sector_num, NULL, nb_sectors, blk_write_entry,
BDRV_REQ_ZERO_WRITE);
return blk_prw(blk, offset, NULL, count, blk_write_entry,
flags | BDRV_REQ_ZERO_WRITE);
}
static void error_callback_bh(void *opaque)
@@ -852,6 +921,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
typedef struct BlkAioEmAIOCB {
BlockAIOCB common;
BlkRwCo rwco;
int bytes;
bool has_returned;
QEMUBH* bh;
} BlkAioEmAIOCB;
@@ -877,7 +947,7 @@ static void blk_aio_complete_bh(void *opaque)
blk_aio_complete(opaque);
}
static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset,
static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
QEMUIOVector *qiov, CoroutineEntry co_entry,
BdrvRequestFlags flags,
BlockCompletionFunc *cb, void *opaque)
@@ -893,6 +963,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset,
.flags = flags,
.ret = NOT_DONE,
};
acb->bytes = bytes;
acb->bh = NULL;
acb->has_returned = false;
@@ -913,7 +984,8 @@ static void blk_aio_read_entry(void *opaque)
BlkAioEmAIOCB *acb = opaque;
BlkRwCo *rwco = &acb->rwco;
rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, rwco->qiov->size,
assert(rwco->qiov->size == acb->bytes);
rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes,
rwco->qiov, rwco->flags);
blk_aio_complete(acb);
}
@@ -923,22 +995,18 @@ static void blk_aio_write_entry(void *opaque)
BlkAioEmAIOCB *acb = opaque;
BlkRwCo *rwco = &acb->rwco;
rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset,
rwco->qiov ? rwco->qiov->size : 0,
assert(!rwco->qiov || rwco->qiov->size == acb->bytes);
rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes,
rwco->qiov, rwco->flags);
blk_aio_complete(acb);
}
BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
int nb_sectors, BdrvRequestFlags flags,
BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset,
int count, BdrvRequestFlags flags,
BlockCompletionFunc *cb, void *opaque)
{
if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
}
return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, NULL,
blk_aio_write_entry, BDRV_REQ_ZERO_WRITE, cb, opaque);
return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
}
int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
@@ -950,9 +1018,11 @@ int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
return count;
}
int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count)
int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
BdrvRequestFlags flags)
{
int ret = blk_prw(blk, offset, (void*) buf, count, blk_write_entry, 0);
int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
flags);
if (ret < 0) {
return ret;
}
@@ -986,28 +1056,20 @@ int64_t blk_nb_sectors(BlockBackend *blk)
return bdrv_nb_sectors(blk_bs(blk));
}
BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
QEMUIOVector *iov, int nb_sectors,
BlockCompletionFunc *cb, void *opaque)
{
if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
}
return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov,
blk_aio_read_entry, 0, cb, opaque);
}
BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
QEMUIOVector *iov, int nb_sectors,
BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
QEMUIOVector *qiov, BdrvRequestFlags flags,
BlockCompletionFunc *cb, void *opaque)
{
if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
}
return blk_aio_prwv(blk, offset, qiov->size, qiov,
blk_aio_read_entry, flags, cb, opaque);
}
return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov,
blk_aio_write_entry, 0, cb, opaque);
BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
QEMUIOVector *qiov, BdrvRequestFlags flags,
BlockCompletionFunc *cb, void *opaque)
{
return blk_aio_prwv(blk, offset, qiov->size, qiov,
blk_aio_write_entry, flags, cb, opaque);
}
BlockAIOCB *blk_aio_flush(BlockBackend *blk,
@@ -1042,20 +1104,6 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
bdrv_aio_cancel_async(acb);
}
int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)
{
int i, ret;
for (i = 0; i < num_reqs; i++) {
ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors);
if (ret < 0) {
return ret;
}
}
return bdrv_aio_multiwrite(blk_bs(blk), reqs, num_reqs);
}
int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
{
if (!blk_is_available(blk)) {
@@ -1368,7 +1416,14 @@ void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
BlockDriverState *bs = blk_bs(blk);
if (bs) {
if (blk->public.throttle_state) {
throttle_timers_detach_aio_context(&blk->public.throttle_timers);
}
bdrv_set_aio_context(bs, new_context);
if (blk->public.throttle_state) {
throttle_timers_attach_aio_context(&blk->public.throttle_timers,
new_context);
}
}
}
@@ -1437,16 +1492,11 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
}
int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num,
int nb_sectors, BdrvRequestFlags flags)
int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset,
int count, BdrvRequestFlags flags)
{
if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
return -EINVAL;
}
return blk_co_pwritev(blk, sector_num << BDRV_SECTOR_BITS,
nb_sectors << BDRV_SECTOR_BITS, NULL,
BDRV_REQ_ZERO_WRITE);
return blk_co_pwritev(blk, offset, count, NULL,
flags | BDRV_REQ_ZERO_WRITE);
}
int blk_write_compressed(BlockBackend *blk, int64_t sector_num,
@@ -1538,19 +1588,6 @@ void blk_update_root_state(BlockBackend *blk)
blk->root_state.open_flags = blk->root->bs->open_flags;
blk->root_state.read_only = blk->root->bs->read_only;
blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
if (blk->root_state.throttle_group) {
g_free(blk->root_state.throttle_group);
throttle_group_unref(blk->root_state.throttle_state);
}
if (blk->root->bs->throttle_state) {
const char *name = throttle_group_get_name(blk->root->bs);
blk->root_state.throttle_group = g_strdup(name);
blk->root_state.throttle_state = throttle_group_incref(name);
} else {
blk->root_state.throttle_group = NULL;
blk->root_state.throttle_state = NULL;
}
}
/*
@@ -1561,9 +1598,6 @@ void blk_update_root_state(BlockBackend *blk)
void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
{
bs->detect_zeroes = blk->root_state.detect_zeroes;
if (blk->root_state.throttle_group) {
bdrv_io_limits_enable(bs, blk->root_state.throttle_group);
}
}
/*
@@ -1626,3 +1660,59 @@ int blk_flush_all(void)
return result;
}
/* throttling disk I/O limits */
void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
{
throttle_group_config(blk, cfg);
}
void blk_io_limits_disable(BlockBackend *blk)
{
assert(blk->public.throttle_state);
bdrv_drained_begin(blk_bs(blk));
throttle_group_unregister_blk(blk);
bdrv_drained_end(blk_bs(blk));
}
/* should be called before blk_set_io_limits if a limit is set */
void blk_io_limits_enable(BlockBackend *blk, const char *group)
{
assert(!blk->public.throttle_state);
throttle_group_register_blk(blk, group);
}
void blk_io_limits_update_group(BlockBackend *blk, const char *group)
{
/* this BB is not part of any group */
if (!blk->public.throttle_state) {
return;
}
/* this BB is a part of the same group than the one we want */
if (!g_strcmp0(throttle_group_get_name(blk), group)) {
return;
}
/* need to change the group this bs belong to */
blk_io_limits_disable(blk);
blk_io_limits_enable(blk, group);
}
static void blk_root_drained_begin(BdrvChild *child)
{
BlockBackend *blk = child->opaque;
if (blk->public.io_limits_disabled++ == 0) {
throttle_group_restart_blk(blk);
}
}
static void blk_root_drained_end(BdrvChild *child)
{
BlockBackend *blk = child->opaque;
assert(blk->public.io_limits_disabled);
--blk->public.io_limits_disabled;
}

View File

@@ -27,6 +27,7 @@
#include "qemu-common.h"
#include "block/block_int.h"
#include "qemu/module.h"
#include "qemu/bswap.h"
/**************************************************************/
@@ -104,6 +105,7 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
int ret;
bs->read_only = 1; // no write support yet
bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */
ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs));
if (ret < 0) {
@@ -221,38 +223,52 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
}
static int bochs_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
static int coroutine_fn
bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVBochsState *s = bs->opaque;
uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
int nb_sectors = bytes >> BDRV_SECTOR_BITS;
uint64_t bytes_done = 0;
QEMUIOVector local_qiov;
int ret;
assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
qemu_iovec_init(&local_qiov, qiov->niov);
qemu_co_mutex_lock(&s->lock);
while (nb_sectors > 0) {
int64_t block_offset = seek_to_sector(bs, sector_num);
if (block_offset < 0) {
return block_offset;
} else if (block_offset > 0) {
ret = bdrv_pread(bs->file->bs, block_offset, buf, 512);
ret = block_offset;
goto fail;
}
qemu_iovec_reset(&local_qiov);
qemu_iovec_concat(&local_qiov, qiov, bytes_done, 512);
if (block_offset > 0) {
ret = bdrv_co_preadv(bs->file->bs, block_offset, 512,
&local_qiov, 0);
if (ret < 0) {
return ret;
goto fail;
}
} else {
memset(buf, 0, 512);
qemu_iovec_memset(&local_qiov, 0, 0, 512);
}
nb_sectors--;
sector_num++;
buf += 512;
bytes_done += 512;
}
return 0;
}
static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
{
int ret;
BDRVBochsState *s = bs->opaque;
qemu_co_mutex_lock(&s->lock);
ret = bochs_read(bs, sector_num, buf, nb_sectors);
ret = 0;
fail:
qemu_co_mutex_unlock(&s->lock);
qemu_iovec_destroy(&local_qiov);
return ret;
}
@@ -267,7 +283,7 @@ static BlockDriver bdrv_bochs = {
.instance_size = sizeof(BDRVBochsState),
.bdrv_probe = bochs_probe,
.bdrv_open = bochs_open,
.bdrv_read = bochs_co_read,
.bdrv_co_preadv = bochs_co_preadv,
.bdrv_close = bochs_close,
};

View File

@@ -26,6 +26,7 @@
#include "qemu-common.h"
#include "block/block_int.h"
#include "qemu/module.h"
#include "qemu/bswap.h"
#include <zlib.h>
/* Maximum compressed block size */
@@ -66,6 +67,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
int ret;
bs->read_only = 1;
bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */
/* read header */
ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4);
@@ -229,33 +231,38 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
return 0;
}
static int cloop_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
static int coroutine_fn
cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVCloopState *s = bs->opaque;
int i;
uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
int nb_sectors = bytes >> BDRV_SECTOR_BITS;
int ret, i;
assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
qemu_co_mutex_lock(&s->lock);
for (i = 0; i < nb_sectors; i++) {
void *data;
uint32_t sector_offset_in_block =
((sector_num + i) % s->sectors_per_block),
block_num = (sector_num + i) / s->sectors_per_block;
if (cloop_read_block(bs, block_num) != 0) {
return -1;
ret = -EIO;
goto fail;
}
memcpy(buf + i * 512,
s->uncompressed_block + sector_offset_in_block * 512, 512);
}
return 0;
}
static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
{
int ret;
BDRVCloopState *s = bs->opaque;
qemu_co_mutex_lock(&s->lock);
ret = cloop_read(bs, sector_num, buf, nb_sectors);
data = s->uncompressed_block + sector_offset_in_block * 512;
qemu_iovec_from_buf(qiov, i * 512, data, 512);
}
ret = 0;
fail:
qemu_co_mutex_unlock(&s->lock);
return ret;
}
@@ -273,7 +280,7 @@ static BlockDriver bdrv_cloop = {
.instance_size = sizeof(BDRVCloopState),
.bdrv_probe = cloop_probe,
.bdrv_open = cloop_open,
.bdrv_read = cloop_co_read,
.bdrv_co_preadv = cloop_co_preadv,
.bdrv_close = cloop_close,
};

View File

@@ -214,13 +214,6 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
BlockDriverState *overlay_bs;
Error *local_err = NULL;
if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
(!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
error_setg(errp, "Invalid parameter combination");
return;
}
assert(top != bs);
if (top == base) {
error_setg(errp, "Invalid files for merge: top and base are the same");

View File

@@ -91,7 +91,7 @@ static ssize_t block_crypto_write_func(QCryptoBlock *block,
struct BlockCryptoCreateData *data = opaque;
ssize_t ret;
ret = blk_pwrite(data->blk, offset, buf, buflen);
ret = blk_pwrite(data->blk, offset, buf, buflen, 0);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not write encryption header");
return ret;
@@ -196,7 +196,6 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
OptsVisitor *ov;
QCryptoBlockOpenOptions *ret = NULL;
Error *local_err = NULL;
Error *end_err = NULL;
ret = g_new0(QCryptoBlockOpenOptions, 1);
ret->format = format;
@@ -219,9 +218,11 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
error_setg(&local_err, "Unsupported block format %d", format);
break;
}
if (!local_err) {
visit_check_struct(opts_get_visitor(ov), &local_err);
}
visit_end_struct(opts_get_visitor(ov), &end_err);
error_propagate(&local_err, end_err);
visit_end_struct(opts_get_visitor(ov));
out:
if (local_err) {
@@ -242,7 +243,6 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
OptsVisitor *ov;
QCryptoBlockCreateOptions *ret = NULL;
Error *local_err = NULL;
Error *end_err = NULL;
ret = g_new0(QCryptoBlockCreateOptions, 1);
ret->format = format;
@@ -265,9 +265,11 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
error_setg(&local_err, "Unsupported block format %d", format);
break;
}
if (!local_err) {
visit_check_struct(opts_get_visitor(ov), &local_err);
}
visit_end_struct(opts_get_visitor(ov), &end_err);
error_propagate(&local_err, end_err);
visit_end_struct(opts_get_visitor(ov));
out:
if (local_err) {

View File

@@ -36,10 +36,16 @@
// #define DEBUG_VERBOSE
#ifdef DEBUG_CURL
#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
#define DEBUG_CURL_PRINT 1
#else
#define DPRINTF(fmt, ...) do { } while (0)
#define DEBUG_CURL_PRINT 0
#endif
#define DPRINTF(fmt, ...) \
do { \
if (DEBUG_CURL_PRINT) { \
fprintf(stderr, fmt, ## __VA_ARGS__); \
} \
} while (0)
#if LIBCURL_VERSION_NUM >= 0x071000
/* The multi interface timer callback was introduced in 7.16.0 */

View File

@@ -440,6 +440,8 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
int ret;
bs->read_only = 1;
bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */
s->n_chunks = 0;
s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
/* used by dmg_read_mish_block to keep track of the current I/O position */
@@ -659,38 +661,42 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
return 0;
}
static int dmg_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
static int coroutine_fn
dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVDMGState *s = bs->opaque;
int i;
uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
int nb_sectors = bytes >> BDRV_SECTOR_BITS;
int ret, i;
assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
qemu_co_mutex_lock(&s->lock);
for (i = 0; i < nb_sectors; i++) {
uint32_t sector_offset_in_chunk;
void *data;
if (dmg_read_chunk(bs, sector_num + i) != 0) {
return -1;
ret = -EIO;
goto fail;
}
/* Special case: current chunk is all zeroes. Do not perform a memcpy as
* s->uncompressed_chunk may be too small to cover the large all-zeroes
* section. dmg_read_chunk is called to find s->current_chunk */
if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */
memset(buf + i * 512, 0, 512);
qemu_iovec_memset(qiov, i * 512, 0, 512);
continue;
}
sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk];
memcpy(buf + i * 512,
s->uncompressed_chunk + sector_offset_in_chunk * 512, 512);
data = s->uncompressed_chunk + sector_offset_in_chunk * 512;
qemu_iovec_from_buf(qiov, i * 512, data, 512);
}
return 0;
}
static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
{
int ret;
BDRVDMGState *s = bs->opaque;
qemu_co_mutex_lock(&s->lock);
ret = dmg_read(bs, sector_num, buf, nb_sectors);
ret = 0;
fail:
qemu_co_mutex_unlock(&s->lock);
return ret;
}
@@ -715,7 +721,7 @@ static BlockDriver bdrv_dmg = {
.instance_size = sizeof(BDRVDMGState),
.bdrv_probe = dmg_probe,
.bdrv_open = dmg_open,
.bdrv_read = dmg_co_read,
.bdrv_co_preadv = dmg_co_preadv,
.bdrv_close = dmg_close,
};

View File

@@ -247,7 +247,7 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
if (!ret || ret == acb->size) {
acb->ret = 0; /* Success */
} else if (ret < 0) {
acb->ret = ret; /* Read/Write failed */
acb->ret = -errno; /* Read/Write failed */
} else {
acb->ret = -EIO; /* Partial read/write - fail it */
}
@@ -314,6 +314,23 @@ static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
goto out;
}
#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
/* Without this, if fsync fails for a recoverable reason (for instance,
* ENOSPC), gluster will dump its cache, preventing retries. This means
* almost certain data loss. Not all gluster versions support the
* 'resync-failed-syncs-after-fsync' key value, but there is no way to
* discover during runtime if it is supported (this api returns success for
* unknown key/value pairs) */
ret = glfs_set_xlator_option(s->glfs, "*-write-behind",
"resync-failed-syncs-after-fsync",
"on");
if (ret < 0) {
error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
ret = -errno;
goto out;
}
#endif
qemu_gluster_parse_flags(bdrv_flags, &open_flags);
s->fd = glfs_open(s->glfs, gconf->image, open_flags);
@@ -366,6 +383,16 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
goto exit;
}
#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
ret = glfs_set_xlator_option(reop_s->glfs, "*-write-behind",
"resync-failed-syncs-after-fsync", "on");
if (ret < 0) {
error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
ret = -errno;
goto exit;
}
#endif
reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags);
if (reop_s->fd == NULL) {
/* reops->glfs will be cleaned up in _abort */
@@ -589,6 +616,17 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
}
static void qemu_gluster_close(BlockDriverState *bs)
{
BDRVGlusterState *s = bs->opaque;
if (s->fd) {
glfs_close(s->fd);
s->fd = NULL;
}
glfs_fini(s->glfs);
}
static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
{
int ret;
@@ -602,11 +640,35 @@ static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
if (ret < 0) {
return -errno;
ret = -errno;
goto error;
}
qemu_coroutine_yield();
if (acb.ret < 0) {
ret = acb.ret;
goto error;
}
return acb.ret;
error:
/* Some versions of Gluster (3.5.6 -> 3.5.8?) will not retain its cache
* after a fsync failure, so we have no way of allowing the guest to safely
* continue. Gluster versions prior to 3.5.6 don't retain the cache
* either, but will invalidate the fd on error, so this is again our only
* option.
*
* The 'resync-failed-syncs-after-fsync' xlator option for the
* write-behind cache will cause later gluster versions to retain its
* cache after error, so long as the fd remains open. However, we
* currently have no way of knowing if this option is supported.
*
* TODO: Once gluster provides a way for us to determine if the option
* is supported, bypass the closure and setting drv to NULL. */
qemu_gluster_close(bs);
bs->drv = NULL;
return ret;
}
#ifdef CONFIG_GLUSTERFS_DISCARD
@@ -661,17 +723,6 @@ static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
}
}
static void qemu_gluster_close(BlockDriverState *bs)
{
BDRVGlusterState *s = bs->opaque;
if (s->fd) {
glfs_close(s->fd);
s->fd = NULL;
}
glfs_fini(s->glfs);
}
static int qemu_gluster_has_zero_init(BlockDriverState *bs)
{
/* GlusterFS volume could be backed by a block device */

File diff suppressed because it is too large

View File

@@ -456,8 +456,11 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
struct IscsiTask iTask;
uint64_t lba;
uint32_t num_sectors;
bool fua;
bool fua = flags & BDRV_REQ_FUA;
if (fua) {
assert(iscsilun->dpofua);
}
if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
return -EINVAL;
}
@@ -472,7 +475,6 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
iscsi_co_init_iscsitask(iscsilun, &iTask);
retry:
fua = iscsilun->dpofua && (flags & BDRV_REQ_FUA);
if (iscsilun->use_16_for_rw) {
iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
NULL, num_sectors * iscsilun->block_size,
@@ -513,13 +515,6 @@ retry:
return 0;
}
static int coroutine_fn
iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
QEMUIOVector *iov)
{
return iscsi_co_writev_flags(bs, sector_num, nb_sectors, iov, 0);
}
static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun,
int64_t sector_num, int nb_sectors)
@@ -1555,6 +1550,10 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
task = NULL;
iscsi_modesense_sync(iscsilun);
if (iscsilun->dpofua) {
bs->supported_write_flags = BDRV_REQ_FUA;
}
bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
/* Check the write protect flag of the LUN if we want to write */
if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
@@ -1847,9 +1846,7 @@ static BlockDriver bdrv_iscsi = {
.bdrv_co_discard = iscsi_co_discard,
.bdrv_co_write_zeroes = iscsi_co_write_zeroes,
.bdrv_co_readv = iscsi_co_readv,
.bdrv_co_writev = iscsi_co_writev,
.bdrv_co_writev_flags = iscsi_co_writev_flags,
.supported_write_flags = BDRV_REQ_FUA,
.bdrv_co_flush_to_disk = iscsi_co_flush,
#ifdef __linux__


@@ -30,7 +30,7 @@
struct qemu_laiocb {
BlockAIOCB common;
struct qemu_laio_state *ctx;
LinuxAioState *ctx;
struct iocb iocb;
ssize_t ret;
size_t nbytes;
@@ -46,7 +46,7 @@ typedef struct {
QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;
struct qemu_laio_state {
struct LinuxAioState {
io_context_t ctx;
EventNotifier e;
@@ -60,7 +60,7 @@ struct qemu_laio_state {
int event_max;
};
static void ioq_submit(struct qemu_laio_state *s);
static void ioq_submit(LinuxAioState *s);
static inline ssize_t io_event_ret(struct io_event *ev)
{
@@ -70,8 +70,7 @@ static inline ssize_t io_event_ret(struct io_event *ev)
/*
* Completes an AIO request (calls the callback and frees the ACB).
*/
static void qemu_laio_process_completion(struct qemu_laio_state *s,
struct qemu_laiocb *laiocb)
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
int ret;
@@ -99,7 +98,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
*
* The function is somewhat tricky because it supports nested event loops, for
* example when a request callback invokes aio_poll(). In order to do this,
* the completion events array and index are kept in qemu_laio_state. The BH
* the completion events array and index are kept in LinuxAioState. The BH
* reschedules itself as long as there are completions pending so it will
* either be called again in a nested event loop or will be called after all
* events have been completed. When there are no events left to complete, the
@@ -107,7 +106,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
*/
static void qemu_laio_completion_bh(void *opaque)
{
struct qemu_laio_state *s = opaque;
LinuxAioState *s = opaque;
/* Fetch more completion events when empty */
if (s->event_idx == s->event_max) {
@@ -136,7 +135,7 @@ static void qemu_laio_completion_bh(void *opaque)
laiocb->ret = io_event_ret(&s->events[s->event_idx]);
s->event_idx++;
qemu_laio_process_completion(s, laiocb);
qemu_laio_process_completion(laiocb);
}
if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
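
The comment above describes a bottom half that keeps rescheduling itself while completions remain, so progress is also made when it is entered from a nested event loop. The following is a toy, self-contained illustration of that scheduling idea in plain C; no QEMU APIs are involved and every name here is invented.

#include <stdbool.h>
#include <stdio.h>

static int pending = 5;        /* completions waiting to be processed */
static bool bh_scheduled;

static void completion_bh(void)
{
    int batch = 2;             /* handle a small batch per invocation */
    while (pending > 0 && batch-- > 0) {
        pending--;
        printf("completed one request, %d left\n", pending);
    }
    /* Reschedule ourselves as long as work remains; this is the property
     * that lets a nested loop invocation pick up where we left off. */
    bh_scheduled = (pending > 0);
}

int main(void)
{
    bh_scheduled = true;
    while (bh_scheduled) {     /* stand-in for the event loop */
        completion_bh();
    }
    return 0;
}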
@@ -146,7 +145,7 @@ static void qemu_laio_completion_bh(void *opaque)
static void qemu_laio_completion_cb(EventNotifier *e)
{
struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
LinuxAioState *s = container_of(e, LinuxAioState, e);
if (event_notifier_test_and_clear(&s->e)) {
qemu_bh_schedule(s->completion_bh);
@@ -185,7 +184,7 @@ static void ioq_init(LaioQueue *io_q)
io_q->blocked = false;
}
static void ioq_submit(struct qemu_laio_state *s)
static void ioq_submit(LinuxAioState *s)
{
int ret, len;
struct qemu_laiocb *aiocb;
@@ -216,33 +215,25 @@ static void ioq_submit(struct qemu_laio_state *s)
s->io_q.blocked = (s->io_q.n > 0);
}
void laio_io_plug(BlockDriverState *bs, void *aio_ctx)
void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
{
struct qemu_laio_state *s = aio_ctx;
s->io_q.plugged++;
assert(!s->io_q.plugged);
s->io_q.plugged = 1;
}
void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug)
void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
{
struct qemu_laio_state *s = aio_ctx;
assert(s->io_q.plugged > 0 || !unplug);
if (unplug && --s->io_q.plugged > 0) {
return;
}
assert(s->io_q.plugged);
s->io_q.plugged = 0;
if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
ioq_submit(s);
}
}
BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
BlockCompletionFunc *cb, void *opaque, int type)
{
struct qemu_laio_state *s = aio_ctx;
struct qemu_laiocb *laiocb;
struct iocb *iocbs;
off_t offset = sector_num * 512;
@@ -284,26 +275,22 @@ out_free_aiocb:
return NULL;
}
void laio_detach_aio_context(void *s_, AioContext *old_context)
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
struct qemu_laio_state *s = s_;
aio_set_event_notifier(old_context, &s->e, false, NULL);
qemu_bh_delete(s->completion_bh);
}
void laio_attach_aio_context(void *s_, AioContext *new_context)
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
struct qemu_laio_state *s = s_;
s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
aio_set_event_notifier(new_context, &s->e, false,
qemu_laio_completion_cb);
}
void *laio_init(void)
LinuxAioState *laio_init(void)
{
struct qemu_laio_state *s;
LinuxAioState *s;
s = g_malloc0(sizeof(*s));
if (event_notifier_init(&s->e, false) < 0) {
@@ -325,10 +312,8 @@ out_free_state:
return NULL;
}
void laio_cleanup(void *s_)
void laio_cleanup(LinuxAioState *s)
{
struct qemu_laio_state *s = s_;
event_notifier_cleanup(&s->e);
if (io_destroy(s->ctx) != 0) {


@@ -80,11 +80,11 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
{
s->synced = false;
if (read) {
return block_job_error_action(&s->common, s->common.bs,
s->on_source_error, true, error);
return block_job_error_action(&s->common, s->on_source_error,
true, error);
} else {
return block_job_error_action(&s->common, s->target,
s->on_target_error, false, error);
return block_job_error_action(&s->common, s->on_target_error,
false, error);
}
}
@@ -108,7 +108,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
chunk_num = op->sector_num / sectors_per_chunk;
nb_chunks = op->nb_sectors / sectors_per_chunk;
nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
if (ret >= 0) {
if (s->cow_bitmap) {
@@ -161,6 +161,14 @@ static void mirror_read_complete(void *opaque, int ret)
mirror_write_complete, op);
}
static inline void mirror_clip_sectors(MirrorBlockJob *s,
int64_t sector_num,
int *nb_sectors)
{
*nb_sectors = MIN(*nb_sectors,
s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}
/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
* return the offset of the adjusted tail sector against original. */
static int mirror_cow_align(MirrorBlockJob *s,
@@ -189,6 +197,9 @@ static int mirror_cow_align(MirrorBlockJob *s,
s->target_cluster_sectors);
}
}
/* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
* that doesn't matter because it's already the end of source image. */
mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);
ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
*sector_num = align_sector_num;
@@ -231,9 +242,8 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
/* The sector range must meet granularity because:
* 1) Caller passes in aligned values;
* 2) mirror_cow_align is used only when target cluster is larger. */
assert(!(nb_sectors % sectors_per_chunk));
assert(!(sector_num % sectors_per_chunk));
nb_chunks = nb_sectors / sectors_per_chunk;
nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);
while (s->buf_free_count < nb_chunks) {
trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
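
The point of switching from plain division to DIV_ROUND_UP above seems to be that, with the alignment asserts gone and requests clipped at the end of the image, nb_sectors is no longer guaranteed to be a whole multiple of sectors_per_chunk, and plain division would under-count the chunks actually touched. A small worked example follows; the macro is spelled out locally so the snippet stands alone.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    int sectors_per_chunk = 8;
    int nb_sectors = 13;       /* clipped tail, not chunk-aligned */

    printf("plain division: %d chunks\n", nb_sectors / sectors_per_chunk);
    printf("DIV_ROUND_UP:   %d chunks\n",
           DIV_ROUND_UP(nb_sectors, sectors_per_chunk));
    return 0;
}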
@@ -298,7 +308,7 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
BlockDriverState *source = s->common.bs;
int64_t sector_num;
int64_t sector_num, first_chunk;
uint64_t delay_ns = 0;
/* At least the first dirty chunk is mirrored in one iteration. */
int nb_chunks = 1;
@@ -313,6 +323,12 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
assert(sector_num >= 0);
}
first_chunk = sector_num / sectors_per_chunk;
while (test_bit(first_chunk, s->in_flight_bitmap)) {
trace_mirror_yield_in_flight(s, first_chunk, s->in_flight);
mirror_wait_for_io(s);
}
/* Find the number of consecutive dirty chunks following the first dirty
* one, and wait for in flight requests in them. */
while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
@@ -324,17 +340,17 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
break;
}
if (test_bit(next_chunk, s->in_flight_bitmap)) {
if (nb_chunks > 0) {
break;
}
trace_mirror_yield_in_flight(s, next_sector, s->in_flight);
mirror_wait_for_io(s);
/* Now retry. */
} else {
hbitmap_next = hbitmap_iter_next(&s->hbi);
assert(hbitmap_next == next_sector);
nb_chunks++;
break;
}
hbitmap_next = hbitmap_iter_next(&s->hbi);
if (hbitmap_next > next_sector || hbitmap_next < 0) {
/* The bitmap iterator's cache is stale, refresh it */
bdrv_set_dirty_iter(&s->hbi, next_sector);
hbitmap_next = hbitmap_iter_next(&s->hbi);
}
assert(hbitmap_next == next_sector);
nb_chunks++;
}
/* Clear dirty bits before querying the block status, because
@@ -378,6 +394,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
}
}
mirror_clip_sectors(s, sector_num, &io_sectors);
switch (mirror_method) {
case MIRROR_METHOD_COPY:
io_sectors = mirror_do_read(s, sector_num, io_sectors);
@@ -393,7 +410,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
}
assert(io_sectors);
sector_num += io_sectors;
nb_chunks -= io_sectors / sectors_per_chunk;
nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors);
}
return delay_ns;
@@ -451,7 +468,7 @@ static void mirror_exit(BlockJob *job, void *opaque)
/* This was checked in mirror_start_job(), but meanwhile one of the
* nodes could have been newly attached to a BlockBackend. */
if (to_replace->blk && s->target->blk) {
if (bdrv_has_blk(to_replace) && bdrv_has_blk(s->target)) {
error_report("block job: Can't create node with two BlockBackends");
data->ret = -EINVAL;
goto out;
@@ -478,6 +495,9 @@ out:
block_job_completed(&s->common, data->ret);
g_free(data);
bdrv_drained_end(src);
if (qemu_get_aio_context() == bdrv_get_aio_context(src)) {
aio_enable_external(iohandler_get_aio_context());
}
bdrv_unref(src);
}
@@ -690,15 +710,18 @@ immediate_exit:
g_free(s->cow_bitmap);
g_free(s->in_flight_bitmap);
bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
if (s->target->blk) {
blk_iostatus_disable(s->target->blk);
}
data = g_malloc(sizeof(*data));
data->ret = ret;
/* Before we switch to target in mirror_exit, make sure data doesn't
* change. */
bdrv_drained_begin(s->common.bs);
if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
/* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
* above bdrv_drained_end isn't enough to quiesce it. This is ugly, we
* need a block layer API change to achieve this. */
aio_disable_external(iohandler_get_aio_context());
}
block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}
@@ -713,15 +736,6 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}
static void mirror_iostatus_reset(BlockJob *job)
{
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
if (s->target->blk) {
blk_iostatus_reset(s->target->blk);
}
}
static void mirror_complete(BlockJob *job, Error **errp)
{
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
@@ -767,7 +781,6 @@ static const BlockJobDriver mirror_job_driver = {
.instance_size = sizeof(MirrorBlockJob),
.job_type = BLOCK_JOB_TYPE_MIRROR,
.set_speed = mirror_set_speed,
.iostatus_reset= mirror_iostatus_reset,
.complete = mirror_complete,
};
@@ -775,8 +788,6 @@ static const BlockJobDriver commit_active_job_driver = {
.instance_size = sizeof(MirrorBlockJob),
.job_type = BLOCK_JOB_TYPE_COMMIT,
.set_speed = mirror_set_speed,
.iostatus_reset
= mirror_iostatus_reset,
.complete = mirror_complete,
};
@@ -801,13 +812,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
assert ((granularity & (granularity - 1)) == 0);
if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
(!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
return;
}
if (buf_size < 0) {
error_setg(errp, "Invalid parameter 'buf-size'");
return;
@@ -827,7 +831,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
} else {
replaced_bs = bs;
}
if (replaced_bs->blk && target->blk) {
if (bdrv_has_blk(replaced_bs) && bdrv_has_blk(target)) {
error_setg(errp, "Can't create node with two BlockBackends");
return;
}
@@ -856,10 +860,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
bdrv_op_block_all(s->target, s->common.blocker);
if (s->target->blk) {
blk_set_on_error(s->target->blk, on_target_error, on_target_error);
blk_iostatus_enable(s->target->blk);
}
s->common.co = qemu_coroutine_create(mirror_run);
trace_mirror_start(bs, s, s->common.co, opaque);
qemu_coroutine_enter(s->common.co, s);


@@ -243,15 +243,15 @@ static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov,
int offset, int *flags)
int offset, int flags)
{
NbdClientSession *client = nbd_get_client_session(bs);
struct nbd_request request = { .type = NBD_CMD_WRITE };
struct nbd_reply reply;
ssize_t ret;
if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) {
*flags &= ~BDRV_REQ_FUA;
if (flags & BDRV_REQ_FUA) {
assert(client->nbdflags & NBD_FLAG_SEND_FUA);
request.type |= NBD_CMD_FLAG_FUA;
}
@@ -291,7 +291,7 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
}
int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov, int *flags)
int nb_sectors, QEMUIOVector *qiov, int flags)
{
int offset = 0;
int ret;
@@ -414,6 +414,9 @@ int nbd_client_init(BlockDriverState *bs,
logout("Failed to negotiate with the NBD server\n");
return ret;
}
if (client->nbdflags & NBD_FLAG_SEND_FUA) {
bs->supported_write_flags = BDRV_REQ_FUA;
}
qemu_co_mutex_init(&client->send_mutex);
qemu_co_mutex_init(&client->free_sema);


@@ -48,7 +48,7 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
int nb_sectors);
int nbd_client_co_flush(BlockDriverState *bs);
int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov, int *flags);
int nb_sectors, QEMUIOVector *qiov, int flags);
int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov);


@@ -355,31 +355,6 @@ static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov);
}
static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov, int flags)
{
int ret;
ret = nbd_client_co_writev(bs, sector_num, nb_sectors, qiov, &flags);
if (ret < 0) {
return ret;
}
/* The flag wasn't sent to the server, so we need to emulate it with an
* explicit flush */
if (flags & BDRV_REQ_FUA) {
ret = nbd_client_co_flush(bs);
}
return ret;
}
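
The comment in the removed wrapper describes a general fallback: when the transport cannot express "force unit access", the write is followed by an explicit flush so the data is durable before the request completes. Here is a minimal stand-alone sketch of that fallback; the helpers below are dummies, not the NBD client API.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static int backend_write(const void *buf, size_t len)
{
    (void)buf;
    (void)len;
    return 0;                       /* pretend the write succeeded */
}

static int backend_flush(void)
{
    return 0;                       /* pretend the flush succeeded */
}

/* Perform a write; if the caller asked for FUA and the backend cannot
 * express it natively, emulate it with an explicit flush afterwards. */
static int write_maybe_fua(const void *buf, size_t len,
                           bool want_fua, bool backend_has_fua)
{
    int ret = backend_write(buf, len);
    if (ret < 0) {
        return ret;
    }
    if (want_fua && !backend_has_fua) {
        ret = backend_flush();
    }
    return ret;
}

int main(void)
{
    printf("result: %d\n", write_maybe_fua("x", 1, true, false));
    return 0;
}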
static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov)
{
return nbd_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0);
}
static int nbd_co_flush(BlockDriverState *bs)
{
return nbd_client_co_flush(bs);
@@ -476,9 +451,7 @@ static BlockDriver bdrv_nbd = {
.bdrv_parse_filename = nbd_parse_filename,
.bdrv_file_open = nbd_open,
.bdrv_co_readv = nbd_co_readv,
.bdrv_co_writev = nbd_co_writev,
.bdrv_co_writev_flags = nbd_co_writev_flags,
.supported_write_flags = BDRV_REQ_FUA,
.bdrv_co_writev_flags = nbd_client_co_writev,
.bdrv_close = nbd_close,
.bdrv_co_flush_to_os = nbd_co_flush,
.bdrv_co_discard = nbd_co_discard,
@@ -496,9 +469,7 @@ static BlockDriver bdrv_nbd_tcp = {
.bdrv_parse_filename = nbd_parse_filename,
.bdrv_file_open = nbd_open,
.bdrv_co_readv = nbd_co_readv,
.bdrv_co_writev = nbd_co_writev,
.bdrv_co_writev_flags = nbd_co_writev_flags,
.supported_write_flags = BDRV_REQ_FUA,
.bdrv_co_writev_flags = nbd_client_co_writev,
.bdrv_close = nbd_close,
.bdrv_co_flush_to_os = nbd_co_flush,
.bdrv_co_discard = nbd_co_discard,
@@ -516,9 +487,7 @@ static BlockDriver bdrv_nbd_unix = {
.bdrv_parse_filename = nbd_parse_filename,
.bdrv_file_open = nbd_open,
.bdrv_co_readv = nbd_co_readv,
.bdrv_co_writev = nbd_co_writev,
.bdrv_co_writev_flags = nbd_co_writev_flags,
.supported_write_flags = BDRV_REQ_FUA,
.bdrv_co_writev_flags = nbd_client_co_writev,
.bdrv_close = nbd_close,
.bdrv_co_flush_to_os = nbd_co_flush,
.bdrv_co_discard = nbd_co_discard,


@@ -33,6 +33,7 @@
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qemu/module.h"
#include "qemu/bswap.h"
#include "qemu/bitmap.h"
#include "qapi/util.h"
@@ -512,11 +513,12 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
memset(tmp, 0, sizeof(tmp));
memcpy(tmp, &header, sizeof(header));
ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE);
ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE, 0);
if (ret < 0) {
goto exit;
}
ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0);
ret = blk_write_zeroes(file, BDRV_SECTOR_SIZE,
(bat_sectors - 1) << BDRV_SECTOR_BITS, 0);
if (ret < 0) {
goto exit;
}


@@ -67,10 +67,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
info->backing_file_depth = bdrv_get_backing_file_depth(bs);
info->detect_zeroes = bs->detect_zeroes;
if (bs->throttle_state) {
if (blk && blk_get_public(blk)->throttle_state) {
ThrottleConfig cfg;
throttle_group_get_config(bs, &cfg);
throttle_group_get_config(blk, &cfg);
info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg;
@@ -118,7 +118,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
info->iops_size = cfg.op_size;
info->has_group = true;
info->group = g_strdup(throttle_group_get_name(bs));
info->group = g_strdup(throttle_group_get_name(blk));
}
info->write_threshold = bdrv_write_threshold_get(bs);


@@ -28,6 +28,7 @@
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qemu/module.h"
#include "qemu/bswap.h"
#include <zlib.h>
#include "qapi/qmp/qerror.h"
#include "crypto/cipher.h"
@@ -853,14 +854,14 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
}
/* write all the data */
ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header));
ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header), 0);
if (ret != sizeof(header)) {
goto exit;
}
if (backing_file) {
ret = blk_pwrite(qcow_blk, sizeof(header),
backing_file, backing_filename_len);
backing_file, backing_filename_len, 0);
if (ret != backing_filename_len) {
goto exit;
}
@@ -869,8 +870,8 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
tmp = g_malloc0(BDRV_SECTOR_SIZE);
for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
BDRV_SECTOR_SIZE); i++) {
ret = blk_pwrite(qcow_blk, header_size +
BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i,
tmp, BDRV_SECTOR_SIZE, 0);
if (ret != BDRV_SECTOR_SIZE) {
g_free(tmp);
goto exit;


@@ -29,6 +29,7 @@
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/bswap.h"
#include "trace.h"
int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,


@@ -28,6 +28,7 @@
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,


@@ -26,6 +26,7 @@
#include "qapi/error.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/bswap.h"
#include "qemu/error-report.h"
#include "qemu/cutils.h"


@@ -36,6 +36,7 @@
#include "trace.h"
#include "qemu/option_int.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
/*
Differences with QCOW:
@@ -1757,13 +1758,6 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)
qcow2_close(bs);
bdrv_invalidate_cache(bs->file->bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
bs->drv = NULL;
return;
}
memset(s, 0, sizeof(BDRVQcow2State));
options = qdict_clone_shallow(bs->options);
@@ -2207,7 +2201,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
}
ret = blk_pwrite(blk, 0, header, cluster_size);
ret = blk_pwrite(blk, 0, header, cluster_size, 0);
g_free(header);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not write qcow2 header");
@@ -2217,7 +2211,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
/* Write a refcount table with one refcount block */
refcount_table = g_malloc0(2 * cluster_size);
refcount_table[0] = cpu_to_be64(2 * cluster_size);
ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size);
ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0);
g_free(refcount_table);
if (ret < 0) {
@@ -2411,21 +2405,75 @@ finish:
return ret;
}
static bool is_zero_cluster(BlockDriverState *bs, int64_t start)
{
BDRVQcow2State *s = bs->opaque;
int nr;
BlockDriverState *file;
int64_t res = bdrv_get_block_status_above(bs, NULL, start,
s->cluster_sectors, &nr, &file);
return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == s->cluster_sectors;
}
static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start)
{
BDRVQcow2State *s = bs->opaque;
int nr = s->cluster_sectors;
uint64_t off;
int ret;
ret = qcow2_get_cluster_offset(bs, start << BDRV_SECTOR_BITS, &nr, &off);
assert(nr == s->cluster_sectors);
return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO;
}
static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
int ret;
BDRVQcow2State *s = bs->opaque;
/* Emulate misaligned zero writes */
if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
return -ENOTSUP;
int head = sector_num % s->cluster_sectors;
int tail = (sector_num + nb_sectors) % s->cluster_sectors;
if (head != 0 || tail != 0) {
int64_t cl_end = -1;
sector_num -= head;
nb_sectors += head;
if (tail != 0) {
nb_sectors += s->cluster_sectors - tail;
}
if (!is_zero_cluster(bs, sector_num)) {
return -ENOTSUP;
}
if (nb_sectors > s->cluster_sectors) {
/* Technically the request can cover 2 clusters, e.g. a 4k write
at s->cluster_sectors - 2k offset. One of these clusters can
be zeroed, the other unallocated. */
cl_end = sector_num + nb_sectors - s->cluster_sectors;
if (!is_zero_cluster(bs, cl_end)) {
return -ENOTSUP;
}
}
qemu_co_mutex_lock(&s->lock);
/* We can have new write after previous check */
if (!is_zero_cluster_top_locked(bs, sector_num) ||
(cl_end > 0 && !is_zero_cluster_top_locked(bs, cl_end))) {
qemu_co_mutex_unlock(&s->lock);
return -ENOTSUP;
}
} else {
qemu_co_mutex_lock(&s->lock);
}
/* Whatever is left can use real zero clusters */
qemu_co_mutex_lock(&s->lock);
ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
nb_sectors);
ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
qemu_co_mutex_unlock(&s->lock);
return ret;
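
To make the head/tail arithmetic above concrete, here is a tiny stand-alone calculation, assuming cluster_sectors = 8 (4 KiB clusters with 512-byte sectors); the values are made up. The expanded request ends up covering two clusters, which is exactly why the code also has to check a second cluster via cl_end.

#include <stdio.h>

int main(void)
{
    long cluster_sectors = 8;
    long sector_num = 6, nb_sectors = 4;   /* 2 KiB write crossing a boundary */

    long head = sector_num % cluster_sectors;
    long tail = (sector_num + nb_sectors) % cluster_sectors;

    if (head != 0) {
        sector_num -= head;                /* round the start down */
        nb_sectors += head;
    }
    if (tail != 0) {
        nb_sectors += cluster_sectors - tail;   /* round the end up */
    }
    printf("expanded request: sector_num=%ld nb_sectors=%ld (%ld clusters)\n",
           sector_num, nb_sectors, nb_sectors / cluster_sectors);
    return 0;
}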


@@ -16,6 +16,7 @@
#include "trace.h"
#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
#include "qed.h"
#include "qemu/bswap.h"
typedef struct {
GenericCB gencb;


@@ -15,6 +15,7 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/timer.h"
#include "qemu/bswap.h"
#include "trace.h"
#include "qed.h"
#include "qapi/qmp/qerror.h"
@@ -601,18 +602,18 @@ static int qed_create(const char *filename, uint32_t cluster_size,
}
qed_header_cpu_to_le(&header, &le_header);
ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header));
ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0);
if (ret < 0) {
goto out;
}
ret = blk_pwrite(blk, sizeof(le_header), backing_file,
header.backing_filename_size);
header.backing_filename_size, 0);
if (ret < 0) {
goto out;
}
l1_table = g_malloc0(l1_size);
ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size);
ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0);
if (ret < 0) {
goto out;
}
@@ -1594,12 +1595,6 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)
bdrv_qed_close(bs);
bdrv_invalidate_cache(bs->file->bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
memset(s, 0, sizeof(BDRVQEDState));
ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err);
if (local_err) {


@@ -14,6 +14,7 @@
*/
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "block/block_int.h"
#include "qapi/qmp/qbool.h"
#include "qapi/qmp/qdict.h"
@@ -67,6 +68,9 @@ typedef struct QuorumVotes {
typedef struct BDRVQuorumState {
BdrvChild **children; /* children BlockDriverStates */
int num_children; /* children count */
unsigned next_child_index; /* the index of the next child that should
* be added
*/
int threshold; /* if fewer than threshold children's reads gave the
* same result, a quorum error occurs.
*/
@@ -747,21 +751,6 @@ static int64_t quorum_getlength(BlockDriverState *bs)
return result;
}
static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp)
{
BDRVQuorumState *s = bs->opaque;
Error *local_err = NULL;
int i;
for (i = 0; i < s->num_children; i++) {
bdrv_invalidate_cache(s->children[i]->bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
}
}
static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
{
BDRVQuorumState *s = bs->opaque;
@@ -898,9 +887,9 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
ret = -EINVAL;
goto exit;
}
if (s->num_children < 2) {
if (s->num_children < 1) {
error_setg(&local_err,
"Number of provided children must be greater than 1");
"Number of provided children must be 1 or more");
ret = -EINVAL;
goto exit;
}
@@ -964,6 +953,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
opened[i] = true;
}
s->next_child_index = s->num_children;
g_free(opened);
goto exit;
@@ -999,25 +989,70 @@ static void quorum_close(BlockDriverState *bs)
g_free(s->children);
}
static void quorum_detach_aio_context(BlockDriverState *bs)
static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
Error **errp)
{
BDRVQuorumState *s = bs->opaque;
int i;
BdrvChild *child;
char indexstr[32];
int ret;
for (i = 0; i < s->num_children; i++) {
bdrv_detach_aio_context(s->children[i]->bs);
assert(s->num_children <= INT_MAX / sizeof(BdrvChild *));
if (s->num_children == INT_MAX / sizeof(BdrvChild *) ||
s->next_child_index == UINT_MAX) {
error_setg(errp, "Too many children");
return;
}
ret = snprintf(indexstr, 32, "children.%u", s->next_child_index);
if (ret < 0 || ret >= 32) {
error_setg(errp, "cannot generate child name");
return;
}
s->next_child_index++;
bdrv_drained_begin(bs);
/* We can safely add the child now */
bdrv_ref(child_bs);
child = bdrv_attach_child(bs, child_bs, indexstr, &child_format);
s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
s->children[s->num_children++] = child;
bdrv_drained_end(bs);
}
static void quorum_attach_aio_context(BlockDriverState *bs,
AioContext *new_context)
static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
Error **errp)
{
BDRVQuorumState *s = bs->opaque;
int i;
for (i = 0; i < s->num_children; i++) {
bdrv_attach_aio_context(s->children[i]->bs, new_context);
if (s->children[i] == child) {
break;
}
}
/* we have checked it in bdrv_del_child() */
assert(i < s->num_children);
if (s->num_children <= s->threshold) {
error_setg(errp,
"The number of children cannot be lower than the vote threshold %d",
s->threshold);
return;
}
bdrv_drained_begin(bs);
/* We can safely remove this child now */
memmove(&s->children[i], &s->children[i + 1],
(s->num_children - i - 1) * sizeof(BdrvChild *));
s->children = g_renew(BdrvChild *, s->children, --s->num_children);
bdrv_unref_child(bs, child);
bdrv_drained_end(bs);
}
static void quorum_refresh_filename(BlockDriverState *bs, QDict *options)
@@ -1070,10 +1105,9 @@ static BlockDriver bdrv_quorum = {
.bdrv_aio_readv = quorum_aio_readv,
.bdrv_aio_writev = quorum_aio_writev,
.bdrv_invalidate_cache = quorum_invalidate_cache,
.bdrv_detach_aio_context = quorum_detach_aio_context,
.bdrv_attach_aio_context = quorum_attach_aio_context,
.bdrv_add_child = quorum_add_child,
.bdrv_del_child = quorum_del_child,
.is_filter = true,
.bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter,


@@ -35,15 +35,16 @@
/* linux-aio.c - Linux native implementation */
#ifdef CONFIG_LINUX_AIO
void *laio_init(void);
void laio_cleanup(void *s);
BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
typedef struct LinuxAioState LinuxAioState;
LinuxAioState *laio_init(void);
void laio_cleanup(LinuxAioState *s);
BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
BlockCompletionFunc *cb, void *opaque, int type);
void laio_detach_aio_context(void *s, AioContext *old_context);
void laio_attach_aio_context(void *s, AioContext *new_context);
void laio_io_plug(BlockDriverState *bs, void *aio_ctx);
void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug);
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
void laio_io_plug(BlockDriverState *bs, LinuxAioState *s);
void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s);
#endif
#ifdef _WIN32


@@ -139,7 +139,7 @@ typedef struct BDRVRawState {
#ifdef CONFIG_LINUX_AIO
int use_aio;
void *aio_ctx;
LinuxAioState *aio_ctx;
#endif
#ifdef CONFIG_XFS
bool is_xfs:1;
@@ -398,7 +398,7 @@ static void raw_attach_aio_context(BlockDriverState *bs,
}
#ifdef CONFIG_LINUX_AIO
static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags)
{
int ret = -1;
assert(aio_ctx != NULL);
@@ -517,6 +517,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
s->has_discard = true;
s->has_write_zeroes = true;
bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
s->needs_alignment = true;
}
@@ -1345,17 +1346,7 @@ static void raw_aio_unplug(BlockDriverState *bs)
#ifdef CONFIG_LINUX_AIO
BDRVRawState *s = bs->opaque;
if (s->use_aio) {
laio_io_unplug(bs, s->aio_ctx, true);
}
#endif
}
static void raw_aio_flush_io_queue(BlockDriverState *bs)
{
#ifdef CONFIG_LINUX_AIO
BDRVRawState *s = bs->opaque;
if (s->use_aio) {
laio_io_unplug(bs, s->aio_ctx, false);
laio_io_unplug(bs, s->aio_ctx);
}
#endif
}
@@ -1949,7 +1940,6 @@ BlockDriver bdrv_file = {
.bdrv_refresh_limits = raw_refresh_limits,
.bdrv_io_plug = raw_aio_plug,
.bdrv_io_unplug = raw_aio_unplug,
.bdrv_flush_io_queue = raw_aio_flush_io_queue,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
@@ -2398,7 +2388,6 @@ static BlockDriver bdrv_host_device = {
.bdrv_refresh_limits = raw_refresh_limits,
.bdrv_io_plug = raw_aio_plug,
.bdrv_io_unplug = raw_aio_unplug,
.bdrv_flush_io_queue = raw_aio_flush_io_queue,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
@@ -2528,7 +2517,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_refresh_limits = raw_refresh_limits,
.bdrv_io_plug = raw_aio_plug,
.bdrv_io_unplug = raw_aio_unplug,
.bdrv_flush_io_queue = raw_aio_flush_io_queue,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
@@ -2664,7 +2652,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_refresh_limits = raw_refresh_limits,
.bdrv_io_plug = raw_aio_plug,
.bdrv_io_unplug = raw_aio_unplug,
.bdrv_flush_io_queue = raw_aio_flush_io_queue,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,


@@ -105,8 +105,8 @@ raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
}
BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE,
nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);
ret = bdrv_co_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE,
nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);
fail:
if (qiov == &local_qiov) {
@@ -116,13 +116,6 @@ fail:
return ret;
}
static int coroutine_fn
raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
QEMUIOVector *qiov)
{
return raw_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0);
}
static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
int64_t sector_num,
int nb_sectors, int *pnum,
@@ -211,6 +204,8 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
bs->sg = bs->file->bs->sg;
bs->supported_write_flags = BDRV_REQ_FUA;
bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP;
if (bs->probed && !bdrv_is_read_only(bs)) {
fprintf(stderr,
@@ -256,9 +251,7 @@ BlockDriver bdrv_raw = {
.bdrv_close = &raw_close,
.bdrv_create = &raw_create,
.bdrv_co_readv = &raw_co_readv,
.bdrv_co_writev = &raw_co_writev,
.bdrv_co_writev_flags = &raw_co_writev_flags,
.supported_write_flags = BDRV_REQ_FUA,
.bdrv_co_write_zeroes = &raw_co_write_zeroes,
.bdrv_co_discard = &raw_co_discard,
.bdrv_co_get_block_status = &raw_co_get_block_status,


@@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode)
#undef DPRINTF
#ifdef DEBUG_SDOG
#define DPRINTF(fmt, args...) \
do { \
fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
} while (0)
#define DEBUG_SDOG_PRINT 1
#else
#define DPRINTF(fmt, args...)
#define DEBUG_SDOG_PRINT 0
#endif
#define DPRINTF(fmt, args...) \
do { \
if (DEBUG_SDOG_PRINT) { \
fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
} \
} while (0)
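
The macro rewrite above follows a common C pattern: instead of compiling the debug printf out entirely, the call is routed through an always-compiled if on a constant, so the format string and arguments keep being type-checked while the compiler can still eliminate the call. A self-contained version of the idea is sketched below; DEBUG_EXAMPLE and DPRINT are invented names, not the sheepdog ones.

#include <stdio.h>

#ifndef DEBUG_EXAMPLE
#define DEBUG_EXAMPLE 0
#endif

#define DPRINT(fmt, args...)                                        \
    do {                                                            \
        if (DEBUG_EXAMPLE) {                                        \
            fprintf(stderr, "%s %d: " fmt, __func__, __LINE__,      \
                    ##args);                                        \
        }                                                           \
    } while (0)

int main(void)
{
    DPRINT("value is %d\n", 42);    /* vanishes unless DEBUG_EXAMPLE is 1 */
    printf("done\n");
    return 0;
}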
typedef struct SheepdogAIOCB SheepdogAIOCB;
@@ -1678,7 +1681,7 @@ static int sd_prealloc(const char *filename, Error **errp)
if (ret < 0) {
goto out;
}
ret = blk_pwrite(blk, idx * buf_size, buf, buf_size);
ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0);
if (ret < 0) {
goto out;
}


@@ -373,9 +373,10 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
{
bool ok = true;
BlockDriverState *bs = NULL;
BlockDriverState *bs;
BdrvNextIterator *it = NULL;
while (ok && (bs = bdrv_next(bs))) {
while (ok && (it = bdrv_next(it, &bs))) {
AioContext *ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
@@ -393,10 +394,11 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
Error **err)
{
int ret = 0;
BlockDriverState *bs = NULL;
BlockDriverState *bs;
BdrvNextIterator *it = NULL;
QEMUSnapshotInfo sn1, *snapshot = &sn1;
while (ret == 0 && (bs = bdrv_next(bs))) {
while (ret == 0 && (it = bdrv_next(it, &bs))) {
AioContext *ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
@@ -415,9 +417,10 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
{
int err = 0;
BlockDriverState *bs = NULL;
BlockDriverState *bs;
BdrvNextIterator *it = NULL;
while (err == 0 && (bs = bdrv_next(bs))) {
while (err == 0 && (it = bdrv_next(it, &bs))) {
AioContext *ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
@@ -435,9 +438,10 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
{
QEMUSnapshotInfo sn;
int err = 0;
BlockDriverState *bs = NULL;
BlockDriverState *bs;
BdrvNextIterator *it = NULL;
while (err == 0 && (bs = bdrv_next(bs))) {
while (err == 0 && (it = bdrv_next(it, &bs))) {
AioContext *ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
@@ -457,9 +461,10 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
BlockDriverState **first_bad_bs)
{
int err = 0;
BlockDriverState *bs = NULL;
BlockDriverState *bs;
BdrvNextIterator *it = NULL;
while (err == 0 && (bs = bdrv_next(bs))) {
while (err == 0 && (it = bdrv_next(it, &bs))) {
AioContext *ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
@@ -480,9 +485,10 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
BlockDriverState *bdrv_all_find_vmstate_bs(void)
{
bool not_found = true;
BlockDriverState *bs = NULL;
BlockDriverState *bs;
BdrvNextIterator *it = NULL;
while (not_found && (bs = bdrv_next(bs))) {
while (not_found && (it = bdrv_next(it, &bs))) {
AioContext *ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);


@@ -163,8 +163,7 @@ wait:
}
if (ret < 0) {
BlockErrorAction action =
block_job_error_action(&s->common, s->common.bs, s->on_error,
true, -ret);
block_job_error_action(&s->common, s->on_error, true, -ret);
if (action == BLOCK_ERROR_ACTION_STOP) {
n = 0;
continue;
@@ -224,13 +223,6 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
{
StreamBlockJob *s;
if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
(!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
error_setg(errp, QERR_INVALID_PARAMETER, "on-error");
return;
}
s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp);
if (!s) {
return;


@@ -23,13 +23,14 @@
*/
#include "qemu/osdep.h"
#include "sysemu/block-backend.h"
#include "block/throttle-groups.h"
#include "qemu/queue.h"
#include "qemu/thread.h"
#include "sysemu/qtest.h"
/* The ThrottleGroup structure (with its ThrottleState) is shared
* among different BlockDriverState and it's independent from
* among different BlockBackends and it's independent from
* AioContext, so in order to use it from different threads it needs
* its own locking.
*
@@ -39,26 +40,26 @@
* The whole ThrottleGroup structure is private and invisible to
* outside users, that only use it through its ThrottleState.
*
* In addition to the ThrottleGroup structure, BlockDriverState has
* In addition to the ThrottleGroup structure, BlockBackendPublic has
* fields that need to be accessed by other members of the group and
* therefore also need to be protected by this lock. Once a BDS is
* registered in a group those fields can be accessed by other threads
* any time.
* therefore also need to be protected by this lock. Once a
* BlockBackend is registered in a group those fields can be accessed
* by other threads any time.
*
* Again, all this is handled internally and is mostly transparent to
* the outside. The 'throttle_timers' field however has an additional
* constraint because it may be temporarily invalid (see for example
* bdrv_set_aio_context()). Therefore in this file a thread will
* access some other BDS's timers only after verifying that that BDS
* has throttled requests in the queue.
* access some other BlockBackend's timers only after verifying that
* that BlockBackend has throttled requests in the queue.
*/
typedef struct ThrottleGroup {
char *name; /* This is constant during the lifetime of the group */
QemuMutex lock; /* This lock protects the following four fields */
ThrottleState ts;
QLIST_HEAD(, BlockDriverState) head;
BlockDriverState *tokens[2];
QLIST_HEAD(, BlockBackendPublic) head;
BlockBackend *tokens[2];
bool any_timer_armed[2];
/* These two are protected by the global throttle_groups_lock */
@@ -132,93 +133,98 @@ void throttle_group_unref(ThrottleState *ts)
qemu_mutex_unlock(&throttle_groups_lock);
}
/* Get the name from a BlockDriverState's ThrottleGroup. The name (and
* the pointer) is guaranteed to remain constant during the lifetime
* of the group.
/* Get the name from a BlockBackend's ThrottleGroup. The name (and the pointer)
* is guaranteed to remain constant during the lifetime of the group.
*
* @bs: a BlockDriverState that is member of a throttling group
* @blk: a BlockBackend that is member of a throttling group
* @ret: the name of the group.
*/
const char *throttle_group_get_name(BlockDriverState *bs)
const char *throttle_group_get_name(BlockBackend *blk)
{
ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
return tg->name;
}
/* Return the next BlockDriverState in the round-robin sequence,
* simulating a circular list.
/* Return the next BlockBackend in the round-robin sequence, simulating a
* circular list.
*
* This assumes that tg->lock is held.
*
* @bs: the current BlockDriverState
* @ret: the next BlockDriverState in the sequence
* @blk: the current BlockBackend
* @ret: the next BlockBackend in the sequence
*/
static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs)
static BlockBackend *throttle_group_next_blk(BlockBackend *blk)
{
ThrottleState *ts = bs->throttle_state;
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleState *ts = blkp->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
BlockDriverState *next = QLIST_NEXT(bs, round_robin);
BlockBackendPublic *next = QLIST_NEXT(blkp, round_robin);
if (!next) {
return QLIST_FIRST(&tg->head);
next = QLIST_FIRST(&tg->head);
}
return next;
return blk_by_public(next);
}
/* Return the next BlockDriverState in the round-robin sequence with
* pending I/O requests.
/* Return the next BlockBackend in the round-robin sequence with pending I/O
* requests.
*
* This assumes that tg->lock is held.
*
* @bs: the current BlockDriverState
* @blk: the current BlockBackend
* @is_write: the type of operation (read/write)
* @ret: the next BlockDriverState with pending requests, or bs
* if there is none.
* @ret: the next BlockBackend with pending requests, or blk if there is
* none.
*/
static BlockDriverState *next_throttle_token(BlockDriverState *bs,
bool is_write)
static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)
{
ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
BlockDriverState *token, *start;
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
BlockBackend *token, *start;
start = token = tg->tokens[is_write];
/* get next bs round in round robin style */
token = throttle_group_next_bs(token);
while (token != start && !token->pending_reqs[is_write]) {
token = throttle_group_next_bs(token);
token = throttle_group_next_blk(token);
while (token != start && !blkp->pending_reqs[is_write]) {
token = throttle_group_next_blk(token);
}
/* If no I/O is queued for scheduling on the next round-robin token,
* then make the current blk the token, because chances are that the
* current blk will get the current request queued.
*/
if (token == start && !token->pending_reqs[is_write]) {
token = bs;
if (token == start && !blkp->pending_reqs[is_write]) {
token = blk;
}
return token;
}
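
Below is a compact stand-alone model of the round-robin walk documented above: starting from the current token, advance around a circular list of members and stop at the first one with pending requests, falling back to the starting member if nobody is waiting. Generic arrays stand in for the BlockBackend list; nothing here is QEMU API. The idea of rotating a per-group token is to share the group's bandwidth among members rather than always serving the same one first.

#include <stdbool.h>
#include <stdio.h>

#define N_MEMBERS 4

/* Simulate the circular list: the next member after the last is the first. */
static int next_member(int current)
{
    return (current + 1) % N_MEMBERS;
}

static int next_token(int start, const bool pending[N_MEMBERS])
{
    int token = next_member(start);
    while (token != start && !pending[token]) {
        token = next_member(token);
    }
    if (token == start && !pending[token]) {
        token = start;   /* nobody is waiting; keep the current member */
    }
    return token;
}

int main(void)
{
    bool pending[N_MEMBERS] = { false, false, true, false };
    printf("next token after 0: %d\n", next_token(0, pending));   /* -> 2 */
    printf("next token after 3: %d\n", next_token(3, pending));   /* -> 2 */
    return 0;
}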
/* Check if the next I/O request for a BlockDriverState needs to be
* throttled or not. If there's no timer set in this group, set one
* and update the token accordingly.
/* Check if the next I/O request for a BlockBackend needs to be throttled or
* not. If there's no timer set in this group, set one and update the token
* accordingly.
*
* This assumes that tg->lock is held.
*
* @bs: the current BlockDriverState
* @blk: the current BlockBackend
* @is_write: the type of operation (read/write)
* @ret: whether the I/O request needs to be throttled or not
*/
static bool throttle_group_schedule_timer(BlockDriverState *bs,
bool is_write)
static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write)
{
ThrottleState *ts = bs->throttle_state;
ThrottleTimers *tt = &bs->throttle_timers;
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleState *ts = blkp->throttle_state;
ThrottleTimers *tt = &blkp->throttle_timers;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
bool must_wait;
if (blkp->io_limits_disabled) {
return false;
}
/* Check if any of the timers in this group is already armed */
if (tg->any_timer_armed[is_write]) {
return true;
@@ -226,9 +232,9 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs,
must_wait = throttle_schedule_timer(ts, tt, is_write);
/* If a timer just got armed, set bs as the current token */
/* If a timer just got armed, set blk as the current token */
if (must_wait) {
tg->tokens[is_write] = bs;
tg->tokens[is_write] = blk;
tg->any_timer_armed[is_write] = true;
}
@@ -239,18 +245,19 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs,
*
* This assumes that tg->lock is held.
*
* @bs: the current BlockDriverState
* @blk: the current BlockBackend
* @is_write: the type of operation (read/write)
*/
static void schedule_next_request(BlockDriverState *bs, bool is_write)
static void schedule_next_request(BlockBackend *blk, bool is_write)
{
ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
bool must_wait;
BlockDriverState *token;
BlockBackend *token;
/* Check if there's any pending request to schedule next */
token = next_throttle_token(bs, is_write);
if (!token->pending_reqs[is_write]) {
token = next_throttle_token(blk, is_write);
if (!blkp->pending_reqs[is_write]) {
return;
}
@@ -259,12 +266,12 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write)
/* If it doesn't have to wait, queue it for immediate execution */
if (!must_wait) {
/* Give preference to requests from the current bs */
/* Give preference to requests from the current blk */
if (qemu_in_coroutine() &&
qemu_co_queue_next(&bs->throttled_reqs[is_write])) {
token = bs;
qemu_co_queue_next(&blkp->throttled_reqs[is_write])) {
token = blk;
} else {
ThrottleTimers *tt = &token->throttle_timers;
ThrottleTimers *tt = &blkp->throttle_timers;
int64_t now = qemu_clock_get_ns(tt->clock_type);
timer_mod(tt->timers[is_write], now + 1);
tg->any_timer_armed[is_write] = true;
@@ -277,53 +284,67 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write)
* if necessary, and schedule the next request using a round robin
* algorithm.
*
* @bs: the current BlockDriverState
* @blk: the current BlockBackend
* @bytes: the number of bytes for this I/O
* @is_write: the type of operation (read/write)
*/
void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs,
void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
unsigned int bytes,
bool is_write)
{
bool must_wait;
BlockDriverState *token;
BlockBackend *token;
ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
qemu_mutex_lock(&tg->lock);
/* First we check if this I/O has to be throttled. */
token = next_throttle_token(bs, is_write);
token = next_throttle_token(blk, is_write);
must_wait = throttle_group_schedule_timer(token, is_write);
/* Wait if there's a timer set or queued requests of this type */
if (must_wait || bs->pending_reqs[is_write]) {
bs->pending_reqs[is_write]++;
if (must_wait || blkp->pending_reqs[is_write]) {
blkp->pending_reqs[is_write]++;
qemu_mutex_unlock(&tg->lock);
qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
qemu_mutex_lock(&tg->lock);
bs->pending_reqs[is_write]--;
blkp->pending_reqs[is_write]--;
}
/* The I/O will be executed, so do the accounting */
throttle_account(bs->throttle_state, is_write, bytes);
throttle_account(blkp->throttle_state, is_write, bytes);
/* Schedule the next request */
schedule_next_request(bs, is_write);
schedule_next_request(blk, is_write);
qemu_mutex_unlock(&tg->lock);
}
void throttle_group_restart_blk(BlockBackend *blk)
{
BlockBackendPublic *blkp = blk_get_public(blk);
int i;
for (i = 0; i < 2; i++) {
while (qemu_co_enter_next(&blkp->throttled_reqs[i])) {
;
}
}
}
/* Update the throttle configuration for a particular group. Similar
* to throttle_config(), but guarantees atomicity within the
* throttling group.
*
* @bs: a BlockDriverState that is member of the group
* @blk: a BlockBackend that is a member of the group
* @cfg: the configuration to set
*/
void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg)
{
ThrottleTimers *tt = &bs->throttle_timers;
ThrottleState *ts = bs->throttle_state;
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleTimers *tt = &blkp->throttle_timers;
ThrottleState *ts = blkp->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
qemu_mutex_lock(&tg->lock);
/* throttle_config() cancels the timers */
@@ -335,18 +356,22 @@ void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
}
throttle_config(ts, tt, cfg);
qemu_mutex_unlock(&tg->lock);
qemu_co_enter_next(&blkp->throttled_reqs[0]);
qemu_co_enter_next(&blkp->throttled_reqs[1]);
}
/* Get the throttle configuration from a particular group. Similar to
* throttle_get_config(), but guarantees atomicity within the
* throttling group.
*
* @bs: a BlockDriverState that is member of the group
* @blk: a BlockBackend that is a member of the group
* @cfg: the configuration will be written here
*/
void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg)
{
ThrottleState *ts = bs->throttle_state;
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleState *ts = blkp->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
qemu_mutex_lock(&tg->lock);
throttle_get_config(ts, cfg);
@@ -356,12 +381,13 @@ void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
/* ThrottleTimers callback. This wakes up a request that was waiting
* because it had been throttled.
*
* @bs: the BlockDriverState whose request had been throttled
* @blk: the BlockBackend whose request had been throttled
* @is_write: the type of operation (read/write)
*/
static void timer_cb(BlockDriverState *bs, bool is_write)
static void timer_cb(BlockBackend *blk, bool is_write)
{
ThrottleState *ts = bs->throttle_state;
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleState *ts = blkp->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
bool empty_queue;
@@ -371,13 +397,13 @@ static void timer_cb(BlockDriverState *bs, bool is_write)
qemu_mutex_unlock(&tg->lock);
/* Run the request that was waiting for this timer */
empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]);
empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);
/* If the request queue was empty then we have to take care of
* scheduling the next one */
if (empty_queue) {
qemu_mutex_lock(&tg->lock);
schedule_next_request(bs, is_write);
schedule_next_request(blk, is_write);
qemu_mutex_unlock(&tg->lock);
}
}
@@ -392,17 +418,17 @@ static void write_timer_cb(void *opaque)
timer_cb(opaque, true);
}
/* Register a BlockDriverState in the throttling group, also
* initializing its timers and updating its throttle_state pointer to
* point to it. If a throttling group with that name does not exist
* yet, it will be created.
/* Register a BlockBackend in the throttling group, also initializing its
* timers and updating its throttle_state pointer to point to it. If a
* throttling group with that name does not exist yet, it will be created.
*
* @bs: the BlockDriverState to insert
* @blk: the BlockBackend to insert
* @groupname: the name of the group
*/
void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
void throttle_group_register_blk(BlockBackend *blk, const char *groupname)
{
int i;
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleState *ts = throttle_group_incref(groupname);
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
int clock_type = QEMU_CLOCK_REALTIME;
@@ -412,67 +438,67 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
clock_type = QEMU_CLOCK_VIRTUAL;
}
bs->throttle_state = ts;
blkp->throttle_state = ts;
qemu_mutex_lock(&tg->lock);
/* If the ThrottleGroup is new set this BlockDriverState as the token */
/* If the ThrottleGroup is new set this BlockBackend as the token */
for (i = 0; i < 2; i++) {
if (!tg->tokens[i]) {
tg->tokens[i] = bs;
tg->tokens[i] = blk;
}
}
QLIST_INSERT_HEAD(&tg->head, bs, round_robin);
QLIST_INSERT_HEAD(&tg->head, blkp, round_robin);
throttle_timers_init(&bs->throttle_timers,
bdrv_get_aio_context(bs),
throttle_timers_init(&blkp->throttle_timers,
blk_get_aio_context(blk),
clock_type,
read_timer_cb,
write_timer_cb,
bs);
blk);
qemu_mutex_unlock(&tg->lock);
}
/* Unregister a BlockDriverState from its group, removing it from the
* list, destroying the timers and setting the throttle_state pointer
* to NULL.
/* Unregister a BlockBackend from its group, removing it from the list,
* destroying the timers and setting the throttle_state pointer to NULL.
*
* The BlockDriverState must not have pending throttled requests, so
* the caller has to drain them first.
* The BlockBackend must not have pending throttled requests, so the caller has
* to drain them first.
*
* The group will be destroyed if it's empty after this operation.
*
* @bs: the BlockDriverState to remove
* @blk: the BlockBackend to remove
*/
void throttle_group_unregister_bs(BlockDriverState *bs)
void throttle_group_unregister_blk(BlockBackend *blk)
{
ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
BlockBackendPublic *blkp = blk_get_public(blk);
ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
int i;
assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0);
assert(qemu_co_queue_empty(&bs->throttled_reqs[0]));
assert(qemu_co_queue_empty(&bs->throttled_reqs[1]));
assert(blkp->pending_reqs[0] == 0 && blkp->pending_reqs[1] == 0);
assert(qemu_co_queue_empty(&blkp->throttled_reqs[0]));
assert(qemu_co_queue_empty(&blkp->throttled_reqs[1]));
qemu_mutex_lock(&tg->lock);
for (i = 0; i < 2; i++) {
if (tg->tokens[i] == bs) {
BlockDriverState *token = throttle_group_next_bs(bs);
/* Take care of the case where this is the last bs in the group */
if (token == bs) {
if (tg->tokens[i] == blk) {
BlockBackend *token = throttle_group_next_blk(blk);
/* Take care of the case where this is the last blk in the group */
if (token == blk) {
token = NULL;
}
tg->tokens[i] = token;
}
}
/* remove the current bs from the list */
QLIST_REMOVE(bs, round_robin);
throttle_timers_destroy(&bs->throttle_timers);
/* remove the current blk from the list */
QLIST_REMOVE(blkp, round_robin);
throttle_timers_destroy(&blkp->throttle_timers);
qemu_mutex_unlock(&tg->lock);
throttle_group_unref(&tg->ts);
bs->throttle_state = NULL;
blkp->throttle_state = NULL;
}
static void throttle_groups_init(void)


@@ -54,6 +54,7 @@
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qemu/module.h"
#include "qemu/bswap.h"
#include "migration/migration.h"
#include "qemu/coroutine.h"
#include "qemu/cutils.h"
@@ -557,98 +558,109 @@ static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs,
return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
}
static int vdi_co_read(BlockDriverState *bs,
int64_t sector_num, uint8_t *buf, int nb_sectors)
static int coroutine_fn
vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVVdiState *s = bs->opaque;
QEMUIOVector local_qiov;
uint32_t bmap_entry;
uint32_t block_index;
uint32_t sector_in_block;
uint32_t n_sectors;
uint32_t offset_in_block;
uint32_t n_bytes;
uint64_t bytes_done = 0;
int ret = 0;
logout("\n");
while (ret >= 0 && nb_sectors > 0) {
block_index = sector_num / s->block_sectors;
sector_in_block = sector_num % s->block_sectors;
n_sectors = s->block_sectors - sector_in_block;
if (n_sectors > nb_sectors) {
n_sectors = nb_sectors;
}
qemu_iovec_init(&local_qiov, qiov->niov);
logout("will read %u sectors starting at sector %" PRIu64 "\n",
n_sectors, sector_num);
while (ret >= 0 && bytes > 0) {
block_index = offset / s->block_size;
offset_in_block = offset % s->block_size;
n_bytes = MIN(bytes, s->block_size - offset_in_block);
logout("will read %u bytes starting at offset %" PRIu64 "\n",
n_bytes, offset);
/* prepare next AIO request */
bmap_entry = le32_to_cpu(s->bmap[block_index]);
if (!VDI_IS_ALLOCATED(bmap_entry)) {
/* Block not allocated, return zeros, no need to wait. */
memset(buf, 0, n_sectors * SECTOR_SIZE);
qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
ret = 0;
} else {
uint64_t offset = s->header.offset_data / SECTOR_SIZE +
(uint64_t)bmap_entry * s->block_sectors +
sector_in_block;
ret = bdrv_read(bs->file->bs, offset, buf, n_sectors);
}
logout("%u sectors read\n", n_sectors);
uint64_t data_offset = s->header.offset_data +
(uint64_t)bmap_entry * s->block_size +
offset_in_block;
nb_sectors -= n_sectors;
sector_num += n_sectors;
buf += n_sectors * SECTOR_SIZE;
qemu_iovec_reset(&local_qiov);
qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
ret = bdrv_co_preadv(bs->file->bs, data_offset, n_bytes,
&local_qiov, 0);
}
logout("%u bytes read\n", n_bytes);
bytes -= n_bytes;
offset += n_bytes;
bytes_done += n_bytes;
}
qemu_iovec_destroy(&local_qiov);
return ret;
}
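
The rewritten read path above iterates a byte range in per-block pieces using offset / block_size and offset % block_size. Here is a quick stand-alone walk-through of that arithmetic with made-up sizes; the variable names mirror the loop above but nothing here is VDI- or QEMU-specific.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
    uint64_t block_size = 1 << 20;          /* 1 MiB blocks */
    uint64_t offset = (1 << 20) - 4096;     /* 4 KiB before a block boundary */
    uint64_t bytes = 3 * 4096;              /* a 12 KiB request */

    while (bytes > 0) {
        uint64_t block_index = offset / block_size;
        uint64_t offset_in_block = offset % block_size;
        uint64_t n_bytes = MIN(bytes, block_size - offset_in_block);

        printf("block %" PRIu64 ": %" PRIu64 " bytes at in-block offset %"
               PRIu64 "\n", block_index, n_bytes, offset_in_block);

        offset += n_bytes;
        bytes -= n_bytes;
    }
    return 0;
}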
static int vdi_co_write(BlockDriverState *bs,
int64_t sector_num, const uint8_t *buf, int nb_sectors)
static int coroutine_fn
vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVVdiState *s = bs->opaque;
QEMUIOVector local_qiov;
uint32_t bmap_entry;
uint32_t block_index;
uint32_t sector_in_block;
uint32_t n_sectors;
uint32_t offset_in_block;
uint32_t n_bytes;
uint32_t bmap_first = VDI_UNALLOCATED;
uint32_t bmap_last = VDI_UNALLOCATED;
uint8_t *block = NULL;
uint64_t bytes_done = 0;
int ret = 0;
logout("\n");
while (ret >= 0 && nb_sectors > 0) {
block_index = sector_num / s->block_sectors;
sector_in_block = sector_num % s->block_sectors;
n_sectors = s->block_sectors - sector_in_block;
if (n_sectors > nb_sectors) {
n_sectors = nb_sectors;
}
qemu_iovec_init(&local_qiov, qiov->niov);
logout("will write %u sectors starting at sector %" PRIu64 "\n",
n_sectors, sector_num);
while (ret >= 0 && bytes > 0) {
block_index = offset / s->block_size;
offset_in_block = offset % s->block_size;
n_bytes = MIN(bytes, s->block_size - offset_in_block);
logout("will write %u bytes starting at offset %" PRIu64 "\n",
n_bytes, offset);
/* prepare next AIO request */
bmap_entry = le32_to_cpu(s->bmap[block_index]);
if (!VDI_IS_ALLOCATED(bmap_entry)) {
/* Allocate new block and write to it. */
uint64_t offset;
uint64_t data_offset;
bmap_entry = s->header.blocks_allocated;
s->bmap[block_index] = cpu_to_le32(bmap_entry);
s->header.blocks_allocated++;
offset = s->header.offset_data / SECTOR_SIZE +
(uint64_t)bmap_entry * s->block_sectors;
data_offset = s->header.offset_data +
(uint64_t)bmap_entry * s->block_size;
if (block == NULL) {
block = g_malloc(s->block_size);
bmap_first = block_index;
}
bmap_last = block_index;
/* Copy data to be written to new block and zero unused parts. */
memset(block, 0, sector_in_block * SECTOR_SIZE);
memcpy(block + sector_in_block * SECTOR_SIZE,
buf, n_sectors * SECTOR_SIZE);
memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
(s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
memset(block, 0, offset_in_block);
qemu_iovec_to_buf(qiov, bytes_done, block + offset_in_block,
n_bytes);
memset(block + offset_in_block + n_bytes, 0,
s->block_size - n_bytes - offset_in_block);
/* Note that this coroutine does not yield anywhere from reading the
* bmap entry until here, so in regards to all the coroutines trying
@@ -658,12 +670,12 @@ static int vdi_co_write(BlockDriverState *bs,
* acquire the lock and thus the padded cluster is written before
* the other coroutines can write to the affected area. */
qemu_co_mutex_lock(&s->write_lock);
ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors);
ret = bdrv_pwrite(bs->file->bs, data_offset, block, s->block_size);
qemu_co_mutex_unlock(&s->write_lock);
} else {
uint64_t offset = s->header.offset_data / SECTOR_SIZE +
(uint64_t)bmap_entry * s->block_sectors +
sector_in_block;
uint64_t data_offset = s->header.offset_data +
(uint64_t)bmap_entry * s->block_size +
offset_in_block;
qemu_co_mutex_lock(&s->write_lock);
/* This lock is only used to make sure the following write operation
* is executed after the write issued by the coroutine allocating
@@ -674,16 +686,23 @@ static int vdi_co_write(BlockDriverState *bs,
* that that write operation has returned (there may be other writes
* in flight, but they do not concern this very operation). */
qemu_co_mutex_unlock(&s->write_lock);
ret = bdrv_write(bs->file->bs, offset, buf, n_sectors);
qemu_iovec_reset(&local_qiov);
qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
ret = bdrv_co_pwritev(bs->file->bs, data_offset, n_bytes,
&local_qiov, 0);
}
nb_sectors -= n_sectors;
sector_num += n_sectors;
buf += n_sectors * SECTOR_SIZE;
bytes -= n_bytes;
offset += n_bytes;
bytes_done += n_bytes;
logout("%u sectors written\n", n_sectors);
logout("%u bytes written\n", n_bytes);
}
qemu_iovec_destroy(&local_qiov);
logout("finished data write\n");
if (ret < 0) {
return ret;
@@ -694,6 +713,7 @@ static int vdi_co_write(BlockDriverState *bs,
VdiHeader *header = (VdiHeader *) block;
uint8_t *base;
uint64_t offset;
uint32_t n_sectors;
logout("now writing modified header\n");
assert(VDI_IS_ALLOCATED(bmap_first));
@@ -808,7 +828,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
vdi_header_print(&header);
#endif
vdi_header_to_le(&header);
ret = blk_pwrite(blk, offset, &header, sizeof(header));
ret = blk_pwrite(blk, offset, &header, sizeof(header), 0);
if (ret < 0) {
error_setg(errp, "Error writing header to %s", filename);
goto exit;
@@ -829,7 +849,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
bmap[i] = VDI_UNALLOCATED;
}
}
ret = blk_pwrite(blk, offset, bmap, bmap_size);
ret = blk_pwrite(blk, offset, bmap, bmap_size, 0);
if (ret < 0) {
error_setg(errp, "Error writing bmap to %s", filename);
goto exit;
@@ -903,9 +923,9 @@ static BlockDriver bdrv_vdi = {
.bdrv_co_get_block_status = vdi_co_get_block_status,
.bdrv_make_empty = vdi_make_empty,
.bdrv_read = vdi_co_read,
.bdrv_co_preadv = vdi_co_preadv,
#if defined(CONFIG_VDI_WRITE)
.bdrv_write = vdi_co_write,
.bdrv_co_pwritev = vdi_co_pwritev,
#endif
.bdrv_get_info = vdi_get_info,


@@ -18,6 +18,7 @@
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "qemu/bswap.h"
#include "block/vhdx.h"
#include <uuid/uuid.h>


@@ -23,6 +23,7 @@
#include "block/block_int.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/bswap.h"
#include "block/vhdx.h"


@@ -22,6 +22,7 @@
#include "sysemu/block-backend.h"
#include "qemu/module.h"
#include "qemu/crc32c.h"
#include "qemu/bswap.h"
#include "block/vhdx.h"
#include "migration/migration.h"
@@ -1856,13 +1857,14 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL,
&creator_items, NULL);
signature = cpu_to_le64(VHDX_FILE_SIGNATURE);
ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature));
ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature),
0);
if (ret < 0) {
goto delete_and_exit;
}
if (creator) {
ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature),
creator, creator_items * sizeof(gunichar2));
creator, creator_items * sizeof(gunichar2), 0);
if (ret < 0) {
goto delete_and_exit;
}


@@ -30,6 +30,7 @@
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/bswap.h"
#include "migration/migration.h"
#include "qemu/cutils.h"
#include <zlib.h>
@@ -1016,27 +1017,26 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
*/
static int get_whole_cluster(BlockDriverState *bs,
VmdkExtent *extent,
uint64_t cluster_sector_num,
uint64_t sector_num,
uint64_t skip_start_sector,
uint64_t skip_end_sector)
uint64_t cluster_offset,
uint64_t offset,
uint64_t skip_start_bytes,
uint64_t skip_end_bytes)
{
int ret = VMDK_OK;
int64_t cluster_bytes;
uint8_t *whole_grain;
/* For COW, align request sector_num to cluster start */
sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
whole_grain = qemu_blockalign(bs, cluster_bytes);
if (!bs->backing) {
memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS);
memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
memset(whole_grain, 0, skip_start_bytes);
memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
}
assert(skip_end_sector <= extent->cluster_sectors);
assert(skip_end_bytes <= cluster_bytes);
/* We get here on the first write to a grain (cluster) that does not exist
* yet; try to read it from the parent image, if one exists. */
if (bs->backing && !vmdk_is_cid_valid(bs)) {
@@ -1045,42 +1045,43 @@ static int get_whole_cluster(BlockDriverState *bs,
}
/* Read backing data before skip range */
if (skip_start_sector > 0) {
if (skip_start_bytes > 0) {
if (bs->backing) {
ret = bdrv_read(bs->backing->bs, sector_num,
whole_grain, skip_start_sector);
ret = bdrv_pread(bs->backing->bs, offset, whole_grain,
skip_start_bytes);
if (ret < 0) {
ret = VMDK_ERROR;
goto exit;
}
}
ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain,
skip_start_sector);
ret = bdrv_pwrite(extent->file->bs, cluster_offset, whole_grain,
skip_start_bytes);
if (ret < 0) {
ret = VMDK_ERROR;
goto exit;
}
}
/* Read backing data after skip range */
if (skip_end_sector < extent->cluster_sectors) {
if (skip_end_bytes < cluster_bytes) {
if (bs->backing) {
ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector,
whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
extent->cluster_sectors - skip_end_sector);
ret = bdrv_pread(bs->backing->bs, offset + skip_end_bytes,
whole_grain + skip_end_bytes,
cluster_bytes - skip_end_bytes);
if (ret < 0) {
ret = VMDK_ERROR;
goto exit;
}
}
ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector,
whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
extent->cluster_sectors - skip_end_sector);
ret = bdrv_pwrite(extent->file->bs, cluster_offset + skip_end_bytes,
whole_grain + skip_end_bytes,
cluster_bytes - skip_end_bytes);
if (ret < 0) {
ret = VMDK_ERROR;
goto exit;
}
}
ret = VMDK_OK;
exit:
qemu_vfree(whole_grain);
return ret;
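get_whole_cluster() pads a freshly allocated grain with backing-file data (or zeroes) on both sides of the range the guest is about to overwrite. A rough sketch of just that head/tail split, with a plain memory buffer standing in for the backing reads and all names invented:

#include <stdint.h>
#include <string.h>

/*
 * Fill 'grain' (cluster_bytes long) so that everything outside the range
 * [skip_start, skip_end) comes from 'backing' when one exists, or is zeroed
 * otherwise; the skipped middle is left for the caller's own write.
 */
static void pad_cow_grain(uint8_t *grain, const uint8_t *backing,
                          uint64_t cluster_bytes,
                          uint64_t skip_start, uint64_t skip_end)
{
    if (backing) {
        memcpy(grain, backing, skip_start);                     /* head */
        memcpy(grain + skip_end, backing + skip_end,
               cluster_bytes - skip_end);                       /* tail */
    } else {
        memset(grain, 0, skip_start);
        memset(grain + skip_end, 0, cluster_bytes - skip_end);
    }
}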
@@ -1142,8 +1143,8 @@ static int get_cluster_offset(BlockDriverState *bs,
uint64_t offset,
bool allocate,
uint64_t *cluster_offset,
uint64_t skip_start_sector,
uint64_t skip_end_sector)
uint64_t skip_start_bytes,
uint64_t skip_end_bytes)
{
unsigned int l1_index, l2_offset, l2_index;
int min_index, i, j;
@@ -1230,10 +1231,8 @@ static int get_cluster_offset(BlockDriverState *bs,
* This problem may occur because of insufficient space on host disk
* or inappropriate VM shutdown.
*/
ret = get_whole_cluster(bs, extent,
cluster_sector,
offset >> BDRV_SECTOR_BITS,
skip_start_sector, skip_end_sector);
ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
offset, skip_start_bytes, skip_end_bytes);
if (ret) {
return ret;
}
@@ -1259,15 +1258,26 @@ static VmdkExtent *find_extent(BDRVVmdkState *s,
return NULL;
}
static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
int64_t offset)
{
uint64_t offset_in_cluster, extent_begin_offset, extent_relative_offset;
uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
extent_begin_offset =
(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
extent_relative_offset = offset - extent_begin_offset;
offset_in_cluster = extent_relative_offset % cluster_size;
return offset_in_cluster;
}
static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
int64_t sector_num)
{
uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num;
extent_begin_sector = extent->end_sector - extent->sectors;
extent_relative_sector_num = sector_num - extent_begin_sector;
index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
return index_in_cluster;
uint64_t offset;
offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE);
return offset / BDRV_SECTOR_SIZE;
}
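The new vmdk_find_offset_in_cluster() reduces the old sector computation to one subtraction and one modulo over bytes, and the sector variant becomes a thin wrapper. A small standalone sketch with invented names:

#include <stdint.h>

/* Byte offset within the cluster containing 'offset', for an extent that
 * begins at 'extent_begin' and uses clusters of 'cluster_size' bytes. */
static uint64_t offset_in_cluster(uint64_t offset, uint64_t extent_begin,
                                  uint64_t cluster_size)
{
    return (offset - extent_begin) % cluster_size;
}

Worked example: with a 64 KiB cluster and an extent starting at byte 0, offset 197632 maps to 197632 % 65536 = 1024 bytes into its cluster, i.e. sector 2 of that cluster, which is what the retained sector wrapper reports.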
static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
@@ -1319,38 +1329,57 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
}
static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
int64_t offset_in_cluster, const uint8_t *buf,
int nb_sectors, int64_t sector_num)
int64_t offset_in_cluster, QEMUIOVector *qiov,
uint64_t qiov_offset, uint64_t n_bytes,
uint64_t offset)
{
int ret;
VmdkGrainMarker *data = NULL;
uLongf buf_len;
const uint8_t *write_buf = buf;
int write_len = nb_sectors * 512;
QEMUIOVector local_qiov;
struct iovec iov;
int64_t write_offset;
int64_t write_end_sector;
if (extent->compressed) {
void *compressed_data;
if (!extent->has_marker) {
ret = -EINVAL;
goto out;
}
buf_len = (extent->cluster_sectors << 9) * 2;
data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
buf_len == 0) {
compressed_data = g_malloc(n_bytes);
qemu_iovec_to_buf(qiov, qiov_offset, compressed_data, n_bytes);
ret = compress(data->data, &buf_len, compressed_data, n_bytes);
g_free(compressed_data);
if (ret != Z_OK || buf_len == 0) {
ret = -EINVAL;
goto out;
}
data->lba = sector_num;
data->size = buf_len;
write_buf = (uint8_t *)data;
write_len = buf_len + sizeof(VmdkGrainMarker);
}
write_offset = cluster_offset + offset_in_cluster,
ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len);
write_end_sector = DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE);
data->lba = offset >> BDRV_SECTOR_BITS;
data->size = buf_len;
n_bytes = buf_len + sizeof(VmdkGrainMarker);
iov = (struct iovec) {
.iov_base = data,
.iov_len = n_bytes,
};
qemu_iovec_init_external(&local_qiov, &iov, 1);
} else {
qemu_iovec_init(&local_qiov, qiov->niov);
qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes);
}
write_offset = cluster_offset + offset_in_cluster,
ret = bdrv_co_pwritev(extent->file->bs, write_offset, n_bytes,
&local_qiov, 0);
write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE);
if (extent->compressed) {
extent->next_cluster_sector = write_end_sector;
@@ -1359,19 +1388,21 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
write_end_sector);
}
if (ret != write_len) {
ret = ret < 0 ? ret : -EIO;
if (ret < 0) {
goto out;
}
ret = 0;
out:
g_free(data);
if (!extent->compressed) {
qemu_iovec_destroy(&local_qiov);
}
return ret;
}
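The compressed path above first flattens the iovec into a contiguous buffer because zlib's compress() needs linear memory. A hedged, standalone sketch of that step using only the zlib API (the grain-marker framing is omitted):

#include <stdlib.h>
#include <zlib.h>

/*
 * Compress 'len' bytes from 'src' into a freshly allocated buffer.  On
 * success *out_len holds the compressed size and the caller frees the
 * result; on failure NULL is returned.
 */
static unsigned char *compress_buffer(const unsigned char *src, uLong len,
                                      uLongf *out_len)
{
    uLongf bound = compressBound(len);      /* worst-case compressed size */
    unsigned char *dst = malloc(bound);

    if (dst == NULL) {
        return NULL;
    }
    *out_len = bound;
    if (compress(dst, out_len, src, len) != Z_OK) {
        free(dst);
        return NULL;
    }
    return dst;
}

Note that the driver itself sizes its destination buffer as two clusters rather than calling compressBound(); the sketch only illustrates the flatten-then-compress step.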
static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
int64_t offset_in_cluster, uint8_t *buf,
int nb_sectors)
int64_t offset_in_cluster, QEMUIOVector *qiov,
int bytes)
{
int ret;
int cluster_bytes, buf_bytes;
@@ -1383,14 +1414,13 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
if (!extent->compressed) {
ret = bdrv_pread(extent->file->bs,
cluster_offset + offset_in_cluster,
buf, nb_sectors * 512);
if (ret == nb_sectors * 512) {
return 0;
} else {
return -EIO;
ret = bdrv_co_preadv(extent->file->bs,
cluster_offset + offset_in_cluster, bytes,
qiov, 0);
if (ret < 0) {
return ret;
}
return 0;
}
cluster_bytes = extent->cluster_sectors * 512;
/* Read two clusters in case GrainMarker + compressed data > one cluster */
@@ -1422,11 +1452,11 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
}
if (offset_in_cluster < 0 ||
offset_in_cluster + nb_sectors * 512 > buf_len) {
offset_in_cluster + bytes > buf_len) {
ret = -EINVAL;
goto out;
}
memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
ret = 0;
out:
@@ -1435,64 +1465,73 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
return ret;
}
static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
static int coroutine_fn
vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVVmdkState *s = bs->opaque;
int ret;
uint64_t n, index_in_cluster;
uint64_t n_bytes, offset_in_cluster;
VmdkExtent *extent = NULL;
QEMUIOVector local_qiov;
uint64_t cluster_offset;
uint64_t bytes_done = 0;
while (nb_sectors > 0) {
extent = find_extent(s, sector_num, extent);
qemu_iovec_init(&local_qiov, qiov->niov);
qemu_co_mutex_lock(&s->lock);
while (bytes > 0) {
extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
if (!extent) {
return -EIO;
ret = -EIO;
goto fail;
}
ret = get_cluster_offset(bs, extent, NULL,
sector_num << 9, false, &cluster_offset,
0, 0);
index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
n = extent->cluster_sectors - index_in_cluster;
if (n > nb_sectors) {
n = nb_sectors;
}
offset, false, &cluster_offset, 0, 0);
offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
- offset_in_cluster);
if (ret != VMDK_OK) {
/* if not allocated, try to read from parent image, if exist */
if (bs->backing && ret != VMDK_ZEROED) {
if (!vmdk_is_cid_valid(bs)) {
return -EINVAL;
ret = -EINVAL;
goto fail;
}
ret = bdrv_read(bs->backing->bs, sector_num, buf, n);
qemu_iovec_reset(&local_qiov);
qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
ret = bdrv_co_preadv(bs->backing->bs, offset, n_bytes,
&local_qiov, 0);
if (ret < 0) {
return ret;
goto fail;
}
} else {
memset(buf, 0, 512 * n);
qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
}
} else {
ret = vmdk_read_extent(extent,
cluster_offset, index_in_cluster * 512,
buf, n);
qemu_iovec_reset(&local_qiov);
qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster,
&local_qiov, n_bytes);
if (ret) {
return ret;
goto fail;
}
}
nb_sectors -= n;
sector_num += n;
buf += n * 512;
bytes -= n_bytes;
offset += n_bytes;
bytes_done += n_bytes;
}
return 0;
}
static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
{
int ret;
BDRVVmdkState *s = bs->opaque;
qemu_co_mutex_lock(&s->lock);
ret = vmdk_read(bs, sector_num, buf, nb_sectors);
ret = 0;
fail:
qemu_co_mutex_unlock(&s->lock);
qemu_iovec_destroy(&local_qiov);
return ret;
}
@@ -1506,38 +1545,38 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
*
* Returns: error code with 0 for success.
*/
static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors,
bool zeroed, bool zero_dry_run)
static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
uint64_t bytes, QEMUIOVector *qiov,
bool zeroed, bool zero_dry_run)
{
BDRVVmdkState *s = bs->opaque;
VmdkExtent *extent = NULL;
int ret;
int64_t index_in_cluster, n;
int64_t offset_in_cluster, n_bytes;
uint64_t cluster_offset;
uint64_t bytes_done = 0;
VmdkMetaData m_data;
if (sector_num > bs->total_sectors) {
error_report("Wrong offset: sector_num=0x%" PRIx64
if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
error_report("Wrong offset: offset=0x%" PRIx64
" total_sectors=0x%" PRIx64,
sector_num, bs->total_sectors);
offset, bs->total_sectors);
return -EIO;
}
while (nb_sectors > 0) {
extent = find_extent(s, sector_num, extent);
while (bytes > 0) {
extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
if (!extent) {
return -EIO;
}
index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
n = extent->cluster_sectors - index_in_cluster;
if (n > nb_sectors) {
n = nb_sectors;
}
ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
- offset_in_cluster);
ret = get_cluster_offset(bs, extent, &m_data, offset,
!(extent->compressed || zeroed),
&cluster_offset,
index_in_cluster, index_in_cluster + n);
&cluster_offset, offset_in_cluster,
offset_in_cluster + n_bytes);
if (extent->compressed) {
if (ret == VMDK_OK) {
/* Refuse write to allocated cluster for streamOptimized */
@@ -1546,7 +1585,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
return -EIO;
} else {
/* allocate */
ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
ret = get_cluster_offset(bs, extent, &m_data, offset,
true, &cluster_offset, 0, 0);
}
}
@@ -1556,9 +1595,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
if (zeroed) {
/* Do zeroed write, buf is ignored */
if (extent->has_zero_grain &&
index_in_cluster == 0 &&
n >= extent->cluster_sectors) {
n = extent->cluster_sectors;
offset_in_cluster == 0 &&
n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
if (!zero_dry_run) {
/* update L2 tables */
if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
@@ -1570,9 +1609,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
return -ENOTSUP;
}
} else {
ret = vmdk_write_extent(extent,
cluster_offset, index_in_cluster * 512,
buf, n, sector_num);
ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
qiov, bytes_done, n_bytes, offset);
if (ret) {
return ret;
}
@@ -1585,9 +1623,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
}
}
}
nb_sectors -= n;
sector_num += n;
buf += n * 512;
bytes -= n_bytes;
offset += n_bytes;
bytes_done += n_bytes;
/* update CID on the first write every time the virtual disk is
* opened */
@@ -1602,25 +1640,65 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
return 0;
}
static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors)
static int coroutine_fn
vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
int ret;
BDRVVmdkState *s = bs->opaque;
qemu_co_mutex_lock(&s->lock);
ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
qemu_co_mutex_unlock(&s->lock);
return ret;
}
typedef struct VmdkWriteCompressedCo {
BlockDriverState *bs;
int64_t sector_num;
const uint8_t *buf;
int nb_sectors;
int ret;
} VmdkWriteCompressedCo;
static void vmdk_co_write_compressed(void *opaque)
{
VmdkWriteCompressedCo *co = opaque;
QEMUIOVector local_qiov;
uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE;
uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE;
struct iovec iov = (struct iovec) {
.iov_base = (uint8_t*) co->buf,
.iov_len = bytes,
};
qemu_iovec_init_external(&local_qiov, &iov, 1);
co->ret = vmdk_pwritev(co->bs, offset, bytes, &local_qiov, false, false);
}
static int vmdk_write_compressed(BlockDriverState *bs,
int64_t sector_num,
const uint8_t *buf,
int nb_sectors)
{
BDRVVmdkState *s = bs->opaque;
if (s->num_extents == 1 && s->extents[0].compressed) {
return vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
Coroutine *co;
AioContext *aio_context = bdrv_get_aio_context(bs);
VmdkWriteCompressedCo data = {
.bs = bs,
.sector_num = sector_num,
.buf = buf,
.nb_sectors = nb_sectors,
.ret = -EINPROGRESS,
};
co = qemu_coroutine_create(vmdk_co_write_compressed);
qemu_coroutine_enter(co, &data);
while (data.ret == -EINPROGRESS) {
aio_poll(aio_context, true);
}
return data.ret;
} else {
return -ENOTSUP;
}
@@ -1633,12 +1711,15 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
{
int ret;
BDRVVmdkState *s = bs->opaque;
uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
uint64_t bytes = nb_sectors * BDRV_SECTOR_SIZE;
qemu_co_mutex_lock(&s->lock);
/* write zeroes could fail if sectors not aligned to cluster, test it with
* dry_run == true before really updating image */
ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
ret = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
if (!ret) {
ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
ret = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
}
qemu_co_mutex_unlock(&s->lock);
return ret;
@@ -1728,12 +1809,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
header.check_bytes[3] = 0xa;
/* write all the data */
ret = blk_pwrite(blk, 0, &magic, sizeof(magic));
ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0);
if (ret < 0) {
error_setg(errp, QERR_IO_ERROR);
goto exit;
}
ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header));
ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0);
if (ret < 0) {
error_setg(errp, QERR_IO_ERROR);
goto exit;
@@ -1753,7 +1834,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
gd_buf[i] = cpu_to_le32(tmp);
}
ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
gd_buf, gd_buf_size);
gd_buf, gd_buf_size, 0);
if (ret < 0) {
error_setg(errp, QERR_IO_ERROR);
goto exit;
@@ -1765,7 +1846,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
gd_buf[i] = cpu_to_le32(tmp);
}
ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
gd_buf, gd_buf_size);
gd_buf, gd_buf_size, 0);
if (ret < 0) {
error_setg(errp, QERR_IO_ERROR);
goto exit;
@@ -1829,8 +1910,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
int64_t total_size = 0, filesize;
char *adapter_type = NULL;
char *backing_file = NULL;
char *hw_version = NULL;
char *fmt = NULL;
int flags = 0;
int ret = 0;
bool flat, split, compress;
GString *ext_desc_lines;
@@ -1861,7 +1942,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
"# The Disk Data Base\n"
"#DDB\n"
"\n"
"ddb.virtualHWVersion = \"%d\"\n"
"ddb.virtualHWVersion = \"%s\"\n"
"ddb.geometry.cylinders = \"%" PRId64 "\"\n"
"ddb.geometry.heads = \"%" PRIu32 "\"\n"
"ddb.geometry.sectors = \"63\"\n"
@@ -1878,8 +1959,20 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
BDRV_SECTOR_SIZE);
adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
flags |= BLOCK_FLAG_COMPAT6;
if (strcmp(hw_version, "undefined")) {
error_setg(errp,
"compat6 cannot be enabled with hwversion set");
ret = -EINVAL;
goto exit;
}
g_free(hw_version);
hw_version = g_strdup("6");
}
if (strcmp(hw_version, "undefined") == 0) {
g_free(hw_version);
hw_version = g_strdup("4");
}
fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) {
@@ -2001,7 +2094,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
fmt,
parent_desc_line,
ext_desc_lines->str,
(flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
hw_version,
total_size /
(int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
number_heads,
@@ -2028,7 +2121,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
blk_set_allow_write_beyond_eof(new_blk, true);
ret = blk_pwrite(new_blk, desc_offset, desc, desc_len);
ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not write description");
goto exit;
@@ -2047,6 +2140,7 @@ exit:
}
g_free(adapter_type);
g_free(backing_file);
g_free(hw_version);
g_free(fmt);
g_free(desc);
g_free(path);
@@ -2250,27 +2344,6 @@ static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
return 0;
}
static void vmdk_detach_aio_context(BlockDriverState *bs)
{
BDRVVmdkState *s = bs->opaque;
int i;
for (i = 0; i < s->num_extents; i++) {
bdrv_detach_aio_context(s->extents[i].file->bs);
}
}
static void vmdk_attach_aio_context(BlockDriverState *bs,
AioContext *new_context)
{
BDRVVmdkState *s = bs->opaque;
int i;
for (i = 0; i < s->num_extents; i++) {
bdrv_attach_aio_context(s->extents[i].file->bs, new_context);
}
}
static QemuOptsList vmdk_create_opts = {
.name = "vmdk-create-opts",
.head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
@@ -2297,6 +2370,12 @@ static QemuOptsList vmdk_create_opts = {
.help = "VMDK version 6 image",
.def_value_str = "off"
},
{
.name = BLOCK_OPT_HWVERSION,
.type = QEMU_OPT_STRING,
.help = "VMDK hardware version",
.def_value_str = "undefined"
},
{
.name = BLOCK_OPT_SUBFMT,
.type = QEMU_OPT_STRING,
@@ -2321,8 +2400,8 @@ static BlockDriver bdrv_vmdk = {
.bdrv_open = vmdk_open,
.bdrv_check = vmdk_check,
.bdrv_reopen_prepare = vmdk_reopen_prepare,
.bdrv_read = vmdk_co_read,
.bdrv_write = vmdk_co_write,
.bdrv_co_preadv = vmdk_co_preadv,
.bdrv_co_pwritev = vmdk_co_pwritev,
.bdrv_write_compressed = vmdk_write_compressed,
.bdrv_co_write_zeroes = vmdk_co_write_zeroes,
.bdrv_close = vmdk_close,
@@ -2334,8 +2413,6 @@ static BlockDriver bdrv_vmdk = {
.bdrv_get_specific_info = vmdk_get_specific_info,
.bdrv_refresh_limits = vmdk_refresh_limits,
.bdrv_get_info = vmdk_get_info,
.bdrv_detach_aio_context = vmdk_detach_aio_context,
.bdrv_attach_aio_context = vmdk_attach_aio_context,
.supports_backing = true,
.create_opts = &vmdk_create_opts,


@@ -29,6 +29,7 @@
#include "sysemu/block-backend.h"
#include "qemu/module.h"
#include "migration/migration.h"
#include "qemu/bswap.h"
#if defined(CONFIG_UUID)
#include <uuid/uuid.h>
#endif
@@ -45,34 +46,34 @@ enum vhd_type {
VHD_DIFFERENCING = 4,
};
// Seconds since Jan 1, 2000 0:00:00 (UTC)
/* Seconds since Jan 1, 2000 0:00:00 (UTC) */
#define VHD_TIMESTAMP_BASE 946684800
#define VHD_CHS_MAX_C 65535LL
#define VHD_CHS_MAX_H 16
#define VHD_CHS_MAX_S 255
#define VHD_MAX_SECTORS (65535LL * 255 * 255)
#define VHD_MAX_SECTORS 0xff000000 /* 2040 GiB max image size */
#define VHD_MAX_GEOMETRY (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)
#define VPC_OPT_FORCE_SIZE "force_size"
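For reference, the tightened limit is exact: 0xff000000 sectors × 512 bytes per sector = 2,190,433,320,960 bytes, i.e. 2040 GiB, which is where the updated comments and the new "max size is 2040 GiB" error message further down take their figure.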
// always big-endian
/* always big-endian */
typedef struct vhd_footer {
char creator[8]; // "conectix"
char creator[8]; /* "conectix" */
uint32_t features;
uint32_t version;
// Offset of next header structure, 0xFFFFFFFF if none
/* Offset of next header structure, 0xFFFFFFFF if none */
uint64_t data_offset;
// Seconds since Jan 1, 2000 0:00:00 (UTC)
/* Seconds since Jan 1, 2000 0:00:00 (UTC) */
uint32_t timestamp;
char creator_app[4]; // "vpc "
char creator_app[4]; /* e.g., "vpc " */
uint16_t major;
uint16_t minor;
char creator_os[4]; // "Wi2k"
char creator_os[4]; /* "Wi2k" */
uint64_t orig_size;
uint64_t current_size;
@@ -83,29 +84,29 @@ typedef struct vhd_footer {
uint32_t type;
// Checksum of the Hard Disk Footer ("one's complement of the sum of all
// the bytes in the footer without the checksum field")
/* Checksum of the Hard Disk Footer ("one's complement of the sum of all
the bytes in the footer without the checksum field") */
uint32_t checksum;
// UUID used to identify a parent hard disk (backing file)
/* UUID used to identify a parent hard disk (backing file) */
uint8_t uuid[16];
uint8_t in_saved_state;
} QEMU_PACKED VHDFooter;
typedef struct vhd_dyndisk_header {
char magic[8]; // "cxsparse"
char magic[8]; /* "cxsparse" */
// Offset of next header structure, 0xFFFFFFFF if none
/* Offset of next header structure, 0xFFFFFFFF if none */
uint64_t data_offset;
// Offset of the Block Allocation Table (BAT)
/* Offset of the Block Allocation Table (BAT) */
uint64_t table_offset;
uint32_t version;
uint32_t max_table_entries; // 32bit/entry
uint32_t max_table_entries; /* 32bit/entry */
// 2 MB by default, must be a power of two
/* 2 MB by default, must be a power of two */
uint32_t block_size;
uint32_t checksum;
@@ -113,7 +114,7 @@ typedef struct vhd_dyndisk_header {
uint32_t parent_timestamp;
uint32_t reserved;
// Backing file name (in UTF-16)
/* Backing file name (in UTF-16) */
uint8_t parent_name[512];
struct {
@@ -238,6 +239,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE);
if (ret < 0) {
error_setg(errp, "Unable to read VHD header");
goto fail;
}
@@ -246,9 +248,11 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
int64_t offset = bdrv_getlength(bs->file->bs);
if (offset < 0) {
ret = offset;
error_setg(errp, "Invalid file size");
goto fail;
} else if (offset < HEADER_SIZE) {
ret = -EINVAL;
error_setg(errp, "File too small for a VHD header");
goto fail;
}
@@ -275,9 +279,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
/* Write 'checksum' back to footer, or else will leave it with zero. */
footer->checksum = cpu_to_be32(checksum);
// The visible size of a image in Virtual PC depends on the geometry
// rather than on the size stored in the footer (the size in the footer
// is too large usually)
/* The visible size of an image in Virtual PC depends on the geometry
rather than on the size stored in the footer (the size in the footer
is usually too large) */
bs->total_sectors = (int64_t)
be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
@@ -299,6 +303,8 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
* 'qem2' : current_size QEMU (uses current_size)
* 'win ' : current_size Hyper-V
* 'd2v ' : current_size Disk2vhd
* 'tap\0' : current_size XenServer
* 'CTXS' : current_size XenConverter
*
* The user can override the table values via drive options, however
* even with an override we will still use current_size for images
@@ -306,15 +312,17 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
*/
use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
!!strncmp(footer->creator_app, "qem2", 4) &&
!!strncmp(footer->creator_app, "d2v ", 4)) || s->force_use_chs;
!!strncmp(footer->creator_app, "d2v ", 4) &&
!!strncmp(footer->creator_app, "CTXS", 4) &&
!!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
bs->total_sectors = be64_to_cpu(footer->current_size) /
BDRV_SECTOR_SIZE;
}
/* Allow a maximum disk size of approximately 2 TB */
if (bs->total_sectors >= VHD_MAX_SECTORS) {
/* Allow a maximum disk size of 2040 GiB */
if (bs->total_sectors > VHD_MAX_SECTORS) {
ret = -EFBIG;
goto fail;
}
@@ -323,12 +331,14 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf,
HEADER_SIZE);
if (ret < 0) {
error_setg(errp, "Error reading dynamic VHD header");
goto fail;
}
dyndisk_header = (VHDDynDiskHeader *) buf;
if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
error_setg(errp, "Invalid header magic");
ret = -EINVAL;
goto fail;
}
@@ -344,16 +354,14 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
ret = -EINVAL;
goto fail;
}
if (s->max_table_entries > (VHD_MAX_SECTORS * 512) / s->block_size) {
error_setg(errp, "Too many blocks");
ret = -EINVAL;
goto fail;
}
computed_size = (uint64_t) s->max_table_entries * s->block_size;
if (computed_size < bs->total_sectors * 512) {
error_setg(errp, "Page table too small");
ret = -EINVAL;
goto fail;
}
@@ -370,6 +378,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
if (s->pagetable == NULL) {
error_setg(errp, "Unable to allocate memory for page table");
ret = -ENOMEM;
goto fail;
}
@@ -379,6 +388,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable,
pagetable_size);
if (ret < 0) {
error_setg(errp, "Error reading pagetable");
goto fail;
}
@@ -445,28 +455,27 @@ static int vpc_reopen_prepare(BDRVReopenState *state,
* The parameter write must be 1 if the offset will be used for a write
* operation (the block bitmaps is updated then), 0 otherwise.
*/
static inline int64_t get_sector_offset(BlockDriverState *bs,
int64_t sector_num, int write)
static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
bool write)
{
BDRVVPCState *s = bs->opaque;
uint64_t offset = sector_num * 512;
uint64_t bitmap_offset, block_offset;
uint32_t pagetable_index, pageentry_index;
uint32_t pagetable_index, offset_in_block;
pagetable_index = offset / s->block_size;
pageentry_index = (offset % s->block_size) / 512;
offset_in_block = offset % s->block_size;
if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
return -1; // not allocated
return -1; /* not allocated */
bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);
block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
// We must ensure that we don't write to any sectors which are marked as
// unused in the bitmap. We get away with setting all bits in the block
// bitmap each time we write to a new block. This might cause Virtual PC to
// miss sparse read optimization, but it's not a problem in terms of
// correctness.
/* We must ensure that we don't write to any sectors which are marked as
unused in the bitmap. We get away with setting all bits in the block
bitmap each time we write to a new block. This might cause Virtual PC to
miss sparse read optimization, but it's not a problem in terms of
correctness. */
if (write && (s->last_bitmap_offset != bitmap_offset)) {
uint8_t bitmap[s->bitmap_size];
@@ -478,6 +487,12 @@ static inline int64_t get_sector_offset(BlockDriverState *bs,
return block_offset;
}
static inline int64_t get_sector_offset(BlockDriverState *bs,
int64_t sector_num, bool write)
{
return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write);
}
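get_image_offset() resolves a guest byte offset through the BAT: the table entry gives the block's start sector, the block's allocation bitmap sits in front of the data, and the remainder of the offset lands inside the block. A compact sketch of just that arithmetic, with invented names:

#include <stdint.h>

#define BAT_UNALLOCATED 0xffffffffu

/*
 * Translate a guest offset to an offset in the image file, or return -1
 * when the containing block has not been allocated yet.
 */
static int64_t translate_offset(const uint32_t *bat, uint32_t bat_entries,
                                uint32_t block_size, uint32_t bitmap_size,
                                uint64_t offset)
{
    uint32_t index = offset / block_size;
    uint32_t offset_in_block = offset % block_size;

    if (index >= bat_entries || bat[index] == BAT_UNALLOCATED) {
        return -1;
    }
    return 512ull * bat[index] + bitmap_size + offset_in_block;
}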
/*
* Writes the footer to the end of the image file. This is needed when the
* file grows as it overwrites the old footer
@@ -504,7 +519,7 @@ static int rewrite_footer(BlockDriverState* bs)
*
* Returns the sectors' offset in the image file on success and < 0 on error
*/
static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
{
BDRVVPCState *s = bs->opaque;
int64_t bat_offset;
@@ -512,18 +527,17 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
int ret;
uint8_t bitmap[s->bitmap_size];
// Check if sector_num is valid
if ((sector_num < 0) || (sector_num > bs->total_sectors))
return -1;
// Write entry into in-memory BAT
index = (sector_num * 512) / s->block_size;
if (s->pagetable[index] != 0xFFFFFFFF)
return -1;
/* Check if offset is valid */
if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
return -EINVAL;
}
/* Write entry into in-memory BAT */
index = offset / s->block_size;
assert(s->pagetable[index] == 0xFFFFFFFF);
s->pagetable[index] = s->free_data_block_offset / 512;
// Initialize the block's bitmap
/* Initialize the block's bitmap */
memset(bitmap, 0xff, s->bitmap_size);
ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap,
s->bitmap_size);
@@ -531,24 +545,24 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
return ret;
}
// Write new footer (the old one will be overwritten)
/* Write new footer (the old one will be overwritten) */
s->free_data_block_offset += s->block_size + s->bitmap_size;
ret = rewrite_footer(bs);
if (ret < 0)
goto fail;
// Write BAT entry to disk
/* Write BAT entry to disk */
bat_offset = s->bat_offset + (4 * index);
bat_value = cpu_to_be32(s->pagetable[index]);
ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4);
if (ret < 0)
goto fail;
return get_sector_offset(bs, sector_num, 0);
return get_image_offset(bs, offset, false);
fail:
s->free_data_block_offset -= (s->block_size + s->bitmap_size);
return -1;
return ret;
}
static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
@@ -564,104 +578,105 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
return 0;
}
static int vpc_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
static int coroutine_fn
vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVVPCState *s = bs->opaque;
int ret;
int64_t offset;
int64_t sectors, sectors_per_block;
int64_t image_offset;
int64_t n_bytes;
int64_t bytes_done = 0;
VHDFooter *footer = (VHDFooter *) s->footer_buf;
QEMUIOVector local_qiov;
if (be32_to_cpu(footer->type) == VHD_FIXED) {
return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors);
return bdrv_co_preadv(bs->file->bs, offset, bytes, qiov, 0);
}
while (nb_sectors > 0) {
offset = get_sector_offset(bs, sector_num, 0);
sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
sectors = sectors_per_block - (sector_num % sectors_per_block);
if (sectors > nb_sectors) {
sectors = nb_sectors;
}
qemu_co_mutex_lock(&s->lock);
qemu_iovec_init(&local_qiov, qiov->niov);
if (offset == -1) {
memset(buf, 0, sectors * BDRV_SECTOR_SIZE);
while (bytes > 0) {
image_offset = get_image_offset(bs, offset, false);
n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
if (image_offset == -1) {
qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
} else {
ret = bdrv_pread(bs->file->bs, offset, buf,
sectors * BDRV_SECTOR_SIZE);
if (ret != sectors * BDRV_SECTOR_SIZE) {
return -1;
qemu_iovec_reset(&local_qiov);
qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
ret = bdrv_co_preadv(bs->file->bs, image_offset, n_bytes,
&local_qiov, 0);
if (ret < 0) {
goto fail;
}
}
nb_sectors -= sectors;
sector_num += sectors;
buf += sectors * BDRV_SECTOR_SIZE;
bytes -= n_bytes;
offset += n_bytes;
bytes_done += n_bytes;
}
return 0;
}
static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
{
int ret;
BDRVVPCState *s = bs->opaque;
qemu_co_mutex_lock(&s->lock);
ret = vpc_read(bs, sector_num, buf, nb_sectors);
ret = 0;
fail:
qemu_iovec_destroy(&local_qiov);
qemu_co_mutex_unlock(&s->lock);
return ret;
}
static int vpc_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors)
static int coroutine_fn
vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVVPCState *s = bs->opaque;
int64_t offset;
int64_t sectors, sectors_per_block;
int64_t image_offset;
int64_t n_bytes;
int64_t bytes_done = 0;
int ret;
VHDFooter *footer = (VHDFooter *) s->footer_buf;
QEMUIOVector local_qiov;
if (be32_to_cpu(footer->type) == VHD_FIXED) {
return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors);
}
while (nb_sectors > 0) {
offset = get_sector_offset(bs, sector_num, 1);
sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
sectors = sectors_per_block - (sector_num % sectors_per_block);
if (sectors > nb_sectors) {
sectors = nb_sectors;
}
if (offset == -1) {
offset = alloc_block(bs, sector_num);
if (offset < 0)
return -1;
}
ret = bdrv_pwrite(bs->file->bs, offset, buf,
sectors * BDRV_SECTOR_SIZE);
if (ret != sectors * BDRV_SECTOR_SIZE) {
return -1;
}
nb_sectors -= sectors;
sector_num += sectors;
buf += sectors * BDRV_SECTOR_SIZE;
return bdrv_co_pwritev(bs->file->bs, offset, bytes, qiov, 0);
}
return 0;
}
static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors)
{
int ret;
BDRVVPCState *s = bs->opaque;
qemu_co_mutex_lock(&s->lock);
ret = vpc_write(bs, sector_num, buf, nb_sectors);
qemu_iovec_init(&local_qiov, qiov->niov);
while (bytes > 0) {
image_offset = get_image_offset(bs, offset, true);
n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
if (image_offset == -1) {
image_offset = alloc_block(bs, offset);
if (image_offset < 0) {
ret = image_offset;
goto fail;
}
}
qemu_iovec_reset(&local_qiov);
qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
ret = bdrv_co_pwritev(bs->file->bs, image_offset, n_bytes,
&local_qiov, 0);
if (ret < 0) {
goto fail;
}
bytes -= n_bytes;
offset += n_bytes;
bytes_done += n_bytes;
}
ret = 0;
fail:
qemu_iovec_destroy(&local_qiov);
qemu_co_mutex_unlock(&s->lock);
return ret;
}
@@ -718,7 +733,7 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
* Note that the geometry doesn't always exactly match total_sectors but
* may round it down.
*
* Returns 0 on success, -EFBIG if the size is larger than ~2 TB. Override
* Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
* the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
* and instead allow up to 255 heads.
*/
@@ -770,34 +785,34 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
int ret;
int64_t offset = 0;
// Write the footer (twice: at the beginning and at the end)
/* Write the footer (twice: at the beginning and at the end) */
block_size = 0x200000;
num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
if (ret < 0) {
goto fail;
}
offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
if (ret < 0) {
goto fail;
}
// Write the initial BAT
/* Write the initial BAT */
offset = 3 * 512;
memset(buf, 0xFF, 512);
for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
ret = blk_pwrite(blk, offset, buf, 512);
ret = blk_pwrite(blk, offset, buf, 512, 0);
if (ret < 0) {
goto fail;
}
offset += 512;
}
// Prepare the Dynamic Disk Header
/* Prepare the Dynamic Disk Header */
memset(buf, 0, 1024);
memcpy(dyndisk_header->magic, "cxsparse", 8);
@@ -814,10 +829,10 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
// Write the header
/* Write the header */
offset = 512;
ret = blk_pwrite(blk, offset, buf, 1024);
ret = blk_pwrite(blk, offset, buf, 1024, 0);
if (ret < 0) {
goto fail;
}
@@ -839,7 +854,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
return ret;
}
ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE);
ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
if (ret < 0) {
return ret;
}
@@ -874,6 +889,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
} else if (!strcmp(disk_type_param, "fixed")) {
disk_type = VHD_FIXED;
} else {
error_setg(errp, "Invalid disk type, %s", disk_type_param);
ret = -EINVAL;
goto out;
}
@@ -922,8 +938,9 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
total_sectors = total_size / BDRV_SECTOR_SIZE;
/* Allow a maximum disk size of approximately 2 TB */
/* Allow a maximum disk size of 2040 GiB */
if (total_sectors > VHD_MAX_SECTORS) {
error_setg(errp, "Disk size is too large, max size is 2040 GiB");
ret = -EFBIG;
goto out;
}
@@ -974,6 +991,9 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
} else {
ret = create_fixed_disk(blk, buf, total_size);
}
if (ret < 0) {
error_setg(errp, "Unable to create or write VHD header");
}
out:
blk_unref(blk);
@@ -1042,8 +1062,8 @@ static BlockDriver bdrv_vpc = {
.bdrv_reopen_prepare = vpc_reopen_prepare,
.bdrv_create = vpc_create,
.bdrv_read = vpc_co_read,
.bdrv_write = vpc_co_write,
.bdrv_co_preadv = vpc_co_preadv,
.bdrv_co_pwritev = vpc_co_pwritev,
.bdrv_co_get_block_status = vpc_co_get_block_status,
.bdrv_get_info = vpc_get_info,


@@ -27,6 +27,7 @@
#include "qapi/error.h"
#include "block/block_int.h"
#include "qemu/module.h"
#include "qemu/bswap.h"
#include "migration/migration.h"
#include "qapi/qmp/qint.h"
#include "qapi/qmp/qbool.h"
@@ -1109,6 +1110,8 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
goto fail;
}
memcpy(s->volume_label, label, label_length);
} else {
memcpy(s->volume_label, "QEMU VVFAT", 10);
}
if (floppy) {
@@ -1177,6 +1180,7 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
bs->read_only = 0;
}
bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */
bs->total_sectors = cyls * heads * secs;
if (init_directories(s, dirname, heads, secs, errp)) {
@@ -1419,14 +1423,31 @@ DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
return 0;
}
static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors)
static int coroutine_fn
vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
int ret;
BDRVVVFATState *s = bs->opaque;
uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
int nb_sectors = bytes >> BDRV_SECTOR_BITS;
void *buf;
assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
buf = g_try_malloc(bytes);
if (bytes && buf == NULL) {
return -ENOMEM;
}
qemu_co_mutex_lock(&s->lock);
ret = vvfat_read(bs, sector_num, buf, nb_sectors);
qemu_co_mutex_unlock(&s->lock);
qemu_iovec_from_buf(qiov, 0, buf, bytes);
g_free(buf);
return ret;
}
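vvfat still works in whole sectors internally, so the new byte-based entry points bounce through a temporary linear buffer. A simplified sketch of that pattern, assuming sector-aligned requests as the asserts above do; the function-pointer type and names are illustrative, not the QEMU API:

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef int (*sector_read_fn)(void *opaque, int64_t sector_num,
                              uint8_t *buf, int nb_sectors);

/* Read 'bytes' at 'offset' through a sector-based backend via a bounce
 * buffer, then copy the result into the caller's destination. */
static int bounce_read(void *opaque, sector_read_fn read_sectors,
                       uint64_t offset, uint64_t bytes, uint8_t *dest)
{
    uint8_t *buf = malloc(bytes);
    int ret;

    if (bytes && buf == NULL) {
        return -ENOMEM;
    }
    ret = read_sectors(opaque, offset / 512, buf, bytes / 512);
    if (ret == 0 && bytes) {
        memcpy(dest, buf, bytes);
    }
    free(buf);
    return ret;
}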
@@ -2283,12 +2304,17 @@ DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapp
factor * (old_cluster_count - new_cluster_count));
for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) {
direntry_t *first_direntry;
void* direntry = array_get(&(s->directory), current_dir_index);
int ret = vvfat_read(s->bs, cluster2sector(s, c), direntry,
s->sectors_per_cluster);
if (ret)
return ret;
assert(!strncmp(s->directory.pointer, "QEMU", 4));
/* The first directory entry on the filesystem is the volume name */
first_direntry = (direntry_t*) s->directory.pointer;
assert(!memcmp(first_direntry->name, s->volume_label, 11));
current_dir_index += factor;
}
@@ -2873,14 +2899,31 @@ DLOG(checkpoint());
return 0;
}
static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors)
static int coroutine_fn
vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
int ret;
BDRVVVFATState *s = bs->opaque;
uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
int nb_sectors = bytes >> BDRV_SECTOR_BITS;
void *buf;
assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
buf = g_try_malloc(bytes);
if (bytes && buf == NULL) {
return -ENOMEM;
}
qemu_iovec_to_buf(qiov, 0, buf, bytes);
qemu_co_mutex_lock(&s->lock);
ret = vvfat_write(bs, sector_num, buf, nb_sectors);
qemu_co_mutex_unlock(&s->lock);
g_free(buf);
return ret;
}
@@ -2897,8 +2940,10 @@ static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs,
return BDRV_BLOCK_DATA;
}
static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
const uint8_t* buffer, int nb_sectors) {
static int coroutine_fn
write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
return try_commit(s);
}
@@ -2911,7 +2956,7 @@ static void write_target_close(BlockDriverState *bs) {
static BlockDriver vvfat_write_target = {
.format_name = "vvfat_write_target",
.bdrv_write = write_target_commit,
.bdrv_co_pwritev = write_target_commit,
.bdrv_close = write_target_close,
};
@@ -3007,8 +3052,8 @@ static BlockDriver bdrv_vvfat = {
.bdrv_file_open = vvfat_open,
.bdrv_close = vvfat_close,
.bdrv_read = vvfat_co_read,
.bdrv_write = vvfat_co_write,
.bdrv_co_preadv = vvfat_co_preadv,
.bdrv_co_pwritev = vvfat_co_pwritev,
.bdrv_co_get_block_status = vvfat_co_get_block_status,
};


@@ -73,7 +73,7 @@ static int if_max_devs[IF_COUNT] = {
* Do not change these numbers! They govern how drive option
* index maps to unit and bus. That mapping is ABI.
*
* All controllers used to imlement if=T drives need to support
* All controllers used to implement if=T drives need to support
* if_max_devs[T] units, for any T with if_max_devs[T] != 0.
* Otherwise, some index values map to "impossible" bus, unit
* values.
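The comment's warning is about the arithmetic that turns a drive index into a (bus, unit) pair. A hedged sketch of that mapping as it is conventionally implemented for these options; the real helpers live elsewhere in this file and the name below is illustrative:

/* Map a drive index to a bus and unit for an interface type whose
 * controllers support 'max_devs' units per bus; max_devs == 0 means one
 * bus with an unlimited number of units. */
static void index_to_bus_unit(int index, int max_devs, int *bus, int *unit)
{
    if (max_devs == 0) {
        *bus = 0;
        *unit = index;
    } else {
        *bus = index / max_devs;
        *unit = index % max_devs;
    }
}

Changing if_max_devs[T] would silently reshuffle which (bus, unit) an existing index resolves to, which is why the mapping is treated as ABI.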
@@ -577,15 +577,6 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
blk_rs->read_only = !(bdrv_flags & BDRV_O_RDWR);
blk_rs->detect_zeroes = detect_zeroes;
if (throttle_enabled(&cfg)) {
if (!throttling_group) {
throttling_group = blk_name(blk);
}
blk_rs->throttle_group = g_strdup(throttling_group);
blk_rs->throttle_state = throttle_group_incref(throttling_group);
blk_rs->throttle_state->cfg = cfg;
}
QDECREF(bs_opts);
} else {
if (file && !*file) {
@@ -611,15 +602,6 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
bs->detect_zeroes = detect_zeroes;
/* disk I/O throttling */
if (throttle_enabled(&cfg)) {
if (!throttling_group) {
throttling_group = blk_name(blk);
}
bdrv_io_limits_enable(bs, throttling_group);
bdrv_set_io_limits(bs, &cfg);
}
if (bdrv_key_required(bs)) {
autostart = 0;
}
@@ -633,6 +615,15 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
}
}
/* disk I/O throttling */
if (throttle_enabled(&cfg)) {
if (!throttling_group) {
throttling_group = blk_name(blk);
}
blk_io_limits_enable(blk, throttling_group);
blk_set_io_limits(blk, &cfg);
}
blk_set_enable_write_cache(blk, !writethrough);
blk_set_on_error(blk, on_read_error, on_write_error);
@@ -1785,9 +1776,9 @@ static void external_snapshot_prepare(BlkActionState *common,
return;
}
if (state->new_bs->blk != NULL) {
if (bdrv_has_blk(state->new_bs)) {
error_setg(errp, "The snapshot is already in use by %s",
blk_name(state->new_bs->blk));
bdrv_get_parent_name(state->new_bs));
return;
}
@@ -2290,16 +2281,29 @@ exit:
block_job_txn_unref(block_job_txn);
}
static int do_open_tray(const char *device, bool force, Error **errp);
void qmp_eject(const char *device, bool has_force, bool force, Error **errp)
{
Error *local_err = NULL;
int rc;
qmp_blockdev_open_tray(device, has_force, force, &local_err);
if (!has_force) {
force = false;
}
rc = do_open_tray(device, force, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
if (rc == EINPROGRESS) {
error_setg(errp, "Device '%s' is locked and force was not specified, "
"wait for tray to open and try again", device);
return;
}
qmp_x_blockdev_remove_medium(device, errp);
}
@@ -2327,35 +2331,36 @@ void qmp_block_passwd(bool has_device, const char *device,
aio_context_release(aio_context);
}
void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
Error **errp)
/**
* returns -errno on fatal error, +errno for non-fatal situations.
* errp will always be set when the return code is negative.
* May return +ENOSYS if the device has no tray,
* or +EINPROGRESS if the tray is locked and the guest has been notified.
*/
static int do_open_tray(const char *device, bool force, Error **errp)
{
BlockBackend *blk;
bool locked;
if (!has_force) {
force = false;
}
blk = blk_by_name(device);
if (!blk) {
error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
"Device '%s' not found", device);
return;
return -ENODEV;
}
if (!blk_dev_has_removable_media(blk)) {
error_setg(errp, "Device '%s' is not removable", device);
return;
return -ENOTSUP;
}
if (!blk_dev_has_tray(blk)) {
/* Ignore this command on tray-less devices */
return;
return ENOSYS;
}
if (blk_dev_is_tray_open(blk)) {
return;
return 0;
}
locked = blk_dev_is_medium_locked(blk);
@@ -2366,6 +2371,21 @@ void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
if (!locked || force) {
blk_dev_change_media_cb(blk, false);
}
if (locked && !force) {
return EINPROGRESS;
}
return 0;
}
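The sign convention documented above (negative errno = hard failure with errp set, positive errno = benign condition) is what lets qmp_eject turn EINPROGRESS into its own error message while qmp_blockdev_open_tray below ignores the positive codes. A minimal caller sketch under that assumption; the wrapper itself is hypothetical and relies on the surrounding file's includes:

/* Hypothetical caller: only negative return values are treated as errors. */
static bool open_tray_or_report(const char *device, bool force, Error **errp)
{
    int rc = do_open_tray(device, force, errp);

    if (rc < 0) {
        return false;           /* errp was already set by do_open_tray() */
    }
    if (rc == EINPROGRESS) {
        /* Tray is locked; the guest has been notified and may open it. */
    }
    return true;
}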
void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
Error **errp)
{
if (!has_force) {
force = false;
}
do_open_tray(device, force, errp);
}
void qmp_blockdev_close_tray(const char *device, Error **errp)
@@ -2503,9 +2523,9 @@ void qmp_x_blockdev_insert_medium(const char *device, const char *node_name,
return;
}
if (bs->blk) {
if (bdrv_has_blk(bs)) {
error_setg(errp, "Node '%s' is already in use by '%s'", node_name,
blk_name(bs->blk));
bdrv_get_parent_name(bs));
return;
}
@@ -2570,8 +2590,6 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
goto fail;
}
blk_apply_root_state(blk, medium_bs);
bdrv_add_key(medium_bs, NULL, &err);
if (err) {
error_propagate(errp, err);
@@ -2596,6 +2614,8 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
goto fail;
}
blk_apply_root_state(blk, medium_bs);
qmp_blockdev_close_tray(device, errp);
fail:
@@ -2661,13 +2681,6 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
goto out;
}
/* The BlockBackend must be the only parent */
assert(QLIST_FIRST(&bs->parents));
if (QLIST_NEXT(QLIST_FIRST(&bs->parents), next_parent)) {
error_setg(errp, "Cannot throttle device with multiple parents");
goto out;
}
throttle_config_init(&cfg);
cfg.buckets[THROTTLE_BPS_TOTAL].avg = bps;
cfg.buckets[THROTTLE_BPS_READ].avg = bps_rd;
@@ -2726,16 +2739,16 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
if (throttle_enabled(&cfg)) {
/* Enable I/O limits if they're not enabled yet, otherwise
* just update the throttling group. */
if (!bs->throttle_state) {
bdrv_io_limits_enable(bs, has_group ? group : device);
if (!blk_get_public(blk)->throttle_state) {
blk_io_limits_enable(blk, has_group ? group : device);
} else if (has_group) {
bdrv_io_limits_update_group(bs, group);
blk_io_limits_update_group(blk, group);
}
/* Set the new throttling configuration */
bdrv_set_io_limits(bs, &cfg);
} else if (bs->throttle_state) {
blk_set_io_limits(blk, &cfg);
} else if (blk_get_public(blk)->throttle_state) {
/* If all throttling settings are set to 0, disable I/O limits */
bdrv_io_limits_disable(bs);
blk_io_limits_disable(blk);
}
out:
@@ -3457,7 +3470,7 @@ static void blockdev_mirror_common(BlockDriverState *bs,
if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_MIRROR_TARGET, errp)) {
return;
}
if (target->blk) {
if (bdrv_has_blk(target)) {
error_setg(errp, "Cannot mirror to an attached block device");
return;
}
@@ -4046,15 +4059,15 @@ void qmp_x_blockdev_del(bool has_id, const char *id,
bs = blk_bs(blk);
aio_context = blk_get_aio_context(blk);
} else {
blk = NULL;
bs = bdrv_find_node(node_name);
if (!bs) {
error_setg(errp, "Cannot find node %s", node_name);
return;
}
blk = bs->blk;
if (blk) {
if (bdrv_has_blk(bs)) {
error_setg(errp, "Node %s is in use by %s",
node_name, blk_name(blk));
node_name, bdrv_get_parent_name(bs));
return;
}
aio_context = bdrv_get_aio_context(bs);
@@ -4092,12 +4105,68 @@ out:
aio_context_release(aio_context);
}
static BdrvChild *bdrv_find_child(BlockDriverState *parent_bs,
const char *child_name)
{
BdrvChild *child;
QLIST_FOREACH(child, &parent_bs->children, next) {
if (strcmp(child->name, child_name) == 0) {
return child;
}
}
return NULL;
}
void qmp_x_blockdev_change(const char *parent, bool has_child,
const char *child, bool has_node,
const char *node, Error **errp)
{
BlockDriverState *parent_bs, *new_bs = NULL;
BdrvChild *p_child;
parent_bs = bdrv_lookup_bs(parent, parent, errp);
if (!parent_bs) {
return;
}
if (has_child == has_node) {
if (has_child) {
error_setg(errp, "The parameters child and node are in conflict");
} else {
error_setg(errp, "Either child or node must be specified");
}
return;
}
if (has_child) {
p_child = bdrv_find_child(parent_bs, child);
if (!p_child) {
error_setg(errp, "Node '%s' does not have child '%s'",
parent, child);
return;
}
bdrv_del_child(parent_bs, p_child, errp);
}
if (has_node) {
new_bs = bdrv_find_node(node);
if (!new_bs) {
error_setg(errp, "Node '%s' not found", node);
return;
}
bdrv_add_child(parent_bs, new_bs, errp);
}
}
BlockJobInfoList *qmp_query_block_jobs(Error **errp)
{
BlockJobInfoList *head = NULL, **p_next = &head;
BlockDriverState *bs;
BdrvNextIterator *it = NULL;
for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
while ((it = bdrv_next(it, &bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);


@@ -411,8 +411,7 @@ void block_job_event_ready(BlockJob *job)
job->speed, &error_abort);
}
BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
BlockdevOnError on_err,
BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
int is_read, int error)
{
BlockErrorAction action;
@@ -443,9 +442,6 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
job->user_paused = true;
block_job_pause(job);
block_job_iostatus_set_err(job, error);
if (bs->blk && bs != job->bs) {
blk_iostatus_set_err(bs->blk, error);
}
}
return action;
}


@@ -28,6 +28,7 @@
#include "qapi/visitor.h"
#include "qemu/error-report.h"
#include "hw/hw.h"
#include "hw/qdev-core.h"
typedef struct FWBootEntry FWBootEntry;


@@ -25,6 +25,7 @@
#include "qemu/help_option.h"
/* For tb_lock */
#include "cpu.h"
#include "exec/exec-all.h"
#include "tcg.h"
#include "qemu/timer.h"
#include "qemu/envlist.h"
@@ -849,6 +850,7 @@ int main(int argc, char **argv)
}
/* init debug */
qemu_log_needs_buffers();
qemu_set_log_filename(log_file);
if (log_mask) {
int mask;


@@ -19,6 +19,7 @@
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#undef DEBUG_REMAP

configure

@@ -207,7 +207,7 @@ fdt=""
netmap="no"
pixman=""
sdl=""
sdlabi="1.2"
sdlabi=""
virtfs=""
vnc="yes"
sparse="no"
@@ -298,6 +298,7 @@ coroutine=""
coroutine_pool=""
seccomp=""
glusterfs=""
glusterfs_xlator_opt="no"
glusterfs_discard="no"
glusterfs_zerofill="no"
archipelago="no"
@@ -1872,6 +1873,9 @@ if test "$seccomp" != "no" ; then
i386|x86_64)
libseccomp_minver="2.1.0"
;;
mips)
libseccomp_minver="2.2.0"
;;
arm|aarch64)
libseccomp_minver="2.2.3"
;;
@@ -2153,6 +2157,7 @@ if test "$gtk" != "no"; then
if $pkg_config --exists "$gtkpackage >= $gtkversion"; then
gtk_cflags=`$pkg_config --cflags $gtkpackage`
gtk_libs=`$pkg_config --libs $gtkpackage`
gtk_version=`$pkg_config --modversion $gtkpackage`
if $pkg_config --exists "$gtkx11package >= $gtkversion"; then
gtk_cflags="$gtk_cflags $x11_cflags"
gtk_libs="$gtk_libs $x11_libs"
@@ -2388,20 +2393,25 @@ fi
if test "$vte" != "no"; then
if test "$gtkabi" = "3.0"; then
vtepackage="vte-2.90"
vteversion="0.32.0"
vteminversion="0.32.0"
if $pkg_config --exists "vte-2.91"; then
vtepackage="vte-2.91"
else
vtepackage="vte-2.90"
fi
else
vtepackage="vte"
vteversion="0.24.0"
vteminversion="0.24.0"
fi
if $pkg_config --exists "$vtepackage >= $vteversion"; then
if $pkg_config --exists "$vtepackage >= $vteminversion"; then
vte_cflags=`$pkg_config --cflags $vtepackage`
vte_libs=`$pkg_config --libs $vtepackage`
vteversion=`$pkg_config --modversion $vtepackage`
libs_softmmu="$vte_libs $libs_softmmu"
vte="yes"
elif test "$vte" = "yes"; then
if test "$gtkabi" = "3.0"; then
feature_not_found "vte" "Install libvte-2.90 devel"
feature_not_found "vte" "Install libvte-2.90/2.91 devel"
else
feature_not_found "vte" "Install libvte devel"
fi
@@ -2416,13 +2426,25 @@ fi
# Look for sdl configuration program (pkg-config or sdl-config). Try
# sdl-config even without cross prefix, and favour pkg-config over sdl-config.
if test "$sdlabi" = ""; then
if $pkg_config --exists "sdl"; then
sdlabi=1.2
elif $pkg_config --exists "sdl2"; then
sdlabi=2.0
else
sdlabi=1.2
fi
fi
if test $sdlabi = "2.0"; then
sdl_config=$sdl2_config
sdlname=sdl2
sdlconfigname=sdl2_config
else
elif test $sdlabi = "1.2"; then
sdlname=sdl
sdlconfigname=sdl_config
else
error_exit "Unknown sdlabi $sdlabi, must be 1.2 or 2.0"
fi
if test "`basename $sdl_config`" != $sdlconfigname && ! has ${sdl_config}; then
@@ -2431,10 +2453,10 @@ fi
if $pkg_config $sdlname --exists; then
sdlconfig="$pkg_config $sdlname"
_sdlversion=`$sdlconfig --modversion 2>/dev/null | sed 's/[^0-9]//g'`
sdlversion=`$sdlconfig --modversion 2>/dev/null`
elif has ${sdl_config}; then
sdlconfig="$sdl_config"
_sdlversion=`$sdlconfig --version | sed 's/[^0-9]//g'`
sdlversion=`$sdlconfig --version`
else
if test "$sdl" = "yes" ; then
feature_not_found "sdl" "Install SDL devel"
@@ -2459,7 +2481,7 @@ EOF
sdl_libs=`$sdlconfig --libs 2> /dev/null`
fi
if compile_prog "$sdl_cflags" "$sdl_libs" ; then
if test "$_sdlversion" -lt 121 ; then
if test `echo $sdlversion | sed 's/[^0-9]//g'` -lt 121 ; then
sdl_too_old=yes
else
sdl=yes
@@ -3397,6 +3419,9 @@ if test "$glusterfs" != "no" ; then
glusterfs="yes"
glusterfs_cflags=`$pkg_config --cflags glusterfs-api`
glusterfs_libs=`$pkg_config --libs glusterfs-api`
if $pkg_config --atleast-version=4 glusterfs-api; then
glusterfs_xlator_opt="yes"
fi
if $pkg_config --atleast-version=5 glusterfs-api; then
glusterfs_discard="yes"
fi
@@ -4486,6 +4511,21 @@ if test "$fortify_source" != "no"; then
fi
fi
##########################################
# check if struct fsxattr is available via linux/fs.h
have_fsxattr=no
cat > $TMPC << EOF
#include <linux/fs.h>
struct fsxattr foo;
int main(void) {
return 0;
}
EOF
if compile_prog "" "" ; then
have_fsxattr=yes
fi
##########################################
# End of CC checks
# After here, no more $cc or $ld runs
@@ -4571,7 +4611,7 @@ if test "$softmmu" = yes ; then
tools="$tools fsdev/virtfs-proxy-helper\$(EXESUF)"
else
if test "$virtfs" = yes; then
error_exit "VirtFS is supported only on Linux and requires libcap-devel and libattr-devel"
error_exit "VirtFS is supported only on Linux and requires libcap devel and libattr devel"
fi
virtfs=no
fi
@@ -4696,6 +4736,12 @@ EOF
fi
fi
echo_version() {
if test "$1" = "yes" ; then
echo "($2)"
fi
}
# prepend pixman and ftd flags after all config tests are done
QEMU_CFLAGS="$pixman_cflags $fdt_cflags $QEMU_CFLAGS"
libs_softmmu="$pixman_libs $libs_softmmu"
@@ -4745,22 +4791,18 @@ if test "$darwin" = "yes" ; then
echo "Cocoa support $cocoa"
fi
echo "pixman $pixman"
echo "SDL support $sdl"
echo "GTK support $gtk"
echo "SDL support $sdl `echo_version $sdl $sdlversion`"
echo "GTK support $gtk `echo_version $gtk $gtk_version`"
echo "GTK GL support $gtk_gl"
echo "VTE support $vte `echo_version $vte $vteversion`"
echo "GNUTLS support $gnutls"
echo "GNUTLS hash $gnutls_hash"
echo "GNUTLS rnd $gnutls_rnd"
echo "libgcrypt $gcrypt"
echo "libgcrypt kdf $gcrypt_kdf"
if test "$nettle" = "yes"; then
echo "nettle $nettle ($nettle_version)"
else
echo "nettle $nettle"
fi
echo "nettle $nettle `echo_version $nettle $nettle_version`"
echo "nettle kdf $nettle_kdf"
echo "libtasn1 $tasn1"
echo "VTE support $vte"
echo "curses support $curses"
echo "virgl support $virglrenderer"
echo "curl support $curl"
@@ -4809,11 +4851,7 @@ echo "Trace backends $trace_backends"
if have_backend "simple"; then
echo "Trace output file $trace_file-<pid>"
fi
if test "$spice" = "yes"; then
echo "spice support $spice ($spice_protocol_version/$spice_server_version)"
else
echo "spice support $spice"
fi
echo "spice support $spice `echo_version $spice $spice_protocol_version/$spice_server_version`"
echo "rbd support $rbd"
echo "xfsctl support $xfs"
echo "smartcard support $smartcard"
@@ -5153,6 +5191,14 @@ fi
if test "$have_ifaddrs_h" = "yes" ; then
echo "HAVE_IFADDRS_H=y" >> $config_host_mak
fi
# Work around a system header bug with some kernel/XFS header
# versions where they both try to define 'struct fsxattr':
# xfs headers will not try to redefine structs from linux headers
# if this macro is set.
if test "$have_fsxattr" = "yes" ; then
echo "HAVE_FSXATTR=y" >> $config_host_mak
fi
if test "$vte" = "yes" ; then
echo "CONFIG_VTE=y" >> $config_host_mak
echo "VTE_CFLAGS=$vte_cflags" >> $config_host_mak
@@ -5339,6 +5385,10 @@ if test "$glusterfs" = "yes" ; then
echo "GLUSTERFS_LIBS=$glusterfs_libs" >> $config_host_mak
fi
if test "$glusterfs_xlator_opt" = "yes" ; then
echo "CONFIG_GLUSTERFS_XLATOR_OPT=y" >> $config_host_mak
fi
if test "$glusterfs_discard" = "yes" ; then
echo "CONFIG_GLUSTERFS_DISCARD=y" >> $config_host_mak
fi


@@ -7,6 +7,7 @@
*/
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/host-utils.h"
#include "qemu/sockets.h"
#include <sys/mman.h>


@@ -15,7 +15,7 @@
* unix socket. For each client, the server will create some eventfd
* (see EVENTFD(2)), one per vector. These fd are transmitted to all
* clients using the SCM_RIGHTS cmsg message. Therefore, each client is
* able to send a notification to another client without beeing
* able to send a notification to another client without being
* "profixied" by the server.
*
* We use this mechanism to send interruptions between guests.


@@ -20,6 +20,7 @@
#include "qemu/osdep.h"
#include "cpu.h"
#include "sysemu/cpus.h"
#include "exec/exec-all.h"
#include "exec/memory-internal.h"
bool exit_request;
@@ -68,7 +69,6 @@ void cpu_reloading_memory_map(void)
void cpu_loop_exit(CPUState *cpu)
{
cpu->current_tb = NULL;
siglongjmp(cpu->jmp_env, 1);
}
@@ -77,6 +77,5 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
if (pc) {
cpu_restore_state(cpu, pc);
}
cpu->current_tb = NULL;
siglongjmp(cpu->jmp_env, 1);
}


@@ -20,6 +20,7 @@
#include "cpu.h"
#include "trace.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
#include "tcg.h"
#include "qemu/atomic.h"
#include "sysemu/qtest.h"
@@ -136,7 +137,9 @@ static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
{
CPUArchState *env = cpu->env_ptr;
uintptr_t next_tb;
uintptr_t ret;
TranslationBlock *last_tb;
int tb_exit;
uint8_t *tb_ptr = itb->tc_ptr;
qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
@@ -160,118 +163,125 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
#endif /* DEBUG_DISAS */
cpu->can_do_io = !use_icount;
next_tb = tcg_qemu_tb_exec(env, tb_ptr);
ret = tcg_qemu_tb_exec(env, tb_ptr);
cpu->can_do_io = 1;
trace_exec_tb_exit((void *) (next_tb & ~TB_EXIT_MASK),
next_tb & TB_EXIT_MASK);
last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
tb_exit = ret & TB_EXIT_MASK;
trace_exec_tb_exit(last_tb, tb_exit);
if ((next_tb & TB_EXIT_MASK) > TB_EXIT_IDX1) {
if (tb_exit > TB_EXIT_IDX1) {
/* We didn't start executing this TB (eg because the instruction
* counter hit zero); we must restore the guest PC to the address
* of the start of the TB.
*/
CPUClass *cc = CPU_GET_CLASS(cpu);
TranslationBlock *tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
qemu_log_mask_and_addr(CPU_LOG_EXEC, last_tb->pc,
"Stopped execution of TB chain before %p ["
TARGET_FMT_lx "] %s\n",
itb->tc_ptr, itb->pc, lookup_symbol(itb->pc));
last_tb->tc_ptr, last_tb->pc,
lookup_symbol(last_tb->pc));
if (cc->synchronize_from_tb) {
cc->synchronize_from_tb(cpu, tb);
cc->synchronize_from_tb(cpu, last_tb);
} else {
assert(cc->set_pc);
cc->set_pc(cpu, tb->pc);
cc->set_pc(cpu, last_tb->pc);
}
}
if ((next_tb & TB_EXIT_MASK) == TB_EXIT_REQUESTED) {
if (tb_exit == TB_EXIT_REQUESTED) {
/* We were asked to stop executing TBs (probably a pending
* interrupt. We've now stopped, so clear the flag.
*/
cpu->tcg_exit_req = 0;
}
return next_tb;
return ret;
}
#ifndef CONFIG_USER_ONLY
/* Execute the code without caching the generated code. An interpreter
could be used if available. */
static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
TranslationBlock *orig_tb, bool ignore_icount)
{
TranslationBlock *tb;
bool old_tb_flushed;
/* Should never happen.
We only end up here when an existing TB is too long. */
if (max_cycles > CF_COUNT_MASK)
max_cycles = CF_COUNT_MASK;
old_tb_flushed = cpu->tb_flushed;
cpu->tb_flushed = false;
tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
max_cycles | CF_NOCACHE
| (ignore_icount ? CF_IGNORE_ICOUNT : 0));
tb->orig_tb = tcg_ctx.tb_ctx.tb_invalidated_flag ? NULL : orig_tb;
cpu->current_tb = tb;
tb->orig_tb = cpu->tb_flushed ? NULL : orig_tb;
cpu->tb_flushed |= old_tb_flushed;
/* execute the generated code */
trace_exec_tb_nocache(tb, tb->pc);
cpu_tb_exec(cpu, tb);
cpu->current_tb = NULL;
tb_phys_invalidate(tb, -1);
tb_free(tb);
}
#endif
static TranslationBlock *tb_find_physical(CPUState *cpu,
target_ulong pc,
target_ulong cs_base,
uint64_t flags)
uint32_t flags)
{
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb, **ptb1;
TranslationBlock *tb, **tb_hash_head, **ptb1;
unsigned int h;
tb_page_addr_t phys_pc, phys_page1;
target_ulong virt_page2;
tcg_ctx.tb_ctx.tb_invalidated_flag = 0;
/* find translated block using physical mappings */
phys_pc = get_page_addr_code(env, pc);
phys_page1 = phys_pc & TARGET_PAGE_MASK;
h = tb_phys_hash_func(phys_pc);
ptb1 = &tcg_ctx.tb_ctx.tb_phys_hash[h];
for(;;) {
tb = *ptb1;
if (!tb) {
return NULL;
}
/* Start at head of the hash entry */
ptb1 = tb_hash_head = &tcg_ctx.tb_ctx.tb_phys_hash[h];
tb = *ptb1;
while (tb) {
if (tb->pc == pc &&
tb->page_addr[0] == phys_page1 &&
tb->cs_base == cs_base &&
tb->flags == flags) {
/* check next page if needed */
if (tb->page_addr[1] != -1) {
tb_page_addr_t phys_page2;
virt_page2 = (pc & TARGET_PAGE_MASK) +
TARGET_PAGE_SIZE;
phys_page2 = get_page_addr_code(env, virt_page2);
if (tb->page_addr[1] == -1) {
/* done, we have a match */
break;
} else {
/* check next page if needed */
target_ulong virt_page2 = (pc & TARGET_PAGE_MASK) +
TARGET_PAGE_SIZE;
tb_page_addr_t phys_page2 = get_page_addr_code(env, virt_page2);
if (tb->page_addr[1] == phys_page2) {
break;
}
} else {
break;
}
}
ptb1 = &tb->phys_hash_next;
tb = *ptb1;
}
/* Move the TB to the head of the list */
*ptb1 = tb->phys_hash_next;
tb->phys_hash_next = tcg_ctx.tb_ctx.tb_phys_hash[h];
tcg_ctx.tb_ctx.tb_phys_hash[h] = tb;
if (tb) {
/* Move the TB to the head of the list */
*ptb1 = tb->phys_hash_next;
tb->phys_hash_next = *tb_hash_head;
*tb_hash_head = tb;
}
return tb;
}
static TranslationBlock *tb_find_slow(CPUState *cpu,
target_ulong pc,
target_ulong cs_base,
uint64_t flags)
uint32_t flags)
{
TranslationBlock *tb;
@@ -309,26 +319,63 @@ found:
return tb;
}
static inline TranslationBlock *tb_find_fast(CPUState *cpu)
static inline TranslationBlock *tb_find_fast(CPUState *cpu,
TranslationBlock **last_tb,
int tb_exit)
{
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb;
target_ulong cs_base, pc;
int flags;
uint32_t flags;
/* we record a subset of the CPU state. It will
always be the same before a given translated block
is executed. */
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
tb_lock();
tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
tb->flags != flags)) {
tb = tb_find_slow(cpu, pc, cs_base, flags);
}
if (cpu->tb_flushed) {
/* Ensure that no TB jump will be modified as the
* translation buffer has been flushed.
*/
*last_tb = NULL;
cpu->tb_flushed = false;
}
/* See if we can patch the calling TB. */
if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
tb_add_jump(*last_tb, tb_exit, tb);
}
tb_unlock();
return tb;
}
static void cpu_handle_debug_exception(CPUState *cpu)
static inline bool cpu_handle_halt(CPUState *cpu)
{
if (cpu->halted) {
#if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
&& replay_interrupt()) {
X86CPU *x86_cpu = X86_CPU(cpu);
apic_poll_irq(x86_cpu->apic_state);
cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
}
#endif
if (!cpu_has_work(cpu)) {
current_cpu = NULL;
return true;
}
cpu->halted = 0;
}
return false;
}
static inline void cpu_handle_debug_exception(CPUState *cpu)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
CPUWatchpoint *wp;
@@ -342,37 +389,197 @@ static void cpu_handle_debug_exception(CPUState *cpu)
cc->debug_excp_handler(cpu);
}
static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
{
if (cpu->exception_index >= 0) {
if (cpu->exception_index >= EXCP_INTERRUPT) {
/* exit request from the cpu execution loop */
*ret = cpu->exception_index;
if (*ret == EXCP_DEBUG) {
cpu_handle_debug_exception(cpu);
}
cpu->exception_index = -1;
return true;
} else {
#if defined(CONFIG_USER_ONLY)
/* if user mode only, we simulate a fake exception
which will be handled outside the cpu execution
loop */
#if defined(TARGET_I386)
CPUClass *cc = CPU_GET_CLASS(cpu);
cc->do_interrupt(cpu);
#endif
*ret = cpu->exception_index;
cpu->exception_index = -1;
return true;
#else
if (replay_exception()) {
CPUClass *cc = CPU_GET_CLASS(cpu);
cc->do_interrupt(cpu);
cpu->exception_index = -1;
} else if (!replay_has_interrupt()) {
/* give a chance to iothread in replay mode */
*ret = EXCP_INTERRUPT;
return true;
}
#endif
}
#ifndef CONFIG_USER_ONLY
} else if (replay_has_exception()
&& cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
/* try to cause an exception pending in the log */
TranslationBlock *last_tb = NULL; /* Avoid chaining TBs */
cpu_exec_nocache(cpu, 1, tb_find_fast(cpu, &last_tb, 0), true);
*ret = -1;
return true;
#endif
}
return false;
}
static inline void cpu_handle_interrupt(CPUState *cpu,
TranslationBlock **last_tb)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
int interrupt_request = cpu->interrupt_request;
if (unlikely(interrupt_request)) {
if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
/* Mask out external interrupts for this step. */
interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
}
if (interrupt_request & CPU_INTERRUPT_DEBUG) {
cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
cpu->exception_index = EXCP_DEBUG;
cpu_loop_exit(cpu);
}
if (replay_mode == REPLAY_MODE_PLAY && !replay_has_interrupt()) {
/* Do nothing */
} else if (interrupt_request & CPU_INTERRUPT_HALT) {
replay_interrupt();
cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
cpu->halted = 1;
cpu->exception_index = EXCP_HLT;
cpu_loop_exit(cpu);
}
#if defined(TARGET_I386)
else if (interrupt_request & CPU_INTERRUPT_INIT) {
X86CPU *x86_cpu = X86_CPU(cpu);
CPUArchState *env = &x86_cpu->env;
replay_interrupt();
cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
do_cpu_init(x86_cpu);
cpu->exception_index = EXCP_HALTED;
cpu_loop_exit(cpu);
}
#else
else if (interrupt_request & CPU_INTERRUPT_RESET) {
replay_interrupt();
cpu_reset(cpu);
cpu_loop_exit(cpu);
}
#endif
/* The target hook has 3 exit conditions:
False when the interrupt isn't processed,
True when it is, and we should restart on a new TB,
and via longjmp via cpu_loop_exit. */
else {
replay_interrupt();
if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
*last_tb = NULL;
}
/* The target hook may have updated the 'cpu->interrupt_request';
* reload the 'interrupt_request' value */
interrupt_request = cpu->interrupt_request;
}
if (interrupt_request & CPU_INTERRUPT_EXITTB) {
cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
/* ensure that no TB jump will be modified as
the program flow was changed */
*last_tb = NULL;
}
}
if (unlikely(cpu->exit_request || replay_has_interrupt())) {
cpu->exit_request = 0;
cpu->exception_index = EXCP_INTERRUPT;
cpu_loop_exit(cpu);
}
}
static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
TranslationBlock **last_tb, int *tb_exit,
SyncClocks *sc)
{
uintptr_t ret;
if (unlikely(cpu->exit_request)) {
return;
}
trace_exec_tb(tb, tb->pc);
ret = cpu_tb_exec(cpu, tb);
*last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
*tb_exit = ret & TB_EXIT_MASK;
switch (*tb_exit) {
case TB_EXIT_REQUESTED:
/* Something asked us to stop executing
* chained TBs; just continue round the main
* loop. Whatever requested the exit will also
* have set something else (eg exit_request or
* interrupt_request) which we will handle
* next time around the loop. But we need to
* ensure the tcg_exit_req read in generated code
* comes before the next read of cpu->exit_request
* or cpu->interrupt_request.
*/
smp_rmb();
*last_tb = NULL;
break;
case TB_EXIT_ICOUNT_EXPIRED:
{
/* Instruction counter expired. */
#ifdef CONFIG_USER_ONLY
abort();
#else
int insns_left = cpu->icount_decr.u32;
if (cpu->icount_extra && insns_left >= 0) {
/* Refill decrementer and continue execution. */
cpu->icount_extra += insns_left;
insns_left = MIN(0xffff, cpu->icount_extra);
cpu->icount_extra -= insns_left;
cpu->icount_decr.u16.low = insns_left;
} else {
if (insns_left > 0) {
/* Execute remaining instructions. */
cpu_exec_nocache(cpu, insns_left, *last_tb, false);
align_clocks(sc, cpu);
}
cpu->exception_index = EXCP_INTERRUPT;
*last_tb = NULL;
cpu_loop_exit(cpu);
}
break;
#endif
}
default:
break;
}
}
/* main execution loop */
int cpu_exec(CPUState *cpu)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
#ifdef TARGET_I386
X86CPU *x86_cpu = X86_CPU(cpu);
CPUArchState *env = &x86_cpu->env;
#endif
int ret, interrupt_request;
TranslationBlock *tb;
uintptr_t next_tb;
int ret;
SyncClocks sc;
/* replay_interrupt may need current_cpu */
current_cpu = cpu;
if (cpu->halted) {
#if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
&& replay_interrupt()) {
apic_poll_irq(x86_cpu->apic_state);
cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
}
#endif
if (!cpu_has_work(cpu)) {
current_cpu = NULL;
return EXCP_HALTED;
}
cpu->halted = 0;
if (cpu_handle_halt(cpu)) {
return EXCP_HALTED;
}
atomic_mb_set(&tcg_current_cpu, cpu);
@@ -391,185 +598,26 @@ int cpu_exec(CPUState *cpu)
*/
init_delay_params(&sc, cpu);
/* prepare setjmp context for exception handling */
for(;;) {
TranslationBlock *tb, *last_tb;
int tb_exit = 0;
/* prepare setjmp context for exception handling */
if (sigsetjmp(cpu->jmp_env, 0) == 0) {
/* if an exception is pending, we execute it here */
if (cpu->exception_index >= 0) {
if (cpu->exception_index >= EXCP_INTERRUPT) {
/* exit request from the cpu execution loop */
ret = cpu->exception_index;
if (ret == EXCP_DEBUG) {
cpu_handle_debug_exception(cpu);
}
cpu->exception_index = -1;
break;
} else {
#if defined(CONFIG_USER_ONLY)
/* if user mode only, we simulate a fake exception
which will be handled outside the cpu execution
loop */
#if defined(TARGET_I386)
cc->do_interrupt(cpu);
#endif
ret = cpu->exception_index;
cpu->exception_index = -1;
break;
#else
if (replay_exception()) {
cc->do_interrupt(cpu);
cpu->exception_index = -1;
} else if (!replay_has_interrupt()) {
/* give a chance to iothread in replay mode */
ret = EXCP_INTERRUPT;
break;
}
#endif
}
} else if (replay_has_exception()
&& cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
/* try to cause an exception pending in the log */
cpu_exec_nocache(cpu, 1, tb_find_fast(cpu), true);
ret = -1;
if (cpu_handle_exception(cpu, &ret)) {
break;
}
next_tb = 0; /* force lookup of first TB */
last_tb = NULL; /* forget the last executed TB after exception */
cpu->tb_flushed = false; /* reset before first TB lookup */
for(;;) {
interrupt_request = cpu->interrupt_request;
if (unlikely(interrupt_request)) {
if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
/* Mask out external interrupts for this step. */
interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
}
if (interrupt_request & CPU_INTERRUPT_DEBUG) {
cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
cpu->exception_index = EXCP_DEBUG;
cpu_loop_exit(cpu);
}
if (replay_mode == REPLAY_MODE_PLAY
&& !replay_has_interrupt()) {
/* Do nothing */
} else if (interrupt_request & CPU_INTERRUPT_HALT) {
replay_interrupt();
cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
cpu->halted = 1;
cpu->exception_index = EXCP_HLT;
cpu_loop_exit(cpu);
}
#if defined(TARGET_I386)
else if (interrupt_request & CPU_INTERRUPT_INIT) {
replay_interrupt();
cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
do_cpu_init(x86_cpu);
cpu->exception_index = EXCP_HALTED;
cpu_loop_exit(cpu);
}
#else
else if (interrupt_request & CPU_INTERRUPT_RESET) {
replay_interrupt();
cpu_reset(cpu);
cpu_loop_exit(cpu);
}
#endif
/* The target hook has 3 exit conditions:
False when the interrupt isn't processed,
True when it is, and we should restart on a new TB,
and via longjmp via cpu_loop_exit. */
else {
replay_interrupt();
if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
next_tb = 0;
}
}
/* Don't use the cached interrupt_request value,
do_interrupt may have updated the EXITTB flag. */
if (cpu->interrupt_request & CPU_INTERRUPT_EXITTB) {
cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
/* ensure that no TB jump will be modified as
the program flow was changed */
next_tb = 0;
}
}
if (unlikely(cpu->exit_request
|| replay_has_interrupt())) {
cpu->exit_request = 0;
cpu->exception_index = EXCP_INTERRUPT;
cpu_loop_exit(cpu);
}
tb_lock();
tb = tb_find_fast(cpu);
/* Note: we do it here to avoid a gcc bug on Mac OS X when
doing it in tb_find_slow */
if (tcg_ctx.tb_ctx.tb_invalidated_flag) {
/* as some TB could have been invalidated because
of memory exceptions while generating the code, we
must recompute the hash index here */
next_tb = 0;
tcg_ctx.tb_ctx.tb_invalidated_flag = 0;
}
/* see if we can patch the calling TB. When the TB
spans two pages, we cannot safely do a direct
jump. */
if (next_tb != 0 && tb->page_addr[1] == -1
&& !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
tb_add_jump((TranslationBlock *)(next_tb & ~TB_EXIT_MASK),
next_tb & TB_EXIT_MASK, tb);
}
tb_unlock();
if (likely(!cpu->exit_request)) {
trace_exec_tb(tb, tb->pc);
/* execute the generated code */
cpu->current_tb = tb;
next_tb = cpu_tb_exec(cpu, tb);
cpu->current_tb = NULL;
switch (next_tb & TB_EXIT_MASK) {
case TB_EXIT_REQUESTED:
/* Something asked us to stop executing
* chained TBs; just continue round the main
* loop. Whatever requested the exit will also
* have set something else (eg exit_request or
* interrupt_request) which we will handle
* next time around the loop. But we need to
* ensure the tcg_exit_req read in generated code
* comes before the next read of cpu->exit_request
* or cpu->interrupt_request.
*/
smp_rmb();
next_tb = 0;
break;
case TB_EXIT_ICOUNT_EXPIRED:
{
/* Instruction counter expired. */
int insns_left = cpu->icount_decr.u32;
if (cpu->icount_extra && insns_left >= 0) {
/* Refill decrementer and continue execution. */
cpu->icount_extra += insns_left;
insns_left = MIN(0xffff, cpu->icount_extra);
cpu->icount_extra -= insns_left;
cpu->icount_decr.u16.low = insns_left;
} else {
if (insns_left > 0) {
/* Execute remaining instructions. */
tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
cpu_exec_nocache(cpu, insns_left, tb, false);
align_clocks(&sc, cpu);
}
cpu->exception_index = EXCP_INTERRUPT;
next_tb = 0;
cpu_loop_exit(cpu);
}
break;
}
default:
break;
}
}
cpu_handle_interrupt(cpu, &last_tb);
tb = tb_find_fast(cpu, &last_tb, tb_exit);
cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit, &sc);
/* Try to align the host and virtual clocks
if the guest is in advance */
align_clocks(&sc, cpu);
/* reset soft MMU for next block (it can currently
only be set by a memory fault) */
} /* for(;;) */
} else {
#if defined(__clang__) || !QEMU_GNUC_PREREQ(4, 6)
@@ -579,18 +627,10 @@ int cpu_exec(CPUState *cpu)
* Newer versions of gcc would complain about this code (-Wclobbered). */
cpu = current_cpu;
cc = CPU_GET_CLASS(cpu);
#ifdef TARGET_I386
x86_cpu = X86_CPU(cpu);
env = &x86_cpu->env;
#endif
#else /* buggy compiler */
/* Assert that the compiler does not smash local variables. */
g_assert(cpu == current_cpu);
g_assert(cc == CPU_GET_CLASS(cpu));
#ifdef TARGET_I386
g_assert(x86_cpu == X86_CPU(cpu));
g_assert(env == &x86_cpu->env);
#endif
#endif /* buggy compiler */
cpu->can_do_io = 1;
tb_lock_reset();

cpus.c

@@ -24,7 +24,8 @@
/* Needed early for CONFIG_BSD etc. */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
@@ -34,6 +35,7 @@
#include "sysemu/dma.h"
#include "sysemu/kvm.h"
#include "qmp-commands.h"
#include "exec/exec-all.h"
#include "qemu/thread.h"
#include "sysemu/cpus.h"


@@ -28,6 +28,7 @@
#include "exec/memory-internal.h"
#include "exec/ram_addr.h"
#include "exec/exec-all.h"
#include "tcg/tcg.h"
/* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
@@ -76,10 +77,6 @@ void tlb_flush(CPUState *cpu, int flush_global)
tlb_debug("(%d)\n", flush_global);
/* must reset current TB so that interrupts cannot modify the
links while we are modifying them */
cpu->current_tb = NULL;
memset(env->tlb_table, -1, sizeof(env->tlb_table));
memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
@@ -95,9 +92,6 @@ static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
CPUArchState *env = cpu->env_ptr;
tlb_debug("start\n");
/* must reset current TB so that interrupts cannot modify the
links while we are modifying them */
cpu->current_tb = NULL;
for (;;) {
int mmu_idx = va_arg(argp, int);
@@ -152,9 +146,6 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
tlb_flush(cpu, 1);
return;
}
/* must reset current TB so that interrupts cannot modify the
links while we are modifying them */
cpu->current_tb = NULL;
addr &= TARGET_PAGE_MASK;
i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
@@ -193,9 +184,6 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
va_end(argp);
return;
}
/* must reset current TB so that interrupts cannot modify the
links while we are modifying them */
cpu->current_tb = NULL;
addr &= TARGET_PAGE_MASK;
i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);


@@ -24,6 +24,7 @@
*/
#include "qemu/osdep.h"
#include "qemu/bswap.h"
#include "crypto/afsplit.h"
#include "crypto/random.h"


@@ -20,6 +20,7 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/bswap.h"
#include "crypto/block-luks.h"


@@ -100,6 +100,7 @@ CONFIG_ALLWINNER_A10_PIT=y
CONFIG_ALLWINNER_A10_PIC=y
CONFIG_ALLWINNER_A10=y
CONFIG_FSL_IMX6=y
CONFIG_FSL_IMX31=y
CONFIG_FSL_IMX25=y


@@ -20,6 +20,7 @@
#include "qapi/error.h"
#include "qemu-common.h"
#include "qemu/error-report.h"
#include "qemu/bswap.h"
#include "sysemu/device_tree.h"
#include "sysemu/sysemu.h"
#include "hw/loader.h"


@@ -73,7 +73,7 @@ typedef struct {
BlockBackend *blk;
BlockAIOCB *acb;
QEMUSGList *sg;
uint64_t sector_num;
uint64_t offset;
DMADirection dir;
int sg_cur_index;
dma_addr_t sg_cur_byte;
@@ -130,7 +130,7 @@ static void dma_blk_cb(void *opaque, int ret)
trace_dma_blk_cb(dbs, ret);
dbs->acb = NULL;
dbs->sector_num += dbs->iov.size / 512;
dbs->offset += dbs->iov.size;
if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
dma_complete(dbs, ret);
@@ -164,8 +164,8 @@ static void dma_blk_cb(void *opaque, int ret)
qemu_iovec_discard_back(&dbs->iov, dbs->iov.size & ~BDRV_SECTOR_MASK);
}
dbs->acb = dbs->io_func(dbs->blk, dbs->sector_num, &dbs->iov,
dbs->iov.size / 512, dma_blk_cb, dbs);
dbs->acb = dbs->io_func(dbs->blk, dbs->offset, &dbs->iov, 0,
dma_blk_cb, dbs);
assert(dbs->acb);
}
@@ -203,7 +203,7 @@ BlockAIOCB *dma_blk_io(
dbs->acb = NULL;
dbs->blk = blk;
dbs->sg = sg;
dbs->sector_num = sector_num;
dbs->offset = sector_num << BDRV_SECTOR_BITS;
dbs->sg_cur_index = 0;
dbs->sg_cur_byte = 0;
dbs->dir = dir;
@@ -219,7 +219,7 @@ BlockAIOCB *dma_blk_read(BlockBackend *blk,
QEMUSGList *sg, uint64_t sector,
void (*cb)(void *opaque, int ret), void *opaque)
{
return dma_blk_io(blk, sg, sector, blk_aio_readv, cb, opaque,
return dma_blk_io(blk, sg, sector, blk_aio_preadv, cb, opaque,
DMA_DIRECTION_FROM_DEVICE);
}
@@ -227,7 +227,7 @@ BlockAIOCB *dma_blk_write(BlockBackend *blk,
QEMUSGList *sg, uint64_t sector,
void (*cb)(void *opaque, int ret), void *opaque)
{
return dma_blk_io(blk, sg, sector, blk_aio_writev, cb, opaque,
return dma_blk_io(blk, sg, sector, blk_aio_pwritev, cb, opaque,
DMA_DIRECTION_TO_DEVICE);
}


@@ -899,10 +899,16 @@ Example:
goto out_obj;
}
visit_type_UserDefOne_members(v, *obj, &err);
error_propagate(errp, err);
err = NULL;
if (err) {
goto out_obj;
}
visit_check_struct(v, &err);
out_obj:
visit_end_struct(v, &err);
visit_end_struct(v);
if (err && visit_is_input(v)) {
qapi_free_UserDefOne(*obj);
*obj = NULL;
}
out:
error_propagate(errp, err);
}
@@ -910,21 +916,27 @@ Example:
void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp)
{
Error *err = NULL;
GenericList *i, **prev;
UserDefOneList *tail;
size_t size = sizeof(**obj);
visit_start_list(v, name, &err);
visit_start_list(v, name, (GenericList **)obj, size, &err);
if (err) {
goto out;
}
for (prev = (GenericList **)obj;
!err && (i = visit_next_list(v, prev, sizeof(**obj))) != NULL;
prev = &i) {
UserDefOneList *native_i = (UserDefOneList *)i;
visit_type_UserDefOne(v, NULL, &native_i->value, &err);
for (tail = *obj; tail;
tail = (UserDefOneList *)visit_next_list(v, (GenericList *)tail, size)) {
visit_type_UserDefOne(v, NULL, &tail->value, &err);
if (err) {
break;
}
}
visit_end_list(v);
if (err && visit_is_input(v)) {
qapi_free_UserDefOneList(*obj);
*obj = NULL;
}
out:
error_propagate(errp, err);
}
@@ -996,13 +1008,21 @@ Example:
{
Error *err = NULL;
UserDefOne *retval;
QmpInputVisitor *qiv = qmp_input_visitor_new_strict(QOBJECT(args));
QmpInputVisitor *qiv = qmp_input_visitor_new(QOBJECT(args), true);
QapiDeallocVisitor *qdv;
Visitor *v;
UserDefOneList *arg1 = NULL;
v = qmp_input_get_visitor(qiv);
visit_start_struct(v, NULL, NULL, 0, &err);
if (err) {
goto out;
}
visit_type_UserDefOneList(v, "arg1", &arg1, &err);
if (!err) {
visit_check_struct(v, &err);
}
visit_end_struct(v);
if (err) {
goto out;
}
@@ -1019,7 +1039,9 @@ Example:
qmp_input_visitor_cleanup(qiv);
qdv = qapi_dealloc_visitor_new();
v = qapi_dealloc_get_visitor(qdv);
visit_start_struct(v, NULL, NULL, 0, NULL);
visit_type_UserDefOneList(v, "arg1", &arg1, NULL);
visit_end_struct(v);
qapi_dealloc_visitor_cleanup(qdv);
}


@@ -210,29 +210,27 @@ the following syntax:
-fw_cfg [name=]<item_name>,file=<path>
where <item_name> is the fw_cfg item name, and <path> is the location
on the host file system of a file containing the data to be inserted.
Small enough items may be provided directly as strings on the command
line, using the syntax:
Or
-fw_cfg [name=]<item_name>,string=<string>
The terminating NUL character of the content <string> will NOT be
included as part of the fw_cfg item data, which is consistent with
the absence of a NUL terminator for items inserted via the file option.
See QEMU man page for more documentation.
Both <item_name> and, if applicable, the content <string> are passed
through by QEMU without any interpretation, expansion, or further
processing. Any such processing (potentially performed e.g., by the shell)
is outside of QEMU's responsibility; as such, using plain ASCII characters
is recommended.
Using item_name with plain ASCII characters only is recommended.
NOTE: Users *SHOULD* choose item names beginning with the prefix "opt/"
when using the "-fw_cfg" command line option, to avoid conflicting with
item names used internally by QEMU. For instance:
Item names beginning with "opt/" are reserved for users. QEMU will
never create entries with such names unless explicitly ordered by the
user.
-fw_cfg name=opt/my_item_name,file=./my_blob.bin
To avoid clashes among different users, it is strongly recommended
that you use names beginning with opt/RFQDN/, where RFQDN is a reverse
fully qualified domain name you control. For instance, if SeaBIOS
wanted to define additional names, the prefix "opt/org.seabios/" would
be appropriate.
Similarly, QEMU developers *SHOULD NOT* use item names prefixed with
"opt/" when inserting items programmatically, e.g. via fw_cfg_add_file().
For historical reasons, "opt/ovmf/" is reserved for OVMF firmware.
Prefix "opt/org.qemu/" is reserved for QEMU itself.
Use of names not beginning with "opt/" is potentially dangerous and
entirely unsupported. QEMU will warn if you try.
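For instance, a hypothetical pair of options combining both forms described
above (the reverse-FQDN component, blob path and string value here are made
up purely for illustration) could be:
    -fw_cfg name=opt/org.example/blob,file=./my_blob.bin
    -fw_cfg name=opt/org.example/greeting,string=hello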


@@ -303,7 +303,7 @@ Endianness
----------
Device registers are hard-coded to little-endian (LE). The driver should
convert to/from host endianess to LE for device register accesses.
convert to/from host endianness to LE for device register accesses.
Descriptors are LE. Descriptor buffer TLVs will have LE type and length
fields, but the value field can either be LE or network-byte-order, depending


@@ -364,7 +364,7 @@ Message types
Equivalent ioctl: VHOST_SET_VRING_NUM
Master payload: vring state description
Sets the number of vrings for this owner.
Set the size of the queue.
* VHOST_USER_SET_VRING_ADDR
@@ -438,7 +438,7 @@ Message types
Slave payload: u64
Query how many queues the backend supports. This request should be
sent only when VHOST_USER_PROTOCOL_F_MQ is set in quried protocol
sent only when VHOST_USER_PROTOCOL_F_MQ is set in queried protocol
features by VHOST_USER_GET_PROTOCOL_FEATURES.
* VHOST_USER_SET_VRING_ENABLE


@@ -10,7 +10,7 @@ Introduction
------------
QEMU includes a throttling module that can be used to set limits to
I/O operations. The code itself is generic and independent of the I/O
units, but it is currenly used to limit the number of bytes per second
units, but it is currently used to limit the number of bytes per second
and operations per second (IOPS) when performing disk I/O.
This document explains how to use the throttling code in QEMU, and how

exec.c

@@ -24,24 +24,26 @@
#include "qemu/cutils.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "tcg.h"
#include "hw/hw.h"
#include "hw/qdev-core.h"
#if !defined(CONFIG_USER_ONLY)
#include "hw/boards.h"
#include "hw/xen/xen.h"
#endif
#include "hw/qdev.h"
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "hw/xen/xen.h"
#include "qemu/timer.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "exec/memory.h"
#include "sysemu/dma.h"
#include "exec/address-spaces.h"
#if defined(CONFIG_USER_ONLY)
#include <qemu.h>
#else /* !CONFIG_USER_ONLY */
#include "hw/hw.h"
#include "exec/memory.h"
#include "exec/ioport.h"
#include "sysemu/dma.h"
#include "exec/address-spaces.h"
#include "sysemu/xen-mapcache.h"
#include "trace.h"
#endif
@@ -641,7 +643,6 @@ void cpu_exec_exit(CPUState *cpu)
void cpu_exec_init(CPUState *cpu, Error **errp)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
int cpu_index;
Error *local_err = NULL;
cpu->as = NULL;
@@ -668,7 +669,7 @@ void cpu_exec_init(CPUState *cpu, Error **errp)
#if defined(CONFIG_USER_ONLY)
cpu_list_lock();
#endif
cpu_index = cpu->cpu_index = cpu_get_free_index(&local_err);
cpu->cpu_index = cpu_get_free_index(&local_err);
if (local_err) {
error_propagate(errp, local_err);
#if defined(CONFIG_USER_ONLY)
@@ -678,14 +679,16 @@ void cpu_exec_init(CPUState *cpu, Error **errp)
}
QTAILQ_INSERT_TAIL(&cpus, cpu, node);
#if defined(CONFIG_USER_ONLY)
(void) cc;
cpu_list_unlock();
#endif
#else
if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
vmstate_register(NULL, cpu_index, &vmstate_cpu_common, cpu);
vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
}
if (cc->vmsd != NULL) {
vmstate_register(NULL, cpu_index, cc->vmsd, cpu);
vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
}
#endif
}
#if defined(CONFIG_USER_ONLY)
@@ -2087,7 +2090,7 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
target_ulong pc, cs_base;
target_ulong vaddr;
CPUWatchpoint *wp;
int cpu_flags;
uint32_t cpu_flags;
if (cpu->watchpoint_hit) {
/* We re-entered the check after replacing the TB. Now raise


@@ -19,7 +19,7 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "cpu.h"
#ifdef CONFIG_USER_ONLY
#include "qemu.h"
#else
@@ -35,6 +35,7 @@
#include "qemu/sockets.h"
#include "sysemu/kvm.h"
#include "exec/semihost.h"
#include "exec/exec-all.h"
#ifdef CONFIG_USER_ONLY
#define GDB_ATTACHED "0"


@@ -47,7 +47,6 @@
} while (0)
extern void co_run_in_worker_bh(void *);
extern int v9fs_init_worker_threads(void);
extern int v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
extern int v9fs_co_readdir_r(V9fsPDU *, V9fsFidState *,
struct dirent *, struct dirent **result);


@@ -1563,3 +1563,14 @@ build_rsdt(GArray *table_data, GArray *linker, GArray *table_offsets,
build_header(linker, table_data,
(void *)rsdt, "RSDT", rsdt_len, 1, oem_id, oem_table_id);
}
void build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base,
uint64_t len, int node, MemoryAffinityFlags flags)
{
numamem->type = ACPI_SRAT_MEMORY;
numamem->length = sizeof(*numamem);
numamem->proximity = cpu_to_le32(node);
numamem->flags = cpu_to_le32(flags);
numamem->base_addr = cpu_to_le64(base);
numamem->range_length = cpu_to_le64(len);
}
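A minimal caller sketch for the new helper, assuming the SRAT is built with
acpi_data_push() and that per-node sizes come from numa_info[]/nb_numa_nodes
as in the arm/boot.c hunk later in this series; the wrapper function, the
ram_base parameter and the MEM_AFFINITY_ENABLED flag are assumptions for
illustration, not part of this patch:
/* Hypothetical sketch (not from the patch): emit one SRAT memory affinity
 * entry per NUMA node using the helper above. */
static void sketch_srat_memory_entries(GArray *table_data, uint64_t ram_base)
{
    uint64_t mem_base = ram_base;
    int i;
    for (i = 0; i < nb_numa_nodes; i++) {
        AcpiSratMemoryAffinity *numamem =
            acpi_data_push(table_data, sizeof(*numamem));
        /* one entry covering this node's RAM range, marked enabled */
        build_srat_memory(numamem, mem_base, numa_info[i].node_mem,
                          i, MEM_AFFINITY_ENABLED);
        mem_base += numa_info[i].node_mem;
    }
}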


@@ -135,9 +135,8 @@ void bios_linker_loader_alloc(GArray *linker,
strncpy(entry.alloc.file, file, sizeof entry.alloc.file - 1);
entry.command = cpu_to_le32(BIOS_LINKER_LOADER_COMMAND_ALLOCATE);
entry.alloc.align = cpu_to_le32(alloc_align);
entry.alloc.zone = cpu_to_le32(alloc_fseg ?
BIOS_LINKER_LOADER_ALLOC_ZONE_FSEG :
BIOS_LINKER_LOADER_ALLOC_ZONE_HIGH);
entry.alloc.zone = alloc_fseg ? BIOS_LINKER_LOADER_ALLOC_ZONE_FSEG :
BIOS_LINKER_LOADER_ALLOC_ZONE_HIGH;
/* Alloc entries must come first, so prepend them */
g_array_prepend_vals(linker, &entry, sizeof entry);


@@ -491,6 +491,12 @@ void acpi_pm_tmr_update(ACPIREGS *ar, bool enable)
}
}
static inline int64_t acpi_pm_tmr_get_clock(void)
{
return muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), PM_TIMER_FREQUENCY,
NANOSECONDS_PER_SECOND);
}
void acpi_pm_tmr_calc_overflow_time(ACPIREGS *ar)
{
int64_t d = acpi_pm_tmr_get_clock();
@@ -536,7 +542,6 @@ void acpi_pm_tmr_init(ACPIREGS *ar, acpi_update_sci_fn update_sci,
ar->tmr.timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, acpi_pm_tmr_timer, ar);
memory_region_init_io(&ar->tmr.io, memory_region_owner(parent),
&acpi_pm_tmr_ops, ar, "acpi-tmr", 4);
memory_region_clear_global_locking(&ar->tmr.io);
memory_region_add_subregion(parent, 8, &ar->tmr.io);
}


@@ -378,17 +378,19 @@ struct NvdimmDsmIn {
uint32_t function;
/* the remaining size in the page is used by arg3. */
union {
uint8_t arg3[0];
uint8_t arg3[4084];
};
} QEMU_PACKED;
typedef struct NvdimmDsmIn NvdimmDsmIn;
QEMU_BUILD_BUG_ON(sizeof(NvdimmDsmIn) != 4096);
struct NvdimmDsmOut {
/* the size of buffer filled by QEMU. */
uint32_t len;
uint8_t data[0];
uint8_t data[4092];
} QEMU_PACKED;
typedef struct NvdimmDsmOut NvdimmDsmOut;
QEMU_BUILD_BUG_ON(sizeof(NvdimmDsmOut) != 4096);
struct NvdimmDsmFunc0Out {
/* the size of buffer filled by QEMU. */
@@ -424,8 +426,8 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
* can change its content while we are doing DSM emulation. Avoid
* this by copying DSM memory to QEMU local memory.
*/
in = g_malloc(TARGET_PAGE_SIZE);
cpu_physical_memory_read(dsm_mem_addr, in, TARGET_PAGE_SIZE);
in = g_new(NvdimmDsmIn, 1);
cpu_physical_memory_read(dsm_mem_addr, in, sizeof(*in));
le32_to_cpus(&in->revision);
le32_to_cpus(&in->function);
@@ -475,7 +477,7 @@ void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
memory_region_add_subregion(io, NVDIMM_ACPI_IO_BASE, &state->io_mr);
state->dsm_mem = g_array_new(false, true /* clear */, 1);
acpi_data_push(state->dsm_mem, TARGET_PAGE_SIZE);
acpi_data_push(state->dsm_mem, sizeof(NvdimmDsmIn));
fw_cfg_add_file(fw_cfg, NVDIMM_DSM_MEM_FILE, state->dsm_mem->data,
state->dsm_mem->len);
}
@@ -608,7 +610,7 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
aml_append(dev, aml_operation_region("NPIO", AML_SYSTEM_IO,
aml_int(NVDIMM_ACPI_IO_BASE), NVDIMM_ACPI_IO_LEN));
aml_append(dev, aml_operation_region("NRAM", AML_SYSTEM_MEMORY,
aml_name(NVDIMM_ACPI_MEM_ADDR), TARGET_PAGE_SIZE));
aml_name(NVDIMM_ACPI_MEM_ADDR), sizeof(NvdimmDsmIn)));
/*
* DSM notifier:
@@ -642,8 +644,7 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
aml_append(field, aml_named_field("FUNC",
sizeof(typeof_field(NvdimmDsmIn, function)) * BITS_PER_BYTE));
aml_append(field, aml_named_field("ARG3",
(TARGET_PAGE_SIZE - offsetof(NvdimmDsmIn, arg3)) *
BITS_PER_BYTE));
(sizeof(NvdimmDsmIn) - offsetof(NvdimmDsmIn, arg3)) * BITS_PER_BYTE));
aml_append(dev, field);
/*
@@ -659,8 +660,7 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
aml_append(field, aml_named_field("RLEN",
sizeof(typeof_field(NvdimmDsmOut, len)) * BITS_PER_BYTE));
aml_append(field, aml_named_field("ODAT",
(TARGET_PAGE_SIZE - offsetof(NvdimmDsmOut, data)) *
BITS_PER_BYTE));
(sizeof(NvdimmDsmOut) - offsetof(NvdimmDsmOut, data)) * BITS_PER_BYTE));
aml_append(dev, field);
nvdimm_build_common_dsm(dev);
@@ -678,7 +678,7 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
mem_addr_offset = build_append_named_dword(table_data,
NVDIMM_ACPI_MEM_ADDR);
bios_linker_loader_alloc(linker, NVDIMM_DSM_MEM_FILE, TARGET_PAGE_SIZE,
bios_linker_loader_alloc(linker, NVDIMM_DSM_MEM_FILE, sizeof(NvdimmDsmIn),
false /* high memory */);
bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
NVDIMM_DSM_MEM_FILE, table_data,


@@ -39,6 +39,7 @@
#include "hw/acpi/memory_hotplug.h"
#include "hw/acpi/acpi_dev_interface.h"
#include "hw/xen/xen.h"
#include "qom/cpu.h"
//#define DEBUG


@@ -3,6 +3,7 @@
#ifndef HW_ALPHA_H
#define HW_ALPHA_H 1
#include "target-alpha/cpu-qom.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_host.h"
#include "hw/ide.h"


@@ -8,7 +8,6 @@
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include "alpha_sys.h"
#include "qemu/log.h"
#include "sysemu/sysemu.h"


@@ -16,4 +16,5 @@ obj-$(CONFIG_STM32F205_SOC) += stm32f205_soc.o
obj-$(CONFIG_XLNX_ZYNQMP) += xlnx-zynqmp.o xlnx-ep108.o
obj-$(CONFIG_FSL_IMX25) += fsl-imx25.o imx25_pdk.o
obj-$(CONFIG_FSL_IMX31) += fsl-imx31.o kzm.o
obj-$(CONFIG_FSL_IMX6) += fsl-imx6.o sabrelite.o
obj-$(CONFIG_ASPEED_SOC) += ast2400.o palmetto-bmc.o


@@ -132,14 +132,14 @@ typedef struct {
uint32_t base;
} BitBandState;
static int bitband_init(SysBusDevice *dev)
static void bitband_init(Object *obj)
{
BitBandState *s = BITBAND(dev);
BitBandState *s = BITBAND(obj);
SysBusDevice *dev = SYS_BUS_DEVICE(obj);
memory_region_init_io(&s->iomem, OBJECT(s), &bitband_ops, &s->base,
memory_region_init_io(&s->iomem, obj, &bitband_ops, &s->base,
"bitband", 0x02000000);
sysbus_init_mmio(dev, &s->iomem);
return 0;
}
static void armv7m_bitband_init(void)
@@ -244,9 +244,7 @@ static Property bitband_properties[] = {
static void bitband_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
k->init = bitband_init;
dc->props = bitband_properties;
}
@@ -254,6 +252,7 @@ static const TypeInfo bitband_info = {
.name = TYPE_BITBAND,
.parent = TYPE_SYS_BUS_DEVICE,
.instance_size = sizeof(BitBandState),
.instance_init = bitband_init,
.class_init = bitband_class_init,
};


@@ -17,6 +17,7 @@
#include "exec/address-spaces.h"
#include "hw/arm/ast2400.h"
#include "hw/char/serial.h"
#include "qemu/log.h"
#define AST2400_UART_5_BASE 0x00184000
#define AST2400_IOMEM_SIZE 0x00200000


@@ -14,6 +14,7 @@
#include "hw/arm/linux-boot-if.h"
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "sysemu/numa.h"
#include "hw/boards.h"
#include "hw/loader.h"
#include "elf.h"
@@ -68,7 +69,7 @@ static const ARMInsnFixup bootloader_aarch64[] = {
*/
static const ARMInsnFixup bootloader[] = {
{ 0xe28fe008 }, /* add lr, pc, #8 */
{ 0xe28fe004 }, /* add lr, pc, #4 */
{ 0xe51ff004 }, /* ldr pc, [pc, #-4] */
{ 0, FIXUP_BOARD_SETUP },
#define BOOTLOADER_NO_BOARD_SETUP_OFFSET 3
@@ -405,6 +406,9 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
void *fdt = NULL;
int size, rc;
uint32_t acells, scells;
char *nodename;
unsigned int i;
hwaddr mem_base, mem_len;
if (binfo->dtb_filename) {
char *filename;
@@ -456,12 +460,39 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
goto fail;
}
rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg",
acells, binfo->loader_start,
scells, binfo->ram_size);
if (rc < 0) {
fprintf(stderr, "couldn't set /memory/reg\n");
goto fail;
if (nb_numa_nodes > 0) {
/*
* Turn the /memory node created before into a NOP node, then create
* /memory@addr nodes for all numa nodes respectively.
*/
qemu_fdt_nop_node(fdt, "/memory");
mem_base = binfo->loader_start;
for (i = 0; i < nb_numa_nodes; i++) {
mem_len = numa_info[i].node_mem;
nodename = g_strdup_printf("/memory@%" PRIx64, mem_base);
qemu_fdt_add_subnode(fdt, nodename);
qemu_fdt_setprop_string(fdt, nodename, "device_type", "memory");
rc = qemu_fdt_setprop_sized_cells(fdt, nodename, "reg",
acells, mem_base,
scells, mem_len);
if (rc < 0) {
fprintf(stderr, "couldn't set %s/reg for node %d\n", nodename,
i);
goto fail;
}
qemu_fdt_setprop_cell(fdt, nodename, "numa-node-id", i);
mem_base += mem_len;
g_free(nodename);
}
} else {
rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg",
acells, binfo->loader_start,
scells, binfo->ram_size);
if (rc < 0) {
fprintf(stderr, "couldn't set /memory/reg\n");
goto fail;
}
}
if (binfo->kernel_cmdline && *binfo->kernel_cmdline) {


@@ -18,6 +18,7 @@
#include "hw/block/flash.h"
#include "sysemu/block-backend.h"
#include "exec/address-spaces.h"
#include "qom/cpu.h"
static struct arm_boot_info collie_binfo = {
.loader_start = SA_SDCS0,

hw/arm/fsl-imx6.c

@@ -0,0 +1,449 @@
/*
* Copyright (c) 2015 Jean-Christophe Dubois <jcd@tribudubois.net>
*
* i.MX6 SOC emulation.
*
* Based on hw/arm/fsl-imx31.c
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "hw/arm/fsl-imx6.h"
#include "sysemu/sysemu.h"
#include "sysemu/char.h"
#include "qemu/error-report.h"
#define NAME_SIZE 20
static void fsl_imx6_init(Object *obj)
{
FslIMX6State *s = FSL_IMX6(obj);
char name[NAME_SIZE];
int i;
if (smp_cpus > FSL_IMX6_NUM_CPUS) {
error_report("%s: Only %d CPUs are supported (%d requested)",
TYPE_FSL_IMX6, FSL_IMX6_NUM_CPUS, smp_cpus);
exit(1);
}
for (i = 0; i < smp_cpus; i++) {
object_initialize(&s->cpu[i], sizeof(s->cpu[i]),
"cortex-a9-" TYPE_ARM_CPU);
snprintf(name, NAME_SIZE, "cpu%d", i);
object_property_add_child(obj, name, OBJECT(&s->cpu[i]), NULL);
}
object_initialize(&s->a9mpcore, sizeof(s->a9mpcore), TYPE_A9MPCORE_PRIV);
qdev_set_parent_bus(DEVICE(&s->a9mpcore), sysbus_get_default());
object_property_add_child(obj, "a9mpcore", OBJECT(&s->a9mpcore), NULL);
object_initialize(&s->ccm, sizeof(s->ccm), TYPE_IMX6_CCM);
qdev_set_parent_bus(DEVICE(&s->ccm), sysbus_get_default());
object_property_add_child(obj, "ccm", OBJECT(&s->ccm), NULL);
object_initialize(&s->src, sizeof(s->src), TYPE_IMX6_SRC);
qdev_set_parent_bus(DEVICE(&s->src), sysbus_get_default());
object_property_add_child(obj, "src", OBJECT(&s->src), NULL);
for (i = 0; i < FSL_IMX6_NUM_UARTS; i++) {
object_initialize(&s->uart[i], sizeof(s->uart[i]), TYPE_IMX_SERIAL);
qdev_set_parent_bus(DEVICE(&s->uart[i]), sysbus_get_default());
snprintf(name, NAME_SIZE, "uart%d", i + 1);
object_property_add_child(obj, name, OBJECT(&s->uart[i]), NULL);
}
object_initialize(&s->gpt, sizeof(s->gpt), TYPE_IMX_GPT);
qdev_set_parent_bus(DEVICE(&s->gpt), sysbus_get_default());
object_property_add_child(obj, "gpt", OBJECT(&s->gpt), NULL);
for (i = 0; i < FSL_IMX6_NUM_EPITS; i++) {
object_initialize(&s->epit[i], sizeof(s->epit[i]), TYPE_IMX_EPIT);
qdev_set_parent_bus(DEVICE(&s->epit[i]), sysbus_get_default());
snprintf(name, NAME_SIZE, "epit%d", i + 1);
object_property_add_child(obj, name, OBJECT(&s->epit[i]), NULL);
}
for (i = 0; i < FSL_IMX6_NUM_I2CS; i++) {
object_initialize(&s->i2c[i], sizeof(s->i2c[i]), TYPE_IMX_I2C);
qdev_set_parent_bus(DEVICE(&s->i2c[i]), sysbus_get_default());
snprintf(name, NAME_SIZE, "i2c%d", i + 1);
object_property_add_child(obj, name, OBJECT(&s->i2c[i]), NULL);
}
for (i = 0; i < FSL_IMX6_NUM_GPIOS; i++) {
object_initialize(&s->gpio[i], sizeof(s->gpio[i]), TYPE_IMX_GPIO);
        qdev_set_parent_bus(DEVICE(&s->gpio[i]), sysbus_get_default());
        snprintf(name, NAME_SIZE, "gpio%d", i + 1);
        object_property_add_child(obj, name, OBJECT(&s->gpio[i]), NULL);
    }

    for (i = 0; i < FSL_IMX6_NUM_ESDHCS; i++) {
        object_initialize(&s->esdhc[i], sizeof(s->esdhc[i]), TYPE_SYSBUS_SDHCI);
        qdev_set_parent_bus(DEVICE(&s->esdhc[i]), sysbus_get_default());
        snprintf(name, NAME_SIZE, "sdhc%d", i + 1);
        object_property_add_child(obj, name, OBJECT(&s->esdhc[i]), NULL);
    }

    for (i = 0; i < FSL_IMX6_NUM_ECSPIS; i++) {
        object_initialize(&s->spi[i], sizeof(s->spi[i]), TYPE_IMX_SPI);
        qdev_set_parent_bus(DEVICE(&s->spi[i]), sysbus_get_default());
        snprintf(name, NAME_SIZE, "spi%d", i + 1);
        object_property_add_child(obj, name, OBJECT(&s->spi[i]), NULL);
    }
}

static void fsl_imx6_realize(DeviceState *dev, Error **errp)
{
    FslIMX6State *s = FSL_IMX6(dev);
    uint16_t i;
    Error *err = NULL;

    for (i = 0; i < smp_cpus; i++) {
        /* On uniprocessor, the CBAR is set to 0 */
        if (smp_cpus > 1) {
            object_property_set_int(OBJECT(&s->cpu[i]), FSL_IMX6_A9MPCORE_ADDR,
                                    "reset-cbar", &error_abort);
        }

        /* All CPU but CPU 0 start in power off mode */
        if (i) {
            object_property_set_bool(OBJECT(&s->cpu[i]), true,
                                     "start-powered-off", &error_abort);
        }

        object_property_set_bool(OBJECT(&s->cpu[i]), true, "realized", &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
    }

    object_property_set_int(OBJECT(&s->a9mpcore), smp_cpus, "num-cpu",
                            &error_abort);
    object_property_set_int(OBJECT(&s->a9mpcore),
                            FSL_IMX6_MAX_IRQ + GIC_INTERNAL, "num-irq",
                            &error_abort);

    object_property_set_bool(OBJECT(&s->a9mpcore), true, "realized", &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    sysbus_mmio_map(SYS_BUS_DEVICE(&s->a9mpcore), 0, FSL_IMX6_A9MPCORE_ADDR);

    for (i = 0; i < smp_cpus; i++) {
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->a9mpcore), i,
                           qdev_get_gpio_in(DEVICE(&s->cpu[i]), ARM_CPU_IRQ));
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->a9mpcore), i + smp_cpus,
                           qdev_get_gpio_in(DEVICE(&s->cpu[i]), ARM_CPU_FIQ));
    }

    object_property_set_bool(OBJECT(&s->ccm), true, "realized", &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    sysbus_mmio_map(SYS_BUS_DEVICE(&s->ccm), 0, FSL_IMX6_CCM_ADDR);

    object_property_set_bool(OBJECT(&s->src), true, "realized", &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    sysbus_mmio_map(SYS_BUS_DEVICE(&s->src), 0, FSL_IMX6_SRC_ADDR);

    /* Initialize all UARTs */
    for (i = 0; i < FSL_IMX6_NUM_UARTS; i++) {
        static const struct {
            hwaddr addr;
            unsigned int irq;
        } serial_table[FSL_IMX6_NUM_UARTS] = {
            { FSL_IMX6_UART1_ADDR, FSL_IMX6_UART1_IRQ },
            { FSL_IMX6_UART2_ADDR, FSL_IMX6_UART2_IRQ },
            { FSL_IMX6_UART3_ADDR, FSL_IMX6_UART3_IRQ },
            { FSL_IMX6_UART4_ADDR, FSL_IMX6_UART4_IRQ },
            { FSL_IMX6_UART5_ADDR, FSL_IMX6_UART5_IRQ },
        };

        if (i < MAX_SERIAL_PORTS) {
            CharDriverState *chr;

            chr = serial_hds[i];
            if (!chr) {
                char *label = g_strdup_printf("imx6.uart%d", i + 1);
                chr = qemu_chr_new(label, "null", NULL);
                g_free(label);
                serial_hds[i] = chr;
            }

            qdev_prop_set_chr(DEVICE(&s->uart[i]), "chardev", chr);
        }

        object_property_set_bool(OBJECT(&s->uart[i]), true, "realized", &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }

        sysbus_mmio_map(SYS_BUS_DEVICE(&s->uart[i]), 0, serial_table[i].addr);
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->uart[i]), 0,
                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
                                            serial_table[i].irq));
    }

    s->gpt.ccm = IMX_CCM(&s->ccm);

    object_property_set_bool(OBJECT(&s->gpt), true, "realized", &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }

    sysbus_mmio_map(SYS_BUS_DEVICE(&s->gpt), 0, FSL_IMX6_GPT_ADDR);
    sysbus_connect_irq(SYS_BUS_DEVICE(&s->gpt), 0,
                       qdev_get_gpio_in(DEVICE(&s->a9mpcore),
                                        FSL_IMX6_GPT_IRQ));

    /* Initialize all EPIT timers */
    for (i = 0; i < FSL_IMX6_NUM_EPITS; i++) {
        static const struct {
            hwaddr addr;
            unsigned int irq;
        } epit_table[FSL_IMX6_NUM_EPITS] = {
            { FSL_IMX6_EPIT1_ADDR, FSL_IMX6_EPIT1_IRQ },
            { FSL_IMX6_EPIT2_ADDR, FSL_IMX6_EPIT2_IRQ },
        };

        s->epit[i].ccm = IMX_CCM(&s->ccm);

        object_property_set_bool(OBJECT(&s->epit[i]), true, "realized", &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }

        sysbus_mmio_map(SYS_BUS_DEVICE(&s->epit[i]), 0, epit_table[i].addr);
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->epit[i]), 0,
                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
                                            epit_table[i].irq));
    }

    /* Initialize all I2C */
    for (i = 0; i < FSL_IMX6_NUM_I2CS; i++) {
        static const struct {
            hwaddr addr;
            unsigned int irq;
        } i2c_table[FSL_IMX6_NUM_I2CS] = {
            { FSL_IMX6_I2C1_ADDR, FSL_IMX6_I2C1_IRQ },
            { FSL_IMX6_I2C2_ADDR, FSL_IMX6_I2C2_IRQ },
            { FSL_IMX6_I2C3_ADDR, FSL_IMX6_I2C3_IRQ }
        };

        object_property_set_bool(OBJECT(&s->i2c[i]), true, "realized", &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }

        sysbus_mmio_map(SYS_BUS_DEVICE(&s->i2c[i]), 0, i2c_table[i].addr);
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->i2c[i]), 0,
                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
                                            i2c_table[i].irq));
    }

    /* Initialize all GPIOs */
    for (i = 0; i < FSL_IMX6_NUM_GPIOS; i++) {
        static const struct {
            hwaddr addr;
            unsigned int irq_low;
            unsigned int irq_high;
        } gpio_table[FSL_IMX6_NUM_GPIOS] = {
            {
                FSL_IMX6_GPIO1_ADDR,
                FSL_IMX6_GPIO1_LOW_IRQ,
                FSL_IMX6_GPIO1_HIGH_IRQ
            },
            {
                FSL_IMX6_GPIO2_ADDR,
                FSL_IMX6_GPIO2_LOW_IRQ,
                FSL_IMX6_GPIO2_HIGH_IRQ
            },
            {
                FSL_IMX6_GPIO3_ADDR,
                FSL_IMX6_GPIO3_LOW_IRQ,
                FSL_IMX6_GPIO3_HIGH_IRQ
            },
            {
                FSL_IMX6_GPIO4_ADDR,
                FSL_IMX6_GPIO4_LOW_IRQ,
                FSL_IMX6_GPIO4_HIGH_IRQ
            },
            {
                FSL_IMX6_GPIO5_ADDR,
                FSL_IMX6_GPIO5_LOW_IRQ,
                FSL_IMX6_GPIO5_HIGH_IRQ
            },
            {
                FSL_IMX6_GPIO6_ADDR,
                FSL_IMX6_GPIO6_LOW_IRQ,
                FSL_IMX6_GPIO6_HIGH_IRQ
            },
            {
                FSL_IMX6_GPIO7_ADDR,
                FSL_IMX6_GPIO7_LOW_IRQ,
                FSL_IMX6_GPIO7_HIGH_IRQ
            },
        };

        object_property_set_bool(OBJECT(&s->gpio[i]), true, "has-edge-sel",
                                 &error_abort);
        object_property_set_bool(OBJECT(&s->gpio[i]), true, "has-upper-pin-irq",
                                 &error_abort);
        object_property_set_bool(OBJECT(&s->gpio[i]), true, "realized", &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }

        sysbus_mmio_map(SYS_BUS_DEVICE(&s->gpio[i]), 0, gpio_table[i].addr);
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->gpio[i]), 0,
                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
                                            gpio_table[i].irq_low));
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->gpio[i]), 1,
                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
                                            gpio_table[i].irq_high));
    }

    /* Initialize all SDHC */
    for (i = 0; i < FSL_IMX6_NUM_ESDHCS; i++) {
        static const struct {
            hwaddr addr;
            unsigned int irq;
        } esdhc_table[FSL_IMX6_NUM_ESDHCS] = {
            { FSL_IMX6_uSDHC1_ADDR, FSL_IMX6_uSDHC1_IRQ },
            { FSL_IMX6_uSDHC2_ADDR, FSL_IMX6_uSDHC2_IRQ },
            { FSL_IMX6_uSDHC3_ADDR, FSL_IMX6_uSDHC3_IRQ },
            { FSL_IMX6_uSDHC4_ADDR, FSL_IMX6_uSDHC4_IRQ },
        };

        object_property_set_bool(OBJECT(&s->esdhc[i]), true, "realized", &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
        sysbus_mmio_map(SYS_BUS_DEVICE(&s->esdhc[i]), 0, esdhc_table[i].addr);
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->esdhc[i]), 0,
                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
                                            esdhc_table[i].irq));
    }

    /* Initialize all ECSPI */
    for (i = 0; i < FSL_IMX6_NUM_ECSPIS; i++) {
        static const struct {
            hwaddr addr;
            unsigned int irq;
        } spi_table[FSL_IMX6_NUM_ECSPIS] = {
            { FSL_IMX6_eCSPI1_ADDR, FSL_IMX6_ECSPI1_IRQ },
            { FSL_IMX6_eCSPI2_ADDR, FSL_IMX6_ECSPI2_IRQ },
            { FSL_IMX6_eCSPI3_ADDR, FSL_IMX6_ECSPI3_IRQ },
            { FSL_IMX6_eCSPI4_ADDR, FSL_IMX6_ECSPI4_IRQ },
            { FSL_IMX6_eCSPI5_ADDR, FSL_IMX6_ECSPI5_IRQ },
        };

        /* Initialize the SPI */
        object_property_set_bool(OBJECT(&s->spi[i]), true, "realized", &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }

        sysbus_mmio_map(SYS_BUS_DEVICE(&s->spi[i]), 0, spi_table[i].addr);
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->spi[i]), 0,
                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
                                            spi_table[i].irq));
    }

    /* ROM memory */
    memory_region_init_rom_device(&s->rom, NULL, NULL, NULL, "imx6.rom",
                                  FSL_IMX6_ROM_SIZE, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    memory_region_add_subregion(get_system_memory(), FSL_IMX6_ROM_ADDR,
                                &s->rom);

    /* CAAM memory */
    memory_region_init_rom_device(&s->caam, NULL, NULL, NULL, "imx6.caam",
                                  FSL_IMX6_CAAM_MEM_SIZE, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    memory_region_add_subregion(get_system_memory(), FSL_IMX6_CAAM_MEM_ADDR,
                                &s->caam);

    /* OCRAM memory */
    memory_region_init_ram(&s->ocram, NULL, "imx6.ocram", FSL_IMX6_OCRAM_SIZE,
                           &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    memory_region_add_subregion(get_system_memory(), FSL_IMX6_OCRAM_ADDR,
                                &s->ocram);
    vmstate_register_ram_global(&s->ocram);

    /* internal OCRAM (256 KB) is aliased over 1 MB */
    memory_region_init_alias(&s->ocram_alias, NULL, "imx6.ocram_alias",
                             &s->ocram, 0, FSL_IMX6_OCRAM_ALIAS_SIZE);
    memory_region_add_subregion(get_system_memory(), FSL_IMX6_OCRAM_ALIAS_ADDR,
                                &s->ocram_alias);
}

static void fsl_imx6_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);

    dc->realize = fsl_imx6_realize;

    /*
     * Reason: creates an ARM CPU, thus use after free(), see
     * arm_cpu_class_init()
     */
    dc->cannot_destroy_with_object_finalize_yet = true;
    dc->desc = "i.MX6 SOC";
}

static const TypeInfo fsl_imx6_type_info = {
    .name = TYPE_FSL_IMX6,
    .parent = TYPE_DEVICE,
    .instance_size = sizeof(FslIMX6State),
    .instance_init = fsl_imx6_init,
    .class_init = fsl_imx6_class_init,
};

static void fsl_imx6_register_types(void)
{
    type_register_static(&fsl_imx6_type_info);
}

type_init(fsl_imx6_register_types)
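
The SoC code above applies one pattern to every on-chip peripheral: each child device is created and given a QOM name in the instance_init function, and realize() then sets each child's "realized" property, propagating any failure to the caller before mapping the MMIO region and wiring interrupts. Below is a minimal sketch of that pattern, assuming QEMU's QOM and sysbus headers; FooSoCState, FOO_SOC, TYPE_FOO_CHILD and FOO_CHILD_MMIO_ADDR are hypothetical placeholders, while the calls themselves (object_initialize, object_property_set_bool, error_propagate, sysbus_mmio_map) are the same ones fsl-imx6.c uses.

/* Minimal sketch only: FooSoCState / FOO_SOC / TYPE_FOO_CHILD are made-up names. */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/sysbus.h"

static void foo_soc_init(Object *obj)
{
    FooSoCState *s = FOO_SOC(obj);

    /* instance_init: create the child and attach it to the QOM tree */
    object_initialize(&s->child, sizeof(s->child), TYPE_FOO_CHILD);
    qdev_set_parent_bus(DEVICE(&s->child), sysbus_get_default());
    object_property_add_child(obj, "child", OBJECT(&s->child), NULL);
}

static void foo_soc_realize(DeviceState *dev, Error **errp)
{
    FooSoCState *s = FOO_SOC(dev);
    Error *err = NULL;

    /* realize: realize the child and bail out early if that fails */
    object_property_set_bool(OBJECT(&s->child), true, "realized", &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }

    /* only then map its registers (and, in the real code, wire its IRQs) */
    sysbus_mmio_map(SYS_BUS_DEVICE(&s->child), 0, FOO_CHILD_MMIO_ADDR);
}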


@@ -168,23 +168,20 @@ static void highbank_regs_reset(DeviceState *dev)
     s->regs[0x43] = 0x05F40121;
 }

-static int highbank_regs_init(SysBusDevice *dev)
+static void highbank_regs_init(Object *obj)
 {
-    HighbankRegsState *s = HIGHBANK_REGISTERS(dev);
+    HighbankRegsState *s = HIGHBANK_REGISTERS(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);

-    memory_region_init_io(&s->iomem, OBJECT(s), &hb_mem_ops, s->regs,
+    memory_region_init_io(&s->iomem, obj, &hb_mem_ops, s->regs,
                           "highbank_regs", 0x1000);
     sysbus_init_mmio(dev, &s->iomem);
-    return 0;
 }

 static void highbank_regs_class_init(ObjectClass *klass, void *data)
 {
-    SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass);
     DeviceClass *dc = DEVICE_CLASS(klass);

-    sbc->init = highbank_regs_init;
     dc->desc = "Calxeda Highbank registers";
     dc->vmsd = &vmstate_highbank_regs;
     dc->reset = highbank_regs_reset;
@@ -194,6 +191,7 @@ static const TypeInfo highbank_regs_info = {
     .name = TYPE_HIGHBANK_REGISTERS,
     .parent = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(HighbankRegsState),
+    .instance_init = highbank_regs_init,
     .class_init = highbank_regs_class_init,
 };
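
This highbank hunk, and the integratorcp, pxa2xx and spitz hunks that follow, all apply the same mechanical conversion: the SysBusDeviceClass::init hook (an int-returning callback set in class_init) is replaced by a void TypeInfo::instance_init function that receives the bare Object and recovers the views it needs with DEVICE()/SYS_BUS_DEVICE() casts; the pxa2xx_pic hunk simply drops an init hook that did nothing at all. A before/after sketch with a hypothetical "foo" device (FooState, FOO, foo_ops and TYPE_FOO are placeholders):

/* Old style: SysBusDeviceClass::init, wired up in class_init. */
static int foo_init(SysBusDevice *dev)
{
    FooState *s = FOO(dev);

    memory_region_init_io(&s->iomem, OBJECT(s), &foo_ops, s, "foo", 0x1000);
    sysbus_init_mmio(dev, &s->iomem);
    return 0;
}

static void foo_class_init(ObjectClass *klass, void *data)
{
    SYS_BUS_DEVICE_CLASS(klass)->init = foo_init;
}

/* New style: a void instance_init registered directly in the TypeInfo. */
static void foo_instance_init(Object *obj)
{
    FooState *s = FOO(obj);
    SysBusDevice *dev = SYS_BUS_DEVICE(obj);

    memory_region_init_io(&s->iomem, obj, &foo_ops, s, "foo", 0x1000);
    sysbus_init_mmio(dev, &s->iomem);
}

static const TypeInfo foo_info = {
    .name          = TYPE_FOO,
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(FooState),
    .instance_init = foo_instance_init,
};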


@@ -242,9 +242,10 @@ static const MemoryRegionOps integratorcm_ops = {
     .endianness = DEVICE_NATIVE_ENDIAN,
 };

-static int integratorcm_init(SysBusDevice *dev)
+static void integratorcm_init(Object *obj)
 {
-    IntegratorCMState *s = INTEGRATOR_CM(dev);
+    IntegratorCMState *s = INTEGRATOR_CM(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);

     s->cm_osc = 0x01000048;
     /* ??? What should the high bits of this value be? */
@@ -269,17 +270,16 @@ static int integratorcm_init(SysBusDevice *dev)
     s->cm_init = 0x00000112;
     s->cm_refcnt_offset = muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 24,
                                    1000);
-    memory_region_init_ram(&s->flash, OBJECT(s), "integrator.flash", 0x100000,
+    memory_region_init_ram(&s->flash, obj, "integrator.flash", 0x100000,
                            &error_fatal);
     vmstate_register_ram_global(&s->flash);

-    memory_region_init_io(&s->iomem, OBJECT(s), &integratorcm_ops, s,
+    memory_region_init_io(&s->iomem, obj, &integratorcm_ops, s,
                           "integratorcm", 0x00800000);
     sysbus_init_mmio(dev, &s->iomem);

     integratorcm_do_remap(s);
     /* ??? Save/restore. */
-    return 0;
 }

 /* Integrator/CP hardware emulation. */
@@ -394,18 +394,18 @@ static const MemoryRegionOps icp_pic_ops = {
     .endianness = DEVICE_NATIVE_ENDIAN,
 };

-static int icp_pic_init(SysBusDevice *sbd)
+static void icp_pic_init(Object *obj)
 {
-    DeviceState *dev = DEVICE(sbd);
-    icp_pic_state *s = INTEGRATOR_PIC(dev);
+    DeviceState *dev = DEVICE(obj);
+    icp_pic_state *s = INTEGRATOR_PIC(obj);
+    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);

     qdev_init_gpio_in(dev, icp_pic_set_irq, 32);
     sysbus_init_irq(sbd, &s->parent_irq);
     sysbus_init_irq(sbd, &s->parent_fiq);
-    memory_region_init_io(&s->iomem, OBJECT(s), &icp_pic_ops, s,
+    memory_region_init_io(&s->iomem, obj, &icp_pic_ops, s,
                           "icp-pic", 0x00800000);
     sysbus_init_mmio(sbd, &s->iomem);
-    return 0;
 }

 /* CP control registers. */
@@ -630,9 +630,7 @@ static Property core_properties[] = {
 static void core_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = integratorcm_init;
     dc->props = core_properties;
 }
@@ -640,21 +638,15 @@ static const TypeInfo core_info = {
     .name = TYPE_INTEGRATOR_CM,
     .parent = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(IntegratorCMState),
+    .instance_init = integratorcm_init,
     .class_init = core_class_init,
 };

-static void icp_pic_class_init(ObjectClass *klass, void *data)
-{
-    SysBusDeviceClass *sdc = SYS_BUS_DEVICE_CLASS(klass);
-
-    sdc->init = icp_pic_init;
-}
-
 static const TypeInfo icp_pic_info = {
     .name = TYPE_INTEGRATOR_PIC,
     .parent = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(icp_pic_state),
-    .class_init = icp_pic_class_init,
+    .instance_init = icp_pic_init,
 };

 static const TypeInfo icp_ctrl_regs_info = {


@@ -20,7 +20,9 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "cpu.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "sysemu/sysemu.h"
#include "hw/arm/omap.h"
#include "hw/arm/arm.h"
@@ -35,6 +37,7 @@
#include "hw/loader.h"
#include "sysemu/block-backend.h"
#include "hw/sysbus.h"
#include "qemu/log.h"
#include "exec/address-spaces.h"
/* Nokia N8x0 support */
@@ -1364,7 +1367,7 @@ static void n8x0_init(MachineState *machine,
     if (option_rom[0].name &&
         (machine->boot_order[0] == 'n' || !machine->kernel_filename)) {
-        uint8_t nolo_tags[0x10000];
+        uint8_t *nolo_tags = g_new(uint8_t, 0x10000);

         /* No, wait, better start at the ROM. */
         s->mpu->cpu->env.regs[15] = OMAP2_Q2_BASE + 0x400000;
@@ -1383,6 +1386,7 @@ static void n8x0_init(MachineState *machine,
         n800_setup_nolo_tags(nolo_tags);
         cpu_physical_memory_write(OMAP2_SRAM_BASE, nolo_tags, 0x10000);
+        g_free(nolo_tags);
     }
 }
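
The nseries change above swaps a 64 KiB on-stack array for a heap allocation that is freed once the buffer has been written out, so the temporary no longer has to fit on the calling thread's stack. A standalone sketch of the same idiom using GLib's g_new()/g_free(); fill_tags() is a hypothetical stand-in for n800_setup_nolo_tags():

#include <glib.h>
#include <stdint.h>

static void fill_tags(uint8_t *buf)
{
    /* stand-in for n800_setup_nolo_tags() */
    buf[0] = 0;
}

static void write_tags_example(void)
{
    /* was: uint8_t tags[0x10000]; -- 64 KiB on the stack */
    uint8_t *tags = g_new(uint8_t, 0x10000);

    fill_tags(tags);
    /* the real code hands the buffer to cpu_physical_memory_write() here */
    g_free(tags);
}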


@@ -17,6 +17,7 @@
#include "hw/arm/arm.h"
#include "hw/arm/ast2400.h"
#include "hw/boards.h"
#include "qemu/log.h"
static struct arm_boot_info palmetto_bmc_binfo = {
.loader_start = AST2400_SDRAM_BASE,


@@ -1107,9 +1107,10 @@ static const MemoryRegionOps pxa2xx_rtc_ops = {
     .endianness = DEVICE_NATIVE_ENDIAN,
 };

-static int pxa2xx_rtc_init(SysBusDevice *dev)
+static void pxa2xx_rtc_init(Object *obj)
 {
-    PXA2xxRTCState *s = PXA2XX_RTC(dev);
+    PXA2xxRTCState *s = PXA2XX_RTC(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
     struct tm tm;
     int wom;
@@ -1138,11 +1139,9 @@ static int pxa2xx_rtc_init(SysBusDevice *dev)
     sysbus_init_irq(dev, &s->rtc_irq);

-    memory_region_init_io(&s->iomem, OBJECT(s), &pxa2xx_rtc_ops, s,
+    memory_region_init_io(&s->iomem, obj, &pxa2xx_rtc_ops, s,
                           "pxa2xx-rtc", 0x10000);
     sysbus_init_mmio(dev, &s->iomem);
-
-    return 0;
 }

 static void pxa2xx_rtc_pre_save(void *opaque)
@@ -1195,9 +1194,7 @@ static const VMStateDescription vmstate_pxa2xx_rtc_regs = {
 static void pxa2xx_rtc_sysbus_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = pxa2xx_rtc_init;
     dc->desc = "PXA2xx RTC Controller";
     dc->vmsd = &vmstate_pxa2xx_rtc_regs;
 }
@@ -1206,6 +1203,7 @@ static const TypeInfo pxa2xx_rtc_sysbus_info = {
     .name = TYPE_PXA2XX_RTC,
     .parent = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(PXA2xxRTCState),
+    .instance_init = pxa2xx_rtc_init,
     .class_init = pxa2xx_rtc_sysbus_class_init,
 };
@@ -1501,19 +1499,18 @@ PXA2xxI2CState *pxa2xx_i2c_init(hwaddr base,
     return s;
 }

-static int pxa2xx_i2c_initfn(SysBusDevice *sbd)
+static void pxa2xx_i2c_initfn(Object *obj)
 {
-    DeviceState *dev = DEVICE(sbd);
-    PXA2xxI2CState *s = PXA2XX_I2C(dev);
+    DeviceState *dev = DEVICE(obj);
+    PXA2xxI2CState *s = PXA2XX_I2C(obj);
+    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);

     s->bus = i2c_init_bus(dev, "i2c");

-    memory_region_init_io(&s->iomem, OBJECT(s), &pxa2xx_i2c_ops, s,
+    memory_region_init_io(&s->iomem, obj, &pxa2xx_i2c_ops, s,
                           "pxa2xx-i2c", s->region_size);
     sysbus_init_mmio(sbd, &s->iomem);
     sysbus_init_irq(sbd, &s->irq);
-
-    return 0;
 }

 I2CBus *pxa2xx_i2c_bus(PXA2xxI2CState *s)
@@ -1530,9 +1527,7 @@ static Property pxa2xx_i2c_properties[] = {
 static void pxa2xx_i2c_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = pxa2xx_i2c_initfn;
     dc->desc = "PXA2xx I2C Bus Controller";
     dc->vmsd = &vmstate_pxa2xx_i2c;
     dc->props = pxa2xx_i2c_properties;
@@ -1542,6 +1537,7 @@ static const TypeInfo pxa2xx_i2c_info = {
     .name = TYPE_PXA2XX_I2C,
     .parent = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(PXA2xxI2CState),
+    .instance_init = pxa2xx_i2c_initfn,
     .class_init = pxa2xx_i2c_class_init,
 };


@@ -8,9 +8,11 @@
*/
#include "qemu/osdep.h"
#include "cpu.h"
#include "hw/hw.h"
#include "hw/sysbus.h"
#include "hw/arm/pxa.h"
#include "qemu/log.h"
#define PXA2XX_GPIO_BANKS 4


@@ -310,17 +310,10 @@ static VMStateDescription vmstate_pxa2xx_pic_regs = {
     },
 };

-static int pxa2xx_pic_initfn(SysBusDevice *dev)
-{
-    return 0;
-}
-
 static void pxa2xx_pic_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = pxa2xx_pic_initfn;
     dc->desc = "PXA2xx PIC";
     dc->vmsd = &vmstate_pxa2xx_pic_regs;
 }

hw/arm/sabrelite.c (new file, 121 lines)

@@ -0,0 +1,121 @@
/*
 * SABRELITE Board System emulation.
 *
 * Copyright (c) 2015 Jean-Christophe Dubois <jcd@tribudubois.net>
 *
 * This code is licensed under the GPL, version 2 or later.
 * See the file `COPYING' in the top level directory.
 *
 * It (partially) emulates a sabrelite board, with a Freescale
 * i.MX6 SoC
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "hw/arm/fsl-imx6.h"
#include "hw/boards.h"
#include "sysemu/sysemu.h"
#include "qemu/error-report.h"
#include "sysemu/qtest.h"

typedef struct IMX6Sabrelite {
    FslIMX6State soc;
    MemoryRegion ram;
} IMX6Sabrelite;

static struct arm_boot_info sabrelite_binfo = {
    /* DDR memory start */
    .loader_start = FSL_IMX6_MMDC_ADDR,
    /* No board ID, we boot from DT tree */
    .board_id = -1,
};

/* No need to do any particular setup for secondary boot */
static void sabrelite_write_secondary(ARMCPU *cpu,
                                      const struct arm_boot_info *info)
{
}

/* Secondary cores are reset through SRC device */
static void sabrelite_reset_secondary(ARMCPU *cpu,
                                      const struct arm_boot_info *info)
{
}

static void sabrelite_init(MachineState *machine)
{
    IMX6Sabrelite *s = g_new0(IMX6Sabrelite, 1);
    Error *err = NULL;

    /* Check the amount of memory is compatible with the SOC */
    if (machine->ram_size > FSL_IMX6_MMDC_SIZE) {
        error_report("RAM size " RAM_ADDR_FMT " above max supported (%08x)",
                     machine->ram_size, FSL_IMX6_MMDC_SIZE);
        exit(1);
    }

    object_initialize(&s->soc, sizeof(s->soc), TYPE_FSL_IMX6);
    object_property_add_child(OBJECT(machine), "soc", OBJECT(&s->soc),
                              &error_abort);

    object_property_set_bool(OBJECT(&s->soc), true, "realized", &err);
    if (err != NULL) {
        error_report("%s", error_get_pretty(err));
        exit(1);
    }

    memory_region_allocate_system_memory(&s->ram, NULL, "sabrelite.ram",
                                         machine->ram_size);
    memory_region_add_subregion(get_system_memory(), FSL_IMX6_MMDC_ADDR,
                                &s->ram);

    {
        /*
         * TODO: Ideally we would expose the chip select and spi bus on the
         * SoC object using alias properties; then we would not need to
         * directly access the underlying spi device object.
         */
        /* Add the sst25vf016b NOR FLASH memory to first SPI */
        Object *spi_dev;

        spi_dev = object_resolve_path_component(OBJECT(&s->soc), "spi1");
        if (spi_dev) {
            SSIBus *spi_bus;

            spi_bus = (SSIBus *)qdev_get_child_bus(DEVICE(spi_dev), "spi");
            if (spi_bus) {
                DeviceState *flash_dev;

                flash_dev = ssi_create_slave(spi_bus, "sst25vf016b");
                if (flash_dev) {
                    qemu_irq cs_line = qdev_get_gpio_in_named(flash_dev,
                                                              SSI_GPIO_CS, 0);
                    sysbus_connect_irq(SYS_BUS_DEVICE(spi_dev), 1, cs_line);
                }
            }
        }
    }

    sabrelite_binfo.ram_size = machine->ram_size;
    sabrelite_binfo.kernel_filename = machine->kernel_filename;
    sabrelite_binfo.kernel_cmdline = machine->kernel_cmdline;
    sabrelite_binfo.initrd_filename = machine->initrd_filename;
    sabrelite_binfo.nb_cpus = smp_cpus;
    sabrelite_binfo.secure_boot = true;
    sabrelite_binfo.write_secondary_boot = sabrelite_write_secondary;
    sabrelite_binfo.secondary_cpu_reset_hook = sabrelite_reset_secondary;

    if (!qtest_enabled()) {
        arm_load_kernel(&s->soc.cpu[0], &sabrelite_binfo);
    }
}

static void sabrelite_machine_init(MachineClass *mc)
{
    mc->desc = "Freescale i.MX6 Quad SABRE Lite Board (Cortex A9)";
    mc->init = sabrelite_init;
    mc->max_cpus = FSL_IMX6_NUM_CPUS;
}

DEFINE_MACHINE("sabrelite", sabrelite_machine_init)
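
sabrelite.c ends with the standard machine registration boilerplate: a MachineClass initializer sets the description, init hook and CPU limit, and DEFINE_MACHINE() registers the board. For reference, a minimal sketch of the same pattern for a hypothetical board (all "myboard" names are placeholders):

#include "qemu/osdep.h"
#include "hw/boards.h"

static void myboard_init(MachineState *machine)
{
    /* allocate RAM, instantiate the SoC, load the kernel, ... */
}

static void myboard_machine_init(MachineClass *mc)
{
    mc->desc = "Hypothetical example board";
    mc->init = myboard_init;
    mc->max_cpus = 4;
}

DEFINE_MACHINE("myboard", myboard_machine_init)

Once registered this way, a board appears in -machine help and is selected with -machine <name>, exactly as "sabrelite" is above.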


@@ -164,9 +164,10 @@ static void sl_flash_register(PXA2xxState *cpu, int size)
     sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, FLASH_BASE);
 }

-static int sl_nand_init(SysBusDevice *dev)
+static void sl_nand_init(Object *obj)
 {
-    SLNANDState *s = SL_NAND(dev);
+    SLNANDState *s = SL_NAND(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
     DriveInfo *nand;

     s->ctl = 0;
@@ -175,10 +176,8 @@ static int sl_nand_init(SysBusDevice *dev)
     s->nand = nand_init(nand ? blk_by_legacy_dinfo(nand) : NULL,
                         s->manf_id, s->chip_id);

-    memory_region_init_io(&s->iomem, OBJECT(s), &sl_ops, s, "sl", 0x40);
+    memory_region_init_io(&s->iomem, obj, &sl_ops, s, "sl", 0x40);
     sysbus_init_mmio(dev, &s->iomem);
-
-    return 0;
 }

 /* Spitz Keyboard */
@@ -501,10 +500,10 @@ static void spitz_keyboard_register(PXA2xxState *cpu)
     qemu_add_kbd_event_handler(spitz_keyboard_handler, s);
 }

-static int spitz_keyboard_init(SysBusDevice *sbd)
+static void spitz_keyboard_init(Object *obj)
 {
-    DeviceState *dev = DEVICE(sbd);
-    SpitzKeyboardState *s = SPITZ_KEYBOARD(dev);
+    DeviceState *dev = DEVICE(obj);
+    SpitzKeyboardState *s = SPITZ_KEYBOARD(obj);
     int i, j;

     for (i = 0; i < 0x80; i ++)
@@ -519,8 +518,6 @@ static int spitz_keyboard_init(SysBusDevice *sbd)
     s->kbdtimer = timer_new_ns(QEMU_CLOCK_VIRTUAL, spitz_keyboard_tick, s);
     qdev_init_gpio_in(dev, spitz_keyboard_strobe, SPITZ_KEY_STROBE_NUM);
     qdev_init_gpio_out(dev, s->sense, SPITZ_KEY_SENSE_NUM);
-
-    return 0;
 }

 /* LCD backlight controller */
@@ -1065,9 +1062,7 @@ static Property sl_nand_properties[] = {
 static void sl_nand_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = sl_nand_init;
     dc->vmsd = &vmstate_sl_nand_info;
     dc->props = sl_nand_properties;
     /* Reason: init() method uses drive_get() */
@@ -1078,6 +1073,7 @@ static const TypeInfo sl_nand_info = {
     .name = TYPE_SL_NAND,
     .parent = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(SLNANDState),
+    .instance_init = sl_nand_init,
     .class_init = sl_nand_class_init,
 };
@@ -1097,9 +1093,7 @@ static VMStateDescription vmstate_spitz_kbd = {
 static void spitz_keyboard_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = spitz_keyboard_init;
     dc->vmsd = &vmstate_spitz_kbd;
 }
@@ -1107,6 +1101,7 @@ static const TypeInfo spitz_keyboard_info = {
     .name = TYPE_SPITZ_KEYBOARD,
     .parent = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(SpitzKeyboardState),
+    .instance_init = spitz_keyboard_init,
     .class_init = spitz_keyboard_class_init,
 };

Some files were not shown because too many files have changed in this diff.