vnc: add configurable keyboard delay

Limits the rate kbd events from the vnc server are forwarded to the guest, so input devices which are typically low-bandwidth can keep up even on bulky input. v2: update documentation too. v3: spell fixes. Signed-off-by: Gerd Hoffmann <kraxel@redhat.com> Tested-by: Yang Hongyang <hongyang.yang@easystack.cn> Message-id: 1464762150-25817-1-git-send-email-kraxel@redhat.com
sdl2: skip init without outputs
2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00
866 changed files with 30746 additions and 14867 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -108,4 +108,5 @@
 cscope.*
 tags
 TAGS
+docker-src.*
 *~
--- a/34
+++ b/34
@@ -165,6 +165,7 @@ F: hw/openrisc/
 F: tests/tcg/openrisc/

 PowerPC
+M: David Gibson <david@gibson.dropbear.id.au>
 M: Alexander Graf <agraf@suse.de>
 L: qemu-ppc@nongnu.org
 S: Maintained
@@ -597,7 +598,7 @@ F: hw/pci-host/grackle.c
 F: hw/misc/macio/

 PReP
-M: Andreas Färber <andreas.faerber@web.de>
+L: qemu-devel@nongnu.org
 L: qemu-ppc@nongnu.org
 S: Odd Fixes
 F: hw/ppc/prep.c
@@ -953,6 +954,14 @@ S: Maintained
 F: hw/*/xilinx_*
 F: include/hw/xilinx.h

+Network packet abstractions
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: include/net/eth.h
+F: net/eth.c
+F: hw/net/net_rx_pkt*
+F: hw/net/net_tx_pkt*
+
 Vmware
 M: Dmitry Fleytman <dmitry@daynix.com>
 S: Maintained
@@ -972,6 +981,16 @@ F: hw/acpi/nvdimm.c
 F: hw/mem/nvdimm.c
 F: include/hw/mem/nvdimm.h

+e1000x
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: hw/net/e1000x*
+
+e1000e
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: hw/net/e1000e*
+
 Subsystems
 ----------
 Audio
@@ -1046,7 +1065,7 @@ S: Supported
 F: scripts/coverity-model.c

 CPU
-M: Andreas Färber <afaerber@suse.de>
+L: qemu-devel@nongnu.org
 S: Supported
 F: qom/cpu.c
 F: include/qom/cpu.h
@@ -1105,7 +1124,6 @@ F: ui/
 F: include/ui/

 Cocoa graphics
-M: Andreas Färber <andreas.faerber@web.de>
 M: Peter Maydell <peter.maydell@linaro.org>
 S: Odd Fixes
 F: ui/cocoa.m
@@ -1400,9 +1418,8 @@ S: Orphan

 Stable 0.15
 L: qemu-stable@nongnu.org
-M: Andreas Färber <afaerber@suse.de>
 T: git git://git.qemu-project.org/qemu-stable-0.15.git
-S: Supported
+S: Orphan

 Stable 0.14
 L: qemu-stable@nongnu.org
@@ -1616,3 +1633,10 @@ Build system architecture
 M: Daniel P. Berrange <berrange@redhat.com>
 S: Odd Fixes
 F: docs/build-system.txt
+
+Docker testing
+--------------
+Docker based testing framework and cases
+M: Fam Zheng <famz@redhat.com>
+S: Maintained
+F: tests/docker/
--- a/17
+++ b/17
@@ -6,7 +6,7 @@ BUILD_DIR=$(CURDIR)
 # Before including a proper config-host.mak, assume we are in the source tree
 SRC_PATH=.

-UNCHECKED_GOALS := %clean TAGS cscope ctags
+UNCHECKED_GOALS := %clean TAGS cscope ctags docker docker-%

 # All following code might depend on configuration variables
 ifneq ($(wildcard config-host.mak),)
@@ -30,7 +30,6 @@ CONFIG_ALL=y
 -include config-all-devices.mak
 -include config-all-disas.mak

-include $(SRC_PATH)/rules.mak
 config-host.mak: $(SRC_PATH)/configure
 	@echo $@ is out-of-date, running configure
 	@# TODO: The next lines include code which supports a smooth
@@ -49,6 +48,8 @@ ifneq ($(filter-out $(UNCHECKED_GOALS),$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fa
 endif
 endif

+include $(SRC_PATH)/rules.mak
+
 GENERATED_HEADERS = config-host.h qemu-options.def
 GENERATED_HEADERS += qmp-commands.h qapi-types.h qapi-visit.h qapi-event.h
 GENERATED_SOURCES += qmp-marshal.c qapi-types.c qapi-visit.c qapi-event.c
@@ -92,9 +93,6 @@ HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF)
 ifdef BUILD_DOCS
 DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8
 DOCS+=qmp-commands.txt
-ifdef CONFIG_LINUX
-DOCS+=kvm_stat.1
-endif
 ifdef CONFIG_VIRTFS
 DOCS+=fsdev/virtfs-proxy-helper.1
 endif
@@ -356,6 +354,7 @@ clean:
 	if test -d $$d; then $(MAKE) -C $$d $@ || exit 1; fi; \
 	rm -f $$d/qemu-options.def; \
        done
+	rm -f $(SUBDIR_DEVICES_MAK) config-all-devices.mak

 VERSION ?= $(shell cat VERSION)

@@ -570,12 +569,6 @@ qemu-ga.8: qemu-ga.texi
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-ga.pod > $@, \
 	  "  GEN   $@")

-kvm_stat.1: scripts/kvm/kvm_stat.texi
-	$(call quiet-command, \
-	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< kvm_stat.pod && \
-	  $(POD2MAN) --section=1 --center=" " --release=" " kvm_stat.pod > $@, \
-	  "  GEN   $@")
-
 dvi: qemu-doc.dvi qemu-tech.dvi
 html: qemu-doc.html qemu-tech.html
 info: qemu-doc.info qemu-tech.info
@@ -651,3 +644,5 @@ endif
 # Include automatically generated dependency files
 # Dependencies in Makefile.objs files come from our recursive subdir rules
 -include $(wildcard *.d tests/*.d)
+
+include $(SRC_PATH)/tests/docker/Makefile.include
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -52,7 +52,6 @@ common-obj-$(CONFIG_LINUX) += fsdev/
 common-obj-y += migration/
 common-obj-y += qemu-char.o #aio.o
 common-obj-y += page_cache.o
-common-obj-y += qjson.o

 common-obj-$(CONFIG_SPICE) += spice-qemu-char.o

--- a/Makefile.target
+++ b/Makefile.target
@@ -108,7 +108,12 @@ obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/dpd/decimal128.o

 ifdef CONFIG_LINUX_USER

-QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) -I$(SRC_PATH)/linux-user
+# Note that we only add linux-user/host/$ARCH if it exists, and
+# that it must come before linux-user/host/generic in the search path.
+QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) \
+             $(patsubst %,-I%,$(wildcard $(SRC_PATH)/linux-user/host/$(ARCH))) \
+             -I$(SRC_PATH)/linux-user/host/generic \
+             -I$(SRC_PATH)/linux-user

 obj-y += linux-user/
 obj-y += gdbstub.o thunk.o user-exec.o
--- a/2
+++ b/2
@@ -1 +1 @@
-2.6.0
+2.6.50
--- a/accel.c
+++ b/accel.c
@@ -77,7 +77,7 @@ static int accel_init_machine(AccelClass *acc, MachineState *ms)
    return ret;
 }

-int configure_accelerator(MachineState *ms)
+void configure_accelerator(MachineState *ms)
 {
    const char *p;
    char buf[10];
@@ -128,8 +128,6 @@ int configure_accelerator(MachineState *ms)
    if (init_failed) {
        fprintf(stderr, "Back to %s accelerator.\n", acc->name);
    }
-
-    return !accel_initialised;
 }


--- a/arch_init.c
+++ b/arch_init.c
@@ -22,6 +22,8 @@
 * THE SOFTWARE.
 */
 #include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "cpu.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/arch_init.h"
 #include "hw/pci/pci.h"
@@ -272,13 +274,6 @@ void do_smbios_option(QemuOpts *opts)
 #endif
 }

-void cpudef_init(void)
-{
-#if defined(cpudef_setup)
-    cpudef_setup(); /* parse cpu definitions in target config file */
-#endif
-}
-
 int kvm_available(void)
 {
 #ifdef CONFIG_KVM
--- a/audio/mixeng.c
+++ b/audio/mixeng.c
@@ -24,6 +24,7 @@
 */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
+#include "qemu/bswap.h"
 #include "audio.h"

 #define AUDIO_CAP "mixeng"
@@ -270,7 +271,7 @@ f_sample *mixeng_clip[2][2][2][3] = {
 * August 21, 1998
 * Copyright 1998 Fabrice Bellard.
 *
- * [Rewrote completly the code of Lance Norskog And Sundry
+ * [Rewrote completely the code of Lance Norskog And Sundry
 * Contributors with a more efficient algorithm.]
 *
 * This source code is freely redistributable and may be used for
--- a/audio/noaudio.c
+++ b/audio/noaudio.c
@@ -23,6 +23,7 @@
 */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
+#include "qemu/host-utils.h"
 #include "audio.h"
 #include "qemu/timer.h"

--- a/audio/ossaudio.c
+++ b/audio/ossaudio.c
@@ -898,7 +898,7 @@ static struct audio_option oss_options[] = {
        .name  = "EXCLUSIVE",
        .tag   = AUD_OPT_BOOL,
        .valp  = &glob_conf.exclusive,
-        .descr = "Open device in exclusive mode (vmix wont work)"
+        .descr = "Open device in exclusive mode (vmix won't work)"
    },
 #ifdef USE_DSP_POLICY
    {
--- a/audio/spiceaudio.c
+++ b/audio/spiceaudio.c
@@ -19,6 +19,7 @@

 #include "qemu/osdep.h"
 #include "hw/hw.h"
+#include "qemu/host-utils.h"
 #include "qemu/error-report.h"
 #include "qemu/timer.h"
 #include "ui/qemu-spice.h"
--- a/audio/wavaudio.c
+++ b/audio/wavaudio.c
@@ -22,7 +22,7 @@
 * THE SOFTWARE.
 */
 #include "qemu/osdep.h"
-#include "hw/hw.h"
+#include "qemu/host-utils.h"
 #include "qemu/timer.h"
 #include "audio.h"

--- a/backends/rng-random.c
+++ b/backends/rng-random.c
@@ -17,7 +17,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/main-loop.h"

-struct RndRandom
+struct RngRandom
 {
    RngBackend parent;

@@ -34,7 +34,7 @@ struct RndRandom

 static void entropy_available(void *opaque)
 {
-    RndRandom *s = RNG_RANDOM(opaque);
+    RngRandom *s = RNG_RANDOM(opaque);

    while (!QSIMPLEQ_EMPTY(&s->parent.requests)) {
        RngRequest *req = QSIMPLEQ_FIRST(&s->parent.requests);
@@ -57,7 +57,7 @@ static void entropy_available(void *opaque)

 static void rng_random_request_entropy(RngBackend *b, RngRequest *req)
 {
-    RndRandom *s = RNG_RANDOM(b);
+    RngRandom *s = RNG_RANDOM(b);

    if (QSIMPLEQ_EMPTY(&s->parent.requests)) {
        /* If there are no pending requests yet, we need to
@@ -68,7 +68,7 @@ static void rng_random_request_entropy(RngBackend *b, RngRequest *req)

 static void rng_random_opened(RngBackend *b, Error **errp)
 {
-    RndRandom *s = RNG_RANDOM(b);
+    RngRandom *s = RNG_RANDOM(b);

    if (s->filename == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
@@ -83,7 +83,7 @@ static void rng_random_opened(RngBackend *b, Error **errp)

 static char *rng_random_get_filename(Object *obj, Error **errp)
 {
-    RndRandom *s = RNG_RANDOM(obj);
+    RngRandom *s = RNG_RANDOM(obj);

    return g_strdup(s->filename);
 }
@@ -92,7 +92,7 @@ static void rng_random_set_filename(Object *obj, const char *filename,
                                 Error **errp)
 {
    RngBackend *b = RNG_BACKEND(obj);
-    RndRandom *s = RNG_RANDOM(obj);
+    RngRandom *s = RNG_RANDOM(obj);

    if (b->opened) {
        error_setg(errp, QERR_PERMISSION_DENIED);
@@ -105,7 +105,7 @@ static void rng_random_set_filename(Object *obj, const char *filename,

 static void rng_random_init(Object *obj)
 {
-    RndRandom *s = RNG_RANDOM(obj);
+    RngRandom *s = RNG_RANDOM(obj);

    object_property_add_str(obj, "filename",
                            rng_random_get_filename,
@@ -118,7 +118,7 @@ static void rng_random_init(Object *obj)

 static void rng_random_finalize(Object *obj)
 {
-    RndRandom *s = RNG_RANDOM(obj);
+    RngRandom *s = RNG_RANDOM(obj);

    if (s->fd != -1) {
        qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
@@ -139,7 +139,7 @@ static void rng_random_class_init(ObjectClass *klass, void *data)
 static const TypeInfo rng_random_info = {
    .name = TYPE_RNG_RANDOM,
    .parent = TYPE_RNG_BACKEND,
-    .instance_size = sizeof(RndRandom),
+    .instance_size = sizeof(RngRandom),
    .class_init = rng_random_class_init,
    .instance_init = rng_random_init,
    .instance_finalize = rng_random_finalize,
--- a/block.c
+++ b/block.c
@@ -38,7 +38,6 @@
 #include "qmp-commands.h"
 #include "qemu/timer.h"
 #include "qapi-event.h"
-#include "block/throttle-groups.h"
 #include "qemu/cutils.h"
 #include "qemu/id.h"

@@ -65,16 +64,16 @@ static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

-static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
-                             const char *reference, QDict *options, int flags,
-                             BlockDriverState *parent,
-                             const BdrvChildRole *child_role, Error **errp);
+static BlockDriverState *bdrv_open_inherit(const char *filename,
+                                           const char *reference,
+                                           QDict *options, int flags,
+                                           BlockDriverState *parent,
+                                           const BdrvChildRole *child_role,
+                                           Error **errp);

 /* If non-zero, use only whitelisted block drivers */
 static int use_bdrv_whitelist;

-static void bdrv_close(BlockDriverState *bs);
-
 #ifdef _WIN32
 static int is_windows_drive_prefix(const char *filename)
 {
@@ -218,16 +217,9 @@ void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,

 void bdrv_register(BlockDriver *bdrv)
 {
-    bdrv_setup_io_funcs(bdrv);
-
    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
 }

-BlockDriverState *bdrv_new_root(void)
-{
-    return bdrv_new();
-}
-
 BlockDriverState *bdrv_new(void)
 {
    BlockDriverState *bs;
@@ -239,8 +231,6 @@ BlockDriverState *bdrv_new(void)
        QLIST_INIT(&bs->op_blockers[i]);
    }
    notifier_with_return_list_init(&bs->before_write_notifiers);
-    qemu_co_queue_init(&bs->throttled_reqs[0]);
-    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

@@ -669,6 +659,18 @@ int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
    return 0;
 }

+static void bdrv_child_cb_drained_begin(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_drained_begin(bs);
+}
+
+static void bdrv_child_cb_drained_end(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_drained_end(bs);
+}
+
 /*
 * Returns the options and flags that a temporary snapshot should get, based on
 * the originally requested flags (the originally requested image will have
@@ -715,6 +717,8 @@ static void bdrv_inherited_options(int *child_flags, QDict *child_options,

 const BdrvChildRole child_file = {
    .inherit_options = bdrv_inherited_options,
+    .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_end     = bdrv_child_cb_drained_end,
 };

 /*
@@ -733,6 +737,8 @@ static void bdrv_inherited_fmt_options(int *child_flags, QDict *child_options,

 const BdrvChildRole child_format = {
    .inherit_options = bdrv_inherited_fmt_options,
+    .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_end     = bdrv_child_cb_drained_end,
 };

 /*
@@ -760,6 +766,8 @@ static void bdrv_backing_options(int *child_flags, QDict *child_options,

 static const BdrvChildRole child_backing = {
    .inherit_options = bdrv_backing_options,
+    .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_end     = bdrv_child_cb_drained_end,
 };

 static int bdrv_open_flags(BlockDriverState *bs, int flags)
@@ -1160,28 +1168,52 @@ static int bdrv_fill_options(QDict **options, const char *filename,
    return 0;
 }

+static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
+{
+    BlockDriverState *old_bs = child->bs;
+
+    if (old_bs) {
+        if (old_bs->quiesce_counter && child->role->drained_end) {
+            child->role->drained_end(child);
+        }
+        QLIST_REMOVE(child, next_parent);
+    }
+
+    child->bs = new_bs;
+
+    if (new_bs) {
+        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
+        if (new_bs->quiesce_counter && child->role->drained_begin) {
+            child->role->drained_begin(child);
+        }
+    }
+}
+
 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
                                  const char *child_name,
-                                  const BdrvChildRole *child_role)
+                                  const BdrvChildRole *child_role,
+                                  void *opaque)
 {
    BdrvChild *child = g_new(BdrvChild, 1);
    *child = (BdrvChild) {
-        .bs     = child_bs,
+        .bs     = NULL,
        .name   = g_strdup(child_name),
        .role   = child_role,
+        .opaque = opaque,
    };

-    QLIST_INSERT_HEAD(&child_bs->parents, child, next_parent);
+    bdrv_replace_child(child, child_bs);

    return child;
 }

-static BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
-                                    BlockDriverState *child_bs,
-                                    const char *child_name,
-                                    const BdrvChildRole *child_role)
+BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
+                             BlockDriverState *child_bs,
+                             const char *child_name,
+                             const BdrvChildRole *child_role)
 {
-    BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role);
+    BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role,
+                                              parent_bs);
    QLIST_INSERT_HEAD(&parent_bs->children, child, next);
    return child;
 }
@@ -1192,7 +1224,9 @@ static void bdrv_detach_child(BdrvChild *child)
        QLIST_REMOVE(child, next);
        child->next.le_prev = NULL;
    }
-    QLIST_REMOVE(child, next_parent);
+
+    bdrv_replace_child(child, NULL);
+
    g_free(child->name);
    g_free(child);
 }
@@ -1219,6 +1253,27 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
    bdrv_root_unref_child(child);
 }

+
+static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
+{
+    BdrvChild *c;
+    QLIST_FOREACH(c, &bs->parents, next_parent) {
+        if (c->role->change_media) {
+            c->role->change_media(c, load);
+        }
+    }
+}
+
+static void bdrv_parent_cb_resize(BlockDriverState *bs)
+{
+    BdrvChild *c;
+    QLIST_FOREACH(c, &bs->parents, next_parent) {
+        if (c->role->resize) {
+            c->role->resize(c);
+        }
+    }
+}
+
 /*
 * Sets the backing file link of a BDS. A new reference is created; callers
 * which don't need their own reference any more must call bdrv_unref().
@@ -1325,14 +1380,13 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

-    backing_hd = NULL;
-    ret = bdrv_open_inherit(&backing_hd,
-                            *backing_filename ? backing_filename : NULL,
-                            reference, options, 0, bs, &child_backing,
-                            errp);
-    if (ret < 0) {
+    backing_hd = bdrv_open_inherit(*backing_filename ? backing_filename : NULL,
+                                   reference, options, 0, bs, &child_backing,
+                                   errp);
+    if (!backing_hd) {
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_prepend(errp, "Could not open backing file: ");
+        ret = -EINVAL;
        goto free_exit;
    }

@@ -1372,7 +1426,6 @@ BdrvChild *bdrv_open_child(const char *filename,
    BdrvChild *c = NULL;
    BlockDriverState *bs;
    QDict *image_options;
-    int ret;
    char *bdref_key_dot;
    const char *reference;

@@ -1392,10 +1445,9 @@ BdrvChild *bdrv_open_child(const char *filename,
        goto done;
    }

-    bs = NULL;
-    ret = bdrv_open_inherit(&bs, filename, reference, image_options, 0,
-                            parent, child_role, errp);
-    if (ret < 0) {
+    bs = bdrv_open_inherit(filename, reference, image_options, 0,
+                           parent, child_role, errp);
+    if (!bs) {
        goto done;
    }

@@ -1406,15 +1458,16 @@ done:
    return c;
 }

-static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
-                                     QDict *snapshot_options, Error **errp)
+static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
+                                                   int flags,
+                                                   QDict *snapshot_options,
+                                                   Error **errp)
 {
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    BlockDriverState *bs_snapshot;
-    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
@@ -1423,7 +1476,6 @@ static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
-        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }
@@ -1454,22 +1506,26 @@ static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
    qdict_put(snapshot_options, "driver",
              qstring_from_str("qcow2"));

-    bs_snapshot = bdrv_new();
-
-    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
-                    flags, &local_err);
+    bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
    snapshot_options = NULL;
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    if (!bs_snapshot) {
+        ret = -EINVAL;
        goto out;
    }

+    /* bdrv_append() consumes a strong reference to bs_snapshot (i.e. it will
+     * call bdrv_unref() on it), so in order to be able to return one, we have
+     * to increase bs_snapshot's refcount here */
+    bdrv_ref(bs_snapshot);
    bdrv_append(bs_snapshot, bs);

+    g_free(tmp_filename);
+    return bs_snapshot;
+
 out:
    QDECREF(snapshot_options);
    g_free(tmp_filename);
-    return ret;
+    return NULL;
 }

 /*
@@ -1487,10 +1543,12 @@ out:
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
-static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
-                             const char *reference, QDict *options, int flags,
-                             BlockDriverState *parent,
-                             const BdrvChildRole *child_role, Error **errp)
+static BlockDriverState *bdrv_open_inherit(const char *filename,
+                                           const char *reference,
+                                           QDict *options, int flags,
+                                           BlockDriverState *parent,
+                                           const BdrvChildRole *child_role,
+                                           Error **errp)
 {
    int ret;
    BdrvChild *file = NULL;
@@ -1502,7 +1560,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
    QDict *snapshot_options = NULL;
    int snapshot_flags = 0;

-    assert(pbs);
    assert(!child_role || !flags);
    assert(!child_role == !parent);

@@ -1510,39 +1567,22 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

-        if (*pbs) {
-            error_setg(errp, "Cannot reuse an existing BDS when referencing "
-                       "another block device");
-            return -EINVAL;
-        }
-
        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
-            return -EINVAL;
+            return NULL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
-            return -ENODEV;
-        }
-
-        if (bs->throttle_state) {
-            error_setg(errp, "Cannot reference an existing block device for "
-                       "which I/O throttling is enabled");
-            return -EINVAL;
+            return NULL;
        }

        bdrv_ref(bs);
-        *pbs = bs;
-        return 0;
+        return bs;
    }

-    if (*pbs) {
-        bs = *pbs;
-    } else {
-        bs = bdrv_new();
-    }
+    bs = bdrv_new();

    /* NULL means an empty set of options */
    if (options == NULL) {
@@ -1552,7 +1592,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
    /* json: syntax counts as explicit options, as if in the QDict */
    parse_json_protocol(options, &filename, &local_err);
    if (local_err) {
-        ret = -EINVAL;
        goto fail;
    }

@@ -1579,7 +1618,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
-            ret = -EINVAL;
            goto fail;
        }
    }
@@ -1609,7 +1647,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
        file = bdrv_open_child(filename, options, "file", bs,
                               &child_file, true, &local_err);
        if (local_err) {
-            ret = -EINVAL;
            goto fail;
        }
    }
@@ -1636,7 +1673,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
        qdict_put(options, "driver", qstring_from_str(drv->format_name));
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
-        ret = -EINVAL;
        goto fail;
    }

@@ -1679,38 +1715,40 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
                       drv->format_name, entry->key);
        }

-        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
-        if (bs->blk) {
-            blk_dev_change_media_cb(bs->blk, true);
-        }
+        bdrv_parent_cb_change_media(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
-        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
-    *pbs = bs;

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
-        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options,
-                                        &local_err);
+        BlockDriverState *snapshot_bs;
+        snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
+                                                snapshot_options, &local_err);
        snapshot_options = NULL;
        if (local_err) {
            goto close_and_fail;
        }
+        /* We are not going to return bs but the overlay on top of it
+         * (snapshot_bs); thus, we have to drop the strong reference to bs
+         * (which we obtained by calling bdrv_new()). bs will not be deleted,
+         * though, because the overlay still has a reference to it. */
+        bdrv_unref(bs);
+        bs = snapshot_bs;
    }

-    return 0;
+    return bs;

 fail:
    if (file != NULL) {
@@ -1721,36 +1759,26 @@ fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
-    if (!*pbs) {
-        /* If *pbs is NULL, a new BDS has been created in this function and
-           needs to be freed now. Otherwise, it does not need to be closed,
-           since it has not really been opened yet. */
-        bdrv_unref(bs);
-    }
+    bdrv_unref(bs);
    if (local_err) {
        error_propagate(errp, local_err);
    }
-    return ret;
+    return NULL;

 close_and_fail:
-    /* See fail path, but now the BDS has to be always closed */
-    if (*pbs) {
-        bdrv_close(bs);
-    } else {
-        bdrv_unref(bs);
-    }
+    bdrv_unref(bs);
    QDECREF(snapshot_options);
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
-    return ret;
+    return NULL;
 }

-int bdrv_open(BlockDriverState **pbs, const char *filename,
-              const char *reference, QDict *options, int flags, Error **errp)
+BlockDriverState *bdrv_open(const char *filename, const char *reference,
+                            QDict *options, int flags, Error **errp)
 {
-    return bdrv_open_inherit(pbs, filename, reference, options, flags, NULL,
+    return bdrv_open_inherit(filename, reference, options, flags, NULL,
                             NULL, errp);
 }

@@ -2124,11 +2152,7 @@ static void bdrv_close(BlockDriverState *bs)
    BdrvAioNotifier *ban, *ban_next;

    assert(!bs->job);
-
-    /* Disable I/O limits and drain all pending throttled requests */
-    if (bs->throttle_state) {
-        bdrv_io_limits_disable(bs);
-    }
+    assert(!bs->refcnt);

    bdrv_drained_begin(bs); /* complete I/O */
    bdrv_flush(bs);
@@ -2137,10 +2161,6 @@ static void bdrv_close(BlockDriverState *bs)
    bdrv_release_named_dirty_bitmaps(bs);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

-    if (bs->blk) {
-        blk_dev_change_media_cb(bs->blk, false);
-    }
-
    if (bs->drv) {
        BdrvChild *child, *next;

@@ -2189,8 +2209,7 @@ static void bdrv_close(BlockDriverState *bs)

 void bdrv_close_all(void)
 {
-    BlockDriverState *bs;
-    AioContext *aio_context;
+    block_job_cancel_sync_all();

    /* Drop references from requests still in flight, such as canceled block
     * jobs whose AIO context has not been polled yet */
@@ -2199,32 +2218,7 @@ void bdrv_close_all(void)
    blk_remove_all_bs();
    blockdev_close_all_bdrv_states();

-    /* Cancel all block jobs */
-    while (!QTAILQ_EMPTY(&all_bdrv_states)) {
-        QTAILQ_FOREACH(bs, &all_bdrv_states, bs_list) {
-            aio_context = bdrv_get_aio_context(bs);
-
-            aio_context_acquire(aio_context);
-            if (bs->job) {
-                block_job_cancel_sync(bs->job);
-                aio_context_release(aio_context);
-                break;
-            }
-            aio_context_release(aio_context);
-        }
-
-        /* All the remaining BlockDriverStates are referenced directly or
-         * indirectly from block jobs, so there needs to be at least one BDS
-         * directly used by a block job */
-        assert(bs);
-    }
-}
-
-/* Fields that need to stay with the top-level BDS */
-static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
-                                     BlockDriverState *bs_src)
-{
-    /* move some fields that need to stay attached to the device */
+    assert(QTAILQ_EMPTY(&all_bdrv_states));
 }

 static void change_parent_backing_link(BlockDriverState *from,
@@ -2232,41 +2226,14 @@ static void change_parent_backing_link(BlockDriverState *from,
 {
    BdrvChild *c, *next;

-    if (from->blk) {
-        /* FIXME We bypass blk_set_bs(), so we need to make these updates
-         * manually. The root problem is not in this change function, but the
-         * existence of BlockDriverState.blk. */
-        to->blk = from->blk;
-        from->blk = NULL;
-    }
-
    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
        assert(c->role != &child_backing);
-        c->bs = to;
-        QLIST_REMOVE(c, next_parent);
-        QLIST_INSERT_HEAD(&to->parents, c, next_parent);
        bdrv_ref(to);
+        bdrv_replace_child(c, to);
        bdrv_unref(from);
    }
 }

-static void swap_feature_fields(BlockDriverState *bs_top,
-                                BlockDriverState *bs_new)
-{
-    BlockDriverState tmp;
-
-    bdrv_move_feature_fields(&tmp, bs_top);
-    bdrv_move_feature_fields(bs_top, bs_new);
-    bdrv_move_feature_fields(bs_new, &tmp);
-
-    assert(!bs_new->throttle_state);
-    if (bs_top->throttle_state) {
-        assert(bs_top->io_limits_enabled);
-        bdrv_io_limits_enable(bs_new, throttle_group_get_name(bs_top));
-        bdrv_io_limits_disable(bs_top);
-    }
-}
-
 /*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
@@ -2289,11 +2256,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
    assert(!bdrv_requests_pending(bs_new));

    bdrv_ref(bs_top);
+
    change_parent_backing_link(bs_top, bs_new);
-
-    /* Some fields always stay on top of the backing file chain */
-    swap_feature_fields(bs_top, bs_new);
-
    bdrv_set_backing_hd(bs_new, bs_top);
    bdrv_unref(bs_top);

@@ -2309,16 +2273,6 @@ void bdrv_replace_in_backing_chain(BlockDriverState *old, BlockDriverState *new)

    bdrv_ref(old);

-    if (old->blk) {
-        /* As long as these fields aren't in BlockBackend, but in the top-level
-         * BlockDriverState, it's not possible for a BDS to have two BBs.
-         *
-         * We really want to copy the fields from old to new, but we go for a
-         * swap instead so that pointers aren't duplicated and cause trouble.
-         * (Also, bdrv_swap() used to do the same.) */
-        assert(!new->blk);
-        swap_feature_fields(old, new);
-    }
    change_parent_backing_link(old, new);

    /* Change backing files if a previously independent node is added to the
@@ -2627,9 +2581,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset)
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dirty_bitmap_truncate(bs);
-        if (bs->blk) {
-            blk_dev_resize_cb(bs->blk);
-        }
+        bdrv_parent_cb_resize(bs);
    }
    return ret;
 }
@@ -2739,11 +2691,9 @@ int bdrv_set_key(BlockDriverState *bs, const char *key)
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
+        /* call the change callback now, we skipped it on open */
        bs->valid_key = 1;
-        if (bs->blk) {
-            /* call the change callback now, we skipped it on open */
-            blk_dev_change_media_cb(bs->blk, true);
-        }
+        bdrv_parent_cb_change_media(bs, true);
    }
    return ret;
 }
@@ -2910,34 +2860,33 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
    return QTAILQ_NEXT(bs, node_list);
 }

-/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
- * the monitor or attached to a BlockBackend */
-BlockDriverState *bdrv_next(BlockDriverState *bs)
-{
-    if (!bs || bs->blk) {
-        bs = blk_next_root_bs(bs);
-        if (bs) {
-            return bs;
-        }
-    }
-
-    /* Ignore all BDSs that are attached to a BlockBackend here; they have been
-     * handled by the above block already */
-    do {
-        bs = bdrv_next_monitor_owned(bs);
-    } while (bs && bs->blk);
-    return bs;
-}
-
 const char *bdrv_get_node_name(const BlockDriverState *bs)
 {
    return bs->node_name;
 }

+const char *bdrv_get_parent_name(const BlockDriverState *bs)
+{
+    BdrvChild *c;
+    const char *name;
+
+    /* If multiple parents have a name, just pick the first one. */
+    QLIST_FOREACH(c, &bs->parents, next_parent) {
+        if (c->role->get_name) {
+            name = c->role->get_name(c);
+            if (name && *name) {
+                return name;
+            }
+        }
+    }
+
+    return NULL;
+}
+
 /* TODO check what callers really want: bs->node_name or blk_name() */
 const char *bdrv_get_device_name(const BlockDriverState *bs)
 {
-    return bs->blk ? blk_name(bs->blk) : "";
+    return bdrv_get_parent_name(bs) ?: "";
 }

 /* This can be used to identify nodes that might not have a device
@@ -2946,7 +2895,7 @@ const char *bdrv_get_device_name(const BlockDriverState *bs)
 * absent, then this returns an empty (non-null) string. */
 const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
 {
-    return bs->blk ? blk_name(bs->blk) : bs->node_name;
+    return bdrv_get_parent_name(bs) ?: bs->node_name;
 }

 int bdrv_get_flags(BlockDriverState *bs)
@@ -3201,6 +3150,7 @@ void bdrv_init_with_whitelist(void)

 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
 {
+    BdrvChild *child;
    Error *local_err = NULL;
    int ret;

@@ -3215,13 +3165,20 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
-    } else if (bs->file) {
-        bdrv_invalidate_cache(bs->file->bs, &local_err);
+        if (local_err) {
+            bs->open_flags |= BDRV_O_INACTIVE;
+            error_propagate(errp, local_err);
+            return;
+        }
    }
-    if (local_err) {
-        bs->open_flags |= BDRV_O_INACTIVE;
-        error_propagate(errp, local_err);
-        return;
+
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_invalidate_cache(child->bs, &local_err);
+        if (local_err) {
+            bs->open_flags |= BDRV_O_INACTIVE;
+            error_propagate(errp, local_err);
+            return;
+        }
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
@@ -3234,10 +3191,11 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)

 void bdrv_invalidate_cache_all(Error **errp)
 {
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
    Error *local_err = NULL;
+    BdrvNextIterator it;

-    while ((bs = bdrv_next(bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
@@ -3250,38 +3208,62 @@ void bdrv_invalidate_cache_all(Error **errp)
    }
 }

-static int bdrv_inactivate(BlockDriverState *bs)
+static int bdrv_inactivate_recurse(BlockDriverState *bs,
+                                   bool setting_flag)
 {
+    BdrvChild *child;
    int ret;

-    if (bs->drv->bdrv_inactivate) {
+    if (!setting_flag && bs->drv->bdrv_inactivate) {
        ret = bs->drv->bdrv_inactivate(bs);
        if (ret < 0) {
            return ret;
        }
    }

-    bs->open_flags |= BDRV_O_INACTIVE;
+    QLIST_FOREACH(child, &bs->children, next) {
+        ret = bdrv_inactivate_recurse(child->bs, setting_flag);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    if (setting_flag) {
+        bs->open_flags |= BDRV_O_INACTIVE;
+    }
    return 0;
 }

 int bdrv_inactivate_all(void)
 {
    BlockDriverState *bs = NULL;
-    int ret;
+    BdrvNextIterator it;
+    int ret = 0;
+    int pass;

-    while ((bs = bdrv_next(bs)) != NULL) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+        aio_context_acquire(bdrv_get_aio_context(bs));
+    }

-        aio_context_acquire(aio_context);
-        ret = bdrv_inactivate(bs);
-        aio_context_release(aio_context);
-        if (ret < 0) {
-            return ret;
+    /* We do two passes of inactivation. The first pass calls to drivers'
+     * .bdrv_inactivate callbacks recursively so all cache is flushed to disk;
+     * the second pass sets the BDRV_O_INACTIVE flag so that no further write
+     * is allowed. */
+    for (pass = 0; pass < 2; pass++) {
+        for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+            ret = bdrv_inactivate_recurse(bs, pass);
+            if (ret < 0) {
+                goto out;
+            }
        }
    }

-    return 0;
+out:
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+        aio_context_release(bdrv_get_aio_context(bs));
+    }
+
+    return ret;
 }

 /**************************************************************/
@@ -3561,11 +3543,10 @@ void bdrv_img_create(const char *filename, const char *fmt,
                          qstring_from_str(backing_fmt));
            }

-            bs = NULL;
-            ret = bdrv_open(&bs, full_backing, NULL, backing_options,
-                            back_flags, &local_err);
+            bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
+                           &local_err);
            g_free(full_backing);
-            if (ret < 0) {
+            if (!bs) {
                goto out;
            }
            size = bdrv_getlength(bs);
@@ -3623,6 +3604,7 @@ AioContext *bdrv_get_aio_context(BlockDriverState *bs)
 void bdrv_detach_aio_context(BlockDriverState *bs)
 {
    BdrvAioNotifier *baf;
+    BdrvChild *child;

    if (!bs->drv) {
        return;
@@ -3632,17 +3614,11 @@ void bdrv_detach_aio_context(BlockDriverState *bs)
        baf->detach_aio_context(baf->opaque);
    }

-    if (bs->throttle_state) {
-        throttle_timers_detach_aio_context(&bs->throttle_timers);
-    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
-    if (bs->file) {
-        bdrv_detach_aio_context(bs->file->bs);
-    }
-    if (bs->backing) {
-        bdrv_detach_aio_context(bs->backing->bs);
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_detach_aio_context(child->bs);
    }

    bs->aio_context = NULL;
@@ -3652,6 +3628,7 @@ void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
 {
    BdrvAioNotifier *ban;
+    BdrvChild *child;

    if (!bs->drv) {
        return;
@@ -3659,18 +3636,12 @@ void bdrv_attach_aio_context(BlockDriverState *bs,

    bs->aio_context = new_context;

-    if (bs->backing) {
-        bdrv_attach_aio_context(bs->backing->bs, new_context);
-    }
-    if (bs->file) {
-        bdrv_attach_aio_context(bs->file->bs, new_context);
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_attach_aio_context(child->bs, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
-    if (bs->throttle_state) {
-        throttle_timers_attach_aio_context(&bs->throttle_timers, new_context);
-    }

    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
        ban->attached_aio_context(new_context, ban->opaque);
@@ -3776,10 +3747,11 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
 */
 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
 {
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

    /* walk down the bs forest recursively */
-    while ((bs = bdrv_next(bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        bool perm;

        /* try to recurse in this top level bs */
@@ -3981,3 +3953,52 @@ void bdrv_refresh_filename(BlockDriverState *bs)
        QDECREF(json);
    }
 }
+
+/*
+ * Hot add/remove a BDS's child. So the user can take a child offline when
+ * it is broken and take a new child online
+ */
+void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
+                    Error **errp)
+{
+
+    if (!parent_bs->drv || !parent_bs->drv->bdrv_add_child) {
+        error_setg(errp, "The node %s does not support adding a child",
+                   bdrv_get_device_or_node_name(parent_bs));
+        return;
+    }
+
+    if (!QLIST_EMPTY(&child_bs->parents)) {
+        error_setg(errp, "The node %s already has a parent",
+                   child_bs->node_name);
+        return;
+    }
+
+    parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp);
+}
+
+void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp)
+{
+    BdrvChild *tmp;
+
+    if (!parent_bs->drv || !parent_bs->drv->bdrv_del_child) {
+        error_setg(errp, "The node %s does not support removing a child",
+                   bdrv_get_device_or_node_name(parent_bs));
+        return;
+    }
+
+    QLIST_FOREACH(tmp, &parent_bs->children, next) {
+        if (tmp == child) {
+            break;
+        }
+    }
+
+    if (!tmp) {
+        error_setg(errp, "The node %s does not have a child named %s",
+                   bdrv_get_device_or_node_name(parent_bs),
+                   bdrv_get_device_or_node_name(child->bs));
+        return;
+    }
+
+    parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
+}
--- a/block/backup.c
+++ b/block/backup.c
@@ -36,7 +36,7 @@ typedef struct CowRequest {

 typedef struct BackupBlockJob {
    BlockJob common;
-    BlockDriverState *target;
+    BlockBackend *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
@@ -47,6 +47,7 @@ typedef struct BackupBlockJob {
    uint64_t sectors_read;
    unsigned long *done_bitmap;
    int64_t cluster_size;
+    NotifierWithReturn before_write;
    QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;

@@ -93,12 +94,12 @@ static void cow_request_end(CowRequest *req)
    qemu_co_queue_restart_all(&req->wait_queue);
 }

-static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read,
                                      bool is_write_notifier)
 {
-    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    BlockBackend *blk = job->common.blk;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
@@ -131,20 +132,15 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                start * sectors_per_cluster);

        if (!bounce_buffer) {
-            bounce_buffer = qemu_blockalign(bs, job->cluster_size);
+            bounce_buffer = blk_blockalign(blk, job->cluster_size);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

-        if (is_write_notifier) {
-            ret = bdrv_co_readv_no_serialising(bs,
-                                           start * sectors_per_cluster,
-                                           n, &bounce_qiov);
-        } else {
-            ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
-                                &bounce_qiov);
-        }
+        ret = blk_co_preadv(blk, start * job->cluster_size,
+                            bounce_qiov.size, &bounce_qiov,
+                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
@@ -154,13 +150,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
-            ret = bdrv_co_write_zeroes(job->target,
-                                       start * sectors_per_cluster,
-                                       n, BDRV_REQ_MAY_UNMAP);
+            ret = blk_co_pwrite_zeroes(job->target, start * job->cluster_size,
+                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
        } else {
-            ret = bdrv_co_writev(job->target,
-                                 start * sectors_per_cluster, n,
-                                 &bounce_qiov);
+            ret = blk_co_pwritev(job->target, start * job->cluster_size,
+                                 bounce_qiov.size, &bounce_qiov, 0);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
@@ -197,14 +191,16 @@ static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
 {
+    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

+    assert(req->bs == blk_bs(job->common.blk));
    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

-    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
+    return backup_do_cow(job, sector_num, nb_sectors, NULL, true);
 }

 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -218,19 +214,10 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }

-static void backup_iostatus_reset(BlockJob *job)
-{
-    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
-
-    if (s->target->blk) {
-        blk_iostatus_reset(s->target->blk);
-    }
-}
-
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
 {
    BdrvDirtyBitmap *bm;
-    BlockDriverState *bs = job->common.bs;
+    BlockDriverState *bs = blk_bs(job->common.blk);

    if (ret < 0 || block_job_is_cancelled(&job->common)) {
        /* Merge the successor back into the parent, delete nothing. */
@@ -263,7 +250,6 @@ static const BlockJobDriver backup_job_driver = {
    .instance_size  = sizeof(BackupBlockJob),
    .job_type       = BLOCK_JOB_TYPE_BACKUP,
    .set_speed      = backup_set_speed,
-    .iostatus_reset = backup_iostatus_reset,
    .commit         = backup_commit,
    .abort          = backup_abort,
 };
@@ -272,11 +258,11 @@ static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
 {
    if (read) {
-        return block_job_error_action(&job->common, job->common.bs,
-                                      job->on_source_error, true, error);
+        return block_job_error_action(&job->common, job->on_source_error,
+                                      true, error);
    } else {
-        return block_job_error_action(&job->common, job->target,
-                                      job->on_target_error, false, error);
+        return block_job_error_action(&job->common, job->on_target_error,
+                                      false, error);
    }
 }

@@ -289,7 +275,7 @@ static void backup_complete(BlockJob *job, void *opaque)
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    BackupCompleteData *data = opaque;

-    bdrv_unref(s->target);
+    blk_unref(s->target);

    block_job_completed(job, data->ret);
    g_free(data);
@@ -331,7 +317,6 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
    int64_t end;
    int64_t last_cluster = -1;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
-    BlockDriverState *bs = job->common.bs;
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
@@ -353,7 +338,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                if (yield_and_check(job)) {
                    return ret;
                }
-                ret = backup_do_cow(bs, cluster * sectors_per_cluster,
+                ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                    sectors_per_cluster, &error_is_read,
                                    false);
                if ((ret < 0) &&
@@ -386,12 +371,8 @@ static void coroutine_fn backup_run(void *opaque)
 {
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
-    BlockDriverState *bs = job->common.bs;
-    BlockDriverState *target = job->target;
-    BlockdevOnError on_target_error = job->on_target_error;
-    NotifierWithReturn before_write = {
-        .notify = backup_before_write_notify,
-    };
+    BlockDriverState *bs = blk_bs(job->common.blk);
+    BlockBackend *target = job->target;
    int64_t start, end;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int ret = 0;
@@ -404,12 +385,8 @@ static void coroutine_fn backup_run(void *opaque)

    job->done_bitmap = bitmap_new(end);

-    if (target->blk) {
-        blk_set_on_error(target->blk, on_target_error, on_target_error);
-        blk_iostatus_enable(target->blk);
-    }
-
-    bdrv_add_before_write_notifier(bs, &before_write);
+    job->before_write.notify = backup_before_write_notify;
+    bdrv_add_before_write_notifier(bs, &job->before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
@@ -461,7 +438,7 @@ static void coroutine_fn backup_run(void *opaque)
                }
            }
            /* FULL sync mode we copy the whole drive. */
-            ret = backup_do_cow(bs, start * sectors_per_cluster,
+            ret = backup_do_cow(job, start * sectors_per_cluster,
                                sectors_per_cluster, &error_is_read, false);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
@@ -477,17 +454,14 @@ static void coroutine_fn backup_run(void *opaque)
        }
    }

-    notifier_with_return_remove(&before_write);
+    notifier_with_return_remove(&job->before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
    g_free(job->done_bitmap);

-    if (target->blk) {
-        blk_iostatus_disable(target->blk);
-    }
-    bdrv_op_unblock_all(target, job->common.blocker);
+    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
@@ -504,6 +478,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
 {
    int64_t len;
    BlockDriverInfo bdi;
+    BackupBlockJob *job = NULL;
    int ret;

    assert(bs);
@@ -515,13 +490,6 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
        return;
    }

-    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
-        return;
-    }
-
    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
@@ -568,15 +536,16 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
        goto error;
    }

-    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
-                                           cb, opaque, errp);
+    job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp);
    if (!job) {
        goto error;
    }

+    job->target = blk_new();
+    blk_insert_bs(job->target, target);
+
    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
-    job->target = target;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
@@ -584,7 +553,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
    /* If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible. */
-    ret = bdrv_get_info(job->target, &bdi);
+    ret = bdrv_get_info(target, &bdi);
    if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
@@ -610,4 +579,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
+    if (job) {
+        blk_unref(job->target);
+        block_job_unref(&job->common);
+    }
 }
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -293,22 +293,6 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs,
    return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate);
 }

-/* Propagate AioContext changes to ->test_file */
-static void blkverify_detach_aio_context(BlockDriverState *bs)
-{
-    BDRVBlkverifyState *s = bs->opaque;
-
-    bdrv_detach_aio_context(s->test_file->bs);
-}
-
-static void blkverify_attach_aio_context(BlockDriverState *bs,
-                                         AioContext *new_context)
-{
-    BDRVBlkverifyState *s = bs->opaque;
-
-    bdrv_attach_aio_context(s->test_file->bs, new_context);
-}
-
 static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options)
 {
    BDRVBlkverifyState *s = bs->opaque;
@@ -356,9 +340,6 @@ static BlockDriver bdrv_blkverify = {
    .bdrv_aio_writev                  = blkverify_aio_writev,
    .bdrv_aio_flush                   = blkverify_aio_flush,

-    .bdrv_attach_aio_context          = blkverify_attach_aio_context,
-    .bdrv_detach_aio_context          = blkverify_detach_aio_context,
-
    .is_filter                        = true,
    .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter,
 };
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1,7 +1,7 @@
 /*
 * QEMU Block backends
 *
- * Copyright (C) 2014 Red Hat, Inc.
+ * Copyright (C) 2014-2016 Red Hat, Inc.
 *
 * Authors:
 *  Markus Armbruster <armbru@redhat.com>,
@@ -19,6 +19,7 @@
 #include "sysemu/sysemu.h"
 #include "qapi-event.h"
 #include "qemu/id.h"
+#include "trace.h"

 /* Number of coroutines to reserve per attached device model */
 #define COROUTINE_POOL_RESERVATION 64
@@ -34,6 +35,7 @@ struct BlockBackend {
    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
+    BlockBackendPublic public;

    void *dev;                  /* attached device model, if any */
    /* TODO change to DeviceState when all users are qdevified */
@@ -74,6 +76,7 @@ static const AIOCBInfo block_backend_aiocb_info = {
 };

 static void drive_info_del(DriveInfo *dinfo);
+static BlockBackend *bdrv_first_blk(BlockDriverState *bs);

 /* All BlockBackends */
 static QTAILQ_HEAD(, BlockBackend) block_backends =
@@ -90,9 +93,26 @@ static void blk_root_inherit_options(int *child_flags, QDict *child_options,
    /* We're not supposed to call this function for root nodes */
    abort();
 }
+static void blk_root_drained_begin(BdrvChild *child);
+static void blk_root_drained_end(BdrvChild *child);
+
+static void blk_root_change_media(BdrvChild *child, bool load);
+static void blk_root_resize(BdrvChild *child);
+
+static const char *blk_root_get_name(BdrvChild *child)
+{
+    return blk_name(child->opaque);
+}

 static const BdrvChildRole child_root = {
-    .inherit_options = blk_root_inherit_options,
+    .inherit_options    = blk_root_inherit_options,
+
+    .change_media       = blk_root_change_media,
+    .resize             = blk_root_resize,
+    .get_name           = blk_root_get_name,
+
+    .drained_begin      = blk_root_drained_begin,
+    .drained_end        = blk_root_drained_end,
 };

 /*
@@ -100,40 +120,26 @@ static const BdrvChildRole child_root = {
 * Store an error through @errp on failure, unless it's null.
 * Return the new BlockBackend on success, null on failure.
 */
-BlockBackend *blk_new(Error **errp)
+BlockBackend *blk_new(void)
 {
    BlockBackend *blk;

    blk = g_new0(BlockBackend, 1);
    blk->refcnt = 1;
+    blk_set_enable_write_cache(blk, true);
+
+    qemu_co_queue_init(&blk->public.throttled_reqs[0]);
+    qemu_co_queue_init(&blk->public.throttled_reqs[1]);
+
    notifier_list_init(&blk->remove_bs_notifiers);
    notifier_list_init(&blk->insert_bs_notifiers);
+
    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
    return blk;
 }

 /*
- * Create a new BlockBackend with a new BlockDriverState attached.
- * Otherwise just like blk_new(), which see.
- */
-BlockBackend *blk_new_with_bs(Error **errp)
-{
-    BlockBackend *blk;
-    BlockDriverState *bs;
-
-    blk = blk_new(errp);
-    if (!blk) {
-        return NULL;
-    }
-
-    bs = bdrv_new_root();
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    bs->blk = blk;
-    return blk;
-}
-
-/*
- * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState.
+ * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 *
 * Just as with bdrv_open(), after having called this function the reference to
 * @options belongs to the block layer (even on failure).
@@ -148,21 +154,16 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
                           QDict *options, int flags, Error **errp)
 {
    BlockBackend *blk;
-    int ret;
+    BlockDriverState *bs;

-    blk = blk_new_with_bs(errp);
-    if (!blk) {
-        QDECREF(options);
-        return NULL;
-    }
-
-    ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp);
-    if (ret < 0) {
+    blk = blk_new();
+    bs = bdrv_open(filename, reference, options, flags, errp);
+    if (!bs) {
        blk_unref(blk);
        return NULL;
    }

-    blk_set_enable_write_cache(blk, true);
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);

    return blk;
 }
@@ -177,10 +178,6 @@ static void blk_delete(BlockBackend *blk)
    }
    assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
    assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
-    if (blk->root_state.throttle_state) {
-        g_free(blk->root_state.throttle_group);
-        throttle_group_unref(blk->root_state.throttle_state);
-    }
    QTAILQ_REMOVE(&block_backends, blk, link);
    drive_info_del(blk->legacy_dinfo);
    block_acct_cleanup(&blk->stats);
@@ -267,28 +264,45 @@ BlockBackend *blk_next(BlockBackend *blk)
               : QTAILQ_FIRST(&monitor_block_backends);
 }

-/*
- * Iterates over all BlockDriverStates which are attached to a BlockBackend.
- * This function is for use by bdrv_next().
- *
- * @bs must be NULL or a BDS that is attached to a BB.
- */
-BlockDriverState *blk_next_root_bs(BlockDriverState *bs)
+/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
+ * the monitor or attached to a BlockBackend */
+BlockDriverState *bdrv_next(BdrvNextIterator *it)
 {
-    BlockBackend *blk;
+    BlockDriverState *bs;

-    if (bs) {
-        assert(bs->blk);
-        blk = bs->blk;
-    } else {
-        blk = NULL;
+    /* First, return all root nodes of BlockBackends. In order to avoid
+     * returning a BDS twice when multiple BBs refer to it, we only return it
+     * if the BB is the first one in the parent list of the BDS. */
+    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
+        do {
+            it->blk = blk_all_next(it->blk);
+            bs = it->blk ? blk_bs(it->blk) : NULL;
+        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
+
+        if (bs) {
+            return bs;
+        }
+        it->phase = BDRV_NEXT_MONITOR_OWNED;
    }

+    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
+     * BDSes that are attached to a BlockBackend here; they have been handled
+     * by the above block already */
    do {
-        blk = blk_all_next(blk);
-    } while (blk && !blk->root);
+        it->bs = bdrv_next_monitor_owned(it->bs);
+        bs = it->bs;
+    } while (bs && bdrv_has_blk(bs));

-    return blk ? blk->root->bs : NULL;
+    return bs;
+}
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it)
+{
+    *it = (BdrvNextIterator) {
+        .phase = BDRV_NEXT_BACKEND_ROOTS,
+    };
+
+    return bdrv_next(it);
 }

 /*
@@ -375,6 +389,26 @@ BlockDriverState *blk_bs(BlockBackend *blk)
    return blk->root ? blk->root->bs : NULL;
 }

+static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
+{
+    BdrvChild *child;
+    QLIST_FOREACH(child, &bs->parents, next_parent) {
+        if (child->role == &child_root) {
+            return child->opaque;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Returns true if @bs has an associated BlockBackend.
+ */
+bool bdrv_has_blk(BlockDriverState *bs)
+{
+    return bdrv_first_blk(bs) != NULL;
+}
+
 /*
 * Return @blk's DriveInfo if any, else null.
 */
@@ -410,18 +444,34 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
    abort();
 }

+/*
+ * Returns a pointer to the publicly accessible fields of @blk.
+ */
+BlockBackendPublic *blk_get_public(BlockBackend *blk)
+{
+    return &blk->public;
+}
+
+/*
+ * Returns a BlockBackend given the associated @public fields.
+ */
+BlockBackend *blk_by_public(BlockBackendPublic *public)
+{
+    return container_of(public, BlockBackend, public);
+}
+
 /*
 * Disassociates the currently associated BlockDriverState from @blk.
 */
 void blk_remove_bs(BlockBackend *blk)
 {
-    assert(blk->root->bs->blk == blk);
-
    notifier_list_notify(&blk->remove_bs_notifiers, blk);
+    if (blk->public.throttle_state) {
+        throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+    }

    blk_update_root_state(blk);

-    blk->root->bs->blk = NULL;
    bdrv_root_unref_child(blk->root);
    blk->root = NULL;
 }
@@ -431,12 +481,14 @@ void blk_remove_bs(BlockBackend *blk)
 */
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
 {
-    assert(!blk->root && !bs->blk);
    bdrv_ref(bs);
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    bs->blk = blk;
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);

    notifier_list_notify(&blk->insert_bs_notifiers, blk);
+    if (blk->public.throttle_state) {
+        throttle_timers_attach_aio_context(
+            &blk->public.throttle_timers, bdrv_get_aio_context(bs));
+    }
 }

 /*
@@ -525,6 +577,11 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load)
    }
 }

+static void blk_root_change_media(BdrvChild *child, bool load)
+{
+    blk_dev_change_media_cb(child->opaque, load);
+}
+
 /*
 * Does @blk's attached device model have removable media?
 * %true if no device model is attached.
@@ -579,8 +636,10 @@ bool blk_dev_is_medium_locked(BlockBackend *blk)
 /*
 * Notify @blk's attached device model of a backend size change.
 */
-void blk_dev_resize_cb(BlockBackend *blk)
+static void blk_root_resize(BdrvChild *child)
 {
+    BlockBackend *blk = child->opaque;
+
    if (blk->dev_ops && blk->dev_ops->resize_cb) {
        blk->dev_ops->resize_cb(blk->dev_opaque);
    }
@@ -683,34 +742,50 @@ static int blk_check_request(BlockBackend *blk, int64_t sector_num,
                                  nb_sectors * BDRV_SECTOR_SIZE);
 }

-static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
-{
-    int ret = blk_check_byte_request(blk, offset, bytes);
-    if (ret < 0) {
-        return ret;
-    }
-
-    return bdrv_co_do_preadv(blk_bs(blk), offset, bytes, qiov, flags);
-}
-
-static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
+int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
+                               unsigned int bytes, QEMUIOVector *qiov,
+                               BdrvRequestFlags flags)
 {
    int ret;

+    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+
    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

+    /* throttling disk I/O */
+    if (blk->public.throttle_state) {
+        throttle_group_co_io_limits_intercept(blk, bytes, false);
+    }
+
+    return bdrv_co_preadv(blk_bs(blk), offset, bytes, qiov, flags);
+}
+
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                                unsigned int bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags)
+{
+    int ret;
+
+    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+
+    ret = blk_check_byte_request(blk, offset, bytes);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* throttling disk I/O */
+    if (blk->public.throttle_state) {
+        throttle_group_co_io_limits_intercept(blk, bytes, true);
+    }
+
    if (!blk->enable_write_cache) {
        flags |= BDRV_REQ_FUA;
    }

-    return bdrv_co_do_pwritev(blk_bs(blk), offset, bytes, qiov, flags);
+    return bdrv_co_pwritev(blk_bs(blk), offset, bytes, qiov, flags);
 }

 typedef struct BlkRwCo {
@@ -772,55 +847,27 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
    return rwco.ret;
 }

-static int blk_rw(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-                  int nb_sectors, CoroutineEntry co_entry,
-                  BdrvRequestFlags flags)
+int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
+                          int count)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return -EINVAL;
-    }
-
-    return blk_prw(blk, sector_num << BDRV_SECTOR_BITS, buf,
-                   nb_sectors << BDRV_SECTOR_BITS, co_entry, flags);
-}
-
-int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-             int nb_sectors)
-{
-    return blk_rw(blk, sector_num, buf, nb_sectors, blk_read_entry, 0);
-}
-
-int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-                         int nb_sectors)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    bool enabled;
    int ret;

-    ret = blk_check_request(blk, sector_num, nb_sectors);
+    ret = blk_check_byte_request(blk, offset, count);
    if (ret < 0) {
        return ret;
    }

-    enabled = bs->io_limits_enabled;
-    bs->io_limits_enabled = false;
-    ret = blk_read(blk, sector_num, buf, nb_sectors);
-    bs->io_limits_enabled = enabled;
+    blk_root_drained_begin(blk->root);
+    ret = blk_pread(blk, offset, buf, count);
+    blk_root_drained_end(blk->root);
    return ret;
 }

-int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
-              int nb_sectors)
+int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                      int count, BdrvRequestFlags flags)
 {
-    return blk_rw(blk, sector_num, (uint8_t*) buf, nb_sectors,
-                  blk_write_entry, 0);
-}
-
-int blk_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                     int nb_sectors, BdrvRequestFlags flags)
-{
-    return blk_rw(blk, sector_num, NULL, nb_sectors, blk_write_entry,
-                  flags | BDRV_REQ_ZERO_WRITE);
+    return blk_prw(blk, offset, NULL, count, blk_write_entry,
+                   flags | BDRV_REQ_ZERO_WRITE);
 }

 static void error_callback_bh(void *opaque)
@@ -932,18 +979,12 @@ static void blk_aio_write_entry(void *opaque)
    blk_aio_complete(acb);
 }

-BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                                 int nb_sectors, BdrvRequestFlags flags,
-                                 BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                  int count, BdrvRequestFlags flags,
+                                  BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS,
-                        nb_sectors << BDRV_SECTOR_BITS, NULL,
-                        blk_aio_write_entry, flags | BDRV_REQ_ZERO_WRITE,
-                        cb, opaque);
+    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
+                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
 }

 int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
@@ -955,9 +996,11 @@ int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
    return count;
 }

-int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count)
+int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
+               BdrvRequestFlags flags)
 {
-    int ret = blk_prw(blk, offset, (void*) buf, count, blk_write_entry, 0);
+    int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
+                      flags);
    if (ret < 0) {
        return ret;
    }
@@ -991,30 +1034,20 @@ int64_t blk_nb_sectors(BlockBackend *blk)
    return bdrv_nb_sectors(blk_bs(blk));
 }

-BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
-                          QEMUIOVector *iov, int nb_sectors,
-                          BlockCompletionFunc *cb, void *opaque)
-{
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
-                        blk_aio_read_entry, 0, cb, opaque);
-}
-
-BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
-                           QEMUIOVector *iov, int nb_sectors,
+BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
+                           QEMUIOVector *qiov, BdrvRequestFlags flags,
                           BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
+    return blk_aio_prwv(blk, offset, qiov->size, qiov,
+                        blk_aio_read_entry, flags, cb, opaque);
+}

-    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
-                        blk_aio_write_entry, 0, cb, opaque);
+BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
+                            QEMUIOVector *qiov, BdrvRequestFlags flags,
+                            BlockCompletionFunc *cb, void *opaque)
+{
+    return blk_aio_prwv(blk, offset, qiov->size, qiov,
+                        blk_aio_write_entry, flags, cb, opaque);
 }

 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
@@ -1049,20 +1082,6 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
    bdrv_aio_cancel_async(acb);
 }

-int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)
-{
-    int i, ret;
-
-    for (i = 0; i < num_reqs; i++) {
-        ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors);
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
-    return bdrv_aio_multiwrite(blk_bs(blk), reqs, num_reqs);
-}
-
 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
    if (!blk_is_available(blk)) {
@@ -1375,7 +1394,14 @@ void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
    BlockDriverState *bs = blk_bs(blk);

    if (bs) {
+        if (blk->public.throttle_state) {
+            throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+        }
        bdrv_set_aio_context(bs, new_context);
+        if (blk->public.throttle_state) {
+            throttle_timers_attach_aio_context(&blk->public.throttle_timers,
+                                               new_context);
+        }
    }
 }

@@ -1444,15 +1470,10 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
 }

-int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                                     int nb_sectors, BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                      int count, BdrvRequestFlags flags)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return -EINVAL;
-    }
-
-    return blk_co_pwritev(blk, sector_num << BDRV_SECTOR_BITS,
-                          nb_sectors << BDRV_SECTOR_BITS, NULL,
+    return blk_co_pwritev(blk, offset, count, NULL,
                          flags | BDRV_REQ_ZERO_WRITE);
 }

@@ -1545,19 +1566,6 @@ void blk_update_root_state(BlockBackend *blk)
    blk->root_state.open_flags    = blk->root->bs->open_flags;
    blk->root_state.read_only     = blk->root->bs->read_only;
    blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
-
-    if (blk->root_state.throttle_group) {
-        g_free(blk->root_state.throttle_group);
-        throttle_group_unref(blk->root_state.throttle_state);
-    }
-    if (blk->root->bs->throttle_state) {
-        const char *name = throttle_group_get_name(blk->root->bs);
-        blk->root_state.throttle_group = g_strdup(name);
-        blk->root_state.throttle_state = throttle_group_incref(name);
-    } else {
-        blk->root_state.throttle_group = NULL;
-        blk->root_state.throttle_state = NULL;
-    }
 }

 /*
@@ -1568,9 +1576,6 @@ void blk_update_root_state(BlockBackend *blk)
 void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
 {
    bs->detect_zeroes = blk->root_state.detect_zeroes;
-    if (blk->root_state.throttle_group) {
-        bdrv_io_limits_enable(bs, blk->root_state.throttle_group);
-    }
 }

 /*
@@ -1633,3 +1638,62 @@ int blk_flush_all(void)

    return result;
 }
+
+
+/* throttling disk I/O limits */
+void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
+{
+    throttle_group_config(blk, cfg);
+}
+
+void blk_io_limits_disable(BlockBackend *blk)
+{
+    assert(blk->public.throttle_state);
+    bdrv_drained_begin(blk_bs(blk));
+    throttle_group_unregister_blk(blk);
+    bdrv_drained_end(blk_bs(blk));
+}
+
+/* should be called before blk_set_io_limits if a limit is set */
+void blk_io_limits_enable(BlockBackend *blk, const char *group)
+{
+    assert(!blk->public.throttle_state);
+    throttle_group_register_blk(blk, group);
+}
+
+void blk_io_limits_update_group(BlockBackend *blk, const char *group)
+{
+    /* this BB is not part of any group */
+    if (!blk->public.throttle_state) {
+        return;
+    }
+
+    /* this BB is a part of the same group than the one we want */
+    if (!g_strcmp0(throttle_group_get_name(blk), group)) {
+        return;
+    }
+
+    /* need to change the group this bs belong to */
+    blk_io_limits_disable(blk);
+    blk_io_limits_enable(blk, group);
+}
+
+static void blk_root_drained_begin(BdrvChild *child)
+{
+    BlockBackend *blk = child->opaque;
+
+    /* Note that blk->root may not be accessible here yet if we are just
+     * attaching to a BlockDriverState that is drained. Use child instead. */
+
+    if (blk->public.io_limits_disabled++ == 0) {
+        throttle_group_restart_blk(blk);
+    }
+}
+
+static void blk_root_drained_end(BdrvChild *child)
+{
+    BlockBackend *blk = child->opaque;
+
+    assert(blk->public.io_limits_disabled);
+    --blk->public.io_limits_disabled;
+}
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -27,6 +27,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"

 /**************************************************************/

@@ -104,6 +105,7 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
    int ret;

    bs->read_only = 1; // no write support yet
+    bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */

    ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs));
    if (ret < 0) {
@@ -221,38 +223,52 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
    return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
 }

-static int bochs_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
+    BDRVBochsState *s = bs->opaque;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    uint64_t bytes_done = 0;
+    QEMUIOVector local_qiov;
    int ret;

+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_iovec_init(&local_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+
    while (nb_sectors > 0) {
        int64_t block_offset = seek_to_sector(bs, sector_num);
        if (block_offset < 0) {
-            return block_offset;
-        } else if (block_offset > 0) {
-            ret = bdrv_pread(bs->file->bs, block_offset, buf, 512);
+            ret = block_offset;
+            goto fail;
+        }
+
+        qemu_iovec_reset(&local_qiov);
+        qemu_iovec_concat(&local_qiov, qiov, bytes_done, 512);
+
+        if (block_offset > 0) {
+            ret = bdrv_co_preadv(bs->file->bs, block_offset, 512,
+                                 &local_qiov, 0);
            if (ret < 0) {
-                return ret;
+                goto fail;
            }
        } else {
-            memset(buf, 0, 512);
+            qemu_iovec_memset(&local_qiov, 0, 0, 512);
        }
        nb_sectors--;
        sector_num++;
-        buf += 512;
+        bytes_done += 512;
    }
-    return 0;
-}

-static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVBochsState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = bochs_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&local_qiov);
+
    return ret;
 }

@@ -267,7 +283,7 @@ static BlockDriver bdrv_bochs = {
    .instance_size	= sizeof(BDRVBochsState),
    .bdrv_probe		= bochs_probe,
    .bdrv_open		= bochs_open,
-    .bdrv_read          = bochs_co_read,
+    .bdrv_co_preadv = bochs_co_preadv,
    .bdrv_close		= bochs_close,
 };

--- a/block/cloop.c
+++ b/block/cloop.c
@@ -26,6 +26,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include <zlib.h>

 /* Maximum compressed block size */
@@ -66,6 +67,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
    int ret;

    bs->read_only = 1;
+    bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */

    /* read header */
    ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4);
@@ -229,33 +231,38 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
    return 0;
 }

-static int cloop_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
    BDRVCloopState *s = bs->opaque;
-    int i;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    int ret, i;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_co_mutex_lock(&s->lock);

    for (i = 0; i < nb_sectors; i++) {
+        void *data;
        uint32_t sector_offset_in_block =
            ((sector_num + i) % s->sectors_per_block),
            block_num = (sector_num + i) / s->sectors_per_block;
        if (cloop_read_block(bs, block_num) != 0) {
-            return -1;
+            ret = -EIO;
+            goto fail;
        }
-        memcpy(buf + i * 512,
-            s->uncompressed_block + sector_offset_in_block * 512, 512);
-    }
-    return 0;
-}

-static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVCloopState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = cloop_read(bs, sector_num, buf, nb_sectors);
+        data = s->uncompressed_block + sector_offset_in_block * 512;
+        qemu_iovec_from_buf(qiov, i * 512, data, 512);
+    }
+
+    ret = 0;
+fail:
    qemu_co_mutex_unlock(&s->lock);
+
    return ret;
 }

@@ -273,7 +280,7 @@ static BlockDriver bdrv_cloop = {
    .instance_size  = sizeof(BDRVCloopState),
    .bdrv_probe     = cloop_probe,
    .bdrv_open      = cloop_open,
-    .bdrv_read      = cloop_co_read,
+    .bdrv_co_preadv = cloop_co_preadv,
    .bdrv_close     = cloop_close,
 };

--- a/block/commit.c
+++ b/block/commit.c
@@ -36,28 +36,36 @@ typedef struct CommitBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *active;
-    BlockDriverState *top;
-    BlockDriverState *base;
+    BlockBackend *top;
+    BlockBackend *base;
    BlockdevOnError on_error;
    int base_flags;
    int orig_overlay_flags;
    char *backing_file_str;
 } CommitBlockJob;

-static int coroutine_fn commit_populate(BlockDriverState *bs,
-                                        BlockDriverState *base,
+static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
                                        int64_t sector_num, int nb_sectors,
                                        void *buf)
 {
    int ret = 0;
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = buf,
+        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+    };

-    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
-    if (ret) {
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    ret = blk_co_preadv(bs, sector_num * BDRV_SECTOR_SIZE,
+                        qiov.size, &qiov, 0);
+    if (ret < 0) {
        return ret;
    }

-    ret = bdrv_write(base, sector_num, buf, nb_sectors);
-    if (ret) {
+    ret = blk_co_pwritev(base, sector_num * BDRV_SECTOR_SIZE,
+                         qiov.size, &qiov, 0);
+    if (ret < 0) {
        return ret;
    }

@@ -73,8 +81,8 @@ static void commit_complete(BlockJob *job, void *opaque)
    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
    CommitCompleteData *data = opaque;
    BlockDriverState *active = s->active;
-    BlockDriverState *top = s->top;
-    BlockDriverState *base = s->base;
+    BlockDriverState *top = blk_bs(s->top);
+    BlockDriverState *base = blk_bs(s->base);
    BlockDriverState *overlay_bs;
    int ret = data->ret;

@@ -94,6 +102,8 @@ static void commit_complete(BlockJob *job, void *opaque)
        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
    }
    g_free(s->backing_file_str);
+    blk_unref(s->top);
+    blk_unref(s->base);
    block_job_completed(&s->common, ret);
    g_free(data);
 }
@@ -102,8 +112,6 @@ static void coroutine_fn commit_run(void *opaque)
 {
    CommitBlockJob *s = opaque;
    CommitCompleteData *data;
-    BlockDriverState *top = s->top;
-    BlockDriverState *base = s->base;
    int64_t sector_num, end;
    int ret = 0;
    int n = 0;
@@ -111,27 +119,27 @@ static void coroutine_fn commit_run(void *opaque)
    int bytes_written = 0;
    int64_t base_len;

-    ret = s->common.len = bdrv_getlength(top);
+    ret = s->common.len = blk_getlength(s->top);


    if (s->common.len < 0) {
        goto out;
    }

-    ret = base_len = bdrv_getlength(base);
+    ret = base_len = blk_getlength(s->base);
    if (base_len < 0) {
        goto out;
    }

    if (base_len < s->common.len) {
-        ret = bdrv_truncate(base, s->common.len);
+        ret = blk_truncate(s->base, s->common.len);
        if (ret) {
            goto out;
        }
    }

    end = s->common.len >> BDRV_SECTOR_BITS;
-    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
+    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

    for (sector_num = 0; sector_num < end; sector_num += n) {
        uint64_t delay_ns = 0;
@@ -146,7 +154,8 @@ wait:
            break;
        }
        /* Copy if allocated above the base */
-        ret = bdrv_is_allocated_above(top, base, sector_num,
+        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
+                                      sector_num,
                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
                                      &n);
        copy = (ret == 1);
@@ -158,7 +167,7 @@ wait:
                    goto wait;
                }
            }
-            ret = commit_populate(top, base, sector_num, n, buf);
+            ret = commit_populate(s->top, s->base, sector_num, n, buf);
            bytes_written += n * BDRV_SECTOR_SIZE;
        }
        if (ret < 0) {
@@ -214,13 +223,6 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
    BlockDriverState *overlay_bs;
    Error *local_err = NULL;

-    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, "Invalid parameter combination");
-        return;
-    }
-
    assert(top != bs);
    if (top == base) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
@@ -260,8 +262,12 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
        return;
    }

-    s->base   = base;
-    s->top    = top;
+    s->base = blk_new();
+    blk_insert_bs(s->base, base);
+
+    s->top = blk_new();
+    blk_insert_bs(s->top, top);
+
    s->active = bs;

    s->base_flags          = orig_base_flags;
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -91,7 +91,7 @@ static ssize_t block_crypto_write_func(QCryptoBlock *block,
    struct BlockCryptoCreateData *data = opaque;
    ssize_t ret;

-    ret = blk_pwrite(data->blk, offset, buf, buflen);
+    ret = blk_pwrite(data->blk, offset, buf, buflen, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write encryption header");
        return ret;
@@ -196,7 +196,6 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
    OptsVisitor *ov;
    QCryptoBlockOpenOptions *ret = NULL;
    Error *local_err = NULL;
-    Error *end_err = NULL;

    ret = g_new0(QCryptoBlockOpenOptions, 1);
    ret->format = format;
@@ -219,9 +218,11 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
        error_setg(&local_err, "Unsupported block format %d", format);
        break;
    }
+    if (!local_err) {
+        visit_check_struct(opts_get_visitor(ov), &local_err);
+    }

-    visit_end_struct(opts_get_visitor(ov), &end_err);
-    error_propagate(&local_err, end_err);
+    visit_end_struct(opts_get_visitor(ov));

 out:
    if (local_err) {
@@ -242,7 +243,6 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
    OptsVisitor *ov;
    QCryptoBlockCreateOptions *ret = NULL;
    Error *local_err = NULL;
-    Error *end_err = NULL;

    ret = g_new0(QCryptoBlockCreateOptions, 1);
    ret->format = format;
@@ -265,9 +265,11 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
        error_setg(&local_err, "Unsupported block format %d", format);
        break;
    }
+    if (!local_err) {
+        visit_check_struct(opts_get_visitor(ov), &local_err);
+    }

-    visit_end_struct(opts_get_visitor(ov), &end_err);
-    error_propagate(&local_err, end_err);
+    visit_end_struct(opts_get_visitor(ov));

 out:
    if (local_err) {
--- a/block/curl.c
+++ b/block/curl.c
@@ -36,10 +36,16 @@
 // #define DEBUG_VERBOSE

 #ifdef DEBUG_CURL
-#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
+#define DEBUG_CURL_PRINT 1
 #else
-#define DPRINTF(fmt, ...) do { } while (0)
+#define DEBUG_CURL_PRINT 0
 #endif
+#define DPRINTF(fmt, ...)                                            \
+    do {                                                             \
+        if (DEBUG_CURL_PRINT) {                                      \
+            fprintf(stderr, fmt, ## __VA_ARGS__);                    \
+        }                                                            \
+    } while (0)

 #if LIBCURL_VERSION_NUM >= 0x071000
 /* The multi interface timer callback was introduced in 7.16.0 */
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -440,6 +440,8 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    int ret;

    bs->read_only = 1;
+    bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */
+
    s->n_chunks = 0;
    s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
    /* used by dmg_read_mish_block to keep track of the current I/O position */
@@ -659,38 +661,42 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
    return 0;
 }

-static int dmg_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+              QEMUIOVector *qiov, int flags)
 {
    BDRVDMGState *s = bs->opaque;
-    int i;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    int ret, i;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_co_mutex_lock(&s->lock);

    for (i = 0; i < nb_sectors; i++) {
        uint32_t sector_offset_in_chunk;
+        void *data;
+
        if (dmg_read_chunk(bs, sector_num + i) != 0) {
-            return -1;
+            ret = -EIO;
+            goto fail;
        }
        /* Special case: current chunk is all zeroes. Do not perform a memcpy as
         * s->uncompressed_chunk may be too small to cover the large all-zeroes
         * section. dmg_read_chunk is called to find s->current_chunk */
        if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */
-            memset(buf + i * 512, 0, 512);
+            qemu_iovec_memset(qiov, i * 512, 0, 512);
            continue;
        }
        sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk];
-        memcpy(buf + i * 512,
-               s->uncompressed_chunk + sector_offset_in_chunk * 512, 512);
+        data = s->uncompressed_chunk + sector_offset_in_chunk * 512;
+        qemu_iovec_from_buf(qiov, i * 512, data, 512);
    }
-    return 0;
-}

-static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num,
-                                    uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVDMGState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = dmg_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
    qemu_co_mutex_unlock(&s->lock);
    return ret;
 }
@@ -715,7 +721,7 @@ static BlockDriver bdrv_dmg = {
    .instance_size  = sizeof(BDRVDMGState),
    .bdrv_probe     = dmg_probe,
    .bdrv_open      = dmg_open,
-    .bdrv_read      = dmg_co_read,
+    .bdrv_co_preadv = dmg_co_preadv,
    .bdrv_close     = dmg_close,
 };

--- a/block/io.c
+++ b/block/io.c
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -456,8 +456,11 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    struct IscsiTask iTask;
    uint64_t lba;
    uint32_t num_sectors;
-    bool fua;
+    bool fua = flags & BDRV_REQ_FUA;

+    if (fua) {
+        assert(iscsilun->dpofua);
+    }
    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        return -EINVAL;
    }
@@ -472,7 +475,6 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
    iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
-    fua = iscsilun->dpofua && (flags & BDRV_REQ_FUA);
    if (iscsilun->use_16_for_rw) {
        iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
                                        NULL, num_sectors * iscsilun->block_size,
@@ -513,13 +515,6 @@ retry:
    return 0;
 }

-static int coroutine_fn
-iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                QEMUIOVector *iov)
-{
-    return iscsi_co_writev_flags(bs, sector_num, nb_sectors, iov, 0);
-}
-

 static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun,
                                             int64_t sector_num, int nb_sectors)
@@ -766,6 +761,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    acb->ioh->driver_status = 0;
    acb->ioh->host_status   = 0;
    acb->ioh->resid         = 0;
+    acb->ioh->status        = status;

 #define SG_ERR_DRIVER_SENSE    0x08

@@ -837,6 +833,13 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
        return &acb->common;
    }

+    if (acb->ioh->cmd_len > SCSI_CDB_MAX_SIZE) {
+        error_report("iSCSI: ioctl error CDB exceeds max size (%d > %d)",
+                     acb->ioh->cmd_len, SCSI_CDB_MAX_SIZE);
+        qemu_aio_unref(acb);
+        return NULL;
+    }
+
    acb->task = malloc(sizeof(struct scsi_task));
    if (acb->task == NULL) {
        error_report("iSCSI: Failed to allocate task for scsi command. %s",
@@ -1555,6 +1558,10 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    task = NULL;

    iscsi_modesense_sync(iscsilun);
+    if (iscsilun->dpofua) {
+        bs->supported_write_flags = BDRV_REQ_FUA;
+    }
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;

    /* Check the write protect flag of the LUN if we want to write */
    if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
@@ -1847,9 +1854,7 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_co_discard      = iscsi_co_discard,
    .bdrv_co_write_zeroes = iscsi_co_write_zeroes,
    .bdrv_co_readv         = iscsi_co_readv,
-    .bdrv_co_writev        = iscsi_co_writev,
    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
-    .supported_write_flags = BDRV_REQ_FUA,
    .bdrv_co_flush_to_disk = iscsi_co_flush,

 #ifdef __linux__
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -30,7 +30,7 @@

 struct qemu_laiocb {
    BlockAIOCB common;
-    struct qemu_laio_state *ctx;
+    LinuxAioState *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
@@ -46,7 +46,7 @@ typedef struct {
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
 } LaioQueue;

-struct qemu_laio_state {
+struct LinuxAioState {
    io_context_t ctx;
    EventNotifier e;

@@ -60,7 +60,7 @@ struct qemu_laio_state {
    int event_max;
 };

-static void ioq_submit(struct qemu_laio_state *s);
+static void ioq_submit(LinuxAioState *s);

 static inline ssize_t io_event_ret(struct io_event *ev)
 {
@@ -70,8 +70,7 @@ static inline ssize_t io_event_ret(struct io_event *ev)
 /*
 * Completes an AIO request (calls the callback and frees the ACB).
 */
-static void qemu_laio_process_completion(struct qemu_laio_state *s,
-    struct qemu_laiocb *laiocb)
+static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
 {
    int ret;

@@ -99,7 +98,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
- * the completion events array and index are kept in qemu_laio_state.  The BH
+ * the completion events array and index are kept in LinuxAioState.  The BH
 * reschedules itself as long as there are completions pending so it will
 * either be called again in a nested event loop or will be called after all
 * events have been completed.  When there are no events left to complete, the
@@ -107,7 +106,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
 */
 static void qemu_laio_completion_bh(void *opaque)
 {
-    struct qemu_laio_state *s = opaque;
+    LinuxAioState *s = opaque;

    /* Fetch more completion events when empty */
    if (s->event_idx == s->event_max) {
@@ -136,7 +135,7 @@ static void qemu_laio_completion_bh(void *opaque)
        laiocb->ret = io_event_ret(&s->events[s->event_idx]);
        s->event_idx++;

-        qemu_laio_process_completion(s, laiocb);
+        qemu_laio_process_completion(laiocb);
    }

    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
@@ -146,7 +145,7 @@ static void qemu_laio_completion_bh(void *opaque)

 static void qemu_laio_completion_cb(EventNotifier *e)
 {
-    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
+    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (event_notifier_test_and_clear(&s->e)) {
        qemu_bh_schedule(s->completion_bh);
@@ -185,7 +184,7 @@ static void ioq_init(LaioQueue *io_q)
    io_q->blocked = false;
 }

-static void ioq_submit(struct qemu_laio_state *s)
+static void ioq_submit(LinuxAioState *s)
 {
    int ret, len;
    struct qemu_laiocb *aiocb;
@@ -216,33 +215,25 @@ static void ioq_submit(struct qemu_laio_state *s)
    s->io_q.blocked = (s->io_q.n > 0);
 }

-void laio_io_plug(BlockDriverState *bs, void *aio_ctx)
+void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
 {
-    struct qemu_laio_state *s = aio_ctx;
-
-    s->io_q.plugged++;
+    assert(!s->io_q.plugged);
+    s->io_q.plugged = 1;
 }

-void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug)
+void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
 {
-    struct qemu_laio_state *s = aio_ctx;
-
-    assert(s->io_q.plugged > 0 || !unplug);
-
-    if (unplug && --s->io_q.plugged > 0) {
-        return;
-    }
-
+    assert(s->io_q.plugged);
+    s->io_q.plugged = 0;
    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
 }

-BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type)
 {
-    struct qemu_laio_state *s = aio_ctx;
    struct qemu_laiocb *laiocb;
    struct iocb *iocbs;
    off_t offset = sector_num * 512;
@@ -284,26 +275,22 @@ out_free_aiocb:
    return NULL;
 }

-void laio_detach_aio_context(void *s_, AioContext *old_context)
+void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 {
-    struct qemu_laio_state *s = s_;
-
    aio_set_event_notifier(old_context, &s->e, false, NULL);
    qemu_bh_delete(s->completion_bh);
 }

-void laio_attach_aio_context(void *s_, AioContext *new_context)
+void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
 {
-    struct qemu_laio_state *s = s_;
-
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb);
 }

-void *laio_init(void)
+LinuxAioState *laio_init(void)
 {
-    struct qemu_laio_state *s;
+    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));
    if (event_notifier_init(&s->e, false) < 0) {
@@ -325,10 +312,8 @@ out_free_state:
    return NULL;
 }

-void laio_cleanup(void *s_)
+void laio_cleanup(LinuxAioState *s)
 {
-    struct qemu_laio_state *s = s_;
-
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -20,7 +20,6 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
-#include "qemu/error-report.h"

 #define SLICE_TIME    100000000ULL /* ns */
 #define MAX_IN_FLIGHT 16
@@ -36,7 +35,7 @@ typedef struct MirrorBuffer {
 typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
-    BlockDriverState *target;
+    BlockBackend *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
@@ -80,11 +79,11 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
 {
    s->synced = false;
    if (read) {
-        return block_job_error_action(&s->common, s->common.bs,
-                                      s->on_source_error, true, error);
+        return block_job_error_action(&s->common, s->on_source_error,
+                                      true, error);
    } else {
-        return block_job_error_action(&s->common, s->target,
-                                      s->on_target_error, false, error);
+        return block_job_error_action(&s->common, s->on_target_error,
+                                      false, error);
    }
 }

@@ -157,7 +156,8 @@ static void mirror_read_complete(void *opaque, int ret)
        mirror_iteration_done(op, ret);
        return;
    }
-    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
+    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+                    op->nb_sectors * BDRV_SECTOR_SIZE,
                    mirror_write_complete, op);
 }

@@ -186,7 +186,7 @@ static int mirror_cow_align(MirrorBlockJob *s,
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
-        bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors,
+        bdrv_round_to_clusters(blk_bs(s->target), *sector_num, *nb_sectors,
                               &align_sector_num, &align_nb_sectors);
    }

@@ -224,7 +224,7 @@ static inline void mirror_wait_for_io(MirrorBlockJob *s)
 static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
 {
-    BlockDriverState *source = s->common.bs;
+    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret = nb_sectors;
    MirrorOp *op;
@@ -274,7 +274,8 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

-    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+                   nb_sectors * BDRV_SECTOR_SIZE,
                   mirror_read_complete, op);
    return ret;
 }
@@ -296,10 +297,11 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
-        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
-                         mirror_write_complete, op);
+        blk_aio_discard(s->target, sector_num, op->nb_sectors,
+                        mirror_write_complete, op);
    } else {
-        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
+        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
+                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
@@ -307,7 +309,7 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,

 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
-    BlockDriverState *source = s->common.bs;
+    BlockDriverState *source = blk_bs(s->common.blk);
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
@@ -384,7 +386,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
-            bdrv_round_to_clusters(s->target, sector_num, io_sectors,
+            bdrv_round_to_clusters(blk_bs(s->target), sector_num, io_sectors,
                                   &target_sector_num, &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
@@ -449,7 +451,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = s->common.bs;
+    BlockDriverState *src = blk_bs(s->common.blk);
+    BlockDriverState *target_bs = blk_bs(s->target);

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
@@ -461,26 +464,25 @@ static void mirror_exit(BlockJob *job, void *opaque)
    }

    if (s->should_complete && data->ret == 0) {
-        BlockDriverState *to_replace = s->common.bs;
+        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

-        /* This was checked in mirror_start_job(), but meanwhile one of the
-         * nodes could have been newly attached to a BlockBackend. */
-        if (to_replace->blk && s->target->blk) {
-            error_report("block job: Can't create node with two BlockBackends");
-            data->ret = -EINVAL;
-            goto out;
+        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
+            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

-        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
-            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
-        }
-        bdrv_replace_in_backing_chain(to_replace, s->target);
+        /* The mirror job has no requests in flight any more, but we need to
+         * drain potential other users of the BDS before changing the graph. */
+        bdrv_drained_begin(target_bs);
+        bdrv_replace_in_backing_chain(to_replace, target_bs);
+        bdrv_drained_end(target_bs);
+
+        /* We just changed the BDS the job BB refers to */
+        blk_remove_bs(job->blk);
+        blk_insert_bs(job->blk, src);
    }
-
-out:
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
@@ -490,8 +492,8 @@ out:
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
-    bdrv_op_unblock_all(s->target, s->common.blocker);
-    bdrv_unref(s->target);
+    bdrv_op_unblock_all(target_bs, s->common.blocker);
+    blk_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
@@ -505,7 +507,8 @@ static void coroutine_fn mirror_run(void *opaque)
 {
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
-    BlockDriverState *bs = s->common.bs;
+    BlockDriverState *bs = blk_bs(s->common.blk);
+    BlockDriverState *target_bs = blk_bs(s->target);
    int64_t sector_num, end, length;
    uint64_t last_pause_ns;
    BlockDriverInfo bdi;
@@ -541,18 +544,18 @@ static void coroutine_fn mirror_run(void *opaque)
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
-    bdrv_get_backing_filename(s->target, backing_filename,
+    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
-    if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) {
+    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
-    if (backing_filename[0] && !s->target->backing
+    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
-    s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov);
+    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
@@ -567,7 +570,7 @@ static void coroutine_fn mirror_run(void *opaque)
    if (!s->is_none_mode) {
        /* First part, loop on the sectors and initialize the dirty bitmap.  */
        BlockDriverState *base = s->base;
-        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target);
+        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(target_bs);

        for (sector_num = 0; sector_num < end; ) {
            /* Just to make sure we are not exceeding int limit. */
@@ -637,7 +640,7 @@ static void coroutine_fn mirror_run(void *opaque)
        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
-            ret = bdrv_flush(s->target);
+            ret = blk_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
@@ -710,15 +713,12 @@ immediate_exit:
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
-    if (s->target->blk) {
-        blk_iostatus_disable(s->target->blk);
-    }

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    /* Before we switch to target in mirror_exit, make sure data doesn't
     * change. */
-    bdrv_drained_begin(s->common.bs);
+    bdrv_drained_begin(bs);
    if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
        /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
         * above bdrv_drained_end isn't enough to quiesce it. This is ugly, we
@@ -739,22 +739,14 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }

-static void mirror_iostatus_reset(BlockJob *job)
-{
-    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-
-    if (s->target->blk) {
-        blk_iostatus_reset(s->target->blk);
-    }
-}
-
 static void mirror_complete(BlockJob *job, Error **errp)
 {
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    Error *local_err = NULL;
    int ret;

-    ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err);
+    ret = bdrv_open_backing_file(blk_bs(s->target), NULL, "backing",
+                                 &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return;
@@ -793,7 +785,6 @@ static const BlockJobDriver mirror_job_driver = {
    .instance_size = sizeof(MirrorBlockJob),
    .job_type      = BLOCK_JOB_TYPE_MIRROR,
    .set_speed     = mirror_set_speed,
-    .iostatus_reset= mirror_iostatus_reset,
    .complete      = mirror_complete,
 };

@@ -801,8 +792,6 @@ static const BlockJobDriver commit_active_job_driver = {
    .instance_size = sizeof(MirrorBlockJob),
    .job_type      = BLOCK_JOB_TYPE_COMMIT,
    .set_speed     = mirror_set_speed,
-    .iostatus_reset
-                   = mirror_iostatus_reset,
    .complete      = mirror_complete,
 };

@@ -819,7 +808,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             bool is_none_mode, BlockDriverState *base)
 {
    MirrorBlockJob *s;
-    BlockDriverState *replaced_bs;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
@@ -827,13 +815,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,

    assert ((granularity & (granularity - 1)) == 0);

-    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
-        return;
-    }
-
    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
@@ -843,30 +824,17 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

-    /* We can't support this case as long as the block layer can't handle
-     * multiple BlockBackends per BlockDriverState. */
-    if (replaces) {
-        replaced_bs = bdrv_lookup_bs(replaces, replaces, errp);
-        if (replaced_bs == NULL) {
-            return;
-        }
-    } else {
-        replaced_bs = bs;
-    }
-    if (replaced_bs->blk && target->blk) {
-        error_setg(errp, "Can't create node with two BlockBackends");
-        return;
-    }
-
    s = block_job_create(driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

+    s->target = blk_new();
+    blk_insert_bs(s->target, target);
+
    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
-    s->target = target;
    s->is_none_mode = is_none_mode;
    s->base = base;
    s->granularity = granularity;
@@ -876,16 +844,13 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
+        blk_unref(s->target);
        block_job_unref(&s->common);
        return;
    }

-    bdrv_op_block_all(s->target, s->common.blocker);
+    bdrv_op_block_all(target, s->common.blocker);

-    if (s->target->blk) {
-        blk_set_on_error(s->target->blk, on_target_error, on_target_error);
-        blk_iostatus_enable(s->target->blk);
-    }
    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co, s);
@@ -957,7 +922,6 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
        }
    }

-    bdrv_ref(base);
    mirror_start_job(bs, base, NULL, speed, 0, 0,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -243,15 +243,15 @@ static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,

 static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors, QEMUIOVector *qiov,
-                           int offset, int *flags)
+                           int offset, int flags)
 {
    NbdClientSession *client = nbd_get_client_session(bs);
    struct nbd_request request = { .type = NBD_CMD_WRITE };
    struct nbd_reply reply;
    ssize_t ret;

-    if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) {
-        *flags &= ~BDRV_REQ_FUA;
+    if (flags & BDRV_REQ_FUA) {
+        assert(client->nbdflags & NBD_FLAG_SEND_FUA);
        request.type |= NBD_CMD_FLAG_FUA;
    }

@@ -291,7 +291,7 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
 }

 int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
-                         int nb_sectors, QEMUIOVector *qiov, int *flags)
+                         int nb_sectors, QEMUIOVector *qiov, int flags)
 {
    int offset = 0;
    int ret;
@@ -414,6 +414,9 @@ int nbd_client_init(BlockDriverState *bs,
        logout("Failed to negotiate with the NBD server\n");
        return ret;
    }
+    if (client->nbdflags & NBD_FLAG_SEND_FUA) {
+        bs->supported_write_flags = BDRV_REQ_FUA;
+    }

    qemu_co_mutex_init(&client->send_mutex);
    qemu_co_mutex_init(&client->free_sema);
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -48,7 +48,7 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors);
 int nbd_client_co_flush(BlockDriverState *bs);
 int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
-                         int nb_sectors, QEMUIOVector *qiov, int *flags);
+                         int nb_sectors, QEMUIOVector *qiov, int flags);
 int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov);

--- a/block/nbd.c
+++ b/block/nbd.c
@@ -355,31 +355,6 @@ static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
    return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov);
 }

-static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num,
-                               int nb_sectors, QEMUIOVector *qiov, int flags)
-{
-    int ret;
-
-    ret = nbd_client_co_writev(bs, sector_num, nb_sectors, qiov, &flags);
-    if (ret < 0) {
-        return ret;
-    }
-
-    /* The flag wasn't sent to the server, so we need to emulate it with an
-     * explicit flush */
-    if (flags & BDRV_REQ_FUA) {
-        ret = nbd_client_co_flush(bs);
-    }
-
-    return ret;
-}
-
-static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
-                         int nb_sectors, QEMUIOVector *qiov)
-{
-    return nbd_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0);
-}
-
 static int nbd_co_flush(BlockDriverState *bs)
 {
    return nbd_client_co_flush(bs);
@@ -476,9 +451,7 @@ static BlockDriver bdrv_nbd = {
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
    .bdrv_co_readv              = nbd_co_readv,
-    .bdrv_co_writev             = nbd_co_writev,
-    .bdrv_co_writev_flags       = nbd_co_writev_flags,
-    .supported_write_flags      = BDRV_REQ_FUA,
+    .bdrv_co_writev_flags       = nbd_client_co_writev,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_discard            = nbd_co_discard,
@@ -496,9 +469,7 @@ static BlockDriver bdrv_nbd_tcp = {
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
    .bdrv_co_readv              = nbd_co_readv,
-    .bdrv_co_writev             = nbd_co_writev,
-    .bdrv_co_writev_flags       = nbd_co_writev_flags,
-    .supported_write_flags      = BDRV_REQ_FUA,
+    .bdrv_co_writev_flags       = nbd_client_co_writev,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_discard            = nbd_co_discard,
@@ -516,9 +487,7 @@ static BlockDriver bdrv_nbd_unix = {
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
    .bdrv_co_readv              = nbd_co_readv,
-    .bdrv_co_writev             = nbd_co_writev,
-    .bdrv_co_writev_flags       = nbd_co_writev_flags,
-    .supported_write_flags      = BDRV_REQ_FUA,
+    .bdrv_co_writev_flags       = nbd_client_co_writev,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_discard            = nbd_co_discard,
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -33,6 +33,7 @@
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "qemu/bitmap.h"
 #include "qapi/util.h"

@@ -512,11 +513,12 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    memset(tmp, 0, sizeof(tmp));
    memcpy(tmp, &header, sizeof(header));

-    ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE);
+    ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE, 0);
    if (ret < 0) {
        goto exit;
    }
-    ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0);
+    ret = blk_pwrite_zeroes(file, BDRV_SECTOR_SIZE,
+                            (bat_sectors - 1) << BDRV_SECTOR_BITS, 0);
    if (ret < 0) {
        goto exit;
    }
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -67,10 +67,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
    info->backing_file_depth = bdrv_get_backing_file_depth(bs);
    info->detect_zeroes = bs->detect_zeroes;

-    if (bs->throttle_state) {
+    if (blk && blk_get_public(blk)->throttle_state) {
        ThrottleConfig cfg;

-        throttle_group_get_config(bs, &cfg);
+        throttle_group_get_config(blk, &cfg);

        info->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
        info->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
@@ -118,7 +118,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
        info->iops_size = cfg.op_size;

        info->has_group = true;
-        info->group = g_strdup(throttle_group_get_name(bs));
+        info->group = g_strdup(throttle_group_get_name(blk));
    }

    info->write_threshold = bdrv_write_threshold_get(bs);
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -28,6 +28,7 @@
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include <zlib.h>
 #include "qapi/qmp/qerror.h"
 #include "crypto/cipher.h"
@@ -853,14 +854,14 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    /* write all the data */
-    ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header));
+    ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header), 0);
    if (ret != sizeof(header)) {
        goto exit;
    }

    if (backing_file) {
        ret = blk_pwrite(qcow_blk, sizeof(header),
-            backing_file, backing_filename_len);
+                         backing_file, backing_filename_len, 0);
        if (ret != backing_filename_len) {
            goto exit;
        }
@@ -869,8 +870,8 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    tmp = g_malloc0(BDRV_SECTOR_SIZE);
    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
        BDRV_SECTOR_SIZE); i++) {
-        ret = blk_pwrite(qcow_blk, header_size +
-            BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
+        ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i,
+                         tmp, BDRV_SECTOR_SIZE, 0);
        if (ret != BDRV_SECTOR_SIZE) {
            g_free(tmp);
            goto exit;
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -29,6 +29,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "block/qcow2.h"
+#include "qemu/bswap.h"
 #include "trace.h"

 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -28,6 +28,7 @@
 #include "block/block_int.h"
 #include "block/qcow2.h"
 #include "qemu/range.h"
+#include "qemu/bswap.h"

 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -26,6 +26,7 @@
 #include "qapi/error.h"
 #include "block/block_int.h"
 #include "block/qcow2.h"
+#include "qemu/bswap.h"
 #include "qemu/error-report.h"
 #include "qemu/cutils.h"

--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -36,6 +36,7 @@
 #include "trace.h"
 #include "qemu/option_int.h"
 #include "qemu/cutils.h"
+#include "qemu/bswap.h"

 /*
  Differences with QCOW:
@@ -1757,13 +1758,6 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)

    qcow2_close(bs);

-    bdrv_invalidate_cache(bs->file->bs, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        bs->drv = NULL;
-        return;
-    }
-
    memset(s, 0, sizeof(BDRVQcow2State));
    options = qdict_clone_shallow(bs->options);

@@ -2207,7 +2201,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
    }

-    ret = blk_pwrite(blk, 0, header, cluster_size);
+    ret = blk_pwrite(blk, 0, header, cluster_size, 0);
    g_free(header);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
@@ -2217,7 +2211,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    /* Write a refcount table with one refcount block */
    refcount_table = g_malloc0(2 * cluster_size);
    refcount_table[0] = cpu_to_be64(2 * cluster_size);
-    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size);
+    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0);
    g_free(refcount_table);

    if (ret < 0) {
@@ -2411,21 +2405,75 @@ finish:
    return ret;
 }

+
+static bool is_zero_cluster(BlockDriverState *bs, int64_t start)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int nr;
+    BlockDriverState *file;
+    int64_t res = bdrv_get_block_status_above(bs, NULL, start,
+                                              s->cluster_sectors, &nr, &file);
+    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == s->cluster_sectors;
+}
+
+static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int nr = s->cluster_sectors;
+    uint64_t off;
+    int ret;
+
+    ret = qcow2_get_cluster_offset(bs, start << BDRV_SECTOR_BITS, &nr, &off);
+    assert(nr == s->cluster_sectors);
+    return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO;
+}
+
 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
    int ret;
    BDRVQcow2State *s = bs->opaque;

-    /* Emulate misaligned zero writes */
-    if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
-        return -ENOTSUP;
+    int head = sector_num % s->cluster_sectors;
+    int tail = (sector_num + nb_sectors) % s->cluster_sectors;
+
+    if (head != 0 || tail != 0) {
+        int64_t cl_end = -1;
+
+        sector_num -= head;
+        nb_sectors += head;
+
+        if (tail != 0) {
+            nb_sectors += s->cluster_sectors - tail;
+        }
+
+        if (!is_zero_cluster(bs, sector_num)) {
+            return -ENOTSUP;
+        }
+
+        if (nb_sectors > s->cluster_sectors) {
+            /* Technically the request can cover 2 clusters, f.e. 4k write
+               at s->cluster_sectors - 2k offset. One of these cluster can
+               be zeroed, one unallocated */
+            cl_end = sector_num + nb_sectors - s->cluster_sectors;
+            if (!is_zero_cluster(bs, cl_end)) {
+                return -ENOTSUP;
+            }
+        }
+
+        qemu_co_mutex_lock(&s->lock);
+        /* We can have new write after previous check */
+        if (!is_zero_cluster_top_locked(bs, sector_num) ||
+                (cl_end > 0 && !is_zero_cluster_top_locked(bs, cl_end))) {
+            qemu_co_mutex_unlock(&s->lock);
+            return -ENOTSUP;
+        }
+    } else {
+        qemu_co_mutex_lock(&s->lock);
    }

    /* Whatever is left can use real zero clusters */
-    qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
-        nb_sectors);
+    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -16,6 +16,7 @@
 #include "trace.h"
 #include "qemu/sockets.h" /* for EINPROGRESS on Windows */
 #include "qed.h"
+#include "qemu/bswap.h"

 typedef struct {
    GenericCB gencb;
--- a/block/qed.c
+++ b/block/qed.c
@@ -15,6 +15,7 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/timer.h"
+#include "qemu/bswap.h"
 #include "trace.h"
 #include "qed.h"
 #include "qapi/qmp/qerror.h"
@@ -601,18 +602,18 @@ static int qed_create(const char *filename, uint32_t cluster_size,
    }

    qed_header_cpu_to_le(&header, &le_header);
-    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header));
+    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0);
    if (ret < 0) {
        goto out;
    }
    ret = blk_pwrite(blk, sizeof(le_header), backing_file,
-                     header.backing_filename_size);
+                     header.backing_filename_size, 0);
    if (ret < 0) {
        goto out;
    }

    l1_table = g_malloc0(l1_size);
-    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size);
+    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0);
    if (ret < 0) {
        goto out;
    }
@@ -1594,12 +1595,6 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)

    bdrv_qed_close(bs);

-    bdrv_invalidate_cache(bs->file->bs, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        return;
-    }
-
    memset(s, 0, sizeof(BDRVQEDState));
    ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err);
    if (local_err) {
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -14,6 +14,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "block/block_int.h"
 #include "qapi/qmp/qbool.h"
 #include "qapi/qmp/qdict.h"
@@ -67,6 +68,9 @@ typedef struct QuorumVotes {
 typedef struct BDRVQuorumState {
    BdrvChild **children;  /* children BlockDriverStates */
    int num_children;      /* children count */
+    unsigned next_child_index;  /* the index of the next child that should
+                                 * be added
+                                 */
    int threshold;         /* if less than threshold children reads gave the
                            * same result a quorum error occurs.
                            */
@@ -747,21 +751,6 @@ static int64_t quorum_getlength(BlockDriverState *bs)
    return result;
 }

-static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp)
-{
-    BDRVQuorumState *s = bs->opaque;
-    Error *local_err = NULL;
-    int i;
-
-    for (i = 0; i < s->num_children; i++) {
-        bdrv_invalidate_cache(s->children[i]->bs, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            return;
-        }
-    }
-}
-
 static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
 {
    BDRVQuorumState *s = bs->opaque;
@@ -898,9 +887,9 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EINVAL;
        goto exit;
    }
-    if (s->num_children < 2) {
+    if (s->num_children < 1) {
        error_setg(&local_err,
-                   "Number of provided children must be greater than 1");
+                   "Number of provided children must be 1 or more");
        ret = -EINVAL;
        goto exit;
    }
@@ -964,6 +953,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,

        opened[i] = true;
    }
+    s->next_child_index = s->num_children;

    g_free(opened);
    goto exit;
@@ -999,25 +989,70 @@ static void quorum_close(BlockDriverState *bs)
    g_free(s->children);
 }

-static void quorum_detach_aio_context(BlockDriverState *bs)
+static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
+                             Error **errp)
 {
    BDRVQuorumState *s = bs->opaque;
-    int i;
+    BdrvChild *child;
+    char indexstr[32];
+    int ret;

-    for (i = 0; i < s->num_children; i++) {
-        bdrv_detach_aio_context(s->children[i]->bs);
+    assert(s->num_children <= INT_MAX / sizeof(BdrvChild *));
+    if (s->num_children == INT_MAX / sizeof(BdrvChild *) ||
+        s->next_child_index == UINT_MAX) {
+        error_setg(errp, "Too many children");
+        return;
    }
+
+    ret = snprintf(indexstr, 32, "children.%u", s->next_child_index);
+    if (ret < 0 || ret >= 32) {
+        error_setg(errp, "cannot generate child name");
+        return;
+    }
+    s->next_child_index++;
+
+    bdrv_drained_begin(bs);
+
+    /* We can safely add the child now */
+    bdrv_ref(child_bs);
+    child = bdrv_attach_child(bs, child_bs, indexstr, &child_format);
+    s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
+    s->children[s->num_children++] = child;
+
+    bdrv_drained_end(bs);
 }

-static void quorum_attach_aio_context(BlockDriverState *bs,
-                                      AioContext *new_context)
+static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
+                             Error **errp)
 {
    BDRVQuorumState *s = bs->opaque;
    int i;

    for (i = 0; i < s->num_children; i++) {
-        bdrv_attach_aio_context(s->children[i]->bs, new_context);
+        if (s->children[i] == child) {
+            break;
+        }
    }
+
+    /* we have checked it in bdrv_del_child() */
+    assert(i < s->num_children);
+
+    if (s->num_children <= s->threshold) {
+        error_setg(errp,
+            "The number of children cannot be lower than the vote threshold %d",
+            s->threshold);
+        return;
+    }
+
+    bdrv_drained_begin(bs);
+
+    /* We can safely remove this child now */
+    memmove(&s->children[i], &s->children[i + 1],
+            (s->num_children - i - 1) * sizeof(BdrvChild *));
+    s->children = g_renew(BdrvChild *, s->children, --s->num_children);
+    bdrv_unref_child(bs, child);
+
+    bdrv_drained_end(bs);
 }

 static void quorum_refresh_filename(BlockDriverState *bs, QDict *options)
@@ -1070,10 +1105,9 @@ static BlockDriver bdrv_quorum = {

    .bdrv_aio_readv                     = quorum_aio_readv,
    .bdrv_aio_writev                    = quorum_aio_writev,
-    .bdrv_invalidate_cache              = quorum_invalidate_cache,

-    .bdrv_detach_aio_context            = quorum_detach_aio_context,
-    .bdrv_attach_aio_context            = quorum_attach_aio_context,
+    .bdrv_add_child                     = quorum_add_child,
+    .bdrv_del_child                     = quorum_del_child,

    .is_filter                          = true,
    .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -35,15 +35,16 @@

 /* linux-aio.c - Linux native implementation */
 #ifdef CONFIG_LINUX_AIO
-void *laio_init(void);
-void laio_cleanup(void *s);
-BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+typedef struct LinuxAioState LinuxAioState;
+LinuxAioState *laio_init(void);
+void laio_cleanup(LinuxAioState *s);
+BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type);
-void laio_detach_aio_context(void *s, AioContext *old_context);
-void laio_attach_aio_context(void *s, AioContext *new_context);
-void laio_io_plug(BlockDriverState *bs, void *aio_ctx);
-void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug);
+void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
+void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
+void laio_io_plug(BlockDriverState *bs, LinuxAioState *s);
+void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s);
 #endif

 #ifdef _WIN32
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -139,7 +139,7 @@ typedef struct BDRVRawState {

 #ifdef CONFIG_LINUX_AIO
    int use_aio;
-    void *aio_ctx;
+    LinuxAioState *aio_ctx;
 #endif
 #ifdef CONFIG_XFS
    bool is_xfs:1;
@@ -398,7 +398,7 @@ static void raw_attach_aio_context(BlockDriverState *bs,
 }

 #ifdef CONFIG_LINUX_AIO
-static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
+static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags)
 {
    int ret = -1;
    assert(aio_ctx != NULL);
@@ -517,6 +517,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,

    s->has_discard = true;
    s->has_write_zeroes = true;
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
        s->needs_alignment = true;
    }
@@ -1345,17 +1346,7 @@ static void raw_aio_unplug(BlockDriverState *bs)
 #ifdef CONFIG_LINUX_AIO
    BDRVRawState *s = bs->opaque;
    if (s->use_aio) {
-        laio_io_unplug(bs, s->aio_ctx, true);
-    }
-#endif
-}
-
-static void raw_aio_flush_io_queue(BlockDriverState *bs)
-{
-#ifdef CONFIG_LINUX_AIO
-    BDRVRawState *s = bs->opaque;
-    if (s->use_aio) {
-        laio_io_unplug(bs, s->aio_ctx, false);
+        laio_io_unplug(bs, s->aio_ctx);
    }
 #endif
 }
@@ -1949,7 +1940,6 @@ BlockDriver bdrv_file = {
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
@@ -2398,7 +2388,6 @@ static BlockDriver bdrv_host_device = {
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength	= raw_getlength,
@@ -2528,7 +2517,6 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength      = raw_getlength,
@@ -2664,7 +2652,6 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength      = raw_getlength,
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -105,8 +105,8 @@ raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    }

    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-    ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE,
-                             nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);
+    ret = bdrv_co_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE,
+                          nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);

 fail:
    if (qiov == &local_qiov) {
@@ -116,13 +116,6 @@ fail:
    return ret;
 }

-static int coroutine_fn
-raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-              QEMUIOVector *qiov)
-{
-    return raw_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0);
-}
-
 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum,
@@ -211,6 +204,8 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    bs->sg = bs->file->bs->sg;
+    bs->supported_write_flags = BDRV_REQ_FUA;
+    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP;

    if (bs->probed && !bdrv_is_read_only(bs)) {
        fprintf(stderr,
@@ -256,9 +251,7 @@ BlockDriver bdrv_raw = {
    .bdrv_close           = &raw_close,
    .bdrv_create          = &raw_create,
    .bdrv_co_readv        = &raw_co_readv,
-    .bdrv_co_writev       = &raw_co_writev,
    .bdrv_co_writev_flags = &raw_co_writev_flags,
-    .supported_write_flags = BDRV_REQ_FUA,
    .bdrv_co_write_zeroes = &raw_co_write_zeroes,
    .bdrv_co_discard      = &raw_co_discard,
    .bdrv_co_get_block_status = &raw_co_get_block_status,
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode)

 #undef DPRINTF
 #ifdef DEBUG_SDOG
-#define DPRINTF(fmt, args...)                                       \
-    do {                                                            \
-        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
-    } while (0)
+#define DEBUG_SDOG_PRINT 1
 #else
-#define DPRINTF(fmt, args...)
+#define DEBUG_SDOG_PRINT 0
 #endif
+#define DPRINTF(fmt, args...)                                           \
+    do {                                                                \
+        if (DEBUG_SDOG_PRINT) {                                         \
+            fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
+        }                                                               \
+    } while (0)

 typedef struct SheepdogAIOCB SheepdogAIOCB;

@@ -1678,7 +1681,7 @@ static int sd_prealloc(const char *filename, Error **errp)
        if (ret < 0) {
            goto out;
        }
-        ret = blk_pwrite(blk, idx * buf_size, buf, buf_size);
+        ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0);
        if (ret < 0) {
            goto out;
        }
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -373,9 +373,10 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
 bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
 {
    bool ok = true;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (ok && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -383,8 +384,12 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
            ok = bdrv_can_snapshot(bs);
        }
        aio_context_release(ctx);
+        if (!ok) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return ok;
 }
@@ -393,20 +398,28 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
                             Error **err)
 {
    int ret = 0;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;
    QEMUSnapshotInfo sn1, *snapshot = &sn1;

-    while (ret == 0 && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
        if (bdrv_can_snapshot(bs) &&
                bdrv_snapshot_find(bs, snapshot, name) >= 0) {
            ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err);
+            if (ret < 0) {
+                goto fail;
+            }
        }
        aio_context_release(ctx);
+        if (ret < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return ret;
 }
@@ -415,9 +428,10 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
 int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
 {
    int err = 0;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (err == 0 && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -425,8 +439,12 @@ int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
            err = bdrv_snapshot_goto(bs, name);
        }
        aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return err;
 }
@@ -435,9 +453,10 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
 {
    QEMUSnapshotInfo sn;
    int err = 0;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (err == 0 && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -445,8 +464,12 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
            err = bdrv_snapshot_find(bs, &sn, name);
        }
        aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return err;
 }
@@ -457,9 +480,10 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
                             BlockDriverState **first_bad_bs)
 {
    int err = 0;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (err == 0 && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -471,23 +495,32 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
            err = bdrv_snapshot_create(bs, sn);
        }
        aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return err;
 }

 BlockDriverState *bdrv_all_find_vmstate_bs(void)
 {
-    bool not_found = true;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (not_found && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);
+        bool found;

        aio_context_acquire(ctx);
-        not_found = !bdrv_can_snapshot(bs);
+        found = bdrv_can_snapshot(bs);
        aio_context_release(ctx);
+
+        if (found) {
+            break;
+        }
    }
    return bs;
 }
--- a/block/stream.c
+++ b/block/stream.c
@@ -39,7 +39,7 @@ typedef struct StreamBlockJob {
    char *backing_file_str;
 } StreamBlockJob;

-static int coroutine_fn stream_populate(BlockDriverState *bs,
+static int coroutine_fn stream_populate(BlockBackend *blk,
                                        int64_t sector_num, int nb_sectors,
                                        void *buf)
 {
@@ -52,7 +52,8 @@ static int coroutine_fn stream_populate(BlockDriverState *bs,
    qemu_iovec_init_external(&qiov, &iov, 1);

    /* Copy-on-read the unallocated clusters */
-    return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov);
+    return blk_co_preadv(blk, sector_num * BDRV_SECTOR_SIZE, qiov.size, &qiov,
+                         BDRV_REQ_COPY_ON_READ);
 }

 typedef struct {
@@ -64,6 +65,7 @@ static void stream_complete(BlockJob *job, void *opaque)
 {
    StreamBlockJob *s = container_of(job, StreamBlockJob, common);
    StreamCompleteData *data = opaque;
+    BlockDriverState *bs = blk_bs(job->blk);
    BlockDriverState *base = s->base;

    if (!block_job_is_cancelled(&s->common) && data->reached_end &&
@@ -75,8 +77,8 @@ static void stream_complete(BlockJob *job, void *opaque)
                base_fmt = base->drv->format_name;
            }
        }
-        data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt);
-        bdrv_set_backing_hd(job->bs, base);
+        data->ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+        bdrv_set_backing_hd(bs, base);
    }

    g_free(s->backing_file_str);
@@ -88,7 +90,8 @@ static void coroutine_fn stream_run(void *opaque)
 {
    StreamBlockJob *s = opaque;
    StreamCompleteData *data;
-    BlockDriverState *bs = s->common.bs;
+    BlockBackend *blk = s->common.blk;
+    BlockDriverState *bs = blk_bs(blk);
    BlockDriverState *base = s->base;
    int64_t sector_num = 0;
    int64_t end = -1;
@@ -159,12 +162,11 @@ wait:
                    goto wait;
                }
            }
-            ret = stream_populate(bs, sector_num, n, buf);
+            ret = stream_populate(blk, sector_num, n, buf);
        }
        if (ret < 0) {
            BlockErrorAction action =
-                block_job_error_action(&s->common, s->common.bs, s->on_error,
-                                       true, -ret);
+                block_job_error_action(&s->common, s->on_error, true, -ret);
            if (action == BLOCK_ERROR_ACTION_STOP) {
                n = 0;
                continue;
@@ -224,13 +226,6 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
 {
    StreamBlockJob *s;

-    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "on-error");
-        return;
-    }
-
    s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -23,13 +23,14 @@
 */

 #include "qemu/osdep.h"
+#include "sysemu/block-backend.h"
 #include "block/throttle-groups.h"
 #include "qemu/queue.h"
 #include "qemu/thread.h"
 #include "sysemu/qtest.h"

 /* The ThrottleGroup structure (with its ThrottleState) is shared
- * among different BlockDriverState and it's independent from
+ * among different BlockBackends and it's independent from
 * AioContext, so in order to use it from different threads it needs
 * its own locking.
 *
@@ -39,26 +40,26 @@
 * The whole ThrottleGroup structure is private and invisible to
 * outside users, that only use it through its ThrottleState.
 *
- * In addition to the ThrottleGroup structure, BlockDriverState has
+ * In addition to the ThrottleGroup structure, BlockBackendPublic has
 * fields that need to be accessed by other members of the group and
- * therefore also need to be protected by this lock. Once a BDS is
- * registered in a group those fields can be accessed by other threads
- * any time.
+ * therefore also need to be protected by this lock. Once a
+ * BlockBackend is registered in a group those fields can be accessed
+ * by other threads any time.
 *
 * Again, all this is handled internally and is mostly transparent to
 * the outside. The 'throttle_timers' field however has an additional
 * constraint because it may be temporarily invalid (see for example
 * bdrv_set_aio_context()). Therefore in this file a thread will
- * access some other BDS's timers only after verifying that that BDS
- * has throttled requests in the queue.
+ * access some other BlockBackend's timers only after verifying that
+ * that BlockBackend has throttled requests in the queue.
 */
 typedef struct ThrottleGroup {
    char *name; /* This is constant during the lifetime of the group */

    QemuMutex lock; /* This lock protects the following four fields */
    ThrottleState ts;
-    QLIST_HEAD(, BlockDriverState) head;
-    BlockDriverState *tokens[2];
+    QLIST_HEAD(, BlockBackendPublic) head;
+    BlockBackend *tokens[2];
    bool any_timer_armed[2];

    /* These two are protected by the global throttle_groups_lock */
@@ -132,93 +133,98 @@ void throttle_group_unref(ThrottleState *ts)
    qemu_mutex_unlock(&throttle_groups_lock);
 }

-/* Get the name from a BlockDriverState's ThrottleGroup. The name (and
- * the pointer) is guaranteed to remain constant during the lifetime
- * of the group.
+/* Get the name from a BlockBackend's ThrottleGroup. The name (and the pointer)
+ * is guaranteed to remain constant during the lifetime of the group.
 *
- * @bs:   a BlockDriverState that is member of a throttling group
+ * @blk:  a BlockBackend that is member of a throttling group
 * @ret:  the name of the group.
 */
-const char *throttle_group_get_name(BlockDriverState *bs)
+const char *throttle_group_get_name(BlockBackend *blk)
 {
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
    return tg->name;
 }

-/* Return the next BlockDriverState in the round-robin sequence,
- * simulating a circular list.
+/* Return the next BlockBackend in the round-robin sequence, simulating a
+ * circular list.
 *
 * This assumes that tg->lock is held.
 *
- * @bs:  the current BlockDriverState
- * @ret: the next BlockDriverState in the sequence
+ * @blk: the current BlockBackend
+ * @ret: the next BlockBackend in the sequence
 */
-static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs)
+static BlockBackend *throttle_group_next_blk(BlockBackend *blk)
 {
-    ThrottleState *ts = bs->throttle_state;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleState *ts = blkp->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
-    BlockDriverState *next = QLIST_NEXT(bs, round_robin);
+    BlockBackendPublic *next = QLIST_NEXT(blkp, round_robin);

    if (!next) {
-        return QLIST_FIRST(&tg->head);
+        next = QLIST_FIRST(&tg->head);
    }

-    return next;
+    return blk_by_public(next);
 }

-/* Return the next BlockDriverState in the round-robin sequence with
- * pending I/O requests.
+/* Return the next BlockBackend in the round-robin sequence with pending I/O
+ * requests.
 *
 * This assumes that tg->lock is held.
 *
- * @bs:        the current BlockDriverState
+ * @blk:       the current BlockBackend
 * @is_write:  the type of operation (read/write)
- * @ret:       the next BlockDriverState with pending requests, or bs
- *             if there is none.
+ * @ret:       the next BlockBackend with pending requests, or blk if there is
+ *             none.
 */
-static BlockDriverState *next_throttle_token(BlockDriverState *bs,
-                                             bool is_write)
+static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)
 {
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
-    BlockDriverState *token, *start;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
+    BlockBackend *token, *start;

    start = token = tg->tokens[is_write];

    /* get next bs round in round robin style */
-    token = throttle_group_next_bs(token);
-    while (token != start && !token->pending_reqs[is_write]) {
-        token = throttle_group_next_bs(token);
+    token = throttle_group_next_blk(token);
+    while (token != start && !blkp->pending_reqs[is_write]) {
+        token = throttle_group_next_blk(token);
    }

    /* If no IO are queued for scheduling on the next round robin token
     * then decide the token is the current bs because chances are
     * the current bs get the current request queued.
     */
-    if (token == start && !token->pending_reqs[is_write]) {
-        token = bs;
+    if (token == start && !blkp->pending_reqs[is_write]) {
+        token = blk;
    }

    return token;
 }

-/* Check if the next I/O request for a BlockDriverState needs to be
- * throttled or not. If there's no timer set in this group, set one
- * and update the token accordingly.
+/* Check if the next I/O request for a BlockBackend needs to be throttled or
+ * not. If there's no timer set in this group, set one and update the token
+ * accordingly.
 *
 * This assumes that tg->lock is held.
 *
- * @bs:         the current BlockDriverState
+ * @blk:        the current BlockBackend
 * @is_write:   the type of operation (read/write)
 * @ret:        whether the I/O request needs to be throttled or not
 */
-static bool throttle_group_schedule_timer(BlockDriverState *bs,
-                                          bool is_write)
+static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write)
 {
-    ThrottleState *ts = bs->throttle_state;
-    ThrottleTimers *tt = &bs->throttle_timers;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleState *ts = blkp->throttle_state;
+    ThrottleTimers *tt = &blkp->throttle_timers;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    bool must_wait;

+    if (blkp->io_limits_disabled) {
+        return false;
+    }
+
    /* Check if any of the timers in this group is already armed */
    if (tg->any_timer_armed[is_write]) {
        return true;
@@ -226,9 +232,9 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs,

    must_wait = throttle_schedule_timer(ts, tt, is_write);

-    /* If a timer just got armed, set bs as the current token */
+    /* If a timer just got armed, set blk as the current token */
    if (must_wait) {
-        tg->tokens[is_write] = bs;
+        tg->tokens[is_write] = blk;
        tg->any_timer_armed[is_write] = true;
    }

@@ -239,18 +245,19 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs,
 *
 * This assumes that tg->lock is held.
 *
- * @bs:        the current BlockDriverState
+ * @blk:       the current BlockBackend
 * @is_write:  the type of operation (read/write)
 */
-static void schedule_next_request(BlockDriverState *bs, bool is_write)
+static void schedule_next_request(BlockBackend *blk, bool is_write)
 {
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
    bool must_wait;
-    BlockDriverState *token;
+    BlockBackend *token;

    /* Check if there's any pending request to schedule next */
-    token = next_throttle_token(bs, is_write);
-    if (!token->pending_reqs[is_write]) {
+    token = next_throttle_token(blk, is_write);
+    if (!blkp->pending_reqs[is_write]) {
        return;
    }

@@ -259,12 +266,12 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write)

    /* If it doesn't have to wait, queue it for immediate execution */
    if (!must_wait) {
-        /* Give preference to requests from the current bs */
+        /* Give preference to requests from the current blk */
        if (qemu_in_coroutine() &&
-            qemu_co_queue_next(&bs->throttled_reqs[is_write])) {
-            token = bs;
+            qemu_co_queue_next(&blkp->throttled_reqs[is_write])) {
+            token = blk;
        } else {
-            ThrottleTimers *tt = &token->throttle_timers;
+            ThrottleTimers *tt = &blkp->throttle_timers;
            int64_t now = qemu_clock_get_ns(tt->clock_type);
            timer_mod(tt->timers[is_write], now + 1);
            tg->any_timer_armed[is_write] = true;
@@ -277,53 +284,67 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write)
 * if necessary, and schedule the next request using a round robin
 * algorithm.
 *
- * @bs:        the current BlockDriverState
+ * @blk:       the current BlockBackend
 * @bytes:     the number of bytes for this I/O
 * @is_write:  the type of operation (read/write)
 */
-void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs,
+void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
                                                        unsigned int bytes,
                                                        bool is_write)
 {
    bool must_wait;
-    BlockDriverState *token;
+    BlockBackend *token;

-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
    qemu_mutex_lock(&tg->lock);

    /* First we check if this I/O has to be throttled. */
-    token = next_throttle_token(bs, is_write);
+    token = next_throttle_token(blk, is_write);
    must_wait = throttle_group_schedule_timer(token, is_write);

    /* Wait if there's a timer set or queued requests of this type */
-    if (must_wait || bs->pending_reqs[is_write]) {
-        bs->pending_reqs[is_write]++;
+    if (must_wait || blkp->pending_reqs[is_write]) {
+        blkp->pending_reqs[is_write]++;
        qemu_mutex_unlock(&tg->lock);
-        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+        qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
        qemu_mutex_lock(&tg->lock);
-        bs->pending_reqs[is_write]--;
+        blkp->pending_reqs[is_write]--;
    }

    /* The I/O will be executed, so do the accounting */
-    throttle_account(bs->throttle_state, is_write, bytes);
+    throttle_account(blkp->throttle_state, is_write, bytes);

    /* Schedule the next request */
-    schedule_next_request(bs, is_write);
+    schedule_next_request(blk, is_write);

    qemu_mutex_unlock(&tg->lock);
 }

+void throttle_group_restart_blk(BlockBackend *blk)
+{
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        while (qemu_co_enter_next(&blkp->throttled_reqs[i])) {
+            ;
+        }
+    }
+}
+
 /* Update the throttle configuration for a particular group. Similar
 * to throttle_config(), but guarantees atomicity within the
 * throttling group.
 *
- * @bs:  a BlockDriverState that is member of the group
+ * @blk: a BlockBackend that is a member of the group
 * @cfg: the configuration to set
 */
-void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
+void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg)
 {
-    ThrottleTimers *tt = &bs->throttle_timers;
-    ThrottleState *ts = bs->throttle_state;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleTimers *tt = &blkp->throttle_timers;
+    ThrottleState *ts = blkp->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    qemu_mutex_lock(&tg->lock);
    /* throttle_config() cancels the timers */
@@ -335,18 +356,22 @@ void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
    }
    throttle_config(ts, tt, cfg);
    qemu_mutex_unlock(&tg->lock);
+
+    qemu_co_enter_next(&blkp->throttled_reqs[0]);
+    qemu_co_enter_next(&blkp->throttled_reqs[1]);
 }

 /* Get the throttle configuration from a particular group. Similar to
 * throttle_get_config(), but guarantees atomicity within the
 * throttling group.
 *
- * @bs:  a BlockDriverState that is member of the group
+ * @blk: a BlockBackend that is a member of the group
 * @cfg: the configuration will be written here
 */
-void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
+void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg)
 {
-    ThrottleState *ts = bs->throttle_state;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleState *ts = blkp->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    qemu_mutex_lock(&tg->lock);
    throttle_get_config(ts, cfg);
@@ -356,12 +381,13 @@ void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
 /* ThrottleTimers callback. This wakes up a request that was waiting
 * because it had been throttled.
 *
- * @bs:        the BlockDriverState whose request had been throttled
+ * @blk:       the BlockBackend whose request had been throttled
 * @is_write:  the type of operation (read/write)
 */
-static void timer_cb(BlockDriverState *bs, bool is_write)
+static void timer_cb(BlockBackend *blk, bool is_write)
 {
-    ThrottleState *ts = bs->throttle_state;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleState *ts = blkp->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    bool empty_queue;

@@ -371,13 +397,13 @@ static void timer_cb(BlockDriverState *bs, bool is_write)
    qemu_mutex_unlock(&tg->lock);

    /* Run the request that was waiting for this timer */
-    empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]);
+    empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);

    /* If the request queue was empty then we have to take care of
     * scheduling the next one */
    if (empty_queue) {
        qemu_mutex_lock(&tg->lock);
-        schedule_next_request(bs, is_write);
+        schedule_next_request(blk, is_write);
        qemu_mutex_unlock(&tg->lock);
    }
 }
@@ -392,17 +418,17 @@ static void write_timer_cb(void *opaque)
    timer_cb(opaque, true);
 }

-/* Register a BlockDriverState in the throttling group, also
- * initializing its timers and updating its throttle_state pointer to
- * point to it. If a throttling group with that name does not exist
- * yet, it will be created.
+/* Register a BlockBackend in the throttling group, also initializing its
+ * timers and updating its throttle_state pointer to point to it. If a
+ * throttling group with that name does not exist yet, it will be created.
 *
- * @bs:        the BlockDriverState to insert
+ * @blk:       the BlockBackend to insert
 * @groupname: the name of the group
 */
-void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
+void throttle_group_register_blk(BlockBackend *blk, const char *groupname)
 {
    int i;
+    BlockBackendPublic *blkp = blk_get_public(blk);
    ThrottleState *ts = throttle_group_incref(groupname);
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    int clock_type = QEMU_CLOCK_REALTIME;
@@ -412,67 +438,67 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
        clock_type = QEMU_CLOCK_VIRTUAL;
    }

-    bs->throttle_state = ts;
+    blkp->throttle_state = ts;

    qemu_mutex_lock(&tg->lock);
-    /* If the ThrottleGroup is new set this BlockDriverState as the token */
+    /* If the ThrottleGroup is new set this BlockBackend as the token */
    for (i = 0; i < 2; i++) {
        if (!tg->tokens[i]) {
-            tg->tokens[i] = bs;
+            tg->tokens[i] = blk;
        }
    }

-    QLIST_INSERT_HEAD(&tg->head, bs, round_robin);
+    QLIST_INSERT_HEAD(&tg->head, blkp, round_robin);

-    throttle_timers_init(&bs->throttle_timers,
-                         bdrv_get_aio_context(bs),
+    throttle_timers_init(&blkp->throttle_timers,
+                         blk_get_aio_context(blk),
                         clock_type,
                         read_timer_cb,
                         write_timer_cb,
-                         bs);
+                         blk);

    qemu_mutex_unlock(&tg->lock);
 }

-/* Unregister a BlockDriverState from its group, removing it from the
- * list, destroying the timers and setting the throttle_state pointer
- * to NULL.
+/* Unregister a BlockBackend from its group, removing it from the list,
+ * destroying the timers and setting the throttle_state pointer to NULL.
 *
- * The BlockDriverState must not have pending throttled requests, so
- * the caller has to drain them first.
+ * The BlockBackend must not have pending throttled requests, so the caller has
+ * to drain them first.
 *
 * The group will be destroyed if it's empty after this operation.
 *
- * @bs: the BlockDriverState to remove
+ * @blk: the BlockBackend to remove
 */
-void throttle_group_unregister_bs(BlockDriverState *bs)
+void throttle_group_unregister_blk(BlockBackend *blk)
 {
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
    int i;

-    assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0);
-    assert(qemu_co_queue_empty(&bs->throttled_reqs[0]));
-    assert(qemu_co_queue_empty(&bs->throttled_reqs[1]));
+    assert(blkp->pending_reqs[0] == 0 && blkp->pending_reqs[1] == 0);
+    assert(qemu_co_queue_empty(&blkp->throttled_reqs[0]));
+    assert(qemu_co_queue_empty(&blkp->throttled_reqs[1]));

    qemu_mutex_lock(&tg->lock);
    for (i = 0; i < 2; i++) {
-        if (tg->tokens[i] == bs) {
-            BlockDriverState *token = throttle_group_next_bs(bs);
-            /* Take care of the case where this is the last bs in the group */
-            if (token == bs) {
+        if (tg->tokens[i] == blk) {
+            BlockBackend *token = throttle_group_next_blk(blk);
+            /* Take care of the case where this is the last blk in the group */
+            if (token == blk) {
                token = NULL;
            }
            tg->tokens[i] = token;
        }
    }

-    /* remove the current bs from the list */
-    QLIST_REMOVE(bs, round_robin);
-    throttle_timers_destroy(&bs->throttle_timers);
+    /* remove the current blk from the list */
+    QLIST_REMOVE(blkp, round_robin);
+    throttle_timers_destroy(&blkp->throttle_timers);
    qemu_mutex_unlock(&tg->lock);

    throttle_group_unref(&tg->ts);
-    bs->throttle_state = NULL;
+    blkp->throttle_state = NULL;
 }

 static void throttle_groups_init(void)
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -54,6 +54,7 @@
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "migration/migration.h"
 #include "qemu/coroutine.h"
 #include "qemu/cutils.h"
@@ -557,98 +558,109 @@ static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs,
    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 }

-static int vdi_co_read(BlockDriverState *bs,
-        int64_t sector_num, uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+              QEMUIOVector *qiov, int flags)
 {
    BDRVVdiState *s = bs->opaque;
+    QEMUIOVector local_qiov;
    uint32_t bmap_entry;
    uint32_t block_index;
-    uint32_t sector_in_block;
-    uint32_t n_sectors;
+    uint32_t offset_in_block;
+    uint32_t n_bytes;
+    uint64_t bytes_done = 0;
    int ret = 0;

    logout("\n");

-    while (ret >= 0 && nb_sectors > 0) {
-        block_index = sector_num / s->block_sectors;
-        sector_in_block = sector_num % s->block_sectors;
-        n_sectors = s->block_sectors - sector_in_block;
-        if (n_sectors > nb_sectors) {
-            n_sectors = nb_sectors;
-        }
+    qemu_iovec_init(&local_qiov, qiov->niov);

-        logout("will read %u sectors starting at sector %" PRIu64 "\n",
-               n_sectors, sector_num);
+    while (ret >= 0 && bytes > 0) {
+        block_index = offset / s->block_size;
+        offset_in_block = offset % s->block_size;
+        n_bytes = MIN(bytes, s->block_size - offset_in_block);
+
+        logout("will read %u bytes starting at offset %" PRIu64 "\n",
+               n_bytes, offset);

        /* prepare next AIO request */
        bmap_entry = le32_to_cpu(s->bmap[block_index]);
        if (!VDI_IS_ALLOCATED(bmap_entry)) {
            /* Block not allocated, return zeros, no need to wait. */
-            memset(buf, 0, n_sectors * SECTOR_SIZE);
+            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
            ret = 0;
        } else {
-            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
-                              (uint64_t)bmap_entry * s->block_sectors +
-                              sector_in_block;
-            ret = bdrv_read(bs->file->bs, offset, buf, n_sectors);
-        }
-        logout("%u sectors read\n", n_sectors);
+            uint64_t data_offset = s->header.offset_data +
+                                   (uint64_t)bmap_entry * s->block_size +
+                                   offset_in_block;

-        nb_sectors -= n_sectors;
-        sector_num += n_sectors;
-        buf += n_sectors * SECTOR_SIZE;
+            qemu_iovec_reset(&local_qiov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+            ret = bdrv_co_preadv(bs->file->bs, data_offset, n_bytes,
+                                 &local_qiov, 0);
+        }
+        logout("%u bytes read\n", n_bytes);
+
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;
    }

+    qemu_iovec_destroy(&local_qiov);
+
    return ret;
 }

-static int vdi_co_write(BlockDriverState *bs,
-        int64_t sector_num, const uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+               QEMUIOVector *qiov, int flags)
 {
    BDRVVdiState *s = bs->opaque;
+    QEMUIOVector local_qiov;
    uint32_t bmap_entry;
    uint32_t block_index;
-    uint32_t sector_in_block;
-    uint32_t n_sectors;
+    uint32_t offset_in_block;
+    uint32_t n_bytes;
    uint32_t bmap_first = VDI_UNALLOCATED;
    uint32_t bmap_last = VDI_UNALLOCATED;
    uint8_t *block = NULL;
+    uint64_t bytes_done = 0;
    int ret = 0;

    logout("\n");

-    while (ret >= 0 && nb_sectors > 0) {
-        block_index = sector_num / s->block_sectors;
-        sector_in_block = sector_num % s->block_sectors;
-        n_sectors = s->block_sectors - sector_in_block;
-        if (n_sectors > nb_sectors) {
-            n_sectors = nb_sectors;
-        }
+    qemu_iovec_init(&local_qiov, qiov->niov);

-        logout("will write %u sectors starting at sector %" PRIu64 "\n",
-               n_sectors, sector_num);
+    while (ret >= 0 && bytes > 0) {
+        block_index = offset / s->block_size;
+        offset_in_block = offset % s->block_size;
+        n_bytes = MIN(bytes, s->block_size - offset_in_block);
+
+        logout("will write %u bytes starting at offset %" PRIu64 "\n",
+               n_bytes, offset);

        /* prepare next AIO request */
        bmap_entry = le32_to_cpu(s->bmap[block_index]);
        if (!VDI_IS_ALLOCATED(bmap_entry)) {
            /* Allocate new block and write to it. */
-            uint64_t offset;
+            uint64_t data_offset;
            bmap_entry = s->header.blocks_allocated;
            s->bmap[block_index] = cpu_to_le32(bmap_entry);
            s->header.blocks_allocated++;
-            offset = s->header.offset_data / SECTOR_SIZE +
-                     (uint64_t)bmap_entry * s->block_sectors;
+            data_offset = s->header.offset_data +
+                          (uint64_t)bmap_entry * s->block_size;
            if (block == NULL) {
                block = g_malloc(s->block_size);
                bmap_first = block_index;
            }
            bmap_last = block_index;
            /* Copy data to be written to new block and zero unused parts. */
-            memset(block, 0, sector_in_block * SECTOR_SIZE);
-            memcpy(block + sector_in_block * SECTOR_SIZE,
-                   buf, n_sectors * SECTOR_SIZE);
-            memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
-                   (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
+            memset(block, 0, offset_in_block);
+            qemu_iovec_to_buf(qiov, bytes_done, block + offset_in_block,
+                              n_bytes);
+            memset(block + offset_in_block + n_bytes, 0,
+                   s->block_size - n_bytes - offset_in_block);

            /* Note that this coroutine does not yield anywhere from reading the
             * bmap entry until here, so in regards to all the coroutines trying
@@ -658,12 +670,12 @@ static int vdi_co_write(BlockDriverState *bs,
             * acquire the lock and thus the padded cluster is written before
             * the other coroutines can write to the affected area. */
            qemu_co_mutex_lock(&s->write_lock);
-            ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors);
+            ret = bdrv_pwrite(bs->file->bs, data_offset, block, s->block_size);
            qemu_co_mutex_unlock(&s->write_lock);
        } else {
-            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
-                              (uint64_t)bmap_entry * s->block_sectors +
-                              sector_in_block;
+            uint64_t data_offset = s->header.offset_data +
+                                   (uint64_t)bmap_entry * s->block_size +
+                                   offset_in_block;
            qemu_co_mutex_lock(&s->write_lock);
            /* This lock is only used to make sure the following write operation
             * is executed after the write issued by the coroutine allocating
@@ -674,16 +686,23 @@ static int vdi_co_write(BlockDriverState *bs,
             * that that write operation has returned (there may be other writes
             * in flight, but they do not concern this very operation). */
            qemu_co_mutex_unlock(&s->write_lock);
-            ret = bdrv_write(bs->file->bs, offset, buf, n_sectors);
+
+            qemu_iovec_reset(&local_qiov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+            ret = bdrv_co_pwritev(bs->file->bs, data_offset, n_bytes,
+                                  &local_qiov, 0);
        }

-        nb_sectors -= n_sectors;
-        sector_num += n_sectors;
-        buf += n_sectors * SECTOR_SIZE;
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;

-        logout("%u sectors written\n", n_sectors);
+        logout("%u bytes written\n", n_bytes);
    }

+    qemu_iovec_destroy(&local_qiov);
+
    logout("finished data write\n");
    if (ret < 0) {
        return ret;
@@ -694,6 +713,7 @@ static int vdi_co_write(BlockDriverState *bs,
        VdiHeader *header = (VdiHeader *) block;
        uint8_t *base;
        uint64_t offset;
+        uint32_t n_sectors;

        logout("now writing modified header\n");
        assert(VDI_IS_ALLOCATED(bmap_first));
@@ -808,7 +828,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    vdi_header_print(&header);
 #endif
    vdi_header_to_le(&header);
-    ret = blk_pwrite(blk, offset, &header, sizeof(header));
+    ret = blk_pwrite(blk, offset, &header, sizeof(header), 0);
    if (ret < 0) {
        error_setg(errp, "Error writing header to %s", filename);
        goto exit;
@@ -829,7 +849,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
                bmap[i] = VDI_UNALLOCATED;
            }
        }
-        ret = blk_pwrite(blk, offset, bmap, bmap_size);
+        ret = blk_pwrite(blk, offset, bmap, bmap_size, 0);
        if (ret < 0) {
            error_setg(errp, "Error writing bmap to %s", filename);
            goto exit;
@@ -903,9 +923,9 @@ static BlockDriver bdrv_vdi = {
    .bdrv_co_get_block_status = vdi_co_get_block_status,
    .bdrv_make_empty = vdi_make_empty,

-    .bdrv_read = vdi_co_read,
+    .bdrv_co_preadv     = vdi_co_preadv,
 #if defined(CONFIG_VDI_WRITE)
-    .bdrv_write = vdi_co_write,
+    .bdrv_co_pwritev    = vdi_co_pwritev,
 #endif

    .bdrv_get_info = vdi_get_info,
--- a/block/vhdx-endian.c
+++ b/block/vhdx-endian.c
@@ -18,6 +18,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "qemu/bswap.h"
 #include "block/vhdx.h"

 #include <uuid/uuid.h>
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -23,6 +23,7 @@
 #include "block/block_int.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "block/vhdx.h"


--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -22,6 +22,7 @@
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "qemu/crc32c.h"
+#include "qemu/bswap.h"
 #include "block/vhdx.h"
 #include "migration/migration.h"

@@ -1856,13 +1857,14 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL,
                              &creator_items, NULL);
    signature = cpu_to_le64(VHDX_FILE_SIGNATURE);
-    ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature));
+    ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature),
+                     0);
    if (ret < 0) {
        goto delete_and_exit;
    }
    if (creator) {
        ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature),
-                         creator, creator_items * sizeof(gunichar2));
+                         creator, creator_items * sizeof(gunichar2), 0);
        if (ret < 0) {
            goto delete_and_exit;
        }
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -30,6 +30,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "migration/migration.h"
 #include "qemu/cutils.h"
 #include <zlib.h>
@@ -1016,27 +1017,26 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
 */
 static int get_whole_cluster(BlockDriverState *bs,
                             VmdkExtent *extent,
-                             uint64_t cluster_sector_num,
-                             uint64_t sector_num,
-                             uint64_t skip_start_sector,
-                             uint64_t skip_end_sector)
+                             uint64_t cluster_offset,
+                             uint64_t offset,
+                             uint64_t skip_start_bytes,
+                             uint64_t skip_end_bytes)
 {
    int ret = VMDK_OK;
    int64_t cluster_bytes;
    uint8_t *whole_grain;

    /* For COW, align request sector_num to cluster start */
-    sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
+    offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
    whole_grain = qemu_blockalign(bs, cluster_bytes);

    if (!bs->backing) {
-        memset(whole_grain, 0,  skip_start_sector << BDRV_SECTOR_BITS);
-        memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
-               cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
+        memset(whole_grain, 0, skip_start_bytes);
+        memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
    }

-    assert(skip_end_sector <= extent->cluster_sectors);
+    assert(skip_end_bytes <= cluster_bytes);
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
    if (bs->backing && !vmdk_is_cid_valid(bs)) {
@@ -1045,42 +1045,43 @@ static int get_whole_cluster(BlockDriverState *bs,
    }

    /* Read backing data before skip range */
-    if (skip_start_sector > 0) {
+    if (skip_start_bytes > 0) {
        if (bs->backing) {
-            ret = bdrv_read(bs->backing->bs, sector_num,
-                            whole_grain, skip_start_sector);
+            ret = bdrv_pread(bs->backing->bs, offset, whole_grain,
+                             skip_start_bytes);
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
-        ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain,
-                         skip_start_sector);
+        ret = bdrv_pwrite(extent->file->bs, cluster_offset, whole_grain,
+                          skip_start_bytes);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }
    /* Read backing data after skip range */
-    if (skip_end_sector < extent->cluster_sectors) {
+    if (skip_end_bytes < cluster_bytes) {
        if (bs->backing) {
-            ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector,
-                            whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
-                            extent->cluster_sectors - skip_end_sector);
+            ret = bdrv_pread(bs->backing->bs, offset + skip_end_bytes,
+                             whole_grain + skip_end_bytes,
+                             cluster_bytes - skip_end_bytes);
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
-        ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector,
-                         whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
-                         extent->cluster_sectors - skip_end_sector);
+        ret = bdrv_pwrite(extent->file->bs, cluster_offset + skip_end_bytes,
+                          whole_grain + skip_end_bytes,
+                          cluster_bytes - skip_end_bytes);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }

+    ret = VMDK_OK;
 exit:
    qemu_vfree(whole_grain);
    return ret;
@@ -1142,8 +1143,8 @@ static int get_cluster_offset(BlockDriverState *bs,
                              uint64_t offset,
                              bool allocate,
                              uint64_t *cluster_offset,
-                              uint64_t skip_start_sector,
-                              uint64_t skip_end_sector)
+                              uint64_t skip_start_bytes,
+                              uint64_t skip_end_bytes)
 {
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
@@ -1230,10 +1231,8 @@ static int get_cluster_offset(BlockDriverState *bs,
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
-        ret = get_whole_cluster(bs, extent,
-                                cluster_sector,
-                                offset >> BDRV_SECTOR_BITS,
-                                skip_start_sector, skip_end_sector);
+        ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
+                                offset, skip_start_bytes, skip_end_bytes);
        if (ret) {
            return ret;
        }
@@ -1259,15 +1258,26 @@ static VmdkExtent *find_extent(BDRVVmdkState *s,
    return NULL;
 }

+static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
+                                                   int64_t offset)
+{
+    uint64_t offset_in_cluster, extent_begin_offset, extent_relative_offset;
+    uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
+
+    extent_begin_offset =
+        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
+    extent_relative_offset = offset - extent_begin_offset;
+    offset_in_cluster = extent_relative_offset % cluster_size;
+
+    return offset_in_cluster;
+}
+
 static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
                                                  int64_t sector_num)
 {
-    uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num;
-
-    extent_begin_sector = extent->end_sector - extent->sectors;
-    extent_relative_sector_num = sector_num - extent_begin_sector;
-    index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
-    return index_in_cluster;
+    uint64_t offset;
+    offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE);
+    return offset / BDRV_SECTOR_SIZE;
 }

 static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
@@ -1319,38 +1329,57 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
 }

 static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
-                            int64_t offset_in_cluster, const uint8_t *buf,
-                            int nb_sectors, int64_t sector_num)
+                            int64_t offset_in_cluster, QEMUIOVector *qiov,
+                            uint64_t qiov_offset, uint64_t n_bytes,
+                            uint64_t offset)
 {
    int ret;
    VmdkGrainMarker *data = NULL;
    uLongf buf_len;
-    const uint8_t *write_buf = buf;
-    int write_len = nb_sectors * 512;
+    QEMUIOVector local_qiov;
+    struct iovec iov;
    int64_t write_offset;
    int64_t write_end_sector;

    if (extent->compressed) {
+        void *compressed_data;
+
        if (!extent->has_marker) {
            ret = -EINVAL;
            goto out;
        }
        buf_len = (extent->cluster_sectors << 9) * 2;
        data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
-        if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
-                buf_len == 0) {
+
+        compressed_data = g_malloc(n_bytes);
+        qemu_iovec_to_buf(qiov, qiov_offset, compressed_data, n_bytes);
+        ret = compress(data->data, &buf_len, compressed_data, n_bytes);
+        g_free(compressed_data);
+
+        if (ret != Z_OK || buf_len == 0) {
            ret = -EINVAL;
            goto out;
        }
-        data->lba = sector_num;
-        data->size = buf_len;
-        write_buf = (uint8_t *)data;
-        write_len = buf_len + sizeof(VmdkGrainMarker);
-    }
-    write_offset = cluster_offset + offset_in_cluster,
-    ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len);

-    write_end_sector = DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE);
+        data->lba = offset >> BDRV_SECTOR_BITS;
+        data->size = buf_len;
+
+        n_bytes = buf_len + sizeof(VmdkGrainMarker);
+        iov = (struct iovec) {
+            .iov_base   = data,
+            .iov_len    = n_bytes,
+        };
+        qemu_iovec_init_external(&local_qiov, &iov, 1);
+    } else {
+        qemu_iovec_init(&local_qiov, qiov->niov);
+        qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes);
+    }
+
+    write_offset = cluster_offset + offset_in_cluster,
+    ret = bdrv_co_pwritev(extent->file->bs, write_offset, n_bytes,
+                          &local_qiov, 0);
+
+    write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE);

    if (extent->compressed) {
        extent->next_cluster_sector = write_end_sector;
@@ -1359,19 +1388,21 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
                                          write_end_sector);
    }

-    if (ret != write_len) {
-        ret = ret < 0 ? ret : -EIO;
+    if (ret < 0) {
        goto out;
    }
    ret = 0;
 out:
    g_free(data);
+    if (!extent->compressed) {
+        qemu_iovec_destroy(&local_qiov);
+    }
    return ret;
 }

 static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
-                            int64_t offset_in_cluster, uint8_t *buf,
-                            int nb_sectors)
+                            int64_t offset_in_cluster, QEMUIOVector *qiov,
+                            int bytes)
 {
    int ret;
    int cluster_bytes, buf_bytes;
@@ -1383,14 +1414,13 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,


    if (!extent->compressed) {
-        ret = bdrv_pread(extent->file->bs,
-                          cluster_offset + offset_in_cluster,
-                          buf, nb_sectors * 512);
-        if (ret == nb_sectors * 512) {
-            return 0;
-        } else {
-            return -EIO;
+        ret = bdrv_co_preadv(extent->file->bs,
+                             cluster_offset + offset_in_cluster, bytes,
+                             qiov, 0);
+        if (ret < 0) {
+            return ret;
        }
+        return 0;
    }
    cluster_bytes = extent->cluster_sectors * 512;
    /* Read two clusters in case GrainMarker + compressed data > one cluster */
@@ -1422,11 +1452,11 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,

    }
    if (offset_in_cluster < 0 ||
-            offset_in_cluster + nb_sectors * 512 > buf_len) {
+            offset_in_cluster + bytes > buf_len) {
        ret = -EINVAL;
        goto out;
    }
-    memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
+    qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
    ret = 0;

 out:
@@ -1435,64 +1465,73 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
    return ret;
 }

-static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+               QEMUIOVector *qiov, int flags)
 {
    BDRVVmdkState *s = bs->opaque;
    int ret;
-    uint64_t n, index_in_cluster;
+    uint64_t n_bytes, offset_in_cluster;
    VmdkExtent *extent = NULL;
+    QEMUIOVector local_qiov;
    uint64_t cluster_offset;
+    uint64_t bytes_done = 0;

-    while (nb_sectors > 0) {
-        extent = find_extent(s, sector_num, extent);
+    qemu_iovec_init(&local_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+
+    while (bytes > 0) {
+        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
        if (!extent) {
-            return -EIO;
+            ret = -EIO;
+            goto fail;
        }
        ret = get_cluster_offset(bs, extent, NULL,
-                                 sector_num << 9, false, &cluster_offset,
-                                 0, 0);
-        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
-        n = extent->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors) {
-            n = nb_sectors;
-        }
+                                 offset, false, &cluster_offset, 0, 0);
+        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
+
+        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
+                             - offset_in_cluster);
+
        if (ret != VMDK_OK) {
            /* if not allocated, try to read from parent image, if exist */
            if (bs->backing && ret != VMDK_ZEROED) {
                if (!vmdk_is_cid_valid(bs)) {
-                    return -EINVAL;
+                    ret = -EINVAL;
+                    goto fail;
                }
-                ret = bdrv_read(bs->backing->bs, sector_num, buf, n);
+
+                qemu_iovec_reset(&local_qiov);
+                qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+                ret = bdrv_co_preadv(bs->backing->bs, offset, n_bytes,
+                                     &local_qiov, 0);
                if (ret < 0) {
-                    return ret;
+                    goto fail;
                }
            } else {
-                memset(buf, 0, 512 * n);
+                qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
            }
        } else {
-            ret = vmdk_read_extent(extent,
-                            cluster_offset, index_in_cluster * 512,
-                            buf, n);
+            qemu_iovec_reset(&local_qiov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+            ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster,
+                                   &local_qiov, n_bytes);
            if (ret) {
-                return ret;
+                goto fail;
            }
        }
-        nb_sectors -= n;
-        sector_num += n;
-        buf += n * 512;
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;
    }
-    return 0;
-}

-static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
-                                     uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVVmdkState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = vmdk_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&local_qiov);
+
    return ret;
 }

@@ -1506,38 +1545,38 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
 *
 * Returns: error code with 0 for success.
 */
-static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
-                      const uint8_t *buf, int nb_sectors,
-                      bool zeroed, bool zero_dry_run)
+static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
+                       uint64_t bytes, QEMUIOVector *qiov,
+                       bool zeroed, bool zero_dry_run)
 {
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int ret;
-    int64_t index_in_cluster, n;
+    int64_t offset_in_cluster, n_bytes;
    uint64_t cluster_offset;
+    uint64_t bytes_done = 0;
    VmdkMetaData m_data;

-    if (sector_num > bs->total_sectors) {
-        error_report("Wrong offset: sector_num=0x%" PRIx64
+    if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
+        error_report("Wrong offset: offset=0x%" PRIx64
                     " total_sectors=0x%" PRIx64,
-                     sector_num, bs->total_sectors);
+                     offset, bs->total_sectors);
        return -EIO;
    }

-    while (nb_sectors > 0) {
-        extent = find_extent(s, sector_num, extent);
+    while (bytes > 0) {
+        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
        if (!extent) {
            return -EIO;
        }
-        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
-        n = extent->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors) {
-            n = nb_sectors;
-        }
-        ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
+        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
+                             - offset_in_cluster);
+
+        ret = get_cluster_offset(bs, extent, &m_data, offset,
                                 !(extent->compressed || zeroed),
-                                 &cluster_offset,
-                                 index_in_cluster, index_in_cluster + n);
+                                 &cluster_offset, offset_in_cluster,
+                                 offset_in_cluster + n_bytes);
        if (extent->compressed) {
            if (ret == VMDK_OK) {
                /* Refuse write to allocated cluster for streamOptimized */
@@ -1546,7 +1585,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                return -EIO;
            } else {
                /* allocate */
-                ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                ret = get_cluster_offset(bs, extent, &m_data, offset,
                                         true, &cluster_offset, 0, 0);
            }
        }
@@ -1556,9 +1595,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
-                    index_in_cluster == 0 &&
-                    n >= extent->cluster_sectors) {
-                n = extent->cluster_sectors;
+                    offset_in_cluster == 0 &&
+                    n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
+                n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
                if (!zero_dry_run) {
                    /* update L2 tables */
                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
@@ -1570,9 +1609,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                return -ENOTSUP;
            }
        } else {
-            ret = vmdk_write_extent(extent,
-                            cluster_offset, index_in_cluster * 512,
-                            buf, n, sector_num);
+            ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
+                                    qiov, bytes_done, n_bytes, offset);
            if (ret) {
                return ret;
            }
@@ -1585,9 +1623,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                }
            }
        }
-        nb_sectors -= n;
-        sector_num += n;
-        buf += n * 512;
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;

        /* update CID on the first write every time the virtual disk is
         * opened */
@@ -1602,25 +1640,65 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
    return 0;
 }

-static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
-                                      const uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
    int ret;
    BDRVVmdkState *s = bs->opaque;
    qemu_co_mutex_lock(&s->lock);
-    ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
+    ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
 }

+typedef struct VmdkWriteCompressedCo {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    const uint8_t *buf;
+    int nb_sectors;
+    int ret;
+} VmdkWriteCompressedCo;
+
+static void vmdk_co_write_compressed(void *opaque)
+{
+    VmdkWriteCompressedCo *co = opaque;
+    QEMUIOVector local_qiov;
+    uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE;
+    uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE;
+
+    struct iovec iov = (struct iovec) {
+        .iov_base   = (uint8_t*) co->buf,
+        .iov_len    = bytes,
+    };
+    qemu_iovec_init_external(&local_qiov, &iov, 1);
+
+    co->ret = vmdk_pwritev(co->bs, offset, bytes, &local_qiov, false, false);
+}
+
 static int vmdk_write_compressed(BlockDriverState *bs,
                                 int64_t sector_num,
                                 const uint8_t *buf,
                                 int nb_sectors)
 {
    BDRVVmdkState *s = bs->opaque;
+
    if (s->num_extents == 1 && s->extents[0].compressed) {
-        return vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
+        Coroutine *co;
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+        VmdkWriteCompressedCo data = {
+            .bs         = bs,
+            .sector_num = sector_num,
+            .buf        = buf,
+            .nb_sectors = nb_sectors,
+            .ret        = -EINPROGRESS,
+        };
+        co = qemu_coroutine_create(vmdk_co_write_compressed);
+        qemu_coroutine_enter(co, &data);
+        while (data.ret == -EINPROGRESS) {
+            aio_poll(aio_context, true);
+        }
+        return data.ret;
    } else {
        return -ENOTSUP;
    }
@@ -1633,12 +1711,15 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
 {
    int ret;
    BDRVVmdkState *s = bs->opaque;
+    uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
+    uint64_t bytes = nb_sectors * BDRV_SECTOR_SIZE;
+
    qemu_co_mutex_lock(&s->lock);
    /* write zeroes could fail if sectors not aligned to cluster, test it with
     * dry_run == true before really updating image */
-    ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
+    ret = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
    if (!ret) {
-        ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
+        ret = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
    }
    qemu_co_mutex_unlock(&s->lock);
    return ret;
@@ -1728,12 +1809,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
    header.check_bytes[3] = 0xa;

    /* write all the data */
-    ret = blk_pwrite(blk, 0, &magic, sizeof(magic));
+    ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }
-    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header));
+    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1753,7 +1834,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
        gd_buf[i] = cpu_to_le32(tmp);
    }
    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                     gd_buf, gd_buf_size);
+                     gd_buf, gd_buf_size, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1765,7 +1846,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
        gd_buf[i] = cpu_to_le32(tmp);
    }
    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                     gd_buf, gd_buf_size);
+                     gd_buf, gd_buf_size, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1829,8 +1910,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
    int64_t total_size = 0, filesize;
    char *adapter_type = NULL;
    char *backing_file = NULL;
+    char *hw_version = NULL;
    char *fmt = NULL;
-    int flags = 0;
    int ret = 0;
    bool flat, split, compress;
    GString *ext_desc_lines;
@@ -1861,7 +1942,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
        "# The Disk Data Base\n"
        "#DDB\n"
        "\n"
-        "ddb.virtualHWVersion = \"%d\"\n"
+        "ddb.virtualHWVersion = \"%s\"\n"
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
        "ddb.geometry.heads = \"%" PRIu32 "\"\n"
        "ddb.geometry.sectors = \"63\"\n"
@@ -1878,8 +1959,20 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
+    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
-        flags |= BLOCK_FLAG_COMPAT6;
+        if (strcmp(hw_version, "undefined")) {
+            error_setg(errp,
+                       "compat6 cannot be enabled with hwversion set");
+            ret = -EINVAL;
+            goto exit;
+        }
+        g_free(hw_version);
+        hw_version = g_strdup("6");
+    }
+    if (strcmp(hw_version, "undefined") == 0) {
+        g_free(hw_version);
+        hw_version = g_strdup("4");
    }
    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) {
@@ -2001,7 +2094,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
                           fmt,
                           parent_desc_line,
                           ext_desc_lines->str,
-                           (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
+                           hw_version,
                           total_size /
                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                           number_heads,
@@ -2028,7 +2121,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)

    blk_set_allow_write_beyond_eof(new_blk, true);

-    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len);
+    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write description");
        goto exit;
@@ -2047,6 +2140,7 @@ exit:
    }
    g_free(adapter_type);
    g_free(backing_file);
+    g_free(hw_version);
    g_free(fmt);
    g_free(desc);
    g_free(path);
@@ -2250,27 +2344,6 @@ static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

-static void vmdk_detach_aio_context(BlockDriverState *bs)
-{
-    BDRVVmdkState *s = bs->opaque;
-    int i;
-
-    for (i = 0; i < s->num_extents; i++) {
-        bdrv_detach_aio_context(s->extents[i].file->bs);
-    }
-}
-
-static void vmdk_attach_aio_context(BlockDriverState *bs,
-                                    AioContext *new_context)
-{
-    BDRVVmdkState *s = bs->opaque;
-    int i;
-
-    for (i = 0; i < s->num_extents; i++) {
-        bdrv_attach_aio_context(s->extents[i].file->bs, new_context);
-    }
-}
-
 static QemuOptsList vmdk_create_opts = {
    .name = "vmdk-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
@@ -2297,6 +2370,12 @@ static QemuOptsList vmdk_create_opts = {
            .help = "VMDK version 6 image",
            .def_value_str = "off"
        },
+        {
+            .name = BLOCK_OPT_HWVERSION,
+            .type = QEMU_OPT_STRING,
+            .help = "VMDK hardware version",
+            .def_value_str = "undefined"
+        },
        {
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
@@ -2321,8 +2400,8 @@ static BlockDriver bdrv_vmdk = {
    .bdrv_open                    = vmdk_open,
    .bdrv_check                   = vmdk_check,
    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
-    .bdrv_read                    = vmdk_co_read,
-    .bdrv_write                   = vmdk_co_write,
+    .bdrv_co_preadv               = vmdk_co_preadv,
+    .bdrv_co_pwritev              = vmdk_co_pwritev,
    .bdrv_write_compressed        = vmdk_write_compressed,
    .bdrv_co_write_zeroes         = vmdk_co_write_zeroes,
    .bdrv_close                   = vmdk_close,
@@ -2334,8 +2413,6 @@ static BlockDriver bdrv_vmdk = {
    .bdrv_get_specific_info       = vmdk_get_specific_info,
    .bdrv_refresh_limits          = vmdk_refresh_limits,
    .bdrv_get_info                = vmdk_get_info,
-    .bdrv_detach_aio_context      = vmdk_detach_aio_context,
-    .bdrv_attach_aio_context      = vmdk_attach_aio_context,

    .supports_backing             = true,
    .create_opts                  = &vmdk_create_opts,
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -29,6 +29,7 @@
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
+#include "qemu/bswap.h"
 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
 #endif
@@ -454,22 +455,21 @@ static int vpc_reopen_prepare(BDRVReopenState *state,
 * The parameter write must be 1 if the offset will be used for a write
 * operation (the block bitmaps is updated then), 0 otherwise.
 */
-static inline int64_t get_sector_offset(BlockDriverState *bs,
-    int64_t sector_num, int write)
+static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
+                                       bool write)
 {
    BDRVVPCState *s = bs->opaque;
-    uint64_t offset = sector_num * 512;
    uint64_t bitmap_offset, block_offset;
-    uint32_t pagetable_index, pageentry_index;
+    uint32_t pagetable_index, offset_in_block;

    pagetable_index = offset / s->block_size;
-    pageentry_index = (offset % s->block_size) / 512;
+    offset_in_block = offset % s->block_size;

    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
        return -1; /* not allocated */

    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
-    block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);
+    block_offset = bitmap_offset + s->bitmap_size + offset_in_block;

    /* We must ensure that we don't write to any sectors which are marked as
       unused in the bitmap. We get away with setting all bits in the block
@@ -487,6 +487,12 @@ static inline int64_t get_sector_offset(BlockDriverState *bs,
    return block_offset;
 }

+static inline int64_t get_sector_offset(BlockDriverState *bs,
+                                        int64_t sector_num, bool write)
+{
+    return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write);
+}
+
 /*
 * Writes the footer to the end of the image file. This is needed when the
 * file grows as it overwrites the old footer
@@ -513,7 +519,7 @@ static int rewrite_footer(BlockDriverState* bs)
 *
 * Returns the sectors' offset in the image file on success and < 0 on error
 */
-static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
+static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
 {
    BDRVVPCState *s = bs->opaque;
    int64_t bat_offset;
@@ -522,14 +528,13 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
    uint8_t bitmap[s->bitmap_size];

    /* Check if sector_num is valid */
-    if ((sector_num < 0) || (sector_num > bs->total_sectors))
-        return -1;
+    if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
+        return -EINVAL;
+    }

    /* Write entry into in-memory BAT */
-    index = (sector_num * 512) / s->block_size;
-    if (s->pagetable[index] != 0xFFFFFFFF)
-        return -1;
-
+    index = offset / s->block_size;
+    assert(s->pagetable[index] == 0xFFFFFFFF);
    s->pagetable[index] = s->free_data_block_offset / 512;

    /* Initialize the block's bitmap */
@@ -553,11 +558,11 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
    if (ret < 0)
        goto fail;

-    return get_sector_offset(bs, sector_num, 0);
+    return get_image_offset(bs, offset, false);

 fail:
    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
-    return -1;
+    return ret;
 }

 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
@@ -573,104 +578,105 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

-static int vpc_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+              QEMUIOVector *qiov, int flags)
 {
    BDRVVPCState *s = bs->opaque;
    int ret;
-    int64_t offset;
-    int64_t sectors, sectors_per_block;
+    int64_t image_offset;
+    int64_t n_bytes;
+    int64_t bytes_done = 0;
    VHDFooter *footer = (VHDFooter *) s->footer_buf;
+    QEMUIOVector local_qiov;

    if (be32_to_cpu(footer->type) == VHD_FIXED) {
-        return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors);
+        return bdrv_co_preadv(bs->file->bs, offset, bytes, qiov, 0);
    }
-    while (nb_sectors > 0) {
-        offset = get_sector_offset(bs, sector_num, 0);

-        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
-        sectors = sectors_per_block - (sector_num % sectors_per_block);
-        if (sectors > nb_sectors) {
-            sectors = nb_sectors;
-        }
+    qemu_co_mutex_lock(&s->lock);
+    qemu_iovec_init(&local_qiov, qiov->niov);

-        if (offset == -1) {
-            memset(buf, 0, sectors * BDRV_SECTOR_SIZE);
+    while (bytes > 0) {
+        image_offset = get_image_offset(bs, offset, false);
+        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
+
+        if (image_offset == -1) {
+            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
        } else {
-            ret = bdrv_pread(bs->file->bs, offset, buf,
-                sectors * BDRV_SECTOR_SIZE);
-            if (ret != sectors * BDRV_SECTOR_SIZE) {
-                return -1;
+            qemu_iovec_reset(&local_qiov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+            ret = bdrv_co_preadv(bs->file->bs, image_offset, n_bytes,
+                                 &local_qiov, 0);
+            if (ret < 0) {
+                goto fail;
            }
        }

-        nb_sectors -= sectors;
-        sector_num += sectors;
-        buf += sectors * BDRV_SECTOR_SIZE;
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;
    }
-    return 0;
-}

-static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num,
-                                    uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVVPCState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = vpc_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
+    qemu_iovec_destroy(&local_qiov);
    qemu_co_mutex_unlock(&s->lock);
+
    return ret;
 }

-static int vpc_write(BlockDriverState *bs, int64_t sector_num,
-    const uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+               QEMUIOVector *qiov, int flags)
 {
    BDRVVPCState *s = bs->opaque;
-    int64_t offset;
-    int64_t sectors, sectors_per_block;
+    int64_t image_offset;
+    int64_t n_bytes;
+    int64_t bytes_done = 0;
    int ret;
    VHDFooter *footer =  (VHDFooter *) s->footer_buf;
+    QEMUIOVector local_qiov;

    if (be32_to_cpu(footer->type) == VHD_FIXED) {
-        return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors);
-    }
-    while (nb_sectors > 0) {
-        offset = get_sector_offset(bs, sector_num, 1);
-
-        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
-        sectors = sectors_per_block - (sector_num % sectors_per_block);
-        if (sectors > nb_sectors) {
-            sectors = nb_sectors;
-        }
-
-        if (offset == -1) {
-            offset = alloc_block(bs, sector_num);
-            if (offset < 0)
-                return -1;
-        }
-
-        ret = bdrv_pwrite(bs->file->bs, offset, buf,
-                          sectors * BDRV_SECTOR_SIZE);
-        if (ret != sectors * BDRV_SECTOR_SIZE) {
-            return -1;
-        }
-
-        nb_sectors -= sectors;
-        sector_num += sectors;
-        buf += sectors * BDRV_SECTOR_SIZE;
+        return bdrv_co_pwritev(bs->file->bs, offset, bytes, qiov, 0);
    }

-    return 0;
-}
-
-static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
-                                     const uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVVPCState *s = bs->opaque;
    qemu_co_mutex_lock(&s->lock);
-    ret = vpc_write(bs, sector_num, buf, nb_sectors);
+    qemu_iovec_init(&local_qiov, qiov->niov);
+
+    while (bytes > 0) {
+        image_offset = get_image_offset(bs, offset, true);
+        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
+
+        if (image_offset == -1) {
+            image_offset = alloc_block(bs, offset);
+            if (image_offset < 0) {
+                ret = image_offset;
+                goto fail;
+            }
+        }
+
+        qemu_iovec_reset(&local_qiov);
+        qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+        ret = bdrv_co_pwritev(bs->file->bs, image_offset, n_bytes,
+                              &local_qiov, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;
+    }
+
+    ret = 0;
+fail:
+    qemu_iovec_destroy(&local_qiov);
    qemu_co_mutex_unlock(&s->lock);
+
    return ret;
 }

@@ -783,13 +789,13 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
    block_size = 0x200000;
    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);

-    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
    if (ret < 0) {
        goto fail;
    }

    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
-    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
    if (ret < 0) {
        goto fail;
    }
@@ -799,7 +805,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,

    memset(buf, 0xFF, 512);
    for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
-        ret = blk_pwrite(blk, offset, buf, 512);
+        ret = blk_pwrite(blk, offset, buf, 512, 0);
        if (ret < 0) {
            goto fail;
        }
@@ -826,7 +832,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
    /* Write the header */
    offset = 512;

-    ret = blk_pwrite(blk, offset, buf, 1024);
+    ret = blk_pwrite(blk, offset, buf, 1024, 0);
    if (ret < 0) {
        goto fail;
    }
@@ -848,7 +854,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
        return ret;
    }

-    ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
    if (ret < 0) {
        return ret;
    }
@@ -1056,8 +1062,8 @@ static BlockDriver bdrv_vpc = {
    .bdrv_reopen_prepare    = vpc_reopen_prepare,
    .bdrv_create            = vpc_create,

-    .bdrv_read                  = vpc_co_read,
-    .bdrv_write                 = vpc_co_write,
+    .bdrv_co_preadv             = vpc_co_preadv,
+    .bdrv_co_pwritev            = vpc_co_pwritev,
    .bdrv_co_get_block_status   = vpc_co_get_block_status,

    .bdrv_get_info          = vpc_get_info,
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -27,6 +27,7 @@
 #include "qapi/error.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "migration/migration.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qbool.h"
@@ -1179,6 +1180,7 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
        bs->read_only = 0;
    }

+    bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */
    bs->total_sectors = cyls * heads * secs;

    if (init_directories(s, dirname, heads, secs, errp)) {
@@ -1421,14 +1423,31 @@ DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
    return 0;
 }

-static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
    int ret;
    BDRVVVFATState *s = bs->opaque;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    void *buf;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    buf = g_try_malloc(bytes);
+    if (bytes && buf == NULL) {
+        return -ENOMEM;
+    }
+
    qemu_co_mutex_lock(&s->lock);
    ret = vvfat_read(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
+
+    qemu_iovec_from_buf(qiov, 0, buf, bytes);
+    g_free(buf);
+
    return ret;
 }

@@ -2880,14 +2899,31 @@ DLOG(checkpoint());
    return 0;
 }

-static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num,
-                                       const uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                 QEMUIOVector *qiov, int flags)
 {
    int ret;
    BDRVVVFATState *s = bs->opaque;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    void *buf;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    buf = g_try_malloc(bytes);
+    if (bytes && buf == NULL) {
+        return -ENOMEM;
+    }
+    qemu_iovec_to_buf(qiov, 0, buf, bytes);
+
    qemu_co_mutex_lock(&s->lock);
    ret = vvfat_write(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
+
+    g_free(buf);
+
    return ret;
 }

@@ -2904,8 +2940,10 @@ static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs,
    return BDRV_BLOCK_DATA;
 }

-static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
-	const uint8_t* buffer, int nb_sectors) {
+static int coroutine_fn
+write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                    QEMUIOVector *qiov, int flags)
+{
    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
    return try_commit(s);
 }
@@ -2918,7 +2956,7 @@ static void write_target_close(BlockDriverState *bs) {

 static BlockDriver vvfat_write_target = {
    .format_name        = "vvfat_write_target",
-    .bdrv_write         = write_target_commit,
+    .bdrv_co_pwritev    = write_target_commit,
    .bdrv_close         = write_target_close,
 };

@@ -2960,12 +2998,12 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp)
        goto err;
    }

-    s->qcow = NULL;
    options = qdict_new();
    qdict_put(options, "driver", qstring_from_str("qcow"));
-    ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options,
-                    BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp);
-    if (ret < 0) {
+    s->qcow = bdrv_open(s->qcow_filename, NULL, options,
+                        BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp);
+    if (!s->qcow) {
+        ret = -EINVAL;
        goto err;
    }

@@ -3014,8 +3052,8 @@ static BlockDriver bdrv_vvfat = {
    .bdrv_file_open         = vvfat_open,
    .bdrv_close             = vvfat_close,

-    .bdrv_read              = vvfat_co_read,
-    .bdrv_write             = vvfat_co_write,
+    .bdrv_co_preadv         = vvfat_co_preadv,
+    .bdrv_co_pwritev        = vvfat_co_pwritev,
    .bdrv_co_get_block_status = vvfat_co_get_block_status,
 };

--- a/blockdev.c
+++ b/blockdev.c
@@ -73,7 +73,7 @@ static int if_max_devs[IF_COUNT] = {
     * Do not change these numbers!  They govern how drive option
     * index maps to unit and bus.  That mapping is ABI.
     *
-     * All controllers used to imlement if=T drives need to support
+     * All controllers used to implement if=T drives need to support
     * if_max_devs[T] units, for any T with if_max_devs[T] != 0.
     * Otherwise, some index values map to "impossible" bus, unit
     * values.
@@ -567,25 +567,12 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
    if ((!file || !*file) && !qdict_size(bs_opts)) {
        BlockBackendRootState *blk_rs;

-        blk = blk_new(errp);
-        if (!blk) {
-            goto early_err;
-        }
-
+        blk = blk_new();
        blk_rs = blk_get_root_state(blk);
        blk_rs->open_flags    = bdrv_flags;
        blk_rs->read_only     = !(bdrv_flags & BDRV_O_RDWR);
        blk_rs->detect_zeroes = detect_zeroes;

-        if (throttle_enabled(&cfg)) {
-            if (!throttling_group) {
-                throttling_group = blk_name(blk);
-            }
-            blk_rs->throttle_group = g_strdup(throttling_group);
-            blk_rs->throttle_state = throttle_group_incref(throttling_group);
-            blk_rs->throttle_state->cfg = cfg;
-        }
-
        QDECREF(bs_opts);
    } else {
        if (file && !*file) {
@@ -611,15 +598,6 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,

        bs->detect_zeroes = detect_zeroes;

-        /* disk I/O throttling */
-        if (throttle_enabled(&cfg)) {
-            if (!throttling_group) {
-                throttling_group = blk_name(blk);
-            }
-            bdrv_io_limits_enable(bs, throttling_group);
-            bdrv_set_io_limits(bs, &cfg);
-        }
-
        if (bdrv_key_required(bs)) {
            autostart = 0;
        }
@@ -633,6 +611,15 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
        }
    }

+    /* disk I/O throttling */
+    if (throttle_enabled(&cfg)) {
+        if (!throttling_group) {
+            throttling_group = blk_name(blk);
+        }
+        blk_io_limits_enable(blk, throttling_group);
+        blk_set_io_limits(blk, &cfg);
+    }
+
    blk_set_enable_write_cache(blk, !writethrough);
    blk_set_on_error(blk, on_read_error, on_write_error);

@@ -666,7 +653,6 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
    QemuOpts *opts;
    Error *local_error = NULL;
    BlockdevDetectZeroesOptions detect_zeroes;
-    int ret;
    int bdrv_flags = 0;

    opts = qemu_opts_create(&qemu_root_bds_opts, NULL, 1, errp);
@@ -697,9 +683,8 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
        bdrv_flags |= BDRV_O_INACTIVE;
    }

-    bs = NULL;
-    ret = bdrv_open(&bs, NULL, NULL, bs_opts, bdrv_flags, errp);
-    if (ret < 0) {
+    bs = bdrv_open(NULL, NULL, bs_opts, bdrv_flags, errp);
+    if (!bs) {
        goto fail_no_bs_opts;
    }

@@ -1652,7 +1637,7 @@ typedef struct ExternalSnapshotState {
 static void external_snapshot_prepare(BlkActionState *common,
                                      Error **errp)
 {
-    int flags = 0, ret;
+    int flags = 0;
    QDict *options = NULL;
    Error *local_err = NULL;
    /* Device and node name of the image to generate the snapshot from */
@@ -1777,17 +1762,16 @@ static void external_snapshot_prepare(BlkActionState *common,
        flags |= BDRV_O_NO_BACKING;
    }

-    assert(state->new_bs == NULL);
-    ret = bdrv_open(&state->new_bs, new_image_file, snapshot_ref, options,
-                    flags, errp);
+    state->new_bs = bdrv_open(new_image_file, snapshot_ref, options, flags,
+                              errp);
    /* We will manually add the backing_hd field to the bs later */
-    if (ret != 0) {
+    if (!state->new_bs) {
        return;
    }

-    if (state->new_bs->blk != NULL) {
+    if (bdrv_has_blk(state->new_bs)) {
        error_setg(errp, "The snapshot is already in use by %s",
-                   blk_name(state->new_bs->blk));
+                   bdrv_get_parent_name(state->new_bs));
        return;
    }

@@ -2290,16 +2274,29 @@ exit:
    block_job_txn_unref(block_job_txn);
 }

+static int do_open_tray(const char *device, bool force, Error **errp);
+
 void qmp_eject(const char *device, bool has_force, bool force, Error **errp)
 {
    Error *local_err = NULL;
+    int rc;

-    qmp_blockdev_open_tray(device, has_force, force, &local_err);
+    if (!has_force) {
+        force = false;
+    }
+
+    rc = do_open_tray(device, force, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

+    if (rc == EINPROGRESS) {
+        error_setg(errp, "Device '%s' is locked and force was not specified, "
+                   "wait for tray to open and try again", device);
+        return;
+    }
+
    qmp_x_blockdev_remove_medium(device, errp);
 }

@@ -2327,35 +2324,36 @@ void qmp_block_passwd(bool has_device, const char *device,
    aio_context_release(aio_context);
 }

-void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
-                            Error **errp)
+/**
+ * returns -errno on fatal error, +errno for non-fatal situations.
+ * errp will always be set when the return code is negative.
+ * May return +ENOSYS if the device has no tray,
+ * or +EINPROGRESS if the tray is locked and the guest has been notified.
+ */
+static int do_open_tray(const char *device, bool force, Error **errp)
 {
    BlockBackend *blk;
    bool locked;

-    if (!has_force) {
-        force = false;
-    }
-
    blk = blk_by_name(device);
    if (!blk) {
        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
                  "Device '%s' not found", device);
-        return;
+        return -ENODEV;
    }

    if (!blk_dev_has_removable_media(blk)) {
        error_setg(errp, "Device '%s' is not removable", device);
-        return;
+        return -ENOTSUP;
    }

    if (!blk_dev_has_tray(blk)) {
        /* Ignore this command on tray-less devices */
-        return;
+        return ENOSYS;
    }

    if (blk_dev_is_tray_open(blk)) {
-        return;
+        return 0;
    }

    locked = blk_dev_is_medium_locked(blk);
@@ -2366,6 +2364,21 @@ void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
    if (!locked || force) {
        blk_dev_change_media_cb(blk, false);
    }
+
+    if (locked && !force) {
+        return EINPROGRESS;
+    }
+
+    return 0;
+}
+
+void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
+                            Error **errp)
+{
+    if (!has_force) {
+        force = false;
+    }
+    do_open_tray(device, force, errp);
 }

 void qmp_blockdev_close_tray(const char *device, Error **errp)
@@ -2503,9 +2516,9 @@ void qmp_x_blockdev_insert_medium(const char *device, const char *node_name,
        return;
    }

-    if (bs->blk) {
+    if (bdrv_has_blk(bs)) {
        error_setg(errp, "Node '%s' is already in use by '%s'", node_name,
-                   blk_name(bs->blk));
+                   bdrv_get_parent_name(bs));
        return;
    }

@@ -2520,7 +2533,7 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
 {
    BlockBackend *blk;
    BlockDriverState *medium_bs = NULL;
-    int bdrv_flags, ret;
+    int bdrv_flags;
    QDict *options = NULL;
    Error *err = NULL;

@@ -2564,14 +2577,11 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
        qdict_put(options, "driver", qstring_from_str(format));
    }

-    assert(!medium_bs);
-    ret = bdrv_open(&medium_bs, filename, NULL, options, bdrv_flags, errp);
-    if (ret < 0) {
+    medium_bs = bdrv_open(filename, NULL, options, bdrv_flags, errp);
+    if (!medium_bs) {
        goto fail;
    }

-    blk_apply_root_state(blk, medium_bs);
-
    bdrv_add_key(medium_bs, NULL, &err);
    if (err) {
        error_propagate(errp, err);
@@ -2596,6 +2606,8 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
        goto fail;
    }

+    blk_apply_root_state(blk, medium_bs);
+
    qmp_blockdev_close_tray(device, errp);

 fail:
@@ -2661,13 +2673,6 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
        goto out;
    }

-    /* The BlockBackend must be the only parent */
-    assert(QLIST_FIRST(&bs->parents));
-    if (QLIST_NEXT(QLIST_FIRST(&bs->parents), next_parent)) {
-        error_setg(errp, "Cannot throttle device with multiple parents");
-        goto out;
-    }
-
    throttle_config_init(&cfg);
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = bps;
    cfg.buckets[THROTTLE_BPS_READ].avg  = bps_rd;
@@ -2726,16 +2731,16 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
    if (throttle_enabled(&cfg)) {
        /* Enable I/O limits if they're not enabled yet, otherwise
         * just update the throttling group. */
-        if (!bs->throttle_state) {
-            bdrv_io_limits_enable(bs, has_group ? group : device);
+        if (!blk_get_public(blk)->throttle_state) {
+            blk_io_limits_enable(blk, has_group ? group : device);
        } else if (has_group) {
-            bdrv_io_limits_update_group(bs, group);
+            blk_io_limits_update_group(blk, group);
        }
        /* Set the new throttling configuration */
-        bdrv_set_io_limits(bs, &cfg);
-    } else if (bs->throttle_state) {
+        blk_set_io_limits(blk, &cfg);
+    } else if (blk_get_public(blk)->throttle_state) {
        /* If all throttling settings are set to 0, disable I/O limits */
-        bdrv_io_limits_disable(bs);
+        blk_io_limits_disable(blk);
    }

 out:
@@ -3186,7 +3191,6 @@ static void do_drive_backup(const char *device, const char *target,
    Error *local_err = NULL;
    int flags;
    int64_t size;
-    int ret;

    if (!has_speed) {
        speed = 0;
@@ -3270,10 +3274,8 @@ static void do_drive_backup(const char *device, const char *target,
        qdict_put(options, "driver", qstring_from_str(format));
    }

-    target_bs = NULL;
-    ret = bdrv_open(&target_bs, target, NULL, options, flags, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    target_bs = bdrv_open(target, NULL, options, flags, errp);
+    if (!target_bs) {
        goto out;
    }

@@ -3291,8 +3293,8 @@ static void do_drive_backup(const char *device, const char *target,
    backup_start(bs, target_bs, speed, sync, bmap,
                 on_source_error, on_target_error,
                 block_job_cb, bs, txn, &local_err);
+    bdrv_unref(target_bs);
    if (local_err != NULL) {
-        bdrv_unref(target_bs);
        error_propagate(errp, local_err);
        goto out;
    }
@@ -3376,12 +3378,10 @@ void do_blockdev_backup(const char *device, const char *target,
    }
    target_bs = blk_bs(target_blk);

-    bdrv_ref(target_bs);
    bdrv_set_aio_context(target_bs, aio_context);
    backup_start(bs, target_bs, speed, sync, NULL, on_source_error,
                 on_target_error, block_job_cb, bs, txn, &local_err);
    if (local_err != NULL) {
-        bdrv_unref(target_bs);
        error_propagate(errp, local_err);
    }
 out:
@@ -3457,10 +3457,6 @@ static void blockdev_mirror_common(BlockDriverState *bs,
    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_MIRROR_TARGET, errp)) {
        return;
    }
-    if (target->blk) {
-        error_setg(errp, "Cannot mirror to an attached block device");
-        return;
-    }

    if (!bs->backing && sync == MIRROR_SYNC_MODE_TOP) {
        sync = MIRROR_SYNC_MODE_FULL;
@@ -3498,7 +3494,6 @@ void qmp_drive_mirror(const char *device, const char *target,
    QDict *options = NULL;
    int flags;
    int64_t size;
-    int ret;

    blk = blk_by_name(device);
    if (!blk) {
@@ -3607,11 +3602,9 @@ void qmp_drive_mirror(const char *device, const char *target,
    /* Mirroring takes care of copy-on-write using the source's backing
     * file.
     */
-    target_bs = NULL;
-    ret = bdrv_open(&target_bs, target, NULL, options,
-                    flags | BDRV_O_NO_BACKING, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    target_bs = bdrv_open(target, NULL, options, flags | BDRV_O_NO_BACKING,
+                          errp);
+    if (!target_bs) {
        goto out;
    }

@@ -3626,9 +3619,9 @@ void qmp_drive_mirror(const char *device, const char *target,
                           has_on_target_error, on_target_error,
                           has_unmap, unmap,
                           &local_err);
+    bdrv_unref(target_bs);
    if (local_err) {
        error_propagate(errp, local_err);
-        bdrv_unref(target_bs);
    }
 out:
    aio_context_release(aio_context);
@@ -3672,7 +3665,6 @@ void qmp_blockdev_mirror(const char *device, const char *target,
    aio_context = bdrv_get_aio_context(bs);
    aio_context_acquire(aio_context);

-    bdrv_ref(target_bs);
    bdrv_set_aio_context(target_bs, aio_context);

    blockdev_mirror_common(bs, target_bs,
@@ -3686,7 +3678,6 @@ void qmp_blockdev_mirror(const char *device, const char *target,
                           &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
-        bdrv_unref(target_bs);
    }

    aio_context_release(aio_context);
@@ -4046,15 +4037,15 @@ void qmp_x_blockdev_del(bool has_id, const char *id,
        bs = blk_bs(blk);
        aio_context = blk_get_aio_context(blk);
    } else {
+        blk = NULL;
        bs = bdrv_find_node(node_name);
        if (!bs) {
            error_setg(errp, "Cannot find node %s", node_name);
            return;
        }
-        blk = bs->blk;
-        if (blk) {
+        if (bdrv_has_blk(bs)) {
            error_setg(errp, "Node %s is in use by %s",
-                       node_name, blk_name(blk));
+                       node_name, bdrv_get_parent_name(bs));
            return;
        }
        aio_context = bdrv_get_aio_context(bs);
@@ -4092,12 +4083,68 @@ out:
    aio_context_release(aio_context);
 }

+static BdrvChild *bdrv_find_child(BlockDriverState *parent_bs,
+                                  const char *child_name)
+{
+    BdrvChild *child;
+
+    QLIST_FOREACH(child, &parent_bs->children, next) {
+        if (strcmp(child->name, child_name) == 0) {
+            return child;
+        }
+    }
+
+    return NULL;
+}
+
+void qmp_x_blockdev_change(const char *parent, bool has_child,
+                           const char *child, bool has_node,
+                           const char *node, Error **errp)
+{
+    BlockDriverState *parent_bs, *new_bs = NULL;
+    BdrvChild *p_child;
+
+    parent_bs = bdrv_lookup_bs(parent, parent, errp);
+    if (!parent_bs) {
+        return;
+    }
+
+    if (has_child == has_node) {
+        if (has_child) {
+            error_setg(errp, "The parameters child and node are in conflict");
+        } else {
+            error_setg(errp, "Either child or node must be specified");
+        }
+        return;
+    }
+
+    if (has_child) {
+        p_child = bdrv_find_child(parent_bs, child);
+        if (!p_child) {
+            error_setg(errp, "Node '%s' does not have child '%s'",
+                       parent, child);
+            return;
+        }
+        bdrv_del_child(parent_bs, p_child, errp);
+    }
+
+    if (has_node) {
+        new_bs = bdrv_find_node(node);
+        if (!new_bs) {
+            error_setg(errp, "Node '%s' not found", node);
+            return;
+        }
+        bdrv_add_child(parent_bs, new_bs, errp);
+    }
+}
+
 BlockJobInfoList *qmp_query_block_jobs(Error **errp)
 {
    BlockJobInfoList *head = NULL, **p_next = &head;
    BlockDriverState *bs;
+    BdrvNextIterator it;

-    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
--- a/blockjob.c
+++ b/blockjob.c
@@ -50,17 +50,31 @@ struct BlockJobTxn {
    int refcnt;
 };

+static QLIST_HEAD(, BlockJob) block_jobs = QLIST_HEAD_INITIALIZER(block_jobs);
+
+BlockJob *block_job_next(BlockJob *job)
+{
+    if (!job) {
+        return QLIST_FIRST(&block_jobs);
+    }
+    return QLIST_NEXT(job, job_list);
+}
+
 void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
                       int64_t speed, BlockCompletionFunc *cb,
                       void *opaque, Error **errp)
 {
+    BlockBackend *blk;
    BlockJob *job;

    if (bs->job) {
        error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
-    bdrv_ref(bs);
+
+    blk = blk_new();
+    blk_insert_bs(blk, bs);
+
    job = g_malloc0(driver->instance_size);
    error_setg(&job->blocker, "block device is in use by block job: %s",
               BlockJobType_lookup[driver->job_type]);
@@ -69,13 +83,15 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,

    job->driver        = driver;
    job->id            = g_strdup(bdrv_get_device_name(bs));
-    job->bs            = bs;
+    job->blk           = blk;
    job->cb            = cb;
    job->opaque        = opaque;
    job->busy          = true;
    job->refcnt        = 1;
    bs->job = job;

+    QLIST_INSERT_HEAD(&block_jobs, job, job_list);
+
    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;
@@ -98,11 +114,13 @@ void block_job_ref(BlockJob *job)
 void block_job_unref(BlockJob *job)
 {
    if (--job->refcnt == 0) {
-        job->bs->job = NULL;
-        bdrv_op_unblock_all(job->bs, job->blocker);
-        bdrv_unref(job->bs);
+        BlockDriverState *bs = blk_bs(job->blk);
+        bs->job = NULL;
+        bdrv_op_unblock_all(bs, job->blocker);
+        blk_unref(job->blk);
        error_free(job->blocker);
        g_free(job->id);
+        QLIST_REMOVE(job, job_list);
        g_free(job);
    }
 }
@@ -140,7 +158,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
    txn->aborting = true;
    /* We are the first failed job. Cancel other jobs. */
    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
        aio_context_acquire(ctx);
    }
    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
@@ -157,7 +175,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
        assert(other_job->completed);
    }
    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
        block_job_completed_single(other_job);
        aio_context_release(ctx);
    }
@@ -179,7 +197,7 @@ static void block_job_completed_txn_success(BlockJob *job)
    }
    /* We are the last completed job, commit the transaction. */
    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
        aio_context_acquire(ctx);
        assert(other_job->ret == 0);
        block_job_completed_single(other_job);
@@ -189,9 +207,7 @@ static void block_job_completed_txn_success(BlockJob *job)

 void block_job_completed(BlockJob *job, int ret)
 {
-    BlockDriverState *bs = job->bs;
-
-    assert(bs->job == job);
+    assert(blk_bs(job->blk)->job == job);
    assert(!job->completed);
    job->completed = true;
    job->ret = ret;
@@ -282,11 +298,10 @@ static int block_job_finish_sync(BlockJob *job,
                                 void (*finish)(BlockJob *, Error **errp),
                                 Error **errp)
 {
-    BlockDriverState *bs = job->bs;
    Error *local_err = NULL;
    int ret;

-    assert(bs->job == job);
+    assert(blk_bs(job->blk)->job == job);

    block_job_ref(job);
    finish(job, &local_err);
@@ -297,7 +312,7 @@ static int block_job_finish_sync(BlockJob *job,
    }
    while (!job->completed) {
        aio_poll(job->deferred_to_main_loop ? qemu_get_aio_context() :
-                                              bdrv_get_aio_context(bs),
+                                              blk_get_aio_context(job->blk),
                 true);
    }
    ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
@@ -318,6 +333,19 @@ int block_job_cancel_sync(BlockJob *job)
    return block_job_finish_sync(job, &block_job_cancel_err, NULL);
 }

+void block_job_cancel_sync_all(void)
+{
+    BlockJob *job;
+    AioContext *aio_context;
+
+    while ((job = QLIST_FIRST(&block_jobs))) {
+        aio_context = blk_get_aio_context(job->blk);
+        aio_context_acquire(aio_context);
+        block_job_cancel_sync(job);
+        aio_context_release(aio_context);
+    }
+}
+
 int block_job_complete_sync(BlockJob *job, Error **errp)
 {
    return block_job_finish_sync(job, &block_job_complete, errp);
@@ -336,7 +364,7 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
    if (block_job_is_paused(job)) {
        qemu_coroutine_yield();
    } else {
-        co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
+        co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns);
    }
    job->busy = true;
 }
@@ -411,8 +439,7 @@ void block_job_event_ready(BlockJob *job)
                                    job->speed, &error_abort);
 }

-BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
-                                        BlockdevOnError on_err,
+BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
                                        int is_read, int error)
 {
    BlockErrorAction action;
@@ -443,9 +470,6 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
        job->user_paused = true;
        block_job_pause(job);
        block_job_iostatus_set_err(job, error);
-        if (bs->blk && bs != job->bs) {
-            blk_iostatus_set_err(bs->blk, error);
-        }
    }
    return action;
 }
@@ -469,7 +493,7 @@ static void block_job_defer_to_main_loop_bh(void *opaque)
    aio_context_acquire(data->aio_context);

    /* Fetch BDS AioContext again, in case it has changed */
-    aio_context = bdrv_get_aio_context(data->job->bs);
+    aio_context = blk_get_aio_context(data->job->blk);
    aio_context_acquire(aio_context);

    data->job->deferred_to_main_loop = false;
@@ -489,7 +513,7 @@ void block_job_defer_to_main_loop(BlockJob *job,
    BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data));
    data->job = job;
    data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data);
-    data->aio_context = bdrv_get_aio_context(job->bs);
+    data->aio_context = blk_get_aio_context(job->blk);
    data->fn = fn;
    data->opaque = opaque;
    job->deferred_to_main_loop = true;
--- a/bootdevice.c
+++ b/bootdevice.c
@@ -28,6 +28,7 @@
 #include "qapi/visitor.h"
 #include "qemu/error-report.h"
 #include "hw/hw.h"
+#include "hw/qdev-core.h"

 typedef struct FWBootEntry FWBootEntry;

--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -25,6 +25,7 @@
 #include "qemu/help_option.h"
 /* For tb_lock */
 #include "cpu.h"
+#include "exec/exec-all.h"
 #include "tcg.h"
 #include "qemu/timer.h"
 #include "qemu/envlist.h"
@@ -752,9 +753,6 @@ int main(int argc, char **argv)
    }

    cpu_model = NULL;
-#if defined(cpudef_setup)
-    cpudef_setup(); /* parse cpu definitions in target config file (TBD) */
-#endif

    optind = 1;
    for(;;) {
@@ -849,6 +847,7 @@ int main(int argc, char **argv)
    }

    /* init debug */
+    qemu_log_needs_buffers();
    qemu_set_log_filename(log_file);
    if (log_mask) {
        int mask;
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -19,6 +19,7 @@


 #include "cpu.h"
+#include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"

 #undef DEBUG_REMAP
--- a/66
+++ b/66
@@ -207,7 +207,7 @@ fdt=""
 netmap="no"
 pixman=""
 sdl=""
-sdlabi="1.2"
+sdlabi=""
 virtfs=""
 vnc="yes"
 sparse="no"
@@ -2157,6 +2157,7 @@ if test "$gtk" != "no"; then
    if $pkg_config --exists "$gtkpackage >= $gtkversion"; then
        gtk_cflags=`$pkg_config --cflags $gtkpackage`
        gtk_libs=`$pkg_config --libs $gtkpackage`
+        gtk_version=`$pkg_config --modversion $gtkpackage`
        if $pkg_config --exists "$gtkx11package >= $gtkversion"; then
            gtk_cflags="$gtk_cflags $x11_cflags"
            gtk_libs="$gtk_libs $x11_libs"
@@ -2392,20 +2393,25 @@ fi

 if test "$vte" != "no"; then
    if test "$gtkabi" = "3.0"; then
-      vtepackage="vte-2.90"
-      vteversion="0.32.0"
+      vteminversion="0.32.0"
+      if $pkg_config --exists "vte-2.91"; then
+        vtepackage="vte-2.91"
+      else
+        vtepackage="vte-2.90"
+      fi
    else
      vtepackage="vte"
-      vteversion="0.24.0"
+      vteminversion="0.24.0"
    fi
-    if $pkg_config --exists "$vtepackage >= $vteversion"; then
+    if $pkg_config --exists "$vtepackage >= $vteminversion"; then
        vte_cflags=`$pkg_config --cflags $vtepackage`
        vte_libs=`$pkg_config --libs $vtepackage`
+        vteversion=`$pkg_config --modversion $vtepackage`
        libs_softmmu="$vte_libs $libs_softmmu"
        vte="yes"
    elif test "$vte" = "yes"; then
        if test "$gtkabi" = "3.0"; then
-            feature_not_found "vte" "Install libvte-2.90 devel"
+            feature_not_found "vte" "Install libvte-2.90/2.91 devel"
        else
            feature_not_found "vte" "Install libvte devel"
        fi
@@ -2420,13 +2426,25 @@ fi
 # Look for sdl configuration program (pkg-config or sdl-config).  Try
 # sdl-config even without cross prefix, and favour pkg-config over sdl-config.

+if test "$sdlabi" = ""; then
+    if $pkg_config --exists "sdl"; then
+        sdlabi=1.2
+    elif $pkg_config --exists "sdl2"; then
+        sdlabi=2.0
+    else
+        sdlabi=1.2
+    fi
+fi
+
 if test $sdlabi = "2.0"; then
    sdl_config=$sdl2_config
    sdlname=sdl2
    sdlconfigname=sdl2_config
-else
+elif test $sdlabi = "1.2"; then
    sdlname=sdl
    sdlconfigname=sdl_config
+else
+    error_exit "Unknown sdlabi $sdlabi, must be 1.2 or 2.0"
 fi

 if test "`basename $sdl_config`" != $sdlconfigname && ! has ${sdl_config}; then
@@ -2435,10 +2453,10 @@ fi

 if $pkg_config $sdlname --exists; then
  sdlconfig="$pkg_config $sdlname"
-  _sdlversion=`$sdlconfig --modversion 2>/dev/null | sed 's/[^0-9]//g'`
+  sdlversion=`$sdlconfig --modversion 2>/dev/null`
 elif has ${sdl_config}; then
  sdlconfig="$sdl_config"
-  _sdlversion=`$sdlconfig --version | sed 's/[^0-9]//g'`
+  sdlversion=`$sdlconfig --version`
 else
  if test "$sdl" = "yes" ; then
    feature_not_found "sdl" "Install SDL devel"
@@ -2463,7 +2481,7 @@ EOF
    sdl_libs=`$sdlconfig --libs 2> /dev/null`
  fi
  if compile_prog "$sdl_cflags" "$sdl_libs" ; then
-    if test "$_sdlversion" -lt 121 ; then
+    if test `echo $sdlversion | sed 's/[^0-9]//g'` -lt 121 ; then
      sdl_too_old=yes
    else
      sdl=yes
@@ -2967,7 +2985,7 @@ int main(void) {
 }
 EOF

-if ! compile_prog "-Werror $CFLAGS" "$LIBS" ; then
+if ! compile_prog "$CFLAGS" "$LIBS" ; then
    error_exit "sizeof(size_t) doesn't match GLIB_SIZEOF_SIZE_T."\
               "You probably need to set PKG_CONFIG_LIBDIR"\
 	       "to point to the right pkg-config files for your"\
@@ -4593,7 +4611,7 @@ if test "$softmmu" = yes ; then
      tools="$tools fsdev/virtfs-proxy-helper\$(EXESUF)"
    else
      if test "$virtfs" = yes; then
-        error_exit "VirtFS is supported only on Linux and requires libcap-devel and libattr-devel"
+        error_exit "VirtFS is supported only on Linux and requires libcap devel and libattr devel"
      fi
      virtfs=no
    fi
@@ -4718,6 +4736,12 @@ EOF
  fi
 fi

+echo_version() {
+    if test "$1" = "yes" ; then
+        echo "($2)"
+    fi
+}
+
 # prepend pixman and ftd flags after all config tests are done
 QEMU_CFLAGS="$pixman_cflags $fdt_cflags $QEMU_CFLAGS"
 libs_softmmu="$pixman_libs $libs_softmmu"
@@ -4767,22 +4791,18 @@ if test "$darwin" = "yes" ; then
    echo "Cocoa support     $cocoa"
 fi
 echo "pixman            $pixman"
-echo "SDL support       $sdl"
-echo "GTK support       $gtk"
+echo "SDL support       $sdl `echo_version $sdl $sdlversion`"
+echo "GTK support       $gtk `echo_version $gtk $gtk_version`"
 echo "GTK GL support    $gtk_gl"
+echo "VTE support       $vte `echo_version $vte $vteversion`"
 echo "GNUTLS support    $gnutls"
 echo "GNUTLS hash       $gnutls_hash"
 echo "GNUTLS rnd        $gnutls_rnd"
 echo "libgcrypt         $gcrypt"
 echo "libgcrypt kdf     $gcrypt_kdf"
-if test "$nettle" = "yes"; then
-    echo "nettle            $nettle ($nettle_version)"
-else
-    echo "nettle            $nettle"
-fi
+echo "nettle            $nettle `echo_version $nettle $nettle_version`"
 echo "nettle kdf        $nettle_kdf"
 echo "libtasn1          $tasn1"
-echo "VTE support       $vte"
 echo "curses support    $curses"
 echo "virgl support     $virglrenderer"
 echo "curl support      $curl"
@@ -4831,11 +4851,7 @@ echo "Trace backends    $trace_backends"
 if have_backend "simple"; then
 echo "Trace output file $trace_file-<pid>"
 fi
-if test "$spice" = "yes"; then
-echo "spice support     $spice ($spice_protocol_version/$spice_server_version)"
-else
-echo "spice support     $spice"
-fi
+echo "spice support     $spice `echo_version $spice $spice_protocol_version/$spice_server_version`"
 echo "rbd support       $rbd"
 echo "xfsctl support    $xfs"
 echo "smartcard support $smartcard"
--- a/contrib/ivshmem-server/ivshmem-server.c
+++ b/contrib/ivshmem-server/ivshmem-server.c
@@ -7,6 +7,7 @@
 */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
+#include "qemu/host-utils.h"
 #include "qemu/sockets.h"

 #include <sys/mman.h>
--- a/contrib/ivshmem-server/ivshmem-server.h
+++ b/contrib/ivshmem-server/ivshmem-server.h
@@ -15,7 +15,7 @@
 * unix socket. For each client, the server will create some eventfd
 * (see EVENTFD(2)), one per vector. These fd are transmitted to all
 * clients using the SCM_RIGHTS cmsg message. Therefore, each client is
- * able to send a notification to another client without beeing
+ * able to send a notification to another client without being
 * "profixied" by the server.
 *
 * We use this mechanism to send interruptions between guests.
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@@ -20,6 +20,7 @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "sysemu/cpus.h"
+#include "exec/exec-all.h"
 #include "exec/memory-internal.h"

 bool exit_request;
@@ -68,7 +69,6 @@ void cpu_reloading_memory_map(void)

 void cpu_loop_exit(CPUState *cpu)
 {
-    cpu->current_tb = NULL;
    siglongjmp(cpu->jmp_env, 1);
 }

@@ -77,6 +77,5 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
    if (pc) {
        cpu_restore_state(cpu, pc);
    }
-    cpu->current_tb = NULL;
    siglongjmp(cpu->jmp_env, 1);
 }
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -20,6 +20,7 @@
 #include "cpu.h"
 #include "trace.h"
 #include "disas/disas.h"
+#include "exec/exec-all.h"
 #include "tcg.h"
 #include "qemu/atomic.h"
 #include "sysemu/qtest.h"
@@ -136,7 +137,9 @@ static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
 static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
 {
    CPUArchState *env = cpu->env_ptr;
-    uintptr_t next_tb;
+    uintptr_t ret;
+    TranslationBlock *last_tb;
+    int tb_exit;
    uint8_t *tb_ptr = itb->tc_ptr;

    qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
@@ -160,118 +163,125 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
 #endif /* DEBUG_DISAS */

    cpu->can_do_io = !use_icount;
-    next_tb = tcg_qemu_tb_exec(env, tb_ptr);
+    ret = tcg_qemu_tb_exec(env, tb_ptr);
    cpu->can_do_io = 1;
-    trace_exec_tb_exit((void *) (next_tb & ~TB_EXIT_MASK),
-                       next_tb & TB_EXIT_MASK);
+    last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
+    tb_exit = ret & TB_EXIT_MASK;
+    trace_exec_tb_exit(last_tb, tb_exit);

-    if ((next_tb & TB_EXIT_MASK) > TB_EXIT_IDX1) {
+    if (tb_exit > TB_EXIT_IDX1) {
        /* We didn't start executing this TB (eg because the instruction
         * counter hit zero); we must restore the guest PC to the address
         * of the start of the TB.
         */
        CPUClass *cc = CPU_GET_CLASS(cpu);
-        TranslationBlock *tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
-        qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
+        qemu_log_mask_and_addr(CPU_LOG_EXEC, last_tb->pc,
                               "Stopped execution of TB chain before %p ["
                               TARGET_FMT_lx "] %s\n",
-                               itb->tc_ptr, itb->pc, lookup_symbol(itb->pc));
+                               last_tb->tc_ptr, last_tb->pc,
+                               lookup_symbol(last_tb->pc));
        if (cc->synchronize_from_tb) {
-            cc->synchronize_from_tb(cpu, tb);
+            cc->synchronize_from_tb(cpu, last_tb);
        } else {
            assert(cc->set_pc);
-            cc->set_pc(cpu, tb->pc);
+            cc->set_pc(cpu, last_tb->pc);
        }
    }
-    if ((next_tb & TB_EXIT_MASK) == TB_EXIT_REQUESTED) {
+    if (tb_exit == TB_EXIT_REQUESTED) {
        /* We were asked to stop executing TBs (probably a pending
         * interrupt. We've now stopped, so clear the flag.
         */
        cpu->tcg_exit_req = 0;
    }
-    return next_tb;
+    return ret;
 }

+#ifndef CONFIG_USER_ONLY
 /* Execute the code without caching the generated code. An interpreter
   could be used if available. */
 static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
                             TranslationBlock *orig_tb, bool ignore_icount)
 {
    TranslationBlock *tb;
+    bool old_tb_flushed;

    /* Should never happen.
       We only end up here when an existing TB is too long.  */
    if (max_cycles > CF_COUNT_MASK)
        max_cycles = CF_COUNT_MASK;

+    old_tb_flushed = cpu->tb_flushed;
+    cpu->tb_flushed = false;
    tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
                     max_cycles | CF_NOCACHE
                         | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
-    tb->orig_tb = tcg_ctx.tb_ctx.tb_invalidated_flag ? NULL : orig_tb;
-    cpu->current_tb = tb;
+    tb->orig_tb = cpu->tb_flushed ? NULL : orig_tb;
+    cpu->tb_flushed |= old_tb_flushed;
    /* execute the generated code */
    trace_exec_tb_nocache(tb, tb->pc);
    cpu_tb_exec(cpu, tb);
-    cpu->current_tb = NULL;
    tb_phys_invalidate(tb, -1);
    tb_free(tb);
 }
+#endif

 static TranslationBlock *tb_find_physical(CPUState *cpu,
                                          target_ulong pc,
                                          target_ulong cs_base,
-                                          uint64_t flags)
+                                          uint32_t flags)
 {
    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
-    TranslationBlock *tb, **ptb1;
+    TranslationBlock *tb, **tb_hash_head, **ptb1;
    unsigned int h;
    tb_page_addr_t phys_pc, phys_page1;
-    target_ulong virt_page2;
-
-    tcg_ctx.tb_ctx.tb_invalidated_flag = 0;

    /* find translated block using physical mappings */
    phys_pc = get_page_addr_code(env, pc);
    phys_page1 = phys_pc & TARGET_PAGE_MASK;
    h = tb_phys_hash_func(phys_pc);
-    ptb1 = &tcg_ctx.tb_ctx.tb_phys_hash[h];
-    for(;;) {
-        tb = *ptb1;
-        if (!tb) {
-            return NULL;
-        }
+
+    /* Start at head of the hash entry */
+    ptb1 = tb_hash_head = &tcg_ctx.tb_ctx.tb_phys_hash[h];
+    tb = *ptb1;
+
+    while (tb) {
        if (tb->pc == pc &&
            tb->page_addr[0] == phys_page1 &&
            tb->cs_base == cs_base &&
            tb->flags == flags) {
-            /* check next page if needed */
-            if (tb->page_addr[1] != -1) {
-                tb_page_addr_t phys_page2;

-                virt_page2 = (pc & TARGET_PAGE_MASK) +
-                    TARGET_PAGE_SIZE;
-                phys_page2 = get_page_addr_code(env, virt_page2);
+            if (tb->page_addr[1] == -1) {
+                /* done, we have a match */
+                break;
+            } else {
+                /* check next page if needed */
+                target_ulong virt_page2 = (pc & TARGET_PAGE_MASK) +
+                                          TARGET_PAGE_SIZE;
+                tb_page_addr_t phys_page2 = get_page_addr_code(env, virt_page2);
+
                if (tb->page_addr[1] == phys_page2) {
                    break;
                }
-            } else {
-                break;
            }
        }
+
        ptb1 = &tb->phys_hash_next;
+        tb = *ptb1;
    }

-    /* Move the TB to the head of the list */
-    *ptb1 = tb->phys_hash_next;
-    tb->phys_hash_next = tcg_ctx.tb_ctx.tb_phys_hash[h];
-    tcg_ctx.tb_ctx.tb_phys_hash[h] = tb;
+    if (tb) {
+        /* Move the TB to the head of the list */
+        *ptb1 = tb->phys_hash_next;
+        tb->phys_hash_next = *tb_hash_head;
+        *tb_hash_head = tb;
+    }
    return tb;
 }

 static TranslationBlock *tb_find_slow(CPUState *cpu,
                                      target_ulong pc,
                                      target_ulong cs_base,
-                                      uint64_t flags)
+                                      uint32_t flags)
 {
    TranslationBlock *tb;

@@ -309,26 +319,72 @@ found:
    return tb;
 }

-static inline TranslationBlock *tb_find_fast(CPUState *cpu)
+static inline TranslationBlock *tb_find_fast(CPUState *cpu,
+                                             TranslationBlock **last_tb,
+                                             int tb_exit)
 {
    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
    TranslationBlock *tb;
    target_ulong cs_base, pc;
-    int flags;
+    uint32_t flags;

    /* we record a subset of the CPU state. It will
       always be the same before a given translated block
       is executed. */
    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+    tb_lock();
    tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
    if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                 tb->flags != flags)) {
        tb = tb_find_slow(cpu, pc, cs_base, flags);
    }
+    if (cpu->tb_flushed) {
+        /* Ensure that no TB jump will be modified as the
+         * translation buffer has been flushed.
+         */
+        *last_tb = NULL;
+        cpu->tb_flushed = false;
+    }
+#ifndef CONFIG_USER_ONLY
+    /* We don't take care of direct jumps when address mapping changes in
+     * system emulation. So it's not safe to make a direct jump to a TB
+     * spanning two pages because the mapping for the second page can change.
+     */
+    if (tb->page_addr[1] != -1) {
+        *last_tb = NULL;
+    }
+#endif
+    /* See if we can patch the calling TB. */
+    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
+        tb_add_jump(*last_tb, tb_exit, tb);
+    }
+    tb_unlock();
    return tb;
 }

-static void cpu_handle_debug_exception(CPUState *cpu)
+static inline bool cpu_handle_halt(CPUState *cpu)
+{
+    if (cpu->halted) {
+#if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
+        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
+            && replay_interrupt()) {
+            X86CPU *x86_cpu = X86_CPU(cpu);
+            apic_poll_irq(x86_cpu->apic_state);
+            cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
+        }
+#endif
+        if (!cpu_has_work(cpu)) {
+            current_cpu = NULL;
+            return true;
+        }
+
+        cpu->halted = 0;
+    }
+
+    return false;
+}
+
+static inline void cpu_handle_debug_exception(CPUState *cpu)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUWatchpoint *wp;
@@ -342,37 +398,197 @@ static void cpu_handle_debug_exception(CPUState *cpu)
    cc->debug_excp_handler(cpu);
 }

+static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
+{
+    if (cpu->exception_index >= 0) {
+        if (cpu->exception_index >= EXCP_INTERRUPT) {
+            /* exit request from the cpu execution loop */
+            *ret = cpu->exception_index;
+            if (*ret == EXCP_DEBUG) {
+                cpu_handle_debug_exception(cpu);
+            }
+            cpu->exception_index = -1;
+            return true;
+        } else {
+#if defined(CONFIG_USER_ONLY)
+            /* if user mode only, we simulate a fake exception
+               which will be handled outside the cpu execution
+               loop */
+#if defined(TARGET_I386)
+            CPUClass *cc = CPU_GET_CLASS(cpu);
+            cc->do_interrupt(cpu);
+#endif
+            *ret = cpu->exception_index;
+            cpu->exception_index = -1;
+            return true;
+#else
+            if (replay_exception()) {
+                CPUClass *cc = CPU_GET_CLASS(cpu);
+                cc->do_interrupt(cpu);
+                cpu->exception_index = -1;
+            } else if (!replay_has_interrupt()) {
+                /* give a chance to iothread in replay mode */
+                *ret = EXCP_INTERRUPT;
+                return true;
+            }
+#endif
+        }
+#ifndef CONFIG_USER_ONLY
+    } else if (replay_has_exception()
+               && cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
+        /* try to cause an exception pending in the log */
+        TranslationBlock *last_tb = NULL; /* Avoid chaining TBs */
+        cpu_exec_nocache(cpu, 1, tb_find_fast(cpu, &last_tb, 0), true);
+        *ret = -1;
+        return true;
+#endif
+    }
+
+    return false;
+}
+
+static inline void cpu_handle_interrupt(CPUState *cpu,
+                                        TranslationBlock **last_tb)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+    int interrupt_request = cpu->interrupt_request;
+
+    if (unlikely(interrupt_request)) {
+        if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
+            /* Mask out external interrupts for this step. */
+            interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
+        }
+        if (interrupt_request & CPU_INTERRUPT_DEBUG) {
+            cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
+            cpu->exception_index = EXCP_DEBUG;
+            cpu_loop_exit(cpu);
+        }
+        if (replay_mode == REPLAY_MODE_PLAY && !replay_has_interrupt()) {
+            /* Do nothing */
+        } else if (interrupt_request & CPU_INTERRUPT_HALT) {
+            replay_interrupt();
+            cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
+            cpu->halted = 1;
+            cpu->exception_index = EXCP_HLT;
+            cpu_loop_exit(cpu);
+        }
+#if defined(TARGET_I386)
+        else if (interrupt_request & CPU_INTERRUPT_INIT) {
+            X86CPU *x86_cpu = X86_CPU(cpu);
+            CPUArchState *env = &x86_cpu->env;
+            replay_interrupt();
+            cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
+            do_cpu_init(x86_cpu);
+            cpu->exception_index = EXCP_HALTED;
+            cpu_loop_exit(cpu);
+        }
+#else
+        else if (interrupt_request & CPU_INTERRUPT_RESET) {
+            replay_interrupt();
+            cpu_reset(cpu);
+            cpu_loop_exit(cpu);
+        }
+#endif
+        /* The target hook has 3 exit conditions:
+           False when the interrupt isn't processed,
+           True when it is, and we should restart on a new TB,
+           and via longjmp via cpu_loop_exit.  */
+        else {
+            replay_interrupt();
+            if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
+                *last_tb = NULL;
+            }
+            /* The target hook may have updated the 'cpu->interrupt_request';
+             * reload the 'interrupt_request' value */
+            interrupt_request = cpu->interrupt_request;
+        }
+        if (interrupt_request & CPU_INTERRUPT_EXITTB) {
+            cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
+            /* ensure that no TB jump will be modified as
+               the program flow was changed */
+            *last_tb = NULL;
+        }
+    }
+    if (unlikely(cpu->exit_request || replay_has_interrupt())) {
+        cpu->exit_request = 0;
+        cpu->exception_index = EXCP_INTERRUPT;
+        cpu_loop_exit(cpu);
+    }
+}
+
+static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
+                                    TranslationBlock **last_tb, int *tb_exit,
+                                    SyncClocks *sc)
+{
+    uintptr_t ret;
+
+    if (unlikely(cpu->exit_request)) {
+        return;
+    }
+
+    trace_exec_tb(tb, tb->pc);
+    ret = cpu_tb_exec(cpu, tb);
+    *last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
+    *tb_exit = ret & TB_EXIT_MASK;
+    switch (*tb_exit) {
+    case TB_EXIT_REQUESTED:
+        /* Something asked us to stop executing
+         * chained TBs; just continue round the main
+         * loop. Whatever requested the exit will also
+         * have set something else (eg exit_request or
+         * interrupt_request) which we will handle
+         * next time around the loop.  But we need to
+         * ensure the tcg_exit_req read in generated code
+         * comes before the next read of cpu->exit_request
+         * or cpu->interrupt_request.
+         */
+        smp_rmb();
+        *last_tb = NULL;
+        break;
+    case TB_EXIT_ICOUNT_EXPIRED:
+    {
+        /* Instruction counter expired.  */
+#ifdef CONFIG_USER_ONLY
+        abort();
+#else
+        int insns_left = cpu->icount_decr.u32;
+        if (cpu->icount_extra && insns_left >= 0) {
+            /* Refill decrementer and continue execution.  */
+            cpu->icount_extra += insns_left;
+            insns_left = MIN(0xffff, cpu->icount_extra);
+            cpu->icount_extra -= insns_left;
+            cpu->icount_decr.u16.low = insns_left;
+        } else {
+            if (insns_left > 0) {
+                /* Execute remaining instructions.  */
+                cpu_exec_nocache(cpu, insns_left, *last_tb, false);
+                align_clocks(sc, cpu);
+            }
+            cpu->exception_index = EXCP_INTERRUPT;
+            *last_tb = NULL;
+            cpu_loop_exit(cpu);
+        }
+        break;
+#endif
+    }
+    default:
+        break;
+    }
+}
+
 /* main execution loop */

 int cpu_exec(CPUState *cpu)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
-#ifdef TARGET_I386
-    X86CPU *x86_cpu = X86_CPU(cpu);
-    CPUArchState *env = &x86_cpu->env;
-#endif
-    int ret, interrupt_request;
-    TranslationBlock *tb;
-    uintptr_t next_tb;
+    int ret;
    SyncClocks sc;

    /* replay_interrupt may need current_cpu */
    current_cpu = cpu;

-    if (cpu->halted) {
-#if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
-        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
-            && replay_interrupt()) {
-            apic_poll_irq(x86_cpu->apic_state);
-            cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
-        }
-#endif
-        if (!cpu_has_work(cpu)) {
-            current_cpu = NULL;
-            return EXCP_HALTED;
-        }
-
-        cpu->halted = 0;
+    if (cpu_handle_halt(cpu)) {
+        return EXCP_HALTED;
    }

    atomic_mb_set(&tcg_current_cpu, cpu);
@@ -391,185 +607,26 @@ int cpu_exec(CPUState *cpu)
     */
    init_delay_params(&sc, cpu);

-    /* prepare setjmp context for exception handling */
    for(;;) {
+        TranslationBlock *tb, *last_tb;
+        int tb_exit = 0;
+
+        /* prepare setjmp context for exception handling */
        if (sigsetjmp(cpu->jmp_env, 0) == 0) {
            /* if an exception is pending, we execute it here */
-            if (cpu->exception_index >= 0) {
-                if (cpu->exception_index >= EXCP_INTERRUPT) {
-                    /* exit request from the cpu execution loop */
-                    ret = cpu->exception_index;
-                    if (ret == EXCP_DEBUG) {
-                        cpu_handle_debug_exception(cpu);
-                    }
-                    cpu->exception_index = -1;
-                    break;
-                } else {
-#if defined(CONFIG_USER_ONLY)
-                    /* if user mode only, we simulate a fake exception
-                       which will be handled outside the cpu execution
-                       loop */
-#if defined(TARGET_I386)
-                    cc->do_interrupt(cpu);
-#endif
-                    ret = cpu->exception_index;
-                    cpu->exception_index = -1;
-                    break;
-#else
-                    if (replay_exception()) {
-                        cc->do_interrupt(cpu);
-                        cpu->exception_index = -1;
-                    } else if (!replay_has_interrupt()) {
-                        /* give a chance to iothread in replay mode */
-                        ret = EXCP_INTERRUPT;
-                        break;
-                    }
-#endif
-                }
-            } else if (replay_has_exception()
-                       && cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
-                /* try to cause an exception pending in the log */
-                cpu_exec_nocache(cpu, 1, tb_find_fast(cpu), true);
-                ret = -1;
+            if (cpu_handle_exception(cpu, &ret)) {
                break;
            }

-            next_tb = 0; /* force lookup of first TB */
+            last_tb = NULL; /* forget the last executed TB after exception */
+            cpu->tb_flushed = false; /* reset before first TB lookup */
            for(;;) {
-                interrupt_request = cpu->interrupt_request;
-                if (unlikely(interrupt_request)) {
-                    if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
-                        /* Mask out external interrupts for this step. */
-                        interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
-                    }
-                    if (interrupt_request & CPU_INTERRUPT_DEBUG) {
-                        cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
-                        cpu->exception_index = EXCP_DEBUG;
-                        cpu_loop_exit(cpu);
-                    }
-                    if (replay_mode == REPLAY_MODE_PLAY
-                        && !replay_has_interrupt()) {
-                        /* Do nothing */
-                    } else if (interrupt_request & CPU_INTERRUPT_HALT) {
-                        replay_interrupt();
-                        cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
-                        cpu->halted = 1;
-                        cpu->exception_index = EXCP_HLT;
-                        cpu_loop_exit(cpu);
-                    }
-#if defined(TARGET_I386)
-                    else if (interrupt_request & CPU_INTERRUPT_INIT) {
-                        replay_interrupt();
-                        cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
-                        do_cpu_init(x86_cpu);
-                        cpu->exception_index = EXCP_HALTED;
-                        cpu_loop_exit(cpu);
-                    }
-#else
-                    else if (interrupt_request & CPU_INTERRUPT_RESET) {
-                        replay_interrupt();
-                        cpu_reset(cpu);
-                        cpu_loop_exit(cpu);
-                    }
-#endif
-                    /* The target hook has 3 exit conditions:
-                       False when the interrupt isn't processed,
-                       True when it is, and we should restart on a new TB,
-                       and via longjmp via cpu_loop_exit.  */
-                    else {
-                        replay_interrupt();
-                        if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
-                            next_tb = 0;
-                        }
-                    }
-                    /* Don't use the cached interrupt_request value,
-                       do_interrupt may have updated the EXITTB flag. */
-                    if (cpu->interrupt_request & CPU_INTERRUPT_EXITTB) {
-                        cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
-                        /* ensure that no TB jump will be modified as
-                           the program flow was changed */
-                        next_tb = 0;
-                    }
-                }
-                if (unlikely(cpu->exit_request
-                             || replay_has_interrupt())) {
-                    cpu->exit_request = 0;
-                    cpu->exception_index = EXCP_INTERRUPT;
-                    cpu_loop_exit(cpu);
-                }
-                tb_lock();
-                tb = tb_find_fast(cpu);
-                /* Note: we do it here to avoid a gcc bug on Mac OS X when
-                   doing it in tb_find_slow */
-                if (tcg_ctx.tb_ctx.tb_invalidated_flag) {
-                    /* as some TB could have been invalidated because
-                       of memory exceptions while generating the code, we
-                       must recompute the hash index here */
-                    next_tb = 0;
-                    tcg_ctx.tb_ctx.tb_invalidated_flag = 0;
-                }
-                /* see if we can patch the calling TB. When the TB
-                   spans two pages, we cannot safely do a direct
-                   jump. */
-                if (next_tb != 0 && tb->page_addr[1] == -1
-                    && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-                    tb_add_jump((TranslationBlock *)(next_tb & ~TB_EXIT_MASK),
-                                next_tb & TB_EXIT_MASK, tb);
-                }
-                tb_unlock();
-                if (likely(!cpu->exit_request)) {
-                    trace_exec_tb(tb, tb->pc);
-                    /* execute the generated code */
-                    cpu->current_tb = tb;
-                    next_tb = cpu_tb_exec(cpu, tb);
-                    cpu->current_tb = NULL;
-                    switch (next_tb & TB_EXIT_MASK) {
-                    case TB_EXIT_REQUESTED:
-                        /* Something asked us to stop executing
-                         * chained TBs; just continue round the main
-                         * loop. Whatever requested the exit will also
-                         * have set something else (eg exit_request or
-                         * interrupt_request) which we will handle
-                         * next time around the loop.  But we need to
-                         * ensure the tcg_exit_req read in generated code
-                         * comes before the next read of cpu->exit_request
-                         * or cpu->interrupt_request.
-                         */
-                        smp_rmb();
-                        next_tb = 0;
-                        break;
-                    case TB_EXIT_ICOUNT_EXPIRED:
-                    {
-                        /* Instruction counter expired.  */
-                        int insns_left = cpu->icount_decr.u32;
-                        if (cpu->icount_extra && insns_left >= 0) {
-                            /* Refill decrementer and continue execution.  */
-                            cpu->icount_extra += insns_left;
-                            insns_left = MIN(0xffff, cpu->icount_extra);
-                            cpu->icount_extra -= insns_left;
-                            cpu->icount_decr.u16.low = insns_left;
-                        } else {
-                            if (insns_left > 0) {
-                                /* Execute remaining instructions.  */
-                                tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
-                                cpu_exec_nocache(cpu, insns_left, tb, false);
-                                align_clocks(&sc, cpu);
-                            }
-                            cpu->exception_index = EXCP_INTERRUPT;
-                            next_tb = 0;
-                            cpu_loop_exit(cpu);
-                        }
-                        break;
-                    }
-                    default:
-                        break;
-                    }
-                }
+                cpu_handle_interrupt(cpu, &last_tb);
+                tb = tb_find_fast(cpu, &last_tb, tb_exit);
+                cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit, &sc);
                /* Try to align the host and virtual clocks
                   if the guest is in advance */
                align_clocks(&sc, cpu);
-                /* reset soft MMU for next block (it can currently
-                   only be set by a memory fault) */
            } /* for(;;) */
        } else {
 #if defined(__clang__) || !QEMU_GNUC_PREREQ(4, 6)
@@ -579,18 +636,10 @@ int cpu_exec(CPUState *cpu)
             * Newer versions of gcc would complain about this code (-Wclobbered). */
            cpu = current_cpu;
            cc = CPU_GET_CLASS(cpu);
-#ifdef TARGET_I386
-            x86_cpu = X86_CPU(cpu);
-            env = &x86_cpu->env;
-#endif
 #else /* buggy compiler */
            /* Assert that the compiler does not smash local variables. */
            g_assert(cpu == current_cpu);
            g_assert(cc == CPU_GET_CLASS(cpu));
-#ifdef TARGET_I386
-            g_assert(x86_cpu == X86_CPU(cpu));
-            g_assert(env == &x86_cpu->env);
-#endif
 #endif /* buggy compiler */
            cpu->can_do_io = 1;
            tb_lock_reset();
--- a/cpus.c
+++ b/cpus.c
@@ -24,7 +24,8 @@

 /* Needed early for CONFIG_BSD etc. */
 #include "qemu/osdep.h"
-
+#include "qemu-common.h"
+#include "cpu.h"
 #include "monitor/monitor.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
@@ -34,6 +35,7 @@
 #include "sysemu/dma.h"
 #include "sysemu/kvm.h"
 #include "qmp-commands.h"
+#include "exec/exec-all.h"

 #include "qemu/thread.h"
 #include "sysemu/cpus.h"
@@ -778,7 +780,7 @@ static void sigbus_reraise(void)
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
-        sigprocmask(SIG_UNBLOCK, &set, NULL);
+        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
@@ -970,6 +972,18 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
    qemu_cpu_kick(cpu);
 }

+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
+{
+    if (kvm_destroy_vcpu(cpu) < 0) {
+        error_report("kvm_destroy_vcpu failed");
+        exit(EXIT_FAILURE);
+    }
+}
+
+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
+{
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
    struct qemu_work_item *wi;
@@ -1059,7 +1073,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

-    while (1) {
+    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
@@ -1067,8 +1081,12 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
            }
        }
        qemu_kvm_wait_io_event(cpu);
-    }
+    } while (!cpu->unplug || cpu_can_run(cpu));

+    qemu_kvm_destroy_vcpu(cpu);
+    cpu->created = false;
+    qemu_cond_signal(&qemu_cpu_cond);
+    qemu_mutex_unlock_iothread();
    return NULL;
 }

@@ -1122,6 +1140,7 @@ static void tcg_exec_all(void);
 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
    CPUState *cpu = arg;
+    CPUState *remove_cpu = NULL;

    rcu_register_thread();

@@ -1159,6 +1178,18 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
            }
        }
        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+        CPU_FOREACH(cpu) {
+            if (cpu->unplug && !cpu_can_run(cpu)) {
+                remove_cpu = cpu;
+                break;
+            }
+        }
+        if (remove_cpu) {
+            qemu_tcg_destroy_vcpu(remove_cpu);
+            cpu->created = false;
+            qemu_cond_signal(&qemu_cpu_cond);
+            remove_cpu = NULL;
+        }
    }

    return NULL;
@@ -1315,6 +1346,21 @@ void resume_all_vcpus(void)
    }
 }

+void cpu_remove(CPUState *cpu)
+{
+    cpu->stop = true;
+    cpu->unplug = true;
+    qemu_cpu_kick(cpu);
+}
+
+void cpu_remove_sync(CPUState *cpu)
+{
+    cpu_remove(cpu);
+    while (cpu->created) {
+        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
+    }
+}
+
 /* For temporary buffers for forming a name */
 #define VCPU_THREAD_NAME_SIZE 16

@@ -1531,6 +1577,9 @@ static void tcg_exec_all(void)
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
+            if (cpu->unplug) {
+                next_cpu = CPU_NEXT(cpu);
+            }
            break;
        }
    }
@@ -1691,21 +1740,7 @@ exit:

 void qmp_inject_nmi(Error **errp)
 {
-#if defined(TARGET_I386)
-    CPUState *cs;
-
-    CPU_FOREACH(cs) {
-        X86CPU *cpu = X86_CPU(cs);
-
-        if (!cpu->apic_state) {
-            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
-        } else {
-            apic_deliver_nmi(cpu->apic_state);
-        }
-    }
-#else
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
-#endif
 }

 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
--- a/cputlb.c
+++ b/cputlb.c
@@ -28,6 +28,7 @@

 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"
+#include "exec/exec-all.h"
 #include "tcg/tcg.h"

 /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
@@ -76,10 +77,6 @@ void tlb_flush(CPUState *cpu, int flush_global)

    tlb_debug("(%d)\n", flush_global);

-    /* must reset current TB so that interrupts cannot modify the
-       links while we are modifying them */
-    cpu->current_tb = NULL;
-
    memset(env->tlb_table, -1, sizeof(env->tlb_table));
    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
@@ -95,9 +92,6 @@ static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
    CPUArchState *env = cpu->env_ptr;

    tlb_debug("start\n");
-    /* must reset current TB so that interrupts cannot modify the
-       links while we are modifying them */
-    cpu->current_tb = NULL;

    for (;;) {
        int mmu_idx = va_arg(argp, int);
@@ -152,9 +146,6 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
        tlb_flush(cpu, 1);
        return;
    }
-    /* must reset current TB so that interrupts cannot modify the
-       links while we are modifying them */
-    cpu->current_tb = NULL;

    addr &= TARGET_PAGE_MASK;
    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
@@ -193,9 +184,6 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
        va_end(argp);
        return;
    }
-    /* must reset current TB so that interrupts cannot modify the
-       links while we are modifying them */
-    cpu->current_tb = NULL;

    addr &= TARGET_PAGE_MASK;
    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
@@ -258,7 +246,8 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
 {
    ram_addr_t ram_addr;

-    if (qemu_ram_addr_from_host(ptr, &ram_addr) == NULL) {
+    ram_addr = qemu_ram_addr_from_host(ptr);
+    if (ram_addr == RAM_ADDR_INVALID) {
        fprintf(stderr, "Bad ram pointer %p\n", ptr);
        abort();
    }
--- a/crypto/afsplit.c
+++ b/crypto/afsplit.c
@@ -24,6 +24,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/bswap.h"
 #include "crypto/afsplit.h"
 #include "crypto/random.h"

--- a/crypto/block-luks.c
+++ b/crypto/block-luks.c
@@ -20,6 +20,7 @@

 #include "qemu/osdep.h"
 #include "qapi/error.h"
+#include "qemu/bswap.h"

 #include "crypto/block-luks.h"

--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -100,6 +100,7 @@ CONFIG_ALLWINNER_A10_PIT=y
 CONFIG_ALLWINNER_A10_PIC=y
 CONFIG_ALLWINNER_A10=y

+CONFIG_FSL_IMX6=y
 CONFIG_FSL_IMX31=y
 CONFIG_FSL_IMX25=y

--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -18,6 +18,7 @@ CONFIG_MEGASAS_SCSI_PCI=y
 CONFIG_MPTSAS_SCSI_PCI=y
 CONFIG_RTL8139_PCI=y
 CONFIG_E1000_PCI=y
+CONFIG_E1000E_PCI=y
 CONFIG_VMXNET3_PCI=y
 CONFIG_IDE_CORE=y
 CONFIG_IDE_QDEV=y
--- a/device_tree.c
+++ b/device_tree.c
@@ -20,6 +20,7 @@
 #include "qapi/error.h"
 #include "qemu-common.h"
 #include "qemu/error-report.h"
+#include "qemu/bswap.h"
 #include "sysemu/device_tree.h"
 #include "sysemu/sysemu.h"
 #include "hw/loader.h"
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -70,16 +70,17 @@ void qemu_sglist_destroy(QEMUSGList *qsg)

 typedef struct {
    BlockAIOCB common;
-    BlockBackend *blk;
+    AioContext *ctx;
    BlockAIOCB *acb;
    QEMUSGList *sg;
-    uint64_t sector_num;
+    uint64_t offset;
    DMADirection dir;
    int sg_cur_index;
    dma_addr_t sg_cur_byte;
    QEMUIOVector iov;
    QEMUBH *bh;
    DMAIOFunc *io_func;
+    void *io_func_opaque;
 } DMAAIOCB;

 static void dma_blk_cb(void *opaque, int ret);
@@ -130,7 +131,7 @@ static void dma_blk_cb(void *opaque, int ret)
    trace_dma_blk_cb(dbs, ret);

    dbs->acb = NULL;
-    dbs->sector_num += dbs->iov.size / 512;
+    dbs->offset += dbs->iov.size;

    if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
        dma_complete(dbs, ret);
@@ -154,8 +155,7 @@ static void dma_blk_cb(void *opaque, int ret)

    if (dbs->iov.size == 0) {
        trace_dma_map_wait(dbs);
-        dbs->bh = aio_bh_new(blk_get_aio_context(dbs->blk),
-                             reschedule_dma, dbs);
+        dbs->bh = aio_bh_new(dbs->ctx, reschedule_dma, dbs);
        cpu_register_map_client(dbs->bh);
        return;
    }
@@ -164,8 +164,8 @@ static void dma_blk_cb(void *opaque, int ret)
        qemu_iovec_discard_back(&dbs->iov, dbs->iov.size & ~BDRV_SECTOR_MASK);
    }

-    dbs->acb = dbs->io_func(dbs->blk, dbs->sector_num, &dbs->iov,
-                            dbs->iov.size / 512, dma_blk_cb, dbs);
+    dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
+                            dma_blk_cb, dbs, dbs->io_func_opaque);
    assert(dbs->acb);
 }

@@ -191,23 +191,25 @@ static const AIOCBInfo dma_aiocb_info = {
    .cancel_async       = dma_aio_cancel,
 };

-BlockAIOCB *dma_blk_io(
-    BlockBackend *blk, QEMUSGList *sg, uint64_t sector_num,
-    DMAIOFunc *io_func, BlockCompletionFunc *cb,
+BlockAIOCB *dma_blk_io(AioContext *ctx,
+    QEMUSGList *sg, uint64_t offset,
+    DMAIOFunc *io_func, void *io_func_opaque,
+    BlockCompletionFunc *cb,
    void *opaque, DMADirection dir)
 {
-    DMAAIOCB *dbs = blk_aio_get(&dma_aiocb_info, blk, cb, opaque);
+    DMAAIOCB *dbs = qemu_aio_get(&dma_aiocb_info, NULL, cb, opaque);

-    trace_dma_blk_io(dbs, blk, sector_num, (dir == DMA_DIRECTION_TO_DEVICE));
+    trace_dma_blk_io(dbs, io_func_opaque, offset, (dir == DMA_DIRECTION_TO_DEVICE));

    dbs->acb = NULL;
-    dbs->blk = blk;
    dbs->sg = sg;
-    dbs->sector_num = sector_num;
+    dbs->ctx = ctx;
+    dbs->offset = offset;
    dbs->sg_cur_index = 0;
    dbs->sg_cur_byte = 0;
    dbs->dir = dir;
    dbs->io_func = io_func;
+    dbs->io_func_opaque = io_func_opaque;
    dbs->bh = NULL;
    qemu_iovec_init(&dbs->iov, sg->nsg);
    dma_blk_cb(dbs, 0);
@@ -215,19 +217,39 @@ BlockAIOCB *dma_blk_io(
 }


+static
+BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov,
+                                 BlockCompletionFunc *cb, void *cb_opaque,
+                                 void *opaque)
+{
+    BlockBackend *blk = opaque;
+    return blk_aio_preadv(blk, offset, iov, 0, cb, cb_opaque);
+}
+
 BlockAIOCB *dma_blk_read(BlockBackend *blk,
-                         QEMUSGList *sg, uint64_t sector,
+                         QEMUSGList *sg, uint64_t offset,
                         void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk, sg, sector, blk_aio_readv, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk),
+                      sg, offset, dma_blk_read_io_func, blk, cb, opaque,
                      DMA_DIRECTION_FROM_DEVICE);
 }

+static
+BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov,
+                                  BlockCompletionFunc *cb, void *cb_opaque,
+                                  void *opaque)
+{
+    BlockBackend *blk = opaque;
+    return blk_aio_pwritev(blk, offset, iov, 0, cb, cb_opaque);
+}
+
 BlockAIOCB *dma_blk_write(BlockBackend *blk,
-                          QEMUSGList *sg, uint64_t sector,
+                          QEMUSGList *sg, uint64_t offset,
                          void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk, sg, sector, blk_aio_writev, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk),
+                      sg, offset, dma_blk_write_io_func, blk, cb, opaque,
                      DMA_DIRECTION_TO_DEVICE);
 }

--- a/docs/atomics.txt
+++ b/docs/atomics.txt
@@ -62,7 +62,7 @@ operations:
    typeof(*ptr) atomic_fetch_sub(ptr, val)
    typeof(*ptr) atomic_fetch_and(ptr, val)
    typeof(*ptr) atomic_fetch_or(ptr, val)
-    typeof(*ptr) atomic_xchg(ptr, val
+    typeof(*ptr) atomic_xchg(ptr, val)
    typeof(*ptr) atomic_cmpxchg(ptr, old, new)

 all of which return the old value of *ptr.  These operations are
@@ -326,21 +326,41 @@ and memory barriers, and the equivalents in QEMU:
  use a boxed atomic_t type; atomic operations in QEMU are polymorphic
  and use normal C types.

- atomic_read and atomic_set in Linux give no guarantee at all;
-  atomic_read and atomic_set in QEMU include a compiler barrier
-  (similar to the ACCESS_ONCE macro in Linux).
+- Originally, atomic_read and atomic_set in Linux gave no guarantee
+  at all. Linux 4.1 updated them to implement volatile
+  semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE).

- most atomic read-modify-write operations in Linux return void;
-  in QEMU, all of them return the old value of the variable.
+  QEMU's atomic_read/set implement, if the compiler supports it, C11
+  atomic relaxed semantics, and volatile semantics otherwise.
+  Both semantics prevent the compiler from doing certain transformations;
+  the difference is that atomic accesses are guaranteed to be atomic,
+  while volatile accesses aren't. Thus, in the volatile case we just cross
+  our fingers hoping that the compiler will generate atomic accesses,
+  since we assume the variables passed are machine-word sized and
+  properly aligned.
+  No barriers are implied by atomic_read/set in either Linux or QEMU.
+
+- atomic read-modify-write operations in Linux are of three kinds:
+
+         atomic_OP          returns void
+         atomic_OP_return   returns new value of the variable
+         atomic_fetch_OP    returns the old value of the variable
+         atomic_cmpxchg     returns the old value of the variable
+
+  In QEMU, the second kind does not exist.  Currently Linux has
+  atomic_fetch_or only.  QEMU provides and, or, inc, dec, add, sub.

 - different atomic read-modify-write operations in Linux imply
  a different set of memory barriers; in QEMU, all of them enforce
  sequential consistency, which means they imply full memory barriers
  before and after the operation.

- Linux does not have an equivalent of atomic_mb_read() and
-  atomic_mb_set().  In particular, note that set_mb() is a little
-  weaker than atomic_mb_set().
+- Linux does not have an equivalent of atomic_mb_set().  In particular,
+  note that smp_store_mb() is a little weaker than atomic_mb_set().
+  atomic_mb_read() compiles to the same instructions as Linux's
+  smp_load_acquire(), but this should be treated as an implementation
+  detail.  If required, QEMU might later add atomic_load_acquire() and
+  atomic_store_release() macros.


 SOURCES
--- a/docs/build-system.txt
+++ b/docs/build-system.txt
@@ -438,6 +438,11 @@ top level Makefile, so anything defined in this file will influence the
 entire build system. Care needs to be taken when writing rules for tests
 to ensure they only apply to the unit test execution / build.

+- tests/docker/Makefile.include
+
+Rules for Docker tests. Like tests/Makefile, this file is included
+directly by the top level Makefile, anything defined in this file will
+influence the entire build system.

 - po/Makefile

--- a/docs/igd-assign.txt
+++ b/docs/igd-assign.txt
@@ -0,0 +1,133 @@
+Intel Graphics Device (IGD) assignment with vfio-pci
+====================================================
+
+IGD has two different modes for assignment using vfio-pci:
+
+1) Universal Pass-Through (UPT) mode:
+
+   In this mode the IGD device is added as a *secondary* (ie. non-primary)
+   graphics device in combination with an emulated primary graphics device.
+   This mode *requires* guest driver support to remove the external
+   dependencies generally associated with IGD (see below).  Those guest
+   drivers only support this mode for Broadwell and newer IGD, according to
+   Intel.  Additionally, this mode by default, and as officially supported
+   by Intel, does not support direct video output.  The intention is to use
+   this mode either to provide hardware acceleration to the emulated graphics
+   or to use this mode in combination with guest-based remote access software,
+   for example VNC (see below for optional output support).  This mode
+   theoretically has no device specific handling dependencies on vfio-pci or
+   the VM firmware.
+
+2) "Legacy" mode:
+
+   In this mode the IGD device is intended to be the primary and exclusive
+   graphics device in the VM[1], as such QEMU does not facilitate any sort
+   of remote graphics to the VM in this mode.  A connected physical monitor
+   is the intended output device for IGD.  This mode includes several
+   requirements and restrictions:
+
+    * IGD must be given address 02.0 on the PCI root bus in the VM
+    * The host kernel must support vfio extensions for IGD (v4.6)
+    * vfio VGA support very likely needs to be enabled in the host kernel
+    * The VM firmware must support specific fw_cfg enablers for IGD
+    * The VM machine type must support a PCI host bridge at 00.0 (standard)
+    * The VM machine type must provide or allow to be created a special
+      ISA/LPC bridge device (vfio-pci-igd-lpc-bridge) on the root bus at
+      PCI address 1f.0.
+    * The IGD device must have a VGA ROM, either provided via the romfile
+      option or loaded automatically through vfio (standard).  rombar=0
+      will disable legacy mode support.
+    * Hotplug of the IGD device is not supported.
+    * The IGD device must be a SandyBridge or newer model device.
+
+For either mode, depending on the host kernel, the i915 driver in the host
+may generate faults and errors upon re-binding to an IGD device after it
+has been assigned to a VM.  It's therefore generally recommended to prevent
+such driver binding unless the host driver is known to work well for this.
+There are numerous ways to do this, i915 can be blacklisted on the host,
+the driver_override option can be used to ensure that only vfio-pci can bind
+to the device on the host[2], virsh nodedev-detach can be used to bind the
+device to vfio drivers and then managed='no' set in the VM xml to prevent
+re-binding to i915, etc.  Also note that IGD is also typically the primary
+graphics in the host and special options may be required beyond simply
+blacklisting i915 or using pci-stub/vfio-pci to take ownership of IGD as a
+PCI class device.  Lower level drivers exist that may still claim the device.
+It may therefore be necessary to use kernel boot options video=vesafb:off or
+video=efifb:off (depending on host BIOS/UEFI) or these can be combined to
+a catch-all, video=vesafb:off,efifb:off.  Error messages such as:
+
+    Failed to mmap 0000:00:02.0 BAR <>. Performance may be slow
+
+are a good indicator that such a problem exists.  The host files /proc/iomem
+and /proc/ioports are often useful for identifying drivers consuming ranges
+of the device to cause such conflicts.
+
+Additionally, IGD device are known to generate small numbers of DMAR faults
+when initially assigned.  It is believed that this is simply the IGD attempting
+to access the reserved GTT space after reset, which it no longer has access to
+when accessed from userspace.  So long as the DMAR faults are small in number
+and most importantly, not ongoing, these are not an indication of an error.
+
+Additionally++, analog VGA output (as opposed to digital outputs like HDMI,
+DVI, or DisplayPort) may be unsupported in some use cases.  In the author's
+experience, even DP to VGA adapters can be troublesome while adapters between
+digital formats work well.
+
+Usage
+=====
+The intention is for IGD assignment to be transparent for users and thus for
+management tools like libvirt.  To make use of legacy mode, simply remove all
+other graphics options and use "-nographic" and either "-vga none" or
+"-nodefaults", along with adding the device using vfio-pci:
+
+    -device vfio-pci,host=00:02.0,id=hostdev0,bus=pci.0,addr=0x2
+
+For UPT mode, retain the default emulated graphics and simply add the vfio-pci
+device making use of any other bus address other than 02.0.  libvirt will
+default to assigning the device a UPT compatible address while legacy mode
+users will need to manually edit the XML if using a tool like virt-manager
+where the VM device address is not expressly specified.
+
+An experimental vfio-pci option also exists to enable OpRegion, and thus
+external monitor support, for UPT mode.  This can be enabled by adding
+"x-igd-opregion=on" to the vfio-pci device options for the IGD device.  As
+with legacy mode, this requires the host to support features introduced in
+the v4.6 kernel.  If Intel chooses to embrace this support, the option may
+be made non-experimental in the future, opening it to libvirt support.
+
+Developer ABI
+=============
+Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware:
+
+1) "etc/igd-opregion"
+
+   This fw_cfg file exposes the OpRegion for the IGD device.  A reserved
+   region should be created below 4GB (recommended 4KB alignment), sized
+   sufficient for the fw_cfg file size, and the content of this file copied
+   to it.  The dword based address of this reserved memory region must also
+   be written to the ASLS register at offset 0xFC on the IGD device.  It is
+   recommended that firmware should make use of this fw_cfg entry for any
+   PCI class VGA device with Intel vendor ID.  Multiple of such devices
+   within a VM is undefined.
+
+2) "etc/igd-bdsm-size"
+
+   This fw_cfg file contains an 8-byte, little endian integer indicating
+   the size of the reserved memory region required for IGD stolen memory.
+   Firmware must allocate a reserved memory below 4GB with required 1MB
+   alignment equal to this size.  Additionally the base address of this
+   reserved region must be written to the dword BDSM register in PCI config
+   space of the IGD device at offset 0x5C.  As this support is related to
+   running the IGD ROM, which has other dependencies on the device appearing
+   at guest address 00:02.0, it's expected that this fw_cfg file is only
+   relevant to a single PCI class VGA device with Intel vendor ID, appearing
+   at PCI bus address 00:02.0.
+
+Footnotes
+=========
+[1] Nothing precludes adding additional emulated or assigned graphics devices
+    as non-primary, other than the combination typically not working.  I only
+    intend to set user expectations, others are welcome to find working
+    combinations or fix whatever issues prevent this from working in the common
+    case.
+[2] # echo "vfio-pci" > /sys/bus/pci/devices/0000:00:02.0/driver_override
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -403,8 +403,8 @@ listen thread:                     --- page -- page -- page -- page -- page --

 On receipt of CMD_PACKAGED (1)
   All the data associated with the package - the ( ... ) section in the
-diagram - is read into memory (into a QEMUSizedBuffer), and the main thread
-recurses into qemu_loadvm_state_main to process the contents of the package (2)
+diagram - is read into memory, and the main thread recurses into
+qemu_loadvm_state_main to process the contents of the package (2)
 which contains commands (3,6) and devices (4...)

 On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package)
--- a/docs/qapi-code-gen.txt
+++ b/docs/qapi-code-gen.txt
@@ -899,10 +899,16 @@ Example:
            goto out_obj;
        }
        visit_type_UserDefOne_members(v, *obj, &err);
-        error_propagate(errp, err);
-        err = NULL;
+        if (err) {
+            goto out_obj;
+        }
+        visit_check_struct(v, &err);
    out_obj:
-        visit_end_struct(v, &err);
+        visit_end_struct(v);
+        if (err && visit_is_input(v)) {
+            qapi_free_UserDefOne(*obj);
+            *obj = NULL;
+        }
    out:
        error_propagate(errp, err);
    }
@@ -910,21 +916,27 @@ Example:
    void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp)
    {
        Error *err = NULL;
-        GenericList *i, **prev;
+        UserDefOneList *tail;
+        size_t size = sizeof(**obj);

-        visit_start_list(v, name, &err);
+        visit_start_list(v, name, (GenericList **)obj, size, &err);
        if (err) {
            goto out;
        }

-        for (prev = (GenericList **)obj;
-             !err && (i = visit_next_list(v, prev, sizeof(**obj))) != NULL;
-             prev = &i) {
-            UserDefOneList *native_i = (UserDefOneList *)i;
-            visit_type_UserDefOne(v, NULL, &native_i->value, &err);
+        for (tail = *obj; tail;
+             tail = (UserDefOneList *)visit_next_list(v, (GenericList *)tail, size)) {
+            visit_type_UserDefOne(v, NULL, &tail->value, &err);
+            if (err) {
+                break;
+            }
        }

        visit_end_list(v);
+        if (err && visit_is_input(v)) {
+            qapi_free_UserDefOneList(*obj);
+            *obj = NULL;
+        }
    out:
        error_propagate(errp, err);
    }
@@ -996,13 +1008,21 @@ Example:
    {
        Error *err = NULL;
        UserDefOne *retval;
-        QmpInputVisitor *qiv = qmp_input_visitor_new_strict(QOBJECT(args));
+        QmpInputVisitor *qiv = qmp_input_visitor_new(QOBJECT(args), true);
        QapiDeallocVisitor *qdv;
        Visitor *v;
        UserDefOneList *arg1 = NULL;

        v = qmp_input_get_visitor(qiv);
+        visit_start_struct(v, NULL, NULL, 0, &err);
+        if (err) {
+            goto out;
+        }
        visit_type_UserDefOneList(v, "arg1", &arg1, &err);
+        if (!err) {
+            visit_check_struct(v, &err);
+        }
+        visit_end_struct(v);
        if (err) {
            goto out;
        }
@@ -1019,7 +1039,9 @@ Example:
        qmp_input_visitor_cleanup(qiv);
        qdv = qapi_dealloc_visitor_new();
        v = qapi_dealloc_get_visitor(qdv);
+        visit_start_struct(v, NULL, NULL, 0, NULL);
        visit_type_UserDefOneList(v, "arg1", &arg1, NULL);
+        visit_end_struct(v);
        qapi_dealloc_visitor_cleanup(qdv);
    }

--- a/docs/specs/rocker.txt
+++ b/docs/specs/rocker.txt
@@ -303,7 +303,7 @@ Endianness
 ----------

 Device registers are hard-coded to little-endian (LE).  The driver should
-convert to/from host endianess to LE for device register accesses.
+convert to/from host endianness to LE for device register accesses.

 Descriptors are LE.  Descriptor buffer TLVs will have LE type and length
 fields, but the value field can either be LE or network-byte-order, depending
--- a/docs/throttle.txt
+++ b/docs/throttle.txt
@@ -10,7 +10,7 @@ Introduction
 ------------
 QEMU includes a throttling module that can be used to set limits to
 I/O operations. The code itself is generic and independent of the I/O
-units, but it is currenly used to limit the number of bytes per second
+units, but it is currently used to limit the number of bytes per second
 and operations per second (IOPS) when performing disk I/O.

 This document explains how to use the throttling code in QEMU, and how
--- a/exec.c
+++ b/exec.c
@@ -24,24 +24,26 @@

 #include "qemu/cutils.h"
 #include "cpu.h"
+#include "exec/exec-all.h"
 #include "tcg.h"
-#include "hw/hw.h"
+#include "hw/qdev-core.h"
 #if !defined(CONFIG_USER_ONLY)
 #include "hw/boards.h"
+#include "hw/xen/xen.h"
 #endif
-#include "hw/qdev.h"
 #include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
-#include "hw/xen/xen.h"
 #include "qemu/timer.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
-#include "exec/memory.h"
-#include "sysemu/dma.h"
-#include "exec/address-spaces.h"
 #if defined(CONFIG_USER_ONLY)
 #include <qemu.h>
 #else /* !CONFIG_USER_ONLY */
+#include "hw/hw.h"
+#include "exec/memory.h"
+#include "exec/ioport.h"
+#include "sysemu/dma.h"
+#include "exec/address-spaces.h"
 #include "sysemu/xen-mapcache.h"
 #include "trace.h"
 #endif
@@ -55,6 +57,8 @@
 #include "exec/ram_addr.h"
 #include "exec/log.h"

+#include "migration/vmstate.h"
+
 #include "qemu/range.h"
 #ifndef _WIN32
 #include "qemu/mmap-alloc.h"
@@ -610,15 +614,9 @@ static int cpu_get_free_index(Error **errp)
    return cpu;
 }

-void cpu_exec_exit(CPUState *cpu)
+static void cpu_release_index(CPUState *cpu)
 {
-    if (cpu->cpu_index == -1) {
-        /* cpu_index was never allocated by this @cpu or was already freed. */
-        return;
-    }
-
    bitmap_clear(cpu_index_map, cpu->cpu_index, 1);
-    cpu->cpu_index = -1;
 }
 #else

@@ -633,15 +631,45 @@ static int cpu_get_free_index(Error **errp)
    return cpu_index;
 }

-void cpu_exec_exit(CPUState *cpu)
+static void cpu_release_index(CPUState *cpu)
 {
+    return;
 }
 #endif

+void cpu_exec_exit(CPUState *cpu)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+
+#if defined(CONFIG_USER_ONLY)
+    cpu_list_lock();
+#endif
+    if (cpu->cpu_index == -1) {
+        /* cpu_index was never allocated by this @cpu or was already freed. */
+#if defined(CONFIG_USER_ONLY)
+        cpu_list_unlock();
+#endif
+        return;
+    }
+
+    QTAILQ_REMOVE(&cpus, cpu, node);
+    cpu_release_index(cpu);
+    cpu->cpu_index = -1;
+#if defined(CONFIG_USER_ONLY)
+    cpu_list_unlock();
+#endif
+
+    if (cc->vmsd != NULL) {
+        vmstate_unregister(NULL, cc->vmsd, cpu);
+    }
+    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
+        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
+    }
+}
+
 void cpu_exec_init(CPUState *cpu, Error **errp)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
-    int cpu_index;
    Error *local_err = NULL;

    cpu->as = NULL;
@@ -668,7 +696,7 @@ void cpu_exec_init(CPUState *cpu, Error **errp)
 #if defined(CONFIG_USER_ONLY)
    cpu_list_lock();
 #endif
-    cpu_index = cpu->cpu_index = cpu_get_free_index(&local_err);
+    cpu->cpu_index = cpu_get_free_index(&local_err);
    if (local_err) {
        error_propagate(errp, local_err);
 #if defined(CONFIG_USER_ONLY)
@@ -678,14 +706,16 @@ void cpu_exec_init(CPUState *cpu, Error **errp)
    }
    QTAILQ_INSERT_TAIL(&cpus, cpu, node);
 #if defined(CONFIG_USER_ONLY)
+    (void) cc;
    cpu_list_unlock();
-#endif
+#else
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
-        vmstate_register(NULL, cpu_index, &vmstate_cpu_common, cpu);
+        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
    }
    if (cc->vmsd != NULL) {
-        vmstate_register(NULL, cpu_index, cc->vmsd, cpu);
+        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
    }
+#endif
 }

 #if defined(CONFIG_USER_ONLY)
@@ -1043,8 +1073,7 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu,

    if (memory_region_is_ram(section->mr)) {
        /* Normal RAM.  */
-        iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
-            + xlat;
+        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
        if (!section->readonly) {
            iotlb |= PHYS_SECTION_NOTDIRTY;
        } else {
@@ -1296,7 +1325,7 @@ static void *file_ram_alloc(RAMBlock *block,
    }

    page_size = qemu_fd_getpagesize(fd);
-    block->mr->align = page_size;
+    block->mr->align = MAX(page_size, QEMU_VMALLOC_ALIGN);

    if (memory < page_size) {
        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
@@ -1317,7 +1346,8 @@ static void *file_ram_alloc(RAMBlock *block,
        perror("ftruncate");
    }

-    area = qemu_ram_mmap(fd, memory, page_size, block->flags & RAM_SHARED);
+    area = qemu_ram_mmap(fd, memory, block->mr->align,
+                         block->flags & RAM_SHARED);
    if (area == MAP_FAILED) {
        error_setg_errno(errp, errno,
                         "unable to map backing store for guest RAM");
@@ -1407,34 +1437,16 @@ static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
    }
 }

-/* Called within an RCU critical section, or while the ramlist lock
- * is held.
- */
-static RAMBlock *find_ram_block(ram_addr_t addr)
-{
-    RAMBlock *block;
-
-    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-        if (block->offset == addr) {
-            return block;
-        }
-    }
-
-    return NULL;
-}
-
 const char *qemu_ram_get_idstr(RAMBlock *rb)
 {
    return rb->idstr;
 }

 /* Called with iothread lock held.  */
-void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
+void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
 {
-    RAMBlock *new_block, *block;
+    RAMBlock *block;

-    rcu_read_lock();
-    new_block = find_ram_block(addr);
    assert(new_block);
    assert(!new_block->idstr[0]);

@@ -1447,8 +1459,10 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
    }
    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);

+    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-        if (block != new_block && !strcmp(block->idstr, new_block->idstr)) {
+        if (block != new_block &&
+            !strcmp(block->idstr, new_block->idstr)) {
            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
                    new_block->idstr);
            abort();
@@ -1458,21 +1472,15 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
 }

 /* Called with iothread lock held.  */
-void qemu_ram_unset_idstr(ram_addr_t addr)
+void qemu_ram_unset_idstr(RAMBlock *block)
 {
-    RAMBlock *block;
-
    /* FIXME: arch_init.c assumes that this is not called throughout
     * migration.  Ignore the problem since hot-unplug during migration
     * does not work anyway.
     */
-
-    rcu_read_lock();
-    block = find_ram_block(addr);
    if (block) {
        memset(block->idstr, 0, sizeof(block->idstr));
    }
-    rcu_read_unlock();
 }

 static int memory_try_enable_merging(void *addr, size_t len)
@@ -1492,10 +1500,8 @@ static int memory_try_enable_merging(void *addr, size_t len)
 * resize callback to update device state and/or add assertions to detect
 * misuse, if necessary.
 */
-int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
+int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
 {
-    RAMBlock *block = find_ram_block(base);
-
    assert(block);

    newsize = HOST_PAGE_ALIGN(newsize);
@@ -1836,40 +1842,6 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
 }
 #endif /* !_WIN32 */

-int qemu_get_ram_fd(ram_addr_t addr)
-{
-    RAMBlock *block;
-    int fd;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
-    fd = block->fd;
-    rcu_read_unlock();
-    return fd;
-}
-
-void qemu_set_ram_fd(ram_addr_t addr, int fd)
-{
-    RAMBlock *block;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
-    block->fd = fd;
-    rcu_read_unlock();
-}
-
-void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
-{
-    RAMBlock *block;
-    void *ptr;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
-    ptr = ramblock_ptr(block, 0);
-    rcu_read_unlock();
-    return ptr;
-}
-
 /* Return a host pointer to ram allocated with qemu_ram_alloc.
 * This should not be used for general purpose DMA.  Use address_space_map
 * or address_space_rw instead. For local memory (e.g. video ram) that the
@@ -1877,12 +1849,13 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
 *
 * Called within RCU critical section.
 */
-void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
+void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
 {
    RAMBlock *block = ram_block;

    if (block == NULL) {
        block = qemu_get_ram_block(addr);
+        addr -= block->offset;
    }

    if (xen_enabled() && block->host == NULL) {
@@ -1896,10 +1869,10 @@ void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)

        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }
-    return ramblock_ptr(block, addr - block->offset);
+    return ramblock_ptr(block, addr);
 }

-/* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
+/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
 * but takes a size argument.
 *
 * Called within RCU critical section.
@@ -1908,16 +1881,15 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
                                 hwaddr *size)
 {
    RAMBlock *block = ram_block;
-    ram_addr_t offset_inside_block;
    if (*size == 0) {
        return NULL;
    }

    if (block == NULL) {
        block = qemu_get_ram_block(addr);
+        addr -= block->offset;
    }
-    offset_inside_block = addr - block->offset;
-    *size = MIN(*size, block->max_length - offset_inside_block);
+    *size = MIN(*size, block->max_length - addr);

    if (xen_enabled() && block->host == NULL) {
        /* We need to check if the requested address is in the RAM
@@ -1931,7 +1903,7 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }

-    return ramblock_ptr(block, offset_inside_block);
+    return ramblock_ptr(block, addr);
 }

 /*
@@ -1952,16 +1924,16 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
 * ram_addr_t.
 */
 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
-                                   ram_addr_t *ram_addr,
                                   ram_addr_t *offset)
 {
    RAMBlock *block;
    uint8_t *host = ptr;

    if (xen_enabled()) {
+        ram_addr_t ram_addr;
        rcu_read_lock();
-        *ram_addr = xen_ram_addr_from_mapcache(ptr);
-        block = qemu_get_ram_block(*ram_addr);
+        ram_addr = xen_ram_addr_from_mapcache(ptr);
+        block = qemu_get_ram_block(ram_addr);
        if (block) {
            *offset = (host - block->host);
        }
@@ -1993,7 +1965,6 @@ found:
    if (round_offset) {
        *offset &= TARGET_PAGE_MASK;
    }
-    *ram_addr = block->offset + *offset;
    rcu_read_unlock();
    return block;
 }
@@ -2020,18 +1991,17 @@ RAMBlock *qemu_ram_block_by_name(const char *name)

 /* Some of the softmmu routines need to translate from a host pointer
   (typically a TLB entry) back to a ram offset.  */
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
 {
    RAMBlock *block;
-    ram_addr_t offset; /* Not used */
-
-    block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset);
+    ram_addr_t offset;

+    block = qemu_ram_block_from_host(ptr, false, &offset);
    if (!block) {
-        return NULL;
+        return RAM_ADDR_INVALID;
    }

-    return block->mr;
+    return block->offset + offset;
 }

 /* Called within RCU critical section.  */
@@ -2043,13 +2013,13 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
    }
    switch (size) {
    case 1:
-        stb_p(qemu_get_ram_ptr(NULL, ram_addr), val);
+        stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
        break;
    case 2:
-        stw_p(qemu_get_ram_ptr(NULL, ram_addr), val);
+        stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
        break;
    case 4:
-        stl_p(qemu_get_ram_ptr(NULL, ram_addr), val);
+        stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
        break;
    default:
        abort();
@@ -2087,7 +2057,7 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
    target_ulong pc, cs_base;
    target_ulong vaddr;
    CPUWatchpoint *wp;
-    int cpu_flags;
+    uint32_t cpu_flags;

    if (cpu->watchpoint_hit) {
        /* We re-entered the check after replacing the TB. Now raise
@@ -2511,6 +2481,8 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
                                     hwaddr length)
 {
    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+    addr += memory_region_get_ram_addr(mr);
+
    /* No early return if dirty_log_mask is or becomes 0, because
     * cpu_physical_memory_set_dirty_range will still call
     * xen_modified_memory.
@@ -2623,9 +2595,8 @@ static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
                abort();
            }
        } else {
-            addr1 += memory_region_get_ram_addr(mr);
            /* RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
            memcpy(ptr, buf, l);
            invalidate_and_set_dirty(mr, addr1, l);
        }
@@ -2716,8 +2687,7 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
            }
        } else {
            /* RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block,
-                                   memory_region_get_ram_addr(mr) + addr1);
+            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
            memcpy(buf, ptr, l);
        }

@@ -2800,9 +2770,8 @@ static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
              memory_region_is_romd(mr))) {
            l = memory_access_size(mr, l, addr1);
        } else {
-            addr1 += memory_region_get_ram_addr(mr);
            /* ROM/RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
            switch (type) {
            case WRITE_DATA:
                memcpy(ptr, buf, l);
@@ -2960,7 +2929,6 @@ void *address_space_map(AddressSpace *as,
    hwaddr done = 0;
    hwaddr l, xlat, base;
    MemoryRegion *mr, *this_mr;
-    ram_addr_t raddr;
    void *ptr;

    if (len == 0) {
@@ -2995,7 +2963,6 @@ void *address_space_map(AddressSpace *as,
    }

    base = xlat;
-    raddr = memory_region_get_ram_addr(mr);

    for (;;) {
        len -= l;
@@ -3014,7 +2981,7 @@ void *address_space_map(AddressSpace *as,

    memory_region_ref(mr);
    *plen = done;
-    ptr = qemu_ram_ptr_length(mr->ram_block, raddr + base, plen);
+    ptr = qemu_ram_ptr_length(mr->ram_block, base, plen);
    rcu_read_unlock();

    return ptr;
@@ -3031,7 +2998,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
        MemoryRegion *mr;
        ram_addr_t addr1;

-        mr = qemu_ram_addr_from_host(buffer, &addr1);
+        mr = memory_region_from_host(buffer, &addr1);
        assert(mr != NULL);
        if (is_write) {
            invalidate_and_set_dirty(mr, addr1, access_len);
@@ -3098,10 +3065,7 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
 #endif
    } else {
        /* RAM case */
-        ptr = qemu_get_ram_ptr(mr->ram_block,
-                               (memory_region_get_ram_addr(mr)
-                                & TARGET_PAGE_MASK)
-                               + addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            val = ldl_le_p(ptr);
@@ -3194,10 +3158,7 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
 #endif
    } else {
        /* RAM case */
-        ptr = qemu_get_ram_ptr(mr->ram_block,
-                               (memory_region_get_ram_addr(mr)
-                                & TARGET_PAGE_MASK)
-                               + addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            val = ldq_le_p(ptr);
@@ -3310,10 +3271,7 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as,
 #endif
    } else {
        /* RAM case */
-        ptr = qemu_get_ram_ptr(mr->ram_block,
-                               (memory_region_get_ram_addr(mr)
-                                & TARGET_PAGE_MASK)
-                               + addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            val = lduw_le_p(ptr);
@@ -3395,13 +3353,13 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,

        r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
    } else {
-        addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
-        ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        stl_p(ptr, val);

        dirty_log_mask = memory_region_get_dirty_log_mask(mr);
        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
-        cpu_physical_memory_set_dirty_range(addr1, 4, dirty_log_mask);
+        cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
+                                            4, dirty_log_mask);
        r = MEMTX_OK;
    }
    if (result) {
@@ -3450,8 +3408,7 @@ static inline void address_space_stl_internal(AddressSpace *as,
        r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
    } else {
        /* RAM case */
-        addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
-        ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            stl_le_p(ptr, val);
@@ -3560,8 +3517,7 @@ static inline void address_space_stw_internal(AddressSpace *as,
        r = memory_region_dispatch_write(mr, addr1, val, 2, attrs);
    } else {
        /* RAM case */
-        addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
-        ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            stw_le_p(ptr, val);
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -19,7 +19,7 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/cutils.h"
-
+#include "cpu.h"
 #ifdef CONFIG_USER_ONLY
 #include "qemu.h"
 #else
@@ -35,6 +35,7 @@
 #include "qemu/sockets.h"
 #include "sysemu/kvm.h"
 #include "exec/semihost.h"
+#include "exec/exec-all.h"

 #ifdef CONFIG_USER_ONLY
 #define GDB_ATTACHED "0"
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1008,7 +1008,7 @@ ETEXI

    {
        .name       = "migrate_set_parameter",
-        .args_type  = "parameter:s,value:i",
+        .args_type  = "parameter:s,value:s",
        .params     = "parameter value",
        .help       = "Set the parameter for migration",
        .mhandler.cmd = hmp_migrate_set_parameter,
--- a/hmp.c
+++ b/hmp.c
@@ -35,6 +35,7 @@
 #include "block/qapi.h"
 #include "qemu-io.h"
 #include "qemu/cutils.h"
+#include "qemu/error-report.h"

 #ifdef CONFIG_SPICE
 #include <spice/enums.h>
@@ -168,8 +169,15 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
    }

    if (info->has_status) {
-        monitor_printf(mon, "Migration status: %s\n",
+        monitor_printf(mon, "Migration status: %s",
                       MigrationStatus_lookup[info->status]);
+        if (info->status == MIGRATION_STATUS_FAILED &&
+            info->has_error_desc) {
+            monitor_printf(mon, " (%s)\n", info->error_desc);
+        } else {
+            monitor_printf(mon, "\n");
+        }
+
        monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n",
                       info->total_time);
        if (info->has_expected_downtime) {
@@ -235,9 +243,9 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
                       info->xbzrle_cache->overflow);
    }

-    if (info->has_x_cpu_throttle_percentage) {
+    if (info->has_cpu_throttle_percentage) {
        monitor_printf(mon, "cpu throttle percentage: %" PRIu64 "\n",
-                       info->x_cpu_throttle_percentage);
+                       info->cpu_throttle_percentage);
    }

    qapi_free_MigrationInfo(info);
@@ -281,11 +289,17 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
            MigrationParameter_lookup[MIGRATION_PARAMETER_DECOMPRESS_THREADS],
            params->decompress_threads);
        monitor_printf(mon, " %s: %" PRId64,
-            MigrationParameter_lookup[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL],
-            params->x_cpu_throttle_initial);
+            MigrationParameter_lookup[MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL],
+            params->cpu_throttle_initial);
        monitor_printf(mon, " %s: %" PRId64,
-            MigrationParameter_lookup[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT],
-            params->x_cpu_throttle_increment);
+            MigrationParameter_lookup[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT],
+            params->cpu_throttle_increment);
+        monitor_printf(mon, " %s: '%s'",
+            MigrationParameter_lookup[MIGRATION_PARAMETER_TLS_CREDS],
+            params->tls_creds ? : "");
+        monitor_printf(mon, " %s: '%s'",
+            MigrationParameter_lookup[MIGRATION_PARAMETER_TLS_HOSTNAME],
+            params->tls_hostname ? : "");
        monitor_printf(mon, "\n");
    }

@@ -1235,13 +1249,17 @@ void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict)
 void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
 {
    const char *param = qdict_get_str(qdict, "parameter");
-    int value = qdict_get_int(qdict, "value");
+    const char *valuestr = qdict_get_str(qdict, "value");
+    long valueint = 0;
    Error *err = NULL;
    bool has_compress_level = false;
    bool has_compress_threads = false;
    bool has_decompress_threads = false;
-    bool has_x_cpu_throttle_initial = false;
-    bool has_x_cpu_throttle_increment = false;
+    bool has_cpu_throttle_initial = false;
+    bool has_cpu_throttle_increment = false;
+    bool has_tls_creds = false;
+    bool has_tls_hostname = false;
+    bool use_int_value = false;
    int i;

    for (i = 0; i < MIGRATION_PARAMETER__MAX; i++) {
@@ -1249,25 +1267,46 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
            switch (i) {
            case MIGRATION_PARAMETER_COMPRESS_LEVEL:
                has_compress_level = true;
+                use_int_value = true;
                break;
            case MIGRATION_PARAMETER_COMPRESS_THREADS:
                has_compress_threads = true;
+                use_int_value = true;
                break;
            case MIGRATION_PARAMETER_DECOMPRESS_THREADS:
                has_decompress_threads = true;
+                use_int_value = true;
                break;
-            case MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL:
-                has_x_cpu_throttle_initial = true;
+            case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL:
+                has_cpu_throttle_initial = true;
+                use_int_value = true;
                break;
-            case MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT:
-                has_x_cpu_throttle_increment = true;
+            case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT:
+                has_cpu_throttle_increment = true;
+                break;
+            case MIGRATION_PARAMETER_TLS_CREDS:
+                has_tls_creds = true;
+                break;
+            case MIGRATION_PARAMETER_TLS_HOSTNAME:
+                has_tls_hostname = true;
                break;
            }
-            qmp_migrate_set_parameters(has_compress_level, value,
-                                       has_compress_threads, value,
-                                       has_decompress_threads, value,
-                                       has_x_cpu_throttle_initial, value,
-                                       has_x_cpu_throttle_increment, value,
+
+            if (use_int_value) {
+                if (qemu_strtol(valuestr, NULL, 10, &valueint) < 0) {
+                    error_setg(&err, "Unable to parse '%s' as an int",
+                               valuestr);
+                    goto cleanup;
+                }
+            }
+
+            qmp_migrate_set_parameters(has_compress_level, valueint,
+                                       has_compress_threads, valueint,
+                                       has_decompress_threads, valueint,
+                                       has_cpu_throttle_initial, valueint,
+                                       has_cpu_throttle_increment, valueint,
+                                       has_tls_creds, valuestr,
+                                       has_tls_hostname, valuestr,
                                       &err);
            break;
        }
@@ -1277,6 +1316,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
        error_setg(&err, QERR_INVALID_PARAMETER, param);
    }

+ cleanup:
    if (err) {
        error_report_err(err);
    }
@@ -1533,6 +1573,9 @@ static void hmp_migrate_status_cb(void *opaque)
        if (status->is_block_migration) {
            monitor_printf(status->mon, "\n");
        }
+        if (info->has_error_desc) {
+            error_report("%s", info->error_desc);
+        }
        monitor_resume(status->mon);
        timer_del(status->timer);
        g_free(status);
--- a/hw/9pfs/coth.h
+++ b/hw/9pfs/coth.h
@@ -47,7 +47,6 @@
    } while (0)

 extern void co_run_in_worker_bh(void *);
-extern int v9fs_init_worker_threads(void);
 extern int v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
 extern int v9fs_co_readdir_r(V9fsPDU *, V9fsFidState *,
                           struct dirent *, struct dirent **result);
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1563,3 +1563,14 @@ build_rsdt(GArray *table_data, GArray *linker, GArray *table_offsets,
    build_header(linker, table_data,
                 (void *)rsdt, "RSDT", rsdt_len, 1, oem_id, oem_table_id);
 }
+
+void build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base,
+                       uint64_t len, int node, MemoryAffinityFlags flags)
+{
+    numamem->type = ACPI_SRAT_MEMORY;
+    numamem->length = sizeof(*numamem);
+    numamem->proximity = cpu_to_le32(node);
+    numamem->flags = cpu_to_le32(flags);
+    numamem->base_addr = cpu_to_le64(base);
+    numamem->range_length = cpu_to_le64(len);
+}
--- a/hw/acpi/core.c
+++ b/hw/acpi/core.c
@@ -491,6 +491,12 @@ void acpi_pm_tmr_update(ACPIREGS *ar, bool enable)
    }
 }

+static inline int64_t acpi_pm_tmr_get_clock(void)
+{
+    return muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), PM_TIMER_FREQUENCY,
+                    NANOSECONDS_PER_SECOND);
+}
+
 void acpi_pm_tmr_calc_overflow_time(ACPIREGS *ar)
 {
    int64_t d = acpi_pm_tmr_get_clock();
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -378,17 +378,19 @@ struct NvdimmDsmIn {
    uint32_t function;
    /* the remaining size in the page is used by arg3. */
    union {
-        uint8_t arg3[0];
+        uint8_t arg3[4084];
    };
 } QEMU_PACKED;
 typedef struct NvdimmDsmIn NvdimmDsmIn;
+QEMU_BUILD_BUG_ON(sizeof(NvdimmDsmIn) != 4096);

 struct NvdimmDsmOut {
    /* the size of buffer filled by QEMU. */
    uint32_t len;
-    uint8_t data[0];
+    uint8_t data[4092];
 } QEMU_PACKED;
 typedef struct NvdimmDsmOut NvdimmDsmOut;
+QEMU_BUILD_BUG_ON(sizeof(NvdimmDsmOut) != 4096);

 struct NvdimmDsmFunc0Out {
    /* the size of buffer filled by QEMU. */
@@ -424,8 +426,8 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
     * can change its content while we are doing DSM emulation. Avoid
     * this by copying DSM memory to QEMU local memory.
     */
-    in = g_malloc(TARGET_PAGE_SIZE);
-    cpu_physical_memory_read(dsm_mem_addr, in, TARGET_PAGE_SIZE);
+    in = g_new(NvdimmDsmIn, 1);
+    cpu_physical_memory_read(dsm_mem_addr, in, sizeof(*in));

    le32_to_cpus(&in->revision);
    le32_to_cpus(&in->function);
@@ -475,7 +477,7 @@ void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
    memory_region_add_subregion(io, NVDIMM_ACPI_IO_BASE, &state->io_mr);

    state->dsm_mem = g_array_new(false, true /* clear */, 1);
-    acpi_data_push(state->dsm_mem, TARGET_PAGE_SIZE);
+    acpi_data_push(state->dsm_mem, sizeof(NvdimmDsmIn));
    fw_cfg_add_file(fw_cfg, NVDIMM_DSM_MEM_FILE, state->dsm_mem->data,
                    state->dsm_mem->len);
 }
@@ -608,7 +610,7 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
    aml_append(dev, aml_operation_region("NPIO", AML_SYSTEM_IO,
               aml_int(NVDIMM_ACPI_IO_BASE), NVDIMM_ACPI_IO_LEN));
    aml_append(dev, aml_operation_region("NRAM", AML_SYSTEM_MEMORY,
-               aml_name(NVDIMM_ACPI_MEM_ADDR), TARGET_PAGE_SIZE));
+               aml_name(NVDIMM_ACPI_MEM_ADDR), sizeof(NvdimmDsmIn)));

    /*
     * DSM notifier:
@@ -642,8 +644,7 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
    aml_append(field, aml_named_field("FUNC",
               sizeof(typeof_field(NvdimmDsmIn, function)) * BITS_PER_BYTE));
    aml_append(field, aml_named_field("ARG3",
-               (TARGET_PAGE_SIZE - offsetof(NvdimmDsmIn, arg3)) *
-                BITS_PER_BYTE));
+               (sizeof(NvdimmDsmIn) - offsetof(NvdimmDsmIn, arg3)) * BITS_PER_BYTE));
    aml_append(dev, field);

    /*
@@ -659,8 +660,7 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
    aml_append(field, aml_named_field("RLEN",
               sizeof(typeof_field(NvdimmDsmOut, len)) * BITS_PER_BYTE));
    aml_append(field, aml_named_field("ODAT",
-               (TARGET_PAGE_SIZE - offsetof(NvdimmDsmOut, data)) *
-                     BITS_PER_BYTE));
+               (sizeof(NvdimmDsmOut) - offsetof(NvdimmDsmOut, data)) * BITS_PER_BYTE));
    aml_append(dev, field);

    nvdimm_build_common_dsm(dev);
@@ -678,7 +678,7 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
    mem_addr_offset = build_append_named_dword(table_data,
                                               NVDIMM_ACPI_MEM_ADDR);

-    bios_linker_loader_alloc(linker, NVDIMM_DSM_MEM_FILE, TARGET_PAGE_SIZE,
+    bios_linker_loader_alloc(linker, NVDIMM_DSM_MEM_FILE, sizeof(NvdimmDsmIn),
                             false /* high memory */);
    bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
                                   NVDIMM_DSM_MEM_FILE, table_data,
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -39,6 +39,7 @@
 #include "hw/acpi/memory_hotplug.h"
 #include "hw/acpi/acpi_dev_interface.h"
 #include "hw/xen/xen.h"
+#include "qom/cpu.h"

 //#define DEBUG

--- a/hw/alpha/alpha_sys.h
+++ b/hw/alpha/alpha_sys.h
@@ -3,6 +3,7 @@
 #ifndef HW_ALPHA_H
 #define HW_ALPHA_H 1

+#include "target-alpha/cpu-qom.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_host.h"
 #include "hw/ide.h"
--- a/hw/alpha/pci.c
+++ b/hw/alpha/pci.c
@@ -8,7 +8,6 @@

 #include "qemu/osdep.h"
 #include "qemu-common.h"
-#include "cpu.h"
 #include "alpha_sys.h"
 #include "qemu/log.h"
 #include "sysemu/sysemu.h"
--- a/hw/arm/Makefile.objs
+++ b/hw/arm/Makefile.objs
@@ -16,4 +16,5 @@ obj-$(CONFIG_STM32F205_SOC) += stm32f205_soc.o
 obj-$(CONFIG_XLNX_ZYNQMP) += xlnx-zynqmp.o xlnx-ep108.o
 obj-$(CONFIG_FSL_IMX25) += fsl-imx25.o imx25_pdk.o
 obj-$(CONFIG_FSL_IMX31) += fsl-imx31.o kzm.o
+obj-$(CONFIG_FSL_IMX6) += fsl-imx6.o sabrelite.o
 obj-$(CONFIG_ASPEED_SOC) += ast2400.o palmetto-bmc.o
--- a/hw/arm/armv7m.c
+++ b/hw/arm/armv7m.c
@@ -132,14 +132,14 @@ typedef struct {
    uint32_t base;
 } BitBandState;

-static int bitband_init(SysBusDevice *dev)
+static void bitband_init(Object *obj)
 {
-    BitBandState *s = BITBAND(dev);
+    BitBandState *s = BITBAND(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);

-    memory_region_init_io(&s->iomem, OBJECT(s), &bitband_ops, &s->base,
+    memory_region_init_io(&s->iomem, obj, &bitband_ops, &s->base,
                          "bitband", 0x02000000);
    sysbus_init_mmio(dev, &s->iomem);
-    return 0;
 }

 static void armv7m_bitband_init(void)
@@ -244,9 +244,7 @@ static Property bitband_properties[] = {
 static void bitband_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = bitband_init;
    dc->props = bitband_properties;
 }

@@ -254,6 +252,7 @@ static const TypeInfo bitband_info = {
    .name          = TYPE_BITBAND,
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(BitBandState),
+    .instance_init = bitband_init,
    .class_init    = bitband_class_init,
 };

--- a/hw/arm/ast2400.c
+++ b/hw/arm/ast2400.c
@@ -17,6 +17,7 @@
 #include "exec/address-spaces.h"
 #include "hw/arm/ast2400.h"
 #include "hw/char/serial.h"
+#include "qemu/log.h"

 #define AST2400_UART_5_BASE      0x00184000
 #define AST2400_IOMEM_SIZE       0x00200000
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -14,6 +14,7 @@
 #include "hw/arm/linux-boot-if.h"
 #include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/numa.h"
 #include "hw/boards.h"
 #include "hw/loader.h"
 #include "elf.h"
@@ -405,6 +406,9 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
    void *fdt = NULL;
    int size, rc;
    uint32_t acells, scells;
+    char *nodename;
+    unsigned int i;
+    hwaddr mem_base, mem_len;

    if (binfo->dtb_filename) {
        char *filename;
@@ -456,12 +460,39 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
        goto fail;
    }

-    rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg",
-                                      acells, binfo->loader_start,
-                                      scells, binfo->ram_size);
-    if (rc < 0) {
-        fprintf(stderr, "couldn't set /memory/reg\n");
-        goto fail;
+    if (nb_numa_nodes > 0) {
+        /*
+         * Turn the /memory node created before into a NOP node, then create
+         * /memory@addr nodes for all numa nodes respectively.
+         */
+        qemu_fdt_nop_node(fdt, "/memory");
+        mem_base = binfo->loader_start;
+        for (i = 0; i < nb_numa_nodes; i++) {
+            mem_len = numa_info[i].node_mem;
+            nodename = g_strdup_printf("/memory@%" PRIx64, mem_base);
+            qemu_fdt_add_subnode(fdt, nodename);
+            qemu_fdt_setprop_string(fdt, nodename, "device_type", "memory");
+            rc = qemu_fdt_setprop_sized_cells(fdt, nodename, "reg",
+                                              acells, mem_base,
+                                              scells, mem_len);
+            if (rc < 0) {
+                fprintf(stderr, "couldn't set %s/reg for node %d\n", nodename,
+                        i);
+                goto fail;
+            }
+
+            qemu_fdt_setprop_cell(fdt, nodename, "numa-node-id", i);
+            mem_base += mem_len;
+            g_free(nodename);
+        }
+    } else {
+        rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg",
+                                          acells, binfo->loader_start,
+                                          scells, binfo->ram_size);
+        if (rc < 0) {
+            fprintf(stderr, "couldn't set /memory/reg\n");
+            goto fail;
+        }
    }

    if (binfo->kernel_cmdline && *binfo->kernel_cmdline) {
--- a/hw/arm/collie.c
+++ b/hw/arm/collie.c
@@ -18,6 +18,7 @@
 #include "hw/block/flash.h"
 #include "sysemu/block-backend.h"
 #include "exec/address-spaces.h"
+#include "qom/cpu.h"

 static struct arm_boot_info collie_binfo = {
    .loader_start = SA_SDCS0,
--- a/hw/arm/fsl-imx25.c
+++ b/hw/arm/fsl-imx25.c
@@ -191,6 +191,7 @@ static void fsl_imx25_realize(DeviceState *dev, Error **errp)
    }

    qdev_set_nic_properties(DEVICE(&s->fec), &nd_table[0]);
+
    object_property_set_bool(OBJECT(&s->fec), true, "realized", &err);
    if (err) {
        error_propagate(errp, err);
--- a/hw/arm/fsl-imx6.c
+++ b/hw/arm/fsl-imx6.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2015 Jean-Christophe Dubois <jcd@tribudubois.net>
+ *
+ * i.MX6 SOC emulation.
+ *
+ * Based on hw/arm/fsl-imx31.c
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "hw/arm/fsl-imx6.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/char.h"
+#include "qemu/error-report.h"
+
+#define NAME_SIZE 20
+
+static void fsl_imx6_init(Object *obj)
+{
+    FslIMX6State *s = FSL_IMX6(obj);
+    char name[NAME_SIZE];
+    int i;
+
+    if (smp_cpus > FSL_IMX6_NUM_CPUS) {
+        error_report("%s: Only %d CPUs are supported (%d requested)",
+                     TYPE_FSL_IMX6, FSL_IMX6_NUM_CPUS, smp_cpus);
+        exit(1);
+    }
+
+    for (i = 0; i < smp_cpus; i++) {
+        object_initialize(&s->cpu[i], sizeof(s->cpu[i]),
+                          "cortex-a9-" TYPE_ARM_CPU);
+        snprintf(name, NAME_SIZE, "cpu%d", i);
+        object_property_add_child(obj, name, OBJECT(&s->cpu[i]), NULL);
+    }
+
+    object_initialize(&s->a9mpcore, sizeof(s->a9mpcore), TYPE_A9MPCORE_PRIV);
+    qdev_set_parent_bus(DEVICE(&s->a9mpcore), sysbus_get_default());
+    object_property_add_child(obj, "a9mpcore", OBJECT(&s->a9mpcore), NULL);
+
+    object_initialize(&s->ccm, sizeof(s->ccm), TYPE_IMX6_CCM);
+    qdev_set_parent_bus(DEVICE(&s->ccm), sysbus_get_default());
+    object_property_add_child(obj, "ccm", OBJECT(&s->ccm), NULL);
+
+    object_initialize(&s->src, sizeof(s->src), TYPE_IMX6_SRC);
+    qdev_set_parent_bus(DEVICE(&s->src), sysbus_get_default());
+    object_property_add_child(obj, "src", OBJECT(&s->src), NULL);
+
+    for (i = 0; i < FSL_IMX6_NUM_UARTS; i++) {
+        object_initialize(&s->uart[i], sizeof(s->uart[i]), TYPE_IMX_SERIAL);
+        qdev_set_parent_bus(DEVICE(&s->uart[i]), sysbus_get_default());
+        snprintf(name, NAME_SIZE, "uart%d", i + 1);
+        object_property_add_child(obj, name, OBJECT(&s->uart[i]), NULL);
+    }
+
+    object_initialize(&s->gpt, sizeof(s->gpt), TYPE_IMX_GPT);
+    qdev_set_parent_bus(DEVICE(&s->gpt), sysbus_get_default());
+    object_property_add_child(obj, "gpt", OBJECT(&s->gpt), NULL);
+
+    for (i = 0; i < FSL_IMX6_NUM_EPITS; i++) {
+        object_initialize(&s->epit[i], sizeof(s->epit[i]), TYPE_IMX_EPIT);
+        qdev_set_parent_bus(DEVICE(&s->epit[i]), sysbus_get_default());
+        snprintf(name, NAME_SIZE, "epit%d", i + 1);
+        object_property_add_child(obj, name, OBJECT(&s->epit[i]), NULL);
+    }
+
+    for (i = 0; i < FSL_IMX6_NUM_I2CS; i++) {
+        object_initialize(&s->i2c[i], sizeof(s->i2c[i]), TYPE_IMX_I2C);
+        qdev_set_parent_bus(DEVICE(&s->i2c[i]), sysbus_get_default());
+        snprintf(name, NAME_SIZE, "i2c%d", i + 1);
+        object_property_add_child(obj, name, OBJECT(&s->i2c[i]), NULL);
+    }
+
+    for (i = 0; i < FSL_IMX6_NUM_GPIOS; i++) {
+        object_initialize(&s->gpio[i], sizeof(s->gpio[i]), TYPE_IMX_GPIO);
+        qdev_set_parent_bus(DEVICE(&s->gpio[i]), sysbus_get_default());
+        snprintf(name, NAME_SIZE, "gpio%d", i + 1);
+        object_property_add_child(obj, name, OBJECT(&s->gpio[i]), NULL);
+    }
+
+    for (i = 0; i < FSL_IMX6_NUM_ESDHCS; i++) {
+        object_initialize(&s->esdhc[i], sizeof(s->esdhc[i]), TYPE_SYSBUS_SDHCI);
+        qdev_set_parent_bus(DEVICE(&s->esdhc[i]), sysbus_get_default());
+        snprintf(name, NAME_SIZE, "sdhc%d", i + 1);
+        object_property_add_child(obj, name, OBJECT(&s->esdhc[i]), NULL);
+    }
+
+    for (i = 0; i < FSL_IMX6_NUM_ECSPIS; i++) {
+        object_initialize(&s->spi[i], sizeof(s->spi[i]), TYPE_IMX_SPI);
+        qdev_set_parent_bus(DEVICE(&s->spi[i]), sysbus_get_default());
+        snprintf(name, NAME_SIZE, "spi%d", i + 1);
+        object_property_add_child(obj, name, OBJECT(&s->spi[i]), NULL);
+    }
+
+    object_initialize(&s->eth, sizeof(s->eth), TYPE_IMX_ENET);
+    qdev_set_parent_bus(DEVICE(&s->eth), sysbus_get_default());
+    object_property_add_child(obj, "eth", OBJECT(&s->eth), NULL);
+}
+
+static void fsl_imx6_realize(DeviceState *dev, Error **errp)
+{
+    FslIMX6State *s = FSL_IMX6(dev);
+    uint16_t i;
+    Error *err = NULL;
+
+    for (i = 0; i < smp_cpus; i++) {
+
+        /* On uniprocessor, the CBAR is set to 0 */
+        if (smp_cpus > 1) {
+            object_property_set_int(OBJECT(&s->cpu[i]), FSL_IMX6_A9MPCORE_ADDR,
+                                    "reset-cbar", &error_abort);
+        }
+
+        /* All CPU but CPU 0 start in power off mode */
+        if (i) {
+            object_property_set_bool(OBJECT(&s->cpu[i]), true,
+                                     "start-powered-off", &error_abort);
+        }
+
+        object_property_set_bool(OBJECT(&s->cpu[i]), true, "realized", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
+    }
+
+    object_property_set_int(OBJECT(&s->a9mpcore), smp_cpus, "num-cpu",
+                            &error_abort);
+
+    object_property_set_int(OBJECT(&s->a9mpcore),
+                            FSL_IMX6_MAX_IRQ + GIC_INTERNAL, "num-irq",
+                            &error_abort);
+
+    object_property_set_bool(OBJECT(&s->a9mpcore), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->a9mpcore), 0, FSL_IMX6_A9MPCORE_ADDR);
+
+    for (i = 0; i < smp_cpus; i++) {
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->a9mpcore), i,
+                           qdev_get_gpio_in(DEVICE(&s->cpu[i]), ARM_CPU_IRQ));
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->a9mpcore), i + smp_cpus,
+                           qdev_get_gpio_in(DEVICE(&s->cpu[i]), ARM_CPU_FIQ));
+    }
+
+    object_property_set_bool(OBJECT(&s->ccm), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->ccm), 0, FSL_IMX6_CCM_ADDR);
+
+    object_property_set_bool(OBJECT(&s->src), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->src), 0, FSL_IMX6_SRC_ADDR);
+
+    /* Initialize all UARTs */
+    for (i = 0; i < FSL_IMX6_NUM_UARTS; i++) {
+        static const struct {
+            hwaddr addr;
+            unsigned int irq;
+        } serial_table[FSL_IMX6_NUM_UARTS] = {
+            { FSL_IMX6_UART1_ADDR, FSL_IMX6_UART1_IRQ },
+            { FSL_IMX6_UART2_ADDR, FSL_IMX6_UART2_IRQ },
+            { FSL_IMX6_UART3_ADDR, FSL_IMX6_UART3_IRQ },
+            { FSL_IMX6_UART4_ADDR, FSL_IMX6_UART4_IRQ },
+            { FSL_IMX6_UART5_ADDR, FSL_IMX6_UART5_IRQ },
+        };
+
+        if (i < MAX_SERIAL_PORTS) {
+            CharDriverState *chr;
+
+            chr = serial_hds[i];
+
+            if (!chr) {
+                char *label = g_strdup_printf("imx6.uart%d", i + 1);
+                chr = qemu_chr_new(label, "null", NULL);
+                g_free(label);
+                serial_hds[i] = chr;
+            }
+
+            qdev_prop_set_chr(DEVICE(&s->uart[i]), "chardev", chr);
+        }
+
+        object_property_set_bool(OBJECT(&s->uart[i]), true, "realized", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
+
+        sysbus_mmio_map(SYS_BUS_DEVICE(&s->uart[i]), 0, serial_table[i].addr);
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->uart[i]), 0,
+                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                            serial_table[i].irq));
+    }
+
+    s->gpt.ccm = IMX_CCM(&s->ccm);
+
+    object_property_set_bool(OBJECT(&s->gpt), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->gpt), 0, FSL_IMX6_GPT_ADDR);
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->gpt), 0,
+                       qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                        FSL_IMX6_GPT_IRQ));
+
+    /* Initialize all EPIT timers */
+    for (i = 0; i < FSL_IMX6_NUM_EPITS; i++) {
+        static const struct {
+            hwaddr addr;
+            unsigned int irq;
+        } epit_table[FSL_IMX6_NUM_EPITS] = {
+            { FSL_IMX6_EPIT1_ADDR, FSL_IMX6_EPIT1_IRQ },
+            { FSL_IMX6_EPIT2_ADDR, FSL_IMX6_EPIT2_IRQ },
+        };
+
+        s->epit[i].ccm = IMX_CCM(&s->ccm);
+
+        object_property_set_bool(OBJECT(&s->epit[i]), true, "realized", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
+
+        sysbus_mmio_map(SYS_BUS_DEVICE(&s->epit[i]), 0, epit_table[i].addr);
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->epit[i]), 0,
+                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                            epit_table[i].irq));
+    }
+
+    /* Initialize all I2C */
+    for (i = 0; i < FSL_IMX6_NUM_I2CS; i++) {
+        static const struct {
+            hwaddr addr;
+            unsigned int irq;
+        } i2c_table[FSL_IMX6_NUM_I2CS] = {
+            { FSL_IMX6_I2C1_ADDR, FSL_IMX6_I2C1_IRQ },
+            { FSL_IMX6_I2C2_ADDR, FSL_IMX6_I2C2_IRQ },
+            { FSL_IMX6_I2C3_ADDR, FSL_IMX6_I2C3_IRQ }
+        };
+
+        object_property_set_bool(OBJECT(&s->i2c[i]), true, "realized", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
+
+        sysbus_mmio_map(SYS_BUS_DEVICE(&s->i2c[i]), 0, i2c_table[i].addr);
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->i2c[i]), 0,
+                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                            i2c_table[i].irq));
+    }
+
+    /* Initialize all GPIOs */
+    for (i = 0; i < FSL_IMX6_NUM_GPIOS; i++) {
+        static const struct {
+            hwaddr addr;
+            unsigned int irq_low;
+            unsigned int irq_high;
+        } gpio_table[FSL_IMX6_NUM_GPIOS] = {
+            {
+                FSL_IMX6_GPIO1_ADDR,
+                FSL_IMX6_GPIO1_LOW_IRQ,
+                FSL_IMX6_GPIO1_HIGH_IRQ
+            },
+            {
+                FSL_IMX6_GPIO2_ADDR,
+                FSL_IMX6_GPIO2_LOW_IRQ,
+                FSL_IMX6_GPIO2_HIGH_IRQ
+            },
+            {
+                FSL_IMX6_GPIO3_ADDR,
+                FSL_IMX6_GPIO3_LOW_IRQ,
+                FSL_IMX6_GPIO3_HIGH_IRQ
+            },
+            {
+                FSL_IMX6_GPIO4_ADDR,
+                FSL_IMX6_GPIO4_LOW_IRQ,
+                FSL_IMX6_GPIO4_HIGH_IRQ
+            },
+            {
+                FSL_IMX6_GPIO5_ADDR,
+                FSL_IMX6_GPIO5_LOW_IRQ,
+                FSL_IMX6_GPIO5_HIGH_IRQ
+            },
+            {
+                FSL_IMX6_GPIO6_ADDR,
+                FSL_IMX6_GPIO6_LOW_IRQ,
+                FSL_IMX6_GPIO6_HIGH_IRQ
+            },
+            {
+                FSL_IMX6_GPIO7_ADDR,
+                FSL_IMX6_GPIO7_LOW_IRQ,
+                FSL_IMX6_GPIO7_HIGH_IRQ
+            },
+        };
+
+        object_property_set_bool(OBJECT(&s->gpio[i]), true, "has-edge-sel",
+                                 &error_abort);
+        object_property_set_bool(OBJECT(&s->gpio[i]), true, "has-upper-pin-irq",
+                                 &error_abort);
+        object_property_set_bool(OBJECT(&s->gpio[i]), true, "realized", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
+
+        sysbus_mmio_map(SYS_BUS_DEVICE(&s->gpio[i]), 0, gpio_table[i].addr);
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->gpio[i]), 0,
+                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                            gpio_table[i].irq_low));
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->gpio[i]), 1,
+                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                            gpio_table[i].irq_high));
+    }
+
+    /* Initialize all SDHC */
+    for (i = 0; i < FSL_IMX6_NUM_ESDHCS; i++) {
+        static const struct {
+            hwaddr addr;
+            unsigned int irq;
+        } esdhc_table[FSL_IMX6_NUM_ESDHCS] = {
+            { FSL_IMX6_uSDHC1_ADDR, FSL_IMX6_uSDHC1_IRQ },
+            { FSL_IMX6_uSDHC2_ADDR, FSL_IMX6_uSDHC2_IRQ },
+            { FSL_IMX6_uSDHC3_ADDR, FSL_IMX6_uSDHC3_IRQ },
+            { FSL_IMX6_uSDHC4_ADDR, FSL_IMX6_uSDHC4_IRQ },
+        };
+
+        object_property_set_bool(OBJECT(&s->esdhc[i]), true, "realized", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
+        sysbus_mmio_map(SYS_BUS_DEVICE(&s->esdhc[i]), 0, esdhc_table[i].addr);
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->esdhc[i]), 0,
+                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                            esdhc_table[i].irq));
+    }
+
+    /* Initialize all ECSPI */
+    for (i = 0; i < FSL_IMX6_NUM_ECSPIS; i++) {
+        static const struct {
+            hwaddr addr;
+            unsigned int irq;
+        } spi_table[FSL_IMX6_NUM_ECSPIS] = {
+            { FSL_IMX6_eCSPI1_ADDR, FSL_IMX6_ECSPI1_IRQ },
+            { FSL_IMX6_eCSPI2_ADDR, FSL_IMX6_ECSPI2_IRQ },
+            { FSL_IMX6_eCSPI3_ADDR, FSL_IMX6_ECSPI3_IRQ },
+            { FSL_IMX6_eCSPI4_ADDR, FSL_IMX6_ECSPI4_IRQ },
+            { FSL_IMX6_eCSPI5_ADDR, FSL_IMX6_ECSPI5_IRQ },
+        };
+
+        /* Initialize the SPI */
+        object_property_set_bool(OBJECT(&s->spi[i]), true, "realized", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
+
+        sysbus_mmio_map(SYS_BUS_DEVICE(&s->spi[i]), 0, spi_table[i].addr);
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->spi[i]), 0,
+                           qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                            spi_table[i].irq));
+    }
+
+    object_property_set_bool(OBJECT(&s->eth), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->eth), 0, FSL_IMX6_ENET_ADDR);
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->eth), 0,
+                       qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                        FSL_IMX6_ENET_MAC_IRQ));
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->eth), 1,
+                       qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                        FSL_IMX6_ENET_MAC_1588_IRQ));
+
+    /* ROM memory */
+    memory_region_init_rom_device(&s->rom, NULL, NULL, NULL, "imx6.rom",
+                                  FSL_IMX6_ROM_SIZE, &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    memory_region_add_subregion(get_system_memory(), FSL_IMX6_ROM_ADDR,
+                                &s->rom);
+
+    /* CAAM memory */
+    memory_region_init_rom_device(&s->caam, NULL, NULL, NULL, "imx6.caam",
+                                  FSL_IMX6_CAAM_MEM_SIZE, &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    memory_region_add_subregion(get_system_memory(), FSL_IMX6_CAAM_MEM_ADDR,
+                                &s->caam);
+
+    /* OCRAM memory */
+    memory_region_init_ram(&s->ocram, NULL, "imx6.ocram", FSL_IMX6_OCRAM_SIZE,
+                           &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    memory_region_add_subregion(get_system_memory(), FSL_IMX6_OCRAM_ADDR,
+                                &s->ocram);
+    vmstate_register_ram_global(&s->ocram);
+
+    /* internal OCRAM (256 KB) is aliased over 1 MB */
+    memory_region_init_alias(&s->ocram_alias, NULL, "imx6.ocram_alias",
+                             &s->ocram, 0, FSL_IMX6_OCRAM_ALIAS_SIZE);
+    memory_region_add_subregion(get_system_memory(), FSL_IMX6_OCRAM_ALIAS_ADDR,
+                                &s->ocram_alias);
+}
+
+static void fsl_imx6_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+
+    dc->realize = fsl_imx6_realize;
+
+    /*
+     * Reason: creates an ARM CPU, thus use after free(), see
+     * arm_cpu_class_init()
+     */
+    dc->cannot_destroy_with_object_finalize_yet = true;
+    dc->desc = "i.MX6 SOC";
+}
+
+static const TypeInfo fsl_imx6_type_info = {
+    .name = TYPE_FSL_IMX6,
+    .parent = TYPE_DEVICE,
+    .instance_size = sizeof(FslIMX6State),
+    .instance_init = fsl_imx6_init,
+    .class_init = fsl_imx6_class_init,
+};
+
+static void fsl_imx6_register_types(void)
+{
+    type_register_static(&fsl_imx6_type_info);
+}
+
+type_init(fsl_imx6_register_types)
--- a/hw/arm/highbank.c
+++ b/hw/arm/highbank.c
@@ -168,23 +168,20 @@ static void highbank_regs_reset(DeviceState *dev)
    s->regs[0x43] = 0x05F40121;
 }

-static int highbank_regs_init(SysBusDevice *dev)
+static void highbank_regs_init(Object *obj)
 {
-    HighbankRegsState *s = HIGHBANK_REGISTERS(dev);
+    HighbankRegsState *s = HIGHBANK_REGISTERS(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);

-    memory_region_init_io(&s->iomem, OBJECT(s), &hb_mem_ops, s->regs,
+    memory_region_init_io(&s->iomem, obj, &hb_mem_ops, s->regs,
                          "highbank_regs", 0x1000);
    sysbus_init_mmio(dev, &s->iomem);
-
-    return 0;
 }

 static void highbank_regs_class_init(ObjectClass *klass, void *data)
 {
-    SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass);
    DeviceClass *dc = DEVICE_CLASS(klass);

-    sbc->init = highbank_regs_init;
    dc->desc = "Calxeda Highbank registers";
    dc->vmsd = &vmstate_highbank_regs;
    dc->reset = highbank_regs_reset;
@@ -194,6 +191,7 @@ static const TypeInfo highbank_regs_info = {
    .name          = TYPE_HIGHBANK_REGISTERS,
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(HighbankRegsState),
+    .instance_init = highbank_regs_init,
    .class_init    = highbank_regs_class_init,
 };

--- a/hw/arm/integratorcp.c
+++ b/hw/arm/integratorcp.c
@@ -242,9 +242,10 @@ static const MemoryRegionOps integratorcm_ops = {
    .endianness = DEVICE_NATIVE_ENDIAN,
 };

-static int integratorcm_init(SysBusDevice *dev)
+static void integratorcm_init(Object *obj)
 {
-    IntegratorCMState *s = INTEGRATOR_CM(dev);
+    IntegratorCMState *s = INTEGRATOR_CM(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);

    s->cm_osc = 0x01000048;
    /* ??? What should the high bits of this value be?  */
@@ -269,17 +270,16 @@ static int integratorcm_init(SysBusDevice *dev)
    s->cm_init = 0x00000112;
    s->cm_refcnt_offset = muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 24,
                                   1000);
-    memory_region_init_ram(&s->flash, OBJECT(s), "integrator.flash", 0x100000,
+    memory_region_init_ram(&s->flash, obj, "integrator.flash", 0x100000,
                           &error_fatal);
    vmstate_register_ram_global(&s->flash);

-    memory_region_init_io(&s->iomem, OBJECT(s), &integratorcm_ops, s,
+    memory_region_init_io(&s->iomem, obj, &integratorcm_ops, s,
                          "integratorcm", 0x00800000);
    sysbus_init_mmio(dev, &s->iomem);

    integratorcm_do_remap(s);
    /* ??? Save/restore.  */
-    return 0;
 }

 /* Integrator/CP hardware emulation.  */
@@ -394,18 +394,18 @@ static const MemoryRegionOps icp_pic_ops = {
    .endianness = DEVICE_NATIVE_ENDIAN,
 };

-static int icp_pic_init(SysBusDevice *sbd)
+static void icp_pic_init(Object *obj)
 {
-    DeviceState *dev = DEVICE(sbd);
-    icp_pic_state *s = INTEGRATOR_PIC(dev);
+    DeviceState *dev = DEVICE(obj);
+    icp_pic_state *s = INTEGRATOR_PIC(obj);
+    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);

    qdev_init_gpio_in(dev, icp_pic_set_irq, 32);
    sysbus_init_irq(sbd, &s->parent_irq);
    sysbus_init_irq(sbd, &s->parent_fiq);
-    memory_region_init_io(&s->iomem, OBJECT(s), &icp_pic_ops, s,
+    memory_region_init_io(&s->iomem, obj, &icp_pic_ops, s,
                          "icp-pic", 0x00800000);
    sysbus_init_mmio(sbd, &s->iomem);
-    return 0;
 }

 /* CP control registers.  */
@@ -630,9 +630,7 @@ static Property core_properties[] = {
 static void core_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = integratorcm_init;
    dc->props = core_properties;
 }

@@ -640,21 +638,15 @@ static const TypeInfo core_info = {
    .name          = TYPE_INTEGRATOR_CM,
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(IntegratorCMState),
+    .instance_init = integratorcm_init,
    .class_init    = core_class_init,
 };

-static void icp_pic_class_init(ObjectClass *klass, void *data)
-{
-    SysBusDeviceClass *sdc = SYS_BUS_DEVICE_CLASS(klass);
-
-    sdc->init = icp_pic_init;
-}
-
 static const TypeInfo icp_pic_info = {
    .name          = TYPE_INTEGRATOR_PIC,
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(icp_pic_state),
-    .class_init    = icp_pic_class_init,
+    .instance_init = icp_pic_init,
 };

 static const TypeInfo icp_ctrl_regs_info = {
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .6.0
 .6.50