spice: Disallow use of gl + TCP port

Currently, virgl support has to go through a local Unix socket; trying to connect to a VM using -spice gl through spice://localhost:5900 will only result in a black screen. This commit errors out when the user tries to start a VM with both GL support and a port/tls-port set. This would fit better in spice-server, but currently QEMU does not call into spice-server when parsing 'gl' on its command line, so we have to do this check in QEMU instead. Signed-off-by: Christophe Fergeau <cfergeau@redhat.com> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com> Message-id: 1457955672-28758-1-git-send-email-cfergeau@redhat.com [ applied codestyle fix: break long line ] Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
input-linux: fix Coverity warning
2016-03-24 08:04:01 +01:00 · 2016-03-24 07:58:20 +01:00 · 2016-03-24 07:58:20 +01:00 · 2016-03-23 12:57:44 +00:00 · 2016-03-22 20:27:55 +00:00 · 2016-03-22 19:17:38 +01:00
547 changed files with 24683 additions and 7243 deletions
--- a/22
+++ b/22
@@ -234,6 +234,7 @@ L: kvm@vger.kernel.org
 S: Supported
 F: kvm-*
 F: */kvm.*
+F: include/sysemu/kvm*.h

 ARM
 M: Peter Maydell <peter.maydell@linaro.org>
@@ -656,12 +657,6 @@ F: hw/*/grlib*

 S390 Machines
 -------------
-S390 Virtio
-M: Alexander Graf <agraf@suse.de>
-S: Maintained
-F: hw/s390x/s390-*.c
-X: hw/s390x/*pci*.[hc]
-
 S390 Virtio-ccw
 M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Christian Borntraeger <borntraeger@de.ibm.com>
@@ -669,7 +664,6 @@ M: Alexander Graf <agraf@suse.de>
 S: Supported
 F: hw/char/sclp*.[hc]
 F: hw/s390x/
-X: hw/s390x/s390-virtio-bus.[ch]
 F: include/hw/s390x/
 F: pc-bios/s390-ccw/
 F: hw/watchdog/wdt_diag288.c
@@ -723,6 +717,12 @@ F: hw/timer/hpet*
 F: hw/timer/i8254*
 F: hw/timer/mc146818rtc*

+Machine core
+M: Eduardo Habkost <ehabkost@redhat.com>
+M: Marcel Apfelbaum <marcel@redhat.com>
+S: Supported
+F: hw/core/machine.c
+F: include/hw/boards.h

 Xtensa Machines
 ---------------
@@ -872,6 +872,7 @@ VFIO
 M: Alex Williamson <alex.williamson@redhat.com>
 S: Supported
 F: hw/vfio/*
+F: include/hw/vfio/

 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
@@ -883,6 +884,7 @@ M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
 F: hw/*/virtio*
 F: net/vhost-user.c
+F: include/hw/virtio/

 virtio-9p
 M: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
@@ -928,6 +930,7 @@ M: Amit Shah <amit.shah@redhat.com>
 S: Supported
 F: hw/virtio/virtio-rng.c
 F: include/hw/virtio/virtio-rng.h
+F: include/sysemu/rng*.h
 F: backends/rng*.c

 nvme
@@ -1013,7 +1016,7 @@ F: blockjob.c
 F: include/block/blockjob.h
 F: block/backup.c
 F: block/commit.c
-F: block/stream.h
+F: block/stream.c
 F: block/mirror.c
 T: git git://github.com/codyprime/qemu-kvm-jtc.git block

@@ -1128,6 +1131,7 @@ Network device backends
 M: Jason Wang <jasowang@redhat.com>
 S: Maintained
 F: net/
+F: include/net/
 T: git git://github.com/jasowang/qemu.git net

 Netmap network backend
@@ -1223,10 +1227,12 @@ F: scripts/qmp/
 T: git git://repo.or.cz/qemu/armbru.git qapi-next

 SLIRP
+M: Samuel Thibault <samuel.thibault@ens-lyon.org>
 M: Jan Kiszka <jan.kiszka@siemens.com>
 S: Maintained
 F: slirp/
 F: net/slirp.c
+F: include/net/slirp.h
 T: git git://git.kiszka.org/qemu.git queues/slirp

 Tracing
--- a/4
+++ b/4
@@ -238,7 +238,7 @@ qemu-img$(EXESUF): qemu-img.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-o
 qemu-nbd$(EXESUF): qemu-nbd.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a
 qemu-io$(EXESUF): qemu-io.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a

-qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o
+qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o libqemuutil.a libqemustub.a

 fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal.o fsdev/9p-iov-marshal.o libqemuutil.a libqemustub.a
 fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap
@@ -329,7 +329,7 @@ ifneq ($(EXESUF),)
 qemu-ga: qemu-ga$(EXESUF) $(QGA_VSS_PROVIDER) $(QEMU_GA_MSI)
 endif

-ivshmem-client$(EXESUF): $(ivshmem-client-obj-y)
+ivshmem-client$(EXESUF): $(ivshmem-client-obj-y) libqemuutil.a libqemustub.a
 	$(call LINK, $^)
 ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) libqemuutil.a libqemustub.a
 	$(call LINK, $^)
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -1,6 +1,6 @@
 #######################################################################
 # Common libraries for tools and emulators
-stub-obj-y = stubs/
+stub-obj-y = stubs/ crypto/
 util-obj-y = util/ qobject/ qapi/
 util-obj-y += qmp-introspect.o qapi-types.o qapi-visit.o qapi-event.o

--- a/aio-posix.c
+++ b/aio-posix.c
@@ -18,7 +18,7 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
-#ifdef CONFIG_EPOLL
+#ifdef CONFIG_EPOLL_CREATE1
 #include <sys/epoll.h>
 #endif

@@ -33,7 +33,7 @@ struct AioHandler
    QLIST_ENTRY(AioHandler) node;
 };

-#ifdef CONFIG_EPOLL
+#ifdef CONFIG_EPOLL_CREATE1

 /* The fd number threashold to switch to epoll */
 #define EPOLL_ENABLE_THRESHOLD 64
@@ -483,7 +483,7 @@ bool aio_poll(AioContext *ctx, bool blocking)

 void aio_context_setup(AioContext *ctx, Error **errp)
 {
-#ifdef CONFIG_EPOLL
+#ifdef CONFIG_EPOLL_CREATE1
    assert(!ctx->epollfd);
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
--- a/backends/baum.c
+++ b/backends/baum.c
@@ -567,7 +567,7 @@ static CharDriverState *chr_baum_init(const char *id,
                                      ChardevReturn *ret,
                                      Error **errp)
 {
-    ChardevCommon *common = qapi_ChardevDummy_base(backend->u.braille);
+    ChardevCommon *common = backend->u.braille.data;
    BaumDriverState *baum;
    CharDriverState *chr;
    brlapi_handle_t *handle;
--- a/backends/msmouse.c
+++ b/backends/msmouse.c
@@ -68,7 +68,7 @@ static CharDriverState *qemu_chr_open_msmouse(const char *id,
                                              ChardevReturn *ret,
                                              Error **errp)
 {
-    ChardevCommon *common = qapi_ChardevDummy_base(backend->u.msmouse);
+    ChardevCommon *common = backend->u.msmouse.data;
    CharDriverState *chr;

    chr = qemu_chr_alloc(common, errp);
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@@ -25,33 +25,12 @@ typedef struct RngEgd

    CharDriverState *chr;
    char *chr_name;
-
-    GSList *requests;
 } RngEgd;

-typedef struct RngRequest
-{
-    EntropyReceiveFunc *receive_entropy;
-    uint8_t *data;
-    void *opaque;
-    size_t offset;
-    size_t size;
-} RngRequest;
-
-static void rng_egd_request_entropy(RngBackend *b, size_t size,
-                                    EntropyReceiveFunc *receive_entropy,
-                                    void *opaque)
+static void rng_egd_request_entropy(RngBackend *b, RngRequest *req)
 {
    RngEgd *s = RNG_EGD(b);
-    RngRequest *req;
-
-    req = g_malloc(sizeof(*req));
-
-    req->offset = 0;
-    req->size = size;
-    req->receive_entropy = receive_entropy;
-    req->opaque = opaque;
-    req->data = g_malloc(req->size);
+    size_t size = req->size;

    while (size > 0) {
        uint8_t header[2];
@@ -65,24 +44,15 @@ static void rng_egd_request_entropy(RngBackend *b, size_t size,

        size -= len;
    }
-
-    s->requests = g_slist_append(s->requests, req);
-}
-
-static void rng_egd_free_request(RngRequest *req)
-{
-    g_free(req->data);
-    g_free(req);
 }

 static int rng_egd_chr_can_read(void *opaque)
 {
    RngEgd *s = RNG_EGD(opaque);
-    GSList *i;
+    RngRequest *req;
    int size = 0;

-    for (i = s->requests; i; i = i->next) {
-        RngRequest *req = i->data;
+    QSIMPLEQ_FOREACH(req, &s->parent.requests, next) {
        size += req->size - req->offset;
    }

@@ -94,8 +64,8 @@ static void rng_egd_chr_read(void *opaque, const uint8_t *buf, int size)
    RngEgd *s = RNG_EGD(opaque);
    size_t buf_offset = 0;

-    while (size > 0 && s->requests) {
-        RngRequest *req = s->requests->data;
+    while (size > 0 && !QSIMPLEQ_EMPTY(&s->parent.requests)) {
+        RngRequest *req = QSIMPLEQ_FIRST(&s->parent.requests);
        int len = MIN(size, req->size - req->offset);

        memcpy(req->data + req->offset, buf + buf_offset, len);
@@ -104,38 +74,13 @@ static void rng_egd_chr_read(void *opaque, const uint8_t *buf, int size)
        size -= len;

        if (req->offset == req->size) {
-            s->requests = g_slist_remove_link(s->requests, s->requests);
-
            req->receive_entropy(req->opaque, req->data, req->size);

-            rng_egd_free_request(req);
+            rng_backend_finalize_request(&s->parent, req);
        }
    }
 }

-static void rng_egd_free_requests(RngEgd *s)
-{
-    GSList *i;
-
-    for (i = s->requests; i; i = i->next) {
-        rng_egd_free_request(i->data);
-    }
-
-    g_slist_free(s->requests);
-    s->requests = NULL;
-}
-
-static void rng_egd_cancel_requests(RngBackend *b)
-{
-    RngEgd *s = RNG_EGD(b);
-
-    /* We simply delete the list of pending requests.  If there is data in the 
-     * queue waiting to be read, this is okay, because there will always be
-     * more data than we requested originally
-     */
-    rng_egd_free_requests(s);
-}
-
 static void rng_egd_opened(RngBackend *b, Error **errp)
 {
    RngEgd *s = RNG_EGD(b);
@@ -204,8 +149,6 @@ static void rng_egd_finalize(Object *obj)
    }

    g_free(s->chr_name);
-
-    rng_egd_free_requests(s);
 }

 static void rng_egd_class_init(ObjectClass *klass, void *data)
@@ -213,7 +156,6 @@ static void rng_egd_class_init(ObjectClass *klass, void *data)
    RngBackendClass *rbc = RNG_BACKEND_CLASS(klass);

    rbc->request_entropy = rng_egd_request_entropy;
-    rbc->cancel_requests = rng_egd_cancel_requests;
    rbc->opened = rng_egd_opened;
 }

--- a/backends/rng-random.c
+++ b/backends/rng-random.c
@@ -22,10 +22,6 @@ struct RndRandom

    int fd;
    char *filename;
-
-    EntropyReceiveFunc *receive_func;
-    void *opaque;
-    size_t size;
 };

 /**
@@ -38,36 +34,35 @@ struct RndRandom
 static void entropy_available(void *opaque)
 {
    RndRandom *s = RNG_RANDOM(opaque);
-    uint8_t buffer[s->size];
-    ssize_t len;

-    len = read(s->fd, buffer, s->size);
-    if (len < 0 && errno == EAGAIN) {
-        return;
+    while (!QSIMPLEQ_EMPTY(&s->parent.requests)) {
+        RngRequest *req = QSIMPLEQ_FIRST(&s->parent.requests);
+        ssize_t len;
+
+        len = read(s->fd, req->data, req->size);
+        if (len < 0 && errno == EAGAIN) {
+            return;
+        }
+        g_assert(len != -1);
+
+        req->receive_entropy(req->opaque, req->data, len);
+
+        rng_backend_finalize_request(&s->parent, req);
    }
-    g_assert(len != -1);
-
-    s->receive_func(s->opaque, buffer, len);
-    s->receive_func = NULL;

+    /* We've drained all requests, the fd handler can be reset. */
    qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
 }

-static void rng_random_request_entropy(RngBackend *b, size_t size,
-                                        EntropyReceiveFunc *receive_entropy,
-                                        void *opaque)
+static void rng_random_request_entropy(RngBackend *b, RngRequest *req)
 {
    RndRandom *s = RNG_RANDOM(b);

-    if (s->receive_func) {
-        s->receive_func(s->opaque, NULL, 0);
+    if (QSIMPLEQ_EMPTY(&s->parent.requests)) {
+        /* If there are no pending requests yet, we need to
+         * install our fd handler. */
+        qemu_set_fd_handler(s->fd, entropy_available, NULL, s);
    }
-
-    s->receive_func = receive_entropy;
-    s->opaque = opaque;
-    s->size = size;
-
-    qemu_set_fd_handler(s->fd, entropy_available, NULL, s);
 }

 static void rng_random_opened(RngBackend *b, Error **errp)
--- a/backends/rng.c
+++ b/backends/rng.c
@@ -20,18 +20,20 @@ void rng_backend_request_entropy(RngBackend *s, size_t size,
                                 void *opaque)
 {
    RngBackendClass *k = RNG_BACKEND_GET_CLASS(s);
+    RngRequest *req;

    if (k->request_entropy) {
-        k->request_entropy(s, size, receive_entropy, opaque);
-    }
-}
+        req = g_malloc(sizeof(*req));

-void rng_backend_cancel_requests(RngBackend *s)
-{
-    RngBackendClass *k = RNG_BACKEND_GET_CLASS(s);
+        req->offset = 0;
+        req->size = size;
+        req->receive_entropy = receive_entropy;
+        req->opaque = opaque;
+        req->data = g_malloc(req->size);

-    if (k->cancel_requests) {
-        k->cancel_requests(s);
+        k->request_entropy(s, req);
+
+        QSIMPLEQ_INSERT_TAIL(&s->requests, req, next);
    }
 }

@@ -73,14 +75,48 @@ static void rng_backend_prop_set_opened(Object *obj, bool value, Error **errp)
    s->opened = true;
 }

+static void rng_backend_free_request(RngRequest *req)
+{
+    g_free(req->data);
+    g_free(req);
+}
+
+static void rng_backend_free_requests(RngBackend *s)
+{
+    RngRequest *req, *next;
+
+    QSIMPLEQ_FOREACH_SAFE(req, &s->requests, next, next) {
+        rng_backend_free_request(req);
+    }
+
+    QSIMPLEQ_INIT(&s->requests);
+}
+
+void rng_backend_finalize_request(RngBackend *s, RngRequest *req)
+{
+    QSIMPLEQ_REMOVE(&s->requests, req, RngRequest, next);
+    rng_backend_free_request(req);
+}
+
 static void rng_backend_init(Object *obj)
 {
+    RngBackend *s = RNG_BACKEND(obj);
+
+    QSIMPLEQ_INIT(&s->requests);
+
    object_property_add_bool(obj, "opened",
                             rng_backend_prop_get_opened,
                             rng_backend_prop_set_opened,
                             NULL);
 }

+static void rng_backend_finalize(Object *obj)
+{
+    RngBackend *s = RNG_BACKEND(obj);
+
+    rng_backend_free_requests(s);
+}
+
 static void rng_backend_class_init(ObjectClass *oc, void *data)
 {
    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
@@ -93,6 +129,7 @@ static const TypeInfo rng_backend_info = {
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(RngBackend),
    .instance_init = rng_backend_init,
+    .instance_finalize = rng_backend_finalize,
    .class_size = sizeof(RngBackendClass),
    .class_init = rng_backend_class_init,
    .abstract = true,
--- a/block.c
+++ b/block.c
@@ -53,27 +53,8 @@
 #include <windows.h>
 #endif

-/**
- * A BdrvDirtyBitmap can be in three possible states:
- * (1) successor is NULL and disabled is false: full r/w mode
- * (2) successor is NULL and disabled is true: read only mode ("disabled")
- * (3) successor is set: frozen mode.
- *     A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set,
- *     or enabled. A frozen bitmap can only abdicate() or reclaim().
- */
-struct BdrvDirtyBitmap {
-    HBitmap *bitmap;            /* Dirty sector bitmap implementation */
-    BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
-    char *name;                 /* Optional non-empty unique ID */
-    int64_t size;               /* Size of the bitmap (Number of sectors) */
-    bool disabled;              /* Bitmap is read-only */
-    QLIST_ENTRY(BdrvDirtyBitmap) list;
-};
-
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

-struct BdrvStates bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states);
-
 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

@@ -88,9 +69,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
                             BlockDriverState *parent,
                             const BdrvChildRole *child_role, Error **errp);

-static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
-static void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs);
-
 /* If non-zero, use only whitelisted block drivers */
 static int use_bdrv_whitelist;

@@ -246,10 +224,7 @@ void bdrv_register(BlockDriver *bdrv)

 BlockDriverState *bdrv_new_root(void)
 {
-    BlockDriverState *bs = bdrv_new();
-
-    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
-    return bs;
+    return bdrv_new();
 }

 BlockDriverState *bdrv_new(void)
@@ -687,13 +662,19 @@ int bdrv_parse_cache_flags(const char *mode, int *flags)
 }

 /*
- * Returns the flags that a temporary snapshot should get, based on the
- * originally requested flags (the originally requested image will have flags
- * like a backing file)
+ * Returns the options and flags that a temporary snapshot should get, based on
+ * the originally requested flags (the originally requested image will have
+ * flags like a backing file)
 */
-static int bdrv_temp_snapshot_flags(int flags)
+static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
+                                       int parent_flags, QDict *parent_options)
 {
-    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
+    *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
+
+    /* For temporary files, unconditional cache=unsafe is fine */
+    qdict_set_default_str(child_options, BDRV_OPT_CACHE_WB, "on");
+    qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
+    qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
 }

 /*
@@ -1194,10 +1175,9 @@ static int bdrv_fill_options(QDict **options, const char *filename,
    return 0;
 }

-static BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
-                                    BlockDriverState *child_bs,
-                                    const char *child_name,
-                                    const BdrvChildRole *child_role)
+BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
+                                  const char *child_name,
+                                  const BdrvChildRole *child_role)
 {
    BdrvChild *child = g_new(BdrvChild, 1);
    *child = (BdrvChild) {
@@ -1206,24 +1186,43 @@ static BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
        .role   = child_role,
    };

-    QLIST_INSERT_HEAD(&parent_bs->children, child, next);
    QLIST_INSERT_HEAD(&child_bs->parents, child, next_parent);

    return child;
 }

+static BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
+                                    BlockDriverState *child_bs,
+                                    const char *child_name,
+                                    const BdrvChildRole *child_role)
+{
+    BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role);
+    QLIST_INSERT_HEAD(&parent_bs->children, child, next);
+    return child;
+}
+
 static void bdrv_detach_child(BdrvChild *child)
 {
-    QLIST_REMOVE(child, next);
+    if (child->next.le_prev) {
+        QLIST_REMOVE(child, next);
+        child->next.le_prev = NULL;
+    }
    QLIST_REMOVE(child, next_parent);
    g_free(child->name);
    g_free(child);
 }

-void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
+void bdrv_root_unref_child(BdrvChild *child)
 {
    BlockDriverState *child_bs;

+    child_bs = child->bs;
+    bdrv_detach_child(child);
+    bdrv_unref(child_bs);
+}
+
+void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
+{
    if (child == NULL) {
        return;
    }
@@ -1232,9 +1231,7 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
        child->bs->inherits_from = NULL;
    }

-    child_bs = child->bs;
-    bdrv_detach_child(child);
-    bdrv_unref(child_bs);
+    bdrv_root_unref_child(child);
 }

 /*
@@ -1424,13 +1421,13 @@ done:
    return c;
 }

-int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
+static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
+                                     QDict *snapshot_options, Error **errp)
 {
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
-    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;
@@ -1464,8 +1461,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
        goto out;
    }

-    /* Prepare a new options QDict for the temporary file */
-    snapshot_options = qdict_new();
+    /* Prepare options QDict for the temporary file */
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
@@ -1477,6 +1473,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &local_err);
+    snapshot_options = NULL;
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
@@ -1485,6 +1482,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
    bdrv_append(bs_snapshot, bs);

 out:
+    QDECREF(snapshot_options);
    g_free(tmp_filename);
    return ret;
 }
@@ -1516,6 +1514,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
    const char *drvname;
    const char *backing;
    Error *local_err = NULL;
+    QDict *snapshot_options = NULL;
    int snapshot_flags = 0;

    assert(pbs);
@@ -1607,7 +1606,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
-            snapshot_flags = bdrv_temp_snapshot_flags(flags);
+            snapshot_options = qdict_new();
+            bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
+                                       flags, options);
            bdrv_backing_options(&flags, options, flags, options);
        }

@@ -1681,9 +1682,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
-            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
-                       "support the option '%s'", drv->format_name,
-                       bdrv_get_device_name(bs), entry->key);
+            error_setg(errp,
+                       "Block format '%s' does not support the option '%s'",
+                       drv->format_name, entry->key);
        }

        ret = -EINVAL;
@@ -1709,7 +1710,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
-        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
+        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options,
+                                        &local_err);
+        snapshot_options = NULL;
        if (local_err) {
            goto close_and_fail;
        }
@@ -1721,6 +1724,7 @@ fail:
    if (file != NULL) {
        bdrv_unref_child(bs, file);
    }
+    QDECREF(snapshot_options);
    QDECREF(bs->explicit_options);
    QDECREF(bs->options);
    QDECREF(options);
@@ -1743,6 +1747,7 @@ close_and_fail:
    } else {
        bdrv_unref(bs);
    }
+    QDECREF(snapshot_options);
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
@@ -2236,26 +2241,10 @@ void bdrv_close_all(void)
    }
 }

-/* Note that bs->device_list.tqe_prev is initially null,
- * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
- * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
- * resetting it to null on remove.  */
-void bdrv_device_remove(BlockDriverState *bs)
-{
-    QTAILQ_REMOVE(&bdrv_states, bs, device_list);
-    bs->device_list.tqe_prev = NULL;
-}
-
-/* make a BlockDriverState anonymous by removing from bdrv_state and
- * graph_bdrv_state list.
-   Also, NULL terminate the device_name to prevent double remove */
+/* make a BlockDriverState anonymous by removing from graph_bdrv_state list.
+ * Also, NULL terminate the device_name to prevent double remove */
 void bdrv_make_anon(BlockDriverState *bs)
 {
-    /* Take care to remove bs from bdrv_states only when it's actually
-     * in it. */
-    if (bs->device_list.tqe_prev) {
-        bdrv_device_remove(bs);
-    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
@@ -2282,6 +2271,14 @@ static void change_parent_backing_link(BlockDriverState *from,
 {
    BdrvChild *c, *next;

+    if (from->blk) {
+        /* FIXME We bypass blk_set_bs(), so we need to make these updates
+         * manually. The root problem is not in this change function, but the
+         * existence of BlockDriverState.blk. */
+        to->blk = from->blk;
+        from->blk = NULL;
+    }
+
    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
        assert(c->role != &child_backing);
        c->bs = to;
@@ -2290,13 +2287,6 @@ static void change_parent_backing_link(BlockDriverState *from,
        bdrv_ref(to);
        bdrv_unref(from);
    }
-    if (from->blk) {
-        blk_set_bs(from->blk, to);
-        if (!to->device_list.tqe_prev) {
-            QTAILQ_INSERT_BEFORE(from, to, device_list);
-        }
-        bdrv_device_remove(from);
-    }
 }

 static void swap_feature_fields(BlockDriverState *bs_top,
@@ -2527,26 +2517,6 @@ ro_cleanup:
    return ret;
 }

-int bdrv_commit_all(void)
-{
-    BlockDriverState *bs;
-
-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
-        aio_context_acquire(aio_context);
-        if (bs->drv && bs->backing) {
-            int ret = bdrv_commit(bs);
-            if (ret < 0) {
-                aio_context_release(aio_context);
-                return ret;
-            }
-        }
-        aio_context_release(aio_context);
-    }
-    return 0;
-}
-
 /*
 * Return values:
 * 0        - success
@@ -2995,12 +2965,23 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
    return QTAILQ_NEXT(bs, node_list);
 }

+/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
+ * the monitor or attached to a BlockBackend */
 BlockDriverState *bdrv_next(BlockDriverState *bs)
 {
-    if (!bs) {
-        return QTAILQ_FIRST(&bdrv_states);
+    if (!bs || bs->blk) {
+        bs = blk_next_root_bs(bs);
+        if (bs) {
+            return bs;
+        }
    }
-    return QTAILQ_NEXT(bs, device_list);
+
+    /* Ignore all BDSs that are attached to a BlockBackend here; they have been
+     * handled by the above block already */
+    do {
+        bs = bdrv_next_monitor_owned(bs);
+    } while (bs && bs->blk);
+    return bs;
 }

 const char *bdrv_get_node_name(const BlockDriverState *bs)
@@ -3308,10 +3289,10 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)

 void bdrv_invalidate_cache_all(Error **errp)
 {
-    BlockDriverState *bs;
+    BlockDriverState *bs = NULL;
    Error *local_err = NULL;

-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+    while ((bs = bdrv_next(bs)) != NULL) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
@@ -3341,10 +3322,10 @@ static int bdrv_inactivate(BlockDriverState *bs)

 int bdrv_inactivate_all(void)
 {
-    BlockDriverState *bs;
+    BlockDriverState *bs = NULL;
    int ret;

-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+    while ((bs = bdrv_next(bs)) != NULL) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
@@ -3431,346 +3412,6 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked)
    }
 }

-BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
-{
-    BdrvDirtyBitmap *bm;
-
-    assert(name);
-    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
-        if (bm->name && !strcmp(name, bm->name)) {
-            return bm;
-        }
-    }
-    return NULL;
-}
-
-void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
-{
-    assert(!bdrv_dirty_bitmap_frozen(bitmap));
-    g_free(bitmap->name);
-    bitmap->name = NULL;
-}
-
-BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
-                                          uint32_t granularity,
-                                          const char *name,
-                                          Error **errp)
-{
-    int64_t bitmap_size;
-    BdrvDirtyBitmap *bitmap;
-    uint32_t sector_granularity;
-
-    assert((granularity & (granularity - 1)) == 0);
-
-    if (name && bdrv_find_dirty_bitmap(bs, name)) {
-        error_setg(errp, "Bitmap already exists: %s", name);
-        return NULL;
-    }
-    sector_granularity = granularity >> BDRV_SECTOR_BITS;
-    assert(sector_granularity);
-    bitmap_size = bdrv_nb_sectors(bs);
-    if (bitmap_size < 0) {
-        error_setg_errno(errp, -bitmap_size, "could not get length of device");
-        errno = -bitmap_size;
-        return NULL;
-    }
-    bitmap = g_new0(BdrvDirtyBitmap, 1);
-    bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity));
-    bitmap->size = bitmap_size;
-    bitmap->name = g_strdup(name);
-    bitmap->disabled = false;
-    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
-    return bitmap;
-}
-
-bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
-{
-    return bitmap->successor;
-}
-
-bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
-{
-    return !(bitmap->disabled || bitmap->successor);
-}
-
-DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap)
-{
-    if (bdrv_dirty_bitmap_frozen(bitmap)) {
-        return DIRTY_BITMAP_STATUS_FROZEN;
-    } else if (!bdrv_dirty_bitmap_enabled(bitmap)) {
-        return DIRTY_BITMAP_STATUS_DISABLED;
-    } else {
-        return DIRTY_BITMAP_STATUS_ACTIVE;
-    }
-}
-
-/**
- * Create a successor bitmap destined to replace this bitmap after an operation.
- * Requires that the bitmap is not frozen and has no successor.
- */
-int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
-                                       BdrvDirtyBitmap *bitmap, Error **errp)
-{
-    uint64_t granularity;
-    BdrvDirtyBitmap *child;
-
-    if (bdrv_dirty_bitmap_frozen(bitmap)) {
-        error_setg(errp, "Cannot create a successor for a bitmap that is "
-                   "currently frozen");
-        return -1;
-    }
-    assert(!bitmap->successor);
-
-    /* Create an anonymous successor */
-    granularity = bdrv_dirty_bitmap_granularity(bitmap);
-    child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
-    if (!child) {
-        return -1;
-    }
-
-    /* Successor will be on or off based on our current state. */
-    child->disabled = bitmap->disabled;
-
-    /* Install the successor and freeze the parent */
-    bitmap->successor = child;
-    return 0;
-}
-
-/**
- * For a bitmap with a successor, yield our name to the successor,
- * delete the old bitmap, and return a handle to the new bitmap.
- */
-BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
-                                            BdrvDirtyBitmap *bitmap,
-                                            Error **errp)
-{
-    char *name;
-    BdrvDirtyBitmap *successor = bitmap->successor;
-
-    if (successor == NULL) {
-        error_setg(errp, "Cannot relinquish control if "
-                   "there's no successor present");
-        return NULL;
-    }
-
-    name = bitmap->name;
-    bitmap->name = NULL;
-    successor->name = name;
-    bitmap->successor = NULL;
-    bdrv_release_dirty_bitmap(bs, bitmap);
-
-    return successor;
-}
-
-/**
- * In cases of failure where we can no longer safely delete the parent,
- * we may wish to re-join the parent and child/successor.
- * The merged parent will be un-frozen, but not explicitly re-enabled.
- */
-BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
-                                           BdrvDirtyBitmap *parent,
-                                           Error **errp)
-{
-    BdrvDirtyBitmap *successor = parent->successor;
-
-    if (!successor) {
-        error_setg(errp, "Cannot reclaim a successor when none is present");
-        return NULL;
-    }
-
-    if (!hbitmap_merge(parent->bitmap, successor->bitmap)) {
-        error_setg(errp, "Merging of parent and successor bitmap failed");
-        return NULL;
-    }
-    bdrv_release_dirty_bitmap(bs, successor);
-    parent->successor = NULL;
-
-    return parent;
-}
-
-/**
- * Truncates _all_ bitmaps attached to a BDS.
- */
-static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
-{
-    BdrvDirtyBitmap *bitmap;
-    uint64_t size = bdrv_nb_sectors(bs);
-
-    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
-        assert(!bdrv_dirty_bitmap_frozen(bitmap));
-        hbitmap_truncate(bitmap->bitmap, size);
-        bitmap->size = size;
-    }
-}
-
-static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
-                                                  BdrvDirtyBitmap *bitmap,
-                                                  bool only_named)
-{
-    BdrvDirtyBitmap *bm, *next;
-    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
-        if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
-            assert(!bdrv_dirty_bitmap_frozen(bm));
-            QLIST_REMOVE(bm, list);
-            hbitmap_free(bm->bitmap);
-            g_free(bm->name);
-            g_free(bm);
-
-            if (bitmap) {
-                return;
-            }
-        }
-    }
-}
-
-void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
-{
-    bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false);
-}
-
-/**
- * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()).
- * There must not be any frozen bitmaps attached.
- */
-static void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs)
-{
-    bdrv_do_release_matching_dirty_bitmap(bs, NULL, true);
-}
-
-void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
-{
-    assert(!bdrv_dirty_bitmap_frozen(bitmap));
-    bitmap->disabled = true;
-}
-
-void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
-{
-    assert(!bdrv_dirty_bitmap_frozen(bitmap));
-    bitmap->disabled = false;
-}
-
-BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
-{
-    BdrvDirtyBitmap *bm;
-    BlockDirtyInfoList *list = NULL;
-    BlockDirtyInfoList **plist = &list;
-
-    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
-        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
-        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
-        info->count = bdrv_get_dirty_count(bm);
-        info->granularity = bdrv_dirty_bitmap_granularity(bm);
-        info->has_name = !!bm->name;
-        info->name = g_strdup(bm->name);
-        info->status = bdrv_dirty_bitmap_status(bm);
-        entry->value = info;
-        *plist = entry;
-        plist = &entry->next;
-    }
-
-    return list;
-}
-
-int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
-{
-    if (bitmap) {
-        return hbitmap_get(bitmap->bitmap, sector);
-    } else {
-        return 0;
-    }
-}
-
-/**
- * Chooses a default granularity based on the existing cluster size,
- * but clamped between [4K, 64K]. Defaults to 64K in the case that there
- * is no cluster size information available.
- */
-uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs)
-{
-    BlockDriverInfo bdi;
-    uint32_t granularity;
-
-    if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) {
-        granularity = MAX(4096, bdi.cluster_size);
-        granularity = MIN(65536, granularity);
-    } else {
-        granularity = 65536;
-    }
-
-    return granularity;
-}
-
-uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
-{
-    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
-}
-
-void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
-{
-    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
-}
-
-void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
-                           int64_t cur_sector, int nr_sectors)
-{
-    assert(bdrv_dirty_bitmap_enabled(bitmap));
-    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
-}
-
-void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
-                             int64_t cur_sector, int nr_sectors)
-{
-    assert(bdrv_dirty_bitmap_enabled(bitmap));
-    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
-}
-
-void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out)
-{
-    assert(bdrv_dirty_bitmap_enabled(bitmap));
-    if (!out) {
-        hbitmap_reset_all(bitmap->bitmap);
-    } else {
-        HBitmap *backup = bitmap->bitmap;
-        bitmap->bitmap = hbitmap_alloc(bitmap->size,
-                                       hbitmap_granularity(backup));
-        *out = backup;
-    }
-}
-
-void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
-{
-    HBitmap *tmp = bitmap->bitmap;
-    assert(bdrv_dirty_bitmap_enabled(bitmap));
-    bitmap->bitmap = in;
-    hbitmap_free(tmp);
-}
-
-void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
-                    int nr_sectors)
-{
-    BdrvDirtyBitmap *bitmap;
-    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
-        if (!bdrv_dirty_bitmap_enabled(bitmap)) {
-            continue;
-        }
-        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
-    }
-}
-
-/**
- * Advance an HBitmapIter to an arbitrary offset.
- */
-void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
-{
-    assert(hbi->hb);
-    hbitmap_iter_init(hbi, hbi->hb, offset);
-}
-
-int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
-{
-    return hbitmap_count(bitmap->bitmap);
-}
-
 /* Get a reference to bs */
 void bdrv_ref(BlockDriverState *bs)
 {
@@ -4190,10 +3831,10 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
 */
 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
 {
-    BlockDriverState *bs;
+    BlockDriverState *bs = NULL;

    /* walk down the bs forest recursively */
-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+    while ((bs = bdrv_next(bs)) != NULL) {
        bool perm;

        /* try to recurse in this top level bs */
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -20,7 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
 block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
-block-obj-y += accounting.o
+block-obj-y += accounting.o dirty-bitmap.o
 block-obj-y += write-threshold.o

 common-obj-y += stream.o
--- a/block/backup.c
+++ b/block/backup.c
@@ -20,11 +20,9 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "sysemu/block-backend.h"
+#include "qemu/bitmap.h"

-#define BACKUP_CLUSTER_BITS 16
-#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
-#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
-
+#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
 #define SLICE_TIME 100000000ULL /* ns */

 typedef struct CowRequest {
@@ -45,10 +43,17 @@ typedef struct BackupBlockJob {
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
-    HBitmap *bitmap;
+    unsigned long *done_bitmap;
+    int64_t cluster_size;
    QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;

+/* Convert the job's cluster size from bytes to a count of sectors. */
+static inline int64_t cluster_size_sectors(BackupBlockJob *job)
+{
+    return job->cluster_size / BDRV_SECTOR_SIZE;
+}
+
 /* See if in-flight requests overlap and wait for them to complete */
 static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
@@ -97,13 +102,14 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
+    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int64_t start, end;
    int n;

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

-    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
-    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);
+    start = sector_num / sectors_per_cluster;
+    end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster);

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

@@ -111,19 +117,19 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start++) {
-        if (hbitmap_get(job->bitmap, start)) {
+        if (test_bit(start, job->done_bitmap)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, start);

-        n = MIN(BACKUP_SECTORS_PER_CLUSTER,
+        n = MIN(sectors_per_cluster,
                job->common.len / BDRV_SECTOR_SIZE -
-                start * BACKUP_SECTORS_PER_CLUSTER);
+                start * sectors_per_cluster);

        if (!bounce_buffer) {
-            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
+            bounce_buffer = qemu_blockalign(bs, job->cluster_size);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
@@ -131,10 +137,10 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,

        if (is_write_notifier) {
            ret = bdrv_co_readv_no_serialising(bs,
-                                           start * BACKUP_SECTORS_PER_CLUSTER,
+                                           start * sectors_per_cluster,
                                           n, &bounce_qiov);
        } else {
-            ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
+            ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
                                &bounce_qiov);
        }
        if (ret < 0) {
@@ -147,11 +153,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target,
-                                       start * BACKUP_SECTORS_PER_CLUSTER,
+                                       start * sectors_per_cluster,
                                       n, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = bdrv_co_writev(job->target,
-                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
+                                 start * sectors_per_cluster, n,
                                 &bounce_qiov);
        }
        if (ret < 0) {
@@ -162,7 +168,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
            goto out;
        }

-        hbitmap_set(job->bitmap, start, 1);
+        set_bit(start, job->done_bitmap);

        /* Publish progress, guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
@@ -322,21 +328,22 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
    int64_t cluster;
    int64_t end;
    int64_t last_cluster = -1;
+    int64_t sectors_per_cluster = cluster_size_sectors(job);
    BlockDriverState *bs = job->common.bs;
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
-    clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1);
+    clusters_per_iter = MAX((granularity / job->cluster_size), 1);
    bdrv_dirty_iter_init(job->sync_bitmap, &hbi);

    /* Find the next dirty sector(s) */
    while ((sector = hbitmap_iter_next(&hbi)) != -1) {
-        cluster = sector / BACKUP_SECTORS_PER_CLUSTER;
+        cluster = sector / sectors_per_cluster;

        /* Fake progress updates for any clusters we skipped */
        if (cluster != last_cluster + 1) {
            job->common.offset += ((cluster - last_cluster - 1) *
-                                   BACKUP_CLUSTER_SIZE);
+                                   job->cluster_size);
        }

        for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
@@ -344,8 +351,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                if (yield_and_check(job)) {
                    return ret;
                }
-                ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
-                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
+                ret = backup_do_cow(bs, cluster * sectors_per_cluster,
+                                    sectors_per_cluster, &error_is_read,
                                    false);
                if ((ret < 0) &&
                    backup_error_action(job, error_is_read, -ret) ==
@@ -357,17 +364,17 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)

        /* If the bitmap granularity is smaller than the backup granularity,
         * we need to advance the iterator pointer to the next cluster. */
-        if (granularity < BACKUP_CLUSTER_SIZE) {
-            bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER);
+        if (granularity < job->cluster_size) {
+            bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster);
        }

        last_cluster = cluster - 1;
    }

    /* Play some final catchup with the progress meter */
-    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
+    end = DIV_ROUND_UP(job->common.len, job->cluster_size);
    if (last_cluster + 1 < end) {
-        job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE);
+        job->common.offset += ((end - last_cluster - 1) * job->cluster_size);
    }

    return ret;
@@ -384,15 +391,16 @@ static void coroutine_fn backup_run(void *opaque)
        .notify = backup_before_write_notify,
    };
    int64_t start, end;
+    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    start = 0;
-    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
+    end = DIV_ROUND_UP(job->common.len, job->cluster_size);

-    job->bitmap = hbitmap_alloc(end, 0);
+    job->done_bitmap = bitmap_new(end);

    bdrv_set_enable_write_cache(target, true);
    if (target->blk) {
@@ -427,7 +435,7 @@ static void coroutine_fn backup_run(void *opaque)
                /* Check to see if these blocks are already in the
                 * backing file. */

-                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
+                for (i = 0; i < sectors_per_cluster;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are are all in the same state.
@@ -436,8 +444,8 @@ static void coroutine_fn backup_run(void *opaque)
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs,
-                                start * BACKUP_SECTORS_PER_CLUSTER + i,
-                                BACKUP_SECTORS_PER_CLUSTER - i, &n);
+                                start * sectors_per_cluster + i,
+                                sectors_per_cluster - i, &n);
                    i += n;

                    if (alloced == 1 || n == 0) {
@@ -452,8 +460,8 @@ static void coroutine_fn backup_run(void *opaque)
                }
            }
            /* FULL sync mode we copy the whole drive. */
-            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
-                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false);
+            ret = backup_do_cow(bs, start * sectors_per_cluster,
+                                sectors_per_cluster, &error_is_read, false);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
@@ -473,7 +481,7 @@ static void coroutine_fn backup_run(void *opaque)
    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
-    hbitmap_free(job->bitmap);
+    g_free(job->done_bitmap);

    if (target->blk) {
        blk_iostatus_disable(target->blk);
@@ -494,6 +502,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  BlockJobTxn *txn, Error **errp)
 {
    int64_t len;
+    BlockDriverInfo bdi;
+    int ret;

    assert(bs);
    assert(target);
@@ -563,14 +573,32 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
        goto error;
    }

-    bdrv_op_block_all(target, job->common.blocker);
-
    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
+
+    /* If there is no backing file on the target, we cannot rely on COW if our
+     * backup cluster size is smaller than the target cluster size. Even for
+     * targets with a backing file, try to avoid COW if possible. */
+    ret = bdrv_get_info(job->target, &bdi);
+    if (ret < 0 && !target->backing) {
+        error_setg_errno(errp, -ret,
+            "Couldn't determine the cluster size of the target image, "
+            "which has no backing file");
+        error_append_hint(errp,
+            "Aborting, since this may create an unusable destination image\n");
+        goto error;
+    } else if (ret < 0 && target->backing) {
+        /* Not fatal; just trudge on ahead. */
+        job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT;
+    } else {
+        job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
+    }
+
+    bdrv_op_block_all(target, job->common.blocker);
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
    block_job_txn_add_job(txn, &job->common);
--- a/block/block-backend.c
+++ b/block/block-backend.c
--- a/block/curl.c
+++ b/block/curl.c
@@ -27,6 +27,7 @@
 #include "block/block_int.h"
 #include "qapi/qmp/qbool.h"
 #include "qapi/qmp/qstring.h"
+#include "crypto/secret.h"
 #include <curl/curl.h>

 // #define DEBUG_CURL
@@ -78,6 +79,10 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_BLOCK_OPT_SSLVERIFY "sslverify"
 #define CURL_BLOCK_OPT_TIMEOUT "timeout"
 #define CURL_BLOCK_OPT_COOKIE    "cookie"
+#define CURL_BLOCK_OPT_USERNAME "username"
+#define CURL_BLOCK_OPT_PASSWORD_SECRET "password-secret"
+#define CURL_BLOCK_OPT_PROXY_USERNAME "proxy-username"
+#define CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET "proxy-password-secret"

 struct BDRVCURLState;

@@ -120,6 +125,10 @@ typedef struct BDRVCURLState {
    char *cookie;
    bool accept_range;
    AioContext *aio_context;
+    char *username;
+    char *password;
+    char *proxyusername;
+    char *proxypassword;
 } BDRVCURLState;

 static void curl_clean_state(CURLState *s);
@@ -419,6 +428,21 @@ static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s)
        curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg);
        curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1);

+        if (s->username) {
+            curl_easy_setopt(state->curl, CURLOPT_USERNAME, s->username);
+        }
+        if (s->password) {
+            curl_easy_setopt(state->curl, CURLOPT_PASSWORD, s->password);
+        }
+        if (s->proxyusername) {
+            curl_easy_setopt(state->curl,
+                             CURLOPT_PROXYUSERNAME, s->proxyusername);
+        }
+        if (s->proxypassword) {
+            curl_easy_setopt(state->curl,
+                             CURLOPT_PROXYPASSWORD, s->proxypassword);
+        }
+
        /* Restrict supported protocols to avoid security issues in the more
         * obscure protocols.  For example, do not allow POP3/SMTP/IMAP see
         * CVE-2013-0249.
@@ -525,10 +549,31 @@ static QemuOptsList runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "Pass the cookie or list of cookies with each request"
        },
+        {
+            .name = CURL_BLOCK_OPT_USERNAME,
+            .type = QEMU_OPT_STRING,
+            .help = "Username for HTTP auth"
+        },
+        {
+            .name = CURL_BLOCK_OPT_PASSWORD_SECRET,
+            .type = QEMU_OPT_STRING,
+            .help = "ID of secret used as password for HTTP auth",
+        },
+        {
+            .name = CURL_BLOCK_OPT_PROXY_USERNAME,
+            .type = QEMU_OPT_STRING,
+            .help = "Username for HTTP proxy auth"
+        },
+        {
+            .name = CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET,
+            .type = QEMU_OPT_STRING,
+            .help = "ID of secret used as password for HTTP proxy auth",
+        },
        { /* end of list */ }
    },
 };

+
 static int curl_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
@@ -539,6 +584,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    const char *file;
    const char *cookie;
    double d;
+    const char *secretid;

    static int inited = 0;

@@ -580,6 +626,26 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
        goto out_noclean;
    }

+    s->username = g_strdup(qemu_opt_get(opts, CURL_BLOCK_OPT_USERNAME));
+    secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PASSWORD_SECRET);
+
+    if (secretid) {
+        s->password = qcrypto_secret_lookup_as_utf8(secretid, errp);
+        if (!s->password) {
+            goto out_noclean;
+        }
+    }
+
+    s->proxyusername = g_strdup(
+        qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_USERNAME));
+    secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET);
+    if (secretid) {
+        s->proxypassword = qcrypto_secret_lookup_as_utf8(secretid, errp);
+        if (!s->proxypassword) {
+            goto out_noclean;
+        }
+    }
+
    if (!inited) {
        curl_global_init(CURL_GLOBAL_ALL);
        inited = 1;
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -0,0 +1,387 @@
+/*
+ * Block Dirty Bitmap
+ *
+ * Copyright (c) 2016 Red Hat. Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "config-host.h"
+#include "qemu-common.h"
+#include "trace.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+
+/**
+ * A BdrvDirtyBitmap can be in three possible states:
+ * (1) successor is NULL and disabled is false: full r/w mode
+ * (2) successor is NULL and disabled is true: read only mode ("disabled")
+ * (3) successor is set: frozen mode.
+ *     A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set,
+ *     or enabled. A frozen bitmap can only abdicate() or reclaim().
+ */
+struct BdrvDirtyBitmap {
+    HBitmap *bitmap;            /* Dirty sector bitmap implementation */
+    BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
+    char *name;                 /* Optional non-empty unique ID, or NULL */
+    int64_t size;               /* Size of the bitmap (number of sectors) */
+    bool disabled;              /* Bitmap is read-only */
+    QLIST_ENTRY(BdrvDirtyBitmap) list; /* Link in bs->dirty_bitmaps */
+};
+
+BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
+{
+    BdrvDirtyBitmap *bm;
+
+    assert(name); /* a NULL name is a caller bug */
+    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
+        if (bm->name && !strcmp(name, bm->name)) { /* skips anonymous ones */
+            return bm;
+        }
+    }
+    return NULL; /* no bitmap named @name is attached to @bs */
+}
+
+void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
+{
+    assert(!bdrv_dirty_bitmap_frozen(bitmap)); /* frozen: name is locked */
+    g_free(bitmap->name);
+    bitmap->name = NULL; /* no longer reachable via lookup by name */
+}
+
+BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
+                                          uint32_t granularity,
+                                          const char *name,
+                                          Error **errp)
+{
+    int64_t bitmap_size;
+    BdrvDirtyBitmap *bitmap;
+    uint32_t sector_granularity;
+
+    assert((granularity & (granularity - 1)) == 0); /* power of two */
+
+    if (name && bdrv_find_dirty_bitmap(bs, name)) { /* unique per @bs */
+        error_setg(errp, "Bitmap already exists: %s", name);
+        return NULL;
+    }
+    sector_granularity = granularity >> BDRV_SECTOR_BITS; /* in sectors */
+    assert(sector_granularity); /* must cover at least one sector */
+    bitmap_size = bdrv_nb_sectors(bs);
+    if (bitmap_size < 0) { /* treated as a negative errno value */
+        error_setg_errno(errp, -bitmap_size, "could not get length of device");
+        errno = -bitmap_size;
+        return NULL;
+    }
+    bitmap = g_new0(BdrvDirtyBitmap, 1);
+    bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity));
+    bitmap->size = bitmap_size;
+    bitmap->name = g_strdup(name); /* own copy; stays NULL when anonymous */
+    bitmap->disabled = false; /* new bitmaps start out enabled */
+    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
+    return bitmap;
+}
+
+bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
+{
+    return bitmap->successor; /* frozen iff a successor is installed */
+}
+
+bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
+{
+    return !(bitmap->disabled || bitmap->successor); /* r/w state only */
+}
+
+DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap)
+{
+    if (bdrv_dirty_bitmap_frozen(bitmap)) { /* checked first: wins */
+        return DIRTY_BITMAP_STATUS_FROZEN;
+    } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { /* not frozen here */
+        return DIRTY_BITMAP_STATUS_DISABLED;
+    } else {
+        return DIRTY_BITMAP_STATUS_ACTIVE;
+    }
+}
+
+/**
+ * Create an anonymous successor destined to replace @bitmap; this freezes it.
+ * Returns 0, or -1 with @errp set if @bitmap is frozen or creation fails.
+ */
+int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
+                                       BdrvDirtyBitmap *bitmap, Error **errp)
+{
+    uint64_t granularity;
+    BdrvDirtyBitmap *child;
+
+    if (bdrv_dirty_bitmap_frozen(bitmap)) {
+        error_setg(errp, "Cannot create a successor for a bitmap that is "
+                   "currently frozen");
+        return -1;
+    }
+    assert(!bitmap->successor); /* follows from the frozen check above */
+
+    /* Create an anonymous successor */
+    granularity = bdrv_dirty_bitmap_granularity(bitmap);
+    child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
+    if (!child) {
+        return -1; /* errp was set by bdrv_create_dirty_bitmap() */
+    }
+
+    /* Successor will be on or off based on our current state. */
+    child->disabled = bitmap->disabled;
+
+    /* Install the successor and freeze the parent */
+    bitmap->successor = child;
+    return 0;
+}
+
+/**
+ * Hand @bitmap's name over to its successor and release @bitmap itself.
+ * Returns the successor, or NULL with @errp set if there is no successor.
+ */
+BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
+                                            BdrvDirtyBitmap *bitmap,
+                                            Error **errp)
+{
+    char *name;
+    BdrvDirtyBitmap *successor = bitmap->successor;
+
+    if (successor == NULL) {
+        error_setg(errp, "Cannot relinquish control if "
+                   "there's no successor present");
+        return NULL;
+    }
+
+    name = bitmap->name;
+    bitmap->name = NULL; /* the string now belongs to the successor */
+    successor->name = name;
+    bitmap->successor = NULL; /* unfreeze so the release assert holds */
+    bdrv_release_dirty_bitmap(bs, bitmap);
+
+    return successor;
+}
+
+/**
+ * In cases of failure where we can no longer safely delete the parent,
+ * merge the successor back into it and release the successor. The parent is
+ * un-frozen, but not explicitly re-enabled. Returns @parent, or NULL.
+ */
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
+                                           BdrvDirtyBitmap *parent,
+                                           Error **errp)
+{
+    BdrvDirtyBitmap *successor = parent->successor;
+
+    if (!successor) {
+        error_setg(errp, "Cannot reclaim a successor when none is present");
+        return NULL;
+    }
+
+    if (!hbitmap_merge(parent->bitmap, successor->bitmap)) {
+        error_setg(errp, "Merging of parent and successor bitmap failed");
+        return NULL; /* parent stays frozen; successor is kept */
+    }
+    bdrv_release_dirty_bitmap(bs, successor);
+    parent->successor = NULL; /* un-freeze the parent */
+
+    return parent;
+}
+
+/**
+ * Resizes _all_ bitmaps attached to @bs to its current number of sectors.
+ */
+void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
+{
+    BdrvDirtyBitmap *bitmap;
+    uint64_t size = bdrv_nb_sectors(bs); /* NOTE(review): errors unchecked */
+
+    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
+        assert(!bdrv_dirty_bitmap_frozen(bitmap)); /* no frozen ones allowed */
+        hbitmap_truncate(bitmap->bitmap, size);
+        bitmap->size = size;
+    }
+}
+
+static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
+                                                  BdrvDirtyBitmap *bitmap,
+                                                  bool only_named)
+{
+    BdrvDirtyBitmap *bm, *next; /* next: lets us remove bm while iterating */
+    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
+        if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
+            assert(!bdrv_dirty_bitmap_frozen(bm)); /* must not be frozen */
+            QLIST_REMOVE(bm, list);
+            hbitmap_free(bm->bitmap);
+            g_free(bm->name);
+            g_free(bm);
+
+            if (bitmap) { /* specific target released; NULL means all */
+                return;
+            }
+        }
+    }
+}
+
+void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
+{ /* NOTE: a NULL @bitmap would match, and release, every bitmap on @bs */
+    bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false);
+}
+
+/**
+ * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()).
+ * Anonymous bitmaps are kept. No named bitmap may be frozen (asserted).
+ */
+void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs)
+{
+    bdrv_do_release_matching_dirty_bitmap(bs, NULL, true);
+}
+
+void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+    assert(!bdrv_dirty_bitmap_frozen(bitmap)); /* frozen: state is locked */
+    bitmap->disabled = true; /* becomes read-only */
+}
+
+void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+    assert(!bdrv_dirty_bitmap_frozen(bitmap)); /* frozen: state is locked */
+    bitmap->disabled = false; /* back to full r/w mode */
+}
+
+BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
+{
+    BdrvDirtyBitmap *bm;
+    BlockDirtyInfoList *list = NULL;
+    BlockDirtyInfoList **plist = &list; /* where the next entry is linked */
+
+    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
+        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
+        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
+        info->count = bdrv_get_dirty_count(bm);
+        info->granularity = bdrv_dirty_bitmap_granularity(bm);
+        info->has_name = !!bm->name;
+        info->name = g_strdup(bm->name); /* copy; NULL for anonymous */
+        info->status = bdrv_dirty_bitmap_status(bm);
+        entry->value = info;
+        *plist = entry; /* append, keeping bs->dirty_bitmaps order */
+        plist = &entry->next;
+    }
+
+    return list; /* NULL when no bitmaps are attached */
+}
+
+int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+                   int64_t sector)
+{
+    if (bitmap) { /* @bs is not consulted */
+        return hbitmap_get(bitmap->bitmap, sector);
+    } else {
+        return 0; /* no bitmap: report the sector as clean */
+    }
+}
+
+/**
+ * Chooses a default granularity based on the existing cluster size,
+ * but clamped between [4K, 64K]. Defaults to 64K in the case that there
+ * is no cluster size information available.
+ */
+uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs)
+{
+    BlockDriverInfo bdi;
+    uint32_t granularity;
+
+    if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) {
+        granularity = MAX(4096, bdi.cluster_size); /* lower bound: 4K */
+        granularity = MIN(65536, granularity); /* upper bound: 64K */
+    } else {
+        granularity = 65536; /* no usable cluster size reported */
+    }
+
+    return granularity;
+}
+
+uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
+{ /* the HBitmap was created with ctz32(sectors); scale by sector size */
+    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
+}
+
+void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
+{
+    hbitmap_iter_init(hbi, bitmap->bitmap, 0); /* start from offset 0 */
+}
+
+void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+                           int64_t cur_sector, int nr_sectors)
+{
+    assert(bdrv_dirty_bitmap_enabled(bitmap)); /* not disabled, not frozen */
+    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
+}
+
+void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+                             int64_t cur_sector, int nr_sectors)
+{
+    assert(bdrv_dirty_bitmap_enabled(bitmap)); /* not disabled, not frozen */
+    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
+}
+
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out)
+{
+    assert(bdrv_dirty_bitmap_enabled(bitmap)); /* not disabled, not frozen */
+    if (!out) { /* no backup requested: clear in place */
+        hbitmap_reset_all(bitmap->bitmap);
+    } else {
+        HBitmap *backup = bitmap->bitmap;
+        bitmap->bitmap = hbitmap_alloc(bitmap->size,
+                                       hbitmap_granularity(backup));
+        *out = backup; /* old contents handed to the caller untouched */
+    }
+}
+
+void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
+{
+    HBitmap *tmp = bitmap->bitmap;
+    assert(bdrv_dirty_bitmap_enabled(bitmap)); /* not disabled, not frozen */
+    bitmap->bitmap = in; /* put the saved HBitmap back */
+    hbitmap_free(tmp); /* drop the HBitmap that was in use until now */
+}
+
+void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
+                    int nr_sectors)
+{
+    BdrvDirtyBitmap *bitmap;
+    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
+        if (!bdrv_dirty_bitmap_enabled(bitmap)) {
+            continue; /* disabled or frozen bitmaps are not updated */
+        }
+        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
+    }
+}
+
+/**
+ * Re-initialise @hbi on its current HBitmap, starting at @offset.
+ */
+void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
+{
+    assert(hbi->hb); /* the iterator must already refer to an HBitmap */
+    hbitmap_iter_init(hbi, hbi->hb, offset);
+}
+
+int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
+{
+    return hbitmap_count(bitmap->bitmap); /* presumably sectors; confirm */
+}
--- a/block/io.c
+++ b/block/io.c
@@ -44,12 +44,6 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
-static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
-    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
-    BdrvRequestFlags flags);
-static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
-    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
-    BdrvRequestFlags flags);
 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
@@ -621,20 +615,6 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num,
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
 }

-/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
-int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
-                          uint8_t *buf, int nb_sectors)
-{
-    bool enabled;
-    int ret;
-
-    enabled = bs->io_limits_enabled;
-    bs->io_limits_enabled = false;
-    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
-    bs->io_limits_enabled = enabled;
-    return ret;
-}
-
 /* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
@@ -939,7 +919,7 @@ out:
 /*
 * Handle a read request in coroutine context
 */
-static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
 {
@@ -1284,7 +1264,7 @@ fail:
 /*
 * Handle a write request in coroutine context
 */
-static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
 {
@@ -1445,26 +1425,6 @@ int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                             BDRV_REQ_ZERO_WRITE | flags);
 }

-int bdrv_flush_all(void)
-{
-    BlockDriverState *bs = NULL;
-    int result = 0;
-
-    while ((bs = bdrv_next(bs))) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-        int ret;
-
-        aio_context_acquire(aio_context);
-        ret = bdrv_flush(bs);
-        if (ret < 0 && !result) {
-            result = ret;
-        }
-        aio_context_release(aio_context);
-    }
-
-    return result;
-}
-
 typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -39,6 +39,7 @@
 #include "sysemu/sysemu.h"
 #include "qmp-commands.h"
 #include "qapi/qmp/qstring.h"
+#include "crypto/secret.h"

 #include <iscsi/iscsi.h>
 #include <iscsi/scsi-lowlevel.h>
@@ -1080,6 +1081,8 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target,
    QemuOpts *opts;
    const char *user = NULL;
    const char *password = NULL;
+    const char *secretid;
+    char *secret = NULL;

    list = qemu_find_opts("iscsi");
    if (!list) {
@@ -1099,8 +1102,20 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target,
        return;
    }

+    secretid = qemu_opt_get(opts, "password-secret");
    password = qemu_opt_get(opts, "password");
-    if (!password) {
+    if (secretid && password) {
+        error_setg(errp, "'password' and 'password-secret' properties are "
+                   "mutually exclusive");
+        return;
+    }
+    if (secretid) {
+        secret = qcrypto_secret_lookup_as_utf8(secretid, errp);
+        if (!secret) {
+            return;
+        }
+        password = secret;
+    } else if (!password) {
        error_setg(errp, "CHAP username specified but no password was given");
        return;
    }
@@ -1108,6 +1123,8 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target,
    if (iscsi_set_initiator_username_pwd(iscsi, user, password)) {
        error_setg(errp, "Failed to set initiator username and password");
    }
+
+    g_free(secret);
 }

 static void parse_header_digest(struct iscsi_context *iscsi, const char *target,
@@ -1857,6 +1874,11 @@ static QemuOptsList qemu_iscsi_opts = {
            .name = "password",
            .type = QEMU_OPT_STRING,
            .help = "password for CHAP authentication to target",
+        },{
+            .name = "password-secret",
+            .type = QEMU_OPT_STRING,
+            .help = "ID of the secret providing password for CHAP "
+                    "authentication to target",
        },{
            .name = "header-digest",
            .type = QEMU_OPT_STRING,
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -47,7 +47,6 @@ typedef struct MirrorBlockJob {
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
-    int64_t sector_num;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
@@ -64,6 +63,8 @@ typedef struct MirrorBlockJob {
    int ret;
    bool unmap;
    bool waiting_for_io;
+    int target_cluster_sectors;
+    int max_iov;
 } MirrorBlockJob;

 typedef struct MirrorOp {
@@ -159,115 +160,84 @@ static void mirror_read_complete(void *opaque, int ret)
                    mirror_write_complete, op);
 }

-static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
+/* Round *sector_num and/or *nb_sectors to target clusters if COW is needed.
+ * Both are updated in place; returns the (non-negative) shift of the tail. */
+static int mirror_cow_align(MirrorBlockJob *s,
+                            int64_t *sector_num,
+                            int *nb_sectors)
+{
+    bool need_cow;
+    int ret = 0;
+    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
+    int64_t align_sector_num = *sector_num;
+    int align_nb_sectors = *nb_sectors;
+    int max_sectors = chunk_sectors * s->max_iov; /* one chunk per iov */
+
+    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
+    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
+                          s->cow_bitmap);
+    if (need_cow) { /* head or tail chunk is not set in cow_bitmap */
+        bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors,
+                               &align_sector_num, &align_nb_sectors);
+    }
+
+    if (align_nb_sectors > max_sectors) { /* clamp to the iov limit */
+        align_nb_sectors = max_sectors;
+        if (need_cow) { /* keep the clamped length cluster-aligned */
+            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
+                                               s->target_cluster_sectors);
+        }
+    }
+
+    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
+    *sector_num = align_sector_num;
+    *nb_sectors = align_nb_sectors;
+    assert(ret >= 0);
+    return ret;
+}
+
+static inline void mirror_wait_for_io(MirrorBlockJob *s)
+{
+    assert(!s->waiting_for_io); /* waits must not nest */
+    s->waiting_for_io = true; /* set only for the duration of the yield */
+    qemu_coroutine_yield();
+    s->waiting_for_io = false;
+}
+
+/* Submit async read while handling COW.
+ * Returns: nb_sectors if no alignment is necessary, or
+ *          (new_end - sector_num) if tail is rounded up or down due to
+ *          alignment or buffer limit.
+ */
+static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
+                          int nb_sectors)
 {
    BlockDriverState *source = s->common.bs;
-    int nb_sectors, sectors_per_chunk, nb_chunks, max_iov;
-    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
-    uint64_t delay_ns = 0;
+    int sectors_per_chunk, nb_chunks;
+    int ret = nb_sectors;
    MirrorOp *op;
-    int pnum;
-    int64_t ret;
-    BlockDriverState *file;

-    max_iov = MIN(source->bl.max_iov, s->target->bl.max_iov);
-
-    s->sector_num = hbitmap_iter_next(&s->hbi);
-    if (s->sector_num < 0) {
-        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
-        s->sector_num = hbitmap_iter_next(&s->hbi);
-        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
-        assert(s->sector_num >= 0);
-    }
-
-    hbitmap_next_sector = s->sector_num;
-    sector_num = s->sector_num;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
-    end = s->bdev_length / BDRV_SECTOR_SIZE;

-    /* Extend the QEMUIOVector to include all adjacent blocks that will
-     * be copied in this operation.
-     *
-     * We have to do this if we have no backing file yet in the destination,
-     * and the cluster size is very large.  Then we need to do COW ourselves.
-     * The first time a cluster is copied, copy it entirely.  Note that,
-     * because both the granularity and the cluster size are powers of two,
-     * the number of sectors to copy cannot exceed one cluster.
-     *
-     * We also want to extend the QEMUIOVector to include more adjacent
-     * dirty blocks if possible, to limit the number of I/O operations and
-     * run efficiently even with a small granularity.
-     */
-    nb_chunks = 0;
-    nb_sectors = 0;
-    next_sector = sector_num;
-    next_chunk = sector_num / sectors_per_chunk;
+    /* We can only handle as much as buf_size at a time. */
+    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
+    assert(nb_sectors);

-    /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
-    while (test_bit(next_chunk, s->in_flight_bitmap)) {
-        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
-        s->waiting_for_io = true;
-        qemu_coroutine_yield();
-        s->waiting_for_io = false;
+    if (s->cow_bitmap) {
+        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
+    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
+    /* The sector range must meet granularity because:
+     * 1) Caller passes in aligned values;
+     * 2) mirror_cow_align is used only when target cluster is larger. */
+    assert(!(nb_sectors % sectors_per_chunk));
+    assert(!(sector_num % sectors_per_chunk));
+    nb_chunks = nb_sectors / sectors_per_chunk;

-    do {
-        int added_sectors, added_chunks;
-
-        if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) ||
-            test_bit(next_chunk, s->in_flight_bitmap)) {
-            assert(nb_sectors > 0);
-            break;
-        }
-
-        added_sectors = sectors_per_chunk;
-        if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
-            bdrv_round_to_clusters(s->target,
-                                   next_sector, added_sectors,
-                                   &next_sector, &added_sectors);
-
-            /* On the first iteration, the rounding may make us copy
-             * sectors before the first dirty one.
-             */
-            if (next_sector < sector_num) {
-                assert(nb_sectors == 0);
-                sector_num = next_sector;
-                next_chunk = next_sector / sectors_per_chunk;
-            }
-        }
-
-        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
-        added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;
-
-        /* When doing COW, it may happen that there is not enough space for
-         * a full cluster.  Wait if that is the case.
-         */
-        while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
-            trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
-            s->waiting_for_io = true;
-            qemu_coroutine_yield();
-            s->waiting_for_io = false;
-        }
-        if (s->buf_free_count < nb_chunks + added_chunks) {
-            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
-            break;
-        }
-        if (max_iov < nb_chunks + added_chunks) {
-            trace_mirror_break_iov_max(s, nb_chunks, added_chunks);
-            break;
-        }
-
-        /* We have enough free space to copy these sectors.  */
-        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);
-
-        nb_sectors += added_sectors;
-        nb_chunks += added_chunks;
-        next_sector += added_sectors;
-        next_chunk += added_chunks;
-        if (!s->synced && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors);
-        }
-    } while (delay_ns == 0 && next_sector < end);
+    while (s->buf_free_count < nb_chunks) {
+        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+        mirror_wait_for_io(s);
+    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
@@ -279,47 +249,151 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
-    next_sector = sector_num;
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
-        size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size;
+        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
-
-        /* Advance the HBitmapIter in parallel, so that we do not examine
-         * the same sector twice.
-         */
-        if (next_sector > hbitmap_next_sector
-            && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
-            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
-        }
-
-        next_sector += sectors_per_chunk;
    }

-    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);
-
    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

-    ret = bdrv_get_block_status_above(source, NULL, sector_num,
-                                      nb_sectors, &pnum, &file);
-    if (ret < 0 || pnum < nb_sectors ||
-            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
-        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
-                       mirror_read_complete, op);
-    } else if (ret & BDRV_BLOCK_ZERO) {
+    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+                   mirror_read_complete, op);
+    return ret;
+}
+
+static void mirror_do_zero_or_discard(MirrorBlockJob *s,
+                                      int64_t sector_num,
+                                      int nb_sectors,
+                                      bool is_discard)
+{
+    MirrorOp *op;
+
+    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
+     * so the freeing in mirror_iteration_done is nop. */
+    op = g_new0(MirrorOp, 1);
+    op->s = s;
+    op->sector_num = sector_num;
+    op->nb_sectors = nb_sectors;
+
+    s->in_flight++;
+    s->sectors_in_flight += nb_sectors;
+    if (is_discard) {
+        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
+                         mirror_write_complete, op);
+    } else {
        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
-    } else {
-        assert(!(ret & BDRV_BLOCK_DATA));
-        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
-                         mirror_write_complete, op);
+    }
+}
+
+static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
+{
+    BlockDriverState *source = s->common.bs;
+    int64_t sector_num;
+    uint64_t delay_ns = 0;
+    /* At least the first dirty chunk is mirrored in one iteration. */
+    int nb_chunks = 1;
+    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
+    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+
+    sector_num = hbitmap_iter_next(&s->hbi);
+    if (sector_num < 0) { /* iterator exhausted; restart at offset 0 */
+        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
+        sector_num = hbitmap_iter_next(&s->hbi);
+        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
+        assert(sector_num >= 0);
+    }
+
+    /* Find the number of consecutive dirty chunks following the first dirty
+     * one, and wait for in flight requests in them. */
+    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
+        int64_t hbitmap_next;
+        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
+        int64_t next_chunk = next_sector / sectors_per_chunk;
+        if (next_sector >= end ||
+            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
+            break;
+        }
+        if (test_bit(next_chunk, s->in_flight_bitmap)) {
+            if (nb_chunks > 0) { /* NOTE(review): always true here */
+                break;
+            }
+            trace_mirror_yield_in_flight(s, next_sector, s->in_flight);
+            mirror_wait_for_io(s);
+            /* Now retry.  */
+        } else {
+            hbitmap_next = hbitmap_iter_next(&s->hbi);
+            assert(hbitmap_next == next_sector);
+            nb_chunks++;
+        }
+    }
+
+    /* Clear dirty bits before querying the block status, because
+     * calling bdrv_get_block_status_above could yield - if some blocks are
+     * marked dirty in this window, we need to know.
+     */
+    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
+                            nb_chunks * sectors_per_chunk);
+    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
+    while (nb_chunks > 0 && sector_num < end) {
+        int ret;
+        int io_sectors;
+        BlockDriverState *file;
+        enum MirrorMethod {
+            MIRROR_METHOD_COPY,
+            MIRROR_METHOD_ZERO,
+            MIRROR_METHOD_DISCARD
+        } mirror_method = MIRROR_METHOD_COPY;
+
+        assert(!(sector_num % sectors_per_chunk));
+        ret = bdrv_get_block_status_above(source, NULL, sector_num,
+                                          nb_chunks * sectors_per_chunk,
+                                          &io_sectors, &file);
+        if (ret < 0) { /* status unknown: copy the whole remaining range */
+            io_sectors = nb_chunks * sectors_per_chunk;
+        }
+
+        io_sectors -= io_sectors % sectors_per_chunk; /* whole chunks only */
+        if (io_sectors < sectors_per_chunk) {
+            io_sectors = sectors_per_chunk;
+        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
+            int64_t target_sector_num;
+            int target_nb_sectors;
+            bdrv_round_to_clusters(s->target, sector_num, io_sectors,
+                                   &target_sector_num, &target_nb_sectors);
+            if (target_sector_num == sector_num &&
+                target_nb_sectors == io_sectors) {
+                mirror_method = ret & BDRV_BLOCK_ZERO ?
+                                    MIRROR_METHOD_ZERO :
+                                    MIRROR_METHOD_DISCARD;
+            }
+        }
+
+        switch (mirror_method) {
+        case MIRROR_METHOD_COPY:
+            io_sectors = mirror_do_read(s, sector_num, io_sectors);
+            break;
+        case MIRROR_METHOD_ZERO:
+            mirror_do_zero_or_discard(s, sector_num, io_sectors, false);
+            break;
+        case MIRROR_METHOD_DISCARD:
+            mirror_do_zero_or_discard(s, sector_num, io_sectors, true);
+            break;
+        default:
+            abort();
+        }
+        assert(io_sectors);
+        sector_num += io_sectors;
+        nb_chunks -= io_sectors / sectors_per_chunk;
+        delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors);
    }
    return delay_ns;
 }
@@ -344,9 +418,7 @@ static void mirror_free_init(MirrorBlockJob *s)
 static void mirror_drain(MirrorBlockJob *s)
 {
    while (s->in_flight > 0) {
-        s->waiting_for_io = true;
-        qemu_coroutine_yield();
-        s->waiting_for_io = false;
+        mirror_wait_for_io(s);
    }
 }

@@ -420,6 +492,7 @@ static void coroutine_fn mirror_run(void *opaque)
                                 checking for a NULL string */
    int ret = 0;
    int n;
+    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
@@ -449,16 +522,16 @@ static void coroutine_fn mirror_run(void *opaque)
     */
    bdrv_get_backing_filename(s->target, backing_filename,
                              sizeof(backing_filename));
-    if (backing_filename[0] && !s->target->backing) {
-        ret = bdrv_get_info(s->target, &bdi);
-        if (ret < 0) {
-            goto immediate_exit;
-        }
-        if (s->granularity < bdi.cluster_size) {
-            s->buf_size = MAX(s->buf_size, bdi.cluster_size);
-            s->cow_bitmap = bitmap_new(length);
-        }
+    if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) {
+        target_cluster_size = bdi.cluster_size;
    }
+    if (backing_filename[0] && !s->target->backing
+        && s->granularity < target_cluster_size) {
+        s->buf_size = MAX(s->buf_size, target_cluster_size);
+        s->cow_bitmap = bitmap_new(length);
+    }
+    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
+    s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov);

    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
@@ -533,9 +606,7 @@ static void coroutine_fn mirror_run(void *opaque)
            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
-                s->waiting_for_io = true;
-                qemu_coroutine_yield();
-                s->waiting_for_io = false;
+                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -204,18 +204,20 @@ static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export,
    saddr = g_new0(SocketAddress, 1);

    if (qdict_haskey(options, "path")) {
+        UnixSocketAddress *q_unix;
        saddr->type = SOCKET_ADDRESS_KIND_UNIX;
-        saddr->u.q_unix = g_new0(UnixSocketAddress, 1);
-        saddr->u.q_unix->path = g_strdup(qdict_get_str(options, "path"));
+        q_unix = saddr->u.q_unix.data = g_new0(UnixSocketAddress, 1);
+        q_unix->path = g_strdup(qdict_get_str(options, "path"));
        qdict_del(options, "path");
    } else {
+        InetSocketAddress *inet;
        saddr->type = SOCKET_ADDRESS_KIND_INET;
-        saddr->u.inet = g_new0(InetSocketAddress, 1);
-        saddr->u.inet->host = g_strdup(qdict_get_str(options, "host"));
+        inet = saddr->u.inet.data = g_new0(InetSocketAddress, 1);
+        inet->host = g_strdup(qdict_get_str(options, "host"));
        if (!qdict_get_try_str(options, "port")) {
-            saddr->u.inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
+            inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
        } else {
-            saddr->u.inet->port = g_strdup(qdict_get_str(options, "port"));
+            inet->port = g_strdup(qdict_get_str(options, "port"));
        }
        qdict_del(options, "host");
        qdict_del(options, "port");
@@ -319,7 +321,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
            error_setg(errp, "TLS only supported over IP sockets");
            goto error;
        }
-        hostname = saddr->u.inet->host;
+        hostname = saddr->u.inet.data->host;
    }

    /* establish TCP connection, return error if it fails
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -36,6 +36,7 @@
 #include <nfsc/libnfs.h>

 #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576
+#define QEMU_NFS_MAX_DEBUG_LEVEL 2

 typedef struct NFSClient {
    struct nfs_context *context;
@@ -333,6 +334,17 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
                val = QEMU_NFS_MAX_READAHEAD_SIZE;
            }
            nfs_set_readahead(client->context, val);
+#endif
+#ifdef LIBNFS_FEATURE_DEBUG
+        } else if (!strcmp(qp->p[i].name, "debug")) {
+            /* limit the maximum debug level to avoid potential flooding
+             * of our log files. */
+            if (val > QEMU_NFS_MAX_DEBUG_LEVEL) {
+                error_report("NFS Warning: Limiting NFS debug level"
+                             " to %d", QEMU_NFS_MAX_DEBUG_LEVEL);
+                val = QEMU_NFS_MAX_DEBUG_LEVEL;
+            }
+            nfs_set_debug(client->context, val);
 #endif
        } else {
            error_setg(errp, "Unknown NFS parameter name: %s",
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -30,6 +30,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "qemu/bitmap.h"
 #include "qapi/util.h"
@@ -461,7 +462,7 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    int64_t total_size, cl_size;
    uint8_t tmp[BDRV_SECTOR_SIZE];
    Error *local_err = NULL;
-    BlockDriverState *file;
+    BlockBackend *file;
    uint32_t bat_entries, bat_sectors;
    ParallelsHeader header;
    int ret;
@@ -477,14 +478,17 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
        return ret;
    }

-    file = NULL;
-    ret = bdrv_open(&file, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
-    if (ret < 0) {
+    file = blk_new_open(filename, NULL, NULL,
+                        BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                        &local_err);
+    if (file == NULL) {
        error_propagate(errp, local_err);
-        return ret;
+        return -EIO;
    }
-    ret = bdrv_truncate(file, 0);
+
+    blk_set_allow_write_beyond_eof(file, true);
+
+    ret = blk_truncate(file, 0);
    if (ret < 0) {
        goto exit;
    }
@@ -508,18 +512,18 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    memset(tmp, 0, sizeof(tmp));
    memcpy(tmp, &header, sizeof(header));

-    ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE);
+    ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE);
    if (ret < 0) {
        goto exit;
    }
-    ret = bdrv_write_zeroes(file, 1, bat_sectors - 1, 0);
+    ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0);
    if (ret < 0) {
        goto exit;
    }
    ret = 0;

 done:
-    bdrv_unref(file);
+    blk_unref(file);
    return ret;

 exit:
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -355,100 +355,116 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
    qapi_free_BlockInfo(info);
 }

-static BlockStats *bdrv_query_stats(const BlockDriverState *bs,
-                                    bool query_backing)
+static BlockStats *bdrv_query_stats(BlockBackend *blk,
+                                    const BlockDriverState *bs,
+                                    bool query_backing);
+
+static void bdrv_query_blk_stats(BlockStats *s, BlockBackend *blk)
 {
-    BlockStats *s;
+    BlockAcctStats *stats = blk_get_stats(blk);
+    BlockAcctTimedStats *ts = NULL;

-    s = g_malloc0(sizeof(*s));
+    s->has_device = true;
+    s->device = g_strdup(blk_name(blk));

-    if (bdrv_get_device_name(bs)[0]) {
-        s->has_device = true;
-        s->device = g_strdup(bdrv_get_device_name(bs));
+    s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
+    s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
+    s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
+    s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
+
+    s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
+    s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
+    s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
+
+    s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
+    s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
+    s->stats->invalid_flush_operations =
+        stats->invalid_ops[BLOCK_ACCT_FLUSH];
+
+    s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ];
+    s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
+    s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
+    s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
+    s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
+    s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
+
+    s->stats->has_idle_time_ns = stats->last_access_time_ns > 0;
+    if (s->stats->has_idle_time_ns) {
+        s->stats->idle_time_ns = block_acct_idle_time_ns(stats);
    }

+    s->stats->account_invalid = stats->account_invalid;
+    s->stats->account_failed = stats->account_failed;
+
+    while ((ts = block_acct_interval_next(stats, ts))) {
+        BlockDeviceTimedStatsList *timed_stats =
+            g_malloc0(sizeof(*timed_stats));
+        BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats));
+        timed_stats->next = s->stats->timed_stats;
+        timed_stats->value = dev_stats;
+        s->stats->timed_stats = timed_stats;
+
+        TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
+        TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
+        TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
+
+        dev_stats->interval_length = ts->interval_length;
+
+        dev_stats->min_rd_latency_ns = timed_average_min(rd);
+        dev_stats->max_rd_latency_ns = timed_average_max(rd);
+        dev_stats->avg_rd_latency_ns = timed_average_avg(rd);
+
+        dev_stats->min_wr_latency_ns = timed_average_min(wr);
+        dev_stats->max_wr_latency_ns = timed_average_max(wr);
+        dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
+
+        dev_stats->min_flush_latency_ns = timed_average_min(fl);
+        dev_stats->max_flush_latency_ns = timed_average_max(fl);
+        dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
+
+        dev_stats->avg_rd_queue_depth =
+            block_acct_queue_depth(ts, BLOCK_ACCT_READ);
+        dev_stats->avg_wr_queue_depth =
+            block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
+    }
+}
+
+static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs,
+                                 bool query_backing)
+{
    if (bdrv_get_node_name(bs)[0]) {
        s->has_node_name = true;
        s->node_name = g_strdup(bdrv_get_node_name(bs));
    }

-    s->stats = g_malloc0(sizeof(*s->stats));
-    if (bs->blk) {
-        BlockAcctStats *stats = blk_get_stats(bs->blk);
-        BlockAcctTimedStats *ts = NULL;
-
-        s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
-        s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
-        s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
-        s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
-
-        s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
-        s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
-        s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
-
-        s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
-        s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
-        s->stats->invalid_flush_operations =
-            stats->invalid_ops[BLOCK_ACCT_FLUSH];
-
-        s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ];
-        s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
-        s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
-        s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
-        s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
-        s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
-
-        s->stats->has_idle_time_ns = stats->last_access_time_ns > 0;
-        if (s->stats->has_idle_time_ns) {
-            s->stats->idle_time_ns = block_acct_idle_time_ns(stats);
-        }
-
-        s->stats->account_invalid = stats->account_invalid;
-        s->stats->account_failed = stats->account_failed;
-
-        while ((ts = block_acct_interval_next(stats, ts))) {
-            BlockDeviceTimedStatsList *timed_stats =
-                g_malloc0(sizeof(*timed_stats));
-            BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats));
-            timed_stats->next = s->stats->timed_stats;
-            timed_stats->value = dev_stats;
-            s->stats->timed_stats = timed_stats;
-
-            TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
-            TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
-            TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
-
-            dev_stats->interval_length = ts->interval_length;
-
-            dev_stats->min_rd_latency_ns = timed_average_min(rd);
-            dev_stats->max_rd_latency_ns = timed_average_max(rd);
-            dev_stats->avg_rd_latency_ns = timed_average_avg(rd);
-
-            dev_stats->min_wr_latency_ns = timed_average_min(wr);
-            dev_stats->max_wr_latency_ns = timed_average_max(wr);
-            dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
-
-            dev_stats->min_flush_latency_ns = timed_average_min(fl);
-            dev_stats->max_flush_latency_ns = timed_average_max(fl);
-            dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
-
-            dev_stats->avg_rd_queue_depth =
-                block_acct_queue_depth(ts, BLOCK_ACCT_READ);
-            dev_stats->avg_wr_queue_depth =
-                block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
-        }
-    }
-
    s->stats->wr_highest_offset = bs->wr_highest_offset;

    if (bs->file) {
        s->has_parent = true;
-        s->parent = bdrv_query_stats(bs->file->bs, query_backing);
+        s->parent = bdrv_query_stats(NULL, bs->file->bs, query_backing);
    }

    if (query_backing && bs->backing) {
        s->has_backing = true;
-        s->backing = bdrv_query_stats(bs->backing->bs, query_backing);
+        s->backing = bdrv_query_stats(NULL, bs->backing->bs, query_backing);
+    }
+
+}
+
+static BlockStats *bdrv_query_stats(BlockBackend *blk,
+                                    const BlockDriverState *bs,
+                                    bool query_backing)
+{
+    BlockStats *s;
+
+    s = g_malloc0(sizeof(*s));
+    s->stats = g_malloc0(sizeof(*s->stats));
+
+    if (blk) {
+        bdrv_query_blk_stats(s, blk);
+    }
+    if (bs) {
+        bdrv_query_bds_stats(s, bs, query_backing);
    }

    return s;
@@ -477,22 +493,38 @@ BlockInfoList *qmp_query_block(Error **errp)
    return head;
 }

+static bool next_query_bds(BlockBackend **blk, BlockDriverState **bs,
+                           bool query_nodes)
+{
+    if (query_nodes) {
+        *bs = bdrv_next_node(*bs);
+        return !!*bs;
+    }
+
+    *blk = blk_next(*blk);
+    *bs = *blk ? blk_bs(*blk) : NULL;
+
+    return !!*blk;
+}
+
 BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
                                     bool query_nodes,
                                     Error **errp)
 {
    BlockStatsList *head = NULL, **p_next = &head;
+    BlockBackend *blk = NULL;
    BlockDriverState *bs = NULL;

    /* Just to be safe if query_nodes is not always initialized */
    query_nodes = has_query_nodes && query_nodes;

-    while ((bs = query_nodes ? bdrv_next_node(bs) : bdrv_next(bs))) {
+    while (next_query_bds(&blk, &bs, query_nodes)) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
-        AioContext *ctx = bdrv_get_aio_context(bs);
+        AioContext *ctx = blk ? blk_get_aio_context(blk)
+                              : bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
-        info->value = bdrv_query_stats(bs, !query_nodes);
+        info->value = bdrv_query_stats(blk, bs, !query_nodes);
        aio_context_release(ctx);

        *p_next = info;
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -24,6 +24,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include <zlib.h>
 #include "qapi/qmp/qerror.h"
@@ -120,11 +121,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }
    if (header.version != QCOW_VERSION) {
-        char version[64];
-        snprintf(version, sizeof(version), "QCOW version %" PRIu32,
-                 header.version);
-        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                   bdrv_get_device_or_node_name(bs), "qcow", version);
+        error_setg(errp, "Unsupported qcow version %" PRIu32, header.version);
        ret = -ENOTSUP;
        goto fail;
    }
@@ -780,7 +777,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    int flags = 0;
    Error *local_err = NULL;
    int ret;
-    BlockDriverState *qcow_bs;
+    BlockBackend *qcow_blk;

    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
@@ -796,15 +793,18 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
        goto cleanup;
    }

-    qcow_bs = NULL;
-    ret = bdrv_open(&qcow_bs, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
-    if (ret < 0) {
+    qcow_blk = blk_new_open(filename, NULL, NULL,
+                            BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                            &local_err);
+    if (qcow_blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto cleanup;
    }

-    ret = bdrv_truncate(qcow_bs, 0);
+    blk_set_allow_write_beyond_eof(qcow_blk, true);
+
+    ret = blk_truncate(qcow_blk, 0);
    if (ret < 0) {
        goto exit;
    }
@@ -844,13 +844,13 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    /* write all the data */
-    ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header));
+    ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header));
    if (ret != sizeof(header)) {
        goto exit;
    }

    if (backing_file) {
-        ret = bdrv_pwrite(qcow_bs, sizeof(header),
+        ret = blk_pwrite(qcow_blk, sizeof(header),
            backing_file, backing_filename_len);
        if (ret != backing_filename_len) {
            goto exit;
@@ -860,7 +860,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    tmp = g_malloc0(BDRV_SECTOR_SIZE);
    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
        BDRV_SECTOR_SIZE); i++) {
-        ret = bdrv_pwrite(qcow_bs, header_size +
+        ret = blk_pwrite(qcow_blk, header_size +
            BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
        if (ret != BDRV_SECTOR_SIZE) {
            g_free(tmp);
@@ -871,7 +871,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    g_free(tmp);
    ret = 0;
 exit:
-    bdrv_unref(qcow_bs);
+    blk_unref(qcow_blk);
 cleanup:
    g_free(backing_file);
    return ret;
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -24,6 +24,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include <zlib.h>
 #include "block/qcow2.h"
@@ -197,22 +198,8 @@ static void cleanup_unknown_header_ext(BlockDriverState *bs)
    }
 }

-static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
-    Error **errp, const char *fmt, ...)
-{
-    char msg[64];
-    va_list ap;
-
-    va_start(ap, fmt);
-    vsnprintf(msg, sizeof(msg), fmt, ap);
-    va_end(ap);
-
-    error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-               bdrv_get_device_or_node_name(bs), "qcow2", msg);
-}
-
-static void report_unsupported_feature(BlockDriverState *bs,
-    Error **errp, Qcow2Feature *table, uint64_t mask)
+static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
+                                       uint64_t mask)
 {
    char *features = g_strdup("");
    char *old;
@@ -237,7 +224,7 @@ static void report_unsupported_feature(BlockDriverState *bs,
        g_free(old);
    }

-    report_unsupported(bs, errp, "%s", features);
+    error_setg(errp, "Unsupported qcow2 feature(s): %s", features);
    g_free(features);
 }

@@ -854,7 +841,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }
    if (header.version < 2 || header.version > 3) {
-        report_unsupported(bs, errp, "QCOW version %" PRIu32, header.version);
+        error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
        ret = -ENOTSUP;
        goto fail;
    }
@@ -934,7 +921,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        void *feature_table = NULL;
        qcow2_read_extensions(bs, header.header_length, ext_end,
                              &feature_table, NULL);
-        report_unsupported_feature(bs, errp, feature_table,
+        report_unsupported_feature(errp, feature_table,
                                   s->incompatible_features &
                                   ~QCOW2_INCOMPAT_MASK);
        ret = -ENOTSUP;
@@ -2097,7 +2084,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
     * size for any qcow2 image.
     */
-    BlockDriverState* bs;
+    BlockBackend *blk;
    QCowHeader *header;
    uint64_t* refcount_table;
    Error *local_err = NULL;
@@ -2172,14 +2159,16 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        return ret;
    }

-    bs = NULL;
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
-        return ret;
+        return -EIO;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /* Write the header */
    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
    header = g_malloc0(cluster_size);
@@ -2207,7 +2196,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
    }

-    ret = bdrv_pwrite(bs, 0, header, cluster_size);
+    ret = blk_pwrite(blk, 0, header, cluster_size);
    g_free(header);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
@@ -2217,7 +2206,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    /* Write a refcount table with one refcount block */
    refcount_table = g_malloc0(2 * cluster_size);
    refcount_table[0] = cpu_to_be64(2 * cluster_size);
-    ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size);
+    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size);
    g_free(refcount_table);

    if (ret < 0) {
@@ -2225,8 +2214,8 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        goto out;
    }

-    bdrv_unref(bs);
-    bs = NULL;
+    blk_unref(blk);
+    blk = NULL;

    /*
     * And now open the image and make it consistent first (i.e. increase the
@@ -2235,15 +2224,16 @@ static int qcow2_create2(const char *filename, int64_t total_size,
     */
    options = qdict_new();
    qdict_put(options, "driver", qstring_from_str("qcow2"));
-    ret = bdrv_open(&bs, filename, NULL, options,
-                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, options,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto out;
    }

-    ret = qcow2_alloc_clusters(bs, 3 * cluster_size);
+    ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
                         "header and refcount table");
@@ -2255,14 +2245,14 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    }

    /* Create a full header (including things like feature table) */
-    ret = qcow2_update_header(bs);
+    ret = qcow2_update_header(blk_bs(blk));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not update qcow2 header");
        goto out;
    }

    /* Okay, now that we have a valid image, let's give it the right size */
-    ret = bdrv_truncate(bs, total_size);
+    ret = blk_truncate(blk, total_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not resize image");
        goto out;
@@ -2270,7 +2260,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,

    /* Want a backing file? There you go.*/
    if (backing_file) {
-        ret = bdrv_change_backing_file(bs, backing_file, backing_format);
+        ret = bdrv_change_backing_file(blk_bs(blk), backing_file, backing_format);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
                             "with format '%s'", backing_file, backing_format);
@@ -2280,9 +2270,9 @@ static int qcow2_create2(const char *filename, int64_t total_size,

    /* And if we're supposed to preallocate metadata, do that now */
    if (prealloc != PREALLOC_MODE_OFF) {
-        BDRVQcow2State *s = bs->opaque;
+        BDRVQcow2State *s = blk_bs(blk)->opaque;
        qemu_co_mutex_lock(&s->lock);
-        ret = preallocate(bs);
+        ret = preallocate(blk_bs(blk));
        qemu_co_mutex_unlock(&s->lock);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not preallocate metadata");
@@ -2290,24 +2280,25 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        }
    }

-    bdrv_unref(bs);
-    bs = NULL;
+    blk_unref(blk);
+    blk = NULL;

    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */
    options = qdict_new();
    qdict_put(options, "driver", qstring_from_str("qcow2"));
-    ret = bdrv_open(&bs, filename, NULL, options,
-                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING,
-                    &local_err);
-    if (local_err) {
+    blk = blk_new_open(filename, NULL, options,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto out;
    }

    ret = 0;
 out:
-    if (bs) {
-        bdrv_unref(bs);
+    if (blk) {
+        blk_unref(blk);
    }
    return ret;
 }
@@ -2809,15 +2800,15 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)

    *spec_info = (ImageInfoSpecific){
        .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
-        .u.qcow2 = g_new(ImageInfoSpecificQCow2, 1),
+        .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1),
    };
    if (s->qcow_version == 2) {
-        *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){
+        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("0.10"),
            .refcount_bits      = s->refcount_bits,
        };
    } else if (s->qcow_version == 3) {
-        *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){
+        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("1.1"),
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
--- a/block/qed.c
+++ b/block/qed.c
@@ -18,6 +18,7 @@
 #include "qed.h"
 #include "qapi/qmp/qerror.h"
 #include "migration/migration.h"
+#include "sysemu/block-backend.h"

 static const AIOCBInfo qed_aiocb_info = {
    .aiocb_size         = sizeof(QEDAIOCB),
@@ -376,18 +377,6 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
    }
 }

-static void bdrv_qed_drain(BlockDriverState *bs)
-{
-    BDRVQEDState *s = bs->opaque;
-
-    /* Cancel timer and start doing I/O that were meant to happen as if it
-     * fired, that way we get bdrv_drain() taking care of the ongoing requests
-     * correctly. */
-    qed_cancel_need_check_timer(s);
-    qed_plug_allocating_write_reqs(s);
-    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
-}
-
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
 {
@@ -411,11 +400,8 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
    }
    if (s->header.features & ~QED_FEATURE_MASK) {
        /* image uses unsupported feature bits */
-        char buf[64];
-        snprintf(buf, sizeof(buf), "%" PRIx64,
-            s->header.features & ~QED_FEATURE_MASK);
-        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                   bdrv_get_device_or_node_name(bs), "QED", buf);
+        error_setg(errp, "Unsupported QED features: %" PRIx64,
+                   s->header.features & ~QED_FEATURE_MASK);
        return -ENOTSUP;
    }
    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
@@ -580,7 +566,7 @@ static int qed_create(const char *filename, uint32_t cluster_size,
    size_t l1_size = header.cluster_size * header.table_size;
    Error *local_err = NULL;
    int ret = 0;
-    BlockDriverState *bs;
+    BlockBackend *blk;

    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
@@ -588,17 +574,18 @@ static int qed_create(const char *filename, uint32_t cluster_size,
        return ret;
    }

-    bs = NULL;
-    ret = bdrv_open(&bs, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
-        return ret;
+        return -EIO;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /* File must start empty and grow, check truncate is supported */
-    ret = bdrv_truncate(bs, 0);
+    ret = blk_truncate(blk, 0);
    if (ret < 0) {
        goto out;
    }
@@ -614,18 +601,18 @@ static int qed_create(const char *filename, uint32_t cluster_size,
    }

    qed_header_cpu_to_le(&header, &le_header);
-    ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
+    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header));
    if (ret < 0) {
        goto out;
    }
-    ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
-                      header.backing_filename_size);
+    ret = blk_pwrite(blk, sizeof(le_header), backing_file,
+                     header.backing_filename_size);
    if (ret < 0) {
        goto out;
    }

    l1_table = g_malloc0(l1_size);
-    ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
+    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size);
    if (ret < 0) {
        goto out;
    }
@@ -633,7 +620,7 @@ static int qed_create(const char *filename, uint32_t cluster_size,
    ret = 0; /* success */
 out:
    g_free(l1_table);
-    bdrv_unref(bs);
+    blk_unref(blk);
    return ret;
 }

@@ -1692,7 +1679,6 @@ static BlockDriver bdrv_qed = {
    .bdrv_check               = bdrv_qed_check,
    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
-    .bdrv_drain               = bdrv_qed_drain,
 };

 static void bdrv_qed_init(void)
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -215,14 +215,16 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
    return acb;
 }

-static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret)
+static void quorum_report_bad(QuorumOpType type, uint64_t sector_num,
+                              int nb_sectors, char *node_name, int ret)
 {
    const char *msg = NULL;
    if (ret < 0) {
        msg = strerror(-ret);
    }
-    qapi_event_send_quorum_report_bad(!!msg, msg, node_name,
-                                      acb->sector_num, acb->nb_sectors, &error_abort);
+    /* msg is non-NULL only when ret < 0; !!msg is passed alongside it. */
+    qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name,
+                                      sector_num, nb_sectors, &error_abort);
 }

 static void quorum_report_failure(QuorumAIOCB *acb)
@@ -284,6 +286,15 @@ static void quorum_aio_cb(void *opaque, int ret)
    BDRVQuorumState *s = acb->common.bs->opaque;
    bool rewrite = false;

+    if (ret == 0) {
+        acb->success_count++;
+    } else {
+        QuorumOpType type;
+        type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
+        quorum_report_bad(type, acb->sector_num, acb->nb_sectors,
+                          sacb->aiocb->bs->node_name, ret);
+    }
+
    if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
        /* We try to read next child in FIFO order if we fail to read */
        if (ret < 0 && (acb->child_iter + 1) < s->num_children) {
@@ -302,11 +313,6 @@ static void quorum_aio_cb(void *opaque, int ret)

    sacb->ret = ret;
    acb->count++;
-    if (ret == 0) {
-        acb->success_count++;
-    } else {
-        quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret);
-    }
    assert(acb->count <= s->num_children);
    assert(acb->success_count <= s->num_children);
    if (acb->count < s->num_children) {
@@ -338,7 +344,9 @@ static void quorum_report_bad_versions(BDRVQuorumState *s,
            continue;
        }
        QLIST_FOREACH(item, &version->items, next) {
-            quorum_report_bad(acb, s->children[item->index]->bs->node_name, 0);
+            quorum_report_bad(QUORUM_OP_TYPE_READ, acb->sector_num,
+                              acb->nb_sectors,
+                              s->children[item->index]->bs->node_name, 0);
        }
    }
 }
@@ -648,8 +656,9 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
    }

    for (i = 0; i < s->num_children; i++) {
-        bdrv_aio_readv(s->children[i]->bs, acb->sector_num, &acb->qcrs[i].qiov,
-                       acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]);
+        acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num,
+                                            &acb->qcrs[i].qiov, acb->nb_sectors,
+                                            quorum_aio_cb, &acb->qcrs[i]);
    }

    return &acb->common;
@@ -664,9 +673,10 @@ static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb)
    qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov);
    qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
                     acb->qcrs[acb->child_iter].buf);
-    bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num,
-                   &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
-                   quorum_aio_cb, &acb->qcrs[acb->child_iter]);
+    acb->qcrs[acb->child_iter].aiocb =
+        bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num,
+                       &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
+                       quorum_aio_cb, &acb->qcrs[acb->child_iter]);

    return &acb->common;
 }
@@ -760,19 +770,30 @@ static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
    QuorumVoteValue result_value;
    int i;
    int result = 0;
+    int success_count = 0;

    QLIST_INIT(&error_votes.vote_list);
    error_votes.compare = quorum_64bits_compare;

    for (i = 0; i < s->num_children; i++) {
        result = bdrv_co_flush(s->children[i]->bs);
-        result_value.l = result;
-        quorum_count_vote(&error_votes, &result_value, i);
+        if (result) {
+            quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0,
+                              bdrv_nb_sectors(s->children[i]->bs),
+                              s->children[i]->bs->node_name, result);
+            result_value.l = result;
+            quorum_count_vote(&error_votes, &result_value, i);
+        } else {
+            success_count++;
+        }
    }

-    winner = quorum_get_vote_winner(&error_votes);
-    result = winner->value.l;
-
+    if (success_count >= s->threshold) {
+        result = 0;
+    } else {
+        winner = quorum_get_vote_winner(&error_votes);
+        result = winner->value.l;
+    }
    quorum_free_vote_list(&error_votes);

    return result;
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -16,6 +16,7 @@
 #include "qemu-common.h"
 #include "qemu/error-report.h"
 #include "block/block_int.h"
+#include "crypto/secret.h"

 #include <rbd/librbd.h>

@@ -228,6 +229,27 @@ static char *qemu_rbd_parse_clientname(const char *conf, char *clientname)
    return NULL;
 }

+
+/* Set the rados "key" option on @cluster from the secret named @secretid. */
+static int qemu_rbd_set_auth(rados_t cluster, const char *secretid,
+                             Error **errp)
+{
+    gchar *secret;
+
+    if (secretid == NULL) {
+        return 0; /* no secret ID given: nothing to configure */
+    }
+
+    secret = qcrypto_secret_lookup_as_base64(secretid, errp);
+    if (!secret) {
+        return -1; /* lookup returned NULL; @errp was passed to it */
+    }
+
+    rados_conf_set(cluster, "key", secret); /* return value is not checked */
+    g_free(secret);
+    return 0;
+}
+
 static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
                             bool only_read_conf_file,
                             Error **errp)
@@ -299,10 +321,13 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
+    const char *secretid;
    rados_t cluster;
    rados_ioctx_t io_ctx;
    int ret;

+    secretid = qemu_opt_get(opts, "password-secret");
+
    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           name, sizeof(name),
@@ -350,6 +375,11 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
        return -EIO;
    }

+    if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) {
+        rados_shutdown(cluster);
+        return -EIO;
+    }
+
    if (rados_connect(cluster) < 0) {
        error_setg(errp, "error connecting");
        rados_shutdown(cluster);
@@ -423,6 +453,11 @@ static QemuOptsList runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "Specification of the rbd image",
        },
+        {
+            .name = "password-secret",
+            .type = QEMU_OPT_STRING,
+            .help = "ID of secret providing the password",
+        },
        { /* end of list */ }
    },
 };
@@ -436,6 +471,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
+    const char *secretid;
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;
@@ -450,6 +486,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
    }

    filename = qemu_opt_get(opts, "filename");
+    secretid = qemu_opt_get(opts, "password-secret");

    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
@@ -488,6 +525,11 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

+    if (qemu_rbd_set_auth(s->cluster, secretid, errp) < 0) {
+        r = -EIO;
+        goto failed_shutdown;
+    }
+
    /*
     * Fallback to more conservative semantics if setting cache
     * options fails. Ignore errors from setting rbd_cache because the
@@ -919,6 +961,11 @@ static QemuOptsList qemu_rbd_create_opts = {
            .type = QEMU_OPT_SIZE,
            .help = "RBD object size"
        },
+        {
+            .name = "password-secret",
+            .type = QEMU_OPT_STRING,
+            .help = "ID of secret providing the password",
+        },
        { /* end of list */ }
    }
 };
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -18,6 +18,7 @@
 #include "qemu/error-report.h"
 #include "qemu/sockets.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/bitops.h"

 #define SD_PROTO_VER 0x01
@@ -284,6 +285,12 @@ static inline bool is_snapshot(struct SheepdogInode *inode)
    return !!inode->snap_ctime;
 }

+/* Number of (1UL << block_size_shift)-byte objects covering vdi_size. */
+static inline size_t count_data_objs(const struct SheepdogInode *inode)
+{
+    return DIV_ROUND_UP(inode->vdi_size, (1UL << inode->block_size_shift));
+}
+
 #undef DPRINTF
 #ifdef DEBUG_SDOG
 #define DPRINTF(fmt, args...)                                       \
@@ -609,14 +616,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
    ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
    if (ret != sizeof(*hdr)) {
        error_report("failed to send a req, %s", strerror(errno));
-        ret = -socket_error();
-        return ret;
+        return -errno;
    }

    ret = qemu_co_send(sockfd, data, *wlen);
    if (ret != *wlen) {
-        ret = -socket_error();
        error_report("failed to send a req, %s", strerror(errno));
+        return -errno;
    }

    return ret;
@@ -1631,7 +1637,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,

 static int sd_prealloc(const char *filename, Error **errp)
 {
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk = NULL;
    BDRVSheepdogState *base = NULL;
    unsigned long buf_size;
    uint32_t idx, max_idx;
@@ -1640,19 +1646,23 @@ static int sd_prealloc(const char *filename, Error **errp)
    void *buf = NULL;
    int ret;

-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    errp);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       errp);
+    if (blk == NULL) {
+        ret = -EIO;
        goto out_with_err_set;
    }

-    vdi_size = bdrv_getlength(bs);
+    blk_set_allow_write_beyond_eof(blk, true);
+
+    vdi_size = blk_getlength(blk);
    if (vdi_size < 0) {
        ret = vdi_size;
        goto out;
    }

-    base = bs->opaque;
+    base = blk_bs(blk)->opaque;
    object_size = (UINT32_C(1) << base->inode.block_size_shift);
    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
    buf = g_malloc0(buf_size);
@@ -1664,23 +1674,24 @@ static int sd_prealloc(const char *filename, Error **errp)
         * The created image can be a cloned image, so we need to read
         * a data from the source image.
         */
-        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
+        ret = blk_pread(blk, idx * buf_size, buf, buf_size);
        if (ret < 0) {
            goto out;
        }
-        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
+        ret = blk_pwrite(blk, idx * buf_size, buf, buf_size);
        if (ret < 0) {
            goto out;
        }
    }

+    ret = 0;
 out:
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Can't pre-allocate");
    }
 out_with_err_set:
-    if (bs) {
-        bdrv_unref(bs);
+    if (blk) {
+        blk_unref(blk);
    }
    g_free(buf);

@@ -1820,7 +1831,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
    }

    if (backing_file) {
-        BlockDriverState *bs;
+        BlockBackend *blk;
        BDRVSheepdogState *base;
        BlockDriver *drv;

@@ -1832,22 +1843,23 @@ static int sd_create(const char *filename, QemuOpts *opts,
            goto out;
        }

-        bs = NULL;
-        ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, errp);
-        if (ret < 0) {
+        blk = blk_new_open(backing_file, NULL, NULL,
+                           BDRV_O_PROTOCOL | BDRV_O_CACHE_WB, errp);
+        if (blk == NULL) {
+            ret = -EIO;
            goto out;
        }

-        base = bs->opaque;
+        base = blk_bs(blk)->opaque;

        if (!is_snapshot(&base->inode)) {
            error_setg(errp, "cannot clone from a non snapshot vdi");
-            bdrv_unref(bs);
+            blk_unref(blk);
            ret = -EINVAL;
            goto out;
        }
        s->inode.vdi_id = base->inode.vdi_id;
-        bdrv_unref(bs);
+        blk_unref(blk);
    }

    s->aio_context = qemu_get_aio_context();
@@ -2478,13 +2490,131 @@ out:
    return ret;
 }

+#define NR_BATCHED_DISCARD 128
+
+/*
+ * Zero the non-zero s->inode.data_vdi_id[] entries and write them back, at
+ * most NR_BATCHED_DISCARD cleared entries per write.  False on failure.
+ */
+static bool remove_objects(BDRVSheepdogState *s)
+{
+    int fd, i = 0, nr_objs;
+    Error *local_err = NULL;
+    bool result = true;
+    SheepdogInode *inode = &s->inode;
+
+    fd = connect_to_sdog(s, &local_err);
+    if (fd < 0) {
+        error_report_err(local_err);
+        return false;
+    }
+
+    nr_objs = count_data_objs(inode);
+    while (i < nr_objs) {
+        int start_idx, nr_filled_idx = 0;
+
+        while (i < nr_objs && !inode->data_vdi_id[i]) {
+            i++;
+        }
+        if (i == nr_objs) {
+            break; /* only empty slots left: no write needed */
+        }
+        start_idx = i;
+
+        for (; i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD; i++) {
+            if (inode->data_vdi_id[i]) {
+                inode->data_vdi_id[i] = 0;
+                nr_filled_idx++;
+            }
+        }
+        if (write_object(fd, s->aio_context,
+                         (char *)&inode->data_vdi_id[start_idx],
+                         vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
+                         (i - start_idx) * sizeof(uint32_t),
+                         offsetof(struct SheepdogInode,
+                                  data_vdi_id[start_idx]),
+                         false, s->cache_flags) < 0) {
+            error_report("failed to discard snapshot inode.");
+            result = false;
+            break;
+        }
+    }
+
+    closesocket(fd);
+    return result;
+}
+
 static int sd_snapshot_delete(BlockDriverState *bs,
                              const char *snapshot_id,
                              const char *name,
                              Error **errp)
 {
-    /* FIXME: Delete specified snapshot id.  */
-    return 0;
+    unsigned long snap_id = 0;
+    char snap_tag[SD_MAX_VDI_TAG_LEN] = { 0 };
+    Error *local_err = NULL;
+    int fd, ret;
+    char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN] = { 0 }; /* name + tag */
+    BDRVSheepdogState *s = bs->opaque;
+    unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0;
+    uint32_t vid;
+    SheepdogVdiReq hdr = {
+        .opcode = SD_OP_DEL_VDI,
+        .data_length = wlen,
+        .flags = SD_FLAG_CMD_WRITE,
+    };
+    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; /* aliases hdr */
+
+    /* Deletes the snapshot of s->name selected by @snapshot_id (@name unused).
+     * NOTE(review): objects are removed before @snapshot_id is validated. */
+    if (!remove_objects(s)) {
+        return -1;
+    }
+
+    pstrcpy(buf, SD_MAX_VDI_LEN, s->name);
+    ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id);
+    if (ret || snap_id > UINT32_MAX) {
+        error_setg(errp, "Invalid snapshot ID: %s",
+                         snapshot_id ? snapshot_id : "<null>");
+        return -EINVAL;
+    }
+
+    if (snap_id) {
+        hdr.snapid = (uint32_t) snap_id;
+    } else { /* ID parsed as 0: pass @snapshot_id as the tag instead */
+        pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id);
+        pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
+    }
+
+    ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true,
+                        &local_err);
+    if (ret) {
+        error_propagate(errp, local_err); /* hand the lookup error on */
+        return ret;
+    }
+
+    fd = connect_to_sdog(s, &local_err);
+    if (fd < 0) {
+        error_report_err(local_err);
+        return -1;
+    }
+
+    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    closesocket(fd);
+    if (ret) {
+        return ret;
+    }
+
+    switch (rsp->result) {
+    case SD_RES_NO_VDI:
+        error_report("%s was already deleted", s->name); /* fall through */
+    case SD_RES_SUCCESS:
+        break;
+    default:
+        error_report("%s, %s", sd_strerror(rsp->result), s->name);
+        return -1;
+    }
+
+    return ret;
 }

 static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -52,6 +52,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
 #include "qemu/coroutine.h"
@@ -733,7 +734,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    size_t bmap_size;
    int64_t offset = 0;
    Error *local_err = NULL;
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk = NULL;
    uint32_t *bmap = NULL;

    logout("\n");
@@ -766,13 +767,18 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
        error_propagate(errp, local_err);
        goto exit;
    }
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto exit;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /* We need enough blocks to store the given disk size,
       so always round up. */
    blocks = DIV_ROUND_UP(bytes, block_size);
@@ -802,7 +808,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    vdi_header_print(&header);
 #endif
    vdi_header_to_le(&header);
-    ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header));
+    ret = blk_pwrite(blk, offset, &header, sizeof(header));
    if (ret < 0) {
        error_setg(errp, "Error writing header to %s", filename);
        goto exit;
@@ -823,7 +829,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
                bmap[i] = VDI_UNALLOCATED;
            }
        }
-        ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size);
+        ret = blk_pwrite(blk, offset, bmap, bmap_size);
        if (ret < 0) {
            error_setg(errp, "Error writing bmap to %s", filename);
            goto exit;
@@ -832,7 +838,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    if (image_type == VDI_TYPE_STATIC) {
-        ret = bdrv_truncate(bs, offset + blocks * block_size);
+        ret = blk_truncate(blk, offset + blocks * block_size);
        if (ret < 0) {
            error_setg(errp, "Failed to statically allocate %s", filename);
            goto exit;
@@ -840,7 +846,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    }

 exit:
-    bdrv_unref(bs);
+    blk_unref(blk);
    g_free(bmap);
    return ret;
 }
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -18,6 +18,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "qemu/crc32c.h"
 #include "block/vhdx.h"
@@ -264,10 +265,10 @@ static void vhdx_region_unregister_all(BDRVVHDXState *s)

 static void vhdx_set_shift_bits(BDRVVHDXState *s)
 {
-    s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size);
-    s->sectors_per_block_bits =   31 - clz32(s->sectors_per_block);
-    s->chunk_ratio_bits =         63 - clz64(s->chunk_ratio);
-    s->block_size_bits =          31 - clz32(s->block_size);
+    s->logical_sector_size_bits = ctz32(s->logical_sector_size);
+    s->sectors_per_block_bits =   ctz32(s->sectors_per_block);
+    s->chunk_ratio_bits =         ctz64(s->chunk_ratio);
+    s->block_size_bits =          ctz32(s->block_size); /* log2 when 2^n */
 }

 /*
@@ -857,14 +858,8 @@ static void vhdx_calc_bat_entries(BDRVVHDXState *s)
 {
    uint32_t data_blocks_cnt, bitmap_blocks_cnt;

-    data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits;
-    if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) {
-        data_blocks_cnt++;
-    }
-    bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits;
-    if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) {
-        bitmap_blocks_cnt++;
-    }
+    data_blocks_cnt = DIV_ROUND_UP(s->virtual_disk_size, s->block_size);
+    bitmap_blocks_cnt = DIV_ROUND_UP(data_blocks_cnt, s->chunk_ratio);

    if (s->parent_entries) {
        s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1);
@@ -1778,7 +1773,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)

    gunichar2 *creator = NULL;
    glong creator_items;
-    BlockDriverState *bs;
+    BlockBackend *blk;
    char *type = NULL;
    VHDXImageType image_type;
    Error *local_err = NULL;
@@ -1843,14 +1838,17 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
        goto exit;
    }

-    bs = NULL;
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto exit;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /* Create (A) */

    /* The creator field is optional, but may be useful for
@@ -1858,13 +1856,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL,
                              &creator_items, NULL);
    signature = cpu_to_le64(VHDX_FILE_SIGNATURE);
-    ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature));
+    ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature));
    if (ret < 0) {
        goto delete_and_exit;
    }
    if (creator) {
-        ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature),
-                          creator, creator_items * sizeof(gunichar2));
+        ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature),
+                         creator, creator_items * sizeof(gunichar2));
        if (ret < 0) {
            goto delete_and_exit;
        }
@@ -1872,13 +1870,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)


    /* Creates (B),(C) */
-    ret = vhdx_create_new_headers(bs, image_size, log_size);
+    ret = vhdx_create_new_headers(blk_bs(blk), image_size, log_size);
    if (ret < 0) {
        goto delete_and_exit;
    }

    /* Creates (D),(E),(G) explicitly. (F) created as by-product */
-    ret = vhdx_create_new_region_table(bs, image_size, block_size, 512,
+    ret = vhdx_create_new_region_table(blk_bs(blk), image_size, block_size, 512,
                                       log_size, use_zero_blocks, image_type,
                                       &metadata_offset);
    if (ret < 0) {
@@ -1886,7 +1884,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    /* Creates (H) */
-    ret = vhdx_create_new_metadata(bs, image_size, block_size, 512,
+    ret = vhdx_create_new_metadata(blk_bs(blk), image_size, block_size, 512,
                                   metadata_offset, image_type);
    if (ret < 0) {
        goto delete_and_exit;
@@ -1894,7 +1892,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)


 delete_and_exit:
-    bdrv_unref(bs);
+    blk_unref(blk);
 exit:
    g_free(type);
    g_free(creator);
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -26,6 +26,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
@@ -242,15 +243,17 @@ static void vmdk_free_last_extent(BlockDriverState *bs)

 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
 {
-    char desc[DESC_SIZE];
+    char *desc;
    uint32_t cid = 0xffffffff;
    const char *p_name, *cid_str;
    size_t cid_str_size;
    BDRVVmdkState *s = bs->opaque;
    int ret;

+    desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
+        g_free(desc);
        return 0;
    }

@@ -269,41 +272,45 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
        sscanf(p_name, "%" SCNx32, &cid);
    }

+    g_free(desc);
    return cid;
 }

 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
 {
-    char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
+    char *desc, *tmp_desc; /* DESC_SIZE heap buffers, freed at out: */
    char *p_name, *tmp_str;
    BDRVVmdkState *s = bs->opaque;
-    int ret;
+    int ret = 0;

+    desc = g_malloc0(DESC_SIZE);
+    tmp_desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
-        return ret;
+        goto out; /* ret holds the bdrv_pread() error */
    }

    desc[DESC_SIZE - 1] = '\0';
    tmp_str = strstr(desc, "parentCID");
    if (tmp_str == NULL) {
-        return -EINVAL;
+        ret = -EINVAL;
+        goto out;
    }

-    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
+    pstrcpy(tmp_desc, DESC_SIZE, tmp_str); /* keep the "parentCID" tail */
    p_name = strstr(desc, "CID");
    if (p_name != NULL) {
        p_name += sizeof("CID");
-        snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid);
-        pstrcat(desc, sizeof(desc), tmp_desc);
+        snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid);
+        pstrcat(desc, DESC_SIZE, tmp_desc); /* re-append the saved tail */
    }

    ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
-    if (ret < 0) {
-        return ret;
-    }

-    return 0;
+out:
+    g_free(desc);
+    g_free(tmp_desc);
+    return ret; /* bdrv_pwrite_sync() result, or the earlier error */
 }

 static int vmdk_is_cid_valid(BlockDriverState *bs)
@@ -337,15 +344,16 @@ static int vmdk_reopen_prepare(BDRVReopenState *state,
 static int vmdk_parent_open(BlockDriverState *bs)
 {
    char *p_name;
-    char desc[DESC_SIZE + 1];
+    char *desc;
    BDRVVmdkState *s = bs->opaque;
    int ret;

-    desc[DESC_SIZE] = '\0';
+    desc = g_malloc0(DESC_SIZE + 1);
    ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
-        return ret;
+        goto out;
    }
+    ret = 0;

    p_name = strstr(desc, "parentFileNameHint");
    if (p_name != NULL) {
@@ -354,16 +362,20 @@ static int vmdk_parent_open(BlockDriverState *bs)
        p_name += sizeof("parentFileNameHint") + 1;
        end_name = strchr(p_name, '\"');
        if (end_name == NULL) {
-            return -EINVAL;
+            ret = -EINVAL;
+            goto out;
        }
        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
-            return -EINVAL;
+            ret = -EINVAL;
+            goto out;
        }

        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
    }

-    return 0;
+out:
+    g_free(desc);
+    return ret;
 }

 /* Create and append extent to the extent array. Return the added VmdkExtent
@@ -649,11 +661,8 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
    if (le32_to_cpu(header.version) > 3) {
-        char buf[64];
-        snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
-                 le32_to_cpu(header.version));
-        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                   bdrv_get_device_or_node_name(bs), "vmdk", buf);
+        error_setg(errp, "Unsupported VMDK version %" PRIu32,
+                   le32_to_cpu(header.version));
        return -ENOTSUP;
    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) &&
               !compressed) {
@@ -1639,7 +1648,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
                              QemuOpts *opts, Error **errp)
 {
    int ret, i;
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk = NULL;
    VMDK4Header header;
    Error *local_err = NULL;
    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
@@ -1652,16 +1661,19 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
        goto exit;
    }

-    assert(bs == NULL);
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto exit;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    if (flat) {
-        ret = bdrv_truncate(bs, filesize);
+        ret = blk_truncate(blk, filesize);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not truncate file");
        }
@@ -1716,18 +1728,18 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
    header.check_bytes[3] = 0xa;

    /* write all the data */
-    ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic));
+    ret = blk_pwrite(blk, 0, &magic, sizeof(magic));
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }
-    ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header));
+    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header));
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }

-    ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9);
+    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not truncate file");
        goto exit;
@@ -1740,8 +1752,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
         i < gt_count; i++, tmp += gt_size) {
        gd_buf[i] = cpu_to_le32(tmp);
    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
+    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
+                     gd_buf, gd_buf_size);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1752,8 +1764,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
         i < gt_count; i++, tmp += gt_size) {
        gd_buf[i] = cpu_to_le32(tmp);
    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
+    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                     gd_buf, gd_buf_size);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1761,8 +1773,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,

    ret = 0;
 exit:
-    if (bs) {
-        bdrv_unref(bs);
+    if (blk) {
+        blk_unref(blk);
    }
    g_free(gd_buf);
    return ret;
@@ -1811,7 +1823,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
 static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
 {
    int idx = 0;
-    BlockDriverState *new_bs = NULL;
+    BlockBackend *new_blk = NULL;
    Error *local_err = NULL;
    char *desc = NULL;
    int64_t total_size = 0, filesize;
@@ -1922,7 +1934,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
        goto exit;
    }
    if (backing_file) {
-        BlockDriverState *bs = NULL;
+        BlockBackend *blk;
        char *full_backing = g_new0(char, PATH_MAX);
        bdrv_get_full_backing_filename_from_filename(filename, backing_file,
                                                     full_backing, PATH_MAX,
@@ -1933,18 +1945,21 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
            ret = -ENOENT;
            goto exit;
        }
-        ret = bdrv_open(&bs, full_backing, NULL, NULL, BDRV_O_NO_BACKING, errp);
+
+        blk = blk_new_open(full_backing, NULL, NULL,
+                           BDRV_O_NO_BACKING | BDRV_O_CACHE_WB, errp);
        g_free(full_backing);
-        if (ret != 0) {
+        if (blk == NULL) {
+            ret = -EIO;
            goto exit;
        }
-        if (strcmp(bs->drv->format_name, "vmdk")) {
-            bdrv_unref(bs);
+        if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) {
+            blk_unref(blk);
            ret = -EINVAL;
            goto exit;
        }
-        parent_cid = vmdk_read_cid(bs, 0);
-        bdrv_unref(bs);
+        parent_cid = vmdk_read_cid(blk_bs(blk), 0);
+        blk_unref(blk);
        snprintf(parent_desc_line, BUF_SIZE,
                "parentFileNameHint=\"%s\"", backing_file);
    }
@@ -2002,14 +2017,19 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
            goto exit;
        }
    }
-    assert(new_bs == NULL);
-    ret = bdrv_open(&new_bs, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
-    if (ret < 0) {
+
+    new_blk = blk_new_open(filename, NULL, NULL,
+                           BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                           &local_err);
+    if (new_blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto exit;
    }
-    ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len);
+
+    blk_set_allow_write_beyond_eof(new_blk, true);
+
+    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write description");
        goto exit;
@@ -2017,14 +2037,14 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
     * for description file */
    if (desc_offset == 0) {
-        ret = bdrv_truncate(new_bs, desc_len);
+        ret = blk_truncate(new_blk, desc_len);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not truncate file");
        }
    }
 exit:
-    if (new_bs) {
-        bdrv_unref(new_bs);
+    if (new_blk) {
+        blk_unref(new_blk);
    }
    g_free(adapter_type);
    g_free(backing_file);
@@ -2183,18 +2203,18 @@ static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)

    *spec_info = (ImageInfoSpecific){
        .type = IMAGE_INFO_SPECIFIC_KIND_VMDK,
-        {
-            .vmdk = g_new0(ImageInfoSpecificVmdk, 1),
+        .u = {
+            .vmdk.data = g_new0(ImageInfoSpecificVmdk, 1),
        },
    };

-    *spec_info->u.vmdk = (ImageInfoSpecificVmdk) {
+    *spec_info->u.vmdk.data = (ImageInfoSpecificVmdk) {
        .create_type = g_strdup(s->create_type),
        .cid = s->cid,
        .parent_cid = s->parent_cid,
    };

-    next = &spec_info->u.vmdk->extents;
+    next = &spec_info->u.vmdk.data->extents;
    for (i = 0; i < s->num_extents; i++) {
        *next = g_new0(ImageInfoList, 1);
        (*next)->value = vmdk_get_extent_info(&s->extents[i]);
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
 #if defined(CONFIG_UUID)
@@ -46,8 +47,14 @@ enum vhd_type {
 // Seconds since Jan 1, 2000 0:00:00 (UTC)
 #define VHD_TIMESTAMP_BASE 946684800

+#define VHD_CHS_MAX_C   65535LL
+#define VHD_CHS_MAX_H   16
+#define VHD_CHS_MAX_S   255
+
 #define VHD_MAX_SECTORS       (65535LL * 255 * 255)
-#define VHD_MAX_GEOMETRY      (65535LL *  16 * 255)
+#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)
+
+#define VPC_OPT_FORCE_SIZE "force_size"

 // always big-endian
 typedef struct vhd_footer {
@@ -128,6 +135,8 @@ typedef struct BDRVVPCState {

    uint32_t block_size;
    uint32_t bitmap_size;
+    bool force_use_chs;
+    bool force_use_sz;

 #ifdef CACHE
    uint8_t *pageentry_u8;
@@ -140,6 +149,22 @@ typedef struct BDRVVPCState {
    Error *migration_blocker;
 } BDRVVPCState;

+#define VPC_OPT_SIZE_CALC "force_size_calc"
+static QemuOptsList vpc_runtime_opts = {
+    .name = "vpc-runtime-opts",
+    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
+    .desc = {
+        {
+            .name = VPC_OPT_SIZE_CALC,
+            .type = QEMU_OPT_STRING,
+            .help = "Force disk size calculation to use either CHS geometry, "
+                    "or use the disk current_size specified in the VHD footer. "
+                    "{chs, current_size}"
+        },
+        { /* end of list */ }
+    }
+};
+
 static uint32_t vpc_checksum(uint8_t* buf, size_t size)
 {
    uint32_t res = 0;
@@ -159,6 +184,25 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
    return 0;
 }

+static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
+                              Error **errp)
+{
+    BDRVVPCState *s = bs->opaque;
+    const char *size_calc;
+
+    size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
+
+    if (!size_calc) {
+        /* no override, use autodetect only */
+    } else if (!strcmp(size_calc, "current_size")) {
+        s->force_use_sz = true;
+    } else if (!strcmp(size_calc, "chs")) {
+        s->force_use_chs = true;
+    } else {
+        error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
+    }
+}
+
 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
@@ -166,6 +210,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    int i;
    VHDFooter *footer;
    VHDDynDiskHeader *dyndisk_header;
+    QemuOpts *opts = NULL;
+    Error *local_err = NULL;
+    bool use_chs;
    uint8_t buf[HEADER_SIZE];
    uint32_t checksum;
    uint64_t computed_size;
@@ -173,6 +220,21 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    int disk_type = VHD_DYNAMIC;
    int ret;

+    opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    vpc_parse_options(bs, opts, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
    ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE);
    if (ret < 0) {
        goto fail;
@@ -218,12 +280,36 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    bs->total_sectors = (int64_t)
        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;

-    /* Images that have exactly the maximum geometry are probably bigger and
-     * would be truncated if we adhered to the geometry for them. Rely on
-     * footer->current_size for them. */
-    if (bs->total_sectors == VHD_MAX_GEOMETRY) {
+    /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
+     * VHD image sizes differently.  VPC will rely on CHS geometry,
+     * while Hyper-V and disk2vhd use the size specified in the footer.
+     *
+     * We use a couple of approaches to try and determine the correct method:
+     * look at the Creator App field, and look for images that have CHS
+     * geometry that is the maximum value.
+     *
+     * If the CHS geometry is the maximum CHS geometry, then we assume that
+     * the size is the footer->current_size to avoid truncation.  Otherwise,
+     * we follow the table based on footer->creator_app:
+     *
+     *  Known creator apps:
+     *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
+     *      'qemu'  :  CHS              QEMU (uses disk geometry)
+     *      'qem2'  :  current_size     QEMU (uses current_size)
+     *      'win '  :  current_size     Hyper-V
+     *      'd2v '  :  current_size     Disk2vhd
+     *
+     *  The user can override the table values via drive options; however,
+     *  even with an override we will still use current_size for images
+     *  that have CHS geometry of the maximum size.
+     */
+    use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
+               !!strncmp(footer->creator_app, "qem2", 4) &&
+               !!strncmp(footer->creator_app, "d2v ", 4)) || s->force_use_chs;
+
+    if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
        bs->total_sectors = be64_to_cpu(footer->current_size) /
-                            BDRV_SECTOR_SIZE;
+                                        BDRV_SECTOR_SIZE;
    }

    /* Allow a maximum disk size of approximately 2 TB */
@@ -673,7 +759,7 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
    return 0;
 }

-static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
+static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
                               int64_t total_sectors)
 {
    VHDDynDiskHeader *dyndisk_header =
@@ -687,13 +773,13 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
    block_size = 0x200000;
    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);

-    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
    if (ret) {
        goto fail;
    }

    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
-    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
    if (ret < 0) {
        goto fail;
    }
@@ -703,7 +789,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,

    memset(buf, 0xFF, 512);
    for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
-        ret = bdrv_pwrite_sync(bs, offset, buf, 512);
+        ret = blk_pwrite(blk, offset, buf, 512);
        if (ret < 0) {
            goto fail;
        }
@@ -730,7 +816,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
    // Write the header
    offset = 512;

-    ret = bdrv_pwrite_sync(bs, offset, buf, 1024);
+    ret = blk_pwrite(blk, offset, buf, 1024);
    if (ret < 0) {
        goto fail;
    }
@@ -739,7 +825,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
    return ret;
 }

-static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf,
+static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
                             int64_t total_size)
 {
    int ret;
@@ -747,12 +833,12 @@ static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf,
    /* Add footer to total size */
    total_size += HEADER_SIZE;

-    ret = bdrv_truncate(bs, total_size);
+    ret = blk_truncate(blk, total_size);
    if (ret < 0) {
        return ret;
    }

-    ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE);
    if (ret < 0) {
        return ret;
    }
@@ -773,8 +859,9 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    int64_t total_size;
    int disk_type;
    int ret = -EIO;
+    bool force_size;
    Error *local_err = NULL;
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk = NULL;

    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
@@ -793,30 +880,44 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
        disk_type = VHD_DYNAMIC;
    }

+    force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
+
    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto out;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /*
     * Calculate matching total_size and geometry. Increase the number of
     * sectors requested until we get enough (or fail). This ensures that
     * qemu-img convert doesn't truncate images, but rather rounds up.
     *
-     * If the image size can't be represented by a spec conform CHS geometry,
+     * If the image size can't be represented by a spec conformant CHS geometry,
     * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
     * the image size from the VHD footer to calculate total_sectors.
     */
-    total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
-    for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
-        calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
+    if (force_size) {
+        /* This will force the use of total_size for sector count, below */
+        cyls         = VHD_CHS_MAX_C;
+        heads        = VHD_CHS_MAX_H;
+        secs_per_cyl = VHD_CHS_MAX_S;
+    } else {
+        total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
+        for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
+            calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
+        }
    }

    if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
@@ -835,8 +936,11 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    memset(buf, 0, 1024);

    memcpy(footer->creator, "conectix", 8);
-    /* TODO Check if "qemu" creator_app is ok for VPC */
-    memcpy(footer->creator_app, "qemu", 4);
+    if (force_size) {
+        memcpy(footer->creator_app, "qem2", 4);
+    } else {
+        memcpy(footer->creator_app, "qemu", 4);
+    }
    memcpy(footer->creator_os, "Wi2k", 4);

    footer->features = cpu_to_be32(0x02);
@@ -866,13 +970,13 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));

    if (disk_type == VHD_DYNAMIC) {
-        ret = create_dynamic_disk(bs, buf, total_sectors);
+        ret = create_dynamic_disk(blk, buf, total_sectors);
    } else {
-        ret = create_fixed_disk(bs, buf, total_size);
+        ret = create_fixed_disk(blk, buf, total_size);
    }

 out:
-    bdrv_unref(bs);
+    blk_unref(blk);
    g_free(disk_type_param);
    return ret;
 }
@@ -917,6 +1021,13 @@ static QemuOptsList vpc_create_opts = {
                "Type of virtual hard disk format. Supported formats are "
                "{dynamic (default) | fixed} "
        },
+        {
+            .name = VPC_OPT_FORCE_SIZE,
+            .type = QEMU_OPT_BOOL,
+            .help = "Force disk size calculation to use the actual size "
+                    "specified, rather than using the nearest CHS-based "
+                    "calculation"
+        },
        { /* end of list */ }
    }
 };
--- a/blockdev.c
+++ b/blockdev.c
@@ -147,6 +147,7 @@ void blockdev_auto_del(BlockBackend *blk)
    DriveInfo *dinfo = blk_legacy_dinfo(blk);

    if (dinfo && dinfo->auto_del) {
+        monitor_remove_blk(blk);
        blk_unref(blk);
    }
 }
@@ -561,7 +562,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
    if ((!file || !*file) && !qdict_size(bs_opts)) {
        BlockBackendRootState *blk_rs;

-        blk = blk_new(qemu_opts_id(opts), errp);
+        blk = blk_new(errp);
        if (!blk) {
            goto early_err;
        }
@@ -593,19 +594,11 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");

-        if (snapshot) {
-            /* always use cache=unsafe with snapshot */
-            qdict_put(bs_opts, BDRV_OPT_CACHE_WB, qstring_from_str("on"));
-            qdict_put(bs_opts, BDRV_OPT_CACHE_DIRECT, qstring_from_str("off"));
-            qdict_put(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, qstring_from_str("on"));
-        }
-
        if (runstate_check(RUN_STATE_INMIGRATE)) {
            bdrv_flags |= BDRV_O_INACTIVE;
        }

-        blk = blk_new_open(qemu_opts_id(opts), file, NULL, bs_opts, bdrv_flags,
-                           errp);
+        blk = blk_new_open(file, NULL, bs_opts, bdrv_flags, errp);
        if (!blk) {
            goto err_no_bs_opts;
        }
@@ -637,6 +630,12 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,

    blk_set_on_error(blk, on_read_error, on_write_error);

+    if (!monitor_add_blk(blk, qemu_opts_id(opts), errp)) {
+        blk_unref(blk);
+        blk = NULL;
+        goto err_no_bs_opts;
+    }
+
 err_no_bs_opts:
    qemu_opts_del(opts);
    QDECREF(interval_dict);
@@ -682,6 +681,13 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
        goto fail;
    }

+    /* bdrv_open() defaults to the values in bdrv_flags (for compatibility
+     * with other callers) rather than what we want as the real defaults.
+     * Apply the defaults here instead. */
+    qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_WB, "on");
+    qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
+    qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");
+
    if (runstate_check(RUN_STATE_INMIGRATE)) {
        bdrv_flags |= BDRV_O_INACTIVE;
    }
@@ -717,6 +723,13 @@ void blockdev_close_all_bdrv_states(void)
    }
 }

+/* Iterates over the list of monitor-owned BlockDriverStates */
+BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs)
+{
+    return bs ? QTAILQ_NEXT(bs, monitor_list)
+              : QTAILQ_FIRST(&monitor_bdrv_states);
+}
+
 static void qemu_opt_rename(QemuOpts *opts, const char *from, const char *to,
                            Error **errp)
 {
@@ -1173,7 +1186,7 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
    int ret;

    if (!strcmp(device, "all")) {
-        ret = bdrv_commit_all();
+        ret = blk_commit_all();
    } else {
        BlockDriverState *bs;
        AioContext *aio_context;
@@ -1202,15 +1215,11 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
    }
 }

-static void blockdev_do_action(TransactionActionKind type, void *data,
-                               Error **errp)
+static void blockdev_do_action(TransactionAction *action, Error **errp)
 {
-    TransactionAction action;
    TransactionActionList list;

-    action.type = type;
-    action.u.data = data;
-    list.value = &action;
+    list.value = action;
    list.next = NULL;
    qmp_transaction(&list, false, NULL, errp);
 }
@@ -1236,8 +1245,11 @@ void qmp_blockdev_snapshot_sync(bool has_device, const char *device,
        .has_mode = has_mode,
        .mode = mode,
    };
-    blockdev_do_action(TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC,
-                       &snapshot, errp);
+    TransactionAction action = {
+        .type = TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC,
+        .u.blockdev_snapshot_sync.data = &snapshot,
+    };
+    blockdev_do_action(&action, errp);
 }

 void qmp_blockdev_snapshot(const char *node, const char *overlay,
@@ -1247,9 +1259,11 @@ void qmp_blockdev_snapshot(const char *node, const char *overlay,
        .node = (char *) node,
        .overlay = (char *) overlay
    };
-
-    blockdev_do_action(TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT,
-                       &snapshot_data, errp);
+    TransactionAction action = {
+        .type = TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT,
+        .u.blockdev_snapshot.data = &snapshot_data,
+    };
+    blockdev_do_action(&action, errp);
 }

 void qmp_blockdev_snapshot_internal_sync(const char *device,
@@ -1260,9 +1274,11 @@ void qmp_blockdev_snapshot_internal_sync(const char *device,
        .device = (char *) device,
        .name = (char *) name
    };
-
-    blockdev_do_action(TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC,
-                       &snapshot, errp);
+    TransactionAction action = {
+        .type = TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC,
+        .u.blockdev_snapshot_internal_sync.data = &snapshot,
+    };
+    blockdev_do_action(&action, errp);
 }

 SnapshotInfo *qmp_blockdev_snapshot_delete_internal_sync(const char *device,
@@ -1499,7 +1515,7 @@ static void internal_snapshot_prepare(BlkActionState *common,

    g_assert(common->action->type ==
             TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC);
-    internal = common->action->u.blockdev_snapshot_internal_sync;
+    internal = common->action->u.blockdev_snapshot_internal_sync.data;
    state = DO_UPCAST(InternalSnapshotState, common, common);

    /* 1. parse input */
@@ -1649,7 +1665,7 @@ static void external_snapshot_prepare(BlkActionState *common,
    switch (action->type) {
    case TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT:
        {
-            BlockdevSnapshot *s = action->u.blockdev_snapshot;
+            BlockdevSnapshot *s = action->u.blockdev_snapshot.data;
            device = s->node;
            node_name = s->node;
            new_image_file = NULL;
@@ -1658,7 +1674,7 @@ static void external_snapshot_prepare(BlkActionState *common,
        break;
    case TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC:
        {
-            BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync;
+            BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync.data;
            device = s->has_device ? s->device : NULL;
            node_name = s->has_node_name ? s->node_name : NULL;
            new_image_file = s->snapshot_file;
@@ -1707,7 +1723,7 @@ static void external_snapshot_prepare(BlkActionState *common,
    }

    if (action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC) {
-        BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync;
+        BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync.data;
        const char *format = s->has_format ? s->format : "qcow2";
        enum NewImageMode mode;
        const char *snapshot_node_name =
@@ -1729,10 +1745,15 @@ static void external_snapshot_prepare(BlkActionState *common,
        /* create new image w/backing file */
        mode = s->has_mode ? s->mode : NEW_IMAGE_MODE_ABSOLUTE_PATHS;
        if (mode != NEW_IMAGE_MODE_EXISTING) {
+            int64_t size = bdrv_getlength(state->old_bs);
+            if (size < 0) {
+                error_setg_errno(errp, -size, "bdrv_getlength failed");
+                return;
+            }
            bdrv_img_create(new_image_file, format,
                            state->old_bs->filename,
                            state->old_bs->drv->format_name,
-                            NULL, -1, flags, &local_err, false);
+                            NULL, size, flags, &local_err, false);
            if (local_err) {
                error_propagate(errp, local_err);
                return;
@@ -1840,7 +1861,7 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp)
    Error *local_err = NULL;

    assert(common->action->type == TRANSACTION_ACTION_KIND_DRIVE_BACKUP);
-    backup = common->action->u.drive_backup;
+    backup = common->action->u.drive_backup.data;

    blk = blk_by_name(backup->device);
    if (!blk) {
@@ -1922,7 +1943,7 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
    Error *local_err = NULL;

    assert(common->action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP);
-    backup = common->action->u.blockdev_backup;
+    backup = common->action->u.blockdev_backup.data;

    blk = blk_by_name(backup->device);
    if (!blk) {
@@ -2008,7 +2029,7 @@ static void block_dirty_bitmap_add_prepare(BlkActionState *common,
        return;
    }

-    action = common->action->u.block_dirty_bitmap_add;
+    action = common->action->u.block_dirty_bitmap_add.data;
    /* AIO context taken and released within qmp_block_dirty_bitmap_add */
    qmp_block_dirty_bitmap_add(action->node, action->name,
                               action->has_granularity, action->granularity,
@@ -2027,7 +2048,7 @@ static void block_dirty_bitmap_add_abort(BlkActionState *common)
    BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState,
                                             common, common);

-    action = common->action->u.block_dirty_bitmap_add;
+    action = common->action->u.block_dirty_bitmap_add.data;
    /* Should not be able to fail: IF the bitmap was added via .prepare(),
     * then the node reference and bitmap name must have been valid.
     */
@@ -2047,7 +2068,7 @@ static void block_dirty_bitmap_clear_prepare(BlkActionState *common,
        return;
    }

-    action = common->action->u.block_dirty_bitmap_clear;
+    action = common->action->u.block_dirty_bitmap_clear.data;
    state->bitmap = block_dirty_bitmap_lookup(action->node,
                                              action->name,
                                              &state->bs,
@@ -2405,11 +2426,6 @@ void qmp_x_blockdev_remove_medium(const char *device, Error **errp)
        goto out;
    }

-    /* This follows the convention established by bdrv_make_anon() */
-    if (bs->device_list.tqe_prev) {
-        bdrv_device_remove(bs);
-    }
-
    blk_remove_bs(blk);

    if (!blk_dev_has_tray(blk)) {
@@ -2457,8 +2473,6 @@ static void qmp_blockdev_insert_anon_medium(const char *device,

    blk_insert_bs(blk, bs);

-    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
-
    if (!blk_dev_has_tray(blk)) {
        /* For tray-less devices, blockdev-close-tray is a no-op (or may not be
         * called at all); therefore, the medium needs to be pushed into the
@@ -2816,6 +2830,15 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict)
    AioContext *aio_context;
    Error *local_err = NULL;

+    bs = bdrv_find_node(id);
+    if (bs) {
+        qmp_x_blockdev_del(false, NULL, true, id, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+        }
+        return;
+    }
+
    blk = blk_by_name(id);
    if (!blk) {
        error_report("Device '%s' not found", id);
@@ -2842,13 +2865,16 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict)
        blk_remove_bs(blk);
    }

-    /* if we have a device attached to this BlockDriverState
-     * then we need to make the drive anonymous until the device
-     * can be removed.  If this is a drive with no device backing
-     * then we can just get rid of the block driver state right here.
+    /* Make the BlockBackend and the attached BlockDriverState anonymous */
+    monitor_remove_blk(blk);
+    if (blk_bs(blk)) {
+        bdrv_make_anon(blk_bs(blk));
+    }
+
+    /* If this BlockBackend has a device attached to it, its refcount will be
+     * decremented when the device is removed; otherwise we have to do so here.
     */
    if (blk_get_attached_dev(blk)) {
-        blk_hide_on_behalf_of_hmp_drive_del(blk);
        /* Further I/O must not pause the guest */
        blk_set_on_error(blk, BLOCKDEV_ON_ERROR_REPORT,
                         BLOCKDEV_ON_ERROR_REPORT);
@@ -3867,6 +3893,37 @@ out:
    aio_context_release(aio_context);
 }

+void hmp_drive_add_node(Monitor *mon, const char *optstr)
+{
+    QemuOpts *opts;
+    QDict *qdict;
+    Error *local_err = NULL;
+
+    opts = qemu_opts_parse_noisily(&qemu_drive_opts, optstr, false);
+    if (!opts) {
+        return;
+    }
+
+    qdict = qemu_opts_to_qdict(opts, NULL);
+
+    if (!qdict_get_try_str(qdict, "node-name")) {
+        QDECREF(qdict);
+        error_report("'node-name' needs to be specified");
+        goto out;
+    }
+
+    BlockDriverState *bs = bds_tree_init(qdict, &local_err);
+    if (!bs) {
+        error_report_err(local_err);
+        goto out;
+    }
+
+    QTAILQ_INSERT_TAIL(&monitor_bdrv_states, bs, monitor_list);
+
+out:
+    qemu_opts_del(opts);
+}
+
 void qmp_blockdev_add(BlockdevOptions *options, Error **errp)
 {
    QmpOutputVisitor *ov = qmp_output_visitor_new();
@@ -3928,6 +3985,7 @@ void qmp_blockdev_add(BlockdevOptions *options, Error **errp)

    if (bs && bdrv_key_required(bs)) {
        if (blk) {
+            monitor_remove_blk(blk);
            blk_unref(blk);
        } else {
            QTAILQ_REMOVE(&monitor_bdrv_states, bs, monitor_list);
@@ -3957,6 +4015,7 @@ void qmp_x_blockdev_del(bool has_id, const char *id,
    }

    if (has_id) {
+        /* blk_by_name() never returns a BB that is not owned by the monitor */
        blk = blk_by_name(id);
        if (!blk) {
            error_setg(errp, "Cannot find block backend %s", id);
@@ -4004,6 +4063,7 @@ void qmp_x_blockdev_del(bool has_id, const char *id,
    }

    if (blk) {
+        monitor_remove_blk(blk);
        blk_unref(blk);
    } else {
        QTAILQ_REMOVE(&monitor_bdrv_states, bs, monitor_list);
@@ -4174,7 +4234,7 @@ QemuOptsList qemu_common_drive_opts = {

 static QemuOptsList qemu_root_bds_opts = {
    .name = "root-bds",
-    .head = QTAILQ_HEAD_INITIALIZER(qemu_common_drive_opts.head),
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_root_bds_opts.head),
    .desc = {
        {
            .name = "discard",
--- a/55
+++ b/55
@@ -280,6 +280,7 @@ libusb=""
 usb_redir=""
 opengl=""
 opengl_dmabuf="no"
+avx2_opt="no"
 zlib="yes"
 lzo=""
 snappy=""
@@ -305,8 +306,10 @@ gtkabi=""
 gtk_gl="no"
 gnutls=""
 gnutls_hash=""
+gnutls_rnd=""
 nettle=""
 gcrypt=""
+gcrypt_kdf="no"
 vte=""
 virglrenderer=""
 tpm="yes"
@@ -1773,6 +1776,21 @@ EOF
 fi

 ##########################################
+# avx2 optimization requirement check
+
+cat > $TMPC << EOF
+static void bar(void) {}
+static void *bar_ifunc(void) {return (void*) bar;}
+static void foo(void) __attribute__((ifunc("bar_ifunc")));
+int main(void) { foo(); return 0; }
+EOF
+if compile_prog "-mavx2" "" ; then
+    if readelf --syms $TMPE |grep "IFUNC.*foo" >/dev/null 2>&1; then
+        avx2_opt="yes"
+    fi
+fi
+
+#########################################
 # zlib check

 if test "$zlib" != "no" ; then
@@ -2185,6 +2203,13 @@ if test "$gnutls" != "no"; then
 	    gnutls_hash="no"
 	fi

+	# gnutls_rnd requires >= 2.11.0
+	if $pkg_config --exists "gnutls >= 2.11.0"; then
+	    gnutls_rnd="yes"
+	else
+	    gnutls_rnd="no"
+	fi
+
 	if $pkg_config --exists 'gnutls >= 3.0'; then
 	    gnutls_gcrypt=no
 	    gnutls_nettle=yes
@@ -2212,9 +2237,11 @@ if test "$gnutls" != "no"; then
    else
        gnutls="no"
        gnutls_hash="no"
+        gnutls_rnd="no"
    fi
 else
    gnutls_hash="no"
+    gnutls_rnd="no"
 fi


@@ -2276,6 +2303,19 @@ if test "$gcrypt" != "no"; then
        if test -z "$nettle"; then
           nettle="no"
        fi
+
+        cat > $TMPC << EOF
+#include <gcrypt.h>
+int main(void) {
+  gcry_kdf_derive(NULL, 0, GCRY_KDF_PBKDF2,
+                  GCRY_MD_SHA256,
+                  NULL, 0, 0, 0, NULL);
+ return 0;
+}
+EOF
+        if compile_prog "$gcrypt_cflags" "$gcrypt_libs" ; then
+            gcrypt_kdf=yes
+        fi
    else
        if test "$gcrypt" = "yes"; then
            feature_not_found "gcrypt" "Install gcrypt devel"
@@ -2796,7 +2836,7 @@ fi
 # curses probe
 if test "$curses" != "no" ; then
  if test "$mingw32" = "yes" ; then
-    curses_list="-lpdcurses"
+    curses_list="$($pkg_config --libs ncurses 2>/dev/null):-lpdcurses"
  else
    curses_list="$($pkg_config --libs ncurses 2>/dev/null):-lncurses:-lcurses"
  fi
@@ -4698,7 +4738,9 @@ echo "GTK support       $gtk"
 echo "GTK GL support    $gtk_gl"
 echo "GNUTLS support    $gnutls"
 echo "GNUTLS hash       $gnutls_hash"
+echo "GNUTLS rnd        $gnutls_rnd"
 echo "libgcrypt         $gcrypt"
+echo "libgcrypt kdf     $gcrypt_kdf"
 if test "$nettle" = "yes"; then
    echo "nettle            $nettle ($nettle_version)"
 else
@@ -4790,6 +4832,7 @@ echo "bzip2 support     $bzip2"
 echo "NUMA host support $numa"
 echo "tcmalloc support  $tcmalloc"
 echo "jemalloc support  $jemalloc"
+echo "avx2 optimization $avx2_opt"

 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -5075,8 +5118,14 @@ fi
 if test "$gnutls_hash" = "yes" ; then
  echo "CONFIG_GNUTLS_HASH=y" >> $config_host_mak
 fi
+if test "$gnutls_rnd" = "yes" ; then
+  echo "CONFIG_GNUTLS_RND=y" >> $config_host_mak
+fi
 if test "$gcrypt" = "yes" ; then
  echo "CONFIG_GCRYPT=y" >> $config_host_mak
+  if test "$gcrypt_kdf" = "yes" ; then
+    echo "CONFIG_GCRYPT_KDF=y" >> $config_host_mak
+  fi
 fi
 if test "$nettle" = "yes" ; then
  echo "CONFIG_NETTLE=y" >> $config_host_mak
@@ -5178,6 +5227,10 @@ if test "$opengl" = "yes" ; then
  fi
 fi

+if test "$avx2_opt" = "yes" ; then
+  echo "CONFIG_AVX2_OPT=y" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
  echo "CONFIG_LZO=y" >> $config_host_mak
 fi
--- a/contrib/ivshmem-server/ivshmem-server.c
+++ b/contrib/ivshmem-server/ivshmem-server.c
@@ -12,9 +12,6 @@
 #include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/un.h>
-#ifdef CONFIG_LINUX
-#include <sys/vfs.h>
-#endif

 #include "ivshmem-server.h"

@@ -257,7 +254,8 @@ ivshmem_server_ftruncate(int fd, unsigned shmsize)
 /* Init a new ivshmem server */
 int
 ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
-                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    const char *shm_path, bool use_shm_open,
+                    size_t shm_size, unsigned n_vectors,
                    bool verbose)
 {
    int ret;
@@ -278,6 +276,7 @@ ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
        return -1;
    }

+    server->use_shm_open = use_shm_open;
    server->shm_size = shm_size;
    server->n_vectors = n_vectors;

@@ -286,31 +285,6 @@ ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
    return 0;
 }

-#ifdef CONFIG_LINUX
-
-#define HUGETLBFS_MAGIC       0x958458f6
-
-static long gethugepagesize(const char *path)
-{
-    struct statfs fs;
-    int ret;
-
-    do {
-        ret = statfs(path, &fs);
-    } while (ret != 0 && errno == EINTR);
-
-    if (ret != 0) {
-        return -1;
-    }
-
-    if (fs.f_type != HUGETLBFS_MAGIC) {
-        return -1;
-    }
-
-    return fs.f_bsize;
-}
-#endif
-
 /* open shm, create and bind to the unix socket */
 int
 ivshmem_server_start(IvshmemServer *server)
@@ -319,27 +293,17 @@ ivshmem_server_start(IvshmemServer *server)
    int shm_fd, sock_fd, ret;

    /* open shm file */
-#ifdef CONFIG_LINUX
-    long hpagesize;
-
-    hpagesize = gethugepagesize(server->shm_path);
-    if (hpagesize < 0 && errno != ENOENT) {
-        IVSHMEM_SERVER_DEBUG(server, "cannot stat shm file %s: %s\n",
-                             server->shm_path, strerror(errno));
-    }
-
-    if (hpagesize > 0) {
+    if (server->use_shm_open) {
+        IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n",
+                             server->shm_path);
+        shm_fd = shm_open(server->shm_path, O_CREAT | O_RDWR, S_IRWXU);
+    } else {
        gchar *filename = g_strdup_printf("%s/ivshmem.XXXXXX", server->shm_path);
-        IVSHMEM_SERVER_DEBUG(server, "Using hugepages: %s\n", server->shm_path);
+        IVSHMEM_SERVER_DEBUG(server, "Using file-backed shared memory: %s\n",
+                             server->shm_path);
        shm_fd = mkstemp(filename);
        unlink(filename);
        g_free(filename);
-    } else
-#endif
-    {
-        IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n",
-                             server->shm_path);
-        shm_fd = shm_open(server->shm_path, O_CREAT|O_RDWR, S_IRWXU);
    }

    if (shm_fd < 0) {
--- a/contrib/ivshmem-server/ivshmem-server.h
+++ b/contrib/ivshmem-server/ivshmem-server.h
@@ -66,6 +66,7 @@ typedef struct IvshmemServer {
    char unix_sock_path[PATH_MAX];   /**< path to unix socket */
    int sock_fd;                     /**< unix sock file descriptor */
    char shm_path[PATH_MAX];         /**< path to shm */
+    bool use_shm_open;
    size_t shm_size;                 /**< size of shm */
    int shm_fd;                      /**< shm file descriptor */
    unsigned n_vectors;              /**< number of vectors */
@@ -89,7 +90,8 @@ typedef struct IvshmemServer {
 */
 int
 ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
-                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    const char *shm_path, bool use_shm_open,
+                    size_t shm_size, unsigned n_vectors,
                    bool verbose);

 /**
--- a/contrib/ivshmem-server/main.c
+++ b/contrib/ivshmem-server/main.c
@@ -29,35 +29,38 @@ typedef struct IvshmemServerArgs {
    const char *pid_file;
    const char *unix_socket_path;
    const char *shm_path;
+    bool use_shm_open;
    uint64_t shm_size;
    unsigned n_vectors;
 } IvshmemServerArgs;

-/* show ivshmem_server_usage and exit with given error code */
 static void
-ivshmem_server_usage(const char *name, int code)
+ivshmem_server_usage(const char *progname)
 {
-    fprintf(stderr, "%s [opts]\n", name);
-    fprintf(stderr, "  -h: show this help\n");
-    fprintf(stderr, "  -v: verbose mode\n");
-    fprintf(stderr, "  -F: foreground mode (default is to daemonize)\n");
-    fprintf(stderr, "  -p <pid_file>: path to the PID file (used in daemon\n"
-                    "     mode only).\n"
-                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
-    fprintf(stderr, "  -S <unix_socket_path>: path to the unix socket\n"
-                    "     to listen to.\n"
-                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH);
-    fprintf(stderr, "  -m <shm_path>: path to the shared memory.\n"
-                    "     The path corresponds to a POSIX shm name or a\n"
-                    "     hugetlbfs mount point.\n"
-                    "     default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
-    fprintf(stderr, "  -l <size>: size of shared memory in bytes. The suffix\n"
-                    "     K, M and G can be used (ex: 1K means 1024).\n"
-                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_SHM_SIZE);
-    fprintf(stderr, "  -n <n_vects>: number of vectors.\n"
-                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_N_VECTORS);
+    printf("Usage: %s [OPTION]...\n"
+           "  -h: show this help\n"
+           "  -v: verbose mode\n"
+           "  -F: foreground mode (default is to daemonize)\n"
+           "  -p <pid-file>: path to the PID file (used in daemon mode only)\n"
+           "     default " IVSHMEM_SERVER_DEFAULT_PID_FILE "\n"
+           "  -S <unix-socket-path>: path to the unix socket to listen to\n"
+           "     default " IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH "\n"
+           "  -M <shm-name>: POSIX shared memory object to use\n"
+           "     default " IVSHMEM_SERVER_DEFAULT_SHM_PATH "\n"
+           "  -m <dir-name>: where to create shared memory\n"
+           "  -l <size>: size of shared memory in bytes\n"
+           "     suffixes K, M and G can be used, e.g. 1K means 1024\n"
+           "     default %u\n"
+           "  -n <nvectors>: number of vectors\n"
+           "     default %u\n",
+           progname, IVSHMEM_SERVER_DEFAULT_SHM_SIZE,
+           IVSHMEM_SERVER_DEFAULT_N_VECTORS);
+}

-    exit(code);
+static void
+ivshmem_server_help(const char *progname)
+{
+    fprintf(stderr, "Try '%s -h' for more information.\n", progname);
 }

 /* parse the program arguments, exit on error */
@@ -68,20 +71,12 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
    unsigned long long v;
    Error *err = NULL;

-    while ((c = getopt(argc, argv,
-                       "h"  /* help */
-                       "v"  /* verbose */
-                       "F"  /* foreground */
-                       "p:" /* pid_file */
-                       "S:" /* unix_socket_path */
-                       "m:" /* shm_path */
-                       "l:" /* shm_size */
-                       "n:" /* n_vectors */
-                      )) != -1) {
+    while ((c = getopt(argc, argv, "hvFp:S:m:M:l:n:")) != -1) {

        switch (c) {
        case 'h': /* help */
-            ivshmem_server_usage(argv[0], 0);
+            ivshmem_server_usage(argv[0]);
+            exit(0);
            break;

        case 'v': /* verbose */
@@ -92,36 +87,41 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
            args->foreground = 1;
            break;

-        case 'p': /* pid_file */
+        case 'p': /* pid file */
            args->pid_file = optarg;
            break;

-        case 'S': /* unix_socket_path */
+        case 'S': /* unix socket path */
            args->unix_socket_path = optarg;
            break;

-        case 'm': /* shm_path */
+        case 'M': /* shm name */
+        case 'm': /* dir name */
            args->shm_path = optarg;
+            args->use_shm_open = c == 'M';
            break;

-        case 'l': /* shm_size */
+        case 'l': /* shm size */
            parse_option_size("shm_size", optarg, &args->shm_size, &err);
            if (err) {
                error_report_err(err);
-                ivshmem_server_usage(argv[0], 1);
+                ivshmem_server_help(argv[0]);
+                exit(1);
            }
            break;

-        case 'n': /* n_vectors */
+        case 'n': /* number of vectors */
            if (parse_uint_full(optarg, &v, 0) < 0) {
                fprintf(stderr, "cannot parse n_vectors\n");
-                ivshmem_server_usage(argv[0], 1);
+                ivshmem_server_help(argv[0]);
+                exit(1);
            }
            args->n_vectors = v;
            break;

        default:
-            ivshmem_server_usage(argv[0], 1);
+            ivshmem_server_usage(argv[0]);
+            exit(1);
            break;
        }
    }
@@ -129,12 +129,14 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
    if (args->n_vectors > IVSHMEM_SERVER_MAX_VECTORS) {
        fprintf(stderr, "too many requested vectors (max is %d)\n",
                IVSHMEM_SERVER_MAX_VECTORS);
-        ivshmem_server_usage(argv[0], 1);
+        ivshmem_server_help(argv[0]);
+        exit(1);
    }

    if (args->verbose == 1 && args->foreground == 0) {
        fprintf(stderr, "cannot use verbose in daemon mode\n");
-        ivshmem_server_usage(argv[0], 1);
+        ivshmem_server_help(argv[0]);
+        exit(1);
    }
 }

@@ -192,11 +194,18 @@ main(int argc, char *argv[])
        .pid_file = IVSHMEM_SERVER_DEFAULT_PID_FILE,
        .unix_socket_path = IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH,
        .shm_path = IVSHMEM_SERVER_DEFAULT_SHM_PATH,
+        .use_shm_open = true,
        .shm_size = IVSHMEM_SERVER_DEFAULT_SHM_SIZE,
        .n_vectors = IVSHMEM_SERVER_DEFAULT_N_VECTORS,
    };
    int ret = 1;

+    /*
+     * Do not remove this notice without adding proper error handling!
+     * Start with handling ivshmem_server_send_one_msg() failure.
+     */
+    printf("*** Example code, do not use in production ***\n");
+
    /* parse arguments, will exit on error */
    ivshmem_server_parse_args(&args, argc, argv);

@@ -219,7 +228,8 @@ main(int argc, char *argv[])
    }

    /* init the ivshms structure */
-    if (ivshmem_server_init(&server, args.unix_socket_path, args.shm_path,
+    if (ivshmem_server_init(&server, args.unix_socket_path,
+                            args.shm_path, args.use_shm_open,
                            args.shm_size, args.n_vectors, args.verbose) < 0) {
        fprintf(stderr, "cannot init server\n");
        goto err;
--- a/cpus.c
+++ b/cpus.c
@@ -29,6 +29,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/block-backend.h"
 #include "exec/gdbstub.h"
 #include "sysemu/dma.h"
 #include "sysemu/kvm.h"
@@ -370,9 +371,12 @@ static void icount_warp_rt(void)
    }
 }

-static void icount_dummy_timer(void *opaque)
+static void icount_timer_cb(void *opaque)
 {
-    (void)opaque;
+    /* No need for a checkpoint because the timer already synchronizes
+     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
+     */
+    icount_warp_rt();
 }

 void qtest_clock_warp(int64_t dest)
@@ -396,17 +400,12 @@ void qtest_clock_warp(int64_t dest)
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 }

-void qemu_clock_warp(QEMUClockType type)
+void qemu_start_warp_timer(void)
 {
    int64_t clock;
    int64_t deadline;

-    /*
-     * There are too many global variables to make the "warp" behavior
-     * applicable to other clocks.  But a clock argument removes the
-     * need for if statements all over the place.
-     */
-    if (type != QEMU_CLOCK_VIRTUAL || !use_icount) {
+    if (!use_icount) {
        return;
    }

@@ -418,29 +417,17 @@ void qemu_clock_warp(QEMUClockType type)
    }

    /* warp clock deterministically in record/replay mode */
-    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP)) {
+    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

-    if (icount_sleep) {
-        /*
-         * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
-         * This ensures that the deadline for the timer is computed correctly
-         * below.
-         * This also makes sure that the insn counter is synchronized before
-         * the CPU starts running, in case the CPU is woken by an event other
-         * than the earliest QEMU_CLOCK_VIRTUAL timer.
-         */
-        icount_warp_rt();
-        timer_del(icount_warp_timer);
-    }
    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
-	return;
+        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
@@ -496,6 +483,28 @@ void qemu_clock_warp(QEMUClockType type)
    }
 }

+/*
+ * Cancel the pending icount warp timer and account the warp right away
+ * by calling icount_warp_rt().
+ *
+ * Does nothing unless icount is enabled together with icount_sleep and
+ * the VM is running.  In record/replay mode the work is additionally
+ * gated on the CHECKPOINT_CLOCK_WARP_ACCOUNT checkpoint.
+ */
+static void qemu_account_warp_timer(void)
+{
+    if (!(use_icount && icount_sleep)) {
+        return;
+    }
+
+    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
+     * do not fire, so computing the deadline does not make sense.
+     * When running, warp the clock deterministically in record/replay
+     * mode by passing through the checkpoint first.
+     */
+    if (runstate_is_running() &&
+        replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
+        timer_del(icount_warp_timer);
+        icount_warp_rt();
+    }
+}
+
 static bool icount_state_needed(void *opaque)
 {
    return use_icount;
@@ -624,13 +633,13 @@ void configure_icount(QemuOpts *opts, Error **errp)
    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
-                                         icount_dummy_timer, NULL);
+                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
-        error_setg(errp, "align=on and sleep=no are incompatible");
+        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
@@ -643,7 +652,7 @@ void configure_icount(QemuOpts *opts, Error **errp)
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
-        error_setg(errp, "shift=auto and sleep=no are incompatible");
+        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;
@@ -726,7 +735,7 @@ static int do_vm_stop(RunState state)
    }

    bdrv_drain_all();
-    ret = bdrv_flush_all();
+    ret = blk_flush_all();

    return ret;
 }
@@ -995,9 +1004,6 @@ static void qemu_wait_io_event_common(CPUState *cpu)
 static void qemu_tcg_wait_io_event(CPUState *cpu)
 {
    while (all_cpu_threads_idle()) {
-       /* Start accounting real time to the virtual clock if the CPUs
-          are idle.  */
-        qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

@@ -1428,7 +1434,7 @@ int vm_stop_force_state(RunState state)
        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
-        return bdrv_flush_all();
+        return blk_flush_all();
    }
 }

@@ -1499,7 +1505,7 @@ static void tcg_exec_all(void)
    int r;

    /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
-    qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
+    qemu_account_warp_timer();

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
--- a/cputlb.c
+++ b/cputlb.c
@@ -416,8 +416,8 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
            /* Write access calls the I/O callback.  */
            te->addr_write = address | TLB_MMIO;
        } else if (memory_region_is_ram(section->mr)
-                   && cpu_physical_memory_is_clean(section->mr->ram_addr
-                                                   + xlat)) {
+                   && cpu_physical_memory_is_clean(
+                        memory_region_get_ram_addr(section->mr) + xlat)) {
            te->addr_write = address | TLB_NOTDIRTY;
        } else {
            te->addr_write = address;
--- a/crypto/Makefile.objs
+++ b/crypto/Makefile.objs
@@ -8,6 +8,23 @@ crypto-obj-y += tlscredsanon.o
 crypto-obj-y += tlscredsx509.o
 crypto-obj-y += tlssession.o
 crypto-obj-y += secret.o
+crypto-obj-$(CONFIG_GCRYPT) += random-gcrypt.o
+crypto-obj-$(if $(CONFIG_GCRYPT),n,$(CONFIG_GNUTLS_RND)) += random-gnutls.o
+crypto-obj-y += pbkdf.o
+crypto-obj-$(CONFIG_NETTLE) += pbkdf-nettle.o
+crypto-obj-$(if $(CONFIG_NETTLE),n,$(CONFIG_GCRYPT_KDF)) += pbkdf-gcrypt.o
+crypto-obj-y += ivgen.o
+crypto-obj-y += ivgen-essiv.o
+crypto-obj-y += ivgen-plain.o
+crypto-obj-y += ivgen-plain64.o
+crypto-obj-y += afsplit.o
+crypto-obj-y += xts.o
+crypto-obj-y += block.o
+crypto-obj-y += block-qcow.o
+crypto-obj-y += block-luks.o

 # Let the userspace emulators avoid linking gnutls/etc
 crypto-aes-obj-y = aes.o
+
+stub-obj-y += random-stub.o
+stub-obj-y += pbkdf-stub.o
--- a/crypto/afsplit.c
+++ b/crypto/afsplit.c
@@ -0,0 +1,158 @@
+/*
+ * QEMU Crypto anti forensic information splitter
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * Derived from cryptsetup package lib/luks1/af.c
+ *
+ * Copyright (C) 2004, Clemens Fruhwirth <clemens@endorphin.org>
+ * Copyright (C) 2009-2012, Red Hat, Inc. All rights reserved.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "crypto/afsplit.h"
+#include "crypto/random.h"
+
+
+/*
+ * XOR two buffers byte by byte.
+ *
+ * @blocklen: number of bytes to process
+ * @in1: first input buffer
+ * @in2: second input buffer
+ * @out: receives in1[i] ^ in2[i] for every i below @blocklen
+ *
+ * Each byte is read before the matching output byte is written, so
+ * @out may be the same buffer as @in1 or @in2 (the callers in this
+ * file pass their accumulator as both @in2 and @out).
+ */
+static void qcrypto_afsplit_xor(size_t blocklen,
+                                const uint8_t *in1,
+                                const uint8_t *in2,
+                                uint8_t *out)
+{
+    size_t remaining = blocklen;
+
+    while (remaining--) {
+        *out++ = *in1++ ^ *in2++;
+    }
+}
+
+
+/*
+ * Diffuse @block in place, one digest-sized chunk at a time.
+ *
+ * Chunk number i is replaced by the leading bytes of
+ * hash(be32(i) || chunk).  When @blocklen is not a multiple of the
+ * digest length, the trailing chunk is shorter and only that many
+ * digest bytes are copied back.
+ *
+ * Returns 0 on success, -1 if hashing fails (@errp is then filled in
+ * by qcrypto_hash_bytesv()).
+ */
+static int qcrypto_afsplit_hash(QCryptoHashAlgorithm hash,
+                                size_t blocklen,
+                                uint8_t *block,
+                                Error **errp)
+{
+    size_t digestlen = qcrypto_hash_digest_len(hash);
+    size_t taillen = blocklen % digestlen;
+    size_t nchunks = blocklen / digestlen;
+    uint32_t idx;
+
+    /* A partial trailing chunk needs a hash round of its own;
+     * otherwise the last chunk is a full digest long. */
+    if (taillen != 0) {
+        nchunks++;
+    } else {
+        taillen = digestlen;
+    }
+
+    for (idx = 0; idx < nchunks; idx++) {
+        uint8_t *chunk = block + (idx * digestlen);
+        size_t chunklen = (idx == (nchunks - 1)) ? taillen : digestlen;
+        uint32_t counter = cpu_to_be32(idx);
+        uint8_t *out = NULL;
+        size_t outlen = 0;
+        struct iovec in[2];
+
+        /* The big-endian chunk index is hashed ahead of the data */
+        in[0].iov_base = &counter;
+        in[0].iov_len = sizeof(counter);
+        in[1].iov_base = chunk;
+        in[1].iov_len = chunklen;
+
+        if (qcrypto_hash_bytesv(hash, in, G_N_ELEMENTS(in),
+                                &out, &outlen, errp) < 0) {
+            return -1;
+        }
+
+        assert(outlen == digestlen);
+        memcpy(chunk, out, chunklen);
+        g_free(out);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Split @in into @stripes stripes of @blocklen bytes each.
+ *
+ * The first (@stripes - 1) stripes written to @out are random data.
+ * After each one is generated it is XORed into a running accumulator
+ * which is then diffused with qcrypto_afsplit_hash().  The final stripe
+ * is @in XORed with that accumulator.
+ *
+ * @hash: hash algorithm used for diffusion
+ * @blocklen: length of @in and of every stripe, in bytes
+ * @stripes: number of stripes to produce; must be at least 1
+ * @in: input data, @blocklen bytes
+ * @out: output buffer, written up to @stripes * @blocklen bytes
+ * @errp: filled in on failure
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int qcrypto_afsplit_encode(QCryptoHashAlgorithm hash,
+                           size_t blocklen,
+                           uint32_t stripes,
+                           const uint8_t *in,
+                           uint8_t *out,
+                           Error **errp)
+{
+    uint8_t *block;
+    size_t i;
+    int ret = -1;
+
+    /* (stripes - 1) below is unsigned: zero would wrap around and make
+     * the loop run far past the end of @out. */
+    if (stripes == 0) {
+        error_setg(errp, "AF splitter requires at least one stripe");
+        return -1;
+    }
+
+    block = g_new0(uint8_t, blocklen);
+
+    for (i = 0; i < (stripes - 1); i++) {
+        if (qcrypto_random_bytes(out + (i * blocklen), blocklen, errp) < 0) {
+            goto cleanup;
+        }
+
+        qcrypto_afsplit_xor(blocklen,
+                            out + (i * blocklen),
+                            block,
+                            block);
+        if (qcrypto_afsplit_hash(hash, blocklen, block,
+                                 errp) < 0) {
+            goto cleanup;
+        }
+    }
+    /* i == stripes - 1 here: emit the last stripe */
+    qcrypto_afsplit_xor(blocklen,
+                        in,
+                        block,
+                        out + (i * blocklen));
+    ret = 0;
+
+ cleanup:
+    g_free(block);
+    return ret;
+}
+
+
+/*
+ * Merge @stripes stripes of @blocklen bytes each back into one block.
+ *
+ * The first (@stripes - 1) stripes of @in are XORed one after another
+ * into a running accumulator, which is diffused with
+ * qcrypto_afsplit_hash() after every stripe.  The result written to
+ * @out is the last stripe XORed with that accumulator.
+ *
+ * @hash: hash algorithm used for diffusion
+ * @blocklen: length of every stripe and of @out, in bytes
+ * @stripes: number of stripes in @in; must be at least 1
+ * @in: input data, read up to @stripes * @blocklen bytes
+ * @out: output buffer, @blocklen bytes
+ * @errp: filled in on failure
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int qcrypto_afsplit_decode(QCryptoHashAlgorithm hash,
+                           size_t blocklen,
+                           uint32_t stripes,
+                           const uint8_t *in,
+                           uint8_t *out,
+                           Error **errp)
+{
+    uint8_t *block;
+    size_t i;
+    int ret = -1;
+
+    /* (stripes - 1) below is unsigned: zero would wrap around and make
+     * the loop read far past the end of @in. */
+    if (stripes == 0) {
+        error_setg(errp, "AF splitter requires at least one stripe");
+        return -1;
+    }
+
+    block = g_new0(uint8_t, blocklen);
+
+    for (i = 0; i < (stripes - 1); i++) {
+        qcrypto_afsplit_xor(blocklen,
+                            in + (i * blocklen),
+                            block,
+                            block);
+        if (qcrypto_afsplit_hash(hash, blocklen, block,
+                                 errp) < 0) {
+            goto cleanup;
+        }
+    }
+
+    /* i == stripes - 1 here: combine with the last stripe */
+    qcrypto_afsplit_xor(blocklen,
+                        in + (i * blocklen),
+                        block,
+                        out);
+
+    ret = 0;
+
+ cleanup:
+    g_free(block);
+    return ret;
+}
--- a/crypto/block-luks.c
+++ b/crypto/block-luks.c
--- a/crypto/block-luks.h
+++ b/crypto/block-luks.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block device encryption LUKS format
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef QCRYPTO_BLOCK_LUKS_H__
+#define QCRYPTO_BLOCK_LUKS_H__
+
+#include "crypto/blockpriv.h"
+
+extern const QCryptoBlockDriver qcrypto_block_driver_luks;
+
+#endif /* QCRYPTO_BLOCK_LUKS_H__ */
--- a/crypto/block-qcow.c
+++ b/crypto/block-qcow.c
@@ -0,0 +1,173 @@
+/*
+ * QEMU Crypto block device encryption QCow/QCow2 AES-CBC format
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * Note that the block encryption implemented in this file is broken
+ * by design. This exists only to allow data to be liberated from
+ * existing qcow[2] images and should not be used in any new areas.
+ */
+
+#include "qemu/osdep.h"
+
+#include "crypto/block-qcow.h"
+#include "crypto/secret.h"
+
+#define QCRYPTO_BLOCK_QCOW_SECTOR_SIZE 512
+
+
+/*
+ * Format probe for the qcow driver: always reports "no match".
+ * Both arguments are ignored.
+ */
+static bool
+qcrypto_block_qcow_has_format(const uint8_t *buf G_GNUC_UNUSED,
+                              size_t buf_size G_GNUC_UNUSED)
+{
+    return false;
+}
+
+
+/*
+ * Set up the IV generator and cipher of @block from a password.
+ *
+ * The password is looked up through the secret id @keysecret.  Its
+ * bytes are copied into a zero-filled 16 byte key buffer: shorter
+ * passwords are zero padded, longer ones are truncated.  The cipher is
+ * AES-128 in CBC mode with a plain64 IV generator, and the payload
+ * offset is set to 0.
+ *
+ * Returns 0 on success, -1 if the secret lookup fails, or -ENOTSUP if
+ * the IV generator or the cipher cannot be created.  On those last two
+ * failures both block->cipher and block->ivgen are freed and reset to
+ * NULL so that no dangling pointer is left behind in @block.
+ */
+static int
+qcrypto_block_qcow_init(QCryptoBlock *block,
+                        const char *keysecret,
+                        Error **errp)
+{
+    char *password;
+    int ret;
+    uint8_t keybuf[16];
+    size_t len;
+
+    memset(keybuf, 0, sizeof(keybuf));
+
+    password = qcrypto_secret_lookup_as_utf8(keysecret, errp);
+    if (!password) {
+        return -1;
+    }
+
+    len = strlen(password);
+    memcpy(keybuf, password, MIN(len, sizeof(keybuf)));
+    g_free(password);
+
+    block->niv = qcrypto_cipher_get_iv_len(QCRYPTO_CIPHER_ALG_AES_128,
+                                           QCRYPTO_CIPHER_MODE_CBC);
+    block->ivgen = qcrypto_ivgen_new(QCRYPTO_IVGEN_ALG_PLAIN64,
+                                     0, 0, NULL, 0, errp);
+    if (!block->ivgen) {
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    block->cipher = qcrypto_cipher_new(QCRYPTO_CIPHER_ALG_AES_128,
+                                       QCRYPTO_CIPHER_MODE_CBC,
+                                       keybuf, G_N_ELEMENTS(keybuf),
+                                       errp);
+    if (!block->cipher) {
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    block->payload_offset = 0;
+
+    return 0;
+
+ fail:
+    qcrypto_cipher_free(block->cipher);
+    block->cipher = NULL;
+    qcrypto_ivgen_free(block->ivgen);
+    block->ivgen = NULL;
+    return ret;
+}
+
+
+/*
+ * Open callback of the qcow driver.
+ *
+ * With QCRYPTO_BLOCK_OPEN_NO_IO set in @flags nothing is initialised
+ * and 0 is returned.  Otherwise the 'key-secret' option is mandatory
+ * and is handed to qcrypto_block_qcow_init(), whose result is
+ * returned.  @readfunc and @opaque are unused.
+ */
+static int
+qcrypto_block_qcow_open(QCryptoBlock *block,
+                        QCryptoBlockOpenOptions *options,
+                        QCryptoBlockReadFunc readfunc G_GNUC_UNUSED,
+                        void *opaque G_GNUC_UNUSED,
+                        unsigned int flags,
+                        Error **errp)
+{
+    if (flags & QCRYPTO_BLOCK_OPEN_NO_IO) {
+        return 0;
+    }
+
+    if (!options->u.qcow.key_secret) {
+        error_setg(errp, "Parameter 'key-secret' is required for cipher");
+        return -1;
+    }
+
+    return qcrypto_block_qcow_init(block, options->u.qcow.key_secret, errp);
+}
+
+
+/*
+ * Create callback of the qcow driver.
+ *
+ * Requires the 'key-secret' option and forwards it to
+ * qcrypto_block_qcow_init(); nothing is written through @writefunc.
+ * @initfunc, @writefunc and @opaque are unused.
+ *
+ * Returns the result of qcrypto_block_qcow_init(), or -1 when
+ * 'key-secret' is missing.
+ */
+static int
+qcrypto_block_qcow_create(QCryptoBlock *block,
+                          QCryptoBlockCreateOptions *options,
+                          QCryptoBlockInitFunc initfunc G_GNUC_UNUSED,
+                          QCryptoBlockWriteFunc writefunc G_GNUC_UNUSED,
+                          void *opaque G_GNUC_UNUSED,
+                          Error **errp)
+{
+    const char *keysecret = options->u.qcow.key_secret;
+
+    if (keysecret == NULL) {
+        error_setg(errp, "Parameter 'key-secret' is required for cipher");
+        return -1;
+    }
+
+    /* QCow2 has no special header, since everything is hardwired */
+    return qcrypto_block_qcow_init(block, keysecret, errp);
+}
+
+
+/* Cleanup callback of the qcow driver: intentionally does nothing. */
+static void
+qcrypto_block_qcow_cleanup(QCryptoBlock *block)
+{
+}
+
+
+/*
+ * Decrypt callback of the qcow driver.
+ *
+ * Forwards @buf / @len, starting at sector @startsector, to
+ * qcrypto_block_decrypt_helper() together with the cipher, IV length
+ * and IV generator stored in @block, using the fixed
+ * QCRYPTO_BLOCK_QCOW_SECTOR_SIZE.  Returns the helper's result.
+ */
+static int
+qcrypto_block_qcow_decrypt(QCryptoBlock *block,
+                           uint64_t startsector,
+                           uint8_t *buf,
+                           size_t len,
+                           Error **errp)
+{
+    int rc;
+
+    rc = qcrypto_block_decrypt_helper(block->cipher, block->niv, block->ivgen,
+                                      QCRYPTO_BLOCK_QCOW_SECTOR_SIZE,
+                                      startsector, buf, len, errp);
+    return rc;
+}
+
+
+/*
+ * Encrypt callback of the qcow driver.
+ *
+ * Forwards @buf / @len, starting at sector @startsector, to
+ * qcrypto_block_encrypt_helper() together with the cipher, IV length
+ * and IV generator stored in @block, using the fixed
+ * QCRYPTO_BLOCK_QCOW_SECTOR_SIZE.  Returns the helper's result.
+ */
+static int
+qcrypto_block_qcow_encrypt(QCryptoBlock *block,
+                           uint64_t startsector,
+                           uint8_t *buf,
+                           size_t len,
+                           Error **errp)
+{
+    int rc;
+
+    rc = qcrypto_block_encrypt_helper(block->cipher, block->niv, block->ivgen,
+                                      QCRYPTO_BLOCK_QCOW_SECTOR_SIZE,
+                                      startsector, buf, len, errp);
+    return rc;
+}
+
+
+/* Callback table that exposes the qcow AES-CBC implementation above. */
+const QCryptoBlockDriver qcrypto_block_driver_qcow = {
+    /* format detection */
+    .has_format = qcrypto_block_qcow_has_format,
+    /* life cycle */
+    .open = qcrypto_block_qcow_open,
+    .create = qcrypto_block_qcow_create,
+    .cleanup = qcrypto_block_qcow_cleanup,
+    /* data path */
+    .encrypt = qcrypto_block_qcow_encrypt,
+    .decrypt = qcrypto_block_qcow_decrypt,
+};
--- a/crypto/block-qcow.h
+++ b/crypto/block-qcow.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block device encryption QCow/QCow2 AES-CBC format
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef QCRYPTO_BLOCK_QCOW_H__
+#define QCRYPTO_BLOCK_QCOW_H__
+
+#include "crypto/blockpriv.h"
+
+extern const QCryptoBlockDriver qcrypto_block_driver_qcow;
+
+#endif /* QCRYPTO_BLOCK_QCOW_H__ */
--- a/crypto/block.c
+++ b/crypto/block.c
@@ -0,0 +1,260 @@
+/*
+ * QEMU Crypto block device encryption
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/blockpriv.h"
+#include "crypto/block-qcow.h"
+#include "crypto/block-luks.h"
+/* Read-only lookup table indexed by QCryptoBlockFormat; gaps stay NULL */
+static const QCryptoBlockDriver *const qcrypto_block_drivers[] = {
+    [Q_CRYPTO_BLOCK_FORMAT_QCOW] = &qcrypto_block_driver_qcow,
+    [Q_CRYPTO_BLOCK_FORMAT_LUKS] = &qcrypto_block_driver_luks,
+};
+
+
+/* Forward @buf/@len to the has_format callback of the driver registered
+ * for @format; reports false when no such driver exists. */
+bool qcrypto_block_has_format(QCryptoBlockFormat format,
+                              const uint8_t *buf,
+                              size_t len)
+{
+    const QCryptoBlockDriver *drv = NULL;
+
+    if (format < G_N_ELEMENTS(qcrypto_block_drivers)) {
+        drv = qcrypto_block_drivers[format];
+    }
+
+    /* Out-of-range index and empty table slot are treated alike */
+    return drv ? drv->has_format(buf, len) : false;
+}
+
+
+/* Look up the driver for options->format and run its open callback on a
+ * freshly allocated QCryptoBlock; returns NULL on any failure. */
+QCryptoBlock *qcrypto_block_open(QCryptoBlockOpenOptions *options,
+                                 QCryptoBlockReadFunc readfunc,
+                                 void *opaque,
+                                 unsigned int flags,
+                                 Error **errp)
+{
+    const QCryptoBlockDriver *drv =
+        options->format < G_N_ELEMENTS(qcrypto_block_drivers) ?
+        qcrypto_block_drivers[options->format] : NULL;
+    QCryptoBlock *block;
+
+    if (!drv) {
+        error_setg(errp, "Unsupported block driver %d", options->format);
+        return NULL;
+    }
+    /* Allocate only once the format is known to have a driver */
+    block = g_new0(QCryptoBlock, 1);
+    block->format = options->format;
+    block->driver = drv;
+    if (drv->open(block, options, readfunc, opaque, flags, errp) < 0) {
+        g_free(block);
+        block = NULL;
+    }
+    return block;
+}
+
+
+/* Look up the driver for options->format and run its create callback on
+ * a freshly allocated QCryptoBlock; returns NULL on any failure. */
+QCryptoBlock *qcrypto_block_create(QCryptoBlockCreateOptions *options,
+                                   QCryptoBlockInitFunc initfunc,
+                                   QCryptoBlockWriteFunc writefunc,
+                                   void *opaque,
+                                   Error **errp)
+{
+    const QCryptoBlockDriver *drv =
+        options->format < G_N_ELEMENTS(qcrypto_block_drivers) ?
+        qcrypto_block_drivers[options->format] : NULL;
+    QCryptoBlock *block;
+
+    if (!drv) {
+        error_setg(errp, "Unsupported block driver %d", options->format);
+        return NULL;
+    }
+    /* Allocate only once the format is known to have a driver */
+    block = g_new0(QCryptoBlock, 1);
+    block->format = options->format;
+    block->driver = drv;
+    if (drv->create(block, options, initfunc, writefunc, opaque, errp) < 0) {
+        g_free(block);
+        block = NULL;
+    }
+    return block;
+}
+
+
+/* Decrypt @len bytes of @buf through the driver bound to @block. */
+int qcrypto_block_decrypt(QCryptoBlock *block, uint64_t startsector,
+                          uint8_t *buf, size_t len, Error **errp)
+{
+    const QCryptoBlockDriver *drv = block->driver;
+    /* Result and @errp are whatever the driver callback produces */
+    return drv->decrypt(block, startsector, buf, len, errp);
+}
+
+
+/* Encrypt @len bytes of @buf through the driver bound to @block. */
+int qcrypto_block_encrypt(QCryptoBlock *block, uint64_t startsector,
+                          uint8_t *buf, size_t len, Error **errp)
+{
+    const QCryptoBlockDriver *drv = block->driver;
+    /* Result and @errp are whatever the driver callback produces */
+    return drv->encrypt(block, startsector, buf, len, errp);
+}
+
+/* Accessor: hands back the cipher pointer stored in @block as-is */
+QCryptoCipher *qcrypto_block_get_cipher(QCryptoBlock *block)
+{
+    return block->cipher;
+}
+
+/* Accessor: hands back the IV generator pointer stored in @block as-is */
+QCryptoIVGen *qcrypto_block_get_ivgen(QCryptoBlock *block)
+{
+    return block->ivgen;
+}
+
+/* Accessor: returns the kdfhash value recorded in @block */
+QCryptoHashAlgorithm qcrypto_block_get_kdf_hash(QCryptoBlock *block)
+{
+    return block->kdfhash;
+}
+
+/* Accessor: returns @block's payload_offset, which is kept in bytes */
+uint64_t qcrypto_block_get_payload_offset(QCryptoBlock *block)
+{
+    return block->payload_offset;
+}
+
+
+/* Tear down @block: driver cleanup, cipher, IV generator, then the
+ * struct itself.  Passing NULL is allowed and does nothing. */
+void qcrypto_block_free(QCryptoBlock *block)
+{
+    if (block) {
+        /* Driver hook runs while cipher and ivgen are still alive */
+        block->driver->cleanup(block);
+        qcrypto_cipher_free(block->cipher);
+        qcrypto_ivgen_free(block->ivgen);
+        g_free(block);
+    }
+}
+
+
+/*
+ * Decrypt @len bytes of @buf in place, @sectorsize bytes at a time.
+ * When @niv is non-zero a fresh IV is derived from the running sector
+ * number and installed on @cipher ahead of every chunk.
+ *
+ * Returns 0 on success, -1 if IV setup or the cipher reports failure.
+ */
+int qcrypto_block_decrypt_helper(QCryptoCipher *cipher,
+                                 size_t niv,
+                                 QCryptoIVGen *ivgen,
+                                 int sectorsize,
+                                 uint64_t startsector,
+                                 uint8_t *buf,
+                                 size_t len,
+                                 Error **errp)
+{
+    uint8_t *iv = NULL;
+    uint64_t sector = startsector;
+    size_t remain = len;
+    int ret = -1;
+
+    if (niv) {
+        iv = g_new0(uint8_t, niv);
+    }
+
+    for (; remain > 0; sector++) {
+        /* Final chunk may be shorter than a whole sector */
+        size_t chunk = remain > sectorsize ? sectorsize : remain;
+
+        if (niv &&
+            (qcrypto_ivgen_calculate(ivgen, sector, iv, niv, errp) < 0 ||
+             qcrypto_cipher_setiv(cipher, iv, niv, errp) < 0)) {
+            goto cleanup;
+        }
+        if (qcrypto_cipher_decrypt(cipher, buf, buf, chunk, errp) < 0) {
+            goto cleanup;
+        }
+
+        buf += chunk;
+        remain -= chunk;
+    }
+
+    ret = 0;
+ cleanup:
+    g_free(iv);
+    return ret;
+}
+
+
+/*
+ * Encrypt @len bytes of @buf in place, @sectorsize bytes at a time.
+ * When @niv is non-zero a fresh IV is derived from the running sector
+ * number and installed on @cipher ahead of every chunk.
+ *
+ * Returns 0 on success, -1 if IV setup or the cipher reports failure.
+ */
+int qcrypto_block_encrypt_helper(QCryptoCipher *cipher,
+                                 size_t niv,
+                                 QCryptoIVGen *ivgen,
+                                 int sectorsize,
+                                 uint64_t startsector,
+                                 uint8_t *buf,
+                                 size_t len,
+                                 Error **errp)
+{
+    uint8_t *iv = NULL;
+    uint64_t sector = startsector;
+    size_t remain = len;
+    int ret = -1;
+
+    if (niv) {
+        iv = g_new0(uint8_t, niv);
+    }
+
+    for (; remain > 0; sector++) {
+        /* Final chunk may be shorter than a whole sector */
+        size_t chunk = remain > sectorsize ? sectorsize : remain;
+
+        if (niv &&
+            (qcrypto_ivgen_calculate(ivgen, sector, iv, niv, errp) < 0 ||
+             qcrypto_cipher_setiv(cipher, iv, niv, errp) < 0)) {
+            goto cleanup;
+        }
+        if (qcrypto_cipher_encrypt(cipher, buf, buf, chunk, errp) < 0) {
+            goto cleanup;
+        }
+
+        buf += chunk;
+        remain -= chunk;
+    }
+
+    ret = 0;
+ cleanup:
+    g_free(iv);
+    return ret;
+}
--- a/crypto/blockpriv.h
+++ b/crypto/blockpriv.h
@@ -0,0 +1,92 @@
+/*
+ * QEMU Crypto block device encryption
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef QCRYPTO_BLOCK_PRIV_H__
+#define QCRYPTO_BLOCK_PRIV_H__
+
+#include "crypto/block.h"
+
+typedef struct QCryptoBlockDriver QCryptoBlockDriver;
+
+struct QCryptoBlock {
+    QCryptoBlockFormat format;
+    /* Callback table for @format, set by qcrypto_block_open/create() */
+    const QCryptoBlockDriver *driver;
+    void *opaque; /* presumably driver-private state; confirm */
+    /* cipher and ivgen are released by qcrypto_block_free() */
+    QCryptoCipher *cipher;
+    QCryptoIVGen *ivgen;
+    QCryptoHashAlgorithm kdfhash;
+    size_t niv; /* IV length in bytes handed to the crypt helpers */
+    uint64_t payload_offset; /* In bytes */
+};
+
+struct QCryptoBlockDriver {
+    int (*open)(QCryptoBlock *block,
+                QCryptoBlockOpenOptions *options,
+                QCryptoBlockReadFunc readfunc,
+                void *opaque,
+                unsigned int flags,
+                Error **errp);
+    /* open/create: run by qcrypto_block_open/create(); < 0 aborts setup */
+    int (*create)(QCryptoBlock *block,
+                  QCryptoBlockCreateOptions *options,
+                  QCryptoBlockInitFunc initfunc,
+                  QCryptoBlockWriteFunc writefunc,
+                  void *opaque,
+                  Error **errp);
+    /* cleanup: run by qcrypto_block_free() before cipher/ivgen are freed */
+    void (*cleanup)(QCryptoBlock *block);
+    /* encrypt/decrypt: targets of qcrypto_block_encrypt/decrypt() */
+    int (*encrypt)(QCryptoBlock *block,
+                   uint64_t startsector,
+                   uint8_t *buf,
+                   size_t len,
+                   Error **errp);
+    int (*decrypt)(QCryptoBlock *block,
+                   uint64_t startsector,
+                   uint8_t *buf,
+                   size_t len,
+                   Error **errp);
+    /* has_format: target of qcrypto_block_has_format() */
+    bool (*has_format)(const uint8_t *buf,
+                       size_t buflen);
+};
+
+
+int qcrypto_block_decrypt_helper(QCryptoCipher *cipher,
+                                 size_t niv,
+                                 QCryptoIVGen *ivgen,
+                                 int sectorsize,
+                                 uint64_t startsector,
+                                 uint8_t *buf,
+                                 size_t len,
+                                 Error **errp);
+
+int qcrypto_block_encrypt_helper(QCryptoCipher *cipher,
+                                 size_t niv,
+                                 QCryptoIVGen *ivgen,
+                                 int sectorsize,
+                                 uint64_t startsector,
+                                 uint8_t *buf,
+                                 size_t len,
+                                 Error **errp);
+
+#endif /* QCRYPTO_BLOCK_PRIV_H__ */
--- a/crypto/cipher-builtin.c
+++ b/crypto/cipher-builtin.c
@@ -21,11 +21,17 @@
 #include "qemu/osdep.h"
 #include "crypto/aes.h"
 #include "crypto/desrfb.h"
+#include "crypto/xts.h"

+typedef struct QCryptoCipherBuiltinAESContext QCryptoCipherBuiltinAESContext;
+struct QCryptoCipherBuiltinAESContext {
+    AES_KEY enc; /* filled in by AES_set_encrypt_key() */
+    AES_KEY dec; /* filled in by AES_set_decrypt_key() */
+};
 typedef struct QCryptoCipherBuiltinAES QCryptoCipherBuiltinAES;
 struct QCryptoCipherBuiltinAES {
-    AES_KEY encrypt_key;
-    AES_KEY decrypt_key;
+    QCryptoCipherBuiltinAESContext key;
+    QCryptoCipherBuiltinAESContext key_tweak;
    uint8_t iv[AES_BLOCK_SIZE];
 };
 typedef struct QCryptoCipherBuiltinDESRFB QCryptoCipherBuiltinDESRFB;
@@ -67,6 +73,82 @@ static void qcrypto_cipher_free_aes(QCryptoCipher *cipher)
 }


+/* ECB-encrypt @len bytes from @in to @out with @key.  A trailing partial
+ * block is zero-padded for the cipher and only @len bytes are stored. */
+static void qcrypto_cipher_aes_ecb_encrypt(const AES_KEY *key,
+                                           const void *in,
+                                           void *out,
+                                           size_t len)
+{
+    const uint8_t *inptr = in;
+    uint8_t *outptr = out;
+
+    /* Whole blocks, including an exactly aligned last one, go direct */
+    for (; len >= AES_BLOCK_SIZE; len -= AES_BLOCK_SIZE) {
+        AES_encrypt(inptr, outptr, key);
+        inptr += AES_BLOCK_SIZE;
+        outptr += AES_BLOCK_SIZE;
+    }
+    if (len) {
+        /* Zero-fill the tail so no uninitialized bytes are read */
+        uint8_t tmp1[AES_BLOCK_SIZE] = { 0 }, tmp2[AES_BLOCK_SIZE];
+        memcpy(tmp1, inptr, len);
+        AES_encrypt(tmp1, tmp2, key);
+        memcpy(outptr, tmp2, len);
+    }
+}
+
+
+/* ECB-decrypt @len bytes from @in to @out with @key.  A trailing partial
+ * block is zero-padded for the cipher and only @len bytes are stored. */
+static void qcrypto_cipher_aes_ecb_decrypt(const AES_KEY *key,
+                                           const void *in,
+                                           void *out,
+                                           size_t len)
+{
+    const uint8_t *inptr = in;
+    uint8_t *outptr = out;
+
+    /* Whole blocks, including an exactly aligned last one, go direct */
+    for (; len >= AES_BLOCK_SIZE; len -= AES_BLOCK_SIZE) {
+        AES_decrypt(inptr, outptr, key);
+        inptr += AES_BLOCK_SIZE;
+        outptr += AES_BLOCK_SIZE;
+    }
+    if (len) {
+        /* Zero-fill the tail so no uninitialized bytes are read */
+        uint8_t tmp1[AES_BLOCK_SIZE] = { 0 }, tmp2[AES_BLOCK_SIZE];
+        memcpy(tmp1, inptr, len);
+        AES_decrypt(tmp1, tmp2, key);
+        memcpy(outptr, tmp2, len);
+    }
+}
+
+
+/* Callback given to xts_encrypt/xts_decrypt: ECB-encrypt with ctx->enc */
+static void qcrypto_cipher_aes_xts_encrypt(const void *ctx,
+                                           size_t length,
+                                           uint8_t *dst,
+                                           const uint8_t *src)
+{
+    const QCryptoCipherBuiltinAESContext *keys = ctx;
+    /* Note the helper wants (in, out, len), not (len, dst, src) */
+    qcrypto_cipher_aes_ecb_encrypt((AES_KEY *)&keys->enc, src, dst, length);
+}
+
+
+/* Callback given to xts_encrypt/xts_decrypt: ECB-decrypt with ctx->dec */
+static void qcrypto_cipher_aes_xts_decrypt(const void *ctx,
+                                           size_t length,
+                                           uint8_t *dst,
+                                           const uint8_t *src)
+{
+    const QCryptoCipherBuiltinAESContext *keys = ctx;
+    /* Note the helper wants (in, out, len), not (len, dst, src) */
+    qcrypto_cipher_aes_ecb_decrypt((AES_KEY *)&keys->dec, src, dst, length);
+}
+
+
 static int qcrypto_cipher_encrypt_aes(QCryptoCipher *cipher,
                                      const void *in,
                                      void *out,
@@ -75,29 +157,26 @@ static int qcrypto_cipher_encrypt_aes(QCryptoCipher *cipher,
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

-    if (cipher->mode == QCRYPTO_CIPHER_MODE_ECB) {
-        const uint8_t *inptr = in;
-        uint8_t *outptr = out;
-        while (len) {
-            if (len > AES_BLOCK_SIZE) {
-                AES_encrypt(inptr, outptr, &ctxt->state.aes.encrypt_key);
-                inptr += AES_BLOCK_SIZE;
-                outptr += AES_BLOCK_SIZE;
-                len -= AES_BLOCK_SIZE;
-            } else {
-                uint8_t tmp1[AES_BLOCK_SIZE], tmp2[AES_BLOCK_SIZE];
-                memcpy(tmp1, inptr, len);
-                /* Fill with 0 to avoid valgrind uninitialized reads */
-                memset(tmp1 + len, 0, sizeof(tmp1) - len);
-                AES_encrypt(tmp1, tmp2, &ctxt->state.aes.encrypt_key);
-                memcpy(outptr, tmp2, len);
-                len = 0;
-            }
-        }
-    } else {
+    switch (cipher->mode) {
+    case QCRYPTO_CIPHER_MODE_ECB:
+        qcrypto_cipher_aes_ecb_encrypt(&ctxt->state.aes.key.enc,
+                                       in, out, len);
+        break;
+    case QCRYPTO_CIPHER_MODE_CBC:
        AES_cbc_encrypt(in, out, len,
-                        &ctxt->state.aes.encrypt_key,
+                        &ctxt->state.aes.key.enc,
                        ctxt->state.aes.iv, 1);
+        break;
+    case QCRYPTO_CIPHER_MODE_XTS:
+        xts_encrypt(&ctxt->state.aes.key,
+                    &ctxt->state.aes.key_tweak,
+                    qcrypto_cipher_aes_xts_encrypt,
+                    qcrypto_cipher_aes_xts_decrypt,
+                    ctxt->state.aes.iv,
+                    len, out, in);
+        break;
+    default:
+        g_assert_not_reached();
    }

    return 0;
@@ -112,29 +191,26 @@ static int qcrypto_cipher_decrypt_aes(QCryptoCipher *cipher,
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

-    if (cipher->mode == QCRYPTO_CIPHER_MODE_ECB) {
-        const uint8_t *inptr = in;
-        uint8_t *outptr = out;
-        while (len) {
-            if (len > AES_BLOCK_SIZE) {
-                AES_decrypt(inptr, outptr, &ctxt->state.aes.decrypt_key);
-                inptr += AES_BLOCK_SIZE;
-                outptr += AES_BLOCK_SIZE;
-                len -= AES_BLOCK_SIZE;
-            } else {
-                uint8_t tmp1[AES_BLOCK_SIZE], tmp2[AES_BLOCK_SIZE];
-                memcpy(tmp1, inptr, len);
-                /* Fill with 0 to avoid valgrind uninitialized reads */
-                memset(tmp1 + len, 0, sizeof(tmp1) - len);
-                AES_decrypt(tmp1, tmp2, &ctxt->state.aes.decrypt_key);
-                memcpy(outptr, tmp2, len);
-                len = 0;
-            }
-        }
-    } else {
+    switch (cipher->mode) {
+    case QCRYPTO_CIPHER_MODE_ECB:
+        qcrypto_cipher_aes_ecb_decrypt(&ctxt->state.aes.key.dec,
+                                       in, out, len);
+        break;
+    case QCRYPTO_CIPHER_MODE_CBC:
        AES_cbc_encrypt(in, out, len,
-                        &ctxt->state.aes.decrypt_key,
+                        &ctxt->state.aes.key.dec,
                        ctxt->state.aes.iv, 0);
+        break;
+    case QCRYPTO_CIPHER_MODE_XTS:
+        xts_decrypt(&ctxt->state.aes.key,
+                    &ctxt->state.aes.key_tweak,
+                    qcrypto_cipher_aes_xts_encrypt,
+                    qcrypto_cipher_aes_xts_decrypt,
+                    ctxt->state.aes.iv,
+                    len, out, in);
+        break;
+    default:
+        g_assert_not_reached();
    }

    return 0;
@@ -166,21 +242,46 @@ static int qcrypto_cipher_init_aes(QCryptoCipher *cipher,
    QCryptoCipherBuiltin *ctxt;

    if (cipher->mode != QCRYPTO_CIPHER_MODE_CBC &&
-        cipher->mode != QCRYPTO_CIPHER_MODE_ECB) {
+        cipher->mode != QCRYPTO_CIPHER_MODE_ECB &&
+        cipher->mode != QCRYPTO_CIPHER_MODE_XTS) {
        error_setg(errp, "Unsupported cipher mode %d", cipher->mode);
        return -1;
    }

    ctxt = g_new0(QCryptoCipherBuiltin, 1);

-    if (AES_set_encrypt_key(key, nkey * 8, &ctxt->state.aes.encrypt_key) != 0) {
-        error_setg(errp, "Failed to set encryption key");
-        goto error;
-    }
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        if (AES_set_encrypt_key(key, nkey * 4, &ctxt->state.aes.key.enc) != 0) {
+            error_setg(errp, "Failed to set encryption key");
+            goto error;
+        }

-    if (AES_set_decrypt_key(key, nkey * 8, &ctxt->state.aes.decrypt_key) != 0) {
-        error_setg(errp, "Failed to set decryption key");
-        goto error;
+        if (AES_set_decrypt_key(key, nkey * 4, &ctxt->state.aes.key.dec) != 0) {
+            error_setg(errp, "Failed to set decryption key");
+            goto error;
+        }
+
+        if (AES_set_encrypt_key(key + (nkey / 2), nkey * 4,
+                                &ctxt->state.aes.key_tweak.enc) != 0) {
+            error_setg(errp, "Failed to set encryption key");
+            goto error;
+        }
+
+        if (AES_set_decrypt_key(key + (nkey / 2), nkey * 4,
+                                &ctxt->state.aes.key_tweak.dec) != 0) {
+            error_setg(errp, "Failed to set decryption key");
+            goto error;
+        }
+    } else {
+        if (AES_set_encrypt_key(key, nkey * 8, &ctxt->state.aes.key.enc) != 0) {
+            error_setg(errp, "Failed to set encryption key");
+            goto error;
+        }
+
+        if (AES_set_decrypt_key(key, nkey * 8, &ctxt->state.aes.key.dec) != 0) {
+            error_setg(errp, "Failed to set decryption key");
+            goto error;
+        }
    }

    ctxt->blocksize = AES_BLOCK_SIZE;
@@ -322,7 +423,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    cipher->alg = alg;
    cipher->mode = mode;

-    if (!qcrypto_cipher_validate_key_length(alg, nkey, errp)) {
+    if (!qcrypto_cipher_validate_key_length(alg, mode, nkey, errp)) {
        goto error;
    }

--- a/crypto/cipher-gcrypt.c
+++ b/crypto/cipher-gcrypt.c
@@ -19,6 +19,8 @@
 */

 #include "qemu/osdep.h"
+#include "crypto/xts.h"
+
 #include <gcrypt.h>


@@ -29,6 +31,12 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
    case QCRYPTO_CIPHER_ALG_AES_128:
    case QCRYPTO_CIPHER_ALG_AES_192:
    case QCRYPTO_CIPHER_ALG_AES_256:
+    case QCRYPTO_CIPHER_ALG_CAST5_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_192:
+    case QCRYPTO_CIPHER_ALG_SERPENT_256:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
        return true;
    default:
        return false;
@@ -38,7 +46,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
 typedef struct QCryptoCipherGcrypt QCryptoCipherGcrypt;
 struct QCryptoCipherGcrypt {
    gcry_cipher_hd_t handle;
+    gcry_cipher_hd_t tweakhandle;
    size_t blocksize;
+    uint8_t *iv;
 };

 QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
@@ -53,6 +63,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,

    switch (mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
+    case QCRYPTO_CIPHER_MODE_XTS:
        gcrymode = GCRY_CIPHER_MODE_ECB;
        break;
    case QCRYPTO_CIPHER_MODE_CBC:
@@ -63,7 +74,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        return NULL;
    }

-    if (!qcrypto_cipher_validate_key_length(alg, nkey, errp)) {
+    if (!qcrypto_cipher_validate_key_length(alg, mode, nkey, errp)) {
        return NULL;
    }

@@ -84,6 +95,30 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        gcryalg = GCRY_CIPHER_AES256;
        break;

+    case QCRYPTO_CIPHER_ALG_CAST5_128:
+        gcryalg = GCRY_CIPHER_CAST5;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_SERPENT_128:
+        gcryalg = GCRY_CIPHER_SERPENT128;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_SERPENT_192:
+        gcryalg = GCRY_CIPHER_SERPENT192;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_SERPENT_256:
+        gcryalg = GCRY_CIPHER_SERPENT256;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+        gcryalg = GCRY_CIPHER_TWOFISH128;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
+        gcryalg = GCRY_CIPHER_TWOFISH;
+        break;
+
    default:
        error_setg(errp, "Unsupported cipher algorithm %d", alg);
        return NULL;
@@ -101,6 +136,14 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
                   gcry_strerror(err));
        goto error;
    }
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        err = gcry_cipher_open(&ctx->tweakhandle, gcryalg, gcrymode, 0);
+        if (err != 0) {
+            error_setg(errp, "Cannot initialize cipher: %s",
+                       gcry_strerror(err));
+            goto error;
+        }
+    }

    if (cipher->alg == QCRYPTO_CIPHER_ALG_DES_RFB) {
        /* We're using standard DES cipher from gcrypt, so we need
@@ -112,13 +155,44 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        g_free(rfbkey);
        ctx->blocksize = 8;
    } else {
-        err = gcry_cipher_setkey(ctx->handle, key, nkey);
-        ctx->blocksize = 16;
+        if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+            nkey /= 2;
+            err = gcry_cipher_setkey(ctx->handle, key, nkey);
+            if (err != 0) {
+                error_setg(errp, "Cannot set key: %s",
+                           gcry_strerror(err));
+                goto error;
+            }
+            err = gcry_cipher_setkey(ctx->tweakhandle, key + nkey, nkey);
+        } else {
+            err = gcry_cipher_setkey(ctx->handle, key, nkey);
+        }
+        if (err != 0) {
+            error_setg(errp, "Cannot set key: %s",
+                       gcry_strerror(err));
+            goto error;
+        }
+        switch (cipher->alg) {
+        case QCRYPTO_CIPHER_ALG_AES_128:
+        case QCRYPTO_CIPHER_ALG_AES_192:
+        case QCRYPTO_CIPHER_ALG_AES_256:
+        case QCRYPTO_CIPHER_ALG_SERPENT_128:
+        case QCRYPTO_CIPHER_ALG_SERPENT_192:
+        case QCRYPTO_CIPHER_ALG_SERPENT_256:
+        case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+        case QCRYPTO_CIPHER_ALG_TWOFISH_256:
+            ctx->blocksize = 16;
+            break;
+        case QCRYPTO_CIPHER_ALG_CAST5_128:
+            ctx->blocksize = 8;
+            break;
+        default:
+            g_assert_not_reached();
+        }
    }
-    if (err != 0) {
-        error_setg(errp, "Cannot set key: %s",
-                   gcry_strerror(err));
-        goto error;
+
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        ctx->iv = g_new0(uint8_t, ctx->blocksize);
    }

    cipher->opaque = ctx;
@@ -126,6 +200,9 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,

 error:
    gcry_cipher_close(ctx->handle);
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        gcry_cipher_close(ctx->tweakhandle);
+    }
    g_free(ctx);
    g_free(cipher);
    return NULL;
@@ -140,11 +217,35 @@ void qcrypto_cipher_free(QCryptoCipher *cipher)
    }
    ctx = cipher->opaque;
    gcry_cipher_close(ctx->handle);
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        gcry_cipher_close(ctx->tweakhandle);
+    }
+    g_free(ctx->iv);
    g_free(ctx);
    g_free(cipher);
 }


+/* XTS callback: encrypt @length bytes using the gcrypt handle in @ctx */
+static void qcrypto_gcrypt_xts_encrypt(const void *ctx, size_t length,
+                                       uint8_t *dst, const uint8_t *src)
+{
+    gcry_cipher_hd_t handle = (gcry_cipher_hd_t)ctx;
+    gcry_error_t err = gcry_cipher_encrypt(handle, dst, length, src, length);
+    /* Void callback: a gcrypt failure can only be caught by asserting */
+    g_assert(err == 0);
+}
+
+/* XTS callback: decrypt @length bytes using the gcrypt handle in @ctx */
+static void qcrypto_gcrypt_xts_decrypt(const void *ctx, size_t length,
+                                       uint8_t *dst, const uint8_t *src)
+{
+    gcry_cipher_hd_t handle = (gcry_cipher_hd_t)ctx;
+    gcry_error_t err = gcry_cipher_decrypt(handle, dst, length, src, length);
+    /* Void callback: a gcrypt failure can only be caught by asserting */
+    g_assert(err == 0);
+}
+
 int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
                           const void *in,
                           void *out,
@@ -160,13 +261,20 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
        return -1;
    }

-    err = gcry_cipher_encrypt(ctx->handle,
-                              out, len,
-                              in, len);
-    if (err != 0) {
-        error_setg(errp, "Cannot encrypt data: %s",
-                   gcry_strerror(err));
-        return -1;
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        xts_encrypt(ctx->handle, ctx->tweakhandle,
+                    qcrypto_gcrypt_xts_encrypt,
+                    qcrypto_gcrypt_xts_decrypt,
+                    ctx->iv, len, out, in);
+    } else {
+        err = gcry_cipher_encrypt(ctx->handle,
+                                  out, len,
+                                  in, len);
+        if (err != 0) {
+            error_setg(errp, "Cannot encrypt data: %s",
+                       gcry_strerror(err));
+            return -1;
+        }
    }

    return 0;
@@ -188,13 +296,20 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,
        return -1;
    }

-    err = gcry_cipher_decrypt(ctx->handle,
-                              out, len,
-                              in, len);
-    if (err != 0) {
-        error_setg(errp, "Cannot decrypt data: %s",
-                   gcry_strerror(err));
-        return -1;
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        xts_decrypt(ctx->handle, ctx->tweakhandle,
+                    qcrypto_gcrypt_xts_encrypt,
+                    qcrypto_gcrypt_xts_decrypt,
+                    ctx->iv, len, out, in);
+    } else {
+        err = gcry_cipher_decrypt(ctx->handle,
+                                  out, len,
+                                  in, len);
+        if (err != 0) {
+            error_setg(errp, "Cannot decrypt data: %s",
+                       gcry_strerror(err));
+            return -1;
+        }
    }

    return 0;
@@ -213,12 +328,16 @@ int qcrypto_cipher_setiv(QCryptoCipher *cipher,
        return -1;
    }

-    gcry_cipher_reset(ctx->handle);
-    err = gcry_cipher_setiv(ctx->handle, iv, niv);
-    if (err != 0) {
-        error_setg(errp, "Cannot set IV: %s",
+    if (ctx->iv) {
+        memcpy(ctx->iv, iv, niv);
+    } else {
+        gcry_cipher_reset(ctx->handle);
+        err = gcry_cipher_setiv(ctx->handle, iv, niv);
+        if (err != 0) {
+            error_setg(errp, "Cannot set IV: %s",
                   gcry_strerror(err));
-        return -1;
+            return -1;
+        }
    }

    return 0;
--- a/crypto/cipher-nettle.c
+++ b/crypto/cipher-nettle.c
@@ -19,56 +19,174 @@
 */

 #include "qemu/osdep.h"
+#include "crypto/xts.h"
+
 #include <nettle/nettle-types.h>
 #include <nettle/aes.h>
 #include <nettle/des.h>
 #include <nettle/cbc.h>
+#include <nettle/cast128.h>
+#include <nettle/serpent.h>
+#include <nettle/twofish.h>
+
+/*
+ * Cipher callback with a fixed signature (const context, size_t length).
+ * Unlike the QCryptoCipherNettleFuncNative typedefs below, it does not
+ * change with CONFIG_NETTLE_VERSION_MAJOR.
+ */
+typedef void (*QCryptoCipherNettleFuncWrapper)(const void *ctx,
+                                               size_t length,
+                                               uint8_t *dst,
+                                               const uint8_t *src);

 #if CONFIG_NETTLE_VERSION_MAJOR < 3
-typedef nettle_crypt_func nettle_cipher_func;
-
+typedef nettle_crypt_func * QCryptoCipherNettleFuncNative;
 typedef void *       cipher_ctx_t;
 typedef unsigned     cipher_length_t;
+
+#define cast5_set_key cast128_set_key
 #else
+typedef nettle_cipher_func * QCryptoCipherNettleFuncNative;
 typedef const void * cipher_ctx_t;
 typedef size_t       cipher_length_t;
 #endif

-static nettle_cipher_func aes_encrypt_wrapper;
-static nettle_cipher_func aes_decrypt_wrapper;
-static nettle_cipher_func des_encrypt_wrapper;
-static nettle_cipher_func des_decrypt_wrapper;
+/*
+ * Holds the AES encryption and decryption contexts side by side so that
+ * a single pointer can be handed to both the encrypt and decrypt callbacks.
+ */
+typedef struct QCryptoNettleAES {
+    struct aes_ctx enc; /* keyed with aes_set_encrypt_key() */
+    struct aes_ctx dec; /* keyed with aes_set_decrypt_key() */
+} QCryptoNettleAES;

-static void aes_encrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
-                                uint8_t *dst, const uint8_t *src)
+static void aes_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                               uint8_t *dst, const uint8_t *src)
 {
-    aes_encrypt(ctx, length, dst, src);
+    const QCryptoNettleAES *aesctx = ctx;
+    aes_encrypt(&aesctx->enc, length, dst, src);
 }

-static void aes_decrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
-                                uint8_t *dst, const uint8_t *src)
+static void aes_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                               uint8_t *dst, const uint8_t *src)
 {
-    aes_decrypt(ctx, length, dst, src);
+    const QCryptoNettleAES *aesctx = ctx;
+    aes_decrypt(&aesctx->dec, length, dst, src);
 }

-static void des_encrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
+/*
+ * DES encryption callback using the nettle-version-dependent
+ * cipher_ctx_t / cipher_length_t parameter types; forwards to des_encrypt().
+ */
+static void des_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                               uint8_t *dst, const uint8_t *src)
+{
+    des_encrypt(ctx, length, dst, src);
+}
+
+/*
+ * DES decryption callback using the nettle-version-dependent
+ * cipher_ctx_t / cipher_length_t parameter types; forwards to des_decrypt().
+ */
+static void des_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                               uint8_t *dst, const uint8_t *src)
+{
+    des_decrypt(ctx, length, dst, src);
+}
+
+/*
+ * CAST-128 encryption callback using the nettle-version-dependent
+ * parameter types; forwards to cast128_encrypt().
+ */
+static void cast128_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    cast128_encrypt(ctx, length, dst, src);
+}
+
+/*
+ * CAST-128 decryption callback using the nettle-version-dependent
+ * parameter types; forwards to cast128_decrypt().
+ */
+static void cast128_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    cast128_decrypt(ctx, length, dst, src);
+}
+
+/*
+ * Serpent encryption callback using the nettle-version-dependent
+ * parameter types; forwards to serpent_encrypt().
+ */
+static void serpent_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    serpent_encrypt(ctx, length, dst, src);
+}
+
+/*
+ * Serpent decryption callback using the nettle-version-dependent
+ * parameter types; forwards to serpent_decrypt().
+ */
+static void serpent_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    serpent_decrypt(ctx, length, dst, src);
+}
+
+/*
+ * Twofish encryption callback using the nettle-version-dependent
+ * parameter types; forwards to twofish_encrypt().
+ */
+static void twofish_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    twofish_encrypt(ctx, length, dst, src);
+}
+
+/*
+ * Twofish decryption callback using the nettle-version-dependent
+ * parameter types; forwards to twofish_decrypt().
+ */
+static void twofish_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    twofish_decrypt(ctx, length, dst, src);
+}
+
+/*
+ * AES encryption callback with the fixed (const void *, size_t) signature.
+ * @ctx is treated as a QCryptoNettleAES and its 'enc' context is used.
+ */
+static void aes_encrypt_wrapper(const void *ctx, size_t length,
+                                uint8_t *dst, const uint8_t *src)
+{
+    const QCryptoNettleAES *aesctx = ctx;
+    aes_encrypt(&aesctx->enc, length, dst, src);
+}
+
+/*
+ * AES decryption callback with the fixed (const void *, size_t) signature.
+ * @ctx is treated as a QCryptoNettleAES and its 'dec' context is used.
+ */
+static void aes_decrypt_wrapper(const void *ctx, size_t length,
+                                uint8_t *dst, const uint8_t *src)
+{
+    const QCryptoNettleAES *aesctx = ctx;
+    aes_decrypt(&aesctx->dec, length, dst, src);
+}
+
+static void des_encrypt_wrapper(const void *ctx, size_t length,
                                uint8_t *dst, const uint8_t *src)
 {
    des_encrypt(ctx, length, dst, src);
 }

-static void des_decrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
+static void des_decrypt_wrapper(const void *ctx, size_t length,
                                uint8_t *dst, const uint8_t *src)
 {
    des_decrypt(ctx, length, dst, src);
 }

+/*
+ * CAST-128 encryption callback with the fixed (const void *, size_t)
+ * signature; forwards to cast128_encrypt().
+ */
+static void cast128_encrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    cast128_encrypt(ctx, length, dst, src);
+}
+
+/*
+ * CAST-128 decryption callback with the fixed (const void *, size_t)
+ * signature; forwards to cast128_decrypt().
+ */
+static void cast128_decrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    cast128_decrypt(ctx, length, dst, src);
+}
+
+/*
+ * Serpent encryption callback with the fixed (const void *, size_t)
+ * signature; forwards to serpent_encrypt().
+ */
+static void serpent_encrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    serpent_encrypt(ctx, length, dst, src);
+}
+
+/*
+ * Serpent decryption callback with the fixed (const void *, size_t)
+ * signature; forwards to serpent_decrypt().
+ */
+static void serpent_decrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    serpent_decrypt(ctx, length, dst, src);
+}
+
+/*
+ * Twofish encryption callback with the fixed (const void *, size_t)
+ * signature; forwards to twofish_encrypt().
+ */
+static void twofish_encrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    twofish_encrypt(ctx, length, dst, src);
+}
+
+/*
+ * Twofish decryption callback with the fixed (const void *, size_t)
+ * signature; forwards to twofish_decrypt().
+ */
+static void twofish_decrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    twofish_decrypt(ctx, length, dst, src);
+}
+
 typedef struct QCryptoCipherNettle QCryptoCipherNettle;
 struct QCryptoCipherNettle {
-    void *ctx_encrypt;
-    void *ctx_decrypt;
-    nettle_cipher_func *alg_encrypt;
-    nettle_cipher_func *alg_decrypt;
+    /* Primary cipher context for all modes */
+    void *ctx;
+    /* Second cipher context for XTS mode only */
+    void *ctx_tweak;
+    /* Cipher callbacks for both contexts */
+    QCryptoCipherNettleFuncNative alg_encrypt_native;
+    QCryptoCipherNettleFuncNative alg_decrypt_native;
+    QCryptoCipherNettleFuncWrapper alg_encrypt_wrapper;
+    QCryptoCipherNettleFuncWrapper alg_decrypt_wrapper;
+
    uint8_t *iv;
    size_t blocksize;
 };
@@ -80,6 +198,13 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
    case QCRYPTO_CIPHER_ALG_AES_128:
    case QCRYPTO_CIPHER_ALG_AES_192:
    case QCRYPTO_CIPHER_ALG_AES_256:
+    case QCRYPTO_CIPHER_ALG_CAST5_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_192:
+    case QCRYPTO_CIPHER_ALG_SERPENT_256:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_192:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
        return true;
    default:
        return false;
@@ -99,13 +224,14 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    switch (mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
    case QCRYPTO_CIPHER_MODE_CBC:
+    case QCRYPTO_CIPHER_MODE_XTS:
        break;
    default:
        error_setg(errp, "Unsupported cipher mode %d", mode);
        return NULL;
    }

-    if (!qcrypto_cipher_validate_key_length(alg, nkey, errp)) {
+    if (!qcrypto_cipher_validate_key_length(alg, mode, nkey, errp)) {
        return NULL;
    }

@@ -117,14 +243,15 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,

    switch (alg) {
    case QCRYPTO_CIPHER_ALG_DES_RFB:
-        ctx->ctx_encrypt = g_new0(struct des_ctx, 1);
-        ctx->ctx_decrypt = NULL; /* 1 ctx can do both */
+        ctx->ctx = g_new0(struct des_ctx, 1);
        rfbkey = qcrypto_cipher_munge_des_rfb_key(key, nkey);
-        des_set_key(ctx->ctx_encrypt, rfbkey);
+        des_set_key(ctx->ctx, rfbkey);
        g_free(rfbkey);

-        ctx->alg_encrypt = des_encrypt_wrapper;
-        ctx->alg_decrypt = des_decrypt_wrapper;
+        ctx->alg_encrypt_native = des_encrypt_native;
+        ctx->alg_decrypt_native = des_decrypt_native;
+        ctx->alg_encrypt_wrapper = des_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = des_decrypt_wrapper;

        ctx->blocksize = DES_BLOCK_SIZE;
        break;
@@ -132,17 +259,103 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    case QCRYPTO_CIPHER_ALG_AES_128:
    case QCRYPTO_CIPHER_ALG_AES_192:
    case QCRYPTO_CIPHER_ALG_AES_256:
-        ctx->ctx_encrypt = g_new0(struct aes_ctx, 1);
-        ctx->ctx_decrypt = g_new0(struct aes_ctx, 1);
+        ctx->ctx = g_new0(QCryptoNettleAES, 1);

-        aes_set_encrypt_key(ctx->ctx_encrypt, nkey, key);
-        aes_set_decrypt_key(ctx->ctx_decrypt, nkey, key);
+        if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+            ctx->ctx_tweak = g_new0(QCryptoNettleAES, 1);

-        ctx->alg_encrypt = aes_encrypt_wrapper;
-        ctx->alg_decrypt = aes_decrypt_wrapper;
+            nkey /= 2;
+            aes_set_encrypt_key(&((QCryptoNettleAES *)ctx->ctx)->enc,
+                                nkey, key);
+            aes_set_decrypt_key(&((QCryptoNettleAES *)ctx->ctx)->dec,
+                                nkey, key);
+
+            aes_set_encrypt_key(&((QCryptoNettleAES *)ctx->ctx_tweak)->enc,
+                                nkey, key + nkey);
+            aes_set_decrypt_key(&((QCryptoNettleAES *)ctx->ctx_tweak)->dec,
+                                nkey, key + nkey);
+        } else {
+            aes_set_encrypt_key(&((QCryptoNettleAES *)ctx->ctx)->enc,
+                                nkey, key);
+            aes_set_decrypt_key(&((QCryptoNettleAES *)ctx->ctx)->dec,
+                                nkey, key);
+        }
+
+        ctx->alg_encrypt_native = aes_encrypt_native;
+        ctx->alg_decrypt_native = aes_decrypt_native;
+        ctx->alg_encrypt_wrapper = aes_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = aes_decrypt_wrapper;

        ctx->blocksize = AES_BLOCK_SIZE;
        break;
+
+    case QCRYPTO_CIPHER_ALG_CAST5_128:
+        ctx->ctx = g_new0(struct cast128_ctx, 1);
+
+        if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+            ctx->ctx_tweak = g_new0(struct cast128_ctx, 1);
+
+            nkey /= 2;
+            cast5_set_key(ctx->ctx, nkey, key);
+            cast5_set_key(ctx->ctx_tweak, nkey, key + nkey);
+        } else {
+            cast5_set_key(ctx->ctx, nkey, key);
+        }
+
+        ctx->alg_encrypt_native = cast128_encrypt_native;
+        ctx->alg_decrypt_native = cast128_decrypt_native;
+        ctx->alg_encrypt_wrapper = cast128_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = cast128_decrypt_wrapper;
+
+        ctx->blocksize = CAST128_BLOCK_SIZE;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_SERPENT_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_192:
+    case QCRYPTO_CIPHER_ALG_SERPENT_256:
+        ctx->ctx = g_new0(struct serpent_ctx, 1);
+
+        if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+            ctx->ctx_tweak = g_new0(struct serpent_ctx, 1);
+
+            nkey /= 2;
+            serpent_set_key(ctx->ctx, nkey, key);
+            serpent_set_key(ctx->ctx_tweak, nkey, key + nkey);
+        } else {
+            serpent_set_key(ctx->ctx, nkey, key);
+        }
+
+        ctx->alg_encrypt_native = serpent_encrypt_native;
+        ctx->alg_decrypt_native = serpent_decrypt_native;
+        ctx->alg_encrypt_wrapper = serpent_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = serpent_decrypt_wrapper;
+
+        ctx->blocksize = SERPENT_BLOCK_SIZE;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_192:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
+        ctx->ctx = g_new0(struct twofish_ctx, 1);
+
+        if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+            ctx->ctx_tweak = g_new0(struct twofish_ctx, 1);
+
+            nkey /= 2;
+            twofish_set_key(ctx->ctx, nkey, key);
+            twofish_set_key(ctx->ctx_tweak, nkey, key + nkey);
+        } else {
+            twofish_set_key(ctx->ctx, nkey, key);
+        }
+
+        ctx->alg_encrypt_native = twofish_encrypt_native;
+        ctx->alg_decrypt_native = twofish_decrypt_native;
+        ctx->alg_encrypt_wrapper = twofish_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = twofish_decrypt_wrapper;
+
+        ctx->blocksize = TWOFISH_BLOCK_SIZE;
+        break;
+
    default:
        error_setg(errp, "Unsupported cipher algorithm %d", alg);
        goto error;
@@ -170,8 +383,8 @@ void qcrypto_cipher_free(QCryptoCipher *cipher)

    ctx = cipher->opaque;
    g_free(ctx->iv);
-    g_free(ctx->ctx_encrypt);
-    g_free(ctx->ctx_decrypt);
+    g_free(ctx->ctx);
+    g_free(ctx->ctx_tweak);
    g_free(ctx);
    g_free(cipher);
 }
@@ -193,14 +406,21 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,

    switch (cipher->mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
-        ctx->alg_encrypt(ctx->ctx_encrypt, len, out, in);
+        ctx->alg_encrypt_wrapper(ctx->ctx, len, out, in);
        break;

    case QCRYPTO_CIPHER_MODE_CBC:
-        cbc_encrypt(ctx->ctx_encrypt, ctx->alg_encrypt,
+        cbc_encrypt(ctx->ctx, ctx->alg_encrypt_native,
                    ctx->blocksize, ctx->iv,
                    len, out, in);
        break;
+
+    case QCRYPTO_CIPHER_MODE_XTS:
+        xts_encrypt(ctx->ctx, ctx->ctx_tweak,
+                    ctx->alg_encrypt_wrapper, ctx->alg_encrypt_wrapper,
+                    ctx->iv, len, out, in);
+        break;
+
    default:
        error_setg(errp, "Unsupported cipher algorithm %d",
                   cipher->alg);
@@ -226,15 +446,26 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,

    switch (cipher->mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
-        ctx->alg_decrypt(ctx->ctx_decrypt ? ctx->ctx_decrypt : ctx->ctx_encrypt,
-                         len, out, in);
+        ctx->alg_decrypt_wrapper(ctx->ctx, len, out, in);
        break;

    case QCRYPTO_CIPHER_MODE_CBC:
-        cbc_decrypt(ctx->ctx_decrypt ? ctx->ctx_decrypt : ctx->ctx_encrypt,
-                    ctx->alg_decrypt, ctx->blocksize, ctx->iv,
+        cbc_decrypt(ctx->ctx, ctx->alg_decrypt_native,
+                    ctx->blocksize, ctx->iv,
                    len, out, in);
        break;
+
+    case QCRYPTO_CIPHER_MODE_XTS:
+        if (ctx->blocksize != XTS_BLOCK_SIZE) {
+            error_setg(errp, "Block size must be %d not %zu",
+                       XTS_BLOCK_SIZE, ctx->blocksize);
+            return -1;
+        }
+        xts_decrypt(ctx->ctx, ctx->ctx_tweak,
+                    ctx->alg_encrypt_wrapper, ctx->alg_decrypt_wrapper,
+                    ctx->iv, len, out, in);
+        break;
+
    default:
        error_setg(errp, "Unsupported cipher algorithm %d",
                   cipher->alg);
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -27,6 +27,13 @@ static size_t alg_key_len[QCRYPTO_CIPHER_ALG__MAX] = {
    [QCRYPTO_CIPHER_ALG_AES_192] = 24,
    [QCRYPTO_CIPHER_ALG_AES_256] = 32,
    [QCRYPTO_CIPHER_ALG_DES_RFB] = 8,
+    [QCRYPTO_CIPHER_ALG_CAST5_128] = 16,
+    [QCRYPTO_CIPHER_ALG_SERPENT_128] = 16,
+    [QCRYPTO_CIPHER_ALG_SERPENT_192] = 24,
+    [QCRYPTO_CIPHER_ALG_SERPENT_256] = 32,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 24,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 32,
 };

 static size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = {
@@ -34,11 +41,19 @@ static size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = {
    [QCRYPTO_CIPHER_ALG_AES_192] = 16,
    [QCRYPTO_CIPHER_ALG_AES_256] = 16,
    [QCRYPTO_CIPHER_ALG_DES_RFB] = 8,
+    [QCRYPTO_CIPHER_ALG_CAST5_128] = 8,
+    [QCRYPTO_CIPHER_ALG_SERPENT_128] = 16,
+    [QCRYPTO_CIPHER_ALG_SERPENT_192] = 16,
+    [QCRYPTO_CIPHER_ALG_SERPENT_256] = 16,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 16,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 16,
 };

 static bool mode_need_iv[QCRYPTO_CIPHER_MODE__MAX] = {
    [QCRYPTO_CIPHER_MODE_ECB] = false,
    [QCRYPTO_CIPHER_MODE_CBC] = true,
+    [QCRYPTO_CIPHER_MODE_XTS] = true,
 };


@@ -79,6 +94,7 @@ size_t qcrypto_cipher_get_iv_len(QCryptoCipherAlgorithm alg,

 static bool
 qcrypto_cipher_validate_key_length(QCryptoCipherAlgorithm alg,
+                                   QCryptoCipherMode mode,
                                   size_t nkey,
                                   Error **errp)
 {
@@ -88,10 +104,27 @@ qcrypto_cipher_validate_key_length(QCryptoCipherAlgorithm alg,
        return false;
    }

-    if (alg_key_len[alg] != nkey) {
-        error_setg(errp, "Cipher key length %zu should be %zu",
-                   nkey, alg_key_len[alg]);
-        return false;
+    if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+        if (alg == QCRYPTO_CIPHER_ALG_DES_RFB) {
+            error_setg(errp, "XTS mode not compatible with DES-RFB");
+            return false;
+        }
+        if (nkey % 2) {
+            error_setg(errp, "XTS cipher key length should be a multiple of 2");
+            return false;
+        }
+
+        if (alg_key_len[alg] != (nkey / 2)) {
+            error_setg(errp, "Cipher key length %zu should be %zu",
+                       nkey, alg_key_len[alg] * 2);
+            return false;
+        }
+    } else {
+        if (alg_key_len[alg] != nkey) {
+            error_setg(errp, "Cipher key length %zu should be %zu",
+                       nkey, alg_key_len[alg]);
+            return false;
+        }
    }
    return true;
 }
--- a/crypto/ivgen-essiv.c
+++ b/crypto/ivgen-essiv.c
@@ -0,0 +1,118 @@
+/*
+ * QEMU Crypto block IV generator - essiv
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/ivgen-essiv.h"
+
+/* Per-generator state of the ESSIV driver, stored in QCryptoIVGen.private */
+typedef struct QCryptoIVGenESSIV QCryptoIVGenESSIV;
+struct QCryptoIVGenESSIV {
+    /* ECB-mode cipher keyed with the hashed key; encrypts sector numbers */
+    QCryptoCipher *cipher;
+};
+
+/*
+ * Set up the ESSIV state for @ivgen.
+ *
+ * @key (of @nkey bytes) is hashed with ivgen->hash; the digest, possibly
+ * truncated to the key length of ivgen->cipher, becomes the key of an
+ * ECB-mode cipher which is stored in ivgen->private.
+ *
+ * Returns 0 on success. Returns -1 if hashing the key or creating the
+ * cipher fails; @errp is passed through to the failing call and every
+ * allocation made here is released.
+ */
+static int qcrypto_ivgen_essiv_init(QCryptoIVGen *ivgen,
+                                    const uint8_t *key, size_t nkey,
+                                    Error **errp)
+{
+    uint8_t *salt;
+    size_t nhash;
+    size_t nsalt;
+    QCryptoIVGenESSIV *essiv = g_new0(QCryptoIVGenESSIV, 1);
+
+    /* Not necessarily the same as nkey */
+    nsalt = qcrypto_cipher_get_key_len(ivgen->cipher);
+
+    nhash = qcrypto_hash_digest_len(ivgen->hash);
+    /* Salt buffer must hold the larger of the hash size and the key size */
+    salt = g_new0(uint8_t, MAX(nhash, nsalt));
+
+    if (qcrypto_hash_bytes(ivgen->hash, (const gchar *)key, nkey,
+                           &salt, &nhash,
+                           errp) < 0) {
+        g_free(essiv);
+        /* salt was allocated above and must not leak on this path */
+        g_free(salt);
+        return -1;
+    }
+
+    /* Now potentially truncate salt to match cipher key len */
+    essiv->cipher = qcrypto_cipher_new(ivgen->cipher,
+                                       QCRYPTO_CIPHER_MODE_ECB,
+                                       salt, MIN(nhash, nsalt),
+                                       errp);
+    if (!essiv->cipher) {
+        g_free(essiv);
+        g_free(salt);
+        return -1;
+    }
+
+    g_free(salt);
+    ivgen->private = essiv;
+
+    return 0;
+}
+
+/*
+ * Compute the ESSIV initialisation vector for @sector into @iv.
+ *
+ * The little-endian sector number, zero-padded to one block of
+ * ivgen->cipher, is encrypted in place with essiv->cipher. Up to @niv
+ * bytes of the result are copied to @iv; if the block is shorter than
+ * @niv the rest of @iv is zero-filled.
+ *
+ * Returns 0 on success, -1 if the encryption fails.
+ */
+static int qcrypto_ivgen_essiv_calculate(QCryptoIVGen *ivgen,
+                                         uint64_t sector,
+                                         uint8_t *iv, size_t niv,
+                                         Error **errp)
+{
+    QCryptoIVGenESSIV *essiv = ivgen->private;
+    size_t ndata = qcrypto_cipher_get_block_len(ivgen->cipher);
+    uint8_t *data = g_new(uint8_t, ndata);
+
+    sector = cpu_to_le64(sector);
+    /*
+     * Never read more than sizeof(sector) bytes from the local variable:
+     * the block length may be larger (e.g. 16 bytes), and copying ndata
+     * bytes would read past the end of 'sector' on the stack.
+     */
+    memcpy(data, (uint8_t *)&sector, MIN(sizeof(sector), ndata));
+    if (sizeof(sector) < ndata) {
+        memset(data + sizeof(sector), 0, ndata - sizeof(sector));
+    }
+
+    if (qcrypto_cipher_encrypt(essiv->cipher,
+                               data,
+                               data,
+                               ndata,
+                               errp) < 0) {
+        g_free(data);
+        return -1;
+    }
+
+    /* Copy at most niv bytes, then zero-pad any remainder of the IV */
+    if (ndata > niv) {
+        ndata = niv;
+    }
+    memcpy(iv, data, ndata);
+    if (ndata < niv) {
+        memset(iv + ndata, 0, niv - ndata);
+    }
+    g_free(data);
+    return 0;
+}
+
+/* Release the ESSIV cipher and the state struct stored in ivgen->private */
+static void qcrypto_ivgen_essiv_cleanup(QCryptoIVGen *ivgen)
+{
+    QCryptoIVGenESSIV *essiv = ivgen->private;
+
+    qcrypto_cipher_free(essiv->cipher);
+    g_free(essiv);
+}
+
+
+/* Driver table exposing the ESSIV init/calculate/cleanup callbacks */
+struct QCryptoIVGenDriver qcrypto_ivgen_essiv = {
+    .init = qcrypto_ivgen_essiv_init,
+    .calculate = qcrypto_ivgen_essiv_calculate,
+    .cleanup = qcrypto_ivgen_essiv_cleanup,
+};
+
--- a/crypto/ivgen-essiv.h
+++ b/crypto/ivgen-essiv.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block IV generator - essiv
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "crypto/ivgenpriv.h"
+
+#ifndef QCRYPTO_IVGEN_ESSIV_H__
+#define QCRYPTO_IVGEN_ESSIV_H__
+
+extern struct QCryptoIVGenDriver qcrypto_ivgen_essiv;
+
+#endif /* QCRYPTO_IVGEN_ESSIV_H__ */
--- a/crypto/ivgen-plain.c
+++ b/crypto/ivgen-plain.c
@@ -0,0 +1,59 @@
+/*
+ * QEMU Crypto block IV generator - plain
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/ivgen-plain.h"
+
+/*
+ * The plain generator keeps no state, so there is nothing to set up.
+ * All arguments are unused; always returns 0.
+ */
+static int qcrypto_ivgen_plain_init(QCryptoIVGen *ivgen,
+                                    const uint8_t *key, size_t nkey,
+                                    Error **errp)
+{
+    return 0;
+}
+
+/*
+ * Fill @iv with the low 32 bits of @sector in little-endian byte order,
+ * truncated to @niv bytes if the IV is shorter and zero-padded if it is
+ * longer. @ivgen and @errp are unused; always returns 0.
+ */
+static int qcrypto_ivgen_plain_calculate(QCryptoIVGen *ivgen,
+                                         uint64_t sector,
+                                         uint8_t *iv, size_t niv,
+                                         Error **errp)
+{
+    uint32_t low32 = cpu_to_le32((sector & 0xffffffff));
+    size_t ncopy = niv < sizeof(low32) ? niv : sizeof(low32);
+
+    memcpy(iv, &low32, ncopy);
+    /* Zero whatever part of the IV the sector bytes did not cover */
+    if (niv > ncopy) {
+        memset(iv + ncopy, 0, niv - ncopy);
+    }
+    return 0;
+}
+
+/* Nothing to release: init allocates no state for the plain generator */
+static void qcrypto_ivgen_plain_cleanup(QCryptoIVGen *ivgen)
+{
+}
+
+
+/* Driver table for the 32-bit "plain" sector-number IV generator */
+struct QCryptoIVGenDriver qcrypto_ivgen_plain = {
+    .init = qcrypto_ivgen_plain_init,
+    .calculate = qcrypto_ivgen_plain_calculate,
+    .cleanup = qcrypto_ivgen_plain_cleanup,
+};
+
--- a/crypto/ivgen-plain.h
+++ b/crypto/ivgen-plain.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block IV generator - plain
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "crypto/ivgenpriv.h"
+
+#ifndef QCRYPTO_IVGEN_PLAIN_H__
+#define QCRYPTO_IVGEN_PLAIN_H__
+
+extern struct QCryptoIVGenDriver qcrypto_ivgen_plain;
+
+#endif /* QCRYPTO_IVGEN_PLAIN_H__ */
--- a/crypto/ivgen-plain64.c
+++ b/crypto/ivgen-plain64.c
@@ -0,0 +1,59 @@
+/*
+ * QEMU Crypto block IV generator - plain64
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/ivgen-plain.h"
+
+/*
+ * The plain64 generator keeps no state, so there is nothing to set up.
+ * All arguments are unused; always returns 0.
+ */
+static int qcrypto_ivgen_plain_init(QCryptoIVGen *ivgen,
+                                    const uint8_t *key, size_t nkey,
+                                    Error **errp)
+{
+    return 0;
+}
+
+/*
+ * Fill @iv with the full 64-bit @sector in little-endian byte order,
+ * truncated to @niv bytes if the IV is shorter and zero-padded if it is
+ * longer. @ivgen and @errp are unused; always returns 0.
+ */
+static int qcrypto_ivgen_plain_calculate(QCryptoIVGen *ivgen,
+                                         uint64_t sector,
+                                         uint8_t *iv, size_t niv,
+                                         Error **errp)
+{
+    uint64_t le_sector = cpu_to_le64(sector);
+    size_t ncopy = niv < sizeof(le_sector) ? niv : sizeof(le_sector);
+
+    memcpy(iv, &le_sector, ncopy);
+    /* Zero whatever part of the IV the sector bytes did not cover */
+    if (niv > ncopy) {
+        memset(iv + ncopy, 0, niv - ncopy);
+    }
+    return 0;
+}
+
+/* Nothing to release: init allocates no state for the plain64 generator */
+static void qcrypto_ivgen_plain_cleanup(QCryptoIVGen *ivgen)
+{
+}
+
+
+/* Driver table for the 64-bit "plain64" sector-number IV generator */
+struct QCryptoIVGenDriver qcrypto_ivgen_plain64 = {
+    .init = qcrypto_ivgen_plain_init,
+    .calculate = qcrypto_ivgen_plain_calculate,
+    .cleanup = qcrypto_ivgen_plain_cleanup,
+};
+
--- a/crypto/ivgen-plain64.h
+++ b/crypto/ivgen-plain64.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block IV generator - plain64
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "crypto/ivgenpriv.h"
+
+#ifndef QCRYPTO_IVGEN_PLAIN64_H__
+#define QCRYPTO_IVGEN_PLAIN64_H__
+
+extern struct QCryptoIVGenDriver qcrypto_ivgen_plain64;
+
+#endif /* QCRYPTO_IVGEN_PLAIN64_H__ */
--- a/crypto/ivgen.c
+++ b/crypto/ivgen.c
@@ -0,0 +1,99 @@
+/*
+ * QEMU Crypto block IV generator
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/ivgenpriv.h"
+#include "crypto/ivgen-plain.h"
+#include "crypto/ivgen-plain64.h"
+#include "crypto/ivgen-essiv.h"
+
+
+/*
+ * Create an IV generator for algorithm @alg.
+ *
+ * @cipheralg and @hash are recorded on the generator and, together with
+ * @key / @nkey, are available to the driver's init callback.
+ *
+ * Returns the new generator, or NULL if @alg is not recognised (with
+ * @errp set) or if the driver's init callback reports failure.
+ */
+QCryptoIVGen *qcrypto_ivgen_new(QCryptoIVGenAlgorithm alg,
+                                QCryptoCipherAlgorithm cipheralg,
+                                QCryptoHashAlgorithm hash,
+                                const uint8_t *key, size_t nkey,
+                                Error **errp)
+{
+    QCryptoIVGenDriver *driver = NULL;
+    QCryptoIVGen *ivgen;
+
+    /* Resolve the driver first so nothing is allocated for a bad @alg */
+    switch (alg) {
+    case QCRYPTO_IVGEN_ALG_PLAIN:
+        driver = &qcrypto_ivgen_plain;
+        break;
+    case QCRYPTO_IVGEN_ALG_PLAIN64:
+        driver = &qcrypto_ivgen_plain64;
+        break;
+    case QCRYPTO_IVGEN_ALG_ESSIV:
+        driver = &qcrypto_ivgen_essiv;
+        break;
+    default:
+        break;
+    }
+
+    if (!driver) {
+        error_setg(errp, "Unknown block IV generator algorithm %d", alg);
+        return NULL;
+    }
+
+    ivgen = g_new0(QCryptoIVGen, 1);
+    ivgen->driver = driver;
+    ivgen->algorithm = alg;
+    ivgen->cipher = cipheralg;
+    ivgen->hash = hash;
+
+    if (driver->init(ivgen, key, nkey, errp) < 0) {
+        g_free(ivgen);
+        return NULL;
+    }
+
+    return ivgen;
+}
+
+
+/*
+ * Compute the IV for @sector into the @niv-byte buffer @iv by delegating
+ * to the driver's calculate callback; returns that callback's result.
+ */
+int qcrypto_ivgen_calculate(QCryptoIVGen *ivgen,
+                            uint64_t sector,
+                            uint8_t *iv, size_t niv,
+                            Error **errp)
+{
+    return ivgen->driver->calculate(ivgen, sector, iv, niv, errp);
+}
+
+
+/* Return the IV generator algorithm @ivgen was created with */
+QCryptoIVGenAlgorithm qcrypto_ivgen_get_algorithm(QCryptoIVGen *ivgen)
+{
+    return ivgen->algorithm;
+}
+
+
+/* Return the cipher algorithm @ivgen was created with */
+QCryptoCipherAlgorithm qcrypto_ivgen_get_cipher(QCryptoIVGen *ivgen)
+{
+    return ivgen->cipher;
+}
+
+
+/* Return the hash algorithm @ivgen was created with */
+QCryptoHashAlgorithm qcrypto_ivgen_get_hash(QCryptoIVGen *ivgen)
+{
+    return ivgen->hash;
+}
+
+
+/*
+ * Run the driver's cleanup callback and free @ivgen.
+ * Passing NULL is allowed and does nothing.
+ */
+void qcrypto_ivgen_free(QCryptoIVGen *ivgen)
+{
+    if (ivgen) {
+        ivgen->driver->cleanup(ivgen);
+        g_free(ivgen);
+    }
+}
--- a/crypto/ivgenpriv.h
+++ b/crypto/ivgenpriv.h
@@ -0,0 +1,49 @@
+/*
+ * QEMU Crypto block IV generator
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef QCRYPTO_IVGEN_PRIV_H__
+#define QCRYPTO_IVGEN_PRIV_H__
+
+#include "crypto/ivgen.h"
+
+typedef struct QCryptoIVGenDriver QCryptoIVGenDriver;
+
+struct QCryptoIVGenDriver {
+    int (*init)(QCryptoIVGen *ivgen,
+                const uint8_t *key, size_t nkey,
+                Error **errp);
+    int (*calculate)(QCryptoIVGen *ivgen,
+                     uint64_t sector,
+                     uint8_t *iv, size_t niv,
+                     Error **errp);
+    void (*cleanup)(QCryptoIVGen *ivgen);
+};
+
+struct QCryptoIVGen {
+    QCryptoIVGenDriver *driver;
+    void *private;
+
+    QCryptoIVGenAlgorithm algorithm;
+    QCryptoCipherAlgorithm cipher;
+    QCryptoHashAlgorithm hash;
+};
+
+
+#endif /* QCRYPTO_IVGEN_PRIV_H__ */
--- a/crypto/pbkdf-gcrypt.c
+++ b/crypto/pbkdf-gcrypt.c
@@ -0,0 +1,68 @@
+/*
+ * QEMU Crypto PBKDF support (Password-Based Key Derivation Function)
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/pbkdf.h"
+#include "gcrypt.h"
+
+bool qcrypto_pbkdf2_supports(QCryptoHashAlgorithm hash)
+{
+    switch (hash) {
+    case QCRYPTO_HASH_ALG_MD5:
+    case QCRYPTO_HASH_ALG_SHA1:
+    case QCRYPTO_HASH_ALG_SHA256:
+        return true;
+    default:
+        return false;
+    }
+}
+
+int qcrypto_pbkdf2(QCryptoHashAlgorithm hash,
+                   const uint8_t *key, size_t nkey,
+                   const uint8_t *salt, size_t nsalt,
+                   unsigned int iterations,
+                   uint8_t *out, size_t nout,
+                   Error **errp)
+{
+    static const int hash_map[QCRYPTO_HASH_ALG__MAX] = {
+        [QCRYPTO_HASH_ALG_MD5] = GCRY_MD_MD5,
+        [QCRYPTO_HASH_ALG_SHA1] = GCRY_MD_SHA1,
+        [QCRYPTO_HASH_ALG_SHA256] = GCRY_MD_SHA256,
+    };
+    int ret;
+
+    if (hash >= G_N_ELEMENTS(hash_map) ||
+        hash_map[hash] == GCRY_MD_NONE) {
+        error_setg(errp, "Unexpected hash algorithm %d", hash);
+        return -1;
+    }
+
+    ret = gcry_kdf_derive(key, nkey, GCRY_KDF_PBKDF2,
+                          hash_map[hash],
+                          salt, nsalt, iterations,
+                          nout, out);
+    if (ret != 0) {
+        error_setg(errp, "Cannot derive password: %s",
+                   gcry_strerror(ret));
+        return -1;
+    }
+
+    return 0;
+}
--- a/crypto/pbkdf-nettle.c
+++ b/crypto/pbkdf-nettle.c
@@ -0,0 +1,65 @@
+/*
+ * QEMU Crypto PBKDF support (Password-Based Key Derivation Function)
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/pbkdf.h"
+#include "nettle/pbkdf2.h"
+
+
+bool qcrypto_pbkdf2_supports(QCryptoHashAlgorithm hash)
+{
+    switch (hash) {
+    case QCRYPTO_HASH_ALG_SHA1:
+    case QCRYPTO_HASH_ALG_SHA256:
+        return true;
+    default:
+        return false;
+    }
+}
+
+int qcrypto_pbkdf2(QCryptoHashAlgorithm hash,
+                   const uint8_t *key, size_t nkey,
+                   const uint8_t *salt, size_t nsalt,
+                   unsigned int iterations,
+                   uint8_t *out, size_t nout,
+                   Error **errp)
+{
+    switch (hash) {
+    case QCRYPTO_HASH_ALG_SHA1:
+        pbkdf2_hmac_sha1(nkey, key,
+                         iterations,
+                         nsalt, salt,
+                         nout, out);
+        break;
+
+    case QCRYPTO_HASH_ALG_SHA256:
+        pbkdf2_hmac_sha256(nkey, key,
+                           iterations,
+                           nsalt, salt,
+                           nout, out);
+        break;
+
+    default:
+        error_setg_errno(errp, ENOSYS,
+                         "PBKDF does not support hash algorithm %d", hash);
+        return -1;
+    }
+    return 0;
+}
--- a/crypto/pbkdf-stub.c
+++ b/crypto/pbkdf-stub.c
@@ -0,0 +1,42 @@
+/*
+ * QEMU Crypto PBKDF support (Password-Based Key Derivation Function)
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/pbkdf.h"
+
+bool qcrypto_pbkdf2_supports(QCryptoHashAlgorithm hash G_GNUC_UNUSED)
+{
+    return false;
+}
+
+int qcrypto_pbkdf2(QCryptoHashAlgorithm hash G_GNUC_UNUSED,
+                   const uint8_t *key G_GNUC_UNUSED,
+                   size_t nkey G_GNUC_UNUSED,
+                   const uint8_t *salt G_GNUC_UNUSED,
+                   size_t nsalt G_GNUC_UNUSED,
+                   unsigned int iterations G_GNUC_UNUSED,
+                   uint8_t *out G_GNUC_UNUSED,
+                   size_t nout G_GNUC_UNUSED,
+                   Error **errp)
+{
+    error_setg_errno(errp, ENOSYS,
+                     "No crypto library supporting PBKDF in this build");
+    return -1;
+}
--- a/crypto/pbkdf.c
+++ b/crypto/pbkdf.c
@@ -0,0 +1,109 @@
+/*
+ * QEMU Crypto PBKDF support (Password-Based Key Derivation Function)
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/pbkdf.h"
+#ifndef _WIN32
+#include <sys/resource.h>
+#endif
+
+
+static int qcrypto_pbkdf2_get_thread_cpu(unsigned long long *val_ms,
+                                         Error **errp)
+{
+#ifdef _WIN32
+    FILETIME creation_time, exit_time, kernel_time, user_time;
+    ULARGE_INTEGER thread_time;
+
+    if (!GetThreadTimes(GetCurrentThread(), &creation_time, &exit_time,
+                        &kernel_time, &user_time)) {
+        error_setg(errp, "Unable to get thread CPU usage");
+        return -1;
+    }
+
+    thread_time.LowPart = user_time.dwLowDateTime;
+    thread_time.HighPart = user_time.dwHighDateTime;
+
+    /* QuadPart is units of 100ns and we want ms as unit */
+    *val_ms = thread_time.QuadPart / 10000ll;
+    return 0;
+#elif defined(RUSAGE_THREAD)
+    struct rusage ru;
+    if (getrusage(RUSAGE_THREAD, &ru) < 0) {
+        error_setg_errno(errp, errno, "Unable to get thread CPU usage");
+        return -1;
+    }
+
+    *val_ms = ((ru.ru_utime.tv_sec * 1000ll) +
+               (ru.ru_utime.tv_usec / 1000));
+    return 0;
+#else
+    *val_ms = 0;
+    error_setg(errp, "Unable to calculate thread CPU usage on this platform");
+    return -1;
+#endif
+}
+
+int qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash,
+                               const uint8_t *key, size_t nkey,
+                               const uint8_t *salt, size_t nsalt,
+                               Error **errp)
+{
+    uint8_t out[32];
+    long long int iterations = (1 << 15);
+    unsigned long long delta_ms, start_ms, end_ms;
+
+    while (1) {
+        if (qcrypto_pbkdf2_get_thread_cpu(&start_ms, errp) < 0) {
+            return -1;
+        }
+        if (qcrypto_pbkdf2(hash,
+                           key, nkey,
+                           salt, nsalt,
+                           iterations,
+                           out, sizeof(out),
+                           errp) < 0) {
+            return -1;
+        }
+        if (qcrypto_pbkdf2_get_thread_cpu(&end_ms, errp) < 0) {
+            return -1;
+        }
+
+        delta_ms = end_ms - start_ms;
+
+        if (delta_ms > 500) {
+            break;
+        } else if (delta_ms < 100) {
+            iterations = iterations * 10;
+        } else {
+            iterations = (iterations * 1000 / delta_ms);
+        }
+    }
+
+    iterations = iterations * 1000 / delta_ms;
+
+    if (iterations > INT32_MAX) {
+        error_setg(errp, "Iterations %lld too large for a 32-bit int",
+                   iterations);
+        return -1;
+    }
+
+    return iterations;
+}
--- a/crypto/random-gcrypt.c
+++ b/crypto/random-gcrypt.c
@@ -0,0 +1,33 @@
+/*
+ * QEMU Crypto random number provider
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "crypto/random.h"
+
+#include <gcrypt.h>
+
+int qcrypto_random_bytes(uint8_t *buf,
+                         size_t buflen,
+                         Error **errp G_GNUC_UNUSED)
+{
+    gcry_randomize(buf, buflen, GCRY_STRONG_RANDOM);
+    return 0;
+}
--- a/crypto/random-gnutls.c
+++ b/crypto/random-gnutls.c
@@ -0,0 +1,43 @@
+/*
+ * QEMU Crypto random number provider
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "crypto/random.h"
+
+#include <gnutls/gnutls.h>
+#include <gnutls/crypto.h>
+
+int qcrypto_random_bytes(uint8_t *buf,
+                         size_t buflen,
+                         Error **errp)
+{
+    int ret;
+
+    ret = gnutls_rnd(GNUTLS_RND_RANDOM, buf, buflen);
+
+    if (ret < 0) {
+        error_setg(errp, "Cannot get random bytes: %s",
+                   gnutls_strerror(ret));
+        return -1;
+    }
+
+    return 0;
+}
--- a/crypto/random-stub.c
+++ b/crypto/random-stub.c
@@ -0,0 +1,31 @@
+/*
+ * QEMU Crypto random number provider
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "crypto/random.h"
+
+int qcrypto_random_bytes(uint8_t *buf G_GNUC_UNUSED,
+                         size_t buflen G_GNUC_UNUSED,
+                         Error **errp)
+{
+    error_setg(errp, "No random byte source provided in this build");
+    return -1;
+}
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -0,0 +1,230 @@
+/*
+ * QEMU Crypto XTS cipher mode
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * This code is originally derived from public domain / WTFPL code in
+ * LibTomCrypt cryptographic library http://libtom.org. The XTS code
+ * was donated by Elliptic Semiconductor Inc (www.ellipticsemi.com)
+ * to the LibTom Projects
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/xts.h"
+
+static void xts_mult_x(uint8_t *I)
+{
+    int x;
+    uint8_t t, tt;
+
+    for (x = t = 0; x < 16; x++) {
+        tt = I[x] >> 7;
+        I[x] = ((I[x] << 1) | t) & 0xFF;
+        t = tt;
+    }
+    if (tt) {
+        I[0] ^= 0x87;
+    }
+}
+
+
+/**
+ * xts_tweak_decrypt:
+ * @ctx: the cipher context
+ * @func: the cipher function
+ * @src: buffer providing the cipher text of XTS_BLOCK_SIZE bytes
+ * @dst: buffer to output the plain text of XTS_BLOCK_SIZE bytes
+ * @iv: the initialization vector tweak of XTS_BLOCK_SIZE bytes
+ *
+ * Decrypt one block with a tweak, then advance @iv for the next block
+ */
+static void xts_tweak_decrypt(const void *ctx,
+                              xts_cipher_func *func,
+                              const uint8_t *src,
+                              uint8_t *dst,
+                              uint8_t *iv)
+{
+    unsigned long x;
+
+    /* tweak decrypt block i */
+    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
+        dst[x] = src[x] ^ iv[x];
+    }
+
+    func(ctx, XTS_BLOCK_SIZE, dst, dst);
+
+    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
+        dst[x] = dst[x] ^ iv[x];
+    }
+
+    /* LFSR the tweak */
+    xts_mult_x(iv);
+}
+
+
+void xts_decrypt(const void *datactx,
+                 const void *tweakctx,
+                 xts_cipher_func *encfunc,
+                 xts_cipher_func *decfunc,
+                 uint8_t *iv,
+                 size_t length,
+                 uint8_t *dst,
+                 const uint8_t *src)
+{
+    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
+    unsigned long i, m, mo, lim;
+
+    /* get number of blocks */
+    m = length >> 4;
+    mo = length & 15;
+
+    /* must have at least one full block */
+    g_assert(m != 0);
+
+    if (mo == 0) {
+        lim = m;
+    } else {
+        lim = m - 1;
+    }
+
+    /* encrypt the iv */
+    encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
+
+    for (i = 0; i < lim; i++) {
+        xts_tweak_decrypt(datactx, decfunc, src, dst, T);
+
+        src += XTS_BLOCK_SIZE;
+        dst += XTS_BLOCK_SIZE;
+    }
+
+    /* if length is not a multiple of XTS_BLOCK_SIZE then */
+    if (mo > 0) {
+        memcpy(CC, T, XTS_BLOCK_SIZE);
+        xts_mult_x(CC);
+
+        /* PP = tweak decrypt block m-1 */
+        xts_tweak_decrypt(datactx, decfunc, src, PP, CC);
+
+        /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
+        for (i = 0; i < mo; i++) {
+            CC[i] = src[XTS_BLOCK_SIZE + i];
+            dst[XTS_BLOCK_SIZE + i] = PP[i];
+        }
+        for (; i < XTS_BLOCK_SIZE; i++) {
+            CC[i] = PP[i];
+        }
+
+        /* Pm-1 = Tweak uncrypt CC */
+        xts_tweak_decrypt(datactx, decfunc, CC, dst, T);
+    }
+
+    /* Decrypt the iv back */
+    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, T);
+}
+
+
+/**
+ * xts_tweak_encrypt:
+ * @ctx: the cipher context
+ * @func: the cipher function
+ * @src: buffer providing the plain text of XTS_BLOCK_SIZE bytes
+ * @dst: buffer to output the cipher text of XTS_BLOCK_SIZE bytes
+ * @iv: the initialization vector tweak of XTS_BLOCK_SIZE bytes
+ *
+ * Encrypt one block with a tweak, then advance @iv for the next block
+ */
+static void xts_tweak_encrypt(const void *ctx,
+                              xts_cipher_func *func,
+                              const uint8_t *src,
+                              uint8_t *dst,
+                              uint8_t *iv)
+{
+    unsigned long x;
+
+    /* tweak encrypt block i */
+    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
+        dst[x] = src[x] ^ iv[x];
+    }
+
+    func(ctx, XTS_BLOCK_SIZE, dst, dst);
+
+    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
+        dst[x] = dst[x] ^ iv[x];
+    }
+
+    /* LFSR the tweak */
+    xts_mult_x(iv);
+}
+
+
+void xts_encrypt(const void *datactx,
+                 const void *tweakctx,
+                 xts_cipher_func *encfunc,
+                 xts_cipher_func *decfunc,
+                 uint8_t *iv,
+                 size_t length,
+                 uint8_t *dst,
+                 const uint8_t *src)
+{
+    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
+    unsigned long i, m, mo, lim;
+
+    /* get number of blocks */
+    m = length >> 4;
+    mo = length & 15;
+
+    /* must have at least one full block */
+    g_assert(m != 0);
+
+    if (mo == 0) {
+        lim = m;
+    } else {
+        lim = m - 1;
+    }
+
+    /* encrypt the iv */
+    encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
+
+    for (i = 0; i < lim; i++) {
+        xts_tweak_encrypt(datactx, encfunc, src, dst, T);
+
+        dst += XTS_BLOCK_SIZE;
+        src += XTS_BLOCK_SIZE;
+    }
+
+    /* if length is not a multiple of XTS_BLOCK_SIZE then */
+    if (mo > 0) {
+        /* CC = tweak encrypt block m-1 */
+        xts_tweak_encrypt(datactx, encfunc, src, CC, T);
+
+        /* Cm = first length % XTS_BLOCK_SIZE bytes of CC */
+        for (i = 0; i < mo; i++) {
+            PP[i] = src[XTS_BLOCK_SIZE + i];
+            dst[XTS_BLOCK_SIZE + i] = CC[i];
+        }
+
+        for (; i < XTS_BLOCK_SIZE; i++) {
+            PP[i] = CC[i];
+        }
+
+        /* Cm-1 = Tweak encrypt PP */
+        xts_tweak_encrypt(datactx, encfunc, PP, dst, T);
+    }
+
+    /* Decrypt the iv back */
+    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, T);
+}
--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -110,3 +110,4 @@ CONFIG_IOH3420=y
 CONFIG_I82801B11=y
 CONFIG_ACPI=y
 CONFIG_SMBIOS=y
+CONFIG_ASPEED_SOC=y
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -36,5 +36,5 @@ CONFIG_SDHCI=y
 CONFIG_EDU=y
 CONFIG_VGA=y
 CONFIG_VGA_PCI=y
-CONFIG_IVSHMEM=$(CONFIG_POSIX)
+CONFIG_IVSHMEM=$(CONFIG_EVENTFD)
 CONFIG_ROCKER=y
--- a/device-hotplug.c
+++ b/device-hotplug.c
@@ -30,6 +30,7 @@
 #include "qemu/config-file.h"
 #include "sysemu/sysemu.h"
 #include "monitor/monitor.h"
+#include "block/block_int.h"

 static DriveInfo *add_init_drive(const char *optstr)
 {
@@ -55,6 +56,12 @@ void hmp_drive_add(Monitor *mon, const QDict *qdict)
 {
    DriveInfo *dinfo = NULL;
    const char *opts = qdict_get_str(qdict, "opts");
+    bool node = qdict_get_try_bool(qdict, "node", false);
+
+    if (node) {
+        hmp_drive_add_node(mon, opts);
+        return;
+    }

    dinfo = add_init_drive(opts);
    if (!dinfo) {
@@ -77,6 +84,8 @@ void hmp_drive_add(Monitor *mon, const QDict *qdict)

 err:
    if (dinfo) {
-        blk_unref(blk_by_legacy_dinfo(dinfo));
+        BlockBackend *blk = blk_by_legacy_dinfo(dinfo);
+        monitor_remove_blk(blk);
+        blk_unref(blk);
    }
 }
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -153,8 +153,6 @@
 /* opcodes/i386-dis.c r1.126 */
 #include "qemu-common.h"

-#include <setjmp.h>
-
 static int fetch_data2(struct disassemble_info *, bfd_byte *);
 static int fetch_data(struct disassemble_info *, bfd_byte *);
 static void ckprefix (void);
--- a/disas/m68k.c
+++ b/disas/m68k.c
@@ -615,8 +615,6 @@ static const char *const reg_half_names[] =
 /* Maximum length of an instruction.  */
 #define MAXLEN 22

-#include <setjmp.h>
-
 struct private
 {
  /* Points to first byte not fetched.  */
--- a/docs/memory.txt
+++ b/docs/memory.txt
@@ -180,8 +180,8 @@ aliases that leave holes then the lower priority region will appear in these
 holes too.)

 For example, suppose we have a container A of size 0x8000 with two subregions
-B and C. B is a container mapped at 0x2000, size 0x4000, priority 1; C is
-an MMIO region mapped at 0x0, size 0x6000, priority 2. B currently has two
+B and C. B is a container mapped at 0x2000, size 0x4000, priority 2; C is
+an MMIO region mapped at 0x0, size 0x6000, priority 1. B currently has two
 of its own subregions: D of size 0x1000 at offset 0 and E of size 0x1000 at
 offset 0x2000. As a diagram:

@@ -297,8 +297,9 @@ various constraints can be supplied to control how these callbacks are called:
 - .valid.min_access_size, .valid.max_access_size define the access sizes
   (in bytes) which the device accepts; accesses outside this range will
   have device and bus specific behaviour (ignored, or machine check)
- - .valid.aligned specifies that the device only accepts naturally aligned
-   accesses.  Unaligned accesses invoke device and bus specific behaviour.
+ - .valid.unaligned specifies that the *device being modelled* supports
+    unaligned accesses; if false, unaligned accesses will invoke the
+    appropriate bus or CPU specific behaviour.
 - .impl.min_access_size, .impl.max_access_size define the access sizes
   (in bytes) supported by the *implementation*; other access sizes will be
   emulated using the ones available.  For example a 4-byte write will be
@@ -306,5 +307,5 @@ various constraints can be supplied to control how these callbacks are called:
 - .impl.unaligned specifies that the *implementation* supports unaligned
   accesses; if false, unaligned accesses will be emulated by two aligned
   accesses.
- - .old_mmio can be used to ease porting from code using
+ - .old_mmio eases the porting of code that was formerly using
   cpu_register_io_memory(). It should not be used in new code.
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -333,7 +333,7 @@ doesn't finish in a given time the switch is made to postcopy.
 To enable postcopy, issue this command on the monitor prior to the
 start of migration:

-migrate_set_capability x-postcopy-ram on
+migrate_set_capability postcopy-ram on

 The normal commands are then used to start a migration, which is still
 started in precopy mode.  Issuing:
--- a/docs/pci_expander_bridge.txt
+++ b/docs/pci_expander_bridge.txt
@@ -24,8 +24,8 @@ A detailed command line would be:
 -object memory-backend-ram,size=1024M,policy=bind,host-nodes=0,id=ram-node0 -numa node,nodeid=0,cpus=0,memdev=ram-node0
 -object memory-backend-ram,size=1024M,policy=bind,host-nodes=1,id=ram-node1 -numa node,nodeid=1,cpus=1,memdev=ram-node1
 -device pxb,id=bridge1,bus=pci.0,numa_node=1,bus_nr=4 -netdev user,id=nd -device e1000,bus=bridge1,addr=0x4,netdev=nd
-device pxb,id=bridge2,bus=pci.0,numa_node=0,bus_nr=8, -device e1000,bus=bridge2,addr=0x3
-device pxb,id=bridge3,bus=pci.0,bus_nr=40, -drive if=none,id=drive0,file=[img] -device virtio-blk-pci,drive=drive0,scsi=off,bus=bridge3,addr=1
+-device pxb,id=bridge2,bus=pci.0,numa_node=0,bus_nr=8 -device e1000,bus=bridge2,addr=0x3
+-device pxb,id=bridge3,bus=pci.0,bus_nr=40 -drive if=none,id=drive0,file=[img] -device virtio-blk-pci,drive=drive0,scsi=off,bus=bridge3,addr=1

 Here you have:
 - 2 NUMA nodes for the guest, 0 and 1. (both mapped to the same NUMA node in host, but you can and should put it in different host NUMA nodes)
@@ -43,7 +43,7 @@ Implementation
 ==============
 The PXB is composed by:
 - HostBridge (TYPE_PXB_HOST)
-  The host bridge allows to register and query the PXB's rPCI root bus in QEMU.
+  The host bridge allows to register and query the PXB's PCI root bus in QEMU.
 - PXBDev(TYPE_PXB_DEVICE)
  It is a regular PCI Device that resides on the piix host-bridge bus and its bus uses the same PCI domain.
  However, the bus behind is exposed through ACPI as a primary PCI bus and starts a new PCI hierarchy.
--- a/docs/qapi-code-gen.txt
+++ b/docs/qapi-code-gen.txt
@@ -1,7 +1,7 @@
 = How to use the QAPI code generator =

 Copyright IBM Corp. 2011
-Copyright (C) 2012-2015 Red Hat, Inc.
+Copyright (C) 2012-2016 Red Hat, Inc.

 This work is licensed under the terms of the GNU GPL, version 2 or
 later. See the COPYING file in the top-level directory.
@@ -52,7 +52,7 @@ schema.  The documentation is delimited between two lines of ##, then
 the first line names the expression, an optional overview is provided,
 then individual documentation about each member of 'data' is provided,
 and finally, a 'Since: x.y.z' tag lists the release that introduced
-the expression.  Optional fields are tagged with the phrase
+the expression.  Optional members are tagged with the phrase
 '#optional', often with their default value; and extensions added
 after the expression was first released are also given a '(since
 x.y.z)' comment.  For example:
@@ -108,15 +108,15 @@ user-defined type names, while built-in types are lowercase. Type
 definitions should not end in 'Kind', as this namespace is used for
 creating implicit C enums for visiting union types, or in 'List', as
 this namespace is used for creating array types.  Command names,
-and field names within a type, should be all lower case with words
+and member names within a type, should be all lower case with words
 separated by a hyphen.  However, some existing older commands and
 complex types use underscore; when extending such expressions,
 consistency is preferred over blindly avoiding underscore.  Event
-names should be ALL_CAPS with words separated by underscore.  Field
+names should be ALL_CAPS with words separated by underscore.  Member
 names cannot start with 'has-' or 'has_', as this is reserved for
-tracking optional fields.
+tracking optional members.

-Any name (command, event, type, field, or enum value) beginning with
+Any name (command, event, type, member, or enum value) beginning with
 "x-" is marked experimental, and may be withdrawn or changed
 incompatibly in a future release.  All names must begin with a letter,
 and contain only ASCII letters, digits, dash, and underscore.  There
@@ -127,7 +127,7 @@ the vendor), even if the rest of the name uses dash (example:
 __com.redhat_drive-mirror).  Names beginning with 'q_' are reserved
 for the generator: QMP names that resemble C keywords or other
 problematic strings will be munged in C to use this prefix.  For
-example, a field named "default" in qapi becomes "q_default" in the
+example, a member named "default" in qapi becomes "q_default" in the
 generated C code.

 In the rest of this document, usage lines are given for each
@@ -217,17 +217,18 @@ and must continue to work).

 On output structures (only mentioned in the 'returns' side of a command),
 changing from mandatory to optional is in general unsafe (older clients may be
-expecting the field, and could crash if it is missing), although it can be done
-if the only way that the optional argument will be omitted is when it is
-triggered by the presence of a new input flag to the command that older clients
-don't know to send.  Changing from optional to mandatory is safe.
+expecting the member, and could crash if it is missing), although it
+can be done if the only way that the optional argument will be omitted
+is when it is triggered by the presence of a new input flag to the
+command that older clients don't know to send.  Changing from optional
+to mandatory is safe.

 A structure that is used in both input and output of various commands
 must consider the backwards compatibility constraints of both directions
 of use.

 A struct definition can specify another struct as its base.
-In this case, the fields of the base type are included as top-level fields
+In this case, the members of the base type are included as top-level members
 of the new struct's dictionary in the Client JSON Protocol wire
 format. An example definition is:

@@ -237,7 +238,7 @@ format. An example definition is:
   'data': { '*backing': 'str' } }

 An example BlockdevOptionsGenericCOWFormat object on the wire could use
-both fields like this:
+both members like this:

 { "file": "/some/place/my-image",
   "backing": "/some/place/my-backing-file" }
@@ -262,7 +263,7 @@ The enum constants will be named by using a heuristic to turn the
 type name into a set of underscore separated words. For the example
 above, 'MyEnum' will turn into 'MY_ENUM' giving a constant name
 of 'MY_ENUM_VALUE1' for the first value. If the default heuristic
-does not result in a desirable name, the optional 'prefix' field
+does not result in a desirable name, the optional 'prefix' member
 can be used when defining the enum.

 The enumeration values are passed as strings over the Client JSON
@@ -275,15 +276,15 @@ converting between strings and enum values.  Since the wire format
 always passes by name, it is acceptable to reorder or add new
 enumeration members in any location without breaking clients of Client
 JSON Protocol; however, removing enum values would break
-compatibility.  For any struct that has a field that will only contain
-a finite set of string values, using an enum type for that field is
-better than open-coding the field to be type 'str'.
+compatibility.  For any struct that has a member that will only contain
+a finite set of string values, using an enum type for that member is
+better than open-coding the member to be type 'str'.


 === Union types ===

 Usage: { 'union': STRING, 'data': DICT }
-or:    { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME,
+or:    { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME-OR-DICT,
         'discriminator': ENUM-MEMBER-OF-BASE }

 Union types are used to let the user choose between several different
@@ -296,22 +297,22 @@ be empty.
 A simple union type defines a mapping from automatic discriminator
 values to data types like in this example:

- { 'struct': 'FileOptions', 'data': { 'filename': 'str' } }
- { 'struct': 'Qcow2Options',
-   'data': { 'backing-file': 'str', 'lazy-refcounts': 'bool' } }
+ { 'struct': 'BlockdevOptionsFile', 'data': { 'filename': 'str' } }
+ { 'struct': 'BlockdevOptionsQcow2',
+   'data': { 'backing': 'str', '*lazy-refcounts': 'bool' } }

- { 'union': 'BlockdevOptions',
-   'data': { 'file': 'FileOptions',
-             'qcow2': 'Qcow2Options' } }
+ { 'union': 'BlockdevOptionsSimple',
+   'data': { 'file': 'BlockdevOptionsFile',
+             'qcow2': 'BlockdevOptionsQcow2' } }

 In the Client JSON Protocol, a simple union is represented by a
-dictionary that contains the 'type' field as a discriminator, and a
-'data' field that is of the specified data type corresponding to the
+dictionary that contains the 'type' member as a discriminator, and a
+'data' member that is of the specified data type corresponding to the
 discriminator value, as in these examples:

- { "type": "file", "data" : { "filename": "/some/place/my-image" } }
- { "type": "qcow2", "data" : { "backing-file": "/some/place/my-image",
-                               "lazy-refcounts": true } }
+ { "type": "file", "data": { "filename": "/some/place/my-image" } }
+ { "type": "qcow2", "data": { "backing": "/some/place/my-image",
+                              "lazy-refcounts": true } }

 The generated C code uses a struct containing a union. Additionally,
 an implicit C enum 'NameKind' is created, corresponding to the union
@@ -319,42 +320,43 @@ an implicit C enum 'NameKind' is created, corresponding to the union
 the union can be named 'max', as this would collide with the implicit
 enum.  The value for each branch can be of any type.

-A flat union definition specifies a struct as its base, and
-avoids nesting on the wire.  All branches of the union must be
-complex types, and the top-level fields of the union dictionary on
-the wire will be combination of fields from both the base type and the
-appropriate branch type (when merging two dictionaries, there must be
-no keys in common).  The 'discriminator' field must be the name of an
-enum-typed member of the base struct.
+A flat union definition avoids nesting on the wire, and specifies a
+set of common members that occur in all variants of the union.  The
+'base' key must specify either a type name (the type must be a
+struct, not a union), or a dictionary representing an anonymous type.
+All branches of the union must be complex types, and the top-level
+members of the union dictionary on the wire will be a combination of
+members from both the base type and the appropriate branch type (when
+merging two dictionaries, there must be no keys in common).  The
+'discriminator' member must be the name of a non-optional enum-typed
+member of the base struct.

 The following example enhances the above simple union example by
-adding a common field 'readonly', renaming the discriminator to
-something more applicable, and reducing the number of {} required on
-the wire:
+adding an optional common member 'read-only', renaming the
+discriminator to something more applicable than the simple union's
+default of 'type', and reducing the number of {} required on the wire:

 { 'enum': 'BlockdevDriver', 'data': [ 'file', 'qcow2' ] }
- { 'struct': 'BlockdevCommonOptions',
-   'data': { 'driver': 'BlockdevDriver', 'readonly': 'bool' } }
 { 'union': 'BlockdevOptions',
-   'base': 'BlockdevCommonOptions',
+   'base': { 'driver': 'BlockdevDriver', '*read-only': 'bool' },
   'discriminator': 'driver',
-   'data': { 'file': 'FileOptions',
-             'qcow2': 'Qcow2Options' } }
+   'data': { 'file': 'BlockdevOptionsFile',
+             'qcow2': 'BlockdevOptionsQcow2' } }

 Resulting in these JSON objects:

- { "driver": "file", "readonly": true,
+ { "driver": "file", "read-only": true,
   "filename": "/some/place/my-image" }
- { "driver": "qcow2", "readonly": false,
-   "backing-file": "/some/place/my-image", "lazy-refcounts": true }
+ { "driver": "qcow2", "read-only": false,
+   "backing": "/some/place/my-image", "lazy-refcounts": true }

 Notice that in a flat union, the discriminator name is controlled by
 the user, but because it must map to a base member with enum type, the
 code generator can ensure that branches exist for all values of the
 enum (although the order of the keys need not match the declaration of
 the enum).  In the resulting generated C data types, a flat union is
-represented as a struct with the base member fields included directly,
-and then a union of structures for each branch of the struct.
+represented as a struct with the base members included directly, and
+then a union of structures for each branch of the struct.

 A simple union can always be re-written as a flat union where the base
 class has a single member named 'type', and where each branch of the
@@ -365,10 +367,9 @@ union has a struct with a single member named 'data'.  That is,
 is identical on the wire to:

 { 'enum': 'Enum', 'data': ['one', 'two'] }
- { 'struct': 'Base', 'data': { 'type': 'Enum' } }
 { 'struct': 'Branch1', 'data': { 'data': 'str' } }
 { 'struct': 'Branch2', 'data': { 'data': 'int' } }
- { 'union': 'Flat', 'base': 'Base', 'discriminator': 'type',
+ { 'union': 'Flat', 'base': { 'type': 'Enum' }, 'discriminator': 'type',
   'data': { 'one': 'Branch1', 'two': 'Branch2' } }


@@ -381,7 +382,7 @@ data types (string, integer, number, or object, but currently not
 array) on the wire.  The definition is similar to a simple union type,
 where each branch of the union names a QAPI type.  For example:

- { 'alternate': 'BlockRef',
+ { 'alternate': 'BlockdevRef',
   'data': { 'definition': 'BlockdevOptions',
             'reference': 'str' } }

@@ -402,7 +403,7 @@ following example objects:

 { "file": "my_existing_block_device_id" }
 { "file": { "driver": "file",
-             "readonly": false,
+             "read-only": false,
             "filename": "/tmp/mydisk.qcow2" } }


@@ -424,10 +425,10 @@ string name of a complex type, or a dictionary that declares an
 anonymous type with the same semantics as a 'struct' expression, with
 one exception noted below when 'gen' is used.

-The 'returns' member describes what will appear in the "return" field
+The 'returns' member describes what will appear in the "return" member
 of a Client JSON Protocol reply on successful completion of a command.
 The member is optional from the command declaration; if absent, the
-"return" field will be an empty dictionary.  If 'returns' is present,
+"return" member will be an empty dictionary.  If 'returns' is present,
 it must be the string name of a complex or built-in type, a
 one-element array containing the name of a complex or built-in type,
 with one exception noted below when 'gen' is used.  Although it is
@@ -435,7 +436,7 @@ permitted to have the 'returns' member name a built-in type or an
 array of built-in types, any command that does this cannot be extended
 to return additional information in the future; thus, new commands
 should strongly consider returning a dictionary-based type or an array
-of dictionaries, even if the dictionary only contains one field at the
+of dictionaries, even if the dictionary only contains one member at the
 present.

 All commands in Client JSON Protocol use a dictionary to report
@@ -478,7 +479,7 @@ response is not possible (although the command will still return a
 normal dictionary error on failure).  When a successful reply is not
 possible, the command expression should include the optional key
 'success-response' with boolean value false.  So far, only QGA makes
-use of this field.
+use of this member.


 === Events ===
@@ -574,9 +575,9 @@ names an object type without members.
 Example: the SchemaInfo for command query-qmp-schema

    { "name": "query-qmp-schema", "meta-type": "command",
-      "arg-type": ":empty", "ret-type": "SchemaInfoList" }
+      "arg-type": "q_empty", "ret-type": "SchemaInfoList" }

-    Type ":empty" is an object type without members, and type
+    Type "q_empty" is an automatic object type without members, and type
    "SchemaInfoList" is the array of SchemaInfo type.

 The SchemaInfo for an event has meta-type "event", and variant member
@@ -593,9 +594,9 @@ QAPI schema implicitly defines an object type.
 Example: the SchemaInfo for EVENT_C from section Events

    { "name": "EVENT_C", "meta-type": "event",
-      "arg-type": ":obj-EVENT_C-arg" }
+      "arg-type": "q_obj-EVENT_C-arg" }

-    Type ":obj-EVENT_C-arg" is an implicitly defined object type with
+    Type "q_obj-EVENT_C-arg" is an implicitly defined object type with
    the two members from the event's definition.

 The SchemaInfo for struct and union types has meta-type "object".
@@ -636,11 +637,11 @@ Union types
    { "name": "BlockdevOptions", "meta-type": "object",
      "members": [
          { "name": "driver", "type": "BlockdevDriver" },
-          { "name": "readonly", "type": "bool"} ],
+          { "name": "read-only", "type": "bool", "default": null } ],
      "tag": "driver",
      "variants": [
-          { "case": "file", "type": "FileOptions" },
-          { "case": "qcow2", "type": "Qcow2Options" } ] }
+          { "case": "file", "type": "BlockdevOptionsFile" },
+          { "case": "qcow2", "type": "BlockdevOptionsQcow2" } ] }

 Note that base types are "flattened": its members are included in the
 "members" array.
@@ -651,20 +652,20 @@ discriminator (called "type" on the wire, see section Union types).
 A simple union implicitly defines an object type for each of its
 variants.

-Example: the SchemaInfo for simple union BlockdevOptions from section
+Example: the SchemaInfo for simple union BlockdevOptionsSimple from section
 Union types

-    { "name": "BlockdevOptions", "meta-type": "object",
+    { "name": "BlockdevOptionsSimple", "meta-type": "object",
      "members": [
-          { "name": "kind", "type": "BlockdevOptionsKind" } ],
+          { "name": "type", "type": "BlockdevOptionsSimpleKind" } ],
      "tag": "type",
      "variants": [
-          { "case": "file", "type": ":obj-FileOptions-wrapper" },
-          { "case": "qcow2", "type": ":obj-Qcow2Options-wrapper" } ] }
+          { "case": "file", "type": "q_obj-BlockdevOptionsFile-wrapper" },
+          { "case": "qcow2", "type": "q_obj-BlockdevOptionsQcow2-wrapper" } ] }

-    Enumeration type "BlockdevOptionsKind" and the object types
-    ":obj-FileOptions-wrapper", ":obj-Qcow2Options-wrapper" are
-    implicitly defined.
+    Enumeration type "BlockdevOptionsSimpleKind" and the object types
+    "q_obj-BlockdevOptionsFile-wrapper", "q_obj-BlockdevOptionsQcow2-wrapper"
+    are implicitly defined.

 The SchemaInfo for an alternate type has meta-type "alternate", and
 variant member "members".  "members" is a JSON array.  Each element is
@@ -672,9 +673,9 @@ a JSON object with member "type", which names a type.  Values of the
 alternate type conform to exactly one of its member types.  There is
 no guarantee on the order in which "members" will be listed.

-Example: the SchemaInfo for BlockRef from section Alternate types
+Example: the SchemaInfo for BlockdevRef from section Alternate types

-    { "name": "BlockRef", "meta-type": "alternate",
+    { "name": "BlockdevRef", "meta-type": "alternate",
      "members": [
          { "type": "BlockdevOptions" },
          { "type": "str" } ] }
@@ -722,33 +723,38 @@ the names of built-in types.  Clients should examine member

 == Code generation ==

-Schemas are fed into four scripts to generate all the code/files that,
+Schemas are fed into five scripts to generate all the code/files that,
 paired with the core QAPI libraries, comprise everything required to
 take JSON commands read in by a Client JSON Protocol server, unmarshal
 the arguments into the underlying C types, call into the corresponding
-C function, and map the response back to a Client JSON Protocol
-response to be returned to the user.
+C function, map the response back to a Client JSON Protocol response
+to be returned to the user, and introspect the commands.

-As an example, we'll use the following schema, which describes a single
-complex user-defined type (which will produce a C struct, along with a list
-node structure that can be used to chain together a list of such types in
-case we want to accept/return a list of this type with a command), and a
-command which takes that type as a parameter and returns the same type:
+As an example, we'll use the following schema, which describes a
+single complex user-defined type, along with a command which takes a
+list of that type as a parameter, and returns a single element of that
+type.  The user is responsible for writing the implementation of
+qmp_my_command(); everything else is produced by the generator.

    $ cat example-schema.json
    { 'struct': 'UserDefOne',
-      'data': { 'integer': 'int', 'string': 'str' } }
+      'data': { 'integer': 'int', '*string': 'str' } }

    { 'command': 'my-command',
-      'data':    {'arg1': 'UserDefOne'},
+      'data': { 'arg1': ['UserDefOne'] },
      'returns': 'UserDefOne' }

    { 'event': 'MY_EVENT' }

+For a more thorough look at generated code, the testsuite includes
+tests/qapi-schema/qapi-schema-tests.json that covers more examples of
+what the generator will accept, and compiles the resulting C code as
+part of 'make check-unit'.
+
 === scripts/qapi-types.py ===

-Used to generate the C types defined by a schema. The following files are
-created:
+Used to generate the C types defined by a schema, along with
+supporting code. The following files are created:

 $(prefix)qapi-types.h - C types corresponding to types defined in
                        the schema you pass in
@@ -763,38 +769,6 @@ Example:

    $ python scripts/qapi-types.py --output-dir="qapi-generated" \
    --prefix="example-" example-schema.json
-    $ cat qapi-generated/example-qapi-types.c
-[Uninteresting stuff omitted...]
-
-    void qapi_free_UserDefOne(UserDefOne *obj)
-    {
-        QapiDeallocVisitor *qdv;
-        Visitor *v;
-
-        if (!obj) {
-            return;
-        }
-
-        qdv = qapi_dealloc_visitor_new();
-        v = qapi_dealloc_get_visitor(qdv);
-        visit_type_UserDefOne(v, &obj, NULL, NULL);
-        qapi_dealloc_visitor_cleanup(qdv);
-    }
-
-    void qapi_free_UserDefOneList(UserDefOneList *obj)
-    {
-        QapiDeallocVisitor *qdv;
-        Visitor *v;
-
-        if (!obj) {
-            return;
-        }
-
-        qdv = qapi_dealloc_visitor_new();
-        v = qapi_dealloc_get_visitor(qdv);
-        visit_type_UserDefOneList(v, &obj, NULL, NULL);
-        qapi_dealloc_visitor_cleanup(qdv);
-    }
    $ cat qapi-generated/example-qapi-types.h
 [Uninteresting stuff omitted...]

@@ -809,29 +783,59 @@ Example:

    struct UserDefOne {
        int64_t integer;
+        bool has_string;
        char *string;
    };

    void qapi_free_UserDefOne(UserDefOne *obj);

    struct UserDefOneList {
-        union {
-            UserDefOne *value;
-            uint64_t padding;
-        };
        UserDefOneList *next;
+        UserDefOne *value;
    };

    void qapi_free_UserDefOneList(UserDefOneList *obj);

    #endif
+    $ cat qapi-generated/example-qapi-types.c
+[Uninteresting stuff omitted...]
+
+    void qapi_free_UserDefOne(UserDefOne *obj)
+    {
+        QapiDeallocVisitor *qdv;
+        Visitor *v;
+
+        if (!obj) {
+            return;
+        }
+
+        qdv = qapi_dealloc_visitor_new();
+        v = qapi_dealloc_get_visitor(qdv);
+        visit_type_UserDefOne(v, NULL, &obj, NULL);
+        qapi_dealloc_visitor_cleanup(qdv);
+    }
+
+    void qapi_free_UserDefOneList(UserDefOneList *obj)
+    {
+        QapiDeallocVisitor *qdv;
+        Visitor *v;
+
+        if (!obj) {
+            return;
+        }
+
+        qdv = qapi_dealloc_visitor_new();
+        v = qapi_dealloc_get_visitor(qdv);
+        visit_type_UserDefOneList(v, NULL, &obj, NULL);
+        qapi_dealloc_visitor_cleanup(qdv);
+    }

 === scripts/qapi-visit.py ===

-Used to generate the visitor functions used to walk through and convert
-a QObject (as provided by QMP) to a native C data structure and
-vice-versa, as well as the visitor function used to dealloc a complex
-schema-defined C type.
+Used to generate the visitor functions used to walk through and
+convert between a native QAPI C data structure and some other format
+(such as QObject); the generated functions are named visit_type_FOO()
+and visit_type_FOO_members().

 The following files are generated:

@@ -848,41 +852,62 @@ Example:

    $ python scripts/qapi-visit.py --output-dir="qapi-generated"
    --prefix="example-" example-schema.json
+    $ cat qapi-generated/example-qapi-visit.h
+[Uninteresting stuff omitted...]
+
+    #ifndef EXAMPLE_QAPI_VISIT_H
+    #define EXAMPLE_QAPI_VISIT_H
+
+[Visitors for built-in types omitted...]
+
+    void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp);
+    void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp);
+    void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp);
+
+    #endif
    $ cat qapi-generated/example-qapi-visit.c
 [Uninteresting stuff omitted...]

-    static void visit_type_UserDefOne_fields(Visitor *v, UserDefOne **obj, Error **errp)
+    void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp)
    {
        Error *err = NULL;

-        visit_type_int(v, &(*obj)->integer, "integer", &err);
+        visit_type_int(v, "integer", &obj->integer, &err);
        if (err) {
            goto out;
        }
-        visit_type_str(v, &(*obj)->string, "string", &err);
-        if (err) {
-            goto out;
+        if (visit_optional(v, "string", &obj->has_string)) {
+            visit_type_str(v, "string", &obj->string, &err);
+            if (err) {
+                goto out;
+            }
        }

    out:
        error_propagate(errp, err);
    }

-    void visit_type_UserDefOne(Visitor *v, UserDefOne **obj, const char *name, Error **errp)
+    void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp)
    {
        Error *err = NULL;

-        visit_start_struct(v, (void **)obj, "UserDefOne", name, sizeof(UserDefOne), &err);
-        if (!err) {
-            if (*obj) {
-                visit_type_UserDefOne_fields(v, obj, errp);
-            }
-            visit_end_struct(v, &err);
+        visit_start_struct(v, name, (void **)obj, sizeof(UserDefOne), &err);
+        if (err) {
+            goto out;
        }
+        if (!*obj) {
+            goto out_obj;
+        }
+        visit_type_UserDefOne_members(v, *obj, &err);
+        error_propagate(errp, err);
+        err = NULL;
+    out_obj:
+        visit_end_struct(v, &err);
+    out:
        error_propagate(errp, err);
    }

-    void visit_type_UserDefOneList(Visitor *v, UserDefOneList **obj, const char *name, Error **errp)
+    void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp)
    {
        Error *err = NULL;
        GenericList *i, **prev;
@@ -893,35 +918,24 @@ Example:
        }

        for (prev = (GenericList **)obj;
-             !err && (i = visit_next_list(v, prev, &err)) != NULL;
+             !err && (i = visit_next_list(v, prev, sizeof(**obj))) != NULL;
             prev = &i) {
            UserDefOneList *native_i = (UserDefOneList *)i;
-            visit_type_UserDefOne(v, &native_i->value, NULL, &err);
+            visit_type_UserDefOne(v, NULL, &native_i->value, &err);
        }

-        error_propagate(errp, err);
-        err = NULL;
-        visit_end_list(v, &err);
+        visit_end_list(v);
    out:
        error_propagate(errp, err);
    }
-    $ cat qapi-generated/example-qapi-visit.h
-[Uninteresting stuff omitted...]
-
-    #ifndef EXAMPLE_QAPI_VISIT_H
-    #define EXAMPLE_QAPI_VISIT_H
-
-[Visitors for built-in types omitted...]
-
-    void visit_type_UserDefOne(Visitor *v, UserDefOne **obj, const char *name, Error **errp);
-    void visit_type_UserDefOneList(Visitor *v, UserDefOneList **obj, const char *name, Error **errp);
-
-    #endif

 === scripts/qapi-commands.py ===

-Used to generate the marshaling/dispatch functions for the commands defined
-in the schema. The following files are generated:
+Used to generate the marshaling/dispatch functions for the commands
+defined in the schema. The generated code implements
+qmp_marshal_COMMAND() (mentioned in qmp-commands.hx, and registered
+automatically), and declares qmp_COMMAND() that the user must
+implement.  The following files are generated:

 $(prefix)qmp-marshal.c: command marshal/dispatch functions for each
                        QMP command defined in the schema. Functions
@@ -939,6 +953,19 @@ Example:

    $ python scripts/qapi-commands.py --output-dir="qapi-generated"
    --prefix="example-" example-schema.json
+    $ cat qapi-generated/example-qmp-commands.h
+[Uninteresting stuff omitted...]
+
+    #ifndef EXAMPLE_QMP_COMMANDS_H
+    #define EXAMPLE_QMP_COMMANDS_H
+
+    #include "example-qapi-types.h"
+    #include "qapi/qmp/qdict.h"
+    #include "qapi/error.h"
+
+    UserDefOne *qmp_my_command(UserDefOneList *arg1, Error **errp);
+
+    #endif
    $ cat qapi-generated/example-qmp-marshal.c
 [Uninteresting stuff omitted...]

@@ -950,7 +977,7 @@ Example:
        Visitor *v;

        v = qmp_output_get_visitor(qov);
-        visit_type_UserDefOne(v, &ret_in, "unused", &err);
+        visit_type_UserDefOne(v, "unused", &ret_in, &err);
        if (err) {
            goto out;
        }
@@ -961,7 +988,7 @@ Example:
        qmp_output_visitor_cleanup(qov);
        qdv = qapi_dealloc_visitor_new();
        v = qapi_dealloc_get_visitor(qdv);
-        visit_type_UserDefOne(v, &ret_in, "unused", NULL);
+        visit_type_UserDefOne(v, "unused", &ret_in, NULL);
        qapi_dealloc_visitor_cleanup(qdv);
    }

@@ -972,10 +999,10 @@ Example:
        QmpInputVisitor *qiv = qmp_input_visitor_new_strict(QOBJECT(args));
        QapiDeallocVisitor *qdv;
        Visitor *v;
-        UserDefOne *arg1 = NULL;
+        UserDefOneList *arg1 = NULL;

        v = qmp_input_get_visitor(qiv);
-        visit_type_UserDefOne(v, &arg1, "arg1", &err);
+        visit_type_UserDefOneList(v, "arg1", &arg1, &err);
        if (err) {
            goto out;
        }
@@ -992,7 +1019,7 @@ Example:
        qmp_input_visitor_cleanup(qiv);
        qdv = qapi_dealloc_visitor_new();
        v = qapi_dealloc_get_visitor(qdv);
-        visit_type_UserDefOne(v, &arg1, "arg1", NULL);
+        visit_type_UserDefOneList(v, "arg1", &arg1, NULL);
        qapi_dealloc_visitor_cleanup(qdv);
    }

@@ -1002,24 +1029,12 @@ Example:
    }

    qapi_init(qmp_init_marshal);
-    $ cat qapi-generated/example-qmp-commands.h
-[Uninteresting stuff omitted...]
-
-    #ifndef EXAMPLE_QMP_COMMANDS_H
-    #define EXAMPLE_QMP_COMMANDS_H
-
-    #include "example-qapi-types.h"
-    #include "qapi/qmp/qdict.h"
-    #include "qapi/error.h"
-
-    UserDefOne *qmp_my_command(UserDefOne *arg1, Error **errp);
-
-    #endif

 === scripts/qapi-event.py ===

-Used to generate the event-related C code defined by a schema. The
-following files are created:
+Used to generate the event-related C code defined by a schema, with
+implementations for qapi_event_send_FOO(). The following files are
+created:

 $(prefix)qapi-event.h - Function prototypes for each event type, plus an
                        enumeration of all event names
@@ -1029,6 +1044,27 @@ Example:

    $ python scripts/qapi-event.py --output-dir="qapi-generated"
    --prefix="example-" example-schema.json
+    $ cat qapi-generated/example-qapi-event.h
+[Uninteresting stuff omitted...]
+
+    #ifndef EXAMPLE_QAPI_EVENT_H
+    #define EXAMPLE_QAPI_EVENT_H
+
+    #include "qapi/error.h"
+    #include "qapi/qmp/qdict.h"
+    #include "example-qapi-types.h"
+
+
+    void qapi_event_send_my_event(Error **errp);
+
+    typedef enum example_QAPIEvent {
+        EXAMPLE_QAPI_EVENT_MY_EVENT = 0,
+        EXAMPLE_QAPI_EVENT__MAX = 1,
+    } example_QAPIEvent;
+
+    extern const char *const example_QAPIEvent_lookup[];
+
+    #endif
    $ cat qapi-generated/example-qapi-event.c
 [Uninteresting stuff omitted...]

@@ -1054,27 +1090,6 @@ Example:
        [EXAMPLE_QAPI_EVENT_MY_EVENT] = "MY_EVENT",
        [EXAMPLE_QAPI_EVENT__MAX] = NULL,
    };
-    $ cat qapi-generated/example-qapi-event.h
-[Uninteresting stuff omitted...]
-
-    #ifndef EXAMPLE_QAPI_EVENT_H
-    #define EXAMPLE_QAPI_EVENT_H
-
-    #include "qapi/error.h"
-    #include "qapi/qmp/qdict.h"
-    #include "example-qapi-types.h"
-
-
-    void qapi_event_send_my_event(Error **errp);
-
-    typedef enum example_QAPIEvent {
-        EXAMPLE_QAPI_EVENT_MY_EVENT = 0,
-        EXAMPLE_QAPI_EVENT__MAX = 1,
-    } example_QAPIEvent;
-
-    extern const char *const example_QAPIEvent_lookup[];
-
-    #endif

 === scripts/qapi-introspect.py ===

@@ -1089,17 +1104,6 @@ Example:

    $ python scripts/qapi-introspect.py --output-dir="qapi-generated"
    --prefix="example-" example-schema.json
-    $ cat qapi-generated/example-qmp-introspect.c
-[Uninteresting stuff omitted...]
-
-    const char example_qmp_schema_json[] = "["
-        "{\"arg-type\": \"0\", \"meta-type\": \"event\", \"name\": \"MY_EVENT\"}, "
-        "{\"arg-type\": \"1\", \"meta-type\": \"command\", \"name\": \"my-command\", \"ret-type\": \"2\"}, "
-        "{\"members\": [], \"meta-type\": \"object\", \"name\": \"0\"}, "
-        "{\"members\": [{\"name\": \"arg1\", \"type\": \"2\"}], \"meta-type\": \"object\", \"name\": \"1\"}, "
-        "{\"members\": [{\"name\": \"integer\", \"type\": \"int\"}, {\"name\": \"string\", \"type\": \"str\"}], \"meta-type\": \"object\", \"name\": \"2\"}, "
-        "{\"json-type\": \"int\", \"meta-type\": \"builtin\", \"name\": \"int\"}, "
-        "{\"json-type\": \"string\", \"meta-type\": \"builtin\", \"name\": \"str\"}]";
    $ cat qapi-generated/example-qmp-introspect.h
 [Uninteresting stuff omitted...]

@@ -1109,3 +1113,15 @@ Example:
    extern const char example_qmp_schema_json[];

    #endif
+    $ cat qapi-generated/example-qmp-introspect.c
+[Uninteresting stuff omitted...]
+
+    const char example_qmp_schema_json[] = "["
+        "{\"arg-type\": \"0\", \"meta-type\": \"event\", \"name\": \"MY_EVENT\"}, "
+        "{\"arg-type\": \"1\", \"meta-type\": \"command\", \"name\": \"my-command\", \"ret-type\": \"2\"}, "
+        "{\"members\": [], \"meta-type\": \"object\", \"name\": \"0\"}, "
+        "{\"members\": [{\"name\": \"arg1\", \"type\": \"[2]\"}], \"meta-type\": \"object\", \"name\": \"1\"}, "
+        "{\"members\": [{\"name\": \"integer\", \"type\": \"int\"}, {\"default\": null, \"name\": \"string\", \"type\": \"str\"}], \"meta-type\": \"object\", \"name\": \"2\"}, "
+        "{\"element-type\": \"2\", \"meta-type\": \"array\", \"name\": \"[2]\"}, "
+        "{\"json-type\": \"int\", \"meta-type\": \"builtin\", \"name\": \"int\"}, "
+        "{\"json-type\": \"string\", \"meta-type\": \"builtin\", \"name\": \"str\"}]";
--- a/docs/qmp-events.txt
+++ b/docs/qmp-events.txt
@@ -325,6 +325,7 @@ Emitted to report a corruption of a Quorum file.

 Data:

+- "type":          Quorum operation type
 - "error":         Error message (json-string, optional)
                   Only present on failure.  This field contains a human-readable
                   error message.  There are no semantics other than that the
@@ -336,10 +337,18 @@ Data:

 Example:

+Read operation:
 { "event": "QUORUM_REPORT_BAD",
-     "data": { "node-name": "1.raw", "sector-num": 345435, "sectors-count": 5 },
+     "data": { "node-name": "node0", "sector-num": 345435, "sectors-count": 5,
+               "type": "read" },
     "timestamp": { "seconds": 1344522075, "microseconds": 745528 } }

+Flush operation:
+{ "event": "QUORUM_REPORT_BAD",
+     "data": { "node-name": "node0", "sector-num": 0, "sectors-count": 2097120,
+               "type": "flush", "error": "Broken pipe" },
+     "timestamp": { "seconds": 1456406829, "microseconds": 291763 } }
+
 Note: this event is rate-limited.

 RESET
--- a/docs/qmp-spec.txt
+++ b/docs/qmp-spec.txt
@@ -3,7 +3,7 @@
 0. About This Document
 ======================

-Copyright (C) 2009-2015 Red Hat, Inc.
+Copyright (C) 2009-2016 Red Hat, Inc.

 This work is licensed under the terms of the GNU GPL, version 2 or
 later. See the COPYING file in the top-level directory.
@@ -277,7 +277,7 @@ However, Clients must not assume any particular:
 - Amount of errors generated by a command, that is, new errors can be added
  to any existing command in newer versions of the Server

-Any command or field name beginning with "x-" is deemed experimental,
+Any command or member name beginning with "x-" is deemed experimental,
 and may be withdrawn or changed in an incompatible manner in a future
 release.

--- a/docs/replay.txt
+++ b/docs/replay.txt
@@ -107,7 +107,7 @@ at the specified moments of time. There are several kinds of timers:
   sources (e.g. real time clock chip). Host clock is the one of the sources
   of non-determinism. Host clock read operations should be logged to
   make the execution deterministic.
- * Real time clock for icount. This clock is similar to real time clock but
+ * Virtual real time clock. This clock is similar to real time clock but
   it is used only for increasing virtual clock while virtual machine is
   sleeping. Due to its nature it is also non-deterministic as the host clock
   and has to be logged too.
@@ -134,11 +134,20 @@ of time. That's why we do not process a group of timers until the checkpoint
 event will be read from the log. Such an event allows synchronizing CPU
 execution and timer events.

-Another checkpoints application in record/replay is instruction counting
-while the virtual machine is idle. This function (qemu_clock_warp) is called
-from the wait loop. It changes virtual machine state and must be deterministic
-then. That is why we added checkpoint to this function to prevent its
-operation in replay mode when it does not correspond to record mode.
+Two other checkpoints govern the "warping" of the virtual clock.
+While the virtual machine is idle, the virtual clock increments at
+1 ns per *real time* nanosecond.  This is done by setting up a timer
+(called the warp timer) on the virtual real time clock, so that the
+timer fires at the next deadline of the virtual clock; the virtual clock
+is then incremented (which is called "warping" the virtual clock) as
+soon as the timer fires or the CPUs need to go out of the idle state.
+Two functions are used for this purpose; because these actions change
+virtual machine state and must be deterministic, each of them creates a
+checkpoint.  qemu_start_warp_timer checks if the CPUs are idle and if so
+starts accounting real time to virtual clock.  qemu_account_warp_timer
+is called when the CPUs get an interrupt or when the warp timer fires,
+and it warps the virtual clock by the amount of real time that has passed
+since qemu_start_warp_timer.

 Bottom halves
 -------------
--- a/docs/specs/fw_cfg.txt
+++ b/docs/specs/fw_cfg.txt
@@ -84,6 +84,15 @@ Selector Register address: Base + 8 (2 bytes)
 Data Register address:     Base + 0 (8 bytes)
 DMA Address address:       Base + 16 (8 bytes)

+== ACPI Interface ==
+
+The fw_cfg device is defined with ACPI ID "QEMU0002". Since we expect
+ACPI tables to be passed into the guest through the fw_cfg device itself,
+the guest-side firmware cannot use ACPI to find fw_cfg. However, once the
+firmware is finished setting up ACPI tables and hands control over to the
+guest kernel, the latter can use the fw_cfg ACPI node for a more accurate
+inventory of in-use IOport or MMIO regions.
+
 == Firmware Configuration Items ==

 === Signature (Key 0x0000, FW_CFG_SIGNATURE) ===
--- a/docs/specs/ivshmem-spec.txt
+++ b/docs/specs/ivshmem-spec.txt
@@ -0,0 +1,254 @@
+= Device Specification for Inter-VM shared memory device =
+
+The Inter-VM shared memory device (ivshmem) is designed to share a
+memory region between multiple QEMU processes running different guests
+and the host.  In order for all guests to be able to pick up the
+shared memory area, it is modeled by QEMU as a PCI device exposing
+said memory to the guest as a PCI BAR.
+
+The device can use a shared memory object on the host directly, or it
+can obtain one from an ivshmem server.
+
+In the latter case, the device can additionally interrupt its peers, and
+get interrupted by its peers.
+
+
+== Configuring the ivshmem PCI device ==
+
+There are two basic configurations:
+
+- Just shared memory: -device ivshmem-plain,memdev=HMB,...
+
+  This uses host memory backend HMB.  It should have option "share"
+  set.
+
+- Shared memory plus interrupts: -device ivshmem,chardev=CHR,vectors=N,...
+
+  An ivshmem server must already be running on the host.  The device
+  connects to the server's UNIX domain socket via character device
+  CHR.
+
+  Each peer gets assigned a unique ID by the server.  IDs must be
+  between 0 and 65535.
+
+  Interrupts are message-signaled (MSI-X).  vectors=N configures the
+  number of vectors to use.
+
+For more details on ivshmem device properties, see The QEMU Emulator
+User Documentation (qemu-doc.*).
+
+
+== The ivshmem PCI device's guest interface ==
+
+The device has vendor ID 1af4, device ID 1110, revision 1.  Before
+QEMU 2.6.0, it had revision 0.
+
+=== PCI BARs ===
+
+The ivshmem PCI device has two or three BARs:
+
+- BAR0 holds device registers (256 Byte MMIO)
+- BAR1 holds MSI-X table and PBA (only ivshmem-doorbell)
+- BAR2 maps the shared memory object
+
+There are two ways to use this device:
+
+- If you only need the shared memory part, BAR2 suffices.  This way,
+  you have access to the shared memory in the guest and can use it as
+  you see fit.  Memnic, for example, uses ivshmem this way from guest
+  user space (see http://dpdk.org/browse/memnic).
+
+- If you additionally need the capability for peers to interrupt each
+  other, you need BAR0 and BAR1.  You will most likely want to write a
+  kernel driver to handle interrupts.  Requires the device to be
+  configured for interrupts, obviously.
+
+Before QEMU 2.6.0, BAR2 can initially be invalid if the device is
+configured for interrupts.  It becomes safely accessible only after
+the ivshmem server provided the shared memory.  These devices have PCI
+revision 0 rather than 1.  Guest software should wait for the
+IVPosition register (described below) to become non-negative before
+accessing BAR2.
+
+Revision 0 of the device is not capable of telling guest software
+whether it is configured for interrupts.
+
+=== PCI device registers ===
+
+BAR 0 contains the following registers:
+
+    Offset  Size  Access      On reset  Function
+        0     4   read/write        0   Interrupt Mask
+                                        bit 0: peer interrupt (rev 0)
+                                               reserved       (rev 1)
+                                        bit 1..31: reserved
+        4     4   read/write        0   Interrupt Status
+                                        bit 0: peer interrupt (rev 0)
+                                               reserved       (rev 1)
+                                        bit 1..31: reserved
+        8     4   read-only   0 or ID   IVPosition
+       12     4   write-only      N/A   Doorbell
+                                        bit 0..15: vector
+                                        bit 16..31: peer ID
+       16   240   none            N/A   reserved
+
+Software should only access the registers as specified in column
+"Access".  Reserved bits should be ignored on read, and preserved on
+write.
+
+In revision 0 of the device, Interrupt Status and Mask Register
+together control the legacy INTx interrupt when the device has no
+MSI-X capability: INTx is asserted when the bit-wise AND of Status and
+Mask is non-zero and the device has no MSI-X capability.  Interrupt
+Status Register bit 0 becomes 1 when an interrupt request from a peer
+is received.  Reading the register clears it.
+
+IVPosition Register: if the device is not configured for interrupts,
+this is zero.  Else, it is the device's ID (between 0 and 65535).
+
+Before QEMU 2.6.0, the register may read -1 for a short while after
+reset.  These devices have PCI revision 0 rather than 1.
+
+There is no good way for software to find out whether the device is
+configured for interrupts.  A positive IVPosition means interrupts,
+but zero could be either.
+
+Doorbell Register: writing this register requests to interrupt a peer.
+The written value's high 16 bits are the ID of the peer to interrupt,
+and its low 16 bits select an interrupt vector.
+
+If the device is not configured for interrupts, the write is ignored.
+
+If the interrupt hasn't completed setup, the write is ignored.  The
+device is not capable of telling guest software whether setup is
+complete.  Interrupts can regress to this state on migration.
+
+If the peer with the requested ID isn't connected, or it has fewer
+interrupt vectors connected, the write is ignored.  The device is not
+capable of telling guest software which peers are connected, or how
+many interrupt vectors are connected.
+
+Otherwise, the peer's interrupt for this vector becomes pending.
+There is no way for software to clear the pending bit, and a polling
+mode of operation is therefore impossible.
+
+If the peer is a revision 0 device without MSI-X capability, its
+Interrupt Status register is set to 1.  This asserts INTx unless
+masked by the Interrupt Mask register.  The device is not capable of
+communicating the interrupt vector to guest software then.
+
+With multiple MSI-X vectors, different vectors can be used to indicate
+different events have occurred.  The semantics of interrupt vectors
+are left to the application.
+
+
+== Interrupt infrastructure ==
+
+When configured for interrupts, the peers share eventfd objects in
+addition to shared memory.  The shared resources are managed by an
+ivshmem server.
+
+=== The ivshmem server ===
+
+The server listens on a UNIX domain socket.
+
+For each new client that connects to the server, the server
+- picks an ID,
+- creates eventfd file descriptors for the interrupt vectors,
+- sends the ID and the file descriptor for the shared memory to the
+  new client,
+- sends connect notifications for the new client to the other clients
+  (these contain file descriptors for sending interrupts),
+- sends connect notifications for the other clients to the new client,
+  and
+- sends interrupt setup messages to the new client (these contain file
+  descriptors for receiving interrupts).
+
+The first client to connect to the server receives ID zero.
+
+When a client disconnects from the server, the server sends disconnect
+notifications to the other clients.
+
+The next section describes the protocol in detail.
+
+If the server terminates without sending disconnect notifications for
+its connected clients, the clients can elect to continue.  They can
+communicate with each other normally, but won't receive disconnect
+notification on disconnect, and no new clients can connect.  There is
+no way for the clients to connect to a restarted server.  The device
+is not capable of telling guest software whether the server is still up.
+
+Example server code is in contrib/ivshmem-server/.  Not to be used in
+production.  It assumes all clients use the same number of interrupt
+vectors.
+
+A standalone client is in contrib/ivshmem-client/.  It can be useful
+for debugging.
+
+=== The ivshmem Client-Server Protocol ===
+
+An ivshmem device configured for interrupts connects to an ivshmem
+server.  This section details the protocol between the two.
+
+The connection is one-way: the server sends messages to the client.
+Each message consists of a single 8-byte little-endian signed number,
+and may be accompanied by a file descriptor via SCM_RIGHTS.  Both
+client and server close the connection on error.
+
+Note: QEMU currently doesn't close the connection right on error, but
+only when the character device is destroyed.
+
+On connect, the server sends the following messages in order:
+
+1. The protocol version number, currently zero.  The client should
+   close the connection on receipt of versions it can't handle.
+
+2. The client's ID.  This is unique among all clients of this server.
+   IDs must be between 0 and 65535, because the Doorbell register
+   provides only 16 bits for them.
+
+3. The number -1, accompanied by the file descriptor for the shared
+   memory.
+
+4. Connect notifications for existing other clients, if any.  This is
+   a peer ID (number between 0 and 65535 other than the client's ID),
+   repeated N times.  Each repetition is accompanied by one file
+   descriptor.  These are for interrupting the peer with that ID using
+   vector 0,..,N-1, in order.  If the client is configured for fewer
+   vectors, it closes the extra file descriptors.  If it is configured
+   for more, the extra vectors remain unconnected.
+
+5. Interrupt setup.  This is the client's own ID, repeated N times.
+   Each repetition is accompanied by one file descriptor.  These are
+   for receiving interrupts from peers using vector 0,..,N-1, in
+   order.  If the client is configured for fewer vectors, it closes
+   the extra file descriptors.  If it is configured for more, the
+   extra vectors remain unconnected.
+
+From then on, the server sends these kinds of messages:
+
+6. Connection / disconnection notification.  This is a peer ID.
+
+  - If the number comes with a file descriptor, it's a connection
+    notification, exactly like in step 4.
+
+  - Else, it's a disconnection notification for the peer with that ID.
+
+Known bugs:
+
+* The protocol changed incompatibly in QEMU 2.5.  Before, messages
+  were native endian long, and there was no version number.
+
+* The protocol is poorly designed.
+
+=== The ivshmem Client-Client Protocol ===
+
+An ivshmem device configured for interrupts receives eventfd file
+descriptors for interrupting peers and getting interrupted by peers
+from the server, as explained in the previous section.
+
+To interrupt a peer, the device writes the 8-byte integer 1 in native
+byte order to the respective file descriptor.
+
+To receive an interrupt, the device reads and discards as many 8-byte
+integers as it can.
--- a/docs/specs/ivshmem_device_spec.txt
+++ b/docs/specs/ivshmem_device_spec.txt
@@ -1,161 +0,0 @@
-
-Device Specification for Inter-VM shared memory device
------------------------------------------------------
-
-The Inter-VM shared memory device is designed to share a memory region (created
-on the host via the POSIX shared memory API) between multiple QEMU processes
-running different guests. In order for all guests to be able to pick up the
-shared memory area, it is modeled by QEMU as a PCI device exposing said memory
-to the guest as a PCI BAR.
-The memory region does not belong to any guest, but is a POSIX memory object on
-the host. The host can access this shared memory if needed.
-
-The device also provides an optional communication mechanism between guests
-sharing the same memory object. More details about that in the section 'Guest to
-guest communication' section.
-
-
-The Inter-VM PCI device
-----------------------
-
-From the VM point of view, the ivshmem PCI device supports three BARs.
-
- BAR0 is a 1 Kbyte MMIO region to support registers and interrupts when MSI is
-  not used.
- BAR1 is used for MSI-X when it is enabled in the device.
- BAR2 is used to access the shared memory object.
-
-It is your choice how to use the device but you must choose between two
-behaviors :
-
- basically, if you only need the shared memory part, you will map BAR2.
-  This way, you have access to the shared memory in guest and can use it as you
-  see fit (memnic, for example, uses it in userland
-  http://dpdk.org/browse/memnic).
-
- BAR0 and BAR1 are used to implement an optional communication mechanism
-  through interrupts in the guests. If you need an event mechanism between the
-  guests accessing the shared memory, you will most likely want to write a
-  kernel driver that will handle interrupts. See details in the section 'Guest
-  to guest communication' section.
-
-The behavior is chosen when starting your QEMU processes:
- no communication mechanism needed, the first QEMU to start creates the shared
-  memory on the host, subsequent QEMU processes will use it.
-
- communication mechanism needed, an ivshmem server must be started before any
-  QEMU processes, then each QEMU process connects to the server unix socket.
-
-For more details on the QEMU ivshmem parameters, see qemu-doc documentation.
-
-
-Guest to guest communication
----------------------------
-
-This section details the communication mechanism between the guests accessing
-the ivhsmem shared memory.
-
-*ivshmem server*
-
-This server code is available in qemu.git/contrib/ivshmem-server.
-
-The server must be started on the host before any guest.
-It creates a shared memory object then waits for clients to connect on a unix
-socket. All the messages are little-endian int64_t integer.
-
-For each client (QEMU process) that connects to the server:
- the server sends a protocol version, if client does not support it, the client
-  closes the communication,
- the server assigns an ID for this client and sends this ID to him as the first
-  message,
- the server sends a fd to the shared memory object to this client,
- the server creates a new set of host eventfds associated to the new client and
-  sends this set to all already connected clients,
- finally, the server sends all the eventfds sets for all clients to the new
-  client.
-
-The server signals all clients when one of them disconnects.
-
-The client IDs are limited to 16 bits because of the current implementation (see
-Doorbell register in 'PCI device registers' subsection). Hence only 65536
-clients are supported.
-
-All the file descriptors (fd to the shared memory, eventfds for each client)
-are passed to clients using SCM_RIGHTS over the server unix socket.
-
-Apart from the current ivshmem implementation in QEMU, an ivshmem client has
-been provided in qemu.git/contrib/ivshmem-client for debug.
-
-*QEMU as an ivshmem client*
-
-At initialisation, when creating the ivshmem device, QEMU first receives a
-protocol version and closes communication with server if it does not match.
-Then, QEMU gets its ID from the server then makes it available through BAR0
-IVPosition register for the VM to use (see 'PCI device registers' subsection).
-QEMU then uses the fd to the shared memory to map it to BAR2.
-eventfds for all other clients received from the server are stored to implement
-BAR0 Doorbell register (see 'PCI device registers' subsection).
-Finally, eventfds assigned to this QEMU process are used to send interrupts in
-this VM.
-
-*PCI device registers*
-
-From the VM point of view, the ivshmem PCI device supports 4 registers of
-32-bits each.
-
-enum ivshmem_registers {
-    IntrMask = 0,
-    IntrStatus = 4,
-    IVPosition = 8,
-    Doorbell = 12
-};
-
-The first two registers are the interrupt mask and status registers.  Mask and
-status are only used with pin-based interrupts.  They are unused with MSI
-interrupts.
-
-Status Register: The status register is set to 1 when an interrupt occurs.
-
-Mask Register: The mask register is bitwise ANDed with the interrupt status
-and the result will raise an interrupt if it is non-zero.  However, since 1 is
-the only value the status will be set to, it is only the first bit of the mask
-that has any effect.  Therefore interrupts can be masked by setting the first
-bit to 0 and unmasked by setting the first bit to 1.
-
-IVPosition Register: The IVPosition register is read-only and reports the
-guest's ID number.  The guest IDs are non-negative integers.  When using the
-server, since the server is a separate process, the VM ID will only be set when
-the device is ready (shared memory is received from the server and accessible
-via the device).  If the device is not ready, the IVPosition will return -1.
-Applications should ensure that they have a valid VM ID before accessing the
-shared memory.
-
-Doorbell Register:  To interrupt another guest, a guest must write to the
-Doorbell register.  The doorbell register is 32-bits, logically divided into
-two 16-bit fields.  The high 16-bits are the guest ID to interrupt and the low
-16-bits are the interrupt vector to trigger.  The semantics of the value
-written to the doorbell depends on whether the device is using MSI or a regular
-pin-based interrupt.  In short, MSI uses vectors while regular interrupts set
-the status register.
-
-Regular Interrupts
-
-If regular interrupts are used (due to either a guest not supporting MSI or the
-user specifying not to use them on startup) then the value written to the lower
-16-bits of the Doorbell register results is arbitrary and will trigger an
-interrupt in the destination guest.
-
-Message Signalled Interrupts
-
-An ivshmem device may support multiple MSI vectors.  If so, the lower 16-bits
-written to the Doorbell register must be between 0 and the maximum number of
-vectors the guest supports.  The lower 16 bits written to the doorbell is the
-MSI vector that will be raised in the destination guest.  The number of MSI
-vectors is configurable but it is set when the VM is started.
-
-The important thing to remember with MSI is that it is only a signal, no status
-is set (since MSI interrupts are not shared).  All information other than the
-interrupt itself should be communicated via the shared memory region.  Devices
-supporting multiple MSI vectors can use different vectors to indicate different
-events have occurred.  The semantics of interrupt vectors are left to the
-user's discretion.
--- a/docs/specs/pci-ids.txt
+++ b/docs/specs/pci-ids.txt
@@ -15,13 +15,23 @@ The 1000 -> 10ff device ID range is used as follows for virtio-pci devices.
 Note that this allocation separate from the virtio device IDs, which are
 maintained as part of the virtio specification.

-1af4:1000  network device
-1af4:1001  block device
-1af4:1002  balloon device
-1af4:1003  console device
-1af4:1004  SCSI host bus adapter device
-1af4:1005  entropy generator device
-1af4:1009  9p filesystem device
+1af4:1000  network device (legacy)
+1af4:1001  block device (legacy)
+1af4:1002  balloon device (legacy)
+1af4:1003  console device (legacy)
+1af4:1004  SCSI host bus adapter device (legacy)
+1af4:1005  entropy generator device (legacy)
+1af4:1009  9p filesystem device (legacy)
+
+1af4:1041  network device (modern)
+1af4:1042  block device (modern)
+1af4:1043  console device (modern)
+1af4:1044  entropy generator device (modern)
+1af4:1045  balloon device (modern)
+1af4:1048  SCSI host bus adapter device (modern)
+1af4:1049  9p filesystem device (modern)
+1af4:1050  virtio gpu device (modern)
+1af4:1052  virtio input device (modern)

 1af4:10f0  Available for experimental usage without registration.  Must get
   to      official ID when the code leaves the test lab (i.e. when seeking
--- a/docs/tracing.txt
+++ b/docs/tracing.txt
@@ -172,9 +172,6 @@ source tree.  It may not be as powerful as platform-specific or third-party
 trace backends but it is portable.  This is the recommended trace backend
 unless you have specific needs for more advanced backends.

-The "simple" backend currently does not capture string arguments, it simply
-records the char* pointer value instead of the string that is pointed to.
-
 === Ftrace ===

 The "ftrace" backend writes trace data to ftrace marker. This effectively
@@ -347,3 +344,44 @@ This will immediately call:
 and will generate the TCG code to call:

    void trace_foo(uint8_t a1, uint32_t a2);
+
+=== "vcpu" ===
+
+Identifies events that trace vCPU-specific information. It implicitly adds a
+"CPUState*" argument, and extends the tracing print format to show the vCPU
+information. If used together with the "tcg" property, it adds a second
+"TCGv_env" argument that must point to the per-target global TCG register that
+points to the vCPU when guest code is executed (usually the "cpu_env" variable).
+
+The following example events:
+
+    foo(uint32_t a) "a=%x"
+    vcpu bar(uint32_t a) "a=%x"
+    tcg vcpu baz(uint32_t a) "a=%x", "a=%x"
+
+Can be used as:
+
+    #include "trace-tcg.h"
+    
+    CPUArchState *env;
+    TCGv_ptr cpu_env;
+    
+    void some_disassembly_func(...)
+    {
+        /* trace emitted at this point */
+        trace_foo(0xd1);
+        /* trace emitted at this point */
+        trace_bar(ENV_GET_CPU(env), 0xd2);
+        /* trace emitted at this point (env) and when guest code is executed (cpu_env) */
+        trace_baz_tcg(ENV_GET_CPU(env), cpu_env, 0xd3);
+    }
+
+If the translating vCPU has address 0xc1 and code is later executed by vCPU
+0xc2, this would be an example output:
+
+    // at guest code translation
+    foo a=0xd1
+    bar cpu=0xc1 a=0xd2
+    baz_trans cpu=0xc1 a=0xd3
+    // at guest code execution
+    baz_exec cpu=0xc2 a=0xd3
--- a/exec.c
+++ b/exec.c
@@ -135,6 +135,7 @@ typedef struct PhysPageMap {
 struct AddressSpaceDispatch {
    struct rcu_head rcu;

+    MemoryRegionSection *mru_section;
    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
@@ -307,6 +308,17 @@ static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
    }
 }

+static inline bool section_covers_addr(const MemoryRegionSection *section,
+                                       hwaddr addr)
+{
+    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
+     * the section must cover the entire address space.
+     */
+    return section->size.hi ||
+           range_covers_byte(section->offset_within_address_space,
+                             section->size.lo, addr);
+}
+
 static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
                                           Node *nodes, MemoryRegionSection *sections)
 {
@@ -322,9 +334,7 @@ static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }

-    if (sections[lp.ptr].size.hi ||
-        range_covers_byte(sections[lp.ptr].offset_within_address_space,
-                          sections[lp.ptr].size.lo, addr)) {
+    if (section_covers_addr(&sections[lp.ptr], addr)) {
        return &sections[lp.ptr];
    } else {
        return &sections[PHYS_SECTION_UNASSIGNED];
@@ -342,14 +352,25 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
                                                        hwaddr addr,
                                                        bool resolve_subpage)
 {
-    MemoryRegionSection *section;
+    MemoryRegionSection *section = atomic_read(&d->mru_section);
    subpage_t *subpage;
+    bool update;

-    section = phys_page_find(d->phys_map, addr, d->map.nodes, d->map.sections);
+    if (section && section != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
+        section_covers_addr(section, addr)) {
+        update = false;
+    } else {
+        section = phys_page_find(d->phys_map, addr, d->map.nodes,
+                                 d->map.sections);
+        update = true;
+    }
    if (resolve_subpage && section->mr->subpage) {
        subpage = container_of(section->mr, subpage_t, iomem);
        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
    }
+    if (update) {
+        atomic_set(&d->mru_section, section);
+    }
    return section;
 }

@@ -1207,92 +1228,83 @@ void qemu_mutex_unlock_ramlist(void)
 }

 #ifdef __linux__
-
-#include <sys/vfs.h>
-
-#define HUGETLBFS_MAGIC       0x958458f6
-
-static long gethugepagesize(const char *path, Error **errp)
-{
-    struct statfs fs;
-    int ret;
-
-    do {
-        ret = statfs(path, &fs);
-    } while (ret != 0 && errno == EINTR);
-
-    if (ret != 0) {
-        error_setg_errno(errp, errno, "failed to get page size of file %s",
-                         path);
-        return 0;
-    }
-
-    return fs.f_bsize;
-}
-
 static void *file_ram_alloc(RAMBlock *block,
                            ram_addr_t memory,
                            const char *path,
                            Error **errp)
 {
-    struct stat st;
+    bool unlink_on_error = false;
    char *filename;
    char *sanitized_name;
    char *c;
    void *area;
    int fd;
-    uint64_t hpagesize;
-    Error *local_err = NULL;
-
-    hpagesize = gethugepagesize(path, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        goto error;
-    }
-    block->mr->align = hpagesize;
-
-    if (memory < hpagesize) {
-        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
-                   "or larger than huge page size 0x%" PRIx64,
-                   memory, hpagesize);
-        goto error;
-    }
+    int64_t page_size;

    if (kvm_enabled() && !kvm_has_sync_mmu()) {
        error_setg(errp,
                   "host lacks kvm mmu notifiers, -mem-path unsupported");
-        goto error;
+        return NULL;
    }

-    if (!stat(path, &st) && S_ISDIR(st.st_mode)) {
-        /* Make name safe to use with mkstemp by replacing '/' with '_'. */
-        sanitized_name = g_strdup(memory_region_name(block->mr));
-        for (c = sanitized_name; *c != '\0'; c++) {
-            if (*c == '/') {
-                *c = '_';
-            }
-        }
-
-        filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
-                                   sanitized_name);
-        g_free(sanitized_name);
-
-        fd = mkstemp(filename);
+    for (;;) {
+        fd = open(path, O_RDWR);
        if (fd >= 0) {
-            unlink(filename);
+            /* @path names an existing file, use it */
+            break;
        }
-        g_free(filename);
-    } else {
-        fd = open(path, O_RDWR | O_CREAT, 0644);
+        if (errno == ENOENT) {
+            /* @path names a file that doesn't exist, create it */
+            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
+            if (fd >= 0) {
+                unlink_on_error = true;
+                break;
+            }
+        } else if (errno == EISDIR) {
+            /* @path names a directory, create a file there */
+            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
+            sanitized_name = g_strdup(memory_region_name(block->mr));
+            for (c = sanitized_name; *c != '\0'; c++) {
+                if (*c == '/') {
+                    *c = '_';
+                }
+            }
+
+            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
+                                       sanitized_name);
+            g_free(sanitized_name);
+
+            fd = mkstemp(filename);
+            if (fd >= 0) {
+                unlink(filename);
+                g_free(filename);
+                break;
+            }
+            g_free(filename);
+        }
+        if (errno != EEXIST && errno != EINTR) {
+            error_setg_errno(errp, errno,
+                             "can't open backing store %s for guest RAM",
+                             path);
+            goto error;
+        }
+        /*
+         * Try again on EINTR and EEXIST.  The latter happens when
+         * something else creates the file between our two open().
+         */
    }

-    if (fd < 0) {
-        error_setg_errno(errp, errno,
-                         "unable to create backing store for hugepages");
+    page_size = qemu_fd_getpagesize(fd);
+    block->mr->align = page_size;
+
+    if (memory < page_size) {
+        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
+                   "or larger than page size 0x%" PRIx64,
+                   memory, page_size);
        goto error;
    }

-    memory = ROUND_UP(memory, hpagesize);
+    memory = ROUND_UP(memory, page_size);

    /*
     * ftruncate is not supported by hugetlbfs in older
@@ -1304,10 +1316,10 @@ static void *file_ram_alloc(RAMBlock *block,
        perror("ftruncate");
    }

-    area = qemu_ram_mmap(fd, memory, hpagesize, block->flags & RAM_SHARED);
+    area = qemu_ram_mmap(fd, memory, page_size, block->flags & RAM_SHARED);
    if (area == MAP_FAILED) {
        error_setg_errno(errp, errno,
-                         "unable to map backing store for hugepages");
+                         "unable to map backing store for guest RAM");
        close(fd);
        goto error;
    }
@@ -1320,6 +1332,10 @@ static void *file_ram_alloc(RAMBlock *block,
    return area;

 error:
+    if (unlink_on_error) {
+        unlink(path);
+    }
+    close(fd);
    return NULL;
 }
 #endif
@@ -1554,7 +1570,7 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
    }
 }

-static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
+static void ram_block_add(RAMBlock *new_block, Error **errp)
 {
    RAMBlock *block;
    RAMBlock *last_block = NULL;
@@ -1573,7 +1589,7 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
            if (err) {
                error_propagate(errp, err);
                qemu_mutex_unlock_ramlist();
-                return -1;
+                return;
            }
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
@@ -1583,7 +1599,7 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
-                return -1;
+                return;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
@@ -1631,22 +1647,19 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
            kvm_setup_guest_memory(new_block->host, new_block->max_length);
        }
    }
-
-    return new_block->offset;
 }

 #ifdef __linux__
-ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
-                                    bool share, const char *mem_path,
-                                    Error **errp)
+RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
+                                   bool share, const char *mem_path,
+                                   Error **errp)
 {
    RAMBlock *new_block;
-    ram_addr_t addr;
    Error *local_err = NULL;

    if (xen_enabled()) {
        error_setg(errp, "-mem-path not supported with Xen");
-        return -1;
+        return NULL;
    }

    if (phys_mem_alloc != qemu_anon_ram_alloc) {
@@ -1657,7 +1670,7 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
         */
        error_setg(errp,
                   "-mem-path not supported with this accelerator");
-        return -1;
+        return NULL;
    }

    size = HOST_PAGE_ALIGN(size);
@@ -1670,29 +1683,28 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
                                     mem_path, errp);
    if (!new_block->host) {
        g_free(new_block);
-        return -1;
+        return NULL;
    }

-    addr = ram_block_add(new_block, &local_err);
+    ram_block_add(new_block, &local_err);
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
-        return -1;
+        return NULL;
    }
-    return addr;
+    return new_block;
 }
 #endif

 static
-ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
-                                   void (*resized)(const char*,
-                                                   uint64_t length,
-                                                   void *host),
-                                   void *host, bool resizeable,
-                                   MemoryRegion *mr, Error **errp)
+RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
+                                  void (*resized)(const char*,
+                                                  uint64_t length,
+                                                  void *host),
+                                  void *host, bool resizeable,
+                                  MemoryRegion *mr, Error **errp)
 {
    RAMBlock *new_block;
-    ram_addr_t addr;
    Error *local_err = NULL;

    size = HOST_PAGE_ALIGN(size);
@@ -1711,29 +1723,27 @@ ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
    if (resizeable) {
        new_block->flags |= RAM_RESIZEABLE;
    }
-    addr = ram_block_add(new_block, &local_err);
+    ram_block_add(new_block, &local_err);
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
-        return -1;
+        return NULL;
    }
-
-    mr->ram_block = new_block;
-    return addr;
+    return new_block;
 }

-ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
+RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                   MemoryRegion *mr, Error **errp)
 {
    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
 }

-ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
+RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
 {
    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
 }

-ram_addr_t qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
+RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                     void (*resized)(const char*,
                                                     uint64_t length,
                                                     void *host),
@@ -1759,22 +1769,15 @@ static void reclaim_ramblock(RAMBlock *block)
    g_free(block);
 }

-void qemu_ram_free(ram_addr_t addr)
+void qemu_ram_free(RAMBlock *block)
 {
-    RAMBlock *block;
-
    qemu_mutex_lock_ramlist();
-    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-        if (addr == block->offset) {
-            QLIST_REMOVE_RCU(block, next);
-            ram_list.mru_block = NULL;
-            /* Write list before version */
-            smp_wmb();
-            ram_list.version++;
-            call_rcu(block, reclaim_ramblock, rcu);
-            break;
-        }
-    }
+    QLIST_REMOVE_RCU(block, next);
+    ram_list.mru_block = NULL;
+    /* Write list before version */
+    smp_wmb();
+    ram_list.version++;
+    call_rcu(block, reclaim_ramblock, rcu);
    qemu_mutex_unlock_ramlist();
 }

@@ -2707,7 +2710,8 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
            }
        } else {
            /* RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block, mr->ram_addr + addr1);
+            ptr = qemu_get_ram_ptr(mr->ram_block,
+                                   memory_region_get_ram_addr(mr) + addr1);
            memcpy(buf, ptr, l);
        }

--- a/fsdev/qemu-fsdev-opts.c
+++ b/fsdev/qemu-fsdev-opts.c
@@ -83,4 +83,4 @@ static void fsdev_register_config(void)
    qemu_add_opts(&qemu_fsdev_opts);
    qemu_add_opts(&qemu_virtfs_opts);
 }
-machine_init(fsdev_register_config);
+opts_init(fsdev_register_config);
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -1752,7 +1752,7 @@ int gdbserver_start(const char *device)
            sigaction(SIGINT, &act, NULL);
        }
 #endif
-        chr = qemu_chr_new("gdb", device, NULL);
+        chr = qemu_chr_new_noreplay("gdb", device, NULL);
        if (!chr)
            return -1;

--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1026,7 +1026,7 @@ ETEXI
        .args_type  = "",
        .params     = "",
        .help       = "Followup to a migration command to switch the migration"
-                      " to postcopy mode. The x-postcopy-ram capability must "
+                      " to postcopy mode. The postcopy-ram capability must "
                      "be set before the original migration command.",
        .mhandler.cmd = hmp_migrate_start_postcopy,
    },
@@ -1201,8 +1201,8 @@ ETEXI

    {
        .name       = "drive_add",
-        .args_type  = "pci_addr:s,opts:s",
-        .params     = "[[<domain>:]<bus>:]<slot>\n"
+        .args_type  = "node:-n,pci_addr:s,opts:s",
+        .params     = "[-n] [[<domain>:]<bus>:]<slot>\n"
                      "[file=file][,if=type][,bus=n]\n"
                      "[,unit=m][,media=d][,index=i]\n"
                      "[,cyls=c,heads=h,secs=s[,trans=t]]\n"
--- a/hmp.c
+++ b/hmp.c
@@ -857,7 +857,7 @@ void hmp_info_tpm(Monitor *mon, const QDict *qdict)

        switch (ti->options->type) {
        case TPM_TYPE_OPTIONS_KIND_PASSTHROUGH:
-            tpo = ti->options->u.passthrough;
+            tpo = ti->options->u.passthrough.data;
            monitor_printf(mon, "%s%s%s%s",
                           tpo->has_path ? ",path=" : "",
                           tpo->has_path ? tpo->path : "",
@@ -1753,14 +1753,14 @@ void hmp_sendkey(Monitor *mon, const QDict *qdict)
                goto err_out;
            }
            keylist->value->type = KEY_VALUE_KIND_NUMBER;
-            keylist->value->u.number = value;
+            keylist->value->u.number.data = value;
        } else {
            int idx = index_from_key(keys, keyname_len);
            if (idx == Q_KEY_CODE__MAX) {
                goto err_out;
            }
            keylist->value->type = KEY_VALUE_KIND_QCODE;
-            keylist->value->u.qcode = idx;
+            keylist->value->u.qcode.data = idx;
        }

        if (!separator) {
@@ -1977,7 +1977,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
        if (value) {
            switch (value->type) {
            case MEMORY_DEVICE_INFO_KIND_DIMM:
-                di = value->u.dimm;
+                di = value->u.dimm.data;

                monitor_printf(mon, "Memory device [%s]: \"%s\"\n",
                               MemoryDeviceInfoKind_lookup[value->type],
--- a/hw/acpi/Makefile.objs
+++ b/hw/acpi/Makefile.objs
@@ -2,7 +2,7 @@ common-obj-$(CONFIG_ACPI_X86) += core.o piix4.o pcihp.o
 common-obj-$(CONFIG_ACPI_X86_ICH) += ich9.o tco.o
 common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu_hotplug.o cpu_hotplug_acpi_table.o
 common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o memory_hotplug_acpi_table.o
-common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
+obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
 common-obj-$(CONFIG_ACPI) += acpi_interface.o
 common-obj-$(CONFIG_ACPI) += bios-linker-loader.o
 common-obj-$(CONFIG_ACPI) += aml-build.o
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -258,6 +258,34 @@ static void build_append_int(GArray *table, uint64_t value)
    }
 }

+/*
+ * Build NAME(XXXX, 0x00000000) where 0x00000000 is encoded as a dword,
+ * and return the offset to 0x00000000 for runtime patching.
+ *
+ * Warning: runtime patching is best avoided. Only use this as
+ * a replacement for DataTableRegion (for guests that don't
+ * support it).
+ */
+int
+build_append_named_dword(GArray *array, const char *name_format, ...)
+{
+    int offset;
+    va_list ap;
+
+    build_append_byte(array, 0x08); /* NameOp */
+    va_start(ap, name_format);
+    build_append_namestringv(array, name_format, ap);
+    va_end(ap);
+
+    build_append_byte(array, 0x0C); /* DWordPrefix */
+
+    offset = array->len;
+    build_append_int_noprefix(array, 0x00000000, 4);
+    assert(array->len == offset + 4);
+
+    return offset;
+}
+
 static GPtrArray *alloc_list;

 static Aml *aml_alloc(void)
@@ -942,14 +970,14 @@ Aml *aml_package(uint8_t num_elements)

 /* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefOpRegion */
 Aml *aml_operation_region(const char *name, AmlRegionSpace rs,
-                          uint32_t offset, uint32_t len)
+                          Aml *offset, uint32_t len)
 {
    Aml *var = aml_alloc();
    build_append_byte(var->buf, 0x5B); /* ExtOpPrefix */
    build_append_byte(var->buf, 0x80); /* OpRegionOp */
    build_append_namestring(var->buf, "%s", name);
    build_append_byte(var->buf, rs);
-    build_append_int(var->buf, offset);
+    aml_append(var, offset);
    build_append_int(var->buf, len);
    return var;
 }
@@ -997,6 +1025,20 @@ Aml *create_field_common(int opcode, Aml *srcbuf, Aml *index, const char *name)
    return var;
 }

+/* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefCreateField */
+Aml *aml_create_field(Aml *srcbuf, Aml *bit_index, Aml *num_bits,
+                      const char *name)
+{
+    Aml *var = aml_alloc();
+    build_append_byte(var->buf, 0x5B); /* ExtOpPrefix */
+    build_append_byte(var->buf, 0x13); /* CreateFieldOp */
+    aml_append(var, srcbuf);
+    aml_append(var, bit_index);
+    aml_append(var, num_bits);
+    build_append_namestring(var->buf, "%s", name);
+    return var;
+}
+
 /* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefCreateDWordField */
 Aml *aml_create_dword_field(Aml *srcbuf, Aml *index, const char *name)
 {
@@ -1423,6 +1465,13 @@ Aml *aml_alias(const char *source_object, const char *alias_object)
    return var;
 }

+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefConcat */
+Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target)
+{
+    return build_opcode_2arg_dst(0x73 /* ConcatOp */, source1, source2,
+                                 target);
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
             AcpiTableHeader *h, const char *sig, int len, uint8_t rev,
--- a/hw/acpi/core.c
+++ b/hw/acpi/core.c
@@ -26,7 +26,6 @@
 #include "hw/nvram/fw_cfg.h"
 #include "qemu/config-file.h"
 #include "qapi/opts-visitor.h"
-#include "qapi/dealloc-visitor.h"
 #include "qapi-visit.h"
 #include "qapi-event.h"

@@ -68,7 +67,7 @@ static void acpi_register_config(void)
    qemu_add_opts(&qemu_acpi_opts);
 }

-machine_init(acpi_register_config);
+opts_init(acpi_register_config);

 static int acpi_checksum(const uint8_t *data, int len)
 {
@@ -297,15 +296,7 @@ void acpi_table_add(const QemuOpts *opts, Error **errp)
 out:
    g_free(blob);
    g_strfreev(pathnames);
-
-    if (hdrs != NULL) {
-        QapiDeallocVisitor *dv;
-
-        dv = qapi_dealloc_visitor_new();
-        visit_type_AcpiTableOptions(qapi_dealloc_get_visitor(dv), NULL, &hdrs,
-                                    NULL);
-        qapi_dealloc_visitor_cleanup(dv);
-    }
+    qapi_free_AcpiTableOptions(hdrs);

    error_propagate(errp, err);
 }
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -29,6 +29,8 @@
 #include "qemu/osdep.h"
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/aml-build.h"
+#include "hw/acpi/bios-linker-loader.h"
+#include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"

 static int nvdimm_plugged_device_list(Object *obj, void *opaque)
@@ -370,15 +372,131 @@ static void nvdimm_build_nfit(GSList *device_list, GArray *table_offsets,
    g_array_free(structures, true);
 }

+struct NvdimmDsmIn {
+    uint32_t handle;
+    uint32_t revision;
+    uint32_t function;
+    /* the remaining size in the page is used by arg3. */
+    union {
+        uint8_t arg3[0];
+    };
+} QEMU_PACKED;
+typedef struct NvdimmDsmIn NvdimmDsmIn;
+
+struct NvdimmDsmOut {
+    /* the size of buffer filled by QEMU. */
+    uint32_t len;
+    uint8_t data[0];
+} QEMU_PACKED;
+typedef struct NvdimmDsmOut NvdimmDsmOut;
+
+struct NvdimmDsmFunc0Out {
+    /* the size of buffer filled by QEMU. */
+     uint32_t len;
+     uint32_t supported_func;
+} QEMU_PACKED;
+typedef struct NvdimmDsmFunc0Out NvdimmDsmFunc0Out;
+
+struct NvdimmDsmFuncNoPayloadOut {
+    /* the size of buffer filled by QEMU. */
+     uint32_t len;
+     uint32_t func_ret_status;
+} QEMU_PACKED;
+typedef struct NvdimmDsmFuncNoPayloadOut NvdimmDsmFuncNoPayloadOut;
+
+static uint64_t
+nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
+{
+    nvdimm_debug("BUG: we never read _DSM IO Port.\n");
+    return 0;
+}
+
+static void
+nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
+{
+    NvdimmDsmIn *in;
+    hwaddr dsm_mem_addr = val;
+
+    nvdimm_debug("dsm memory address %#" HWADDR_PRIx ".\n", dsm_mem_addr);
+
+    /*
+     * The DSM memory is mapped to guest address space so an evil guest
+     * can change its content while we are doing DSM emulation. Avoid
+     * this by copying DSM memory to QEMU local memory.
+     */
+    in = g_malloc(TARGET_PAGE_SIZE);
+    cpu_physical_memory_read(dsm_mem_addr, in, TARGET_PAGE_SIZE);
+
+    le32_to_cpus(&in->revision);
+    le32_to_cpus(&in->function);
+    le32_to_cpus(&in->handle);
+
+    nvdimm_debug("Revision %#x Handler %#x Function %#x.\n", in->revision,
+                 in->handle, in->function);
+
+    /*
+     * function 0 is called to inquire which functions are supported by
+     * OSPM
+     */
+    if (in->function == 0) {
+        NvdimmDsmFunc0Out func0 = {
+            .len = cpu_to_le32(sizeof(func0)),
+             /* No function supported other than function 0 */
+            .supported_func = cpu_to_le32(0),
+        };
+        cpu_physical_memory_write(dsm_mem_addr, &func0, sizeof func0);
+    } else {
+        /* No function except function 0 is supported yet. */
+        NvdimmDsmFuncNoPayloadOut out = {
+            .len = cpu_to_le32(sizeof(out)),
+            .func_ret_status = cpu_to_le32(1)  /* Not Supported */,
+        };
+        cpu_physical_memory_write(dsm_mem_addr, &out, sizeof(out));
+    }
+
+    g_free(in);
+}
+
+static const MemoryRegionOps nvdimm_dsm_ops = {
+    .read = nvdimm_dsm_read,
+    .write = nvdimm_dsm_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
+                            FWCfgState *fw_cfg, Object *owner)
+{
+    memory_region_init_io(&state->io_mr, owner, &nvdimm_dsm_ops, state,
+                          "nvdimm-acpi-io", NVDIMM_ACPI_IO_LEN);
+    memory_region_add_subregion(io, NVDIMM_ACPI_IO_BASE, &state->io_mr);
+
+    state->dsm_mem = g_array_new(false, true /* clear */, 1);
+    acpi_data_push(state->dsm_mem, TARGET_PAGE_SIZE);
+    fw_cfg_add_file(fw_cfg, NVDIMM_DSM_MEM_FILE, state->dsm_mem->data,
+                    state->dsm_mem->len);
+}
+
 #define NVDIMM_COMMON_DSM      "NCAL"
+#define NVDIMM_ACPI_MEM_ADDR   "MEMA"

 static void nvdimm_build_common_dsm(Aml *dev)
 {
-    Aml *method, *ifctx, *function;
+    Aml *method, *ifctx, *function, *dsm_mem, *unpatched, *result_size;
    uint8_t byte_list[1];

-    method = aml_method(NVDIMM_COMMON_DSM, 4, AML_NOTSERIALIZED);
+    method = aml_method(NVDIMM_COMMON_DSM, 4, AML_SERIALIZED);
    function = aml_arg(2);
+    dsm_mem = aml_name(NVDIMM_ACPI_MEM_ADDR);
+
+    /*
+     * do not support any method if DSM memory address has not been
+     * patched.
+     */
+    unpatched = aml_if(aml_equal(dsm_mem, aml_int(0x0)));

    /*
     * function 0 is called to inquire what functions are supported by
@@ -387,12 +505,38 @@ static void nvdimm_build_common_dsm(Aml *dev)
    ifctx = aml_if(aml_equal(function, aml_int(0)));
    byte_list[0] = 0 /* No function Supported */;
    aml_append(ifctx, aml_return(aml_buffer(1, byte_list)));
-    aml_append(method, ifctx);
+    aml_append(unpatched, ifctx);

    /* No function is supported yet. */
    byte_list[0] = 1 /* Not Supported */;
-    aml_append(method, aml_return(aml_buffer(1, byte_list)));
+    aml_append(unpatched, aml_return(aml_buffer(1, byte_list)));
+    aml_append(method, unpatched);

+    /*
+     * HDLE indicates which device the DSM function is issued from.
+     * It is not used at this time as no function is supported yet.
+     * Currently it is always set to 0 for all the devices; the appropriate
+     * value will be set once a real function is implemented.
+     */
+    aml_append(method, aml_store(aml_int(0x0), aml_name("HDLE")));
+    aml_append(method, aml_store(aml_arg(1), aml_name("REVS")));
+    aml_append(method, aml_store(aml_arg(2), aml_name("FUNC")));
+
+    /*
+     * Tell QEMU the real address of the DSM memory; QEMU then gets
+     * control and fills in the result in DSM memory.
+     */
+    aml_append(method, aml_store(dsm_mem, aml_name("NTFI")));
+
+    result_size = aml_local(1);
+    aml_append(method, aml_store(aml_name("RLEN"), result_size));
+    aml_append(method, aml_store(aml_shiftleft(result_size, aml_int(3)),
+                                 result_size));
+    aml_append(method, aml_create_field(aml_name("ODAT"), aml_int(0),
+                                        result_size, "OBUF"));
+    aml_append(method, aml_concatenate(aml_buffer(0, NULL), aml_name("OBUF"),
+                                       aml_arg(6)));
+    aml_append(method, aml_return(aml_arg(6)));
    aml_append(dev, method);
 }

@@ -435,7 +579,8 @@ static void nvdimm_build_nvdimm_devices(GSList *device_list, Aml *root_dev)
 static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
                              GArray *table_data, GArray *linker)
 {
-    Aml *ssdt, *sb_scope, *dev;
+    Aml *ssdt, *sb_scope, *dev, *field;
+    int mem_addr_offset, nvdimm_ssdt;

    acpi_add_table(table_offsets, table_data);

@@ -459,19 +604,89 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
     */
    aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0012")));

+    /* map DSM memory and IO into ACPI namespace. */
+    aml_append(dev, aml_operation_region("NPIO", AML_SYSTEM_IO,
+               aml_int(NVDIMM_ACPI_IO_BASE), NVDIMM_ACPI_IO_LEN));
+    aml_append(dev, aml_operation_region("NRAM", AML_SYSTEM_MEMORY,
+               aml_name(NVDIMM_ACPI_MEM_ADDR), TARGET_PAGE_SIZE));
+
+    /*
+     * DSM notifier:
+     * NTFI: write the address of DSM memory and notify QEMU to emulate
+     *       the access.
+     *
+     * It is an IO port, so accessing it causes a VM-exit and control is
+     * transferred to QEMU.
+     */
+    field = aml_field("NPIO", AML_DWORD_ACC, AML_NOLOCK, AML_PRESERVE);
+    aml_append(field, aml_named_field("NTFI",
+               sizeof(uint32_t) * BITS_PER_BYTE));
+    aml_append(dev, field);
+
+    /*
+     * DSM input:
+     * HDLE: store device's handle, it's zero if the _DSM call happens
+     *       on NVDIMM Root Device.
+     * REVS: store the Arg1 of _DSM call.
+     * FUNC: store the Arg2 of _DSM call.
+     * ARG3: store the Arg3 of _DSM call.
+     *
+     * They are mapped to RAM on the host, so these accesses never cause
+     * a VM-exit.
+     */
+    field = aml_field("NRAM", AML_DWORD_ACC, AML_NOLOCK, AML_PRESERVE);
+    aml_append(field, aml_named_field("HDLE",
+               sizeof(typeof_field(NvdimmDsmIn, handle)) * BITS_PER_BYTE));
+    aml_append(field, aml_named_field("REVS",
+               sizeof(typeof_field(NvdimmDsmIn, revision)) * BITS_PER_BYTE));
+    aml_append(field, aml_named_field("FUNC",
+               sizeof(typeof_field(NvdimmDsmIn, function)) * BITS_PER_BYTE));
+    aml_append(field, aml_named_field("ARG3",
+               (TARGET_PAGE_SIZE - offsetof(NvdimmDsmIn, arg3)) *
+                BITS_PER_BYTE));
+    aml_append(dev, field);
+
+    /*
+     * DSM output:
+     * RLEN: the size of the buffer filled by QEMU.
+     * ODAT: the buffer QEMU uses to store the result.
+     *
+     * Since the page is reused by both input and output, the input data
+     * will be lost after storing the new result into ODAT, so we should
+     * fetch all the input data before writing the result.
+     */
+    field = aml_field("NRAM", AML_DWORD_ACC, AML_NOLOCK, AML_PRESERVE);
+    aml_append(field, aml_named_field("RLEN",
+               sizeof(typeof_field(NvdimmDsmOut, len)) * BITS_PER_BYTE));
+    aml_append(field, aml_named_field("ODAT",
+               (TARGET_PAGE_SIZE - offsetof(NvdimmDsmOut, data)) *
+                     BITS_PER_BYTE));
+    aml_append(dev, field);
+
    nvdimm_build_common_dsm(dev);
    nvdimm_build_device_dsm(dev);

    nvdimm_build_nvdimm_devices(device_list, dev);

    aml_append(sb_scope, dev);
-
    aml_append(ssdt, sb_scope);
+
+    nvdimm_ssdt = table_data->len;
+
    /* copy AML table into ACPI tables blob and patch header there */
    g_array_append_vals(table_data, ssdt->buf->data, ssdt->buf->len);
+    mem_addr_offset = build_append_named_dword(table_data,
+                                               NVDIMM_ACPI_MEM_ADDR);
+
+    bios_linker_loader_alloc(linker, NVDIMM_DSM_MEM_FILE, TARGET_PAGE_SIZE,
+                             false /* high memory */);
+    bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
+                                   NVDIMM_DSM_MEM_FILE, table_data,
+                                   table_data->data + mem_addr_offset,
+                                   sizeof(uint32_t));
    build_header(linker, table_data,
-        (void *)(table_data->data + table_data->len - ssdt->buf->len),
-        "SSDT", ssdt->buf->len, 1, NULL, "NVDIMM");
+        (void *)(table_data->data + nvdimm_ssdt),
+        "SSDT", table_data->len - nvdimm_ssdt, 1, NULL, "NVDIMM");
    free_aml_allocator();
 }

--- a/hw/alpha/dp264.c
+++ b/hw/alpha/dp264.c
@@ -111,7 +111,7 @@ static void clipper_init(MachineState *machine)
    }
    size = load_elf(palcode_filename, cpu_alpha_superpage_to_phys,
                    NULL, &palcode_entry, &palcode_low, &palcode_high,
-                    0, EM_ALPHA, 0);
+                    0, EM_ALPHA, 0, 0);
    if (size < 0) {
        error_report("could not load palcode '%s'", palcode_filename);
        exit(1);
@@ -131,7 +131,7 @@ static void clipper_init(MachineState *machine)

        size = load_elf(kernel_filename, cpu_alpha_superpage_to_phys,
                        NULL, &kernel_entry, &kernel_low, &kernel_high,
-                        0, EM_ALPHA, 0);
+                        0, EM_ALPHA, 0, 0);
        if (size < 0) {
            error_report("could not load kernel '%s'", kernel_filename);
            exit(1);
--- a/hw/arm/Makefile.objs
+++ b/hw/arm/Makefile.objs
@@ -16,3 +16,4 @@ obj-$(CONFIG_STM32F205_SOC) += stm32f205_soc.o
 obj-$(CONFIG_XLNX_ZYNQMP) += xlnx-zynqmp.o xlnx-ep108.o
 obj-$(CONFIG_FSL_IMX25) += fsl-imx25.o imx25_pdk.o
 obj-$(CONFIG_FSL_IMX31) += fsl-imx31.o kzm.o
+obj-$(CONFIG_ASPEED_SOC) += ast2400.o palmetto-bmc.o
--- a/hw/arm/armv7m.c
+++ b/hw/arm/armv7m.c
@@ -211,7 +211,7 @@ DeviceState *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq,

    if (kernel_filename) {
        image_size = load_elf(kernel_filename, NULL, NULL, &entry, &lowaddr,
-                              NULL, big_endian, EM_ARM, 1);
+                              NULL, big_endian, EM_ARM, 1, 0);
        if (image_size < 0) {
            image_size = load_image_targphys(kernel_filename, 0, mem_size);
            lowaddr = 0;
--- a/hw/arm/ast2400.c
+++ b/hw/arm/ast2400.c
@@ -0,0 +1,137 @@
+/*
+ * AST2400 SoC
+ *
+ * Andrew Jeffery <andrew@aj.id.au>
+ * Jeremy Kerr <jk@ozlabs.org>
+ *
+ * Copyright 2016 IBM Corp.
+ *
+ * This code is licensed under the GPL version 2 or later.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "exec/address-spaces.h"
+#include "hw/arm/ast2400.h"
+#include "hw/char/serial.h"
+
+#define AST2400_UART_5_BASE      0x00184000
+#define AST2400_IOMEM_SIZE       0x00200000
+#define AST2400_IOMEM_BASE       0x1E600000
+#define AST2400_VIC_BASE         0x1E6C0000
+#define AST2400_TIMER_BASE       0x1E782000
+
+static const int uart_irqs[] = { 9, 32, 33, 34, 10 };
+static const int timer_irqs[] = { 16, 17, 18, 35, 36, 37, 38, 39, };
+
+/*
+ * IO handlers: simply catch any reads/writes to IO addresses that aren't
+ * handled by a device mapping.
+ */
+
+static uint64_t ast2400_io_read(void *p, hwaddr offset, unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " [%u]\n",
+                  __func__, offset, size);
+    return 0;
+}
+
+static void ast2400_io_write(void *opaque, hwaddr offset, uint64_t value,
+                unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " <- 0x%" PRIx64 " [%u]\n",
+                  __func__, offset, value, size);
+}
+
+static const MemoryRegionOps ast2400_io_ops = {
+    .read = ast2400_io_read,
+    .write = ast2400_io_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void ast2400_init(Object *obj)
+{
+    AST2400State *s = AST2400(obj);
+
+    s->cpu = cpu_arm_init("arm926");
+
+    object_initialize(&s->vic, sizeof(s->vic), TYPE_ASPEED_VIC);
+    object_property_add_child(obj, "vic", OBJECT(&s->vic), NULL);
+    qdev_set_parent_bus(DEVICE(&s->vic), sysbus_get_default());
+
+    object_initialize(&s->timerctrl, sizeof(s->timerctrl), TYPE_ASPEED_TIMER);
+    object_property_add_child(obj, "timerctrl", OBJECT(&s->timerctrl), NULL);
+    qdev_set_parent_bus(DEVICE(&s->timerctrl), sysbus_get_default());
+}
+
+static void ast2400_realize(DeviceState *dev, Error **errp)
+{
+    int i;
+    AST2400State *s = AST2400(dev);
+    Error *err = NULL;
+
+    /* IO space */
+    memory_region_init_io(&s->iomem, NULL, &ast2400_io_ops, NULL,
+            "ast2400.io", AST2400_IOMEM_SIZE);
+    memory_region_add_subregion_overlap(get_system_memory(), AST2400_IOMEM_BASE,
+            &s->iomem, -1);
+
+    /* VIC */
+    object_property_set_bool(OBJECT(&s->vic), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->vic), 0, AST2400_VIC_BASE);
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->vic), 0,
+                       qdev_get_gpio_in(DEVICE(s->cpu), ARM_CPU_IRQ));
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->vic), 1,
+                       qdev_get_gpio_in(DEVICE(s->cpu), ARM_CPU_FIQ));
+
+    /* Timer */
+    object_property_set_bool(OBJECT(&s->timerctrl), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->timerctrl), 0, AST2400_TIMER_BASE);
+    for (i = 0; i < ARRAY_SIZE(timer_irqs); i++) {
+        qemu_irq irq = qdev_get_gpio_in(DEVICE(&s->vic), timer_irqs[i]);
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->timerctrl), i, irq);
+    }
+
+    /* UART - attach an 8250 to the IO space as our UART5 */
+    if (serial_hds[0]) {
+        qemu_irq uart5 = qdev_get_gpio_in(DEVICE(&s->vic), uart_irqs[4]);
+        serial_mm_init(&s->iomem, AST2400_UART_5_BASE, 2,
+                       uart5, 38400, serial_hds[0], DEVICE_LITTLE_ENDIAN);
+    }
+}
+
+static void ast2400_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+
+    dc->realize = ast2400_realize;
+
+    /*
+     * Reason: creates an ARM CPU, thus use after free(), see
+     * arm_cpu_class_init()
+     */
+    dc->cannot_destroy_with_object_finalize_yet = true;
+}
+
+static const TypeInfo ast2400_type_info = {
+    .name = TYPE_AST2400,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(AST2400State),
+    .instance_init = ast2400_init,
+    .class_init = ast2400_class_init,
+};
+
+static void ast2400_register_types(void)
+{
+    type_register_static(&ast2400_type_info);
+}
+
+type_init(ast2400_register_types)
--- a/hw/arm/bcm2835_peripherals.c
+++ b/hw/arm/bcm2835_peripherals.c
@@ -12,6 +12,7 @@
 #include "hw/arm/bcm2835_peripherals.h"
 #include "hw/misc/bcm2835_mbox_defs.h"
 #include "hw/arm/raspi_platform.h"
+#include "sysemu/char.h"

 /* Peripheral base address on the VC (GPU) system bus */
 #define BCM2835_VC_PERI_BASE 0x7e000000
@@ -48,6 +49,11 @@ static void bcm2835_peripherals_init(Object *obj)
    object_property_add_child(obj, "uart0", OBJECT(s->uart0), NULL);
    qdev_set_parent_bus(DEVICE(s->uart0), sysbus_get_default());

+    /* AUX / UART1 */
+    object_initialize(&s->aux, sizeof(s->aux), TYPE_BCM2835_AUX);
+    object_property_add_child(obj, "aux", OBJECT(&s->aux), NULL);
+    qdev_set_parent_bus(DEVICE(&s->aux), sysbus_get_default());
+
    /* Mailboxes */
    object_initialize(&s->mboxes, sizeof(s->mboxes), TYPE_BCM2835_MBOX);
    object_property_add_child(obj, "mbox", OBJECT(&s->mboxes), NULL);
@@ -56,6 +62,16 @@ static void bcm2835_peripherals_init(Object *obj)
    object_property_add_const_link(OBJECT(&s->mboxes), "mbox-mr",
                                   OBJECT(&s->mbox_mr), &error_abort);

+    /* Framebuffer */
+    object_initialize(&s->fb, sizeof(s->fb), TYPE_BCM2835_FB);
+    object_property_add_child(obj, "fb", OBJECT(&s->fb), NULL);
+    object_property_add_alias(obj, "vcram-size", OBJECT(&s->fb), "vcram-size",
+                              &error_abort);
+    qdev_set_parent_bus(DEVICE(&s->fb), sysbus_get_default());
+
+    object_property_add_const_link(OBJECT(&s->fb), "dma-mr",
+                                   OBJECT(&s->gpu_bus_mr), &error_abort);
+
    /* Property channel */
    object_initialize(&s->property, sizeof(s->property), TYPE_BCM2835_PROPERTY);
    object_property_add_child(obj, "property", OBJECT(&s->property), NULL);
@@ -63,6 +79,8 @@ static void bcm2835_peripherals_init(Object *obj)
                              "board-rev", &error_abort);
    qdev_set_parent_bus(DEVICE(&s->property), sysbus_get_default());

+    object_property_add_const_link(OBJECT(&s->property), "fb",
+                                   OBJECT(&s->fb), &error_abort);
    object_property_add_const_link(OBJECT(&s->property), "dma-mr",
                                   OBJECT(&s->gpu_bus_mr), &error_abort);

@@ -70,6 +88,14 @@ static void bcm2835_peripherals_init(Object *obj)
    object_initialize(&s->sdhci, sizeof(s->sdhci), TYPE_SYSBUS_SDHCI);
    object_property_add_child(obj, "sdhci", OBJECT(&s->sdhci), NULL);
    qdev_set_parent_bus(DEVICE(&s->sdhci), sysbus_get_default());
+
+    /* DMA Channels */
+    object_initialize(&s->dma, sizeof(s->dma), TYPE_BCM2835_DMA);
+    object_property_add_child(obj, "dma", OBJECT(&s->dma), NULL);
+    qdev_set_parent_bus(DEVICE(&s->dma), sysbus_get_default());
+
+    object_property_add_const_link(OBJECT(&s->dma), "dma-mr",
+                                   OBJECT(&s->gpu_bus_mr), &error_abort);
 }

 static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
@@ -78,7 +104,8 @@ static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
    Object *obj;
    MemoryRegion *ram;
    Error *err = NULL;
-    uint32_t ram_size;
+    uint32_t ram_size, vcram_size;
+    CharDriverState *chr;
    int n;

    obj = object_property_get_link(OBJECT(dev), "ram", &err);
@@ -131,6 +158,29 @@ static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
        qdev_get_gpio_in_named(DEVICE(&s->ic), BCM2835_IC_GPU_IRQ,
                               INTERRUPT_UART));

+    /* AUX / UART1 */
+    /* TODO: don't call qemu_char_get_next_serial() here, instead set
+     * chardev properties for each uart at the board level, once pl011
+     * (uart0) has been updated to avoid qemu_char_get_next_serial()
+     */
+    chr = qemu_char_get_next_serial();
+    if (chr == NULL) {
+        chr = qemu_chr_new("bcm2835.uart1", "null", NULL);
+    }
+    qdev_prop_set_chr(DEVICE(&s->aux), "chardev", chr);
+
+    object_property_set_bool(OBJECT(&s->aux), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    memory_region_add_subregion(&s->peri_mr, UART1_OFFSET,
+                sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->aux), 0));
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->aux), 0,
+        qdev_get_gpio_in_named(DEVICE(&s->ic), BCM2835_IC_GPU_IRQ,
+                               INTERRUPT_AUX));
+
    /* Mailboxes */
    object_property_set_bool(OBJECT(&s->mboxes), true, "realized", &err);
    if (err) {
@@ -144,13 +194,33 @@ static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
        qdev_get_gpio_in_named(DEVICE(&s->ic), BCM2835_IC_ARM_IRQ,
                               INTERRUPT_ARM_MAILBOX));

-    /* Property channel */
-    object_property_set_int(OBJECT(&s->property), ram_size, "ram-size", &err);
+    /* Framebuffer */
+    vcram_size = (uint32_t)object_property_get_int(OBJECT(s), "vcram-size",
+                                                   &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }

+    object_property_set_int(OBJECT(&s->fb), ram_size - vcram_size,
+                            "vcram-base", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    object_property_set_bool(OBJECT(&s->fb), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    memory_region_add_subregion(&s->mbox_mr, MBOX_CHAN_FB << MBOX_AS_CHAN_SHIFT,
+                sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->fb), 0));
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->fb), 0,
+                       qdev_get_gpio_in(DEVICE(&s->mboxes), MBOX_CHAN_FB));
+
+    /* Property channel */
    object_property_set_bool(OBJECT(&s->property), true, "realized", &err);
    if (err) {
        error_propagate(errp, err);
@@ -171,6 +241,13 @@ static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
        return;
    }

+    object_property_set_bool(OBJECT(&s->sdhci), true, "pending-insert-quirk",
+                             &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
    object_property_set_bool(OBJECT(&s->sdhci), true, "realized", &err);
    if (err) {
        error_propagate(errp, err);
@@ -189,6 +266,24 @@ static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
        return;
    }

+    /* DMA Channels */
+    object_property_set_bool(OBJECT(&s->dma), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    memory_region_add_subregion(&s->peri_mr, DMA_OFFSET,
+                sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->dma), 0));
+    memory_region_add_subregion(&s->peri_mr, DMA15_OFFSET,
+                sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->dma), 1));
+
+    for (n = 0; n <= 12; n++) {
+        sysbus_connect_irq(SYS_BUS_DEVICE(&s->dma), n,
+                           qdev_get_gpio_in_named(DEVICE(&s->ic),
+                                                  BCM2835_IC_GPU_IRQ,
+                                                  INTERRUPT_DMA0 + n));
+    }
 }

 static void bcm2835_peripherals_class_init(ObjectClass *oc, void *data)
@@ -196,6 +291,8 @@ static void bcm2835_peripherals_class_init(ObjectClass *oc, void *data)
    DeviceClass *dc = DEVICE_CLASS(oc);

    dc->realize = bcm2835_peripherals_realize;
+    /* Reason: realize() method uses qemu_char_get_next_serial() */
+    dc->cannot_instantiate_with_device_add_yet = true;
 }

 static const TypeInfo bcm2835_peripherals_type_info = {
--- a/hw/arm/bcm2836.c
+++ b/hw/arm/bcm2836.c
@@ -42,6 +42,8 @@ static void bcm2836_init(Object *obj)
                              &error_abort);
    object_property_add_alias(obj, "board-rev", OBJECT(&s->peripherals),
                              "board-rev", &error_abort);
+    object_property_add_alias(obj, "vcram-size", OBJECT(&s->peripherals),
+                              "vcram-size", &error_abort);
    qdev_set_parent_bus(DEVICE(&s->peripherals), sysbus_get_default());
 }

--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -518,9 +518,34 @@ static void do_cpu_reset(void *opaque)
    cpu_reset(cs);
    if (info) {
        if (!info->is_linux) {
+            int i;
            /* Jump to the entry point.  */
            uint64_t entry = info->entry;

+            switch (info->endianness) {
+            case ARM_ENDIANNESS_LE:
+                env->cp15.sctlr_el[1] &= ~SCTLR_E0E;
+                for (i = 1; i < 4; ++i) {
+                    env->cp15.sctlr_el[i] &= ~SCTLR_EE;
+                }
+                env->uncached_cpsr &= ~CPSR_E;
+                break;
+            case ARM_ENDIANNESS_BE8:
+                env->cp15.sctlr_el[1] |= SCTLR_E0E;
+                for (i = 1; i < 4; ++i) {
+                    env->cp15.sctlr_el[i] |= SCTLR_EE;
+                }
+                env->uncached_cpsr |= CPSR_E;
+                break;
+            case ARM_ENDIANNESS_BE32:
+                env->cp15.sctlr_el[1] |= SCTLR_B;
+                break;
+            case ARM_ENDIANNESS_UNKNOWN:
+                break; /* Board's decision */
+            default:
+                g_assert_not_reached();
+            }
+
            if (!env->aarch64) {
                env->thumb = info->entry & 1;
                entry &= 0xfffffffe;
@@ -638,6 +663,62 @@ static int do_arm_linux_init(Object *obj, void *opaque)
    return 0;
 }

+/* Load info->kernel_filename as ELF, setting info->endianness from its header.
+ * Returns load_elf()'s positive result, or (uint64_t)-1 if load_elf_hdr()
+ * reports an error; exits if the header loads but the image doesn't.
+ */
+static uint64_t arm_load_elf(struct arm_boot_info *info, uint64_t *pentry,
+                             uint64_t *lowaddr, uint64_t *highaddr,
+                             int elf_machine)
+{
+    bool elf_is64, big_endian;
+    union {
+        Elf32_Ehdr h32;
+        Elf64_Ehdr h64;
+    } elf_header;
+    int data_swab = 0;
+    int64_t ret = -1; /* signed: "ret <= 0" must catch negative results */
+    Error *err = NULL;
+
+    load_elf_hdr(info->kernel_filename, &elf_header, &elf_is64, &err);
+    if (err) {
+        error_free(err); /* failure is signalled by ret; don't leak err */
+        return ret;
+    }
+
+    if (elf_is64) {
+        big_endian = elf_header.h64.e_ident[EI_DATA] == ELFDATA2MSB;
+        info->endianness = big_endian ? ARM_ENDIANNESS_BE8 : ARM_ENDIANNESS_LE;
+    } else {
+        big_endian = elf_header.h32.e_ident[EI_DATA] == ELFDATA2MSB;
+        if (!big_endian) {
+            info->endianness = ARM_ENDIANNESS_LE;
+        } else if (bswap32(elf_header.h32.e_flags) & EF_ARM_BE8) {
+            info->endianness = ARM_ENDIANNESS_BE8;
+        } else {
+            info->endianness = ARM_ENDIANNESS_BE32;
+            /* In BE32, the CPU has a different view of the per-byte
+             * address map than the rest of the system. BE32 ELF files
+             * are organised such that they can be programmed through
+             * the CPU's per-word byte-reversed view of the world. QEMU
+             * however loads ELF files independently of the CPU. So
+             * tell the ELF loader to byte reverse the data for us.
+             */
+            data_swab = 2;
+        }
+    }
+
+    ret = load_elf(info->kernel_filename, NULL, NULL,
+                   pentry, lowaddr, highaddr, big_endian, elf_machine,
+                   1, data_swab);
+    if (ret <= 0) {
+        /* The header loaded but the image didn't */
+        exit(1);
+    }
+
+    return ret;
+}
+
 static void arm_load_kernel_notify(Notifier *notifier, void *data)
 {
    CPUState *cs;
@@ -647,7 +728,6 @@ static void arm_load_kernel_notify(Notifier *notifier, void *data)
    uint64_t elf_entry, elf_low_addr, elf_high_addr;
    int elf_machine;
    hwaddr entry, kernel_load_offset;
-    int big_endian;
    static const ARMInsnFixup *primary_loader;
    ArmLoadKernelNotifier *n = DO_UPCAST(ArmLoadKernelNotifier,
                                         notifier, notifier);
@@ -733,12 +813,6 @@ static void arm_load_kernel_notify(Notifier *notifier, void *data)
    if (info->nb_cpus == 0)
        info->nb_cpus = 1;

-#ifdef TARGET_WORDS_BIGENDIAN
-    big_endian = 1;
-#else
-    big_endian = 0;
-#endif
-
    /* We want to put the initrd far enough into RAM that when the
     * kernel is uncompressed it will not clobber the initrd. However
     * on boards without much RAM we must ensure that we still leave
@@ -753,9 +827,8 @@ static void arm_load_kernel_notify(Notifier *notifier, void *data)
        MIN(info->ram_size / 2, 128 * 1024 * 1024);

    /* Assume that raw images are linux kernels, and ELF images are not.  */
-    kernel_size = load_elf(info->kernel_filename, NULL, NULL, &elf_entry,
-                           &elf_low_addr, &elf_high_addr, big_endian,
-                           elf_machine, 1);
+    kernel_size = arm_load_elf(info, &elf_entry, &elf_low_addr,
+                               &elf_high_addr, elf_machine);
    if (kernel_size > 0 && have_dtb(info)) {
        /* If there is still some room left at the base of RAM, try and put
         * the DTB there like we do for images loaded with -bios or -pflash.
--- a/Show More
+++ b/Show More