seabios: update binaries to release 1.9.0

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
seabios: stop updating aml files
2016-01-05 13:04:15 +01:00 · 2016-01-05 13:04:14 +01:00 · 2016-01-05 13:04:14 +01:00 · 2016-01-05 13:04:14 +01:00 · 2016-01-05 13:01:02 +01:00 · 2015-12-23 13:53:32 +00:00
937 changed files with 60226 additions and 12664 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,8 @@
 /*-darwin-user
 /*-linux-user
 /*-bsd-user
+/ivshmem-client
+/ivshmem-server
 /libdis*
 /libuser
 /linux-headers/asm
--- a/91
+++ b/91
@@ -62,14 +62,22 @@ Guest CPU cores (TCG):
 ----------------------
 Overall
 L: qemu-devel@nongnu.org
-S: Odd fixes
+M: Paolo Bonzini <pbonzini@redhat.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
+M: Richard Henderson <rth@twiddle.net>
+S: Maintained
 F: cpu-exec.c
+F: cpu-exec-common.c
+F: cpus.c
 F: cputlb.c
+F: exec.c
 F: softmmu_template.h
-F: translate-all.c
-F: include/exec/cpu_ldst.h
-F: include/exec/cpu_ldst_template.h
+F: translate-all.*
+F: translate-common.c
+F: include/exec/cpu*.h
+F: include/exec/exec-all.h
 F: include/exec/helper*.h
+F: include/exec/tb-hash.h

 Alpha
 M: Richard Henderson <rth@twiddle.net>
@@ -81,6 +89,7 @@ F: disas/alpha.c

 ARM
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: target-arm/
 F: hw/arm/
@@ -216,6 +225,7 @@ F: */kvm.*

 ARM
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: target-arm/kvm.c

@@ -235,9 +245,14 @@ M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Alexander Graf <agraf@suse.de>
 S: Maintained
 F: target-s390x/kvm.c
+F: target-s390x/ioinst.[ch]
+F: target-s390x/machine.c
 F: hw/intc/s390_flic.c
 F: hw/intc/s390_flic_kvm.c
 F: include/hw/s390x/s390_flic.h
+F: gdb-xml/s390*.xml
+T: git git://github.com/cohuck/qemu.git s390-next
+T: git git://github.com/borntraeger/qemu.git s390-next

 X86
 M: Paolo Bonzini <pbonzini@redhat.com>
@@ -287,6 +302,7 @@ ARM Machines
 ------------
 Allwinner-a10
 M: Beniamino Galvani <b.galvani@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/allwinner*
 F: include/hw/*/allwinner*
@@ -294,6 +310,7 @@ F: hw/arm/cubieboard.c

 ARM PrimeCell
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/char/pl011.c
 F: hw/display/pl110*
@@ -308,6 +325,7 @@ F: include/hw/arm/primecell.h

 ARM cores
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/intc/arm*
 F: hw/intc/gic_internal.h
@@ -327,54 +345,64 @@ M: Evgeny Voevodin <e.voevodin@samsung.com>
 M: Maksim Kozlov <m.kozlov@samsung.com>
 M: Igor Mitsyanko <i.mitsyanko@gmail.com>
 M: Dmitry Solodkiy <d.solodkiy@samsung.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/exynos*

 Calxeda Highbank
 M: Rob Herring <robh@kernel.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/highbank.c
 F: hw/net/xgmac.c

 Canon DIGIC
 M: Antony Pavlov <antonynpavlov@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: include/hw/arm/digic.h
 F: hw/*/digic*

 Gumstix
 L: qemu-devel@nongnu.org
+L: qemu-arm@nongnu.org
 S: Orphan
 F: hw/arm/gumstix.c

 i.MX31
 M: Peter Chubb <peter.chubb@nicta.com.au>
+L: qemu-arm@nongnu.org
 S: Odd fixes
 F: hw/*/imx*
 F: hw/arm/kzm.c

 Integrator CP
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/integratorcp.c

 Musicpal
 M: Jan Kiszka <jan.kiszka@web.de>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/musicpal.c

 nSeries
 M: Andrzej Zaborowski <balrogg@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/nseries.c

 Palm
 M: Andrzej Zaborowski <balrogg@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/palm.c

 Real View
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/realview*
 F: hw/intc/realview_gic.c
@@ -382,6 +410,7 @@ F: include/hw/intc/realview_gic.h

 PXA2XX
 M: Andrzej Zaborowski <balrogg@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/mainstone.c
 F: hw/arm/spitz.c
@@ -391,17 +420,20 @@ F: hw/*/pxa2xx*

 Stellaris
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/stellaris*

 Versatile PB
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/versatile*

 Xilinx Zynq
 M: Alistair Francis <alistair.francis@xilinx.com>
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/xilinx_zynq.c
 F: hw/misc/zynq_slcr.c
@@ -411,6 +443,7 @@ F: hw/ssi/xilinx_spips.c
 Xilinx ZynqMP
 M: Alistair Francis <alistair.francis@xilinx.com>
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/xlnx-zynqmp.c
 F: hw/arm/xlnx-ep108.c
@@ -419,6 +452,7 @@ F: include/hw/arm/xlnx-zynqmp.h
 ARM ACPI Subsystem
 M: Shannon Zhao <zhaoshenglong@huawei.com>
 M: Shannon Zhao <shannon.zhao@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/virt-acpi-build.c
 F: include/hw/arm/virt-acpi-build.h
@@ -616,15 +650,13 @@ M: Christian Borntraeger <borntraeger@de.ibm.com>
 M: Alexander Graf <agraf@suse.de>
 S: Supported
 F: hw/char/sclp*.[hc]
-F: hw/s390x/s390-virtio-ccw.c
-F: hw/s390x/css.[hc]
-F: hw/s390x/sclp*.[hc]
-F: hw/s390x/ipl*.[hc]
-F: hw/s390x/*pci*.[hc]
-F: hw/s390x/s390-skeys*.c
+F: hw/s390x/
+X: hw/s390x/s390-virtio-bus.[ch]
 F: include/hw/s390x/
 F: pc-bios/s390-ccw/
-T: git git://github.com/cohuck/qemu virtio-ccw-upstr
+F: hw/watchdog/wdt_diag288.c
+T: git git://github.com/cohuck/qemu.git s390-next
+T: git git://github.com/borntraeger/qemu.git s390-next

 UniCore32 Machines
 -------------
@@ -831,6 +863,7 @@ F: net/vhost-user.c

 virtio-9p
 M: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+M: Greg Kurz <gkurz@linux.vnet.ibm.com>
 S: Supported
 F: hw/9pfs/
 F: fsdev/
@@ -851,7 +884,8 @@ M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Christian Borntraeger <borntraeger@de.ibm.com>
 S: Supported
 F: hw/s390x/virtio-ccw.[hc]
-T: git git://github.com/cohuck/qemu virtio-ccw-upstr
+T: git git://github.com/cohuck/qemu.git s390-next
+T: git git://github.com/borntraeger/qemu.git s390-next

 virtio-input
 M: Gerd Hoffmann <kraxel@redhat.com>
@@ -907,6 +941,13 @@ M: Jiri Pirko <jiri@resnulli.us>
 S: Maintained
 F: hw/net/rocker/

+NVDIMM
+M: Xiao Guangrong <guangrong.xiao@linux.intel.com>
+S: Maintained
+F: hw/acpi/nvdimm.c
+F: hw/mem/nvdimm.c
+F: include/hw/mem/nvdimm.h
+
 Subsystems
 ----------
 Audio
@@ -1017,6 +1058,7 @@ S: Supported
 F: include/exec/ioport.h
 F: ioport.c
 F: include/exec/memory.h
+F: include/exec/ram_addr.h
 F: memory.c
 F: include/exec/memory-internal.h
 F: exec.c
@@ -1139,6 +1181,8 @@ F: include/qom/
 X: include/qom/cpu.h
 F: qom/
 X: qom/cpu.c
+F: tests/check-qom-interface.c
+F: tests/check-qom-proplist.c
 F: tests/qom-test.c

 QMP
@@ -1193,6 +1237,26 @@ F: crypto/
 F: include/crypto/
 F: tests/test-crypto-*

+Coroutines
+M: Stefan Hajnoczi <stefanha@redhat.com>
+M: Kevin Wolf <kwolf@redhat.com>
+F: util/*coroutine*
+F: include/qemu/coroutine*
+F: tests/test-coroutine.c
+
+Buffers
+M: Daniel P. Berrange <berrange@redhat.com>
+S: Odd fixes
+F: util/buffer.c
+F: include/qemu/buffer.h
+
+I/O Channels
+M: Daniel P. Berrange <berrange@redhat.com>
+S: Maintained
+F: io/
+F: include/io/
+F: tests/test-io-*
+
 Usermode Emulation
 ------------------
 Overall
@@ -1222,6 +1286,7 @@ AArch64 target
 M: Claudio Fontana <claudio.fontana@huawei.com>
 M: Claudio Fontana <claudio.fontana@gmail.com>
 S: Maintained
+L: qemu-arm@nongnu.org
 F: tcg/aarch64/
 F: disas/arm-a64.cc
 F: disas/libvixl/
@@ -1229,6 +1294,7 @@ F: disas/libvixl/
 ARM target
 M: Andrzej Zaborowski <balrogg@gmail.com>
 S: Maintained
+L: qemu-arm@nongnu.org
 F: tcg/arm/
 F: disas/arm.c

@@ -1431,6 +1497,7 @@ M: Denis V. Lunev <den@openvz.org>
 L: qemu-block@nongnu.org
 S: Supported
 F: block/parallels.c
+F: docs/specs/parallels.txt

 qed
 M: Stefan Hajnoczi <stefanha@redhat.com>
--- a/28
+++ b/28
@@ -151,12 +151,15 @@ dummy := $(call unnest-vars,, \
                stub-obj-y \
                util-obj-y \
                qga-obj-y \
+                ivshmem-client-obj-y \
+                ivshmem-server-obj-y \
                qga-vss-dll-obj-y \
                block-obj-y \
                block-obj-m \
                crypto-obj-y \
                crypto-aes-obj-y \
                qom-obj-y \
+                io-obj-y \
                common-obj-y \
                common-obj-m)

@@ -176,6 +179,7 @@ SOFTMMU_SUBDIR_RULES=$(filter %-softmmu,$(SUBDIR_RULES))

 $(SOFTMMU_SUBDIR_RULES): $(block-obj-y)
 $(SOFTMMU_SUBDIR_RULES): $(crypto-obj-y)
+$(SOFTMMU_SUBDIR_RULES): $(io-obj-y)
 $(SOFTMMU_SUBDIR_RULES): config-all-devices.mak

 subdir-%:
@@ -267,7 +271,8 @@ $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)

 qapi-modules = $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/qapi/common.json \
               $(SRC_PATH)/qapi/block.json $(SRC_PATH)/qapi/block-core.json \
-               $(SRC_PATH)/qapi/event.json $(SRC_PATH)/qapi/introspect.json
+               $(SRC_PATH)/qapi/event.json $(SRC_PATH)/qapi/introspect.json \
+               $(SRC_PATH)/qapi/crypto.json

 qapi-types.c qapi-types.h :\
 $(qapi-modules) $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
@@ -298,18 +303,15 @@ $(qapi-modules) $(SRC_PATH)/scripts/qapi-introspect.py $(qapi-py)
 QGALIB_GEN=$(addprefix qga/qapi-generated/, qga-qapi-types.h qga-qapi-visit.h qga-qmp-commands.h)
 $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)

-# we require QGA_VSS_PROVIDER files to be built alongside qemu-ga
-# executable since they are shipped together, but we don't want to actually
-# link against them
-qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a $(QGA_VSS_PROVIDER)
-	$(call LINK, $(filter-out $(QGA_VSS_PROVIDER), $^))
+qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a
+	$(call LINK, $^)

 ifdef QEMU_GA_MSI_ENABLED
 QEMU_GA_MSI=qemu-ga-$(ARCH).msi

 msi: $(QEMU_GA_MSI)

-$(QEMU_GA_MSI): qemu-ga.exe
+$(QEMU_GA_MSI): qemu-ga.exe $(QGA_VSS_PROVIDER)

 $(QEMU_GA_MSI): config-host.mak

@@ -321,6 +323,16 @@ msi:
 	@echo "MSI build not configured or dependency resolution failed (reconfigure with --enable-guest-agent-msi option)"
 endif

+ifneq ($(EXESUF),)
+.PHONY: qemu-ga
+qemu-ga: qemu-ga$(EXESUF) $(QGA_VSS_PROVIDER) $(QEMU_GA_MSI)
+endif
+
+ivshmem-client$(EXESUF): $(ivshmem-client-obj-y)
+	$(call LINK, $^)
+ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) libqemuutil.a libqemustub.a
+	$(call LINK, $^)
+
 clean:
 # avoid old build problems by removing potentially incorrect old files
 	rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h gen-op-arm.h
@@ -431,7 +443,7 @@ endif
 install: all $(if $(BUILD_DOCS),install-doc) \
 install-datadir install-localstatedir
 ifneq ($(TOOLS),)
-	$(call install-prog,$(TOOLS),$(DESTDIR)$(bindir))
+	$(call install-prog,$(subst qemu-ga,qemu-ga$(EXESUF),$(TOOLS)),$(DESTDIR)$(bindir))
 endif
 ifneq ($(CONFIG_MODULES),)
 	$(INSTALL_DIR) "$(DESTDIR)$(qemu_moddir)"
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -15,10 +15,6 @@ block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
 block-obj-y += qemu-io-cmds.o

-block-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
-block-obj-y += qemu-coroutine-sleep.o
-block-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
-
 block-obj-m = block/

 #######################################################################
@@ -32,6 +28,11 @@ crypto-aes-obj-y = crypto/

 qom-obj-y = qom/

+#######################################################################
+# io-obj-y is code used by both qemu system emulation and qemu-img
+
+io-obj-y = io/
+
 ######################################################################
 # Target independent part of system emulation. The long term path is to
 # suppress *all* target specific code in case of system emulation, i.e. a
@@ -58,6 +59,8 @@ common-obj-y += audio/
 common-obj-y += hw/
 common-obj-y += accel.o

+common-obj-y += replay/
+
 common-obj-y += ui/
 common-obj-y += bt-host.o bt-vhci.o
 bt-host.o-cflags := $(BLUEZ_CFLAGS)
@@ -108,3 +111,8 @@ target-obj-y += trace/
 # by libqemuutil.a.  These should be moved to a separate .json schema.
 qga-obj-y = qga/
 qga-vss-dll-obj-y = qga/
+
+######################################################################
+# contrib
+ivshmem-client-obj-y = contrib/ivshmem-client/
+ivshmem-server-obj-y = contrib/ivshmem-server/
--- a/Makefile.target
+++ b/Makefile.target
@@ -176,6 +176,7 @@ dummy := $(call unnest-vars,.., \
               crypto-obj-y \
               crypto-aes-obj-y \
               qom-obj-y \
+               io-obj-y \
               common-obj-y \
               common-obj-m)
 target-obj-y := $(target-obj-y-save)
@@ -185,6 +186,7 @@ all-obj-y += $(qom-obj-y)
 all-obj-$(CONFIG_SOFTMMU) += $(block-obj-y)
 all-obj-$(CONFIG_USER_ONLY) += $(crypto-aes-obj-y)
 all-obj-$(CONFIG_SOFTMMU) += $(crypto-obj-y)
+all-obj-$(CONFIG_SOFTMMU) += $(io-obj-y)

 $(QEMU_PROG_BUILD): config-devices.mak

--- a/2
+++ b/2
@@ -1 +1 @@
-2.4.50
+2.5.50
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif

 struct AioHandler
 {
@@ -25,9 +28,166 @@ struct AioHandler
    IOHandler *io_write;
    int deleted;
    void *opaque;
+    bool is_external;
    QLIST_ENTRY(AioHandler) node;
 };

+#ifdef CONFIG_EPOLL
+
+/* The fd number threashold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+static void aio_epoll_disable(AioContext *ctx)
+{
+    ctx->epoll_available = false;
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    ctx->epoll_enabled = false;
+    close(ctx->epollfd);
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
+static bool aio_epoll_try_enable(AioContext *ctx)
+{
+    AioHandler *node;
+    struct epoll_event event;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        int r;
+        if (node->deleted || !node->pfd.events) {
+            continue;
+        }
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        event.data.ptr = node;
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+        if (r) {
+            return false;
+        }
+    }
+    ctx->epoll_enabled = true;
+    return true;
+}
+
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    struct epoll_event event;
+    int r;
+
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    if (!node->pfd.events) {
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
+        if (r) {
+            aio_epoll_disable(ctx);
+        }
+    } else {
+        event.data.ptr = node;
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        if (is_new) {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+            if (r) {
+                aio_epoll_disable(ctx);
+            }
+        } else {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, node->pfd.fd, &event);
+            if (r) {
+                aio_epoll_disable(ctx);
+            }
+        }
+    }
+}
+
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    AioHandler *node;
+    int i, ret = 0;
+    struct epoll_event events[128];
+
+    assert(npfd == 1);
+    assert(pfds[0].fd == ctx->epollfd);
+    if (timeout > 0) {
+        ret = qemu_poll_ns(pfds, npfd, timeout);
+    }
+    if (timeout <= 0 || ret > 0) {
+        ret = epoll_wait(ctx->epollfd, events,
+                         sizeof(events) / sizeof(events[0]),
+                         timeout);
+        if (ret <= 0) {
+            goto out;
+        }
+        for (i = 0; i < ret; i++) {
+            int ev = events[i].events;
+            node = events[i].data.ptr;
+            node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+                (ev & EPOLLOUT ? G_IO_OUT : 0) |
+                (ev & EPOLLHUP ? G_IO_HUP : 0) |
+                (ev & EPOLLERR ? G_IO_ERR : 0);
+        }
+    }
+out:
+    return ret;
+}
+
+static bool aio_epoll_enabled(AioContext *ctx)
+{
+    /* Fall back to ppoll when external clients are disabled. */
+    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
+}
+
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                                 unsigned npfd, int64_t timeout)
+{
+    if (!ctx->epoll_available) {
+        return false;
+    }
+    if (aio_epoll_enabled(ctx)) {
+        return true;
+    }
+    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+        if (aio_epoll_try_enable(ctx)) {
+            return true;
+        } else {
+            aio_epoll_disable(ctx);
+        }
+    }
+    return false;
+}
+
+#else
+
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+}
+
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    assert(false);
+}
+
+static bool aio_epoll_enabled(AioContext *ctx)
+{
+    return false;
+}
+
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                          unsigned npfd, int64_t timeout)
+{
+    return false;
+}
+
+#endif
+
 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 {
    AioHandler *node;
@@ -43,11 +203,14 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)

 void aio_set_fd_handler(AioContext *ctx,
                        int fd,
+                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
 {
    AioHandler *node;
+    bool is_new = false;
+    bool deleted = false;

    node = find_aio_handler(ctx, fd);

@@ -66,7 +229,7 @@ void aio_set_fd_handler(AioContext *ctx,
                 * releasing the walking_handlers lock.
                 */
                QLIST_REMOVE(node, node);
-                g_free(node);
+                deleted = true;
            }
        }
    } else {
@@ -77,25 +240,32 @@ void aio_set_fd_handler(AioContext *ctx,
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
+            is_new = true;
        }
        /* Update handler with latest information */
        node->io_read = io_read;
        node->io_write = io_write;
        node->opaque = opaque;
+        node->is_external = is_external;

        node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
    }

+    aio_epoll_update(ctx, node, is_new);
    aio_notify(ctx);
+    if (deleted) {
+        g_free(node);
+    }
 }

 void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
+                            bool is_external,
                            EventNotifierHandler *io_read)
 {
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
-                       (IOHandler *)io_read, NULL, notifier);
+                       is_external, (IOHandler *)io_read, NULL, notifier);
 }

 bool aio_prepare(AioContext *ctx)
@@ -257,7 +427,9 @@ bool aio_poll(AioContext *ctx, bool blocking)

    /* fill pollfds */
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
-        if (!node->deleted && node->pfd.events) {
+        if (!node->deleted && node->pfd.events
+            && !aio_epoll_enabled(ctx)
+            && aio_node_check(ctx, node->is_external)) {
            add_pollfd(node);
        }
    }
@@ -268,7 +440,17 @@ bool aio_poll(AioContext *ctx, bool blocking)
    if (timeout) {
        aio_context_release(ctx);
    }
-    ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout);
+    if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
+        AioHandler epoll_handler;
+
+        epoll_handler.pfd.fd = ctx->epollfd;
+        epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
+        npfd = 0;
+        add_pollfd(&epoll_handler);
+        ret = aio_epoll(ctx, pollfds, npfd, timeout);
+    } else  {
+        ret = qemu_poll_ns(pollfds, npfd, timeout);
+    }
    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
    }
@@ -297,3 +479,16 @@ bool aio_poll(AioContext *ctx, bool blocking)

    return progress;
 }
+
+void aio_context_setup(AioContext *ctx, Error **errp)
+{
+#ifdef CONFIG_EPOLL
+    assert(!ctx->epollfd);
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd == -1) {
+        ctx->epoll_available = false;
+    } else {
+        ctx->epoll_available = true;
+    }
+#endif
+}
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -28,11 +28,13 @@ struct AioHandler {
    GPollFD pfd;
    int deleted;
    void *opaque;
+    bool is_external;
    QLIST_ENTRY(AioHandler) node;
 };

 void aio_set_fd_handler(AioContext *ctx,
                        int fd,
+                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
@@ -86,6 +88,7 @@ void aio_set_fd_handler(AioContext *ctx,
        node->opaque = opaque;
        node->io_read = io_read;
        node->io_write = io_write;
+        node->is_external = is_external;

        event = event_notifier_get_handle(&ctx->notifier);
        WSAEventSelect(node->pfd.fd, event,
@@ -98,6 +101,7 @@ void aio_set_fd_handler(AioContext *ctx,

 void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *e,
+                            bool is_external,
                            EventNotifierHandler *io_notify)
 {
    AioHandler *node;
@@ -133,6 +137,7 @@ void aio_set_event_notifier(AioContext *ctx,
            node->e = e;
            node->pfd.fd = (uintptr_t)event_notifier_get_handle(e);
            node->pfd.events = G_IO_IN;
+            node->is_external = is_external;
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
@@ -304,7 +309,8 @@ bool aio_poll(AioContext *ctx, bool blocking)
    /* fill fd sets */
    count = 0;
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
-        if (!node->deleted && node->io_notify) {
+        if (!node->deleted && node->io_notify
+            && aio_node_check(ctx, node->is_external)) {
            events[count++] = event_notifier_get_handle(node->e);
        }
    }
@@ -363,3 +369,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
    aio_context_release(ctx);
    return progress;
 }
+
+void aio_context_setup(AioContext *ctx, Error **errp)
+{
+}
--- a/async.c
+++ b/async.c
@@ -59,6 +59,11 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
    return bh;
 }

+void aio_bh_call(QEMUBH *bh)
+{
+    bh->cb(bh->opaque);
+}
+
 /* Multiple occurrences of aio_bh_poll cannot be called concurrently */
 int aio_bh_poll(AioContext *ctx)
 {
@@ -84,7 +89,7 @@ int aio_bh_poll(AioContext *ctx)
                ret = 1;
            }
            bh->idle = 0;
-            bh->cb(bh->opaque);
+            aio_bh_call(bh);
        }
    }

@@ -247,7 +252,7 @@ aio_ctx_finalize(GSource     *source)
    }
    qemu_mutex_unlock(&ctx->bh_lock);

-    aio_set_event_notifier(ctx, &ctx->notifier, NULL);
+    aio_set_event_notifier(ctx, &ctx->notifier, false, NULL);
    event_notifier_cleanup(&ctx->notifier);
    rfifolock_destroy(&ctx->lock);
    qemu_mutex_destroy(&ctx->bh_lock);
@@ -320,15 +325,22 @@ AioContext *aio_context_new(Error **errp)
 {
    int ret;
    AioContext *ctx;
+    Error *local_err = NULL;
+
    ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
+    aio_context_setup(ctx, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        goto fail;
+    }
    ret = event_notifier_init(&ctx->notifier, false);
    if (ret < 0) {
-        g_source_destroy(&ctx->source);
        error_setg_errno(errp, -ret, "Failed to initialize event notifier");
-        return NULL;
+        goto fail;
    }
    g_source_set_can_recurse(&ctx->source, true);
    aio_set_event_notifier(ctx, &ctx->notifier,
+                           false,
                           (EventNotifierHandler *)
                           event_notifier_dummy_cb);
    ctx->thread_pool = NULL;
@@ -339,6 +351,9 @@ AioContext *aio_context_new(Error **errp)
    ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);

    return ctx;
+fail:
+    g_source_destroy(&ctx->source);
+    return NULL;
 }

 void aio_context_ref(AioContext *ctx)
--- a/audio/coreaudio.c
+++ b/audio/coreaudio.c
@@ -32,6 +32,10 @@
 #define AUDIO_CAP "coreaudio"
 #include "audio_int.h"

+#ifndef MAC_OS_X_VERSION_10_6
+#define MAC_OS_X_VERSION_10_6 1060
+#endif
+
 static int isAtexit;

 typedef struct {
@@ -45,11 +49,233 @@ typedef struct coreaudioVoiceOut {
    AudioDeviceID outputDeviceID;
    UInt32 audioDevicePropertyBufferFrameSize;
    AudioStreamBasicDescription outputStreamBasicDescription;
+    AudioDeviceIOProcID ioprocid;
    int live;
    int decr;
    int rpos;
 } coreaudioVoiceOut;

+#if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
+/* The APIs used here only become available from 10.6 */
+
+static OSStatus coreaudio_get_voice(AudioDeviceID *id)
+{
+    UInt32 size = sizeof(*id);
+    AudioObjectPropertyAddress addr = {
+        kAudioHardwarePropertyDefaultOutputDevice,
+        kAudioObjectPropertyScopeGlobal,
+        kAudioObjectPropertyElementMaster
+    };
+
+    return AudioObjectGetPropertyData(kAudioObjectSystemObject,
+                                      &addr,
+                                      0,
+                                      NULL,
+                                      &size,
+                                      id);
+}
+
+static OSStatus coreaudio_get_framesizerange(AudioDeviceID id,
+                                             AudioValueRange *framerange)
+{
+    UInt32 size = sizeof(*framerange);
+    AudioObjectPropertyAddress addr = {
+        kAudioDevicePropertyBufferFrameSizeRange,
+        kAudioDevicePropertyScopeOutput,
+        kAudioObjectPropertyElementMaster
+    };
+
+    return AudioObjectGetPropertyData(id,
+                                      &addr,
+                                      0,
+                                      NULL,
+                                      &size,
+                                      framerange);
+}
+
+static OSStatus coreaudio_get_framesize(AudioDeviceID id, UInt32 *framesize)
+{
+    UInt32 size = sizeof(*framesize);
+    AudioObjectPropertyAddress addr = {
+        kAudioDevicePropertyBufferFrameSize,
+        kAudioDevicePropertyScopeOutput,
+        kAudioObjectPropertyElementMaster
+    };
+
+    return AudioObjectGetPropertyData(id,
+                                      &addr,
+                                      0,
+                                      NULL,
+                                      &size,
+                                      framesize);
+}
+
+static OSStatus coreaudio_set_framesize(AudioDeviceID id, UInt32 *framesize)
+{
+    UInt32 size = sizeof(*framesize);
+    AudioObjectPropertyAddress addr = {
+        kAudioDevicePropertyBufferFrameSize,
+        kAudioDevicePropertyScopeOutput,
+        kAudioObjectPropertyElementMaster
+    };
+
+    return AudioObjectSetPropertyData(id,
+                                      &addr,
+                                      0,
+                                      NULL,
+                                      size,
+                                      framesize);
+}
+
+static OSStatus coreaudio_get_streamformat(AudioDeviceID id,
+                                           AudioStreamBasicDescription *d)
+{
+    UInt32 size = sizeof(*d);
+    AudioObjectPropertyAddress addr = {
+        kAudioDevicePropertyStreamFormat,
+        kAudioDevicePropertyScopeOutput,
+        kAudioObjectPropertyElementMaster
+    };
+
+    return AudioObjectGetPropertyData(id,
+                                      &addr,
+                                      0,
+                                      NULL,
+                                      &size,
+                                      d);
+}
+
+static OSStatus coreaudio_set_streamformat(AudioDeviceID id,
+                                           AudioStreamBasicDescription *d)
+{
+    UInt32 size = sizeof(*d);
+    AudioObjectPropertyAddress addr = {
+        kAudioDevicePropertyStreamFormat,
+        kAudioDevicePropertyScopeOutput,
+        kAudioObjectPropertyElementMaster
+    };
+
+    return AudioObjectSetPropertyData(id,
+                                      &addr,
+                                      0,
+                                      NULL,
+                                      size,
+                                      d);
+}
+
+static OSStatus coreaudio_get_isrunning(AudioDeviceID id, UInt32 *result)
+{
+    UInt32 size = sizeof(*result);
+    AudioObjectPropertyAddress addr = {
+        kAudioDevicePropertyDeviceIsRunning,
+        kAudioDevicePropertyScopeOutput,
+        kAudioObjectPropertyElementMaster
+    };
+
+    return AudioObjectGetPropertyData(id,
+                                      &addr,
+                                      0,
+                                      NULL,
+                                      &size,
+                                      result);
+}
+#else
+/* Legacy versions of functions using deprecated APIs */
+
+static OSStatus coreaudio_get_voice(AudioDeviceID *id)
+{
+    UInt32 size = sizeof(*id);
+
+    return AudioHardwareGetProperty(
+        kAudioHardwarePropertyDefaultOutputDevice,
+        &size,
+        id);
+}
+
+static OSStatus coreaudio_get_framesizerange(AudioDeviceID id,
+                                             AudioValueRange *framerange)
+{
+    UInt32 size = sizeof(*framerange);
+
+    return AudioDeviceGetProperty(
+        id,
+        0,
+        0,
+        kAudioDevicePropertyBufferFrameSizeRange,
+        &size,
+        framerange);
+}
+
+static OSStatus coreaudio_get_framesize(AudioDeviceID id, UInt32 *framesize)
+{
+    UInt32 size = sizeof(*framesize);
+
+    return AudioDeviceGetProperty(
+        id,
+        0,
+        false,
+        kAudioDevicePropertyBufferFrameSize,
+        &size,
+        framesize);
+}
+
+static OSStatus coreaudio_set_framesize(AudioDeviceID id, UInt32 *framesize)
+{
+    UInt32 size = sizeof(*framesize);
+
+    return AudioDeviceSetProperty(
+        id,
+        NULL,
+        0,
+        false,
+        kAudioDevicePropertyBufferFrameSize,
+        size,
+        framesize);
+}
+
+static OSStatus coreaudio_get_streamformat(AudioDeviceID id,
+                                           AudioStreamBasicDescription *d)
+{
+    UInt32 size = sizeof(*d);
+
+    return AudioDeviceGetProperty(
+        id,
+        0,
+        false,
+        kAudioDevicePropertyStreamFormat,
+        &size,
+        d);
+}
+
+static OSStatus coreaudio_set_streamformat(AudioDeviceID id,
+                                           AudioStreamBasicDescription *d)
+{
+    UInt32 size = sizeof(*d);
+
+    return AudioDeviceSetProperty(
+        id,
+        0,
+        0,
+        0,
+        kAudioDevicePropertyStreamFormat,
+        size,
+        d);
+}
+
+static OSStatus coreaudio_get_isrunning(AudioDeviceID id, UInt32 *result)
+{
+    UInt32 size = sizeof(*result);
+
+    return AudioDeviceGetProperty(
+        id,
+        0,
+        0,
+        kAudioDevicePropertyDeviceIsRunning,
+        &size,
+        result);
+}
+#endif
+
 static void coreaudio_logstatus (OSStatus status)
 {
    const char *str = "BUG";
@@ -144,10 +370,7 @@ static inline UInt32 isPlaying (AudioDeviceID outputDeviceID)
 {
    OSStatus status;
    UInt32 result = 0;
-    UInt32 propertySize = sizeof(outputDeviceID);
-    status = AudioDeviceGetProperty(
-        outputDeviceID, 0, 0,
-        kAudioDevicePropertyDeviceIsRunning, &propertySize, &result);
+    status = coreaudio_get_isrunning(outputDeviceID, &result);
    if (status != kAudioHardwareNoError) {
        coreaudio_logerr(status,
                         "Could not determine whether Device is playing\n");
@@ -288,7 +511,6 @@ static int coreaudio_init_out(HWVoiceOut *hw, struct audsettings *as,
 {
    OSStatus status;
    coreaudioVoiceOut *core = (coreaudioVoiceOut *) hw;
-    UInt32 propertySize;
    int err;
    const char *typ = "playback";
    AudioValueRange frameRange;
@@ -303,12 +525,7 @@ static int coreaudio_init_out(HWVoiceOut *hw, struct audsettings *as,

    audio_pcm_init_info (&hw->info, as);

-    /* open default output device */
-    propertySize = sizeof(core->outputDeviceID);
-    status = AudioHardwareGetProperty(
-        kAudioHardwarePropertyDefaultOutputDevice,
-        &propertySize,
-        &core->outputDeviceID);
+    status = coreaudio_get_voice(&core->outputDeviceID);
    if (status != kAudioHardwareNoError) {
        coreaudio_logerr2 (status, typ,
                           "Could not get default output Device\n");
@@ -320,14 +537,8 @@ static int coreaudio_init_out(HWVoiceOut *hw, struct audsettings *as,
    }

    /* get minimum and maximum buffer frame sizes */
-    propertySize = sizeof(frameRange);
-    status = AudioDeviceGetProperty(
-        core->outputDeviceID,
-        0,
-        0,
-        kAudioDevicePropertyBufferFrameSizeRange,
-        &propertySize,
-        &frameRange);
+    status = coreaudio_get_framesizerange(core->outputDeviceID,
+                                          &frameRange);
    if (status != kAudioHardwareNoError) {
        coreaudio_logerr2 (status, typ,
                           "Could not get device buffer frame range\n");
@@ -347,15 +558,8 @@ static int coreaudio_init_out(HWVoiceOut *hw, struct audsettings *as,
    }

    /* set Buffer Frame Size */
-    propertySize = sizeof(core->audioDevicePropertyBufferFrameSize);
-    status = AudioDeviceSetProperty(
-        core->outputDeviceID,
-        NULL,
-        0,
-        false,
-        kAudioDevicePropertyBufferFrameSize,
-        propertySize,
-        &core->audioDevicePropertyBufferFrameSize);
+    status = coreaudio_set_framesize(core->outputDeviceID,
+                                     &core->audioDevicePropertyBufferFrameSize);
    if (status != kAudioHardwareNoError) {
        coreaudio_logerr2 (status, typ,
                           "Could not set device buffer frame size %" PRIu32 "\n",
@@ -364,14 +568,8 @@ static int coreaudio_init_out(HWVoiceOut *hw, struct audsettings *as,
    }

    /* get Buffer Frame Size */
-    propertySize = sizeof(core->audioDevicePropertyBufferFrameSize);
-    status = AudioDeviceGetProperty(
-        core->outputDeviceID,
-        0,
-        false,
-        kAudioDevicePropertyBufferFrameSize,
-        &propertySize,
-        &core->audioDevicePropertyBufferFrameSize);
+    status = coreaudio_get_framesize(core->outputDeviceID,
+                                     &core->audioDevicePropertyBufferFrameSize);
    if (status != kAudioHardwareNoError) {
        coreaudio_logerr2 (status, typ,
                           "Could not get device buffer frame size\n");
@@ -380,14 +578,8 @@ static int coreaudio_init_out(HWVoiceOut *hw, struct audsettings *as,
    hw->samples = conf->nbuffers * core->audioDevicePropertyBufferFrameSize;

    /* get StreamFormat */
-    propertySize = sizeof(core->outputStreamBasicDescription);
-    status = AudioDeviceGetProperty(
-        core->outputDeviceID,
-        0,
-        false,
-        kAudioDevicePropertyStreamFormat,
-        &propertySize,
-        &core->outputStreamBasicDescription);
+    status = coreaudio_get_streamformat(core->outputDeviceID,
+                                        &core->outputStreamBasicDescription);
    if (status != kAudioHardwareNoError) {
        coreaudio_logerr2 (status, typ,
                           "Could not get Device Stream properties\n");
@@ -397,15 +589,8 @@ static int coreaudio_init_out(HWVoiceOut *hw, struct audsettings *as,

    /* set Samplerate */
    core->outputStreamBasicDescription.mSampleRate = (Float64) as->freq;
-    propertySize = sizeof(core->outputStreamBasicDescription);
-    status = AudioDeviceSetProperty(
-        core->outputDeviceID,
-        0,
-        0,
-        0,
-        kAudioDevicePropertyStreamFormat,
-        propertySize,
-        &core->outputStreamBasicDescription);
+    status = coreaudio_set_streamformat(core->outputDeviceID,
+                                        &core->outputStreamBasicDescription);
    if (status != kAudioHardwareNoError) {
        coreaudio_logerr2 (status, typ, "Could not set samplerate %d\n",
                           as->freq);
@@ -414,8 +599,12 @@ static int coreaudio_init_out(HWVoiceOut *hw, struct audsettings *as,
    }

    /* set Callback */
-    status = AudioDeviceAddIOProc(core->outputDeviceID, audioDeviceIOProc, hw);
-    if (status != kAudioHardwareNoError) {
+    core->ioprocid = NULL;
+    status = AudioDeviceCreateIOProcID(core->outputDeviceID,
+                                       audioDeviceIOProc,
+                                       hw,
+                                       &core->ioprocid);
+    if (status != kAudioHardwareNoError || core->ioprocid == NULL) {
        coreaudio_logerr2 (status, typ, "Could not set IOProc\n");
        core->outputDeviceID = kAudioDeviceUnknown;
        return -1;
@@ -423,10 +612,10 @@ static int coreaudio_init_out(HWVoiceOut *hw, struct audsettings *as,

    /* start Playback */
    if (!isPlaying(core->outputDeviceID)) {
-        status = AudioDeviceStart(core->outputDeviceID, audioDeviceIOProc);
+        status = AudioDeviceStart(core->outputDeviceID, core->ioprocid);
        if (status != kAudioHardwareNoError) {
            coreaudio_logerr2 (status, typ, "Could not start playback\n");
-            AudioDeviceRemoveIOProc(core->outputDeviceID, audioDeviceIOProc);
+            AudioDeviceDestroyIOProcID(core->outputDeviceID, core->ioprocid);
            core->outputDeviceID = kAudioDeviceUnknown;
            return -1;
        }
@@ -444,15 +633,15 @@ static void coreaudio_fini_out (HWVoiceOut *hw)
    if (!isAtexit) {
        /* stop playback */
        if (isPlaying(core->outputDeviceID)) {
-            status = AudioDeviceStop(core->outputDeviceID, audioDeviceIOProc);
+            status = AudioDeviceStop(core->outputDeviceID, core->ioprocid);
            if (status != kAudioHardwareNoError) {
                coreaudio_logerr (status, "Could not stop playback\n");
            }
        }

        /* remove callback */
-        status = AudioDeviceRemoveIOProc(core->outputDeviceID,
-                                         audioDeviceIOProc);
+        status = AudioDeviceDestroyIOProcID(core->outputDeviceID,
+                                            core->ioprocid);
        if (status != kAudioHardwareNoError) {
            coreaudio_logerr (status, "Could not remove IOProc\n");
        }
@@ -475,7 +664,7 @@ static int coreaudio_ctl_out (HWVoiceOut *hw, int cmd, ...)
    case VOICE_ENABLE:
        /* start playback */
        if (!isPlaying(core->outputDeviceID)) {
-            status = AudioDeviceStart(core->outputDeviceID, audioDeviceIOProc);
+            status = AudioDeviceStart(core->outputDeviceID, core->ioprocid);
            if (status != kAudioHardwareNoError) {
                coreaudio_logerr (status, "Could not resume playback\n");
            }
@@ -486,7 +675,8 @@ static int coreaudio_ctl_out (HWVoiceOut *hw, int cmd, ...)
        /* stop playback */
        if (!isAtexit) {
            if (isPlaying(core->outputDeviceID)) {
-                status = AudioDeviceStop(core->outputDeviceID, audioDeviceIOProc);
+                status = AudioDeviceStop(core->outputDeviceID,
+                                         core->ioprocid);
                if (status != kAudioHardwareNoError) {
                    coreaudio_logerr (status, "Could not pause playback\n");
                }
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -313,9 +313,11 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
        assert(maxnode <= MAX_NODES);
        if (mbind(ptr, sz, backend->policy,
                  maxnode ? backend->host_nodes : NULL, maxnode + 1, flags)) {
-            error_setg_errno(errp, errno,
-                             "cannot bind memory to host NUMA nodes");
-            return;
+            if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
+                error_setg_errno(errp, errno,
+                                 "cannot bind memory to host NUMA nodes");
+                return;
+            }
        }
 #endif
        /* Preallocate memory after the NUMA policy has been instantiated.
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,17 @@
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
 static void *balloon_opaque;
+static bool balloon_inhibited;
+
+bool qemu_balloon_is_inhibited(void)
+{
+    return balloon_inhibited;
+}
+
+void qemu_balloon_inhibit(bool state)
+{
+    balloon_inhibited = state;
+}

 static bool have_balloon(Error **errp)
 {
--- a/block.c
+++ b/block.c
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -2,6 +2,7 @@
 * QEMU System Emulator block accounting
 *
 * Copyright (c) 2011 Christoph Hellwig
+ * Copyright (c) 2015 Igalia, S.L.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +26,54 @@
 #include "block/accounting.h"
 #include "block/block_int.h"
 #include "qemu/timer.h"
+#include "sysemu/qtest.h"
+
+static QEMUClockType clock_type = QEMU_CLOCK_REALTIME;
+static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000;
+
+void block_acct_init(BlockAcctStats *stats, bool account_invalid,
+                     bool account_failed)
+{
+    stats->account_invalid = account_invalid;
+    stats->account_failed = account_failed;
+
+    if (qtest_enabled()) {
+        clock_type = QEMU_CLOCK_VIRTUAL;
+    }
+}
+
+void block_acct_cleanup(BlockAcctStats *stats)
+{
+    BlockAcctTimedStats *s, *next;
+    QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) {
+        g_free(s);
+    }
+}
+
+void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length)
+{
+    BlockAcctTimedStats *s;
+    unsigned i;
+
+    s = g_new0(BlockAcctTimedStats, 1);
+    s->interval_length = interval_length;
+    QSLIST_INSERT_HEAD(&stats->intervals, s, entries);
+
+    for (i = 0; i < BLOCK_MAX_IOTYPE; i++) {
+        timed_average_init(&s->latency[i], clock_type,
+                           (uint64_t) interval_length * NANOSECONDS_PER_SECOND);
+    }
+}
+
+BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats,
+                                              BlockAcctTimedStats *s)
+{
+    if (s == NULL) {
+        return QSLIST_FIRST(&stats->intervals);
+    } else {
+        return QSLIST_NEXT(s, entries);
+    }
+}

 void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
                      int64_t bytes, enum BlockAcctType type)
@@ -32,26 +81,69 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
    assert(type < BLOCK_MAX_IOTYPE);

    cookie->bytes = bytes;
-    cookie->start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    cookie->start_time_ns = qemu_clock_get_ns(clock_type);
    cookie->type = type;
 }

 void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie)
 {
+    BlockAcctTimedStats *s;
+    int64_t time_ns = qemu_clock_get_ns(clock_type);
+    int64_t latency_ns = time_ns - cookie->start_time_ns;
+
+    if (qtest_enabled()) {
+        latency_ns = qtest_latency_ns;
+    }
+
    assert(cookie->type < BLOCK_MAX_IOTYPE);

    stats->nr_bytes[cookie->type] += cookie->bytes;
    stats->nr_ops[cookie->type]++;
-    stats->total_time_ns[cookie->type] +=
-        qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - cookie->start_time_ns;
+    stats->total_time_ns[cookie->type] += latency_ns;
+    stats->last_access_time_ns = time_ns;
+
+    QSLIST_FOREACH(s, &stats->intervals, entries) {
+        timed_average_account(&s->latency[cookie->type], latency_ns);
+    }
 }

-
-void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
-                               unsigned int nb_sectors)
+void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie)
 {
-    if (stats->wr_highest_sector < sector_num + nb_sectors - 1) {
-        stats->wr_highest_sector = sector_num + nb_sectors - 1;
+    assert(cookie->type < BLOCK_MAX_IOTYPE);
+
+    stats->failed_ops[cookie->type]++;
+
+    if (stats->account_failed) {
+        BlockAcctTimedStats *s;
+        int64_t time_ns = qemu_clock_get_ns(clock_type);
+        int64_t latency_ns = time_ns - cookie->start_time_ns;
+
+        if (qtest_enabled()) {
+            latency_ns = qtest_latency_ns;
+        }
+
+        stats->total_time_ns[cookie->type] += latency_ns;
+        stats->last_access_time_ns = time_ns;
+
+        QSLIST_FOREACH(s, &stats->intervals, entries) {
+            timed_average_account(&s->latency[cookie->type], latency_ns);
+        }
+    }
+}
+
+void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    /* block_acct_done() and block_acct_failed() update
+     * total_time_ns[], but this one does not. The reason is that
+     * invalid requests are accounted during their submission,
+     * therefore there's no actual I/O involved. */
+
+    stats->invalid_ops[type]++;
+
+    if (stats->account_invalid) {
+        stats->last_access_time_ns = qemu_clock_get_ns(clock_type);
    }
 }

@@ -61,3 +153,20 @@ void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
    assert(type < BLOCK_MAX_IOTYPE);
    stats->merged[type] += num_requests;
 }
+
+int64_t block_acct_idle_time_ns(BlockAcctStats *stats)
+{
+    return qemu_clock_get_ns(clock_type) - stats->last_access_time_ns;
+}
+
+double block_acct_queue_depth(BlockAcctTimedStats *stats,
+                              enum BlockAcctType type)
+{
+    uint64_t sum, elapsed;
+
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    sum = timed_average_sum(&stats->latency[type], &elapsed);
+
+    return (double) sum / elapsed;
+}
--- a/block/backup.c
+++ b/block/backup.c
@@ -21,6 +21,7 @@
 #include "block/blockjob.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
+#include "sysemu/block-backend.h"

 #define BACKUP_CLUSTER_BITS 16
 #define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
@@ -131,7 +132,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        if (is_write_notifier) {
-            ret = bdrv_co_no_copy_on_readv(bs,
+            ret = bdrv_co_readv_no_serialising(bs,
                                           start * BACKUP_SECTORS_PER_CLUSTER,
                                           n, &bounce_qiov);
        } else {
@@ -215,7 +216,41 @@ static void backup_iostatus_reset(BlockJob *job)
 {
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

-    bdrv_iostatus_reset(s->target);
+    if (s->target->blk) {
+        blk_iostatus_reset(s->target->blk);
+    }
+}
+
+static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
+{
+    BdrvDirtyBitmap *bm;
+    BlockDriverState *bs = job->common.bs;
+
+    if (ret < 0 || block_job_is_cancelled(&job->common)) {
+        /* Merge the successor back into the parent, delete nothing. */
+        bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
+        assert(bm);
+    } else {
+        /* Everything is fine, delete this bitmap and install the backup. */
+        bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
+        assert(bm);
+    }
+}
+
+static void backup_commit(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    if (s->sync_bitmap) {
+        backup_cleanup_sync_bitmap(s, 0);
+    }
+}
+
+static void backup_abort(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    if (s->sync_bitmap) {
+        backup_cleanup_sync_bitmap(s, -1);
+    }
 }

 static const BlockJobDriver backup_job_driver = {
@@ -223,6 +258,8 @@ static const BlockJobDriver backup_job_driver = {
    .job_type       = BLOCK_JOB_TYPE_BACKUP,
    .set_speed      = backup_set_speed,
    .iostatus_reset = backup_iostatus_reset,
+    .commit         = backup_commit,
+    .abort          = backup_abort,
 };

 static BlockErrorAction backup_error_action(BackupBlockJob *job,
@@ -360,8 +397,10 @@ static void coroutine_fn backup_run(void *opaque)
    job->bitmap = hbitmap_alloc(end, 0);

    bdrv_set_enable_write_cache(target, true);
-    bdrv_set_on_error(target, on_target_error, on_target_error);
-    bdrv_iostatus_enable(target);
+    if (target->blk) {
+        blk_set_on_error(target->blk, on_target_error, on_target_error);
+        blk_iostatus_enable(target->blk);
+    }

    bdrv_add_before_write_notifier(bs, &before_write);

@@ -436,22 +475,11 @@ static void coroutine_fn backup_run(void *opaque)
    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
-
-    if (job->sync_bitmap) {
-        BdrvDirtyBitmap *bm;
-        if (ret < 0 || block_job_is_cancelled(&job->common)) {
-            /* Merge the successor back into the parent, delete nothing. */
-            bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
-            assert(bm);
-        } else {
-            /* Everything is fine, delete this bitmap and install the backup. */
-            bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
-            assert(bm);
-        }
-    }
    hbitmap_free(job->bitmap);

-    bdrv_iostatus_disable(target);
+    if (target->blk) {
+        blk_iostatus_disable(target->blk);
+    }
    bdrv_op_unblock_all(target, job->common.blocker);

    data = g_malloc(sizeof(*data));
@@ -465,7 +493,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockCompletionFunc *cb, void *opaque,
-                  Error **errp)
+                  BlockJobTxn *txn, Error **errp)
 {
    int64_t len;

@@ -480,7 +508,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        !bdrv_iostatus_is_enabled(bs)) {
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }
@@ -547,6 +575,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
                       sync_bitmap : NULL;
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
+    block_job_txn_add_job(txn, &job->common);
    qemu_coroutine_enter(job->common.co, job);
    return;

--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -30,12 +30,13 @@
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qstring.h"
+#include "sysemu/qtest.h"

 typedef struct BDRVBlkdebugState {
    int state;
    int new_state;

-    QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
+    QLIST_HEAD(, BlkdebugRule) rules[BLKDBG__MAX];
    QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
    QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs;
 } BDRVBlkdebugState;
@@ -63,7 +64,7 @@ enum {
 };

 typedef struct BlkdebugRule {
-    BlkDebugEvent event;
+    BlkdebugEvent event;
    int action;
    int state;
    union {
@@ -142,69 +143,12 @@ static QemuOptsList *config_groups[] = {
    NULL
 };

-static const char *event_names[BLKDBG_EVENT_MAX] = {
-    [BLKDBG_L1_UPDATE]                      = "l1_update",
-    [BLKDBG_L1_GROW_ALLOC_TABLE]            = "l1_grow.alloc_table",
-    [BLKDBG_L1_GROW_WRITE_TABLE]            = "l1_grow.write_table",
-    [BLKDBG_L1_GROW_ACTIVATE_TABLE]         = "l1_grow.activate_table",
-
-    [BLKDBG_L2_LOAD]                        = "l2_load",
-    [BLKDBG_L2_UPDATE]                      = "l2_update",
-    [BLKDBG_L2_UPDATE_COMPRESSED]           = "l2_update_compressed",
-    [BLKDBG_L2_ALLOC_COW_READ]              = "l2_alloc.cow_read",
-    [BLKDBG_L2_ALLOC_WRITE]                 = "l2_alloc.write",
-
-    [BLKDBG_READ_AIO]                       = "read_aio",
-    [BLKDBG_READ_BACKING_AIO]               = "read_backing_aio",
-    [BLKDBG_READ_COMPRESSED]                = "read_compressed",
-
-    [BLKDBG_WRITE_AIO]                      = "write_aio",
-    [BLKDBG_WRITE_COMPRESSED]               = "write_compressed",
-
-    [BLKDBG_VMSTATE_LOAD]                   = "vmstate_load",
-    [BLKDBG_VMSTATE_SAVE]                   = "vmstate_save",
-
-    [BLKDBG_COW_READ]                       = "cow_read",
-    [BLKDBG_COW_WRITE]                      = "cow_write",
-
-    [BLKDBG_REFTABLE_LOAD]                  = "reftable_load",
-    [BLKDBG_REFTABLE_GROW]                  = "reftable_grow",
-    [BLKDBG_REFTABLE_UPDATE]                = "reftable_update",
-
-    [BLKDBG_REFBLOCK_LOAD]                  = "refblock_load",
-    [BLKDBG_REFBLOCK_UPDATE]                = "refblock_update",
-    [BLKDBG_REFBLOCK_UPDATE_PART]           = "refblock_update_part",
-    [BLKDBG_REFBLOCK_ALLOC]                 = "refblock_alloc",
-    [BLKDBG_REFBLOCK_ALLOC_HOOKUP]          = "refblock_alloc.hookup",
-    [BLKDBG_REFBLOCK_ALLOC_WRITE]           = "refblock_alloc.write",
-    [BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS]    = "refblock_alloc.write_blocks",
-    [BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE]     = "refblock_alloc.write_table",
-    [BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE]    = "refblock_alloc.switch_table",
-
-    [BLKDBG_CLUSTER_ALLOC]                  = "cluster_alloc",
-    [BLKDBG_CLUSTER_ALLOC_BYTES]            = "cluster_alloc_bytes",
-    [BLKDBG_CLUSTER_FREE]                   = "cluster_free",
-
-    [BLKDBG_FLUSH_TO_OS]                    = "flush_to_os",
-    [BLKDBG_FLUSH_TO_DISK]                  = "flush_to_disk",
-
-    [BLKDBG_PWRITEV_RMW_HEAD]               = "pwritev_rmw.head",
-    [BLKDBG_PWRITEV_RMW_AFTER_HEAD]         = "pwritev_rmw.after_head",
-    [BLKDBG_PWRITEV_RMW_TAIL]               = "pwritev_rmw.tail",
-    [BLKDBG_PWRITEV_RMW_AFTER_TAIL]         = "pwritev_rmw.after_tail",
-    [BLKDBG_PWRITEV]                        = "pwritev",
-    [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
-    [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
-
-    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
-};
-
-static int get_event_by_name(const char *name, BlkDebugEvent *event)
+static int get_event_by_name(const char *name, BlkdebugEvent *event)
 {
    int i;

-    for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
-        if (!strcmp(event_names[i], name)) {
+    for (i = 0; i < BLKDBG__MAX; i++) {
+        if (!strcmp(BlkdebugEvent_lookup[i], name)) {
            *event = i;
            return 0;
        }
@@ -223,7 +167,7 @@ static int add_rule(void *opaque, QemuOpts *opts, Error **errp)
    struct add_rule_data *d = opaque;
    BDRVBlkdebugState *s = d->s;
    const char* event_name;
-    BlkDebugEvent event;
+    BlkdebugEvent event;
    struct BlkdebugRule *rule;

    /* Find the right event for the rule */
@@ -563,7 +507,7 @@ static void blkdebug_close(BlockDriverState *bs)
    BlkdebugRule *rule, *next;
    int i;

-    for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
+    for (i = 0; i < BLKDBG__MAX; i++) {
        QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) {
            remove_rule(rule);
        }
@@ -583,9 +527,13 @@ static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule)
    remove_rule(rule);
    QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next);

-    printf("blkdebug: Suspended request '%s'\n", r.tag);
+    if (!qtest_enabled()) {
+        printf("blkdebug: Suspended request '%s'\n", r.tag);
+    }
    qemu_coroutine_yield();
-    printf("blkdebug: Resuming request '%s'\n", r.tag);
+    if (!qtest_enabled()) {
+        printf("blkdebug: Resuming request '%s'\n", r.tag);
+    }

    QLIST_REMOVE(&r, next);
    g_free(r.tag);
@@ -622,13 +570,13 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
    return injected;
 }

-static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
+static void blkdebug_debug_event(BlockDriverState *bs, BlkdebugEvent event)
 {
    BDRVBlkdebugState *s = bs->opaque;
    struct BlkdebugRule *rule, *next;
    bool injected;

-    assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);
+    assert((int)event >= 0 && event < BLKDBG__MAX);

    injected = false;
    s->new_state = s->state;
@@ -643,7 +591,7 @@ static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event,
 {
    BDRVBlkdebugState *s = bs->opaque;
    struct BlkdebugRule *rule;
-    BlkDebugEvent blkdebug_event;
+    BlkdebugEvent blkdebug_event;

    if (get_event_by_name(event, &blkdebug_event) < 0) {
        return -ENOENT;
@@ -685,7 +633,7 @@ static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs,
    BlkdebugRule *rule, *next;
    int i, ret = -ENOENT;

-    for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
+    for (i = 0; i < BLKDBG__MAX; i++) {
        QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) {
            if (rule->action == ACTION_SUSPEND &&
                !strcmp(rule->options.suspend.tag, tag)) {
@@ -726,17 +674,15 @@ static int blkdebug_truncate(BlockDriverState *bs, int64_t offset)
    return bdrv_truncate(bs->file->bs, offset);
 }

-static void blkdebug_refresh_filename(BlockDriverState *bs)
+static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
 {
    QDict *opts;
    const QDictEntry *e;
    bool force_json = false;

-    for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
+    for (e = qdict_first(options); e; e = qdict_next(options, e)) {
        if (strcmp(qdict_entry_key(e), "config") &&
-            strcmp(qdict_entry_key(e), "x-image") &&
-            strcmp(qdict_entry_key(e), "image") &&
-            strncmp(qdict_entry_key(e), "image.", strlen("image.")))
+            strcmp(qdict_entry_key(e), "x-image"))
        {
            force_json = true;
            break;
@@ -752,7 +698,7 @@ static void blkdebug_refresh_filename(BlockDriverState *bs)
    if (!force_json && bs->file->bs->exact_filename[0]) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
                 "blkdebug:%s:%s",
-                 qdict_get_try_str(bs->options, "config") ?: "",
+                 qdict_get_try_str(options, "config") ?: "",
                 bs->file->bs->exact_filename);
    }

@@ -762,11 +708,8 @@ static void blkdebug_refresh_filename(BlockDriverState *bs)
    QINCREF(bs->file->bs->full_open_options);
    qdict_put_obj(opts, "image", QOBJECT(bs->file->bs->full_open_options));

-    for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
-        if (strcmp(qdict_entry_key(e), "x-image") &&
-            strcmp(qdict_entry_key(e), "image") &&
-            strncmp(qdict_entry_key(e), "image.", strlen("image.")))
-        {
+    for (e = qdict_first(options); e; e = qdict_next(options, e)) {
+        if (strcmp(qdict_entry_key(e), "x-image")) {
            qobject_incref(qdict_entry_value(e));
            qdict_put_obj(opts, qdict_entry_key(e), qdict_entry_value(e));
        }
@@ -775,6 +718,12 @@ static void blkdebug_refresh_filename(BlockDriverState *bs)
    bs->full_open_options = opts;
 }

+static int blkdebug_reopen_prepare(BDRVReopenState *reopen_state,
+                                   BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static BlockDriver bdrv_blkdebug = {
    .format_name            = "blkdebug",
    .protocol_name          = "blkdebug",
@@ -783,6 +732,7 @@ static BlockDriver bdrv_blkdebug = {
    .bdrv_parse_filename    = blkdebug_parse_filename,
    .bdrv_file_open         = blkdebug_open,
    .bdrv_close             = blkdebug_close,
+    .bdrv_reopen_prepare    = blkdebug_reopen_prepare,
    .bdrv_getlength         = blkdebug_getlength,
    .bdrv_truncate          = blkdebug_truncate,
    .bdrv_refresh_filename  = blkdebug_refresh_filename,
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -307,7 +307,7 @@ static void blkverify_attach_aio_context(BlockDriverState *bs,
    bdrv_attach_aio_context(s->test_file->bs, new_context);
 }

-static void blkverify_refresh_filename(BlockDriverState *bs)
+static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options)
 {
    BDRVBlkverifyState *s = bs->opaque;

--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -12,12 +12,17 @@

 #include "sysemu/block-backend.h"
 #include "block/block_int.h"
+#include "block/blockjob.h"
+#include "block/throttle-groups.h"
 #include "sysemu/blockdev.h"
+#include "sysemu/sysemu.h"
 #include "qapi-event.h"

 /* Number of coroutines to reserve per attached device model */
 #define COROUTINE_POOL_RESERVATION 64

+static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
+
 struct BlockBackend {
    char *name;
    int refcnt;
@@ -29,15 +34,31 @@ struct BlockBackend {
    /* TODO change to DeviceState when all users are qdevified */
    const BlockDevOps *dev_ops;
    void *dev_opaque;
+
+    /* the block size for which the guest device expects atomicity */
+    int guest_block_size;
+
+    /* If the BDS tree is removed, some of its options are stored here (which
+     * can be used to restore those options in the new BDS on insert) */
+    BlockBackendRootState root_state;
+
+    /* I/O stats (display with "info blockstats"). */
+    BlockAcctStats stats;
+
+    BlockdevOnError on_read_error, on_write_error;
+    bool iostatus_enabled;
+    BlockDeviceIoStatus iostatus;
 };

 typedef struct BlockBackendAIOCB {
    BlockAIOCB common;
    QEMUBH *bh;
+    BlockBackend *blk;
    int ret;
 } BlockBackendAIOCB;

 static const AIOCBInfo block_backend_aiocb_info = {
+    .get_aio_context = blk_aiocb_get_aio_context,
    .aiocb_size = sizeof(BlockBackendAIOCB),
 };

@@ -145,12 +166,17 @@ static void blk_delete(BlockBackend *blk)
        bdrv_unref(blk->bs);
        blk->bs = NULL;
    }
+    if (blk->root_state.throttle_state) {
+        g_free(blk->root_state.throttle_group);
+        throttle_group_unref(blk->root_state.throttle_state);
+    }
    /* Avoid double-remove after blk_hide_on_behalf_of_hmp_drive_del() */
    if (blk->name[0]) {
        QTAILQ_REMOVE(&blk_backends, blk, link);
    }
    g_free(blk->name);
    drive_info_del(blk->legacy_dinfo);
+    block_acct_cleanup(&blk->stats);
    g_free(blk);
 }

@@ -164,6 +190,11 @@ static void drive_info_del(DriveInfo *dinfo)
    g_free(dinfo);
 }

+int blk_get_refcnt(BlockBackend *blk)
+{
+    return blk ? blk->refcnt : 0;
+}
+
 /*
 * Increment @blk's reference count.
 * @blk must not be null.
@@ -308,6 +339,29 @@ void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend *blk)
    }
 }

+/*
+ * Disassociates the currently associated BlockDriverState from @blk.
+ */
+void blk_remove_bs(BlockBackend *blk)
+{
+    blk_update_root_state(blk);
+
+    blk->bs->blk = NULL;
+    bdrv_unref(blk->bs);
+    blk->bs = NULL;
+}
+
+/*
+ * Associates a new BlockDriverState with @blk.
+ */
+void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
+{
+    assert(!blk->bs && !bs->blk);
+    bdrv_ref(bs);
+    blk->bs = bs;
+    bs->blk = blk;
+}
+
 /*
 * Attach device model @dev to @blk.
 * Return 0 on success, -EBUSY when a device model is attached already.
@@ -320,7 +374,7 @@ int blk_attach_dev(BlockBackend *blk, void *dev)
    }
    blk_ref(blk);
    blk->dev = dev;
-    bdrv_iostatus_reset(blk->bs);
+    blk_iostatus_reset(blk);
    return 0;
 }

@@ -347,7 +401,7 @@ void blk_detach_dev(BlockBackend *blk, void *dev)
    blk->dev = NULL;
    blk->dev_ops = NULL;
    blk->dev_opaque = NULL;
-    bdrv_set_guest_block_size(blk->bs, 512);
+    blk->guest_block_size = 512;
    blk_unref(blk);
 }

@@ -381,18 +435,15 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
 void blk_dev_change_media_cb(BlockBackend *blk, bool load)
 {
    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
-        bool tray_was_closed = !blk_dev_is_tray_open(blk);
+        bool tray_was_open, tray_is_open;

+        tray_was_open = blk_dev_is_tray_open(blk);
        blk->dev_ops->change_media_cb(blk->dev_opaque, load);
-        if (tray_was_closed) {
-            /* tray open */
-            qapi_event_send_device_tray_moved(blk_name(blk),
-                                              true, &error_abort);
-        }
-        if (load) {
-            /* tray close */
-            qapi_event_send_device_tray_moved(blk_name(blk),
-                                              false, &error_abort);
+        tray_is_open = blk_dev_is_tray_open(blk);
+
+        if (tray_was_open != tray_is_open) {
+            qapi_event_send_device_tray_moved(blk_name(blk), tray_is_open,
+                                              &error_abort);
        }
    }
 }
@@ -452,7 +503,47 @@ void blk_dev_resize_cb(BlockBackend *blk)

 void blk_iostatus_enable(BlockBackend *blk)
 {
-    bdrv_iostatus_enable(blk->bs);
+    blk->iostatus_enabled = true;
+    blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+}
+
+/* The I/O status is only enabled if the drive explicitly
+ * enables it _and_ the VM is configured to stop on errors */
+bool blk_iostatus_is_enabled(const BlockBackend *blk)
+{
+    return (blk->iostatus_enabled &&
+           (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
+            blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
+            blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
+}
+
+BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
+{
+    return blk->iostatus;
+}
+
+void blk_iostatus_disable(BlockBackend *blk)
+{
+    blk->iostatus_enabled = false;
+}
+
+void blk_iostatus_reset(BlockBackend *blk)
+{
+    if (blk_iostatus_is_enabled(blk)) {
+        blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+        if (blk->bs && blk->bs->job) {
+            block_job_iostatus_reset(blk->bs->job);
+        }
+    }
+}
+
+void blk_iostatus_set_err(BlockBackend *blk, int error)
+{
+    assert(blk_iostatus_is_enabled(blk));
+    if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
+        blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
+                                          BLOCK_DEVICE_IO_STATUS_FAILED;
+    }
 }

 static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
@@ -464,7 +555,7 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
        return -EIO;
    }

-    if (!blk_is_inserted(blk)) {
+    if (!blk_is_available(blk)) {
        return -ENOMEDIUM;
    }

@@ -551,13 +642,15 @@ static void error_callback_bh(void *opaque)
    qemu_aio_unref(acb);
 }

-static BlockAIOCB *abort_aio_request(BlockBackend *blk, BlockCompletionFunc *cb,
-                                     void *opaque, int ret)
+BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
+                                  BlockCompletionFunc *cb,
+                                  void *opaque, int ret)
 {
    struct BlockBackendAIOCB *acb;
    QEMUBH *bh;

    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
+    acb->blk = blk;
    acb->ret = ret;

    bh = aio_bh_new(blk_get_aio_context(blk), error_callback_bh, acb);
@@ -573,7 +666,7 @@ BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
 {
    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
    }

    return bdrv_aio_write_zeroes(blk->bs, sector_num, nb_sectors, flags,
@@ -602,16 +695,28 @@ int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count)

 int64_t blk_getlength(BlockBackend *blk)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_getlength(blk->bs);
 }

 void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
 {
-    bdrv_get_geometry(blk->bs, nb_sectors_ptr);
+    if (!blk->bs) {
+        *nb_sectors_ptr = 0;
+    } else {
+        bdrv_get_geometry(blk->bs, nb_sectors_ptr);
+    }
 }

 int64_t blk_nb_sectors(BlockBackend *blk)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_nb_sectors(blk->bs);
 }

@@ -621,7 +726,7 @@ BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
 {
    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
    }

    return bdrv_aio_readv(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
@@ -633,7 +738,7 @@ BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
 {
    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
    }

    return bdrv_aio_writev(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
@@ -642,6 +747,10 @@ BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
                          BlockCompletionFunc *cb, void *opaque)
 {
+    if (!blk_is_available(blk)) {
+        return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
+    }
+
    return bdrv_aio_flush(blk->bs, cb, opaque);
 }

@@ -651,7 +760,7 @@ BlockAIOCB *blk_aio_discard(BlockBackend *blk,
 {
    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
    }

    return bdrv_aio_discard(blk->bs, sector_num, nb_sectors, cb, opaque);
@@ -683,12 +792,20 @@ int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)

 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_ioctl(blk->bs, req, buf);
 }

 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
                          BlockCompletionFunc *cb, void *opaque)
 {
+    if (!blk_is_available(blk)) {
+        return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
+    }
+
    return bdrv_aio_ioctl(blk->bs, req, buf, cb, opaque);
 }

@@ -704,11 +821,19 @@ int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)

 int blk_co_flush(BlockBackend *blk)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_co_flush(blk->bs);
 }

 int blk_flush(BlockBackend *blk)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_flush(blk->bs);
 }

@@ -719,7 +844,9 @@ int blk_flush_all(void)

 void blk_drain(BlockBackend *blk)
 {
-    bdrv_drain(blk->bs);
+    if (blk->bs) {
+        bdrv_drain(blk->bs);
+    }
 }

 void blk_drain_all(void)
@@ -727,76 +854,183 @@ void blk_drain_all(void)
    bdrv_drain_all();
 }

+void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
+                      BlockdevOnError on_write_error)
+{
+    blk->on_read_error = on_read_error;
+    blk->on_write_error = on_write_error;
+}
+
 BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
 {
-    return bdrv_get_on_error(blk->bs, is_read);
+    return is_read ? blk->on_read_error : blk->on_write_error;
 }

 BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
                                      int error)
 {
-    return bdrv_get_error_action(blk->bs, is_read, error);
+    BlockdevOnError on_err = blk_get_on_error(blk, is_read);
+
+    switch (on_err) {
+    case BLOCKDEV_ON_ERROR_ENOSPC:
+        return (error == ENOSPC) ?
+               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
+    case BLOCKDEV_ON_ERROR_STOP:
+        return BLOCK_ERROR_ACTION_STOP;
+    case BLOCKDEV_ON_ERROR_REPORT:
+        return BLOCK_ERROR_ACTION_REPORT;
+    case BLOCKDEV_ON_ERROR_IGNORE:
+        return BLOCK_ERROR_ACTION_IGNORE;
+    default:
+        abort();
+    }
 }

+static void send_qmp_error_event(BlockBackend *blk,
+                                 BlockErrorAction action,
+                                 bool is_read, int error)
+{
+    IoOperationType optype;
+
+    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
+    qapi_event_send_block_io_error(blk_name(blk), optype, action,
+                                   blk_iostatus_is_enabled(blk),
+                                   error == ENOSPC, strerror(error),
+                                   &error_abort);
+}
+
+/* This is done by device models because, while the block layer knows
+ * about the error, it does not know whether an operation comes from
+ * the device or the block layer (from a job, for example).
+ */
 void blk_error_action(BlockBackend *blk, BlockErrorAction action,
                      bool is_read, int error)
 {
-    bdrv_error_action(blk->bs, action, is_read, error);
+    assert(error >= 0);
+
+    if (action == BLOCK_ERROR_ACTION_STOP) {
+        /* First set the iostatus, so that "info block" returns an iostatus
+         * that matches the events raised so far (an additional error iostatus
+         * is fine, but not a lost one).
+         */
+        blk_iostatus_set_err(blk, error);
+
+        /* Then raise the request to stop the VM and the event.
+         * qemu_system_vmstop_request_prepare has two effects.  First,
+         * it ensures that the STOP event always comes after the
+         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
+         * can observe the STOP event and do a "cont" before the STOP
+         * event is issued, the VM will not stop.  In this case, vm_start()
+         * also ensures that the STOP/RESUME pair of events is emitted.
+         */
+        qemu_system_vmstop_request_prepare();
+        send_qmp_error_event(blk, action, is_read, error);
+        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
+    } else {
+        send_qmp_error_event(blk, action, is_read, error);
+    }
 }

 int blk_is_read_only(BlockBackend *blk)
 {
-    return bdrv_is_read_only(blk->bs);
+    if (blk->bs) {
+        return bdrv_is_read_only(blk->bs);
+    } else {
+        return blk->root_state.read_only;
+    }
 }

 int blk_is_sg(BlockBackend *blk)
 {
+    if (!blk->bs) {
+        return 0;
+    }
+
    return bdrv_is_sg(blk->bs);
 }

 int blk_enable_write_cache(BlockBackend *blk)
 {
-    return bdrv_enable_write_cache(blk->bs);
+    if (blk->bs) {
+        return bdrv_enable_write_cache(blk->bs);
+    } else {
+        return !!(blk->root_state.open_flags & BDRV_O_CACHE_WB);
+    }
 }

 void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
 {
-    bdrv_set_enable_write_cache(blk->bs, wce);
+    if (blk->bs) {
+        bdrv_set_enable_write_cache(blk->bs, wce);
+    } else {
+        if (wce) {
+            blk->root_state.open_flags |= BDRV_O_CACHE_WB;
+        } else {
+            blk->root_state.open_flags &= ~BDRV_O_CACHE_WB;
+        }
+    }
 }

 void blk_invalidate_cache(BlockBackend *blk, Error **errp)
 {
+    if (!blk->bs) {
+        error_setg(errp, "Device '%s' has no medium", blk->name);
+        return;
+    }
+
    bdrv_invalidate_cache(blk->bs, errp);
 }

-int blk_is_inserted(BlockBackend *blk)
+bool blk_is_inserted(BlockBackend *blk)
 {
-    return bdrv_is_inserted(blk->bs);
+    return blk->bs && bdrv_is_inserted(blk->bs);
+}
+
+bool blk_is_available(BlockBackend *blk)
+{
+    return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
 }

 void blk_lock_medium(BlockBackend *blk, bool locked)
 {
-    bdrv_lock_medium(blk->bs, locked);
+    if (blk->bs) {
+        bdrv_lock_medium(blk->bs, locked);
+    }
 }

 void blk_eject(BlockBackend *blk, bool eject_flag)
 {
-    bdrv_eject(blk->bs, eject_flag);
+    if (blk->bs) {
+        bdrv_eject(blk->bs, eject_flag);
+    }
 }

 int blk_get_flags(BlockBackend *blk)
 {
-    return bdrv_get_flags(blk->bs);
+    if (blk->bs) {
+        return bdrv_get_flags(blk->bs);
+    } else {
+        return blk->root_state.open_flags;
+    }
 }

 int blk_get_max_transfer_length(BlockBackend *blk)
 {
-    return blk->bs->bl.max_transfer_length;
+    if (blk->bs) {
+        return blk->bs->bl.max_transfer_length;
+    } else {
+        return 0;
+    }
+}
+
+int blk_get_max_iov(BlockBackend *blk)
+{
+    return blk->bs->bl.max_iov;
 }

 void blk_set_guest_block_size(BlockBackend *blk, int align)
 {
-    bdrv_set_guest_block_size(blk->bs, align);
+    blk->guest_block_size = align;
 }

 void *blk_blockalign(BlockBackend *blk, size_t size)
@@ -806,40 +1040,64 @@ void *blk_blockalign(BlockBackend *blk, size_t size)

 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
 {
+    if (!blk->bs) {
+        return false;
+    }
+
    return bdrv_op_is_blocked(blk->bs, op, errp);
 }

 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
 {
-    bdrv_op_unblock(blk->bs, op, reason);
+    if (blk->bs) {
+        bdrv_op_unblock(blk->bs, op, reason);
+    }
 }

 void blk_op_block_all(BlockBackend *blk, Error *reason)
 {
-    bdrv_op_block_all(blk->bs, reason);
+    if (blk->bs) {
+        bdrv_op_block_all(blk->bs, reason);
+    }
 }

 void blk_op_unblock_all(BlockBackend *blk, Error *reason)
 {
-    bdrv_op_unblock_all(blk->bs, reason);
+    if (blk->bs) {
+        bdrv_op_unblock_all(blk->bs, reason);
+    }
 }

 AioContext *blk_get_aio_context(BlockBackend *blk)
 {
-    return bdrv_get_aio_context(blk->bs);
+    if (blk->bs) {
+        return bdrv_get_aio_context(blk->bs);
+    } else {
+        return qemu_get_aio_context();
+    }
+}
+
+static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
+{
+    BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
+    return blk_get_aio_context(blk_acb->blk);
 }

 void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
 {
-    bdrv_set_aio_context(blk->bs, new_context);
+    if (blk->bs) {
+        bdrv_set_aio_context(blk->bs, new_context);
+    }
 }

 void blk_add_aio_context_notifier(BlockBackend *blk,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque)
 {
-    bdrv_add_aio_context_notifier(blk->bs, attached_aio_context,
-                                  detach_aio_context, opaque);
+    if (blk->bs) {
+        bdrv_add_aio_context_notifier(blk->bs, attached_aio_context,
+                                      detach_aio_context, opaque);
+    }
 }

 void blk_remove_aio_context_notifier(BlockBackend *blk,
@@ -848,28 +1106,36 @@ void blk_remove_aio_context_notifier(BlockBackend *blk,
                                     void (*detach_aio_context)(void *),
                                     void *opaque)
 {
-    bdrv_remove_aio_context_notifier(blk->bs, attached_aio_context,
-                                     detach_aio_context, opaque);
+    if (blk->bs) {
+        bdrv_remove_aio_context_notifier(blk->bs, attached_aio_context,
+                                         detach_aio_context, opaque);
+    }
 }

 void blk_add_close_notifier(BlockBackend *blk, Notifier *notify)
 {
-    bdrv_add_close_notifier(blk->bs, notify);
+    if (blk->bs) {
+        bdrv_add_close_notifier(blk->bs, notify);
+    }
 }

 void blk_io_plug(BlockBackend *blk)
 {
-    bdrv_io_plug(blk->bs);
+    if (blk->bs) {
+        bdrv_io_plug(blk->bs);
+    }
 }

 void blk_io_unplug(BlockBackend *blk)
 {
-    bdrv_io_unplug(blk->bs);
+    if (blk->bs) {
+        bdrv_io_unplug(blk->bs);
+    }
 }

 BlockAcctStats *blk_get_stats(BlockBackend *blk)
 {
-    return bdrv_get_stats(blk->bs);
+    return &blk->stats;
 }

 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
@@ -902,6 +1168,10 @@ int blk_write_compressed(BlockBackend *blk, int64_t sector_num,

 int blk_truncate(BlockBackend *blk, int64_t offset)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_truncate(blk->bs, offset);
 }

@@ -918,20 +1188,94 @@ int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
                     int64_t pos, int size)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_save_vmstate(blk->bs, buf, pos, size);
 }

 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_load_vmstate(blk->bs, buf, pos, size);
 }

 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_probe_blocksizes(blk->bs, bsz);
 }

 int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_probe_geometry(blk->bs, geo);
 }
+
+/*
+ * Updates the BlockBackendRootState object with data from the currently
+ * attached BlockDriverState.
+ */
+void blk_update_root_state(BlockBackend *blk)
+{
+    assert(blk->bs);
+
+    blk->root_state.open_flags    = blk->bs->open_flags;
+    blk->root_state.read_only     = blk->bs->read_only;
+    blk->root_state.detect_zeroes = blk->bs->detect_zeroes;
+
+    if (blk->root_state.throttle_group) {
+        g_free(blk->root_state.throttle_group);
+        throttle_group_unref(blk->root_state.throttle_state);
+    }
+    if (blk->bs->throttle_state) {
+        const char *name = throttle_group_get_name(blk->bs);
+        blk->root_state.throttle_group = g_strdup(name);
+        blk->root_state.throttle_state = throttle_group_incref(name);
+    } else {
+        blk->root_state.throttle_group = NULL;
+        blk->root_state.throttle_state = NULL;
+    }
+}
+
+/*
+ * Applies the information in the root state to the given BlockDriverState. This
+ * does not include the flags which have to be specified for bdrv_open(), use
+ * blk_get_open_flags_from_root_state() to inquire them.
+ */
+void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
+{
+    bs->detect_zeroes = blk->root_state.detect_zeroes;
+    if (blk->root_state.throttle_group) {
+        bdrv_io_limits_enable(bs, blk->root_state.throttle_group);
+    }
+}
+
+/*
+ * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
+ * supposed to inherit the root state.
+ */
+int blk_get_open_flags_from_root_state(BlockBackend *blk)
+{
+    int bs_flags;
+
+    bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR;
+    bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR;
+
+    return bs_flags;
+}
+
+BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
+{
+    return &blk->root_state;
+}
--- a/block/commit.c
+++ b/block/commit.c
@@ -17,6 +17,7 @@
 #include "block/blockjob.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
+#include "sysemu/block-backend.h"

 enum {
    /*
@@ -213,7 +214,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,

    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        !bdrv_iostatus_is_enabled(bs)) {
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, "Invalid parameter combination");
        return;
    }
@@ -235,14 +236,14 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
    orig_overlay_flags = bdrv_get_flags(overlay_bs);

    /* convert base & overlay_bs to r/w, if necessary */
-    if (!(orig_base_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
-                                         orig_base_flags | BDRV_O_RDWR);
-    }
    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL,
                                         orig_overlay_flags | BDRV_O_RDWR);
    }
+    if (!(orig_base_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
+                                         orig_base_flags | BDRV_O_RDWR);
+    }
    if (reopen_queue) {
        bdrv_reopen_multiple(reopen_queue, &local_err);
        if (local_err != NULL) {
--- a/block/curl.c
+++ b/block/curl.c
@@ -154,18 +154,20 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
    DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd);
    switch (action) {
        case CURL_POLL_IN:
-            aio_set_fd_handler(s->aio_context, fd, curl_multi_read,
-                               NULL, state);
+            aio_set_fd_handler(s->aio_context, fd, false,
+                               curl_multi_read, NULL, state);
            break;
        case CURL_POLL_OUT:
-            aio_set_fd_handler(s->aio_context, fd, NULL, curl_multi_do, state);
+            aio_set_fd_handler(s->aio_context, fd, false,
+                               NULL, curl_multi_do, state);
            break;
        case CURL_POLL_INOUT:
-            aio_set_fd_handler(s->aio_context, fd, curl_multi_read,
-                               curl_multi_do, state);
+            aio_set_fd_handler(s->aio_context, fd, false,
+                               curl_multi_read, curl_multi_do, state);
            break;
        case CURL_POLL_REMOVE:
-            aio_set_fd_handler(s->aio_context, fd, NULL, NULL, NULL);
+            aio_set_fd_handler(s->aio_context, fd, false,
+                               NULL, NULL, NULL);
            break;
    }

--- a/block/gluster.c
+++ b/block/gluster.c
@@ -429,28 +429,23 @@ static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
    int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    off_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

-    acb->size = size;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = size;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);

-    ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
    }

    qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }

 static inline bool gluster_supports_zerofill(void)
@@ -541,35 +536,30 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
 {
    int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

-    acb->size = size;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = size;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);

    if (write) {
        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
-            &gluster_finish_aiocb, acb);
+            gluster_finish_aiocb, &acb);
    } else {
        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
-            &gluster_finish_aiocb, acb);
+            gluster_finish_aiocb, &acb);
    }

    if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
    }

    qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }

 static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
@@ -600,26 +590,21 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
 static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
 {
    int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;

-    acb->size = 0;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = 0;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);

-    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
+    ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
    if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
    }

    qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }

 #ifdef CONFIG_GLUSTERFS_DISCARD
@@ -627,28 +612,23 @@ static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
 {
    int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

-    acb->size = 0;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = 0;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);

-    ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
    }

    qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }
 #endif

--- a/block/io.c
+++ b/block/io.c
@@ -23,6 +23,7 @@
 */

 #include "trace.h"
+#include "sysemu/block-backend.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
 #include "block/throttle-groups.h"
@@ -165,9 +166,13 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
+        bs->bl.max_iov = bs->file->bs->bl.max_iov;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();
+
+        /* Safe default since most protocols use readv()/writev()/etc */
+        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
@@ -188,6 +193,9 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
+        bs->bl.max_iov =
+            MIN(bs->bl.max_iov,
+                bs->backing->bs->bl.max_iov);
    }

    /* Then let the driver override it */
@@ -215,6 +223,8 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
 /* Check if any requests are in-flight (including throttled requests) */
 bool bdrv_requests_pending(BlockDriverState *bs)
 {
+    BdrvChild *child;
+
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
@@ -224,17 +234,31 @@ bool bdrv_requests_pending(BlockDriverState *bs)
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
-    if (bs->file && bdrv_requests_pending(bs->file->bs)) {
-        return true;
-    }
-    if (bs->backing && bdrv_requests_pending(bs->backing->bs)) {
-        return true;
+
+    QLIST_FOREACH(child, &bs->children, next) {
+        if (bdrv_requests_pending(child->bs)) {
+            return true;
+        }
    }
+
    return false;
 }

+static void bdrv_drain_recurse(BlockDriverState *bs)
+{
+    BdrvChild *child;
+
+    if (bs->drv && bs->drv->bdrv_drain) {
+        bs->drv->bdrv_drain(bs);
+    }
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_drain_recurse(child->bs);
+    }
+}
+
 /*
- * Wait for pending requests to complete on a single BlockDriverState subtree
+ * Wait for pending requests to complete on a single BlockDriverState subtree,
+ * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
@@ -247,6 +271,7 @@ void bdrv_drain(BlockDriverState *bs)
 {
    bool busy = true;

+    bdrv_drain_recurse(bs);
    while (busy) {
        /* Keep iterating */
         bdrv_flush_io_queue(bs);
@@ -344,13 +369,14 @@ static void tracked_request_end(BdrvTrackedRequest *req)
 static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
-                                  unsigned int bytes, bool is_write)
+                                  unsigned int bytes,
+                                  enum BdrvTrackedRequestType type)
 {
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
-        .is_write       = is_write,
+        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
@@ -844,7 +870,9 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

-    wait_serialising_requests(req);
+    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
+        wait_serialising_requests(req);
+    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;
@@ -933,7 +961,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    }

    /* Don't do copy-on-read if we read data before write operation */
-    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_COPY_ON_READ)) {
+    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

@@ -967,7 +995,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
        bytes = ROUND_UP(bytes, align);
    }

-    tracked_request_begin(&req, bs, offset, bytes, false);
+    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
@@ -1002,13 +1030,13 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
 }

-int coroutine_fn bdrv_co_no_copy_on_readv(BlockDriverState *bs,
+int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
-    trace_bdrv_co_no_copy_on_readv(bs, sector_num, nb_sectors);
+    trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
-                            BDRV_REQ_NO_COPY_ON_READ);
+                            BDRV_REQ_NO_SERIALISING);
 }

 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
@@ -1151,7 +1179,9 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,

    bdrv_set_dirty(bs, sector_num, nb_sectors);

-    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
+    if (bs->wr_highest_offset < offset + bytes) {
+        bs->wr_highest_offset = offset + bytes;
+    }

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
@@ -1286,7 +1316,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
-    tracked_request_begin(&req, bs, offset, bytes, true);
+    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (!qiov) {
        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
@@ -1859,7 +1889,8 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
            merge = 1;
        }

-        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
+        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
+            bs->bl.max_iov) {
            merge = 0;
        }

@@ -1903,7 +1934,10 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
        }
    }

-    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+    if (bs->blk) {
+        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
+                              num_reqs - outidx - 1);
+    }

    return outidx + 1;
 }
@@ -2308,18 +2342,20 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
    int ret;
+    BdrvTrackedRequest req;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

+    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
-            return ret;
+            goto out;
        }
    }

@@ -2359,14 +2395,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
        ret = 0;
    }
    if (ret < 0) {
-        return ret;
+        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
 flush_parent:
-    return bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+out:
+    tracked_request_end(&req);
+    return ret;
 }

 int bdrv_flush(BlockDriverState *bs)
@@ -2409,6 +2448,7 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
 {
+    BdrvTrackedRequest req;
    int max_discard, ret;

    if (!bs->drv) {
@@ -2431,6 +2471,8 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
        return 0;
    }

+    tracked_request_begin(&req, bs, sector_num, nb_sectors,
+                          BDRV_TRACKED_DISCARD);
    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
@@ -2464,20 +2506,24 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
-                return -EIO;
+                ret = -EIO;
+                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
-            return ret;
+            goto out;
        }

        sector_num += num;
        nb_sectors -= num;
    }
-    return 0;
+    ret = 0;
+out:
+    tracked_request_end(&req);
+    return ret;
 }

 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
@@ -2506,26 +2552,110 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
    return rwco.ret;
 }

-/* needed for generic scsi interface */
+typedef struct {
+    CoroutineIOCompletion *co;
+    QEMUBH *bh;
+} BdrvIoctlCompletionData;

-int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+static void bdrv_ioctl_bh_cb(void *opaque)
+{
+    BdrvIoctlCompletionData *data = opaque;
+
+    bdrv_co_io_em_complete(data->co, -ENOTSUP);
+    qemu_bh_delete(data->bh);
+}
+
+static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
 {
    BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest tracked_req;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    BlockAIOCB *acb;

-    if (drv && drv->bdrv_ioctl)
-        return drv->bdrv_ioctl(bs, req, buf);
-    return -ENOTSUP;
+    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
+    if (!drv || !drv->bdrv_aio_ioctl) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
+    if (!acb) {
+        BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
+        data->bh = aio_bh_new(bdrv_get_aio_context(bs),
+                                bdrv_ioctl_bh_cb, data);
+        data->co = &co;
+        qemu_bh_schedule(data->bh);
+    }
+    qemu_coroutine_yield();
+out:
+    tracked_request_end(&tracked_req);
+    return co.ret;
+}
+
+typedef struct {
+    BlockDriverState *bs;
+    int req;
+    void *buf;
+    int ret;
+} BdrvIoctlCoData;
+
+static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
+{
+    BdrvIoctlCoData *data = opaque;
+    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
+}
+
+/* needed for generic scsi interface */
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    BdrvIoctlCoData data = {
+        .bs = bs,
+        .req = req,
+        .buf = buf,
+        .ret = -EINPROGRESS,
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_co_ioctl_entry(&data);
+    } else {
+        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
+
+        qemu_coroutine_enter(co, &data);
+        while (data.ret == -EINPROGRESS) {
+            aio_poll(bdrv_get_aio_context(bs), true);
+        }
+    }
+    return data.ret;
+}
+
+static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
+{
+    BlockAIOCBCoroutine *acb = opaque;
+    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
+                                      acb->req.req, acb->req.buf);
+    bdrv_co_complete(acb);
 }

 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
 {
-    BlockDriver *drv = bs->drv;
+    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
+                                            bs, cb, opaque);
+    Coroutine *co;

-    if (drv && drv->bdrv_aio_ioctl)
-        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
-    return NULL;
+    acb->need_bh = true;
+    acb->req.error = -EINPROGRESS;
+    acb->req.req = req;
+    acb->req.buf = buf;
+    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
+    qemu_coroutine_enter(co, acb);
+
+    bdrv_co_maybe_schedule_bh(acb);
+    return &acb->common;
 }

 void *qemu_blockalign(BlockDriverState *bs, size_t size)
@@ -2618,3 +2748,20 @@ void bdrv_flush_io_queue(BlockDriverState *bs)
    }
    bdrv_start_throttled_reqs(bs);
 }
+
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+    if (!bs->quiesce_counter++) {
+        aio_disable_external(bdrv_get_aio_context(bs));
+    }
+    bdrv_drain(bs);
+}
+
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    assert(bs->quiesce_counter > 0);
+    if (--bs->quiesce_counter > 0) {
+        return;
+    }
+    aio_enable_external(bdrv_get_aio_context(bs));
+}
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -84,6 +84,7 @@ typedef struct IscsiTask {
    IscsiLun *iscsilun;
    QEMUTimer retry_timer;
    bool force_next_flush;
+    int err_code;
 } IscsiTask;

 typedef struct IscsiAIOCB {
@@ -96,6 +97,7 @@ typedef struct IscsiAIOCB {
    int status;
    int64_t sector_num;
    int nb_sectors;
+    int ret;
 #ifdef __linux__
    sg_io_hdr_t *ioh;
 #endif
@@ -169,19 +171,70 @@ static inline unsigned exp_random(double mean)
    return -mean * log((double)rand() / RAND_MAX);
 }

-/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced
- * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION
- * macro was introduced in 1.11.0. So use the API_VERSION macro as
- * a hint that the macros are defined and define them ourselves
- * otherwise to keep the required libiscsi version at 1.9.0 */
-#if !defined(LIBISCSI_API_VERSION)
-#define QEMU_SCSI_STATUS_TASK_SET_FULL  0x28
-#define QEMU_SCSI_STATUS_TIMEOUT        0x0f000002
-#else
-#define QEMU_SCSI_STATUS_TASK_SET_FULL  SCSI_STATUS_TASK_SET_FULL
-#define QEMU_SCSI_STATUS_TIMEOUT        SCSI_STATUS_TIMEOUT
+/* SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST was introduced in
+ * libiscsi 1.10.0, together with other constants we need.  Use it as
+ * a hint that we have to define them ourselves if needed, to keep the
+ * minimum required libiscsi version at 1.9.0.  We use an ASCQ macro for
+ * the test because SCSI_STATUS_* is an enum.
+ *
+ * To guard against future changes where SCSI_SENSE_ASCQ_* also becomes
+ * an enum, check against the LIBISCSI_API_VERSION macro, which was
+ * introduced in 1.11.0.  If it is present, there is no need to define
+ * anything.
+ */
+#if !defined(SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST) && \
+    !defined(LIBISCSI_API_VERSION)
+#define SCSI_STATUS_TASK_SET_FULL                          0x28
+#define SCSI_STATUS_TIMEOUT                                0x0f000002
+#define SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST    0x2600
+#define SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR        0x1a00
 #endif

+static int iscsi_translate_sense(struct scsi_sense *sense)
+{
+    int ret;
+
+    switch (sense->key) {
+    case SCSI_SENSE_NOT_READY:
+        return -EBUSY;
+    case SCSI_SENSE_DATA_PROTECTION:
+        return -EACCES;
+    case SCSI_SENSE_COMMAND_ABORTED:
+        return -ECANCELED;
+    case SCSI_SENSE_ILLEGAL_REQUEST:
+        /* Parse ASCQ */
+        break;
+    default:
+        return -EIO;
+    }
+    switch (sense->ascq) {
+    case SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR:
+    case SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE:
+    case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB:
+    case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST:
+        ret = -EINVAL;
+        break;
+    case SCSI_SENSE_ASCQ_LBA_OUT_OF_RANGE:
+        ret = -ENOSPC;
+        break;
+    case SCSI_SENSE_ASCQ_LOGICAL_UNIT_NOT_SUPPORTED:
+        ret = -ENOTSUP;
+        break;
+    case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT:
+    case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_CLOSED:
+    case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_OPEN:
+        ret = -ENOMEDIUM;
+        break;
+    case SCSI_SENSE_ASCQ_WRITE_PROTECTED:
+        ret = -EACCES;
+        break;
+    default:
+        ret = -EIO;
+        break;
+    }
+    return ret;
+}
+
 static void
 iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                        void *command_data, void *opaque)
@@ -203,11 +256,11 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                goto out;
            }
            if (status == SCSI_STATUS_BUSY ||
-                status == QEMU_SCSI_STATUS_TIMEOUT ||
-                status == QEMU_SCSI_STATUS_TASK_SET_FULL) {
+                status == SCSI_STATUS_TIMEOUT ||
+                status == SCSI_STATUS_TASK_SET_FULL) {
                unsigned retry_time =
                    exp_random(iscsi_retry_times[iTask->retries - 1]);
-                if (status == QEMU_SCSI_STATUS_TIMEOUT) {
+                if (status == SCSI_STATUS_TIMEOUT) {
                    /* make sure the request is rescheduled AFTER the
                     * reconnect is initiated */
                    retry_time = EVENT_INTERVAL * 2;
@@ -226,6 +279,7 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                return;
            }
        }
+        iTask->err_code = iscsi_translate_sense(&task->sense);
        error_report("iSCSI Failure: %s", iscsi_get_error(iscsi));
    } else {
        iTask->iscsilun->force_next_flush |= iTask->force_next_flush;
@@ -291,8 +345,8 @@ iscsi_set_events(IscsiLun *iscsilun)
    int ev = iscsi_which_events(iscsi);

    if (ev != iscsilun->events) {
-        aio_set_fd_handler(iscsilun->aio_context,
-                           iscsi_get_fd(iscsi),
+        aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsi),
+                           false,
                           (ev & POLLIN) ? iscsi_process_read : NULL,
                           (ev & POLLOUT) ? iscsi_process_write : NULL,
                           iscsilun);
@@ -455,7 +509,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
@@ -644,7 +698,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    return 0;
@@ -683,7 +737,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    return 0;
@@ -703,7 +757,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    if (status < 0) {
        error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s",
                     iscsi_get_error(iscsi));
-        acb->status = -EIO;
+        acb->status = iscsi_translate_sense(&acb->task->sense);
    }

    acb->ioh->driver_status = 0;
@@ -726,6 +780,38 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    iscsi_schedule_bh(acb);
 }

+static void iscsi_ioctl_bh_completion(void *opaque)
+{
+    IscsiAIOCB *acb = opaque;
+
+    qemu_bh_delete(acb->bh);
+    acb->common.cb(acb->common.opaque, acb->ret);
+    qemu_aio_unref(acb);
+}
+
+static void iscsi_ioctl_handle_emulated(IscsiAIOCB *acb, int req, void *buf)
+{
+    BlockDriverState *bs = acb->common.bs;
+    IscsiLun *iscsilun = bs->opaque;
+    int ret = 0;
+
+    switch (req) {
+    case SG_GET_VERSION_NUM:
+        *(int *)buf = 30000;
+        break;
+    case SG_GET_SCSI_ID:
+        ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
+        break;
+    default:
+        ret = -EINVAL;
+    }
+    assert(!acb->bh);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs),
+                         iscsi_ioctl_bh_completion, acb);
+    acb->ret = ret;
+    qemu_bh_schedule(acb->bh);
+}
+
 static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
@@ -735,8 +821,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    struct iscsi_data data;
    IscsiAIOCB *acb;

-    assert(req == SG_IO);
-
    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);

    acb->iscsilun = iscsilun;
@@ -745,6 +829,11 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    acb->buf         = NULL;
    acb->ioh         = buf;

+    if (req != SG_IO) {
+        iscsi_ioctl_handle_emulated(acb, req, buf);
+        return &acb->common;
+    }
+
    acb->task = malloc(sizeof(struct scsi_task));
    if (acb->task == NULL) {
        error_report("iSCSI: Failed to allocate task for scsi command. %s",
@@ -809,38 +898,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    return &acb->common;
 }

-static void ioctl_cb(void *opaque, int status)
-{
-    int *p_status = opaque;
-    *p_status = status;
-}
-
-static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    IscsiLun *iscsilun = bs->opaque;
-    int status;
-
-    switch (req) {
-    case SG_GET_VERSION_NUM:
-        *(int *)buf = 30000;
-        break;
-    case SG_GET_SCSI_ID:
-        ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
-        break;
-    case SG_IO:
-        status = -EINPROGRESS;
-        iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status);
-
-        while (status == -EINPROGRESS) {
-            aio_poll(iscsilun->aio_context, true);
-        }
-
-        return 0;
-    default:
-        return -1;
-    }
-    return 0;
-}
 #endif

 static int64_t
@@ -905,7 +962,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
@@ -999,7 +1056,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
@@ -1280,9 +1337,8 @@ static void iscsi_detach_aio_context(BlockDriverState *bs)
 {
    IscsiLun *iscsilun = bs->opaque;

-    aio_set_fd_handler(iscsilun->aio_context,
-                       iscsi_get_fd(iscsilun->iscsi),
-                       NULL, NULL, NULL);
+    aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsilun->iscsi),
+                       false, NULL, NULL, NULL);
    iscsilun->events = 0;

    if (iscsilun->nop_timer) {
@@ -1772,7 +1828,6 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_co_flush_to_disk = iscsi_co_flush,

 #ifdef __linux__
-    .bdrv_ioctl       = iscsi_ioctl,
    .bdrv_aio_ioctl   = iscsi_aio_ioctl,
 #endif

--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -287,7 +287,7 @@ void laio_detach_aio_context(void *s_, AioContext *old_context)
 {
    struct qemu_laio_state *s = s_;

-    aio_set_event_notifier(old_context, &s->e, NULL);
+    aio_set_event_notifier(old_context, &s->e, false, NULL);
    qemu_bh_delete(s->completion_bh);
 }

@@ -296,7 +296,8 @@ void laio_attach_aio_context(void *s_, AioContext *new_context)
    struct qemu_laio_state *s = s_;

    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
-    aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb);
+    aio_set_event_notifier(new_context, &s->e, false,
+                           qemu_laio_completion_cb);
 }

 void *laio_init(void)
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -14,9 +14,11 @@
 #include "trace.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
+#include "qemu/error-report.h"

 #define SLICE_TIME    100000000ULL /* ns */
 #define MAX_IN_FLIGHT 16
@@ -159,13 +161,15 @@ static void mirror_read_complete(void *opaque, int ret)
 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
    BlockDriverState *source = s->common.bs;
-    int nb_sectors, sectors_per_chunk, nb_chunks;
+    int nb_sectors, sectors_per_chunk, nb_chunks, max_iov;
    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
    uint64_t delay_ns = 0;
    MirrorOp *op;
    int pnum;
    int64_t ret;

+    max_iov = MIN(source->bl.max_iov, s->target->bl.max_iov);
+
    s->sector_num = hbitmap_iter_next(&s->hbi);
    if (s->sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
@@ -246,7 +250,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
            break;
        }
-        if (IOV_MAX < nb_chunks + added_chunks) {
+        if (max_iov < nb_chunks + added_chunks) {
            trace_mirror_break_iov_max(s, nb_chunks, added_chunks);
            break;
        }
@@ -369,11 +373,22 @@ static void mirror_exit(BlockJob *job, void *opaque)
        if (s->to_replace) {
            to_replace = s->to_replace;
        }
+
+        /* This was checked in mirror_start_job(), but meanwhile one of the
+         * nodes could have been newly attached to a BlockBackend. */
+        if (to_replace->blk && s->target->blk) {
+            error_report("block job: Can't create node with two BlockBackends");
+            data->ret = -EINVAL;
+            goto out;
+        }
+
        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
        }
        bdrv_replace_in_backing_chain(to_replace, s->target);
    }
+
+out:
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
@@ -383,9 +398,11 @@ static void mirror_exit(BlockJob *job, void *opaque)
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
+    bdrv_op_unblock_all(s->target, s->common.blocker);
    bdrv_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
+    bdrv_drained_end(src);
    bdrv_unref(src);
 }

@@ -599,10 +616,15 @@ immediate_exit:
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
-    bdrv_iostatus_disable(s->target);
+    if (s->target->blk) {
+        blk_iostatus_disable(s->target->blk);
+    }

    data = g_malloc(sizeof(*data));
    data->ret = ret;
+    /* Before we switch to target in mirror_exit, make sure data doesn't
+     * change. */
+    bdrv_drained_begin(s->common.bs);
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
 }

@@ -621,7 +643,9 @@ static void mirror_iostatus_reset(BlockJob *job)
 {
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

-    bdrv_iostatus_reset(s->target);
+    if (s->target->blk) {
+        blk_iostatus_reset(s->target->blk);
+    }
 }

 static void mirror_complete(BlockJob *job, Error **errp)
@@ -630,7 +654,7 @@ static void mirror_complete(BlockJob *job, Error **errp)
    Error *local_err = NULL;
    int ret;

-    ret = bdrv_open_backing_file(s->target, NULL, &local_err);
+    ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return;
@@ -695,6 +719,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             bool is_none_mode, BlockDriverState *base)
 {
    MirrorBlockJob *s;
+    BlockDriverState *replaced_bs;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
@@ -704,7 +729,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        !bdrv_iostatus_is_enabled(bs)) {
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }
@@ -718,6 +743,21 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

+    /* We can't support this case as long as the block layer can't handle
+     * multiple BlockBackends per BlockDriverState. */
+    if (replaces) {
+        replaced_bs = bdrv_lookup_bs(replaces, replaces, errp);
+        if (replaced_bs == NULL) {
+            return;
+        }
+    } else {
+        replaced_bs = bs;
+    }
+    if (replaced_bs->blk && target->blk) {
+        error_setg(errp, "Can't create node with two BlockBackends");
+        return;
+    }
+
    s = block_job_create(driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
@@ -736,12 +776,17 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
-        block_job_release(bs);
+        block_job_unref(&s->common);
        return;
    }
+
+    bdrv_op_block_all(s->target, s->common.blocker);
+
    bdrv_set_enable_write_cache(s->target, true);
-    bdrv_set_on_error(s->target, on_target_error, on_target_error);
-    bdrv_iostatus_enable(s->target);
+    if (s->target->blk) {
+        blk_set_on_error(s->target->blk, on_target_error, on_target_error);
+        blk_iostatus_enable(s->target->blk);
+    }
    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co, s);
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -124,7 +124,7 @@ static int nbd_co_send_request(BlockDriverState *bs,
    s->send_coroutine = qemu_coroutine_self();
    aio_context = bdrv_get_aio_context(bs);

-    aio_set_fd_handler(aio_context, s->sock,
+    aio_set_fd_handler(aio_context, s->sock, false,
                       nbd_reply_ready, nbd_restart_write, bs);
    if (qiov) {
        if (!s->is_unix) {
@@ -144,7 +144,8 @@ static int nbd_co_send_request(BlockDriverState *bs,
    } else {
        rc = nbd_send_request(s->sock, request);
    }
-    aio_set_fd_handler(aio_context, s->sock, nbd_reply_ready, NULL, bs);
+    aio_set_fd_handler(aio_context, s->sock, false,
+                       nbd_reply_ready, NULL, bs);
    s->send_coroutine = NULL;
    qemu_co_mutex_unlock(&s->send_mutex);
    return rc;
@@ -348,14 +349,15 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
 void nbd_client_detach_aio_context(BlockDriverState *bs)
 {
    aio_set_fd_handler(bdrv_get_aio_context(bs),
-                       nbd_get_client_session(bs)->sock, NULL, NULL, NULL);
+                       nbd_get_client_session(bs)->sock,
+                       false, NULL, NULL, NULL);
 }

 void nbd_client_attach_aio_context(BlockDriverState *bs,
                                   AioContext *new_context)
 {
    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sock,
-                       nbd_reply_ready, NULL, bs);
+                       false, nbd_reply_ready, NULL, bs);
 }

 void nbd_client_close(BlockDriverState *bs)
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -206,24 +206,24 @@ static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export,
    saddr = g_new0(SocketAddress, 1);

    if (qdict_haskey(options, "path")) {
-        saddr->kind = SOCKET_ADDRESS_KIND_UNIX;
-        saddr->q_unix = g_new0(UnixSocketAddress, 1);
-        saddr->q_unix->path = g_strdup(qdict_get_str(options, "path"));
+        saddr->type = SOCKET_ADDRESS_KIND_UNIX;
+        saddr->u.q_unix = g_new0(UnixSocketAddress, 1);
+        saddr->u.q_unix->path = g_strdup(qdict_get_str(options, "path"));
        qdict_del(options, "path");
    } else {
-        saddr->kind = SOCKET_ADDRESS_KIND_INET;
-        saddr->inet = g_new0(InetSocketAddress, 1);
-        saddr->inet->host = g_strdup(qdict_get_str(options, "host"));
+        saddr->type = SOCKET_ADDRESS_KIND_INET;
+        saddr->u.inet = g_new0(InetSocketAddress, 1);
+        saddr->u.inet->host = g_strdup(qdict_get_str(options, "host"));
        if (!qdict_get_try_str(options, "port")) {
-            saddr->inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
+            saddr->u.inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
        } else {
-            saddr->inet->port = g_strdup(qdict_get_str(options, "port"));
+            saddr->u.inet->port = g_strdup(qdict_get_str(options, "port"));
        }
        qdict_del(options, "host");
        qdict_del(options, "port");
    }

-    s->client.is_unix = saddr->kind == SOCKET_ADDRESS_KIND_UNIX;
+    s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX;

    *export = g_strdup(qdict_get_try_str(options, "export"));
    if (*export) {
@@ -342,13 +342,13 @@ static void nbd_attach_aio_context(BlockDriverState *bs,
    nbd_client_attach_aio_context(bs, new_context);
 }

-static void nbd_refresh_filename(BlockDriverState *bs)
+static void nbd_refresh_filename(BlockDriverState *bs, QDict *options)
 {
    QDict *opts = qdict_new();
-    const char *path   = qdict_get_try_str(bs->options, "path");
-    const char *host   = qdict_get_try_str(bs->options, "host");
-    const char *port   = qdict_get_try_str(bs->options, "port");
-    const char *export = qdict_get_try_str(bs->options, "export");
+    const char *path   = qdict_get_try_str(options, "path");
+    const char *host   = qdict_get_try_str(options, "host");
+    const char *port   = qdict_get_try_str(options, "port");
+    const char *export = qdict_get_try_str(options, "export");

    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd")));

--- a/block/nfs.c
+++ b/block/nfs.c
@@ -63,11 +63,10 @@ static void nfs_set_events(NFSClient *client)
 {
    int ev = nfs_which_events(client->context);
    if (ev != client->events) {
-        aio_set_fd_handler(client->aio_context,
-                           nfs_get_fd(client->context),
+        aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context),
+                           false,
                           (ev & POLLIN) ? nfs_process_read : NULL,
-                           (ev & POLLOUT) ? nfs_process_write : NULL,
-                           client);
+                           (ev & POLLOUT) ? nfs_process_write : NULL, client);

    }
    client->events = ev;
@@ -242,9 +241,8 @@ static void nfs_detach_aio_context(BlockDriverState *bs)
 {
    NFSClient *client = bs->opaque;

-    aio_set_fd_handler(client->aio_context,
-                       nfs_get_fd(client->context),
-                       NULL, NULL, NULL);
+    aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context),
+                       false, NULL, NULL, NULL);
    client->events = 0;
 }

@@ -263,9 +261,8 @@ static void nfs_client_close(NFSClient *client)
        if (client->fh) {
            nfs_close(client->context, client->fh);
        }
-        aio_set_fd_handler(client->aio_context,
-                           nfs_get_fd(client->context),
-                           NULL, NULL, NULL);
+        aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context),
+                           false, NULL, NULL, NULL);
        nfs_destroy_context(client->context);
    }
    memset(client, 0, sizeof(NFSClient));
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -61,7 +61,7 @@ typedef struct ParallelsHeader {
 typedef enum ParallelsPreallocMode {
    PRL_PREALLOC_MODE_FALLOCATE = 0,
    PRL_PREALLOC_MODE_TRUNCATE = 1,
-    PRL_PREALLOC_MODE_MAX = 2,
+    PRL_PREALLOC_MODE__MAX = 2,
 } ParallelsPreallocMode;

 static const char *prealloc_mode_lookup[] = {
@@ -220,7 +220,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
        s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
        s->data_end += s->tracks;
        bitmap_set(s->bat_dirty_bmap,
-                   bat_entry_off(idx) / s->bat_dirty_block, 1);
+                   bat_entry_off(idx + i) / s->bat_dirty_block, 1);
    }

    return bat2sect(s, idx) + sector_num % s->tracks;
@@ -660,7 +660,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
    s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS);
    buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE);
    s->prealloc_mode = qapi_enum_parse(prealloc_mode_lookup, buf,
-            PRL_PREALLOC_MODE_MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err);
+            PRL_PREALLOC_MODE__MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err);
    g_free(buf);
    if (local_err != NULL) {
        goto fail_options;
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -64,7 +64,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
    info->backing_file_depth = bdrv_get_backing_file_depth(bs);
    info->detect_zeroes = bs->detect_zeroes;

-    if (bs->io_limits_enabled) {
+    if (bs->throttle_state) {
        ThrottleConfig cfg;

        throttle_group_get_config(bs, &cfg);
@@ -245,15 +245,17 @@ void bdrv_query_image_info(BlockDriverState *bs,
        info->has_backing_filename = true;
        bdrv_get_full_backing_filename(bs, backing_filename2, PATH_MAX, &err);
        if (err) {
-            error_propagate(errp, err);
-            qapi_free_ImageInfo(info);
+            /* Can't reconstruct the full backing filename, so we must omit
+             * this field and apply a Best Effort to this query. */
            g_free(backing_filename2);
-            return;
+            backing_filename2 = NULL;
+            error_free(err);
        }

-        if (strcmp(backing_filename, backing_filename2) != 0) {
-            info->full_backing_filename =
-                        g_strdup(backing_filename2);
+        /* Always report the full_backing_filename if present, even if it's the
+         * same as backing_filename. That they are same is useful info. */
+        if (backing_filename2) {
+            info->full_backing_filename = g_strdup(backing_filename2);
            info->has_full_backing_filename = true;
        }

@@ -301,17 +303,17 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
        info->tray_open = blk_dev_is_tray_open(blk);
    }

-    if (bdrv_iostatus_is_enabled(bs)) {
+    if (blk_iostatus_is_enabled(blk)) {
        info->has_io_status = true;
-        info->io_status = bs->iostatus;
+        info->io_status = blk_iostatus(blk);
    }

-    if (!QLIST_EMPTY(&bs->dirty_bitmaps)) {
+    if (bs && !QLIST_EMPTY(&bs->dirty_bitmaps)) {
        info->has_dirty_bitmaps = true;
        info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs);
    }

-    if (bs->drv) {
+    if (bs && bs->drv) {
        info->has_inserted = true;
        info->inserted = bdrv_block_device_info(bs, errp);
        if (info->inserted == NULL) {
@@ -344,18 +346,73 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs,
    }

    s->stats = g_malloc0(sizeof(*s->stats));
-    s->stats->rd_bytes = bs->stats.nr_bytes[BLOCK_ACCT_READ];
-    s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE];
-    s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ];
-    s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE];
-    s->stats->rd_merged = bs->stats.merged[BLOCK_ACCT_READ];
-    s->stats->wr_merged = bs->stats.merged[BLOCK_ACCT_WRITE];
-    s->stats->wr_highest_offset =
-        bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE;
-    s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH];
-    s->stats->wr_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_WRITE];
-    s->stats->rd_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_READ];
-    s->stats->flush_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_FLUSH];
+    if (bs->blk) {
+        BlockAcctStats *stats = blk_get_stats(bs->blk);
+        BlockAcctTimedStats *ts = NULL;
+
+        s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
+        s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
+        s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
+        s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
+
+        s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
+        s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
+        s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
+
+        s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
+        s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
+        s->stats->invalid_flush_operations =
+            stats->invalid_ops[BLOCK_ACCT_FLUSH];
+
+        s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ];
+        s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
+        s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
+        s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
+        s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
+        s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
+
+        s->stats->has_idle_time_ns = stats->last_access_time_ns > 0;
+        if (s->stats->has_idle_time_ns) {
+            s->stats->idle_time_ns = block_acct_idle_time_ns(stats);
+        }
+
+        s->stats->account_invalid = stats->account_invalid;
+        s->stats->account_failed = stats->account_failed;
+
+        while ((ts = block_acct_interval_next(stats, ts))) {
+            BlockDeviceTimedStatsList *timed_stats =
+                g_malloc0(sizeof(*timed_stats));
+            BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats));
+            timed_stats->next = s->stats->timed_stats;
+            timed_stats->value = dev_stats;
+            s->stats->timed_stats = timed_stats;
+
+            TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
+            TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
+            TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
+
+            dev_stats->interval_length = ts->interval_length;
+
+            dev_stats->min_rd_latency_ns = timed_average_min(rd);
+            dev_stats->max_rd_latency_ns = timed_average_max(rd);
+            dev_stats->avg_rd_latency_ns = timed_average_avg(rd);
+
+            dev_stats->min_wr_latency_ns = timed_average_min(wr);
+            dev_stats->max_wr_latency_ns = timed_average_max(wr);
+            dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
+
+            dev_stats->min_flush_latency_ns = timed_average_min(fl);
+            dev_stats->max_flush_latency_ns = timed_average_max(fl);
+            dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
+
+            dev_stats->avg_rd_queue_depth =
+                block_acct_queue_depth(ts, BLOCK_ACCT_READ);
+            dev_stats->avg_wr_queue_depth =
+                block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
+        }
+    }
+
+    s->stats->wr_highest_offset = bs->wr_highest_offset;

    if (bs->file) {
        s->has_parent = true;
@@ -381,7 +438,9 @@ BlockInfoList *qmp_query_block(Error **errp)
        bdrv_query_info(blk, &info->value, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
-            goto err;
+            g_free(info);
+            qapi_free_BlockInfoList(head);
+            return NULL;
        }

        *p_next = info;
@@ -389,10 +448,6 @@ BlockInfoList *qmp_query_block(Error **errp)
    }

    return head;
-
- err:
-    qapi_free_BlockInfoList(head);
-    return NULL;
 }

 BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
@@ -535,7 +590,7 @@ static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation,
    int i = 0;

    for (entry = qlist_first(list); entry; entry = qlist_next(entry), i++) {
-        qtype_code type = qobject_type(entry->value);
+        QType type = qobject_type(entry->value);
        bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST);
        const char *format = composite ? "%*s[%i]:\n" : "%*s[%i]: ";

@@ -553,7 +608,7 @@ static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation,
    const QDictEntry *entry;

    for (entry = qdict_first(dict); entry; entry = qdict_next(dict, entry)) {
-        qtype_code type = qobject_type(entry->value);
+        QType type = qobject_type(entry->value);
        bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST);
        const char *format = composite ? "%*s%s:\n" : "%*s%s: ";
        char key[strlen(entry->key) + 1];
@@ -623,7 +678,10 @@ void bdrv_image_info_dump(fprintf_function func_fprintf, void *f,

    if (info->has_backing_filename) {
        func_fprintf(f, "backing file: %s", info->backing_filename);
-        if (info->has_full_backing_filename) {
+        if (!info->has_full_backing_filename) {
+            func_fprintf(f, " (cannot determine actual path)");
+        } else if (strcmp(info->backing_filename,
+                          info->full_backing_filename) != 0) {
            func_fprintf(f, " (actual path: %s)", info->full_backing_filename);
        }
        func_fprintf(f, "\n");
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -312,7 +312,7 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
    if (!offset)
        return 0;

-    assert(qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_COMPRESSED);
+    assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
@@ -324,14 +324,16 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
 	return i;
 }

-static int count_contiguous_free_clusters(int nb_clusters, uint64_t *l2_table)
+static int count_contiguous_clusters_by_type(int nb_clusters,
+                                             uint64_t *l2_table,
+                                             int wanted_type)
 {
    int i;

    for (i = 0; i < nb_clusters; i++) {
        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));

-        if (type != QCOW2_CLUSTER_UNALLOCATED) {
+        if (type != wanted_type) {
            break;
        }
    }
@@ -554,13 +556,14 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
            ret = -EIO;
            goto fail;
        }
-        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
-                &l2_table[l2_index], QCOW_OFLAG_ZERO);
+        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
+                                              QCOW2_CLUSTER_ZERO);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        /* how many empty clusters ? */
-        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
+        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
+                                              QCOW2_CLUSTER_UNALLOCATED);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_NORMAL:
@@ -1638,7 +1641,8 @@ fail:
 static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                                      int l1_size, int64_t *visited_l1_entries,
                                      int64_t l1_entries,
-                                      BlockDriverAmendStatusCB *status_cb)
+                                      BlockDriverAmendStatusCB *status_cb,
+                                      void *cb_opaque)
 {
    BDRVQcow2State *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
@@ -1664,7 +1668,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
            /* unallocated */
            (*visited_l1_entries)++;
            if (status_cb) {
-                status_cb(bs, *visited_l1_entries, l1_entries);
+                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
            }
            continue;
        }
@@ -1801,7 +1805,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,

        (*visited_l1_entries)++;
        if (status_cb) {
-            status_cb(bs, *visited_l1_entries, l1_entries);
+            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
        }
    }

@@ -1825,7 +1829,8 @@ fail:
 * qcow2 version which doesn't yet support metadata zero clusters.
 */
 int qcow2_expand_zero_clusters(BlockDriverState *bs,
-                               BlockDriverAmendStatusCB *status_cb)
+                               BlockDriverAmendStatusCB *status_cb,
+                               void *cb_opaque)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL;
@@ -1842,7 +1847,7 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs,

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                     &visited_l1_entries, l1_entries,
-                                     status_cb);
+                                     status_cb, cb_opaque);
    if (ret < 0) {
        goto fail;
    }
@@ -1878,7 +1883,7 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs,

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
                                         &visited_l1_entries, l1_entries,
-                                         status_cb);
+                                         status_cb, cb_opaque);
        if (ret < 0) {
            goto fail;
        }
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -560,13 +560,16 @@ static int alloc_refcount_block(BlockDriverState *bs,
    }

    /* Hook up the new refcount table in the qcow2 header */
-    uint8_t data[12];
-    cpu_to_be64w((uint64_t*)data, table_offset);
-    cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
+    struct QEMU_PACKED {
+        uint64_t d64;
+        uint32_t d32;
+    } data;
+    cpu_to_be64w(&data.d64, table_offset);
+    cpu_to_be32w(&data.d32, table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file->bs,
                           offsetof(QCowHeader, refcount_table_offset),
-                           data, sizeof(data));
+                           &data, sizeof(data));
    if (ret < 0) {
        goto fail_table;
    }
@@ -1241,7 +1244,7 @@ fail:
 /* refcount checking functions */


-static size_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
+static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
 {
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
@@ -1342,6 +1345,9 @@ static int inc_refcounts(BlockDriverState *bs,
        if (refcount == s->refcount_max) {
            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                    "\n", cluster_offset);
+            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
+                    "width or qemu-img convert to create a clean copy if the "
+                    "image cannot be opened for writing\n");
            res->corruptions++;
            continue;
        }
@@ -2464,3 +2470,450 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,

    return 0;
 }
+
+/* A pointer to a function of this type is given to walk_over_reftable(). That
+ * function will create refblocks and pass them to a RefblockFinishOp once they
+ * are completed (@refblock). @refblock_empty is set if the refblock is
+ * completely empty.
+ *
+ * Along with the refblock, a corresponding reftable entry is passed, in the
+ * reftable @reftable (which may be reallocated) at @reftable_index.
+ *
+ * @allocated should be set to true if a new cluster has been allocated.
+ */
+typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
+                               uint64_t reftable_index, uint64_t *reftable_size,
+                               void *refblock, bool refblock_empty,
+                               bool *allocated, Error **errp);
+
+/**
+ * This "operation" for walk_over_reftable() allocates the refblock on disk (if
+ * it is not empty) and inserts its offset into the new reftable. The size of
+ * this new reftable is increased as required.
+ */
+static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
+                          uint64_t reftable_index, uint64_t *reftable_size,
+                          void *refblock, bool refblock_empty, bool *allocated,
+                          Error **errp)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int64_t offset;
+
+    if (!refblock_empty && reftable_index >= *reftable_size) {
+        uint64_t *new_reftable;
+        uint64_t new_reftable_size;
+
+        new_reftable_size = ROUND_UP(reftable_index + 1,
+                                     s->cluster_size / sizeof(uint64_t));
+        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
+            error_setg(errp,
+                       "This operation would make the refcount table grow "
+                       "beyond the maximum size supported by QEMU, aborting");
+            return -ENOTSUP;
+        }
+
+        new_reftable = g_try_realloc(*reftable, new_reftable_size *
+                                                sizeof(uint64_t));
+        if (!new_reftable) {
+            error_setg(errp, "Failed to increase reftable buffer size");
+            return -ENOMEM;
+        }
+
+        memset(new_reftable + *reftable_size, 0,
+               (new_reftable_size - *reftable_size) * sizeof(uint64_t));
+
+        *reftable      = new_reftable;
+        *reftable_size = new_reftable_size;
+    }
+
+    if (!refblock_empty && !(*reftable)[reftable_index]) {
+        offset = qcow2_alloc_clusters(bs, s->cluster_size);
+        if (offset < 0) {
+            error_setg_errno(errp, -offset, "Failed to allocate refblock");
+            return offset;
+        }
+        (*reftable)[reftable_index] = offset;
+        *allocated = true;
+    }
+
+    return 0;
+}
+
+/**
+ * This "operation" for walk_over_reftable() writes the refblock to disk at the
+ * offset specified by the new reftable's entry. It does not modify the new
+ * reftable or change any refcounts.
+ */
+static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
+                          uint64_t reftable_index, uint64_t *reftable_size,
+                          void *refblock, bool refblock_empty, bool *allocated,
+                          Error **errp)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int64_t offset;
+    int ret;
+
+    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
+        offset = (*reftable)[reftable_index];
+
+        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Overlap check failed");
+            return ret;
+        }
+
+        ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to write refblock");
+            return ret;
+        }
+    } else {
+        assert(refblock_empty);
+    }
+
+    return 0;
+}
+
+/**
+ * This function walks over the existing reftable and every referenced refblock;
+ * if @new_set_refcount is non-NULL, it is called for every refcount entry to
+ * create an equal new entry in the passed @new_refblock. Once that
+ * @new_refblock is completely filled, @operation will be called.
+ *
+ * @status_cb and @cb_opaque are used for the amend operation's status callback.
+ * @index is the index of the walk_over_reftable() calls and @total is the total
+ * number of walk_over_reftable() calls per amend operation. Both are used for
+ * calculating the parameters for the status callback.
+ *
+ * @allocated is set to true if a new cluster has been allocated.
+ */
+static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
+                              uint64_t *new_reftable_index,
+                              uint64_t *new_reftable_size,
+                              void *new_refblock, int new_refblock_size,
+                              int new_refcount_bits,
+                              RefblockFinishOp *operation, bool *allocated,
+                              Qcow2SetRefcountFunc *new_set_refcount,
+                              BlockDriverAmendStatusCB *status_cb,
+                              void *cb_opaque, int index, int total,
+                              Error **errp)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t reftable_index;
+    bool new_refblock_empty = true;
+    int refblock_index;
+    int new_refblock_index = 0;
+    int ret;
+
+    for (reftable_index = 0; reftable_index < s->refcount_table_size;
+         reftable_index++)
+    {
+        uint64_t refblock_offset = s->refcount_table[reftable_index]
+                                 & REFT_OFFSET_MASK;
+
+        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
+                  (uint64_t)total * s->refcount_table_size, cb_opaque);
+
+        if (refblock_offset) {
+            void *refblock;
+
+            if (offset_into_cluster(s, refblock_offset)) {
+                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
+                                        PRIx64 " unaligned (reftable index: %#"
+                                        PRIx64 ")", refblock_offset,
+                                        reftable_index);
+                error_setg(errp,
+                           "Image is corrupt (unaligned refblock offset)");
+                return -EIO;
+            }
+
+            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
+                                  &refblock);
+            if (ret < 0) {
+                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
+                return ret;
+            }
+
+            for (refblock_index = 0; refblock_index < s->refcount_block_size;
+                 refblock_index++)
+            {
+                uint64_t refcount;
+
+                if (new_refblock_index >= new_refblock_size) {
+                    /* new_refblock is now complete */
+                    ret = operation(bs, new_reftable, *new_reftable_index,
+                                    new_reftable_size, new_refblock,
+                                    new_refblock_empty, allocated, errp);
+                    if (ret < 0) {
+                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+                        return ret;
+                    }
+
+                    (*new_reftable_index)++;
+                    new_refblock_index = 0;
+                    new_refblock_empty = true;
+                }
+
+                refcount = s->get_refcount(refblock, refblock_index);
+                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
+                    uint64_t offset;
+
+                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+
+                    offset = ((reftable_index << s->refcount_block_bits)
+                              + refblock_index) << s->cluster_bits;
+
+                    error_setg(errp, "Cannot decrease refcount entry width to "
+                               "%i bits: Cluster at offset %#" PRIx64 " has a "
+                               "refcount of %" PRIu64, new_refcount_bits,
+                               offset, refcount);
+                    return -EINVAL;
+                }
+
+                if (new_set_refcount) {
+                    new_set_refcount(new_refblock, new_refblock_index++,
+                                     refcount);
+                } else {
+                    new_refblock_index++;
+                }
+                new_refblock_empty = new_refblock_empty && refcount == 0;
+            }
+
+            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+        } else {
+            /* No refblock means every refcount is 0 */
+            for (refblock_index = 0; refblock_index < s->refcount_block_size;
+                 refblock_index++)
+            {
+                if (new_refblock_index >= new_refblock_size) {
+                    /* new_refblock is now complete */
+                    ret = operation(bs, new_reftable, *new_reftable_index,
+                                    new_reftable_size, new_refblock,
+                                    new_refblock_empty, allocated, errp);
+                    if (ret < 0) {
+                        return ret;
+                    }
+
+                    (*new_reftable_index)++;
+                    new_refblock_index = 0;
+                    new_refblock_empty = true;
+                }
+
+                if (new_set_refcount) {
+                    new_set_refcount(new_refblock, new_refblock_index++, 0);
+                } else {
+                    new_refblock_index++;
+                }
+            }
+        }
+    }
+
+    if (new_refblock_index > 0) {
+        /* Complete the potentially existing partially filled final refblock */
+        if (new_set_refcount) {
+            for (; new_refblock_index < new_refblock_size;
+                 new_refblock_index++)
+            {
+                new_set_refcount(new_refblock, new_refblock_index, 0);
+            }
+        }
+
+        ret = operation(bs, new_reftable, *new_reftable_index,
+                        new_reftable_size, new_refblock, new_refblock_empty,
+                        allocated, errp);
+        if (ret < 0) {
+            return ret;
+        }
+
+        (*new_reftable_index)++;
+    }
+
+    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
+              (uint64_t)total * s->refcount_table_size, cb_opaque);
+
+    return 0;
+}
+
+int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
+                                BlockDriverAmendStatusCB *status_cb,
+                                void *cb_opaque, Error **errp)
+{
+    BDRVQcow2State *s = bs->opaque;
+    Qcow2GetRefcountFunc *new_get_refcount;
+    Qcow2SetRefcountFunc *new_set_refcount;
+    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
+    uint64_t *new_reftable = NULL, new_reftable_size = 0;
+    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
+    uint64_t new_reftable_index = 0;
+    uint64_t i;
+    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
+    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
+    int old_refcount_order;
+    int walk_index = 0;
+    int ret;
+    bool new_allocation;
+
+    assert(s->qcow_version >= 3);
+    assert(refcount_order >= 0 && refcount_order <= 6);
+
+    /* see qcow2_open() */
+    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
+
+    new_get_refcount = get_refcount_funcs[refcount_order];
+    new_set_refcount = set_refcount_funcs[refcount_order];
+
+
+    do {
+        int total_walks;
+
+        new_allocation = false;
+
+        /* At least we have to do this walk and the one which writes the
+         * refblocks; also, at least we have to do this loop here at least
+         * twice (normally), first to do the allocations, and second to
+         * determine that everything is correctly allocated, this then makes
+         * three walks in total */
+        total_walks = MAX(walk_index + 2, 3);
+
+        /* First, allocate the structures so they are present in the refcount
+         * structures */
+        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
+                                 &new_reftable_size, NULL, new_refblock_size,
+                                 new_refcount_bits, &alloc_refblock,
+                                 &new_allocation, NULL, status_cb, cb_opaque,
+                                 walk_index++, total_walks, errp);
+        if (ret < 0) {
+            goto done;
+        }
+
+        new_reftable_index = 0;
+
+        if (new_allocation) {
+            if (new_reftable_offset) {
+                qcow2_free_clusters(bs, new_reftable_offset,
+                                    allocated_reftable_size * sizeof(uint64_t),
+                                    QCOW2_DISCARD_NEVER);
+            }
+
+            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
+                                                           sizeof(uint64_t));
+            if (new_reftable_offset < 0) {
+                error_setg_errno(errp, -new_reftable_offset,
+                                 "Failed to allocate the new reftable");
+                ret = new_reftable_offset;
+                goto done;
+            }
+            allocated_reftable_size = new_reftable_size;
+        }
+    } while (new_allocation);
+
+    /* Second, write the new refblocks */
+    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
+                             &new_reftable_size, new_refblock,
+                             new_refblock_size, new_refcount_bits,
+                             &flush_refblock, &new_allocation, new_set_refcount,
+                             status_cb, cb_opaque, walk_index, walk_index + 1,
+                             errp);
+    if (ret < 0) {
+        goto done;
+    }
+    assert(!new_allocation);
+
+
+    /* Write the new reftable */
+    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
+                                        new_reftable_size * sizeof(uint64_t));
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Overlap check failed");
+        goto done;
+    }
+
+    for (i = 0; i < new_reftable_size; i++) {
+        cpu_to_be64s(&new_reftable[i]);
+    }
+
+    ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable,
+                      new_reftable_size * sizeof(uint64_t));
+
+    for (i = 0; i < new_reftable_size; i++) {
+        be64_to_cpus(&new_reftable[i]);
+    }
+
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed to write the new reftable");
+        goto done;
+    }
+
+
+    /* Empty the refcount cache */
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
+        goto done;
+    }
+
+    /* Update the image header to point to the new reftable; this only updates
+     * the fields which are relevant to qcow2_update_header(); other fields
+     * such as s->refcount_table or s->refcount_bits stay stale for now
+     * (because we have to restore everything if qcow2_update_header() fails) */
+    old_refcount_order  = s->refcount_order;
+    old_reftable_size   = s->refcount_table_size;
+    old_reftable_offset = s->refcount_table_offset;
+
+    s->refcount_order        = refcount_order;
+    s->refcount_table_size   = new_reftable_size;
+    s->refcount_table_offset = new_reftable_offset;
+
+    ret = qcow2_update_header(bs);
+    if (ret < 0) {
+        s->refcount_order        = old_refcount_order;
+        s->refcount_table_size   = old_reftable_size;
+        s->refcount_table_offset = old_reftable_offset;
+        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
+        goto done;
+    }
+
+    /* Now update the rest of the in-memory information */
+    old_reftable = s->refcount_table;
+    s->refcount_table = new_reftable;
+
+    s->refcount_bits = 1 << refcount_order;
+    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
+    s->refcount_max += s->refcount_max - 1;
+
+    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
+    s->refcount_block_size = 1 << s->refcount_block_bits;
+
+    s->get_refcount = new_get_refcount;
+    s->set_refcount = new_set_refcount;
+
+    /* For cleaning up all old refblocks and the old reftable below the "done"
+     * label */
+    new_reftable        = old_reftable;
+    new_reftable_size   = old_reftable_size;
+    new_reftable_offset = old_reftable_offset;
+
+done:
+    if (new_reftable) {
+        /* On success, new_reftable actually points to the old reftable (and
+         * new_reftable_size is the old reftable's size); but that is just
+         * fine */
+        for (i = 0; i < new_reftable_size; i++) {
+            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
+            if (offset) {
+                qcow2_free_clusters(bs, offset, s->cluster_size,
+                                    QCOW2_DISCARD_OTHER);
+            }
+        }
+        g_free(new_reftable);
+
+        if (new_reftable_offset > 0) {
+            qcow2_free_clusters(bs, new_reftable_offset,
+                                new_reftable_size * sizeof(uint64_t),
+                                QCOW2_DISCARD_OTHER);
+        }
+    }
+
+    qemu_vfree(new_refblock);
+    return ret;
+}
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1282,6 +1282,52 @@ static void qcow2_reopen_abort(BDRVReopenState *state)
    g_free(state->opaque);
 }

+static void qcow2_join_options(QDict *options, QDict *old_options)
+{
+    bool has_new_overlap_template =
+        qdict_haskey(options, QCOW2_OPT_OVERLAP) ||
+        qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE);
+    bool has_new_total_cache_size =
+        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE);
+    bool has_all_cache_options;
+
+    /* New overlap template overrides all old overlap options */
+    if (has_new_overlap_template) {
+        qdict_del(old_options, QCOW2_OPT_OVERLAP);
+        qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE);
+        qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER);
+        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1);
+        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2);
+        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE);
+        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK);
+        qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE);
+        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1);
+        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2);
+    }
+
+    /* New total cache size overrides all old options */
+    if (qdict_haskey(options, QCOW2_OPT_CACHE_SIZE)) {
+        qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE);
+        qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
+    }
+
+    qdict_join(options, old_options, false);
+
+    /*
+     * If after merging all cache size options are set, an old total size is
+     * overwritten. Do keep all options, however, if all three are new. The
+     * resulting error message is what we want to happen.
+     */
+    has_all_cache_options =
+        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) ||
+        qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) ||
+        qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
+
+    if (has_all_cache_options && !has_new_total_cache_size) {
+        qdict_del(options, QCOW2_OPT_CACHE_SIZE);
+    }
+}
+
 static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
 {
@@ -2269,7 +2315,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
                                         DEFAULT_CLUSTER_SIZE);
    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
-                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
+                               PREALLOC_MODE__MAX, PREALLOC_MODE_OFF,
                               &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
@@ -2738,18 +2784,16 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
    ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1);

    *spec_info = (ImageInfoSpecific){
-        .kind  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
-        {
-            .qcow2 = g_new(ImageInfoSpecificQCow2, 1),
-        },
+        .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
+        .u.qcow2 = g_new(ImageInfoSpecificQCow2, 1),
    };
    if (s->qcow_version == 2) {
-        *spec_info->qcow2 = (ImageInfoSpecificQCow2){
+        *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("0.10"),
            .refcount_bits      = s->refcount_bits,
        };
    } else if (s->qcow_version == 3) {
-        *spec_info->qcow2 = (ImageInfoSpecificQCow2){
+        *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("1.1"),
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
@@ -2759,6 +2803,10 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
            .has_corrupt        = true,
            .refcount_bits      = s->refcount_bits,
        };
+    } else {
+        /* if this assertion fails, this probably means a new version was
+         * added without having it covered here */
+        assert(false);
    }

    return spec_info;
@@ -2826,7 +2874,7 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
 * have to be removed.
 */
 static int qcow2_downgrade(BlockDriverState *bs, int target_version,
-                           BlockDriverAmendStatusCB *status_cb)
+                           BlockDriverAmendStatusCB *status_cb, void *cb_opaque)
 {
    BDRVQcow2State *s = bs->opaque;
    int current_version = s->qcow_version;
@@ -2841,13 +2889,7 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version,
    }

    if (s->refcount_order != 4) {
-        /* we would have to convert the image to a refcount_order == 4 image
-         * here; however, since qemu (at the time of writing this) does not
-         * support anything different than 4 anyway, there is no point in doing
-         * so right now; however, we should error out (if qemu supports this in
-         * the future and this code has not been adapted) */
-        error_report("qcow2_downgrade: Image refcount orders other than 4 are "
-                     "currently not supported.");
+        error_report("compat=0.10 requires refcount_bits=16");
        return -ENOTSUP;
    }

@@ -2875,7 +2917,7 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version,
    /* clearing autoclear features is trivial */
    s->autoclear_features = 0;

-    ret = qcow2_expand_zero_clusters(bs, status_cb);
+    ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque);
    if (ret < 0) {
        return ret;
    }
@@ -2889,8 +2931,79 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version,
    return 0;
 }

+typedef enum Qcow2AmendOperation {
+    /* This is the value Qcow2AmendHelperCBInfo::last_operation will be
+     * statically initialized to so that the helper CB can discern the first
+     * invocation from an operation change */
+    QCOW2_NO_OPERATION = 0,
+
+    QCOW2_CHANGING_REFCOUNT_ORDER,
+    QCOW2_DOWNGRADING,
+} Qcow2AmendOperation;
+
+typedef struct Qcow2AmendHelperCBInfo {
+    /* The code coordinating the amend operations should only modify
+     * these four fields; the rest will be managed by the CB */
+    BlockDriverAmendStatusCB *original_status_cb;
+    void *original_cb_opaque;
+
+    Qcow2AmendOperation current_operation;
+
+    /* Total number of operations to perform (only set once) */
+    int total_operations;
+
+    /* The following fields are managed by the CB */
+
+    /* Number of operations completed */
+    int operations_completed;
+
+    /* Cumulative offset of all completed operations */
+    int64_t offset_completed;
+
+    Qcow2AmendOperation last_operation;
+    int64_t last_work_size;
+} Qcow2AmendHelperCBInfo;
+
+static void qcow2_amend_helper_cb(BlockDriverState *bs,
+                                  int64_t operation_offset,
+                                  int64_t operation_work_size, void *opaque)
+{
+    Qcow2AmendHelperCBInfo *info = opaque;
+    int64_t current_work_size;
+    int64_t projected_work_size;
+
+    if (info->current_operation != info->last_operation) {
+        if (info->last_operation != QCOW2_NO_OPERATION) {
+            info->offset_completed += info->last_work_size;
+            info->operations_completed++;
+        }
+
+        info->last_operation = info->current_operation;
+    }
+
+    assert(info->total_operations > 0);
+    assert(info->operations_completed < info->total_operations);
+
+    info->last_work_size = operation_work_size;
+
+    current_work_size = info->offset_completed + operation_work_size;
+
+    /* current_work_size is the total work size for (operations_completed + 1)
+     * operations (which includes this one), so multiply it by the number of
+     * operations not covered and divide it by the number of operations
+     * covered to get a projection for the operations not covered */
+    projected_work_size = current_work_size * (info->total_operations -
+                                               info->operations_completed - 1)
+                                            / (info->operations_completed + 1);
+
+    info->original_status_cb(bs, info->offset_completed + operation_offset,
+                             current_work_size + projected_work_size,
+                             info->original_cb_opaque);
+}
+
 static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
-                               BlockDriverAmendStatusCB *status_cb)
+                               BlockDriverAmendStatusCB *status_cb,
+                               void *cb_opaque)
 {
    BDRVQcow2State *s = bs->opaque;
    int old_version = s->qcow_version, new_version = old_version;
@@ -2900,8 +3013,10 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
    const char *compat = NULL;
    uint64_t cluster_size = s->cluster_size;
    bool encrypt;
+    int refcount_bits = s->refcount_bits;
    int ret;
    QemuOptDesc *desc = opts->list->desc;
+    Qcow2AmendHelperCBInfo helper_cb_info;

    while (desc && desc->name) {
        if (!qemu_opt_find(opts, desc->name)) {
@@ -2919,11 +3034,11 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
            } else if (!strcmp(compat, "1.1")) {
                new_version = 3;
            } else {
-                fprintf(stderr, "Unknown compatibility level %s.\n", compat);
+                error_report("Unknown compatibility level %s", compat);
                return -EINVAL;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) {
-            fprintf(stderr, "Cannot change preallocation mode.\n");
+            error_report("Cannot change preallocation mode");
            return -ENOTSUP;
        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
@@ -2936,47 +3051,74 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
                                        !!s->cipher);

            if (encrypt != !!s->cipher) {
-                fprintf(stderr, "Changing the encryption flag is not "
-                        "supported.\n");
+                error_report("Changing the encryption flag is not supported");
                return -ENOTSUP;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
                                             cluster_size);
            if (cluster_size != s->cluster_size) {
-                fprintf(stderr, "Changing the cluster size is not "
-                        "supported.\n");
+                error_report("Changing the cluster size is not supported");
                return -ENOTSUP;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
                                               lazy_refcounts);
        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
-            error_report("Cannot change refcount entry width");
-            return -ENOTSUP;
+            refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
+                                                refcount_bits);
+
+            if (refcount_bits <= 0 || refcount_bits > 64 ||
+                !is_power_of_2(refcount_bits))
+            {
+                error_report("Refcount width must be a power of two and may "
+                             "not exceed 64 bits");
+                return -EINVAL;
+            }
        } else {
-            /* if this assertion fails, this probably means a new option was
+            /* if this point is reached, this probably means a new option was
             * added without having it covered here */
-            assert(false);
+            abort();
        }

        desc++;
    }

-    if (new_version != old_version) {
-        if (new_version > old_version) {
-            /* Upgrade */
-            s->qcow_version = new_version;
-            ret = qcow2_update_header(bs);
-            if (ret < 0) {
-                s->qcow_version = old_version;
-                return ret;
-            }
-        } else {
-            ret = qcow2_downgrade(bs, new_version, status_cb);
-            if (ret < 0) {
-                return ret;
-            }
+    helper_cb_info = (Qcow2AmendHelperCBInfo){
+        .original_status_cb = status_cb,
+        .original_cb_opaque = cb_opaque,
+        .total_operations = (new_version < old_version)
+                          + (s->refcount_bits != refcount_bits)
+    };
+
+    /* Upgrade first (some features may require compat=1.1) */
+    if (new_version > old_version) {
+        s->qcow_version = new_version;
+        ret = qcow2_update_header(bs);
+        if (ret < 0) {
+            s->qcow_version = old_version;
+            return ret;
+        }
+    }
+
+    if (s->refcount_bits != refcount_bits) {
+        int refcount_order = ctz32(refcount_bits);
+        Error *local_error = NULL;
+
+        if (new_version < 3 && refcount_bits != 16) {
+            error_report("Different refcount widths than 16 bits require "
+                         "compatibility level 1.1 or above (use compat=1.1 or "
+                         "greater)");
+            return -EINVAL;
+        }
+
+        helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
+        ret = qcow2_change_refcount_order(bs, refcount_order,
+                                          &qcow2_amend_helper_cb,
+                                          &helper_cb_info, &local_error);
+        if (ret < 0) {
+            error_report_err(local_error);
+            return ret;
        }
    }

@@ -2991,9 +3133,9 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,

    if (s->use_lazy_refcounts != lazy_refcounts) {
        if (lazy_refcounts) {
-            if (s->qcow_version < 3) {
-                fprintf(stderr, "Lazy refcounts only supported with compatibility "
-                        "level 1.1 and above (use compat=1.1 or greater)\n");
+            if (new_version < 3) {
+                error_report("Lazy refcounts only supported with compatibility "
+                             "level 1.1 and above (use compat=1.1 or greater)");
                return -EINVAL;
            }
            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
@@ -3027,6 +3169,16 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
        }
    }

+    /* Downgrade last (so unsupported features can be removed before) */
+    if (new_version < old_version) {
+        helper_cb_info.current_operation = QCOW2_DOWNGRADING;
+        ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
+                              &helper_cb_info);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
    return 0;
 }

@@ -3147,6 +3299,7 @@ BlockDriver bdrv_qcow2 = {
    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
    .bdrv_reopen_commit   = qcow2_reopen_commit,
    .bdrv_reopen_abort    = qcow2_reopen_abort,
+    .bdrv_join_options    = qcow2_join_options,
    .bdrv_create        = qcow2_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = qcow2_co_get_block_status,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -26,7 +26,7 @@
 #define BLOCK_QCOW2_H

 #include "crypto/cipher.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"

 //#define DEBUG_ALLOC
 //#define DEBUG_ALLOC2
@@ -529,6 +529,10 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
                                  int64_t size);

+int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
+                                BlockDriverAmendStatusCB *status_cb,
+                                void *cb_opaque, Error **errp);
+
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size);
@@ -553,7 +557,8 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
 int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);

 int qcow2_expand_zero_clusters(BlockDriverState *bs,
-                               BlockDriverAmendStatusCB *status_cb);
+                               BlockDriverAmendStatusCB *status_cb,
+                               void *cb_opaque);

 /* qcow2-snapshot.c functions */
 int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
--- a/block/qed.c
+++ b/block/qed.c
@@ -375,6 +375,18 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
    }
 }

+static void bdrv_qed_drain(BlockDriverState *bs)
+{
+    BDRVQEDState *s = bs->opaque;
+
+    /* Cancel timer and start doing I/O that were meant to happen as if it
+     * fired, that way we get bdrv_drain() taking care of the ongoing requests
+     * correctly. */
+    qed_cancel_need_check_timer(s);
+    qed_plug_allocating_write_reqs(s);
+    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+}
+
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
 {
@@ -1676,6 +1688,7 @@ static BlockDriver bdrv_qed = {
    .bdrv_check               = bdrv_qed_check,
    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
+    .bdrv_drain               = bdrv_qed_drain,
 };

 static void bdrv_qed_init(void)
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -847,7 +847,7 @@ static int parse_read_pattern(const char *opt)
        return QUORUM_READ_PATTERN_QUORUM;
    }

-    for (i = 0; i < QUORUM_READ_PATTERN_MAX; i++) {
+    for (i = 0; i < QUORUM_READ_PATTERN__MAX; i++) {
        if (!strcmp(opt, QuorumReadPattern_lookup[i])) {
            return i;
        }
@@ -997,7 +997,7 @@ static void quorum_attach_aio_context(BlockDriverState *bs,
    }
 }

-static void quorum_refresh_filename(BlockDriverState *bs)
+static void quorum_refresh_filename(BlockDriverState *bs, QDict *options)
 {
    BDRVQuorumState *s = bs->opaque;
    QDict *opts;
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -127,11 +127,6 @@ do { \

 #define FTYPE_FILE   0
 #define FTYPE_CD     1
-#define FTYPE_FD     2
-
-/* if the FD is not accessed during that time (in ns), we try to
-   reopen it to see if the disk has been changed */
-#define FD_OPEN_TIMEOUT (1000000000)

 #define MAX_BLOCKSIZE	4096

@@ -141,13 +136,6 @@ typedef struct BDRVRawState {
    int open_flags;
    size_t buf_align;

-#if defined(__linux__)
-    /* linux floppy specific */
-    int64_t fd_open_time;
-    int64_t fd_error_time;
-    int fd_got_error;
-    int fd_media_changed;
-#endif
 #ifdef CONFIG_LINUX_AIO
    int use_aio;
    void *aio_ctx;
@@ -512,21 +500,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
        goto fail;
    }
    if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
-        error_printf("WARNING: aio=native was specified for '%s', but "
-                     "it requires cache.direct=on, which was not "
-                     "specified. Falling back to aio=threads.\n"
-                     "         This will become an error condition in "
-                     "future QEMU versions.\n",
-                     bs->filename);
+        error_setg(errp, "aio=native was specified, but it requires "
+                         "cache.direct=on, which was not specified.");
+        ret = -EINVAL;
+        goto fail;
    }
 #else
    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
-        error_printf("WARNING: aio=native was specified for '%s', but "
-                     "is not supported in this build. Falling back to "
-                     "aio=threads.\n"
-                     "         This will become an error condition in "
-                     "future QEMU versions.\n",
-                     bs->filename);
+        error_setg(errp, "aio=native was specified, but is not supported "
+                         "in this build.");
+        ret = -EINVAL;
+        goto fail;
    }
 #endif /* !defined(CONFIG_LINUX_AIO) */

@@ -635,7 +619,7 @@ static int raw_reopen_prepare(BDRVReopenState *state,
    }
 #endif

-    if (s->type == FTYPE_FD || s->type == FTYPE_CD) {
+    if (s->type == FTYPE_CD) {
        raw_s->open_flags |= O_NONBLOCK;
    }

@@ -1648,7 +1632,7 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
-                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
+                               PREALLOC_MODE__MAX, PREALLOC_MODE_OFF,
                               &local_err);
    g_free(buf);
    if (local_err) {
@@ -1988,8 +1972,8 @@ BlockDriver bdrv_file = {

 #if defined(__APPLE__) && defined(__MACH__)
 static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator );
-static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize );
-
+static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
+                                CFIndex maxPathSize, int flags);
 kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
 {
    kern_return_t       kernResult;
@@ -2016,7 +2000,8 @@ kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
    return kernResult;
 }

-kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize )
+kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
+                         CFIndex maxPathSize, int flags)
 {
    io_object_t     nextMedia;
    kern_return_t   kernResult = KERN_FAILURE;
@@ -2029,7 +2014,9 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma
        if ( bsdPathAsCFString ) {
            size_t devPathLength;
            strcpy( bsdPath, _PATH_DEV );
-            strcat( bsdPath, "r" );
+            if (flags & BDRV_O_NOCACHE) {
+                strcat(bsdPath, "r");
+            }
            devPathLength = strlen( bsdPath );
            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
                kernResult = KERN_SUCCESS;
@@ -2141,8 +2128,8 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
        int fd;

        kernResult = FindEjectableCDMedia( &mediaIterator );
-        kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) );
-
+        kernResult = GetBSDPath(mediaIterator, bsdPath, sizeof(bsdPath),
+                                flags);
        if ( bsdPath[ 0 ] != '\0' ) {
            strcat(bsdPath,"s0");
            /* some CDs don't have a partition 0 */
@@ -2187,53 +2174,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
 }

 #if defined(__linux__)
-/* Note: we do not have a reliable method to detect if the floppy is
-   present. The current method is to try to open the floppy at every
-   I/O and to keep it opened during a few hundreds of ms. */
-static int fd_open(BlockDriverState *bs)
-{
-    BDRVRawState *s = bs->opaque;
-    int last_media_present;
-
-    if (s->type != FTYPE_FD)
-        return 0;
-    last_media_present = (s->fd >= 0);
-    if (s->fd >= 0 &&
-        (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
-        qemu_close(s->fd);
-        s->fd = -1;
-        DPRINTF("Floppy closed\n");
-    }
-    if (s->fd < 0) {
-        if (s->fd_got_error &&
-            (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_error_time) < FD_OPEN_TIMEOUT) {
-            DPRINTF("No floppy (open delayed)\n");
-            return -EIO;
-        }
-        s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK);
-        if (s->fd < 0) {
-            s->fd_error_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-            s->fd_got_error = 1;
-            if (last_media_present)
-                s->fd_media_changed = 1;
-            DPRINTF("No floppy\n");
-            return -EIO;
-        }
-        DPRINTF("Floppy opened\n");
-    }
-    if (!last_media_present)
-        s->fd_media_changed = 1;
-    s->fd_open_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-    s->fd_got_error = 0;
-    return 0;
-}
-
-static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    BDRVRawState *s = bs->opaque;
-
-    return ioctl(s->fd, req, buf);
-}

 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
@@ -2256,8 +2196,8 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
+#endif /* linux */

-#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static int fd_open(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
@@ -2267,14 +2207,6 @@ static int fd_open(BlockDriverState *bs)
        return 0;
    return -EIO;
 }
-#else /* !linux && !FreeBSD */
-
-static int fd_open(BlockDriverState *bs)
-{
-    return 0;
-}
-
-#endif /* !linux && !FreeBSD */

 static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors,
@@ -2318,14 +2250,13 @@ static int hdev_create(const char *filename, QemuOpts *opts,
    int64_t total_size = 0;
    bool has_prefix;

-    /* This function is used by all three protocol block drivers and therefore
-     * any of these three prefixes may be given.
+    /* This function is used by both protocol block drivers and therefore either
+     * of these prefixes may be given.
     * The return value has to be stored somewhere, otherwise this is an error
     * due to -Werror=unused-value. */
    has_prefix =
        strstart(filename, "host_device:", &filename) ||
-        strstart(filename, "host_cdrom:" , &filename) ||
-        strstart(filename, "host_floppy:", &filename);
+        strstart(filename, "host_cdrom:" , &filename);

    (void)has_prefix;

@@ -2400,160 +2331,10 @@ static BlockDriver bdrv_host_device = {

    /* generic scsi device */
 #ifdef __linux__
-    .bdrv_ioctl         = hdev_ioctl,
    .bdrv_aio_ioctl     = hdev_aio_ioctl,
 #endif
 };

-#ifdef __linux__
-static void floppy_parse_filename(const char *filename, QDict *options,
-                                  Error **errp)
-{
-    /* The prefix is optional, just as for "file". */
-    strstart(filename, "host_floppy:", &filename);
-
-    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
-}
-
-static int floppy_open(BlockDriverState *bs, QDict *options, int flags,
-                       Error **errp)
-{
-    BDRVRawState *s = bs->opaque;
-    Error *local_err = NULL;
-    int ret;
-
-    s->type = FTYPE_FD;
-
-    /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */
-    ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err);
-    if (ret) {
-        if (local_err) {
-            error_propagate(errp, local_err);
-        }
-        return ret;
-    }
-
-    /* close fd so that we can reopen it as needed */
-    qemu_close(s->fd);
-    s->fd = -1;
-    s->fd_media_changed = 1;
-
-    error_report("Host floppy pass-through is deprecated");
-    error_printf("Support for it will be removed in a future release.\n");
-    return 0;
-}
-
-static int floppy_probe_device(const char *filename)
-{
-    int fd, ret;
-    int prio = 0;
-    struct floppy_struct fdparam;
-    struct stat st;
-
-    if (strstart(filename, "/dev/fd", NULL) &&
-        !strstart(filename, "/dev/fdset/", NULL) &&
-        !strstart(filename, "/dev/fd/", NULL)) {
-        prio = 50;
-    }
-
-    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
-    if (fd < 0) {
-        goto out;
-    }
-    ret = fstat(fd, &st);
-    if (ret == -1 || !S_ISBLK(st.st_mode)) {
-        goto outc;
-    }
-
-    /* Attempt to detect via a floppy specific ioctl */
-    ret = ioctl(fd, FDGETPRM, &fdparam);
-    if (ret >= 0)
-        prio = 100;
-
-outc:
-    qemu_close(fd);
-out:
-    return prio;
-}
-
-
-static int floppy_is_inserted(BlockDriverState *bs)
-{
-    return fd_open(bs) >= 0;
-}
-
-static int floppy_media_changed(BlockDriverState *bs)
-{
-    BDRVRawState *s = bs->opaque;
-    int ret;
-
-    /*
-     * XXX: we do not have a true media changed indication.
-     * It does not work if the floppy is changed without trying to read it.
-     */
-    fd_open(bs);
-    ret = s->fd_media_changed;
-    s->fd_media_changed = 0;
-    DPRINTF("Floppy changed=%d\n", ret);
-    return ret;
-}
-
-static void floppy_eject(BlockDriverState *bs, bool eject_flag)
-{
-    BDRVRawState *s = bs->opaque;
-    int fd;
-
-    if (s->fd >= 0) {
-        qemu_close(s->fd);
-        s->fd = -1;
-    }
-    fd = qemu_open(bs->filename, s->open_flags | O_NONBLOCK);
-    if (fd >= 0) {
-        if (ioctl(fd, FDEJECT, 0) < 0)
-            perror("FDEJECT");
-        qemu_close(fd);
-    }
-}
-
-static BlockDriver bdrv_host_floppy = {
-    .format_name        = "host_floppy",
-    .protocol_name      = "host_floppy",
-    .instance_size      = sizeof(BDRVRawState),
-    .bdrv_needs_filename = true,
-    .bdrv_probe_device	= floppy_probe_device,
-    .bdrv_parse_filename = floppy_parse_filename,
-    .bdrv_file_open     = floppy_open,
-    .bdrv_close         = raw_close,
-    .bdrv_reopen_prepare = raw_reopen_prepare,
-    .bdrv_reopen_commit  = raw_reopen_commit,
-    .bdrv_reopen_abort   = raw_reopen_abort,
-    .bdrv_create         = hdev_create,
-    .create_opts         = &raw_create_opts,
-
-    .bdrv_aio_readv     = raw_aio_readv,
-    .bdrv_aio_writev    = raw_aio_writev,
-    .bdrv_aio_flush	= raw_aio_flush,
-    .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_io_plug = raw_aio_plug,
-    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,
-
-    .bdrv_truncate      = raw_truncate,
-    .bdrv_getlength      = raw_getlength,
-    .has_variable_length = true,
-    .bdrv_get_allocated_file_size
-                        = raw_get_allocated_file_size,
-
-    .bdrv_detach_aio_context = raw_detach_aio_context,
-    .bdrv_attach_aio_context = raw_attach_aio_context,
-
-    /* removable device support */
-    .bdrv_is_inserted   = floppy_is_inserted,
-    .bdrv_media_changed = floppy_media_changed,
-    .bdrv_eject         = floppy_eject,
-};
-#endif
-
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static void cdrom_parse_filename(const char *filename, QDict *options,
                                 Error **errp)
@@ -2609,15 +2390,13 @@ out:
    return prio;
 }

-static int cdrom_is_inserted(BlockDriverState *bs)
+static bool cdrom_is_inserted(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
    int ret;

    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
-    if (ret == CDS_DISC_OK)
-        return 1;
-    return 0;
+    return ret == CDS_DISC_OK;
 }

 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
@@ -2684,7 +2463,6 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_lock_medium   = cdrom_lock_medium,

    /* generic scsi device */
-    .bdrv_ioctl         = hdev_ioctl,
    .bdrv_aio_ioctl     = hdev_aio_ioctl,
 };
 #endif /* __linux__ */
@@ -2743,7 +2521,7 @@ static int cdrom_reopen(BlockDriverState *bs)
    return 0;
 }

-static int cdrom_is_inserted(BlockDriverState *bs)
+static bool cdrom_is_inserted(BlockDriverState *bs)
 {
    return raw_getlength(bs) > 0;
 }
@@ -2831,7 +2609,6 @@ static void bdrv_file_init(void)
    bdrv_register(&bdrv_file);
    bdrv_register(&bdrv_host_device);
 #ifdef __linux__
-    bdrv_register(&bdrv_host_floppy);
    bdrv_register(&bdrv_host_cdrom);
 #endif
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -154,11 +154,6 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset)
    return bdrv_truncate(bs->file->bs, offset);
 }

-static int raw_is_inserted(BlockDriverState *bs)
-{
-    return bdrv_is_inserted(bs->file->bs);
-}
-
 static int raw_media_changed(BlockDriverState *bs)
 {
    return bdrv_media_changed(bs->file->bs);
@@ -174,11 +169,6 @@ static void raw_lock_medium(BlockDriverState *bs, bool locked)
    bdrv_lock_medium(bs->file->bs, locked);
 }

-static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    return bdrv_ioctl(bs->file->bs, req, buf);
-}
-
 static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs,
                                 unsigned long int req, void *buf,
                                 BlockCompletionFunc *cb,
@@ -264,11 +254,9 @@ BlockDriver bdrv_raw = {
    .bdrv_refresh_limits  = &raw_refresh_limits,
    .bdrv_probe_blocksizes = &raw_probe_blocksizes,
    .bdrv_probe_geometry  = &raw_probe_geometry,
-    .bdrv_is_inserted     = &raw_is_inserted,
    .bdrv_media_changed   = &raw_media_changed,
    .bdrv_eject           = &raw_eject,
    .bdrv_lock_medium     = &raw_lock_medium,
-    .bdrv_ioctl           = &raw_ioctl,
    .bdrv_aio_ioctl       = &raw_aio_ioctl,
    .create_opts          = &raw_create_opts,
    .bdrv_has_zero_init   = &raw_has_zero_init
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -651,14 +651,16 @@ static coroutine_fn void do_co_req(void *opaque)
    unsigned int *rlen = srco->rlen;

    co = qemu_coroutine_self();
-    aio_set_fd_handler(srco->aio_context, sockfd, NULL, restart_co_req, co);
+    aio_set_fd_handler(srco->aio_context, sockfd, false,
+                       NULL, restart_co_req, co);

    ret = send_co_req(sockfd, hdr, data, wlen);
    if (ret < 0) {
        goto out;
    }

-    aio_set_fd_handler(srco->aio_context, sockfd, restart_co_req, NULL, co);
+    aio_set_fd_handler(srco->aio_context, sockfd, false,
+                       restart_co_req, NULL, co);

    ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
    if (ret != sizeof(*hdr)) {
@@ -683,7 +685,8 @@ static coroutine_fn void do_co_req(void *opaque)
 out:
    /* there is at most one request for this sockfd, so it is safe to
     * set each handler to NULL. */
-    aio_set_fd_handler(srco->aio_context, sockfd, NULL, NULL, NULL);
+    aio_set_fd_handler(srco->aio_context, sockfd, false,
+                       NULL, NULL, NULL);

    srco->ret = ret;
    srco->finished = true;
@@ -735,7 +738,8 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
    BDRVSheepdogState *s = opaque;
    AIOReq *aio_req, *next;

-    aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
+    aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
+                       NULL, NULL);
    close(s->fd);
    s->fd = -1;

@@ -938,7 +942,8 @@ static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
        return fd;
    }

-    aio_set_fd_handler(s->aio_context, fd, co_read_response, NULL, s);
+    aio_set_fd_handler(s->aio_context, fd, false,
+                       co_read_response, NULL, s);
    return fd;
 }

@@ -1199,7 +1204,7 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,

    qemu_co_mutex_lock(&s->lock);
    s->co_send = qemu_coroutine_self();
-    aio_set_fd_handler(s->aio_context, s->fd,
+    aio_set_fd_handler(s->aio_context, s->fd, false,
                       co_read_response, co_write_request, s);
    socket_set_cork(s->fd, 1);

@@ -1218,7 +1223,8 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
    }
 out:
    socket_set_cork(s->fd, 0);
-    aio_set_fd_handler(s->aio_context, s->fd, co_read_response, NULL, s);
+    aio_set_fd_handler(s->aio_context, s->fd, false,
+                       co_read_response, NULL, s);
    s->co_send = NULL;
    qemu_co_mutex_unlock(&s->lock);
 }
@@ -1368,7 +1374,8 @@ static void sd_detach_aio_context(BlockDriverState *bs)
 {
    BDRVSheepdogState *s = bs->opaque;

-    aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
+    aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
+                       NULL, NULL);
 }

 static void sd_attach_aio_context(BlockDriverState *bs,
@@ -1377,7 +1384,8 @@ static void sd_attach_aio_context(BlockDriverState *bs,
    BDRVSheepdogState *s = bs->opaque;

    s->aio_context = new_context;
-    aio_set_fd_handler(new_context, s->fd, co_read_response, NULL, s);
+    aio_set_fd_handler(new_context, s->fd, false,
+                       co_read_response, NULL, s);
 }

 /* TODO Convert to fine grained options */
@@ -1490,7 +1498,8 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
    g_free(buf);
    return 0;
 out:
-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
+                       false, NULL, NULL, NULL);
    if (s->fd >= 0) {
        closesocket(s->fd);
    }
@@ -1528,7 +1537,8 @@ static void sd_reopen_commit(BDRVReopenState *state)
    BDRVSheepdogState *s = state->bs->opaque;

    if (s->fd) {
-        aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
+        aio_set_fd_handler(s->aio_context, s->fd, false,
+                           NULL, NULL, NULL);
        closesocket(s->fd);
    }

@@ -1551,7 +1561,8 @@ static void sd_reopen_abort(BDRVReopenState *state)
    }

    if (re_s->fd) {
-        aio_set_fd_handler(s->aio_context, re_s->fd, NULL, NULL, NULL);
+        aio_set_fd_handler(s->aio_context, re_s->fd, false,
+                           NULL, NULL, NULL);
        closesocket(re_s->fd);
    }

@@ -1935,7 +1946,8 @@ static void sd_close(BlockDriverState *bs)
        error_report("%s, %s", sd_strerror(rsp->result), s->name);
    }

-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
+                       false, NULL, NULL, NULL);
    closesocket(s->fd);
    g_free(s->host_spec);
 }
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -229,6 +229,8 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
                         Error **errp)
 {
    BlockDriver *drv = bs->drv;
+    int ret;
+
    if (!drv) {
        error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
        return -ENOMEDIUM;
@@ -239,23 +241,26 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
    }

    /* drain all pending i/o before deleting snapshot */
-    bdrv_drain(bs);
+    bdrv_drained_begin(bs);

    if (drv->bdrv_snapshot_delete) {
-        return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp);
+        ret = drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp);
+    } else if (bs->file) {
+        ret = bdrv_snapshot_delete(bs->file->bs, snapshot_id, name, errp);
+    } else {
+        error_setg(errp, "Block format '%s' used by device '%s' "
+                   "does not support internal snapshot deletion",
+                   drv->format_name, bdrv_get_device_name(bs));
+        ret = -ENOTSUP;
    }
-    if (bs->file) {
-        return bdrv_snapshot_delete(bs->file->bs, snapshot_id, name, errp);
-    }
-    error_setg(errp, "Block format '%s' used by device '%s' "
-               "does not support internal snapshot deletion",
-               drv->format_name, bdrv_get_device_name(bs));
-    return -ENOTSUP;
+
+    bdrv_drained_end(bs);
+    return ret;
 }

-void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs,
-                                        const char *id_or_name,
-                                        Error **errp)
+int bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs,
+                                       const char *id_or_name,
+                                       Error **errp)
 {
    int ret;
    Error *local_err = NULL;
@@ -270,6 +275,7 @@ void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs,
    if (ret < 0) {
        error_propagate(errp, local_err);
    }
+    return ret;
 }

 int bdrv_snapshot_list(BlockDriverState *bs,
@@ -356,3 +362,130 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,

    return ret;
 }
+
+
+/* Group operations. All block drivers are involved.
+ * These functions will properly handle dataplane (take aio_context_acquire
+ * when appropriate for appropriate block drivers) */
+
+bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
+{
+    bool ok = true;
+    BlockDriverState *bs = NULL;
+
+    while (ok && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bdrv_is_inserted(bs) && !bdrv_is_read_only(bs)) {
+            ok = bdrv_can_snapshot(bs);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return ok;
+}
+
+int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
+                             Error **err)
+{
+    int ret = 0;
+    BlockDriverState *bs = NULL;
+    QEMUSnapshotInfo sn1, *snapshot = &sn1;
+
+    while (ret == 0 && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bdrv_can_snapshot(bs) &&
+                bdrv_snapshot_find(bs, snapshot, name) >= 0) {
+            ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return ret;
+}
+
+
+int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
+{
+    int err = 0;
+    BlockDriverState *bs = NULL;
+
+    while (err == 0 && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bdrv_can_snapshot(bs)) {
+            err = bdrv_snapshot_goto(bs, name);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return err;
+}
+
+int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
+{
+    QEMUSnapshotInfo sn;
+    int err = 0;
+    BlockDriverState *bs = NULL;
+
+    while (err == 0 && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bdrv_can_snapshot(bs)) {
+            err = bdrv_snapshot_find(bs, &sn, name);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return err;
+}
+
+int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
+                             BlockDriverState *vm_state_bs,
+                             uint64_t vm_state_size,
+                             BlockDriverState **first_bad_bs)
+{
+    int err = 0;
+    BlockDriverState *bs = NULL;
+
+    while (err == 0 && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bs == vm_state_bs) {
+            sn->vm_state_size = vm_state_size;
+            err = bdrv_snapshot_create(bs, sn);
+        } else if (bdrv_can_snapshot(bs)) {
+            sn->vm_state_size = 0;
+            err = bdrv_snapshot_create(bs, sn);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return err;
+}
+
+BlockDriverState *bdrv_all_find_vmstate_bs(void)
+{
+    bool not_found = true;
+    BlockDriverState *bs = NULL;
+
+    while (not_found && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        not_found = !bdrv_can_snapshot(bs);
+        aio_context_release(ctx);
+    }
+    return bs;
+}
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -800,14 +800,15 @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
            rd_handler, wr_handler);

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
-                       rd_handler, wr_handler, co);
+                       false, rd_handler, wr_handler, co);
 }

 static coroutine_fn void clear_fd_handler(BDRVSSHState *s,
                                          BlockDriverState *bs)
 {
    DPRINTF("s->sock=%d", s->sock);
-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, NULL, NULL, NULL);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
+                       false, NULL, NULL, NULL);
 }

 /* A non-blocking call returned EAGAIN, so yield, ensuring the
--- a/block/stream.c
+++ b/block/stream.c
@@ -16,6 +16,7 @@
 #include "block/blockjob.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
+#include "sysemu/block-backend.h"

 enum {
    /*
@@ -222,7 +223,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,

    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        !bdrv_iostatus_is_enabled(bs)) {
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-error");
        return;
    }
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -33,8 +33,7 @@
 * its own locking.
 *
 * This locking is however handled internally in this file, so it's
- * mostly transparent to outside users (but see the documentation in
- * throttle_groups_lock()).
+ * transparent to outside users.
 *
 * The whole ThrottleGroup structure is private and invisible to
 * outside users, that only use it through its ThrottleState.
@@ -76,9 +75,9 @@ static QTAILQ_HEAD(, ThrottleGroup) throttle_groups =
 * created.
 *
 * @name: the name of the ThrottleGroup
- * @ret:  the ThrottleGroup
+ * @ret:  the ThrottleState member of the ThrottleGroup
 */
-static ThrottleGroup *throttle_group_incref(const char *name)
+ThrottleState *throttle_group_incref(const char *name)
 {
    ThrottleGroup *tg = NULL;
    ThrottleGroup *iter;
@@ -108,7 +107,7 @@ static ThrottleGroup *throttle_group_incref(const char *name)

    qemu_mutex_unlock(&throttle_groups_lock);

-    return tg;
+    return &tg->ts;
 }

 /* Decrease the reference count of a ThrottleGroup.
@@ -116,10 +115,12 @@ static ThrottleGroup *throttle_group_incref(const char *name)
 * When the reference count reaches zero the ThrottleGroup is
 * destroyed.
 *
- * @tg:  The ThrottleGroup to unref
+ * @ts:  The ThrottleGroup to unref, given by its ThrottleState member
 */
-static void throttle_group_unref(ThrottleGroup *tg)
+void throttle_group_unref(ThrottleState *ts)
 {
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+
    qemu_mutex_lock(&throttle_groups_lock);
    if (--tg->refcount == 0) {
        QTAILQ_REMOVE(&throttle_groups, tg, list);
@@ -401,7 +402,8 @@ static void write_timer_cb(void *opaque)
 void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
 {
    int i;
-    ThrottleGroup *tg = throttle_group_incref(groupname);
+    ThrottleState *ts = throttle_group_incref(groupname);
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    int clock_type = QEMU_CLOCK_REALTIME;

    if (qtest_enabled()) {
@@ -409,7 +411,7 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
        clock_type = QEMU_CLOCK_VIRTUAL;
    }

-    bs->throttle_state = &tg->ts;
+    bs->throttle_state = ts;

    qemu_mutex_lock(&tg->lock);
    /* If the ThrottleGroup is new set this BlockDriverState as the token */
@@ -435,6 +437,9 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
 * list, destroying the timers and setting the throttle_state pointer
 * to NULL.
 *
+ * The BlockDriverState must not have pending throttled requests, so
+ * the caller has to drain them first.
+ *
 * The group will be destroyed if it's empty after this operation.
 *
 * @bs: the BlockDriverState to remove
@@ -444,6 +449,10 @@ void throttle_group_unregister_bs(BlockDriverState *bs)
    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
    int i;

+    assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0);
+    assert(qemu_co_queue_empty(&bs->throttled_reqs[0]));
+    assert(qemu_co_queue_empty(&bs->throttled_reqs[1]));
+
    qemu_mutex_lock(&tg->lock);
    for (i = 0; i < 2; i++) {
        if (tg->tokens[i] == bs) {
@@ -461,38 +470,10 @@ void throttle_group_unregister_bs(BlockDriverState *bs)
    throttle_timers_destroy(&bs->throttle_timers);
    qemu_mutex_unlock(&tg->lock);

-    throttle_group_unref(tg);
+    throttle_group_unref(&tg->ts);
    bs->throttle_state = NULL;
 }

-/* Acquire the lock of this throttling group.
- *
- * You won't normally need to use this. None of the functions from the
- * ThrottleGroup API require you to acquire the lock since all of them
- * deal with it internally.
- *
- * This should only be used in exceptional cases when you want to
- * access the protected fields of a BlockDriverState directly
- * (e.g. bdrv_swap()).
- *
- * @bs: a BlockDriverState that is member of the group
- */
-void throttle_group_lock(BlockDriverState *bs)
-{
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
-    qemu_mutex_lock(&tg->lock);
-}
-
-/* Release the lock of this throttling group.
- *
- * See the comments in throttle_group_lock().
- */
-void throttle_group_unlock(BlockDriverState *bs)
-{
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
-    qemu_mutex_unlock(&tg->lock);
-}
-
 static void throttle_groups_init(void)
 {
    qemu_mutex_init(&throttle_groups_lock);
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -53,7 +53,7 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"

 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2161,19 +2161,19 @@ static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
    ImageInfoList **next;

    *spec_info = (ImageInfoSpecific){
-        .kind = IMAGE_INFO_SPECIFIC_KIND_VMDK,
+        .type = IMAGE_INFO_SPECIFIC_KIND_VMDK,
        {
            .vmdk = g_new0(ImageInfoSpecificVmdk, 1),
        },
    };

-    *spec_info->vmdk = (ImageInfoSpecificVmdk) {
+    *spec_info->u.vmdk = (ImageInfoSpecificVmdk) {
        .create_type = g_strdup(s->create_type),
        .cid = s->cid,
        .parent_cid = s->parent_cid,
    };

-    next = &spec_info->vmdk->extents;
+    next = &spec_info->u.vmdk->extents;
    for (i = 0; i < s->num_extents; i++) {
        *next = g_new0(ImageInfoList, 1);
        (*next)->value = vmdk_get_extent_info(&s->extents[i]);
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -174,7 +174,7 @@ int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile)
 void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *old_context)
 {
-    aio_set_event_notifier(old_context, &aio->e, NULL);
+    aio_set_event_notifier(old_context, &aio->e, false, NULL);
    aio->is_aio_context_attached = false;
 }

@@ -182,7 +182,8 @@ void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *new_context)
 {
    aio->is_aio_context_attached = true;
-    aio_set_event_notifier(new_context, &aio->e, win32_aio_completion_cb);
+    aio_set_event_notifier(new_context, &aio->e, false,
+                           win32_aio_completion_cb);
 }

 QEMUWin32AIOState *win32_aio_init(void)
--- a/block/write-threshold.c
+++ b/block/write-threshold.c
@@ -11,7 +11,7 @@
 */

 #include "block/block_int.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "block/write-threshold.h"
 #include "qemu/notify.h"
 #include "qapi-event.h"
--- a/blockdev.c
+++ b/blockdev.c
--- a/blockjob.c
+++ b/blockjob.c
@@ -29,13 +29,27 @@
 #include "block/block.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qapi/qmp/qerror.h"
 #include "qapi/qmp/qjson.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "qmp-commands.h"
 #include "qemu/timer.h"
 #include "qapi-event.h"

+/* Transactional group of block jobs */
+struct BlockJobTxn {
+
+    /* Is this txn being cancelled? */
+    bool aborting;
+
+    /* List of jobs */
+    QLIST_HEAD(, BlockJob) jobs;
+
+    /* Reference count */
+    int refcnt;
+};
+
 void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
                       int64_t speed, BlockCompletionFunc *cb,
                       void *opaque, Error **errp)
@@ -59,6 +73,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
    job->cb            = cb;
    job->opaque        = opaque;
    job->busy          = true;
+    job->refcnt        = 1;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
@@ -67,7 +82,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,

        block_job_set_speed(job, speed, &local_err);
        if (local_err) {
-            block_job_release(bs);
+            block_job_unref(job);
            error_propagate(errp, local_err);
            return NULL;
        }
@@ -75,15 +90,101 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
    return job;
 }

-void block_job_release(BlockDriverState *bs)
+void block_job_ref(BlockJob *job)
 {
-    BlockJob *job = bs->job;
+    ++job->refcnt;
+}

-    bs->job = NULL;
-    bdrv_op_unblock_all(bs, job->blocker);
-    error_free(job->blocker);
-    g_free(job->id);
-    g_free(job);
+void block_job_unref(BlockJob *job)
+{
+    if (--job->refcnt == 0) {
+        job->bs->job = NULL;
+        bdrv_op_unblock_all(job->bs, job->blocker);
+        bdrv_unref(job->bs);
+        error_free(job->blocker);
+        g_free(job->id);
+        g_free(job);
+    }
+}
+
+static void block_job_completed_single(BlockJob *job)
+{
+    if (!job->ret) {
+        if (job->driver->commit) {
+            job->driver->commit(job);
+        }
+    } else {
+        if (job->driver->abort) {
+            job->driver->abort(job);
+        }
+    }
+    job->cb(job->opaque, job->ret);
+    if (job->txn) {
+        block_job_txn_unref(job->txn);
+    }
+    block_job_unref(job);
+}
+
+static void block_job_completed_txn_abort(BlockJob *job)
+{
+    AioContext *ctx;
+    BlockJobTxn *txn = job->txn;
+    BlockJob *other_job, *next;
+
+    if (txn->aborting) {
+        /*
+         * We are cancelled by another job, which will handle everything.
+         */
+        return;
+    }
+    txn->aborting = true;
+    /* We are the first failed job. Cancel other jobs. */
+    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
+        ctx = bdrv_get_aio_context(other_job->bs);
+        aio_context_acquire(ctx);
+    }
+    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
+        if (other_job == job || other_job->completed) {
+            /* Other jobs are "effectively" cancelled by us, set the status for
+             * them; this job, however, may or may not be cancelled, depending
+             * on the caller, so leave it. */
+            if (other_job != job) {
+                other_job->cancelled = true;
+            }
+            continue;
+        }
+        block_job_cancel_sync(other_job);
+        assert(other_job->completed);
+    }
+    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
+        ctx = bdrv_get_aio_context(other_job->bs);
+        block_job_completed_single(other_job);
+        aio_context_release(ctx);
+    }
+}
+
+static void block_job_completed_txn_success(BlockJob *job)
+{
+    AioContext *ctx;
+    BlockJobTxn *txn = job->txn;
+    BlockJob *other_job, *next;
+    /*
+     * Successful completion, see if there are other running jobs in this
+     * txn.
+     */
+    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
+        if (!other_job->completed) {
+            return;
+        }
+    }
+    /* We are the last completed job, commit the transaction. */
+    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
+        ctx = bdrv_get_aio_context(other_job->bs);
+        aio_context_acquire(ctx);
+        assert(other_job->ret == 0);
+        block_job_completed_single(other_job);
+        aio_context_release(ctx);
+    }
 }

 void block_job_completed(BlockJob *job, int ret)
@@ -91,8 +192,16 @@ void block_job_completed(BlockJob *job, int ret)
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
-    job->cb(job->opaque, ret);
-    block_job_release(bs);
+    assert(!job->completed);
+    job->completed = true;
+    job->ret = ret;
+    if (!job->txn) {
+        block_job_completed_single(job);
+    } else if (ret < 0 || block_job_is_cancelled(job)) {
+        block_job_completed_txn_abort(job);
+    } else {
+        block_job_completed_txn_success(job);
+    }
 }

 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -177,43 +286,29 @@ struct BlockFinishData {
    int ret;
 };

-static void block_job_finish_cb(void *opaque, int ret)
-{
-    struct BlockFinishData *data = opaque;
-
-    data->cancelled = block_job_is_cancelled(data->job);
-    data->ret = ret;
-    data->cb(data->opaque, ret);
-}
-
 static int block_job_finish_sync(BlockJob *job,
                                 void (*finish)(BlockJob *, Error **errp),
                                 Error **errp)
 {
-    struct BlockFinishData data;
    BlockDriverState *bs = job->bs;
    Error *local_err = NULL;
+    int ret;

    assert(bs->job == job);

-    /* Set up our own callback to store the result and chain to
-     * the original callback.
-     */
-    data.job = job;
-    data.cb = job->cb;
-    data.opaque = job->opaque;
-    data.ret = -EINPROGRESS;
-    job->cb = block_job_finish_cb;
-    job->opaque = &data;
+    block_job_ref(job);
    finish(job, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
+        block_job_unref(job);
        return -EBUSY;
    }
-    while (data.ret == -EINPROGRESS) {
+    while (!job->completed) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }
-    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
+    ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
+    block_job_unref(job);
+    return ret;
 }

 /* A wrapper around block_job_cancel() taking an Error ** parameter so it may be
@@ -354,8 +449,8 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
        job->user_paused = true;
        block_job_pause(job);
        block_job_iostatus_set_err(job, error);
-        if (bs != job->bs) {
-            bdrv_iostatus_set_err(bs, error);
+        if (bs->blk && bs != job->bs) {
+            blk_iostatus_set_err(bs->blk, error);
        }
    }
    return action;
@@ -405,3 +500,36 @@ void block_job_defer_to_main_loop(BlockJob *job,

    qemu_bh_schedule(data->bh);
 }
+
+BlockJobTxn *block_job_txn_new(void)
+{
+    BlockJobTxn *txn = g_new0(BlockJobTxn, 1);
+    QLIST_INIT(&txn->jobs);
+    txn->refcnt = 1;
+    return txn;
+}
+
+static void block_job_txn_ref(BlockJobTxn *txn)
+{
+    txn->refcnt++;
+}
+
+void block_job_txn_unref(BlockJobTxn *txn)
+{
+    if (txn && --txn->refcnt == 0) {
+        g_free(txn);
+    }
+}
+
+void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job)
+{
+    if (!txn) {
+        return;
+    }
+
+    assert(!job->txn);
+    job->txn = txn;
+
+    QLIST_INSERT_HEAD(&txn->jobs, job, txn_list);
+    block_job_txn_ref(txn);
+}
--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@@ -740,8 +740,7 @@ static void padzero(abi_ulong elf_bss, abi_ulong last_bss)
           size must be known */
        if (qemu_real_host_page_size < qemu_host_page_size) {
            abi_ulong end_addr, end_addr1;
-            end_addr1 = (elf_bss + qemu_real_host_page_size - 1) &
-                ~(qemu_real_host_page_size - 1);
+            end_addr1 = REAL_HOST_PAGE_ALIGN(elf_bss);
            end_addr = HOST_PAGE_ALIGN(elf_bss);
            if (end_addr1 < end_addr) {
                mmap((void *)g2h(end_addr1), end_addr - end_addr1,
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -938,7 +938,7 @@ int main(int argc, char **argv)
            unsigned long tmp;
            if (fscanf(fp, "%lu", &tmp) == 1) {
                mmap_min_addr = tmp;
-                qemu_log("host mmap_min_addr=0x%lx\n", mmap_min_addr);
+                qemu_log_mask(CPU_LOG_PAGE, "host mmap_min_addr=0x%lx\n", mmap_min_addr);
            }
            fclose(fp);
        }
@@ -955,7 +955,7 @@ int main(int argc, char **argv)

    free(target_environ);

-    if (qemu_log_enabled()) {
+    if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
        qemu_log("guest_base  0x%lx\n", guest_base);
        log_page_dump();

--- a/bsd-user/signal.c
+++ b/bsd-user/signal.c
@@ -26,8 +26,6 @@
 #include "qemu.h"
 #include "target_signal.h"

-//#define DEBUG_SIGNAL
-
 void signal_init(void)
 {
 }
--- a/261
+++ b/261
@@ -8,6 +8,9 @@
 CLICOLOR_FORCE= GREP_OPTIONS=
 unset CLICOLOR_FORCE GREP_OPTIONS

+# Don't allow CCACHE, if present, to use cached results of compile tests!
+export CCACHE_RECACHE=yes
+
 # Temporary directory used for files created while
 # configure runs. Since it is in the build directory
 # we can safely blow away any previous version of it
@@ -261,6 +264,7 @@ rdma=""
 gprof="no"
 debug_tcg="no"
 debug="no"
+fortify_source=""
 strip_opt="yes"
 tcg_interpreter="no"
 bigendian="no"
@@ -331,6 +335,8 @@ gtkabi=""
 gtk_gl="no"
 gnutls=""
 gnutls_hash=""
+nettle=""
+gcrypt=""
 vte=""
 virglrenderer=""
 tpm="yes"
@@ -417,9 +423,6 @@ if test "$debug_info" = "yes"; then
    LDFLAGS="-g $LDFLAGS"
 fi

-test_cflags=""
-test_libs=""
-
 # make source path absolute
 source_path=`cd "$source_path"; pwd`

@@ -724,6 +727,8 @@ if test "$mingw32" = "yes" ; then
  QEMU_CFLAGS="-DWIN32_LEAN_AND_MEAN -DWINVER=0x501 $QEMU_CFLAGS"
  # enable C99/POSIX format strings (needs mingw32-runtime 3.15 or later)
  QEMU_CFLAGS="-D__USE_MINGW_ANSI_STDIO=1 $QEMU_CFLAGS"
+  # MinGW needs -mthreads for TLS and macro _MT.
+  QEMU_CFLAGS="-mthreads $QEMU_CFLAGS"
  LIBS="-lwinmm -lws2_32 -liphlpapi $LIBS"
  write_c_skeleton;
  if compile_prog "" "-liberty" ; then
@@ -788,6 +793,9 @@ for opt do
  --enable-modules)
      modules="yes"
  ;;
+  --disable-modules)
+      modules="no"
+  ;;
  --cpu=*)
  ;;
  --target-list=*) target_list="$optarg"
@@ -877,6 +885,7 @@ for opt do
      debug_tcg="yes"
      debug="yes"
      strip_opt="no"
+      fortify_source="no"
  ;;
  --enable-sparse) sparse="yes"
  ;;
@@ -1114,6 +1123,14 @@ for opt do
  ;;
  --enable-gnutls) gnutls="yes"
  ;;
+  --disable-nettle) nettle="no"
+  ;;
+  --enable-nettle) nettle="yes"
+  ;;
+  --disable-gcrypt) gcrypt="no"
+  ;;
+  --enable-gcrypt) gcrypt="yes"
+  ;;
  --enable-rdma) rdma="yes"
  ;;
  --disable-rdma) rdma="no"
@@ -1324,6 +1341,8 @@ disabled with --disable-FEATURE, default is enabled if available:
  sparse          sparse checker

  gnutls          GNUTLS cryptography support
+  nettle          nettle cryptography support
+  gcrypt          libgcrypt cryptography support
  sdl             SDL UI
  --with-sdlabi     select preferred SDL ABI 1.2 or 2.0
  gtk             gtk UI
@@ -1331,7 +1350,6 @@ disabled with --disable-FEATURE, default is enabled if available:
  vte             vte support for the gtk UI
  curses          curses UI
  vnc             VNC UI support
-  vnc-tls         TLS encryption for VNC server
  vnc-sasl        SASL encryption for VNC server
  vnc-jpeg        JPEG lossy compression for VNC server
  vnc-png         PNG compression for VNC server
@@ -1410,6 +1428,9 @@ if compile_object ; then
 else
    error_exit "\"$cc\" either does not exist or does not work"
 fi
+if ! compile_prog ; then
+    error_exit "\"$cc\" cannot build an executable (is your linker broken?)"
+fi

 # Check that the C++ compiler exists and works with the C compiler
 if has $cxx; then
@@ -1470,6 +1491,16 @@ for flag in $gcc_flags; do
 done

 if test "$stack_protector" != "no"; then
+  cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+    char arr[64], *p = arr, *c = argv[0];
+    while (*c) {
+        *p++ = *c++;
+    }
+    return 0;
+}
+EOF
  gcc_flags="-fstack-protector-strong -fstack-protector-all"
  sp_on=0
  for flag in $gcc_flags; do
@@ -1872,16 +1903,34 @@ fi
 # libseccomp check

 if test "$seccomp" != "no" ; then
-    if test "$cpu" = "i386" || test "$cpu" = "x86_64" &&
-        $pkg_config --atleast-version=2.1.1 libseccomp; then
+    case "$cpu" in
+    i386|x86_64)
+        libseccomp_minver="2.1.0"
+        ;;
+    arm|aarch64)
+        libseccomp_minver="2.2.3"
+        ;;
+    *)
+        libseccomp_minver=""
+        ;;
+    esac
+
+    if test "$libseccomp_minver" != "" &&
+       $pkg_config --atleast-version=$libseccomp_minver libseccomp ; then
        libs_softmmu="$libs_softmmu `$pkg_config --libs libseccomp`"
        QEMU_CFLAGS="$QEMU_CFLAGS `$pkg_config --cflags libseccomp`"
-	seccomp="yes"
+        seccomp="yes"
    else
-	if test "$seccomp" = "yes"; then
-            feature_not_found "libseccomp" "Install libseccomp devel >= 2.1.1"
-	fi
-	seccomp="no"
+        if test "$seccomp" = "yes" ; then
+            if test "$libseccomp_minver" != "" ; then
+                feature_not_found "libseccomp" \
+                    "Install libseccomp devel >= $libseccomp_minver"
+            else
+                feature_not_found "libseccomp" \
+                    "libseccomp is not supported for host cpu $cpu"
+            fi
+        fi
+        seccomp="no"
    fi
 fi
 ##########################################
@@ -1912,6 +1961,23 @@ EOF
  elif
      cat > $TMPC <<EOF &&
 #include <xenctrl.h>
+#include <stdint.h>
+int main(void) {
+  xc_interface *xc = NULL;
+  xen_domain_handle_t handle;
+  xc_domain_create(xc, 0, handle, 0, NULL, NULL);
+  return 0;
+}
+EOF
+      compile_prog "" "$xen_libs"
+    then
+    xen_ctrl_version=470
+    xen=yes
+
+  # Xen 4.6
+  elif
+      cat > $TMPC <<EOF &&
+#include <xenctrl.h>
 #include <xenstore.h>
 #include <stdint.h>
 #include <xen/hvm/hvm_info_table.h>
@@ -2254,20 +2320,76 @@ else
    gnutls_hash="no"
 fi

-if test "$gnutls_gcrypt" != "no"; then
-    if has "libgcrypt-config"; then
+
+# If user didn't give a --disable/enable-gcrypt flag,
+# then mark as disabled if user requested nettle
+# explicitly, or if gnutls links to nettle
+if test -z "$gcrypt"
+then
+    if test "$nettle" = "yes" || test "$gnutls_nettle" = "yes"
+    then
+        gcrypt="no"
+    fi
+fi
+
+# If user didn't give a --disable/enable-nettle flag,
+# then mark as disabled if user requested gcrypt
+# explicitly, or if gnutls links to gcrypt
+if test -z "$nettle"
+then
+    if test "$gcrypt" = "yes" || test "$gnutls_gcrypt" = "yes"
+    then
+        nettle="no"
+    fi
+fi
+
+has_libgcrypt_config() {
+    if ! has "libgcrypt-config"
+    then
+	return 1
+    fi
+
+    if test -n "$cross_prefix"
+    then
+	host=`libgcrypt-config --host`
+	if test "$host-" != $cross_prefix
+	then
+	    return 1
+	fi
+    fi
+
+    return 0
+}
+
+if test "$gcrypt" != "no"; then
+    if has_libgcrypt_config; then
        gcrypt_cflags=`libgcrypt-config --cflags`
        gcrypt_libs=`libgcrypt-config --libs`
+        # Debian has remove -lgpg-error from libgcrypt-config
+        # as it "spreads unnecessary dependencies" which in
+        # turn breaks static builds...
+        if test "$static" = "yes"
+        then
+            gcrypt_libs="$gcrypt_libs -lgpg-error"
+        fi
        libs_softmmu="$gcrypt_libs $libs_softmmu"
        libs_tools="$gcrypt_libs $libs_tools"
        QEMU_CFLAGS="$QEMU_CFLAGS $gcrypt_cflags"
+        gcrypt="yes"
+        if test -z "$nettle"; then
+           nettle="no"
+        fi
    else
-        feature_not_found "gcrypt" "Install gcrypt devel"
+        if test "$gcrypt" = "yes"; then
+            feature_not_found "gcrypt" "Install gcrypt devel"
+        else
+            gcrypt="no"
+        fi
    fi
 fi


-if test "$gnutls_nettle" != "no"; then
+if test "$nettle" != "no"; then
    if $pkg_config --exists "nettle"; then
        nettle_cflags=`$pkg_config --cflags nettle`
        nettle_libs=`$pkg_config --libs nettle`
@@ -2275,25 +2397,43 @@ if test "$gnutls_nettle" != "no"; then
        libs_softmmu="$nettle_libs $libs_softmmu"
        libs_tools="$nettle_libs $libs_tools"
        QEMU_CFLAGS="$QEMU_CFLAGS $nettle_cflags"
+        nettle="yes"
    else
-        feature_not_found "nettle" "Install nettle devel"
+        if test "$nettle" = "yes"; then
+            feature_not_found "nettle" "Install nettle devel"
+        else
+            nettle="no"
+        fi
    fi
 fi

+if test "$gcrypt" = "yes" && test "$nettle" = "yes"
+then
+    error_exit "Only one of gcrypt & nettle can be enabled"
+fi
+
 ##########################################
 # libtasn1 - only for the TLS creds/session test suite

 tasn1=yes
+tasn1_cflags=""
+tasn1_libs=""
 if $pkg_config --exists "libtasn1"; then
    tasn1_cflags=`$pkg_config --cflags libtasn1`
    tasn1_libs=`$pkg_config --libs libtasn1`
-    test_cflags="$test_cflags $tasn1_cflags"
-    test_libs="$test_libs $tasn1_libs"
 else
    tasn1=no
 fi


+##########################################
+# getifaddrs (for tests/test-io-channel-socket )
+
+have_ifaddrs_h=yes
+if ! check_include "ifaddrs.h" ; then
+  have_ifaddrs_h=no
+fi
+
 ##########################################
 # VTE probe

@@ -3211,25 +3351,11 @@ fi
 libs_softmmu="$libs_softmmu $fdt_libs"

 ##########################################
-# opengl probe (for sdl2, milkymist-tmu2)
-
-# GLX probe, used by milkymist-tmu2
-# this is temporary, code will be switched to egl mid-term.
-cat > $TMPC << EOF
-#include <X11/Xlib.h>
-#include <GL/gl.h>
-#include <GL/glx.h>
-int main(void) { glBegin(0); glXQueryVersion(0,0,0); return 0; }
-EOF
-if compile_prog "" "-lGL -lX11" ; then
-  have_glx=yes
-else
-  have_glx=no
-fi
+# opengl probe (for sdl2, gtk, milkymist-tmu2)

 if test "$opengl" != "no" ; then
-  opengl_pkgs="gl glesv2 epoxy egl"
-  if $pkg_config $opengl_pkgs x11 && test "$have_glx" = "yes"; then
+  opengl_pkgs="epoxy"
+  if $pkg_config $opengl_pkgs x11; then
    opengl_cflags="$($pkg_config --cflags $opengl_pkgs) $x11_cflags"
    opengl_libs="$($pkg_config --libs $opengl_pkgs) $x11_libs"
    opengl=yes
@@ -3491,6 +3617,22 @@ if compile_prog "" "" ; then
  eventfd=yes
 fi

+# check if memfd is supported
+memfd=no
+cat > $TMPC << EOF
+#include <sys/memfd.h>
+
+int main(void)
+{
+    return memfd_create("foo", MFD_ALLOW_SEALING);
+}
+EOF
+if compile_prog "" "" ; then
+  memfd=yes
+fi
+
+
+
 # check for fallocate
 fallocate=no
 cat > $TMPC << EOF
@@ -4321,6 +4463,7 @@ fi
 # check if ccache is interfering with
 # semantic analysis of macros

+unset CCACHE_CPP2
 ccache_cpp2=no
 cat > $TMPC << EOF
 static const int Z = 1;
@@ -4344,6 +4487,20 @@ if ! compile_object "-Werror"; then
    ccache_cpp2=yes
 fi

+#################################################
+# clang does not support glibc + FORTIFY_SOURCE.
+
+if test "$fortify_source" != "no"; then
+  if echo | $cc -dM -E - | grep __clang__ > /dev/null 2>&1 ; then
+    fortify_source="no";
+  elif test -n "$cxx" &&
+       echo | $cxx -dM -E - | grep __clang__ >/dev/null 2>&1 ; then
+    fortify_source="no";
+  else
+    fortify_source="yes"
+  fi
+fi
+
 ##########################################
 # End of CC checks
 # After here, no more $cc or $ld runs
@@ -4351,8 +4508,10 @@ fi
 if test "$gcov" = "yes" ; then
  CFLAGS="-fprofile-arcs -ftest-coverage -g $CFLAGS"
  LDFLAGS="-fprofile-arcs -ftest-coverage $LDFLAGS"
-elif test "$debug" = "no" ; then
+elif test "$fortify_source" = "yes" ; then
  CFLAGS="-O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 $CFLAGS"
+elif test "$debug" = "no"; then
+  CFLAGS="-O2 $CFLAGS"
 fi

 ##########################################
@@ -4417,6 +4576,7 @@ if test "$want_tools" = "yes" ; then
  tools="qemu-img\$(EXESUF) qemu-io\$(EXESUF) $tools"
  if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" ] ; then
    tools="qemu-nbd\$(EXESUF) $tools"
+    tools="ivshmem-client\$(EXESUF) ivshmem-server\$(EXESUF) $tools"
  fi
 fi
 if test "$softmmu" = yes ; then
@@ -4437,7 +4597,7 @@ fi

 if [ "$guest_agent" != "no" ]; then
  if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" -o "$mingw32" = "yes" ] ; then
-      tools="qemu-ga\$(EXESUF) $tools"
+      tools="qemu-ga $tools"
      guest_agent=yes
  elif [ "$guest_agent" != yes ]; then
      guest_agent=no
@@ -4605,8 +4765,12 @@ echo "GTK support       $gtk"
 echo "GTK GL support    $gtk_gl"
 echo "GNUTLS support    $gnutls"
 echo "GNUTLS hash       $gnutls_hash"
-echo "GNUTLS gcrypt     $gnutls_gcrypt"
-echo "GNUTLS nettle     $gnutls_nettle ${gnutls_nettle+($nettle_version)}"
+echo "libgcrypt         $gcrypt"
+if test "$nettle" = "yes"; then
+    echo "nettle            $nettle ($nettle_version)"
+else
+    echo "nettle            $nettle"
+fi
 echo "libtasn1          $tasn1"
 echo "VTE support       $vte"
 echo "curses support    $curses"
@@ -4885,6 +5049,9 @@ fi
 if test "$eventfd" = "yes" ; then
  echo "CONFIG_EVENTFD=y" >> $config_host_mak
 fi
+if test "$memfd" = "yes" ; then
+  echo "CONFIG_MEMFD=y" >> $config_host_mak
+fi
 if test "$fallocate" = "yes" ; then
  echo "CONFIG_FALLOCATE=y" >> $config_host_mak
 fi
@@ -4972,16 +5139,19 @@ fi
 if test "$gnutls_hash" = "yes" ; then
  echo "CONFIG_GNUTLS_HASH=y" >> $config_host_mak
 fi
-if test "$gnutls_gcrypt" = "yes" ; then
-  echo "CONFIG_GNUTLS_GCRYPT=y" >> $config_host_mak
+if test "$gcrypt" = "yes" ; then
+  echo "CONFIG_GCRYPT=y" >> $config_host_mak
 fi
-if test "$gnutls_nettle" = "yes" ; then
-  echo "CONFIG_GNUTLS_NETTLE=y" >> $config_host_mak
+if test "$nettle" = "yes" ; then
+  echo "CONFIG_NETTLE=y" >> $config_host_mak
  echo "CONFIG_NETTLE_VERSION_MAJOR=${nettle_version%%.*}" >> $config_host_mak
 fi
 if test "$tasn1" = "yes" ; then
  echo "CONFIG_TASN1=y" >> $config_host_mak
 fi
+if test "$have_ifaddrs_h" = "yes" ; then
+    echo "HAVE_IFADDRS_H=y" >> $config_host_mak
+fi
 if test "$vte" = "yes" ; then
  echo "CONFIG_VTE=y" >> $config_host_mak
  echo "VTE_CFLAGS=$vte_cflags" >> $config_host_mak
@@ -5311,8 +5481,8 @@ echo "EXESUF=$EXESUF" >> $config_host_mak
 echo "DSOSUF=$DSOSUF" >> $config_host_mak
 echo "LDFLAGS_SHARED=$LDFLAGS_SHARED" >> $config_host_mak
 echo "LIBS_QGA+=$libs_qga" >> $config_host_mak
-echo "TEST_LIBS=$test_libs" >> $config_host_mak
-echo "TEST_CFLAGS=$test_cflags" >> $config_host_mak
+echo "TASN1_LIBS=$tasn1_libs" >> $config_host_mak
+echo "TASN1_CFLAGS=$tasn1_cflags" >> $config_host_mak
 echo "POD2MAN=$POD2MAN" >> $config_host_mak
 echo "TRANSLATE_OPT_CFLAGS=$TRANSLATE_OPT_CFLAGS" >> $config_host_mak
 if test "$gcov" = "yes" ; then
@@ -5558,6 +5728,7 @@ case "$target_name" in
      echo "CONFIG_KVM=y" >> $config_target_mak
      if test "$vhost_net" = "yes" ; then
        echo "CONFIG_VHOST_NET=y" >> $config_target_mak
+        echo "CONFIG_VHOST_NET_TEST_$target_name=y" >> $config_host_mak
      fi
    fi
 esac
--- a/contrib/ivshmem-client/Makefile.objs
+++ b/contrib/ivshmem-client/Makefile.objs
@@ -0,0 +1 @@
+ivshmem-client-obj-y = ivshmem-client.o main.o
--- a/contrib/ivshmem-client/ivshmem-client.c
+++ b/contrib/ivshmem-client/ivshmem-client.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "qemu-common.h"
+#include "qemu/queue.h"
+
+#include "ivshmem-client.h"
+
+/* log a message on stdout if verbose=1 */
+#define IVSHMEM_CLIENT_DEBUG(client, fmt, ...) do { \
+        if ((client)->verbose) {         \
+            printf(fmt, ## __VA_ARGS__); \
+        }                                \
+    } while (0)
+
+/* read message from the unix socket */
+static int
+ivshmem_client_read_one_msg(IvshmemClient *client, int64_t *index, int *fd)
+{
+    int ret;
+    struct msghdr msg;
+    struct iovec iov[1];
+    union {
+        struct cmsghdr cmsg;
+        char control[CMSG_SPACE(sizeof(int))];
+    } msg_control;
+    struct cmsghdr *cmsg;
+
+    iov[0].iov_base = index;
+    iov[0].iov_len = sizeof(*index);
+
+    memset(&msg, 0, sizeof(msg));
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 1;
+    msg.msg_control = &msg_control;
+    msg.msg_controllen = sizeof(msg_control);
+
+    ret = recvmsg(client->sock_fd, &msg, 0);
+    if (ret < sizeof(*index)) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot read message: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+    if (ret == 0) {
+        IVSHMEM_CLIENT_DEBUG(client, "lost connection to server\n");
+        return -1;
+    }
+
+    *index = GINT64_FROM_LE(*index);
+    *fd = -1;
+
+    for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+
+        if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)) ||
+            cmsg->cmsg_level != SOL_SOCKET ||
+            cmsg->cmsg_type != SCM_RIGHTS) {
+            continue;
+        }
+
+        memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd));
+    }
+
+    return 0;
+}
+
+/* free a peer when the server advertises a disconnection or when the
+ * client is freed */
+static void
+ivshmem_client_free_peer(IvshmemClient *client, IvshmemClientPeer *peer)
+{
+    unsigned vector;
+
+    QTAILQ_REMOVE(&client->peer_list, peer, next);
+    for (vector = 0; vector < peer->vectors_count; vector++) {
+        close(peer->vectors[vector]);
+    }
+
+    g_free(peer);
+}
+
+/* handle message coming from server (new peer, new vectors) */
+static int
+ivshmem_client_handle_server_msg(IvshmemClient *client)
+{
+    IvshmemClientPeer *peer;
+    int64_t peer_id;
+    int ret, fd;
+
+    ret = ivshmem_client_read_one_msg(client, &peer_id, &fd);
+    if (ret < 0) {
+        return -1;
+    }
+
+    /* can return a peer or the local client */
+    peer = ivshmem_client_search_peer(client, peer_id);
+
+    /* delete peer */
+    if (fd == -1) {
+
+        if (peer == NULL || peer == &client->local) {
+            IVSHMEM_CLIENT_DEBUG(client, "receive delete for invalid "
+                                 "peer %" PRId64 "\n", peer_id);
+            return -1;
+        }
+
+        IVSHMEM_CLIENT_DEBUG(client, "delete peer id = %" PRId64 "\n", peer_id);
+        ivshmem_client_free_peer(client, peer);
+        return 0;
+    }
+
+    /* new peer */
+    if (peer == NULL) {
+        peer = g_malloc0(sizeof(*peer));
+        peer->id = peer_id;
+        peer->vectors_count = 0;
+        QTAILQ_INSERT_TAIL(&client->peer_list, peer, next);
+        IVSHMEM_CLIENT_DEBUG(client, "new peer id = %" PRId64 "\n", peer_id);
+    }
+
+    /* new vector */
+    IVSHMEM_CLIENT_DEBUG(client, "  new vector %d (fd=%d) for peer id %"
+                         PRId64 "\n", peer->vectors_count, fd, peer->id);
+    if (peer->vectors_count >= G_N_ELEMENTS(peer->vectors)) {
+        IVSHMEM_CLIENT_DEBUG(client, "Too many vectors received, failing");
+        return -1;
+    }
+
+    peer->vectors[peer->vectors_count] = fd;
+    peer->vectors_count++;
+
+    return 0;
+}
+
+/* init a new ivshmem client */
+int
+ivshmem_client_init(IvshmemClient *client, const char *unix_sock_path,
+                    IvshmemClientNotifCb notif_cb, void *notif_arg,
+                    bool verbose)
+{
+    int ret;
+    unsigned i;
+
+    memset(client, 0, sizeof(*client));
+
+    ret = snprintf(client->unix_sock_path, sizeof(client->unix_sock_path),
+                   "%s", unix_sock_path);
+
+    if (ret < 0 || ret >= sizeof(client->unix_sock_path)) {
+        IVSHMEM_CLIENT_DEBUG(client, "could not copy unix socket path\n");
+        return -1;
+    }
+
+    for (i = 0; i < IVSHMEM_CLIENT_MAX_VECTORS; i++) {
+        client->local.vectors[i] = -1;
+    }
+
+    QTAILQ_INIT(&client->peer_list);
+    client->local.id = -1;
+
+    client->notif_cb = notif_cb;
+    client->notif_arg = notif_arg;
+    client->verbose = verbose;
+    client->shm_fd = -1;
+    client->sock_fd = -1;
+
+    return 0;
+}
+
+/* create and connect to the unix socket */
+int
+ivshmem_client_connect(IvshmemClient *client)
+{
+    struct sockaddr_un sun;
+    int fd, ret;
+    int64_t tmp;
+
+    IVSHMEM_CLIENT_DEBUG(client, "connect to client %s\n",
+                         client->unix_sock_path);
+
+    client->sock_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (client->sock_fd < 0) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot create socket: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+
+    sun.sun_family = AF_UNIX;
+    ret = snprintf(sun.sun_path, sizeof(sun.sun_path), "%s",
+                   client->unix_sock_path);
+    if (ret < 0 || ret >= sizeof(sun.sun_path)) {
+        IVSHMEM_CLIENT_DEBUG(client, "could not copy unix socket path\n");
+        goto err_close;
+    }
+
+    if (connect(client->sock_fd, (struct sockaddr *)&sun, sizeof(sun)) < 0) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot connect to %s: %s\n", sun.sun_path,
+                             strerror(errno));
+        goto err_close;
+    }
+
+    /* first, we expect a protocol version */
+    if (ivshmem_client_read_one_msg(client, &tmp, &fd) < 0 ||
+        (tmp != IVSHMEM_PROTOCOL_VERSION) || fd != -1) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot read from server\n");
+        goto err_close;
+    }
+
+    /* then, we expect our index + a fd == -1 */
+    if (ivshmem_client_read_one_msg(client, &client->local.id, &fd) < 0 ||
+        client->local.id < 0 || fd != -1) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot read from server (2)\n");
+        goto err_close;
+    }
+    IVSHMEM_CLIENT_DEBUG(client, "our_id=%" PRId64 "\n", client->local.id);
+
+    /* now, we expect shared mem fd + a -1 index, note that shm fd
+     * is not used */
+    if (ivshmem_client_read_one_msg(client, &tmp, &fd) < 0 ||
+        tmp != -1 || fd < 0) {
+        if (fd >= 0) {
+            close(fd);
+        }
+        IVSHMEM_CLIENT_DEBUG(client, "cannot read from server (3)\n");
+        goto err_close;
+    }
+    client->shm_fd = fd;
+    IVSHMEM_CLIENT_DEBUG(client, "shm_fd=%d\n", fd);
+
+    return 0;
+
+err_close:
+    close(client->sock_fd);
+    client->sock_fd = -1;
+    return -1;
+}
+
+/* close connection to the server, and free all peer structures */
+void
+ivshmem_client_close(IvshmemClient *client)
+{
+    IvshmemClientPeer *peer;
+    unsigned i;
+
+    IVSHMEM_CLIENT_DEBUG(client, "close client\n");
+
+    while ((peer = QTAILQ_FIRST(&client->peer_list)) != NULL) {
+        ivshmem_client_free_peer(client, peer);
+    }
+
+    close(client->shm_fd);
+    client->shm_fd = -1;
+    close(client->sock_fd);
+    client->sock_fd = -1;
+    client->local.id = -1;
+    for (i = 0; i < IVSHMEM_CLIENT_MAX_VECTORS; i++) {
+        close(client->local.vectors[i]);
+        client->local.vectors[i] = -1;
+    }
+    client->local.vectors_count = 0;
+}
+
+/* get the fd_set according to the unix socket and peer list */
+void
+ivshmem_client_get_fds(const IvshmemClient *client, fd_set *fds, int *maxfd)
+{
+    int fd;
+    unsigned vector;
+
+    FD_SET(client->sock_fd, fds);
+    if (client->sock_fd >= *maxfd) {
+        *maxfd = client->sock_fd + 1;
+    }
+
+    for (vector = 0; vector < client->local.vectors_count; vector++) {
+        fd = client->local.vectors[vector];
+        FD_SET(fd, fds);
+        if (fd >= *maxfd) {
+            *maxfd = fd + 1;
+        }
+    }
+}
+
+/* handle events from eventfd: just print a message on notification */
+static int
+ivshmem_client_handle_event(IvshmemClient *client, const fd_set *cur, int maxfd)
+{
+    IvshmemClientPeer *peer;
+    uint64_t kick;
+    unsigned i;
+    int ret;
+
+    peer = &client->local;
+
+    for (i = 0; i < peer->vectors_count; i++) {
+        if (peer->vectors[i] >= maxfd || !FD_ISSET(peer->vectors[i], cur)) {
+            continue;
+        }
+
+        ret = read(peer->vectors[i], &kick, sizeof(kick));
+        if (ret < 0) {
+            return ret;
+        }
+        if (ret != sizeof(kick)) {
+            IVSHMEM_CLIENT_DEBUG(client, "invalid read size = %d\n", ret);
+            errno = EINVAL;
+            return -1;
+        }
+        IVSHMEM_CLIENT_DEBUG(client, "received event on fd %d vector %d: %"
+                             PRIu64 "\n", peer->vectors[i], i, kick);
+        if (client->notif_cb != NULL) {
+            client->notif_cb(client, peer, i, client->notif_arg);
+        }
+    }
+
+    return 0;
+}
+
+/* read and handle new messages on the given fd_set */
+int
+ivshmem_client_handle_fds(IvshmemClient *client, fd_set *fds, int maxfd)
+{
+    if (client->sock_fd < maxfd && FD_ISSET(client->sock_fd, fds) &&
+        ivshmem_client_handle_server_msg(client) < 0 && errno != EINTR) {
+        IVSHMEM_CLIENT_DEBUG(client, "ivshmem_client_handle_server_msg() "
+                             "failed\n");
+        return -1;
+    } else if (ivshmem_client_handle_event(client, fds, maxfd) < 0 &&
+               errno != EINTR) {
+        IVSHMEM_CLIENT_DEBUG(client, "ivshmem_client_handle_event() failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* send a notification on a vector of a peer */
+int
+ivshmem_client_notify(const IvshmemClient *client,
+                      const IvshmemClientPeer *peer, unsigned vector)
+{
+    uint64_t kick;
+    int fd;
+
+    if (vector >= peer->vectors_count) {
+        IVSHMEM_CLIENT_DEBUG(client, "invalid vector %u on peer %" PRId64 "\n",
+                             vector, peer->id);
+        return -1;
+    }
+    fd = peer->vectors[vector];
+    IVSHMEM_CLIENT_DEBUG(client, "notify peer %" PRId64
+                         " on vector %d, fd %d\n", peer->id, vector, fd);
+
+    kick = 1;
+    if (write(fd, &kick, sizeof(kick)) != sizeof(kick)) {
+        fprintf(stderr, "could not write to %d: %s\n", peer->vectors[vector],
+                strerror(errno));
+        return -1;
+    }
+    return 0;
+}
+
+/* send a notification to all vectors of a peer */
+int
+ivshmem_client_notify_all_vects(const IvshmemClient *client,
+                                const IvshmemClientPeer *peer)
+{
+    unsigned vector;
+    int ret = 0;
+
+    for (vector = 0; vector < peer->vectors_count; vector++) {
+        if (ivshmem_client_notify(client, peer, vector) < 0) {
+            ret = -1;
+        }
+    }
+
+    return ret;
+}
+
+/* send a notification to all peers */
+int
+ivshmem_client_notify_broadcast(const IvshmemClient *client)
+{
+    IvshmemClientPeer *peer;
+    int ret = 0;
+
+    QTAILQ_FOREACH(peer, &client->peer_list, next) {
+        if (ivshmem_client_notify_all_vects(client, peer) < 0) {
+            ret = -1;
+        }
+    }
+
+    return ret;
+}
+
+/* lookup peer from its id */
+IvshmemClientPeer *
+ivshmem_client_search_peer(IvshmemClient *client, int64_t peer_id)
+{
+    IvshmemClientPeer *peer;
+
+    if (peer_id == client->local.id) {
+        return &client->local;
+    }
+
+    QTAILQ_FOREACH(peer, &client->peer_list, next) {
+        if (peer->id == peer_id) {
+            return peer;
+        }
+    }
+    return NULL;
+}
+
+/* dump our info, the list of peers their vectors on stdout */
+void
+ivshmem_client_dump(const IvshmemClient *client)
+{
+    const IvshmemClientPeer *peer;
+    unsigned vector;
+
+    /* dump local infos */
+    peer = &client->local;
+    printf("our_id = %" PRId64 "\n", peer->id);
+    for (vector = 0; vector < peer->vectors_count; vector++) {
+        printf("  vector %d is enabled (fd=%d)\n", vector,
+               peer->vectors[vector]);
+    }
+
+    /* dump peers */
+    QTAILQ_FOREACH(peer, &client->peer_list, next) {
+        printf("peer_id = %" PRId64 "\n", peer->id);
+
+        for (vector = 0; vector < peer->vectors_count; vector++) {
+            printf("  vector %d is enabled (fd=%d)\n", vector,
+                   peer->vectors[vector]);
+        }
+    }
+}
--- a/contrib/ivshmem-client/ivshmem-client.h
+++ b/contrib/ivshmem-client/ivshmem-client.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#ifndef _IVSHMEM_CLIENT_H_
+#define _IVSHMEM_CLIENT_H_
+
+/**
+ * This file provides helper to implement an ivshmem client. It is used
+ * on the host to ask QEMU to send an interrupt to an ivshmem PCI device in a
+ * guest. QEMU also implements an ivshmem client similar to this one, they both
+ * connect to an ivshmem server.
+ *
+ * A standalone ivshmem client based on this file is provided for debug/test
+ * purposes.
+ */
+
+#include <limits.h>
+#include <sys/select.h>
+
+#include "qemu/queue.h"
+#include "hw/misc/ivshmem.h"
+
+/**
+ * Maximum number of notification vectors supported by the client
+ */
+#define IVSHMEM_CLIENT_MAX_VECTORS 64
+
+/**
+ * Structure storing a peer
+ *
+ * Each time a client connects to an ivshmem server, it is advertised to
+ * all connected clients through the unix socket. When our ivshmem
+ * client receives a notification, it creates a IvshmemClientPeer
+ * structure to store the infos of this peer.
+ *
+ * This structure is also used to store the information of our own
+ * client in (IvshmemClient)->local.
+ */
+typedef struct IvshmemClientPeer {
+    QTAILQ_ENTRY(IvshmemClientPeer) next;    /**< next in list*/
+    int64_t id;                              /**< the id of the peer */
+    int vectors[IVSHMEM_CLIENT_MAX_VECTORS]; /**< one fd per vector */
+    unsigned vectors_count;                  /**< number of vectors */
+} IvshmemClientPeer;
+QTAILQ_HEAD(IvshmemClientPeerList, IvshmemClientPeer);
+
+typedef struct IvshmemClientPeerList IvshmemClientPeerList;
+typedef struct IvshmemClient IvshmemClient;
+
+/**
+ * Typedef of callback function used when our IvshmemClient receives a
+ * notification from a peer.
+ */
+typedef void (*IvshmemClientNotifCb)(
+    const IvshmemClient *client,
+    const IvshmemClientPeer *peer,
+    unsigned vect, void *arg);
+
+/**
+ * Structure describing an ivshmem client
+ *
+ * This structure stores all information related to our client: the name
+ * of the server unix socket, the list of peers advertised by the
+ * server, our own client information, and a pointer the notification
+ * callback function used when we receive a notification from a peer.
+ */
+struct IvshmemClient {
+    char unix_sock_path[PATH_MAX];      /**< path to unix sock */
+    int sock_fd;                        /**< unix sock filedesc */
+    int shm_fd;                         /**< shm file descriptor */
+
+    IvshmemClientPeerList peer_list;    /**< list of peers */
+    IvshmemClientPeer local;            /**< our own infos */
+
+    IvshmemClientNotifCb notif_cb;      /**< notification callback */
+    void *notif_arg;                    /**< notification argument */
+
+    bool verbose;                       /**< true to enable debug */
+};
+
+/**
+ * Initialize an ivshmem client
+ *
+ * @client:         A pointer to an uninitialized IvshmemClient structure
+ * @unix_sock_path: The pointer to the unix socket file name
+ * @notif_cb:       If not NULL, the pointer to the function to be called when
+ *                  our IvshmemClient receives a notification from a peer
+ * @notif_arg:      Opaque pointer given as-is to the notification callback
+ *                  function
+ * @verbose:        True to enable debug
+ *
+ * Returns:         0 on success, or a negative value on error
+ */
+int ivshmem_client_init(IvshmemClient *client, const char *unix_sock_path,
+                        IvshmemClientNotifCb notif_cb, void *notif_arg,
+                        bool verbose);
+
+/**
+ * Connect to the server
+ *
+ * Connect to the server unix socket, and read the first initial
+ * messages sent by the server, giving the ID of the client and the file
+ * descriptor of the shared memory.
+ *
+ * @client: The ivshmem client
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_client_connect(IvshmemClient *client);
+
+/**
+ * Close connection to the server and free all peer structures
+ *
+ * @client: The ivshmem client
+ */
+void ivshmem_client_close(IvshmemClient *client);
+
+/**
+ * Fill a fd_set with file descriptors to be monitored
+ *
+ * This function will fill a fd_set with all file descriptors
+ * that must be polled (unix server socket and peers eventfd). The
+ * function will not initialize the fd_set, it is up to the caller
+ * to do this.
+ *
+ * @client: The ivshmem client
+ * @fds:    The fd_set to be updated
+ * @maxfd:  Must be set to the max file descriptor + 1 in fd_set. This value is
+ *          updated if this function adds a greater fd in fd_set.
+ */
+void ivshmem_client_get_fds(const IvshmemClient *client, fd_set *fds,
+                            int *maxfd);
+
+/**
+ * Read and handle new messages
+ *
+ * Given a fd_set filled by select(), handle incoming messages from
+ * server or peers.
+ *
+ * @client: The ivshmem client
+ * @fds:    The fd_set containing the file descriptors to be checked. Note
+ *          that file descriptors that are not related to our client are
+ *          ignored.
+ * @maxfd:  The maximum fd in fd_set, plus one.
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_client_handle_fds(IvshmemClient *client, fd_set *fds, int maxfd);
+
+/**
+ * Send a notification to a vector of a peer
+ *
+ * @client: The ivshmem client
+ * @peer:   The peer to be notified
+ * @vector: The number of the vector
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_client_notify(const IvshmemClient *client,
+                          const IvshmemClientPeer *peer, unsigned vector);
+
+/**
+ * Send a notification to all vectors of a peer
+ *
+ * @client: The ivshmem client
+ * @peer:   The peer to be notified
+ *
+ * Returns: 0 on success, or a negative value on error (at least one
+ *          notification failed)
+ */
+int ivshmem_client_notify_all_vects(const IvshmemClient *client,
+                                    const IvshmemClientPeer *peer);
+
+/**
+ * Broadcat a notification to all vectors of all peers
+ *
+ * @client: The ivshmem client
+ *
+ * Returns: 0 on success, or a negative value on error (at least one
+ *          notification failed)
+ */
+int ivshmem_client_notify_broadcast(const IvshmemClient *client);
+
+/**
+ * Search a peer from its identifier
+ *
+ * Return the peer structure from its peer_id. If the given peer_id is
+ * the local id, the function returns the local peer structure.
+ *
+ * @client:  The ivshmem client
+ * @peer_id: The identifier of the peer structure
+ *
+ * Returns:  The peer structure, or NULL if not found
+ */
+IvshmemClientPeer *
+ivshmem_client_search_peer(IvshmemClient *client, int64_t peer_id);
+
+/**
+ * Dump information of this ivshmem client on stdout
+ *
+ * Dump the id and the vectors of the given ivshmem client and the list
+ * of its peers and their vectors on stdout.
+ *
+ * @client: The ivshmem client
+ */
+void ivshmem_client_dump(const IvshmemClient *client);
+
+#endif /* _IVSHMEM_CLIENT_H_ */
--- a/contrib/ivshmem-client/main.c
+++ b/contrib/ivshmem-client/main.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu-common.h"
+
+#include "ivshmem-client.h"
+
+#define IVSHMEM_CLIENT_DEFAULT_VERBOSE        0
+#define IVSHMEM_CLIENT_DEFAULT_UNIX_SOCK_PATH "/tmp/ivshmem_socket"
+
+typedef struct IvshmemClientArgs {
+    bool verbose;
+    const char *unix_sock_path;
+} IvshmemClientArgs;
+
+/* show ivshmem_client_usage and exit with given error code */
+static void
+ivshmem_client_usage(const char *name, int code)
+{
+    fprintf(stderr, "%s [opts]\n", name);
+    fprintf(stderr, "  -h: show this help\n");
+    fprintf(stderr, "  -v: verbose mode\n");
+    fprintf(stderr, "  -S <unix_sock_path>: path to the unix socket\n"
+                    "     to connect to.\n"
+                    "     default=%s\n", IVSHMEM_CLIENT_DEFAULT_UNIX_SOCK_PATH);
+    exit(code);
+}
+
+/* parse the program arguments, exit on error */
+static void
+ivshmem_client_parse_args(IvshmemClientArgs *args, int argc, char *argv[])
+{
+    int c;
+
+    while ((c = getopt(argc, argv,
+                       "h"  /* help */
+                       "v"  /* verbose */
+                       "S:" /* unix_sock_path */
+                      )) != -1) {
+
+        switch (c) {
+        case 'h': /* help */
+            ivshmem_client_usage(argv[0], 0);
+            break;
+
+        case 'v': /* verbose */
+            args->verbose = 1;
+            break;
+
+        case 'S': /* unix_sock_path */
+            args->unix_sock_path = optarg;
+            break;
+
+        default:
+            ivshmem_client_usage(argv[0], 1);
+            break;
+        }
+    }
+}
+
+/* show command line help */
+static void
+ivshmem_client_cmdline_help(void)
+{
+    printf("dump: dump peers (including us)\n"
+           "int <peer> <vector>: notify one vector on a peer\n"
+           "int <peer> all: notify all vectors of a peer\n"
+           "int all: notify all vectors of all peers (excepting us)\n");
+}
+
+/* read stdin and handle commands */
+static int
+ivshmem_client_handle_stdin_command(IvshmemClient *client)
+{
+    IvshmemClientPeer *peer;
+    char buf[128];
+    char *s, *token;
+    int ret;
+    int peer_id, vector;
+
+    memset(buf, 0, sizeof(buf));
+    ret = read(0, buf, sizeof(buf) - 1);
+    if (ret < 0) {
+        return -1;
+    }
+
+    s = buf;
+    while ((token = strsep(&s, "\n\r;")) != NULL) {
+        if (!strcmp(token, "")) {
+            continue;
+        }
+        if (!strcmp(token, "?")) {
+            ivshmem_client_cmdline_help();
+        }
+        if (!strcmp(token, "help")) {
+            ivshmem_client_cmdline_help();
+        } else if (!strcmp(token, "dump")) {
+            ivshmem_client_dump(client);
+        } else if (!strcmp(token, "int all")) {
+            ivshmem_client_notify_broadcast(client);
+        } else if (sscanf(token, "int %d %d", &peer_id, &vector) == 2) {
+            peer = ivshmem_client_search_peer(client, peer_id);
+            if (peer == NULL) {
+                printf("cannot find peer_id = %d\n", peer_id);
+                continue;
+            }
+            ivshmem_client_notify(client, peer, vector);
+        } else if (sscanf(token, "int %d all", &peer_id) == 1) {
+            peer = ivshmem_client_search_peer(client, peer_id);
+            if (peer == NULL) {
+                printf("cannot find peer_id = %d\n", peer_id);
+                continue;
+            }
+            ivshmem_client_notify_all_vects(client, peer);
+        } else {
+            printf("invalid command, type help\n");
+        }
+    }
+
+    printf("cmd> ");
+    fflush(stdout);
+    return 0;
+}
+
+/* listen on stdin (command line), on unix socket (notifications of new
+ * and dead peers), and on eventfd (IRQ request) */
+static int
+ivshmem_client_poll_events(IvshmemClient *client)
+{
+    fd_set fds;
+    int ret, maxfd;
+
+    while (1) {
+
+        FD_ZERO(&fds);
+        FD_SET(0, &fds); /* add stdin in fd_set */
+        maxfd = 1;
+
+        ivshmem_client_get_fds(client, &fds, &maxfd);
+
+        ret = select(maxfd, &fds, NULL, NULL, NULL);
+        if (ret < 0) {
+            if (errno == EINTR) {
+                continue;
+            }
+
+            fprintf(stderr, "select error: %s\n", strerror(errno));
+            break;
+        }
+        if (ret == 0) {
+            continue;
+        }
+
+        if (FD_ISSET(0, &fds) &&
+            ivshmem_client_handle_stdin_command(client) < 0 && errno != EINTR) {
+            fprintf(stderr, "ivshmem_client_handle_stdin_command() failed\n");
+            break;
+        }
+
+        if (ivshmem_client_handle_fds(client, &fds, maxfd) < 0) {
+            fprintf(stderr, "ivshmem_client_handle_fds() failed\n");
+            break;
+        }
+    }
+
+    return ret;
+}
+
+/* callback when we receive a notification (just display it) */
+static void
+ivshmem_client_notification_cb(const IvshmemClient *client,
+                               const IvshmemClientPeer *peer,
+                               unsigned vect, void *arg)
+{
+    (void)client;
+    (void)arg;
+    printf("receive notification from peer_id=%" PRId64 " vector=%u\n",
+           peer->id, vect);
+}
+
+int
+main(int argc, char *argv[])
+{
+    struct sigaction sa;
+    IvshmemClient client;
+    IvshmemClientArgs args = {
+        .verbose = IVSHMEM_CLIENT_DEFAULT_VERBOSE,
+        .unix_sock_path = IVSHMEM_CLIENT_DEFAULT_UNIX_SOCK_PATH,
+    };
+
+    /* parse arguments, will exit on error */
+    ivshmem_client_parse_args(&args, argc, argv);
+
+    /* Ignore SIGPIPE, see this link for more info:
+     * http://www.mail-archive.com/libevent-users@monkey.org/msg01606.html */
+    sa.sa_handler = SIG_IGN;
+    sa.sa_flags = 0;
+    if (sigemptyset(&sa.sa_mask) == -1 ||
+        sigaction(SIGPIPE, &sa, 0) == -1) {
+        perror("failed to ignore SIGPIPE; sigaction");
+        return 1;
+    }
+
+    ivshmem_client_cmdline_help();
+    printf("cmd> ");
+    fflush(stdout);
+
+    if (ivshmem_client_init(&client, args.unix_sock_path,
+                            ivshmem_client_notification_cb, NULL,
+                            args.verbose) < 0) {
+        fprintf(stderr, "cannot init client\n");
+        return 1;
+    }
+
+    while (1) {
+        if (ivshmem_client_connect(&client) < 0) {
+            fprintf(stderr, "cannot connect to server, retry in 1 second\n");
+            sleep(1);
+            continue;
+        }
+
+        fprintf(stdout, "listen on server socket %d\n", client.sock_fd);
+
+        if (ivshmem_client_poll_events(&client) == 0) {
+            continue;
+        }
+
+        /* disconnected from server, reset all peers */
+        fprintf(stdout, "disconnected from server\n");
+
+        ivshmem_client_close(&client);
+    }
+
+    return 0;
+}
--- a/contrib/ivshmem-server/Makefile.objs
+++ b/contrib/ivshmem-server/Makefile.objs
@@ -0,0 +1 @@
+ivshmem-server-obj-y = ivshmem-server.o main.o
--- a/contrib/ivshmem-server/ivshmem-server.c
+++ b/contrib/ivshmem-server/ivshmem-server.c
@@ -0,0 +1,493 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+#include "qemu-common.h"
+#include "qemu/sockets.h"
+
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#ifdef CONFIG_LINUX
+#include <sys/vfs.h>
+#endif
+
+#include "ivshmem-server.h"
+
+/* log a message on stdout if verbose=1 */
+#define IVSHMEM_SERVER_DEBUG(server, fmt, ...) do { \
+        if ((server)->verbose) {         \
+            printf(fmt, ## __VA_ARGS__); \
+        }                                \
+    } while (0)
+
+/** maximum size of a huge page, used by ivshmem_server_ftruncate() */
+#define IVSHMEM_SERVER_MAX_HUGEPAGE_SIZE (1024 * 1024 * 1024)
+
+/** default listen backlog (number of sockets not accepted) */
+#define IVSHMEM_SERVER_LISTEN_BACKLOG 10
+
+/* send message to a client unix socket */
+static int
+ivshmem_server_send_one_msg(int sock_fd, int64_t peer_id, int fd)
+{
+    int ret;
+    struct msghdr msg;
+    struct iovec iov[1];
+    union {
+        struct cmsghdr cmsg;
+        char control[CMSG_SPACE(sizeof(int))];
+    } msg_control;
+    struct cmsghdr *cmsg;
+
+    peer_id = GINT64_TO_LE(peer_id);
+    iov[0].iov_base = &peer_id;
+    iov[0].iov_len = sizeof(peer_id);
+
+    memset(&msg, 0, sizeof(msg));
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 1;
+
+    /* if fd is specified, add it in a cmsg */
+    if (fd >= 0) {
+        memset(&msg_control, 0, sizeof(msg_control));
+        msg.msg_control = &msg_control;
+        msg.msg_controllen = sizeof(msg_control);
+        cmsg = CMSG_FIRSTHDR(&msg);
+        cmsg->cmsg_level = SOL_SOCKET;
+        cmsg->cmsg_type = SCM_RIGHTS;
+        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+        memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
+    }
+
+    ret = sendmsg(sock_fd, &msg, 0);
+    if (ret <= 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/* free a peer when the server advertises a disconnection or when the
+ * server is freed */
+static void
+ivshmem_server_free_peer(IvshmemServer *server, IvshmemServerPeer *peer)
+{
+    unsigned vector;
+    IvshmemServerPeer *other_peer;
+
+    IVSHMEM_SERVER_DEBUG(server, "free peer %" PRId64 "\n", peer->id);
+    close(peer->sock_fd);
+    QTAILQ_REMOVE(&server->peer_list, peer, next);
+
+    /* advertise the deletion to other peers */
+    QTAILQ_FOREACH(other_peer, &server->peer_list, next) {
+        ivshmem_server_send_one_msg(other_peer->sock_fd, peer->id, -1);
+    }
+
+    for (vector = 0; vector < peer->vectors_count; vector++) {
+        event_notifier_cleanup(&peer->vectors[vector]);
+    }
+
+    g_free(peer);
+}
+
+/* send the peer id and the shm_fd just after a new client connection */
+static int
+ivshmem_server_send_initial_info(IvshmemServer *server, IvshmemServerPeer *peer)
+{
+    int ret;
+
+    /* send our protocol version first */
+    ret = ivshmem_server_send_one_msg(peer->sock_fd, IVSHMEM_PROTOCOL_VERSION,
+                                      -1);
+    if (ret < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot send version: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+
+    /* send the peer id to the client */
+    ret = ivshmem_server_send_one_msg(peer->sock_fd, peer->id, -1);
+    if (ret < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot send peer id: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+
+    /* send the shm_fd */
+    ret = ivshmem_server_send_one_msg(peer->sock_fd, -1, server->shm_fd);
+    if (ret < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot send shm fd: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/* handle message on listening unix socket (new client connection) */
+static int
+ivshmem_server_handle_new_conn(IvshmemServer *server)
+{
+    IvshmemServerPeer *peer, *other_peer;
+    struct sockaddr_un unaddr;
+    socklen_t unaddr_len;
+    int newfd;
+    unsigned i;
+
+    /* accept the incoming connection */
+    unaddr_len = sizeof(unaddr);
+    newfd = qemu_accept(server->sock_fd,
+                        (struct sockaddr *)&unaddr, &unaddr_len);
+
+    if (newfd < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot accept() %s\n", strerror(errno));
+        return -1;
+    }
+
+    qemu_set_nonblock(newfd);
+    IVSHMEM_SERVER_DEBUG(server, "accept()=%d\n", newfd);
+
+    /* allocate new structure for this peer */
+    peer = g_malloc0(sizeof(*peer));
+    peer->sock_fd = newfd;
+
+    /* get an unused peer id */
+    /* XXX: this could use id allocation such as Linux IDA, or simply
+     * a free-list */
+    for (i = 0; i < G_MAXUINT16; i++) {
+        if (ivshmem_server_search_peer(server, server->cur_id) == NULL) {
+            break;
+        }
+        server->cur_id++;
+    }
+    if (i == G_MAXUINT16) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot allocate new client id\n");
+        close(newfd);
+        g_free(peer);
+        return -1;
+    }
+    peer->id = server->cur_id++;
+
+    /* create eventfd, one per vector */
+    peer->vectors_count = server->n_vectors;
+    for (i = 0; i < peer->vectors_count; i++) {
+        if (event_notifier_init(&peer->vectors[i], FALSE) < 0) {
+            IVSHMEM_SERVER_DEBUG(server, "cannot create eventfd\n");
+            goto fail;
+        }
+    }
+
+    /* send peer id and shm fd */
+    if (ivshmem_server_send_initial_info(server, peer) < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot send initial info\n");
+        goto fail;
+    }
+
+    /* advertise the new peer to others */
+    QTAILQ_FOREACH(other_peer, &server->peer_list, next) {
+        for (i = 0; i < peer->vectors_count; i++) {
+            ivshmem_server_send_one_msg(other_peer->sock_fd, peer->id,
+                                        peer->vectors[i].wfd);
+        }
+    }
+
+    /* advertise the other peers to the new one */
+    QTAILQ_FOREACH(other_peer, &server->peer_list, next) {
+        for (i = 0; i < peer->vectors_count; i++) {
+            ivshmem_server_send_one_msg(peer->sock_fd, other_peer->id,
+                                        other_peer->vectors[i].wfd);
+        }
+    }
+
+    /* advertise the new peer to itself */
+    for (i = 0; i < peer->vectors_count; i++) {
+        ivshmem_server_send_one_msg(peer->sock_fd, peer->id,
+                                    event_notifier_get_fd(&peer->vectors[i]));
+    }
+
+    QTAILQ_INSERT_TAIL(&server->peer_list, peer, next);
+    IVSHMEM_SERVER_DEBUG(server, "new peer id = %" PRId64 "\n",
+                         peer->id);
+    return 0;
+
+fail:
+    while (i--) {
+        event_notifier_cleanup(&peer->vectors[i]);
+    }
+    close(newfd);
+    g_free(peer);
+    return -1;
+}
+
+/* Try to ftruncate a file to next power of 2 of shmsize.
+ * If it fails; all power of 2 above shmsize are tested until
+ * we reach the maximum huge page size. This is useful
+ * if the shm file is in a hugetlbfs that cannot be truncated to the
+ * shm_size value. */
+static int
+ivshmem_server_ftruncate(int fd, unsigned shmsize)
+{
+    int ret;
+    struct stat mapstat;
+
+    /* align shmsize to next power of 2 */
+    shmsize = pow2ceil(shmsize);
+
+    if (fstat(fd, &mapstat) != -1 && mapstat.st_size == shmsize) {
+        return 0;
+    }
+
+    while (shmsize <= IVSHMEM_SERVER_MAX_HUGEPAGE_SIZE) {
+        ret = ftruncate(fd, shmsize);
+        if (ret == 0) {
+            return ret;
+        }
+        shmsize *= 2;
+    }
+
+    return -1;
+}
+
+/* Init a new ivshmem server */
+int
+ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
+                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    bool verbose)
+{
+    int ret;
+
+    memset(server, 0, sizeof(*server));
+    server->verbose = verbose;
+
+    ret = snprintf(server->unix_sock_path, sizeof(server->unix_sock_path),
+                   "%s", unix_sock_path);
+    if (ret < 0 || ret >= sizeof(server->unix_sock_path)) {
+        IVSHMEM_SERVER_DEBUG(server, "could not copy unix socket path\n");
+        return -1;
+    }
+    ret = snprintf(server->shm_path, sizeof(server->shm_path),
+                   "%s", shm_path);
+    if (ret < 0 || ret >= sizeof(server->shm_path)) {
+        IVSHMEM_SERVER_DEBUG(server, "could not copy shm path\n");
+        return -1;
+    }
+
+    server->shm_size = shm_size;
+    server->n_vectors = n_vectors;
+
+    QTAILQ_INIT(&server->peer_list);
+
+    return 0;
+}
+
+#ifdef CONFIG_LINUX
+
+#define HUGETLBFS_MAGIC       0x958458f6
+
+static long gethugepagesize(const char *path)
+{
+    struct statfs fs;
+    int ret;
+
+    do {
+        ret = statfs(path, &fs);
+    } while (ret != 0 && errno == EINTR);
+
+    if (ret != 0) {
+        return -1;
+    }
+
+    if (fs.f_type != HUGETLBFS_MAGIC) {
+        return -1;
+    }
+
+    return fs.f_bsize;
+}
+#endif
+
+/* open shm, create and bind to the unix socket */
+int
+ivshmem_server_start(IvshmemServer *server)
+{
+    struct sockaddr_un sun;
+    int shm_fd, sock_fd, ret;
+
+    /* open shm file */
+#ifdef CONFIG_LINUX
+    long hpagesize;
+
+    hpagesize = gethugepagesize(server->shm_path);
+    if (hpagesize < 0 && errno != ENOENT) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot stat shm file %s: %s\n",
+                             server->shm_path, strerror(errno));
+    }
+
+    if (hpagesize > 0) {
+        gchar *filename = g_strdup_printf("%s/ivshmem.XXXXXX", server->shm_path);
+        IVSHMEM_SERVER_DEBUG(server, "Using hugepages: %s\n", server->shm_path);
+        shm_fd = mkstemp(filename);
+        unlink(filename);
+        g_free(filename);
+    } else
+#endif
+    {
+        IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n",
+                             server->shm_path);
+        shm_fd = shm_open(server->shm_path, O_CREAT|O_RDWR, S_IRWXU);
+    }
+
+    if (shm_fd < 0) {
+        fprintf(stderr, "cannot open shm file %s: %s\n", server->shm_path,
+                strerror(errno));
+        return -1;
+    }
+    if (ivshmem_server_ftruncate(shm_fd, server->shm_size) < 0) {
+        fprintf(stderr, "ftruncate(%s) failed: %s\n", server->shm_path,
+                strerror(errno));
+        goto err_close_shm;
+    }
+
+    IVSHMEM_SERVER_DEBUG(server, "create & bind socket %s\n",
+                         server->unix_sock_path);
+
+    /* create the unix listening socket */
+    sock_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (sock_fd < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot create socket: %s\n",
+                             strerror(errno));
+        goto err_close_shm;
+    }
+
+    sun.sun_family = AF_UNIX;
+    ret = snprintf(sun.sun_path, sizeof(sun.sun_path), "%s",
+                   server->unix_sock_path);
+    if (ret < 0 || ret >= sizeof(sun.sun_path)) {
+        IVSHMEM_SERVER_DEBUG(server, "could not copy unix socket path\n");
+        goto err_close_sock;
+    }
+    if (bind(sock_fd, (struct sockaddr *)&sun, sizeof(sun)) < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot connect to %s: %s\n", sun.sun_path,
+                             strerror(errno));
+        goto err_close_sock;
+    }
+
+    if (listen(sock_fd, IVSHMEM_SERVER_LISTEN_BACKLOG) < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "listen() failed: %s\n", strerror(errno));
+        goto err_close_sock;
+    }
+
+    server->sock_fd = sock_fd;
+    server->shm_fd = shm_fd;
+
+    return 0;
+
+err_close_sock:
+    close(sock_fd);
+err_close_shm:
+    close(shm_fd);
+    return -1;
+}
+
+/* close connections to clients, the unix socket and the shm fd */
+void
+ivshmem_server_close(IvshmemServer *server)
+{
+    IvshmemServerPeer *peer, *npeer;
+
+    IVSHMEM_SERVER_DEBUG(server, "close server\n");
+
+    QTAILQ_FOREACH_SAFE(peer, &server->peer_list, next, npeer) {
+        ivshmem_server_free_peer(server, peer);
+    }
+
+    unlink(server->unix_sock_path);
+    close(server->sock_fd);
+    close(server->shm_fd);
+    server->sock_fd = -1;
+    server->shm_fd = -1;
+}
+
+/* get the fd_set according to the unix socket and the peer list */
+void
+ivshmem_server_get_fds(const IvshmemServer *server, fd_set *fds, int *maxfd)
+{
+    IvshmemServerPeer *peer;
+
+    if (server->sock_fd == -1) {
+        return;
+    }
+
+    FD_SET(server->sock_fd, fds);
+    if (server->sock_fd >= *maxfd) {
+        *maxfd = server->sock_fd + 1;
+    }
+
+    QTAILQ_FOREACH(peer, &server->peer_list, next) {
+        FD_SET(peer->sock_fd, fds);
+        if (peer->sock_fd >= *maxfd) {
+            *maxfd = peer->sock_fd + 1;
+        }
+    }
+}
+
+/* process incoming messages on the sockets in fd_set */
+int
+ivshmem_server_handle_fds(IvshmemServer *server, fd_set *fds, int maxfd)
+{
+    IvshmemServerPeer *peer, *peer_next;
+
+    if (server->sock_fd < maxfd && FD_ISSET(server->sock_fd, fds) &&
+        ivshmem_server_handle_new_conn(server) < 0 && errno != EINTR) {
+        IVSHMEM_SERVER_DEBUG(server, "ivshmem_server_handle_new_conn() "
+                             "failed\n");
+        return -1;
+    }
+
+    QTAILQ_FOREACH_SAFE(peer, &server->peer_list, next, peer_next) {
+        /* any message from a peer socket result in a close() */
+        IVSHMEM_SERVER_DEBUG(server, "peer->sock_fd=%d\n", peer->sock_fd);
+        if (peer->sock_fd < maxfd && FD_ISSET(peer->sock_fd, fds)) {
+            ivshmem_server_free_peer(server, peer);
+        }
+    }
+
+    return 0;
+}
+
+/* lookup peer from its id */
+IvshmemServerPeer *
+ivshmem_server_search_peer(IvshmemServer *server, int64_t peer_id)
+{
+    IvshmemServerPeer *peer;
+
+    QTAILQ_FOREACH(peer, &server->peer_list, next) {
+        if (peer->id == peer_id) {
+            return peer;
+        }
+    }
+    return NULL;
+}
+
+/* dump our info, the list of peers their vectors on stdout */
+void
+ivshmem_server_dump(const IvshmemServer *server)
+{
+    const IvshmemServerPeer *peer;
+    unsigned vector;
+
+    /* dump peers */
+    QTAILQ_FOREACH(peer, &server->peer_list, next) {
+        printf("peer_id = %" PRId64 "\n", peer->id);
+
+        for (vector = 0; vector < peer->vectors_count; vector++) {
+            printf("  vector %d is enabled (fd=%d)\n", vector,
+                   event_notifier_get_fd(&peer->vectors[vector]));
+        }
+    }
+}
--- a/contrib/ivshmem-server/ivshmem-server.h
+++ b/contrib/ivshmem-server/ivshmem-server.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#ifndef _IVSHMEM_SERVER_H_
+#define _IVSHMEM_SERVER_H_
+
+/**
+ * The ivshmem server is a daemon that creates a unix socket in listen
+ * mode. The ivshmem clients (qemu or ivshmem-client) connect to this
+ * unix socket. For each client, the server will create some eventfd
+ * (see EVENTFD(2)), one per vector. These fd are transmitted to all
+ * clients using the SCM_RIGHTS cmsg message. Therefore, each client is
+ * able to send a notification to another client without beeing
+ * "profixied" by the server.
+ *
+ * We use this mechanism to send interruptions between guests.
+ * qemu is able to transform an event on a eventfd into a PCI MSI-x
+ * interruption in the guest.
+ *
+ * The ivshmem server is also able to share the file descriptor
+ * associated to the ivshmem shared memory.
+ */
+
+#include <limits.h>
+#include <sys/select.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "qemu/event_notifier.h"
+#include "qemu/queue.h"
+#include "hw/misc/ivshmem.h"
+
+/**
+ * Maximum number of notification vectors supported by the server
+ */
+#define IVSHMEM_SERVER_MAX_VECTORS 64
+
+/**
+ * Structure storing a peer
+ *
+ * Each time a client connects to an ivshmem server, a new
+ * IvshmemServerPeer structure is created. This peer and all its
+ * vectors are advertised to all connected clients through the connected
+ * unix sockets.
+ */
+typedef struct IvshmemServerPeer {
+    QTAILQ_ENTRY(IvshmemServerPeer) next;    /**< next in list*/
+    int sock_fd;                             /**< connected unix sock */
+    int64_t id;                              /**< the id of the peer */
+    EventNotifier vectors[IVSHMEM_SERVER_MAX_VECTORS]; /**< one per vector */
+    unsigned vectors_count;                  /**< number of vectors */
+} IvshmemServerPeer;
+QTAILQ_HEAD(IvshmemServerPeerList, IvshmemServerPeer);
+
+typedef struct IvshmemServerPeerList IvshmemServerPeerList;
+
+/**
+ * Structure describing an ivshmem server
+ *
+ * This structure stores all information related to our server: the name
+ * of the server unix socket and the list of connected peers.
+ */
+typedef struct IvshmemServer {
+    char unix_sock_path[PATH_MAX];   /**< path to unix socket */
+    int sock_fd;                     /**< unix sock file descriptor */
+    char shm_path[PATH_MAX];         /**< path to shm */
+    size_t shm_size;                 /**< size of shm */
+    int shm_fd;                      /**< shm file descriptor */
+    unsigned n_vectors;              /**< number of vectors */
+    uint16_t cur_id;                 /**< id to be given to next client */
+    bool verbose;                    /**< true in verbose mode */
+    IvshmemServerPeerList peer_list; /**< list of peers */
+} IvshmemServer;
+
+/**
+ * Initialize an ivshmem server
+ *
+ * @server:         A pointer to an uninitialized IvshmemServer structure
+ * @unix_sock_path: The pointer to the unix socket file name
+ * @shm_path:       Path to the shared memory. The path corresponds to a POSIX
+ *                  shm name or a hugetlbfs mount point.
+ * @shm_size:       Size of shared memory
+ * @n_vectors:      Number of interrupt vectors per client
+ * @verbose:        True to enable verbose mode
+ *
+ * Returns:         0 on success, or a negative value on error
+ */
+int
+ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
+                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    bool verbose);
+
+/**
+ * Open the shm, then create and bind to the unix socket
+ *
+ * @server: The pointer to the initialized IvshmemServer structure
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_server_start(IvshmemServer *server);
+
+/**
+ * Close the server
+ *
+ * Close connections to all clients, close the unix socket and the
+ * shared memory file descriptor. The structure remains initialized, so
+ * it is possible to call ivshmem_server_start() again after a call to
+ * ivshmem_server_close().
+ *
+ * @server: The ivshmem server
+ */
+void ivshmem_server_close(IvshmemServer *server);
+
+/**
+ * Fill a fd_set with file descriptors to be monitored
+ *
+ * This function will fill a fd_set with all file descriptors that must
+ * be polled (unix server socket and peers unix socket). The function
+ * will not initialize the fd_set, it is up to the caller to do it.
+ *
+ * @server: The ivshmem server
+ * @fds:    The fd_set to be updated
+ * @maxfd:  Must be set to the max file descriptor + 1 in fd_set. This value is
+ *          updated if this function adds a greater fd in fd_set.
+ */
+void
+ivshmem_server_get_fds(const IvshmemServer *server, fd_set *fds, int *maxfd);
+
+/**
+ * Read and handle new messages
+ *
+ * Given a fd_set (for instance filled by a call to select()), handle
+ * incoming messages from peers.
+ *
+ * @server: The ivshmem server
+ * @fds:    The fd_set containing the file descriptors to be checked. Note that
+ *          file descriptors that are not related to our server are ignored.
+ * @maxfd:  The maximum fd in fd_set, plus one.
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_server_handle_fds(IvshmemServer *server, fd_set *fds, int maxfd);
+
+/**
+ * Search a peer from its identifier
+ *
+ * @server:  The ivshmem server
+ * @peer_id: The identifier of the peer structure
+ *
+ * Returns:  The peer structure, or NULL if not found
+ */
+IvshmemServerPeer *
+ivshmem_server_search_peer(IvshmemServer *server, int64_t peer_id);
+
+/**
+ * Dump information of this ivshmem server and its peers on stdout
+ *
+ * @server: The ivshmem server
+ */
+void ivshmem_server_dump(const IvshmemServer *server);
+
+#endif /* _IVSHMEM_SERVER_H_ */
--- a/contrib/ivshmem-server/main.c
+++ b/contrib/ivshmem-server/main.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu-common.h"
+
+#include "ivshmem-server.h"
+
+#define IVSHMEM_SERVER_DEFAULT_VERBOSE        0
+#define IVSHMEM_SERVER_DEFAULT_FOREGROUND     0
+#define IVSHMEM_SERVER_DEFAULT_PID_FILE       "/var/run/ivshmem-server.pid"
+#define IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH "/tmp/ivshmem_socket"
+#define IVSHMEM_SERVER_DEFAULT_SHM_PATH       "ivshmem"
+#define IVSHMEM_SERVER_DEFAULT_SHM_SIZE       (4*1024*1024)
+#define IVSHMEM_SERVER_DEFAULT_N_VECTORS      1
+
+/* used to quit on signal SIGTERM */
+static int ivshmem_server_quit;
+
+/* arguments given by the user */
+typedef struct IvshmemServerArgs {
+    bool verbose;
+    bool foreground;
+    const char *pid_file;
+    const char *unix_socket_path;
+    const char *shm_path;
+    uint64_t shm_size;
+    unsigned n_vectors;
+} IvshmemServerArgs;
+
+/* show ivshmem_server_usage and exit with given error code */
+static void
+ivshmem_server_usage(const char *name, int code)
+{
+    fprintf(stderr, "%s [opts]\n", name);
+    fprintf(stderr, "  -h: show this help\n");
+    fprintf(stderr, "  -v: verbose mode\n");
+    fprintf(stderr, "  -F: foreground mode (default is to daemonize)\n");
+    fprintf(stderr, "  -p <pid_file>: path to the PID file (used in daemon\n"
+                    "     mode only).\n"
+                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
+    fprintf(stderr, "  -S <unix_socket_path>: path to the unix socket\n"
+                    "     to listen to.\n"
+                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH);
+    fprintf(stderr, "  -m <shm_path>: path to the shared memory.\n"
+                    "     The path corresponds to a POSIX shm name or a\n"
+                    "     hugetlbfs mount point.\n"
+                    "     default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
+    fprintf(stderr, "  -l <size>: size of shared memory in bytes. The suffix\n"
+                    "     K, M and G can be used (ex: 1K means 1024).\n"
+                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_SHM_SIZE);
+    fprintf(stderr, "  -n <n_vects>: number of vectors.\n"
+                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_N_VECTORS);
+
+    exit(code);
+}
+
+/* parse the program arguments, exit on error */
+static void
+ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
+{
+    int c;
+    unsigned long long v;
+    Error *errp = NULL;
+
+    while ((c = getopt(argc, argv,
+                       "h"  /* help */
+                       "v"  /* verbose */
+                       "F"  /* foreground */
+                       "p:" /* pid_file */
+                       "S:" /* unix_socket_path */
+                       "m:" /* shm_path */
+                       "l:" /* shm_size */
+                       "n:" /* n_vectors */
+                      )) != -1) {
+
+        switch (c) {
+        case 'h': /* help */
+            ivshmem_server_usage(argv[0], 0);
+            break;
+
+        case 'v': /* verbose */
+            args->verbose = 1;
+            break;
+
+        case 'F': /* foreground */
+            args->foreground = 1;
+            break;
+
+        case 'p': /* pid_file */
+            args->pid_file = optarg;
+            break;
+
+        case 'S': /* unix_socket_path */
+            args->unix_socket_path = optarg;
+            break;
+
+        case 'm': /* shm_path */
+            args->shm_path = optarg;
+            break;
+
+        case 'l': /* shm_size */
+            parse_option_size("shm_size", optarg, &args->shm_size, &errp);
+            if (errp) {
+                fprintf(stderr, "cannot parse shm size: %s\n",
+                        error_get_pretty(errp));
+                error_free(errp);
+                ivshmem_server_usage(argv[0], 1);
+            }
+            break;
+
+        case 'n': /* n_vectors */
+            if (parse_uint_full(optarg, &v, 0) < 0) {
+                fprintf(stderr, "cannot parse n_vectors\n");
+                ivshmem_server_usage(argv[0], 1);
+            }
+            args->n_vectors = v;
+            break;
+
+        default:
+            ivshmem_server_usage(argv[0], 1);
+            break;
+        }
+    }
+
+    if (args->n_vectors > IVSHMEM_SERVER_MAX_VECTORS) {
+        fprintf(stderr, "too many requested vectors (max is %d)\n",
+                IVSHMEM_SERVER_MAX_VECTORS);
+        ivshmem_server_usage(argv[0], 1);
+    }
+
+    if (args->verbose == 1 && args->foreground == 0) {
+        fprintf(stderr, "cannot use verbose in daemon mode\n");
+        ivshmem_server_usage(argv[0], 1);
+    }
+}
+
+/* wait for events on listening server unix socket and connected client
+ * sockets */
+static int
+ivshmem_server_poll_events(IvshmemServer *server)
+{
+    fd_set fds;
+    int ret = 0, maxfd;
+
+    while (!ivshmem_server_quit) {
+
+        FD_ZERO(&fds);
+        maxfd = 0;
+        ivshmem_server_get_fds(server, &fds, &maxfd);
+
+        ret = select(maxfd, &fds, NULL, NULL, NULL);
+
+        if (ret < 0) {
+            if (errno == EINTR) {
+                continue;
+            }
+
+            fprintf(stderr, "select error: %s\n", strerror(errno));
+            break;
+        }
+        if (ret == 0) {
+            continue;
+        }
+
+        if (ivshmem_server_handle_fds(server, &fds, maxfd) < 0) {
+            fprintf(stderr, "ivshmem_server_handle_fds() failed\n");
+            break;
+        }
+    }
+
+    return ret;
+}
+
+static void
+ivshmem_server_quit_cb(int signum)
+{
+    ivshmem_server_quit = 1;
+}
+
+int
+main(int argc, char *argv[])
+{
+    IvshmemServer server;
+    struct sigaction sa, sa_quit;
+    IvshmemServerArgs args = {
+        .verbose = IVSHMEM_SERVER_DEFAULT_VERBOSE,
+        .foreground = IVSHMEM_SERVER_DEFAULT_FOREGROUND,
+        .pid_file = IVSHMEM_SERVER_DEFAULT_PID_FILE,
+        .unix_socket_path = IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH,
+        .shm_path = IVSHMEM_SERVER_DEFAULT_SHM_PATH,
+        .shm_size = IVSHMEM_SERVER_DEFAULT_SHM_SIZE,
+        .n_vectors = IVSHMEM_SERVER_DEFAULT_N_VECTORS,
+    };
+    int ret = 1;
+
+    /* parse arguments, will exit on error */
+    ivshmem_server_parse_args(&args, argc, argv);
+
+    /* Ignore SIGPIPE, see this link for more info:
+     * http://www.mail-archive.com/libevent-users@monkey.org/msg01606.html */
+    sa.sa_handler = SIG_IGN;
+    sa.sa_flags = 0;
+    if (sigemptyset(&sa.sa_mask) == -1 ||
+        sigaction(SIGPIPE, &sa, 0) == -1) {
+        perror("failed to ignore SIGPIPE; sigaction");
+        goto err;
+    }
+
+    sa_quit.sa_handler = ivshmem_server_quit_cb;
+    sa_quit.sa_flags = 0;
+    if (sigemptyset(&sa_quit.sa_mask) == -1 ||
+        sigaction(SIGTERM, &sa_quit, 0) == -1) {
+        perror("failed to add SIGTERM handler; sigaction");
+        goto err;
+    }
+
+    /* init the ivshms structure */
+    if (ivshmem_server_init(&server, args.unix_socket_path, args.shm_path,
+                            args.shm_size, args.n_vectors, args.verbose) < 0) {
+        fprintf(stderr, "cannot init server\n");
+        goto err;
+    }
+
+    /* start the ivshmem server (open shm & unix socket) */
+    if (ivshmem_server_start(&server) < 0) {
+        fprintf(stderr, "cannot bind\n");
+        goto err;
+    }
+
+    /* daemonize if asked to */
+    if (!args.foreground) {
+        FILE *fp;
+
+        if (qemu_daemon(1, 1) < 0) {
+            fprintf(stderr, "cannot daemonize: %s\n", strerror(errno));
+            goto err_close;
+        }
+
+        /* write pid file */
+        fp = fopen(args.pid_file, "w");
+        if (fp == NULL) {
+            fprintf(stderr, "cannot write pid file: %s\n", strerror(errno));
+            goto err_close;
+        }
+
+        fprintf(fp, "%d\n", (int) getpid());
+        fclose(fp);
+    }
+
+    ivshmem_server_poll_events(&server);
+    fprintf(stdout, "server disconnected\n");
+    ret = 0;
+
+err_close:
+    ivshmem_server_close(&server);
+err:
+    return ret;
+}
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -30,6 +30,7 @@
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
 #include "hw/i386/apic.h"
 #endif
+#include "sysemu/replay.h"

 /* -icount align implementation. */

@@ -184,7 +185,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, uint8_t *tb_ptr)
 /* Execute the code without caching the generated code. An interpreter
   could be used if available. */
 static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
-                             TranslationBlock *orig_tb)
+                             TranslationBlock *orig_tb, bool ignore_icount)
 {
    TranslationBlock *tb;

@@ -194,7 +195,8 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
        max_cycles = CF_COUNT_MASK;

    tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
-                     max_cycles | CF_NOCACHE);
+                     max_cycles | CF_NOCACHE
+                         | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
    tb->orig_tb = tcg_ctx.tb_ctx.tb_invalidated_flag ? NULL : orig_tb;
    cpu->current_tb = tb;
    /* execute the generated code */
@@ -345,21 +347,25 @@ int cpu_exec(CPUState *cpu)
    uintptr_t next_tb;
    SyncClocks sc;

+    /* replay_interrupt may need current_cpu */
+    current_cpu = cpu;
+
    if (cpu->halted) {
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
-        if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
+        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
+            && replay_interrupt()) {
            apic_poll_irq(x86_cpu->apic_state);
            cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
        }
 #endif
        if (!cpu_has_work(cpu)) {
+            current_cpu = NULL;
            return EXCP_HALTED;
        }

        cpu->halted = 0;
    }

-    current_cpu = cpu;
    atomic_mb_set(&tcg_current_cpu, cpu);
    rcu_read_lock();

@@ -401,10 +407,22 @@ int cpu_exec(CPUState *cpu)
                    cpu->exception_index = -1;
                    break;
 #else
-                    cc->do_interrupt(cpu);
-                    cpu->exception_index = -1;
+                    if (replay_exception()) {
+                        cc->do_interrupt(cpu);
+                        cpu->exception_index = -1;
+                    } else if (!replay_has_interrupt()) {
+                        /* give a chance to iothread in replay mode */
+                        ret = EXCP_INTERRUPT;
+                        break;
+                    }
 #endif
                }
+            } else if (replay_has_exception()
+                       && cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
+                /* try to cause an exception pending in the log */
+                cpu_exec_nocache(cpu, 1, tb_find_fast(cpu), true);
+                ret = -1;
+                break;
            }

            next_tb = 0; /* force lookup of first TB */
@@ -420,30 +438,40 @@ int cpu_exec(CPUState *cpu)
                        cpu->exception_index = EXCP_DEBUG;
                        cpu_loop_exit(cpu);
                    }
-                    if (interrupt_request & CPU_INTERRUPT_HALT) {
+                    if (replay_mode == REPLAY_MODE_PLAY
+                        && !replay_has_interrupt()) {
+                        /* Do nothing */
+                    } else if (interrupt_request & CPU_INTERRUPT_HALT) {
+                        replay_interrupt();
                        cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
                        cpu->halted = 1;
                        cpu->exception_index = EXCP_HLT;
                        cpu_loop_exit(cpu);
                    }
 #if defined(TARGET_I386)
-                    if (interrupt_request & CPU_INTERRUPT_INIT) {
+                    else if (interrupt_request & CPU_INTERRUPT_INIT) {
+                        replay_interrupt();
                        cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
                        do_cpu_init(x86_cpu);
                        cpu->exception_index = EXCP_HALTED;
                        cpu_loop_exit(cpu);
                    }
 #else
-                    if (interrupt_request & CPU_INTERRUPT_RESET) {
+                    else if (interrupt_request & CPU_INTERRUPT_RESET) {
+                        replay_interrupt();
                        cpu_reset(cpu);
+                        cpu_loop_exit(cpu);
                    }
 #endif
                    /* The target hook has 3 exit conditions:
                       False when the interrupt isn't processed,
                       True when it is, and we should restart on a new TB,
                       and via longjmp via cpu_loop_exit.  */
-                    if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
-                        next_tb = 0;
+                    else {
+                        replay_interrupt();
+                        if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
+                            next_tb = 0;
+                        }
                    }
                    /* Don't use the cached interrupt_request value,
                       do_interrupt may have updated the EXITTB flag. */
@@ -454,7 +482,8 @@ int cpu_exec(CPUState *cpu)
                        next_tb = 0;
                    }
                }
-                if (unlikely(cpu->exit_request)) {
+                if (unlikely(cpu->exit_request
+                             || replay_has_interrupt())) {
                    cpu->exit_request = 0;
                    cpu->exception_index = EXCP_INTERRUPT;
                    cpu_loop_exit(cpu);
@@ -477,7 +506,8 @@ int cpu_exec(CPUState *cpu)
                /* see if we can patch the calling TB. When the TB
                   spans two pages, we cannot safely do a direct
                   jump. */
-                if (next_tb != 0 && tb->page_addr[1] == -1) {
+                if (next_tb != 0 && tb->page_addr[1] == -1
+                    && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
                    tb_add_jump((TranslationBlock *)(next_tb & ~TB_EXIT_MASK),
                                next_tb & TB_EXIT_MASK, tb);
                }
@@ -518,7 +548,7 @@ int cpu_exec(CPUState *cpu)
                            if (insns_left > 0) {
                                /* Execute remaining instructions.  */
                                tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
-                                cpu_exec_nocache(cpu, insns_left, tb);
+                                cpu_exec_nocache(cpu, insns_left, tb, false);
                                align_clocks(&sc, cpu);
                            }
                            cpu->exception_index = EXCP_INTERRUPT;
@@ -538,15 +568,27 @@ int cpu_exec(CPUState *cpu)
                   only be set by a memory fault) */
            } /* for(;;) */
        } else {
-            /* Reload env after longjmp - the compiler may have smashed all
-             * local variables as longjmp is marked 'noreturn'. */
+#if defined(__clang__) || !QEMU_GNUC_PREREQ(4, 6)
+            /* Some compilers wrongly smash all local variables after
+             * siglongjmp. There were bug reports for gcc 4.5.0 and clang.
+             * Reload essential local variables here for those compilers.
+             * Newer versions of gcc would complain about this code (-Wclobbered). */
            cpu = current_cpu;
            cc = CPU_GET_CLASS(cpu);
-            cpu->can_do_io = 1;
 #ifdef TARGET_I386
            x86_cpu = X86_CPU(cpu);
            env = &x86_cpu->env;
 #endif
+#else /* buggy compiler */
+            /* Assert that the compiler does not smash local variables. */
+            g_assert(cpu == current_cpu);
+            g_assert(cc == CPU_GET_CLASS(cpu));
+#ifdef TARGET_I386
+            g_assert(x86_cpu == X86_CPU(cpu));
+            g_assert(env == &x86_cpu->env);
+#endif
+#endif /* buggy compiler */
+            cpu->can_do_io = 1;
            tb_lock_reset();
        }
    } /* for(;;) */
--- a/cpus.c
+++ b/cpus.c
@@ -42,6 +42,7 @@
 #include "qemu/seqlock.h"
 #include "qapi-event.h"
 #include "hw/nmi.h"
+#include "sysemu/replay.h"

 #ifndef _WIN32
 #include "qemu/compatfd.h"
@@ -334,7 +335,7 @@ static int64_t qemu_icount_round(int64_t count)
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 }

-static void icount_warp_rt(void *opaque)
+static void icount_warp_rt(void)
 {
    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
@@ -345,7 +346,8 @@ static void icount_warp_rt(void *opaque)

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
-        int64_t clock = cpu_get_clock_locked();
+        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
+                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
@@ -368,6 +370,11 @@ static void icount_warp_rt(void *opaque)
    }
 }

+static void icount_dummy_timer(void *opaque)
+{
+    (void)opaque;
+}
+
 void qtest_clock_warp(int64_t dest)
 {
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
@@ -403,6 +410,18 @@ void qemu_clock_warp(QEMUClockType type)
        return;
    }

+    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
+     * do not fire, so computing the deadline does not make sense.
+     */
+    if (!runstate_is_running()) {
+        return;
+    }
+
+    /* warp clock deterministically in record/replay mode */
+    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP)) {
+        return;
+    }
+
    if (icount_sleep) {
        /*
         * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
@@ -412,7 +431,7 @@ void qemu_clock_warp(QEMUClockType type)
         * the CPU starts running, in case the CPU is woken by an event other
         * than the earliest QEMU_CLOCK_VIRTUAL timer.
         */
-        icount_warp_rt(NULL);
+        icount_warp_rt();
        timer_del(icount_warp_timer);
    }
    if (!all_cpu_threads_idle()) {
@@ -605,7 +624,7 @@ void configure_icount(QemuOpts *opts, Error **errp)
    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
-                                         icount_warp_rt, NULL);
+                                         icount_dummy_timer, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);
@@ -694,15 +713,6 @@ void cpu_synchronize_all_post_init(void)
    }
 }

-void cpu_clean_all_dirty(void)
-{
-    CPUState *cpu;
-
-    CPU_FOREACH(cpu) {
-        cpu_clean_state(cpu);
-    }
-}
-
 static int do_vm_stop(RunState state)
 {
    int ret = 0;
@@ -1405,12 +1415,36 @@ int vm_stop_force_state(RunState state)
        return vm_stop(state);
    } else {
        runstate_set(state);
+
+        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
 }

+static int64_t tcg_get_icount_limit(void)
+{
+    int64_t deadline;
+
+    if (replay_mode != REPLAY_MODE_PLAY) {
+        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+
+        /* Maintain prior (possibly buggy) behaviour where if no deadline
+         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
+         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
+         * nanoseconds.
+         */
+        if ((deadline < 0) || (deadline > INT32_MAX)) {
+            deadline = INT32_MAX;
+        }
+
+        return qemu_icount_round(deadline);
+    } else {
+        return replay_get_instructions();
+    }
+}
+
 static int tcg_cpu_exec(CPUState *cpu)
 {
    int ret;
@@ -1423,24 +1457,12 @@ static int tcg_cpu_exec(CPUState *cpu)
 #endif
    if (use_icount) {
        int64_t count;
-        int64_t deadline;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
-        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
-
-        /* Maintain prior (possibly buggy) behaviour where if no deadline
-         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
-         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
-         * nanoseconds.
-         */
-        if ((deadline < 0) || (deadline > INT32_MAX)) {
-            deadline = INT32_MAX;
-        }
-
-        count = qemu_icount_round(deadline);
+        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
@@ -1458,6 +1480,7 @@ static int tcg_cpu_exec(CPUState *cpu)
                        + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
+        replay_account_executed_instructions();
    }
    return ret;
 }
@@ -1535,22 +1558,29 @@ CpuInfoList *qmp_query_cpus(Error **errp)
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
 #if defined(TARGET_I386)
-        info->value->has_pc = true;
-        info->value->pc = env->eip + env->segs[R_CS].base;
+        info->value->arch = CPU_INFO_ARCH_X86;
+        info->value->u.x86 = g_new0(CpuInfoX86, 1);
+        info->value->u.x86->pc = env->eip + env->segs[R_CS].base;
 #elif defined(TARGET_PPC)
-        info->value->has_nip = true;
-        info->value->nip = env->nip;
+        info->value->arch = CPU_INFO_ARCH_PPC;
+        info->value->u.ppc = g_new0(CpuInfoPPC, 1);
+        info->value->u.ppc->nip = env->nip;
 #elif defined(TARGET_SPARC)
-        info->value->has_pc = true;
-        info->value->pc = env->pc;
-        info->value->has_npc = true;
-        info->value->npc = env->npc;
+        info->value->arch = CPU_INFO_ARCH_SPARC;
+        info->value->u.sparc = g_new0(CpuInfoSPARC, 1);
+        info->value->u.sparc->pc = env->pc;
+        info->value->u.sparc->npc = env->npc;
 #elif defined(TARGET_MIPS)
-        info->value->has_PC = true;
-        info->value->PC = env->active_tc.PC;
+        info->value->arch = CPU_INFO_ARCH_MIPS;
+        info->value->u.mips = g_new0(CpuInfoMIPS, 1);
+        info->value->u.mips->PC = env->active_tc.PC;
 #elif defined(TARGET_TRICORE)
-        info->value->has_PC = true;
-        info->value->PC = env->PC;
+        info->value->arch = CPU_INFO_ARCH_TRICORE;
+        info->value->u.tricore = g_new0(CpuInfoTricore, 1);
+        info->value->u.tricore->PC = env->PC;
+#else
+        info->value->arch = CPU_INFO_ARCH_OTHER;
+        info->value->u.other = g_new0(CpuInfoOther, 1);
 #endif

        /* XXX: waiting for the qapi to support GSList */
--- a/crypto/Makefile.objs
+++ b/crypto/Makefile.objs
@@ -7,6 +7,7 @@ crypto-obj-y += tlscreds.o
 crypto-obj-y += tlscredsanon.o
 crypto-obj-y += tlscredsx509.o
 crypto-obj-y += tlssession.o
+crypto-obj-y += secret.o

 # Let the userspace emulators avoid linking gnutls/etc
 crypto-aes-obj-y = aes.o
--- a/crypto/cipher-builtin.c
+++ b/crypto/cipher-builtin.c
@@ -25,8 +25,7 @@ typedef struct QCryptoCipherBuiltinAES QCryptoCipherBuiltinAES;
 struct QCryptoCipherBuiltinAES {
    AES_KEY encrypt_key;
    AES_KEY decrypt_key;
-    uint8_t *iv;
-    size_t niv;
+    uint8_t iv[AES_BLOCK_SIZE];
 };
 typedef struct QCryptoCipherBuiltinDESRFB QCryptoCipherBuiltinDESRFB;
 struct QCryptoCipherBuiltinDESRFB {
@@ -40,6 +39,7 @@ struct QCryptoCipherBuiltin {
        QCryptoCipherBuiltinAES aes;
        QCryptoCipherBuiltinDESRFB desrfb;
    } state;
+    size_t blocksize;
    void (*free)(QCryptoCipher *cipher);
    int (*setiv)(QCryptoCipher *cipher,
                 const uint8_t *iv, size_t niv,
@@ -61,7 +61,6 @@ static void qcrypto_cipher_free_aes(QCryptoCipher *cipher)
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

-    g_free(ctxt->state.aes.iv);
    g_free(ctxt);
    cipher->opaque = NULL;
 }
@@ -145,15 +144,13 @@ static int qcrypto_cipher_setiv_aes(QCryptoCipher *cipher,
                                     Error **errp)
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;
-    if (niv != 16) {
-        error_setg(errp, "IV must be 16 bytes not %zu", niv);
+    if (niv != AES_BLOCK_SIZE) {
+        error_setg(errp, "IV must be %d bytes not %zu",
+                   AES_BLOCK_SIZE, niv);
        return -1;
    }

-    g_free(ctxt->state.aes.iv);
-    ctxt->state.aes.iv = g_new0(uint8_t, niv);
-    memcpy(ctxt->state.aes.iv, iv, niv);
-    ctxt->state.aes.niv = niv;
+    memcpy(ctxt->state.aes.iv, iv, AES_BLOCK_SIZE);

    return 0;
 }
@@ -185,6 +182,7 @@ static int qcrypto_cipher_init_aes(QCryptoCipher *cipher,
        goto error;
    }

+    ctxt->blocksize = AES_BLOCK_SIZE;
    ctxt->free = qcrypto_cipher_free_aes;
    ctxt->setiv = qcrypto_cipher_setiv_aes;
    ctxt->encrypt = qcrypto_cipher_encrypt_aes;
@@ -286,6 +284,7 @@ static int qcrypto_cipher_init_des_rfb(QCryptoCipher *cipher,
    memcpy(ctxt->state.desrfb.key, key, nkey);
    ctxt->state.desrfb.nkey = nkey;

+    ctxt->blocksize = 8;
    ctxt->free = qcrypto_cipher_free_des_rfb;
    ctxt->setiv = qcrypto_cipher_setiv_des_rfb;
    ctxt->encrypt = qcrypto_cipher_encrypt_des_rfb;
@@ -374,6 +373,12 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

+    if (len % ctxt->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctxt->blocksize);
+        return -1;
+    }
+
    return ctxt->encrypt(cipher, in, out, len, errp);
 }

@@ -386,6 +391,12 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

+    if (len % ctxt->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctxt->blocksize);
+        return -1;
+    }
+
    return ctxt->decrypt(cipher, in, out, len, errp);
 }

--- a/crypto/cipher-gcrypt.c
+++ b/crypto/cipher-gcrypt.c
@@ -34,6 +34,11 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
    }
 }

+typedef struct QCryptoCipherGcrypt QCryptoCipherGcrypt;
+struct QCryptoCipherGcrypt {
+    gcry_cipher_hd_t handle;
+    size_t blocksize;
+};

 QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
                                  QCryptoCipherMode mode,
@@ -41,7 +46,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
                                  Error **errp)
 {
    QCryptoCipher *cipher;
-    gcry_cipher_hd_t handle;
+    QCryptoCipherGcrypt *ctx;
    gcry_error_t err;
    int gcryalg, gcrymode;

@@ -87,7 +92,9 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    cipher->alg = alg;
    cipher->mode = mode;

-    err = gcry_cipher_open(&handle, gcryalg, gcrymode, 0);
+    ctx = g_new0(QCryptoCipherGcrypt, 1);
+
+    err = gcry_cipher_open(&ctx->handle, gcryalg, gcrymode, 0);
    if (err != 0) {
        error_setg(errp, "Cannot initialize cipher: %s",
                   gcry_strerror(err));
@@ -100,10 +107,12 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
         * bizarre RFB variant of DES :-)
         */
        uint8_t *rfbkey = qcrypto_cipher_munge_des_rfb_key(key, nkey);
-        err = gcry_cipher_setkey(handle, rfbkey, nkey);
+        err = gcry_cipher_setkey(ctx->handle, rfbkey, nkey);
        g_free(rfbkey);
+        ctx->blocksize = 8;
    } else {
-        err = gcry_cipher_setkey(handle, key, nkey);
+        err = gcry_cipher_setkey(ctx->handle, key, nkey);
+        ctx->blocksize = 16;
    }
    if (err != 0) {
        error_setg(errp, "Cannot set key: %s",
@@ -111,11 +120,12 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        goto error;
    }

-    cipher->opaque = handle;
+    cipher->opaque = ctx;
    return cipher;

 error:
-    gcry_cipher_close(handle);
+    gcry_cipher_close(ctx->handle);
+    g_free(ctx);
    g_free(cipher);
    return NULL;
 }
@@ -123,12 +133,13 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,

 void qcrypto_cipher_free(QCryptoCipher *cipher)
 {
-    gcry_cipher_hd_t handle;
+    QCryptoCipherGcrypt *ctx;
    if (!cipher) {
        return;
    }
-    handle = cipher->opaque;
-    gcry_cipher_close(handle);
+    ctx = cipher->opaque;
+    gcry_cipher_close(ctx->handle);
+    g_free(ctx);
    g_free(cipher);
 }

@@ -139,10 +150,16 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
                           size_t len,
                           Error **errp)
 {
-    gcry_cipher_hd_t handle = cipher->opaque;
+    QCryptoCipherGcrypt *ctx = cipher->opaque;
    gcry_error_t err;

-    err = gcry_cipher_encrypt(handle,
+    if (len % ctx->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctx->blocksize);
+        return -1;
+    }
+
+    err = gcry_cipher_encrypt(ctx->handle,
                              out, len,
                              in, len);
    if (err != 0) {
@@ -161,10 +178,16 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,
                           size_t len,
                           Error **errp)
 {
-    gcry_cipher_hd_t handle = cipher->opaque;
+    QCryptoCipherGcrypt *ctx = cipher->opaque;
    gcry_error_t err;

-    err = gcry_cipher_decrypt(handle,
+    if (len % ctx->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctx->blocksize);
+        return -1;
+    }
+
+    err = gcry_cipher_decrypt(ctx->handle,
                              out, len,
                              in, len);
    if (err != 0) {
@@ -180,11 +203,17 @@ int qcrypto_cipher_setiv(QCryptoCipher *cipher,
                         const uint8_t *iv, size_t niv,
                         Error **errp)
 {
-    gcry_cipher_hd_t handle = cipher->opaque;
+    QCryptoCipherGcrypt *ctx = cipher->opaque;
    gcry_error_t err;

-    gcry_cipher_reset(handle);
-    err = gcry_cipher_setiv(handle, iv, niv);
+    if (niv != ctx->blocksize) {
+        error_setg(errp, "Expected IV size %zu not %zu",
+                   ctx->blocksize, niv);
+        return -1;
+    }
+
+    gcry_cipher_reset(ctx->handle);
+    err = gcry_cipher_setiv(ctx->handle, iv, niv);
    if (err != 0) {
        error_setg(errp, "Cannot set IV: %s",
                   gcry_strerror(err));
--- a/crypto/cipher-nettle.c
+++ b/crypto/cipher-nettle.c
@@ -69,7 +69,7 @@ struct QCryptoCipherNettle {
    nettle_cipher_func *alg_encrypt;
    nettle_cipher_func *alg_decrypt;
    uint8_t *iv;
-    size_t niv;
+    size_t blocksize;
 };

 bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
@@ -125,7 +125,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        ctx->alg_encrypt = des_encrypt_wrapper;
        ctx->alg_decrypt = des_decrypt_wrapper;

-        ctx->niv = DES_BLOCK_SIZE;
+        ctx->blocksize = DES_BLOCK_SIZE;
        break;

    case QCRYPTO_CIPHER_ALG_AES_128:
@@ -140,14 +140,14 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        ctx->alg_encrypt = aes_encrypt_wrapper;
        ctx->alg_decrypt = aes_decrypt_wrapper;

-        ctx->niv = AES_BLOCK_SIZE;
+        ctx->blocksize = AES_BLOCK_SIZE;
        break;
    default:
        error_setg(errp, "Unsupported cipher algorithm %d", alg);
        goto error;
    }

-    ctx->iv = g_new0(uint8_t, ctx->niv);
+    ctx->iv = g_new0(uint8_t, ctx->blocksize);
    cipher->opaque = ctx;

    return cipher;
@@ -184,6 +184,12 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
 {
    QCryptoCipherNettle *ctx = cipher->opaque;

+    if (len % ctx->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctx->blocksize);
+        return -1;
+    }
+
    switch (cipher->mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
        ctx->alg_encrypt(ctx->ctx_encrypt, len, out, in);
@@ -191,7 +197,7 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,

    case QCRYPTO_CIPHER_MODE_CBC:
        cbc_encrypt(ctx->ctx_encrypt, ctx->alg_encrypt,
-                    ctx->niv, ctx->iv,
+                    ctx->blocksize, ctx->iv,
                    len, out, in);
        break;
    default:
@@ -211,6 +217,12 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,
 {
    QCryptoCipherNettle *ctx = cipher->opaque;

+    if (len % ctx->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctx->blocksize);
+        return -1;
+    }
+
    switch (cipher->mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
        ctx->alg_decrypt(ctx->ctx_decrypt ? ctx->ctx_decrypt : ctx->ctx_encrypt,
@@ -219,7 +231,7 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,

    case QCRYPTO_CIPHER_MODE_CBC:
        cbc_decrypt(ctx->ctx_decrypt ? ctx->ctx_decrypt : ctx->ctx_encrypt,
-                    ctx->alg_decrypt, ctx->niv, ctx->iv,
+                    ctx->alg_decrypt, ctx->blocksize, ctx->iv,
                    len, out, in);
        break;
    default:
@@ -235,9 +247,9 @@ int qcrypto_cipher_setiv(QCryptoCipher *cipher,
                         Error **errp)
 {
    QCryptoCipherNettle *ctx = cipher->opaque;
-    if (niv != ctx->niv) {
+    if (niv != ctx->blocksize) {
        error_setg(errp, "Expected IV size %zu not %zu",
-                   ctx->niv, niv);
+                   ctx->blocksize, niv);
        return -1;
    }
    memcpy(ctx->iv, iv, niv);
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -21,19 +21,67 @@
 #include "crypto/cipher.h"


-static size_t alg_key_len[QCRYPTO_CIPHER_ALG_LAST] = {
+static size_t alg_key_len[QCRYPTO_CIPHER_ALG__MAX] = {
    [QCRYPTO_CIPHER_ALG_AES_128] = 16,
    [QCRYPTO_CIPHER_ALG_AES_192] = 24,
    [QCRYPTO_CIPHER_ALG_AES_256] = 32,
    [QCRYPTO_CIPHER_ALG_DES_RFB] = 8,
 };

+static size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = {
+    [QCRYPTO_CIPHER_ALG_AES_128] = 16,
+    [QCRYPTO_CIPHER_ALG_AES_192] = 16,
+    [QCRYPTO_CIPHER_ALG_AES_256] = 16,
+    [QCRYPTO_CIPHER_ALG_DES_RFB] = 8,
+};
+
+static bool mode_need_iv[QCRYPTO_CIPHER_MODE__MAX] = {
+    [QCRYPTO_CIPHER_MODE_ECB] = false,
+    [QCRYPTO_CIPHER_MODE_CBC] = true,
+};
+
+
+size_t qcrypto_cipher_get_block_len(QCryptoCipherAlgorithm alg)
+{
+    if (alg >= G_N_ELEMENTS(alg_key_len)) {
+        return 0;
+    }
+    return alg_block_len[alg];
+}
+
+
+size_t qcrypto_cipher_get_key_len(QCryptoCipherAlgorithm alg)
+{
+    if (alg >= G_N_ELEMENTS(alg_key_len)) {
+        return 0;
+    }
+    return alg_key_len[alg];
+}
+
+
+size_t qcrypto_cipher_get_iv_len(QCryptoCipherAlgorithm alg,
+                                 QCryptoCipherMode mode)
+{
+    if (alg >= G_N_ELEMENTS(alg_block_len)) {
+        return 0;
+    }
+    if (mode >= G_N_ELEMENTS(mode_need_iv)) {
+        return 0;
+    }
+
+    if (mode_need_iv[mode]) {
+        return alg_block_len[alg];
+    }
+    return 0;
+}
+
+
 static bool
 qcrypto_cipher_validate_key_length(QCryptoCipherAlgorithm alg,
                                   size_t nkey,
                                   Error **errp)
 {
-    if ((unsigned)alg >= QCRYPTO_CIPHER_ALG_LAST) {
+    if ((unsigned)alg >= QCRYPTO_CIPHER_ALG__MAX) {
        error_setg(errp, "Cipher algorithm %d out of range",
                   alg);
        return false;
@@ -41,13 +89,13 @@ qcrypto_cipher_validate_key_length(QCryptoCipherAlgorithm alg,

    if (alg_key_len[alg] != nkey) {
        error_setg(errp, "Cipher key length %zu should be %zu",
-                   alg_key_len[alg], nkey);
+                   nkey, alg_key_len[alg]);
        return false;
    }
    return true;
 }

-#if defined(CONFIG_GNUTLS_GCRYPT) || defined(CONFIG_GNUTLS_NETTLE)
+#if defined(CONFIG_GCRYPT) || defined(CONFIG_NETTLE)
 static uint8_t *
 qcrypto_cipher_munge_des_rfb_key(const uint8_t *key,
                                 size_t nkey)
@@ -63,11 +111,11 @@ qcrypto_cipher_munge_des_rfb_key(const uint8_t *key,
    }
    return ret;
 }
-#endif /* CONFIG_GNUTLS_GCRYPT || CONFIG_GNUTLS_NETTLE */
+#endif /* CONFIG_GCRYPT || CONFIG_NETTLE */

-#ifdef CONFIG_GNUTLS_GCRYPT
+#ifdef CONFIG_GCRYPT
 #include "crypto/cipher-gcrypt.c"
-#elif defined CONFIG_GNUTLS_NETTLE
+#elif defined CONFIG_NETTLE
 #include "crypto/cipher-nettle.c"
 #else
 #include "crypto/cipher-builtin.c"
--- a/crypto/hash.c
+++ b/crypto/hash.c
@@ -24,12 +24,18 @@
 #include <gnutls/gnutls.h>
 #include <gnutls/crypto.h>

-static int qcrypto_hash_alg_map[QCRYPTO_HASH_ALG_LAST] = {
+static int qcrypto_hash_alg_map[QCRYPTO_HASH_ALG__MAX] = {
    [QCRYPTO_HASH_ALG_MD5] = GNUTLS_DIG_MD5,
    [QCRYPTO_HASH_ALG_SHA1] = GNUTLS_DIG_SHA1,
    [QCRYPTO_HASH_ALG_SHA256] = GNUTLS_DIG_SHA256,
 };

+static size_t qcrypto_hash_alg_size[QCRYPTO_HASH_ALG__MAX] = {
+    [QCRYPTO_HASH_ALG_MD5] = 16,
+    [QCRYPTO_HASH_ALG_SHA1] = 20,
+    [QCRYPTO_HASH_ALG_SHA256] = 32,
+};
+
 gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg)
 {
    if (alg < G_N_ELEMENTS(qcrypto_hash_alg_map)) {
@@ -38,6 +44,15 @@ gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg)
    return false;
 }

+size_t qcrypto_hash_digest_len(QCryptoHashAlgorithm alg)
+{
+    if (alg >= G_N_ELEMENTS(qcrypto_hash_alg_size)) {
+        return 0;
+    }
+    return qcrypto_hash_alg_size[alg];
+}
+
+
 int qcrypto_hash_bytesv(QCryptoHashAlgorithm alg,
                        const struct iovec *iov,
                        size_t niov,
--- a/crypto/init.c
+++ b/crypto/init.c
@@ -24,8 +24,9 @@
 #ifdef CONFIG_GNUTLS
 #include <gnutls/gnutls.h>
 #include <gnutls/crypto.h>
+#endif

-#ifdef CONFIG_GNUTLS_GCRYPT
+#ifdef CONFIG_GCRYPT
 #include <gcrypt.h>
 #endif

@@ -37,6 +38,7 @@
 *  - When GNUTLS >= 2.12, we must not initialize gcrypt threading
 *    because GNUTLS will do that itself
 *  - When GNUTLS < 2.12 we must always initialize gcrypt threading
+ *  - When GNUTLS is disabled we must always initialize gcrypt threading
 *
 * But....
 *
@@ -47,12 +49,15 @@
 *
 *   - gcrypt < 1.6.0
 * AND
- *   - gnutls < 2.12
+ *      - gnutls < 2.12
+ *   OR
+ *      - gnutls is disabled
 *
 */

-#if (defined(CONFIG_GNUTLS_GCRYPT) &&           \
-     (!defined(GNUTLS_VERSION_NUMBER) ||        \
+#if (defined(CONFIG_GCRYPT) &&                  \
+     (!defined(CONFIG_GNUTLS) ||                \
+      !defined(GNUTLS_VERSION_NUMBER) ||       \
      (GNUTLS_VERSION_NUMBER < 0x020c00)) &&    \
     (!defined(GCRYPT_VERSION_NUMBER) ||        \
      (GCRYPT_VERSION_NUMBER < 0x010600)))
@@ -113,6 +118,7 @@ static struct gcry_thread_cbs qcrypto_gcrypt_thread_impl = {

 int qcrypto_init(Error **errp)
 {
+#ifdef CONFIG_GNUTLS
    int ret;
    ret = gnutls_global_init();
    if (ret < 0) {
@@ -125,8 +131,9 @@ int qcrypto_init(Error **errp)
    gnutls_global_set_log_level(10);
    gnutls_global_set_log_function(qcrypto_gnutls_log);
 #endif
+#endif

-#ifdef CONFIG_GNUTLS_GCRYPT
+#ifdef CONFIG_GCRYPT
    if (!gcry_check_version(GCRYPT_VERSION)) {
        error_setg(errp, "Unable to initialize gcrypt");
        return -1;
@@ -139,12 +146,3 @@ int qcrypto_init(Error **errp)

    return 0;
 }
-
-#else /* ! CONFIG_GNUTLS */
-
-int qcrypto_init(Error **errp G_GNUC_UNUSED)
-{
-    return 0;
-}
-
-#endif /* ! CONFIG_GNUTLS */
--- a/crypto/secret.c
+++ b/crypto/secret.c
@@ -0,0 +1,513 @@
+/*
+ * QEMU crypto secret support
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "crypto/secret.h"
+#include "crypto/cipher.h"
+#include "qom/object_interfaces.h"
+#include "qemu/base64.h"
+#include "trace.h"
+
+
+static void
+qcrypto_secret_load_data(QCryptoSecret *secret,
+                         uint8_t **output,
+                         size_t *outputlen,
+                         Error **errp)
+{
+    char *data = NULL;
+    size_t length = 0;
+    GError *gerr = NULL;
+
+    *output = NULL;
+    *outputlen = 0;
+
+    if (secret->file) {
+        if (secret->data) {
+            error_setg(errp,
+                       "'file' and 'data' are mutually exclusive");
+            return;
+        }
+        if (!g_file_get_contents(secret->file, &data, &length, &gerr)) {
+            error_setg(errp,
+                       "Unable to read %s: %s",
+                       secret->file, gerr->message);
+            g_error_free(gerr);
+            return;
+        }
+        *output = (uint8_t *)data;
+        *outputlen = length;
+    } else if (secret->data) {
+        *outputlen = strlen(secret->data);
+        *output = (uint8_t *)g_strdup(secret->data);
+    } else {
+        error_setg(errp, "Either 'file' or 'data' must be provided");
+    }
+}
+
+
+static void qcrypto_secret_decrypt(QCryptoSecret *secret,
+                                   const uint8_t *input,
+                                   size_t inputlen,
+                                   uint8_t **output,
+                                   size_t *outputlen,
+                                   Error **errp)
+{
+    uint8_t *key = NULL, *ciphertext = NULL, *iv = NULL;
+    size_t keylen, ciphertextlen, ivlen;
+    QCryptoCipher *aes = NULL;
+    uint8_t *plaintext = NULL;
+
+    *output = NULL;
+    *outputlen = 0;
+
+    if (qcrypto_secret_lookup(secret->keyid,
+                              &key, &keylen,
+                              errp) < 0) {
+        goto cleanup;
+    }
+
+    if (keylen != 32) {
+        error_setg(errp, "Key should be 32 bytes in length");
+        goto cleanup;
+    }
+
+    if (!secret->iv) {
+        error_setg(errp, "IV is required to decrypt secret");
+        goto cleanup;
+    }
+
+    iv = qbase64_decode(secret->iv, -1, &ivlen, errp);
+    if (!iv) {
+        goto cleanup;
+    }
+    if (ivlen != 16) {
+        error_setg(errp, "IV should be 16 bytes in length not %zu",
+                   ivlen);
+        goto cleanup;
+    }
+
+    aes = qcrypto_cipher_new(QCRYPTO_CIPHER_ALG_AES_256,
+                             QCRYPTO_CIPHER_MODE_CBC,
+                             key, keylen,
+                             errp);
+    if (!aes) {
+        goto cleanup;
+    }
+
+    if (qcrypto_cipher_setiv(aes, iv, ivlen, errp) < 0) {
+        goto cleanup;
+    }
+
+    if (secret->format == QCRYPTO_SECRET_FORMAT_BASE64) {
+        ciphertext = qbase64_decode((const gchar*)input,
+                                    inputlen,
+                                    &ciphertextlen,
+                                    errp);
+        if (!ciphertext) {
+            goto cleanup;
+        }
+        plaintext = g_new0(uint8_t, ciphertextlen + 1);
+    } else {
+        ciphertextlen = inputlen;
+        plaintext = g_new0(uint8_t, inputlen + 1);
+    }
+    if (qcrypto_cipher_decrypt(aes,
+                               ciphertext ? ciphertext : input,
+                               plaintext,
+                               ciphertextlen,
+                               errp) < 0) {
+        plaintext = NULL;
+        goto cleanup;
+    }
+
+    if (plaintext[ciphertextlen - 1] > 16 ||
+        plaintext[ciphertextlen - 1] > ciphertextlen) {
+        error_setg(errp, "Incorrect number of padding bytes (%d) "
+                   "found on decrypted data",
+                   (int)plaintext[ciphertextlen - 1]);
+        g_free(plaintext);
+        plaintext = NULL;
+        goto cleanup;
+    }
+
+    /* Even though plaintext may contain arbitrary NUL
+     * ensure it is explicitly NUL terminated.
+     */
+    ciphertextlen -= plaintext[ciphertextlen - 1];
+    plaintext[ciphertextlen] = '\0';
+
+    *output = plaintext;
+    *outputlen = ciphertextlen;
+
+ cleanup:
+    g_free(ciphertext);
+    g_free(iv);
+    g_free(key);
+    qcrypto_cipher_free(aes);
+}
+
+
+static void qcrypto_secret_decode(const uint8_t *input,
+                                  size_t inputlen,
+                                  uint8_t **output,
+                                  size_t *outputlen,
+                                  Error **errp)
+{
+    *output = qbase64_decode((const gchar*)input,
+                             inputlen,
+                             outputlen,
+                             errp);
+}
+
+
+static void
+qcrypto_secret_prop_set_loaded(Object *obj,
+                               bool value,
+                               Error **errp)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+
+    if (value) {
+        Error *local_err = NULL;
+        uint8_t *input = NULL;
+        size_t inputlen = 0;
+        uint8_t *output = NULL;
+        size_t outputlen = 0;
+
+        qcrypto_secret_load_data(secret, &input, &inputlen, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+
+        if (secret->keyid) {
+            qcrypto_secret_decrypt(secret, input, inputlen,
+                                   &output, &outputlen, &local_err);
+            g_free(input);
+            if (local_err) {
+                error_propagate(errp, local_err);
+                return;
+            }
+            input = output;
+            inputlen = outputlen;
+        } else {
+            if (secret->format != QCRYPTO_SECRET_FORMAT_RAW) {
+                qcrypto_secret_decode(input, inputlen,
+                                      &output, &outputlen, &local_err);
+                g_free(input);
+                if (local_err) {
+                    error_propagate(errp, local_err);
+                    return;
+                }
+                input = output;
+                inputlen = outputlen;
+            }
+        }
+
+        secret->rawdata = input;
+        secret->rawlen = inputlen;
+    } else {
+        g_free(secret->rawdata);
+        secret->rawlen = 0;
+    }
+}
+
+
+static bool
+qcrypto_secret_prop_get_loaded(Object *obj,
+                               Error **errp G_GNUC_UNUSED)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+    return secret->data != NULL;
+}
+
+
+static void
+qcrypto_secret_prop_set_format(Object *obj,
+                               int value,
+                               Error **errp G_GNUC_UNUSED)
+{
+    QCryptoSecret *creds = QCRYPTO_SECRET(obj);
+
+    creds->format = value;
+}
+
+
+static int
+qcrypto_secret_prop_get_format(Object *obj,
+                               Error **errp G_GNUC_UNUSED)
+{
+    QCryptoSecret *creds = QCRYPTO_SECRET(obj);
+
+    return creds->format;
+}
+
+
+static void
+qcrypto_secret_prop_set_data(Object *obj,
+                             const char *value,
+                             Error **errp)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+
+    g_free(secret->data);
+    secret->data = g_strdup(value);
+}
+
+
+static char *
+qcrypto_secret_prop_get_data(Object *obj,
+                             Error **errp)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+    return g_strdup(secret->data);
+}
+
+
+static void
+qcrypto_secret_prop_set_file(Object *obj,
+                             const char *value,
+                             Error **errp)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+
+    g_free(secret->file);
+    secret->file = g_strdup(value);
+}
+
+
+static char *
+qcrypto_secret_prop_get_file(Object *obj,
+                             Error **errp)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+    return g_strdup(secret->file);
+}
+
+
+static void
+qcrypto_secret_prop_set_iv(Object *obj,
+                           const char *value,
+                           Error **errp)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+
+    g_free(secret->iv);
+    secret->iv = g_strdup(value);
+}
+
+
+static char *
+qcrypto_secret_prop_get_iv(Object *obj,
+                           Error **errp)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+    return g_strdup(secret->iv);
+}
+
+
+static void
+qcrypto_secret_prop_set_keyid(Object *obj,
+                              const char *value,
+                              Error **errp)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+
+    g_free(secret->keyid);
+    secret->keyid = g_strdup(value);
+}
+
+
+static char *
+qcrypto_secret_prop_get_keyid(Object *obj,
+                              Error **errp)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+    return g_strdup(secret->keyid);
+}
+
+
+static void
+qcrypto_secret_complete(UserCreatable *uc, Error **errp)
+{
+    object_property_set_bool(OBJECT(uc), true, "loaded", errp);
+}
+
+
+static void
+qcrypto_secret_init(Object *obj)
+{
+    object_property_add_bool(obj, "loaded",
+                             qcrypto_secret_prop_get_loaded,
+                             qcrypto_secret_prop_set_loaded,
+                             NULL);
+    object_property_add_enum(obj, "format",
+                             "QCryptoSecretFormat",
+                             QCryptoSecretFormat_lookup,
+                             qcrypto_secret_prop_get_format,
+                             qcrypto_secret_prop_set_format,
+                             NULL);
+    object_property_add_str(obj, "data",
+                            qcrypto_secret_prop_get_data,
+                            qcrypto_secret_prop_set_data,
+                            NULL);
+    object_property_add_str(obj, "file",
+                            qcrypto_secret_prop_get_file,
+                            qcrypto_secret_prop_set_file,
+                            NULL);
+    object_property_add_str(obj, "keyid",
+                            qcrypto_secret_prop_get_keyid,
+                            qcrypto_secret_prop_set_keyid,
+                            NULL);
+    object_property_add_str(obj, "iv",
+                            qcrypto_secret_prop_get_iv,
+                            qcrypto_secret_prop_set_iv,
+                            NULL);
+}
+
+
+static void
+qcrypto_secret_finalize(Object *obj)
+{
+    QCryptoSecret *secret = QCRYPTO_SECRET(obj);
+
+    g_free(secret->iv);
+    g_free(secret->file);
+    g_free(secret->keyid);
+    g_free(secret->rawdata);
+    g_free(secret->data);
+}
+
+static void
+qcrypto_secret_class_init(ObjectClass *oc, void *data)
+{
+    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
+
+    ucc->complete = qcrypto_secret_complete;
+}
+
+
+int qcrypto_secret_lookup(const char *secretid,
+                          uint8_t **data,
+                          size_t *datalen,
+                          Error **errp)
+{
+    Object *obj;
+    QCryptoSecret *secret;
+
+    obj = object_resolve_path_component(
+        object_get_objects_root(), secretid);
+    if (!obj) {
+        error_setg(errp, "No secret with id '%s'", secretid);
+        return -1;
+    }
+
+    secret = (QCryptoSecret *)
+        object_dynamic_cast(obj,
+                            TYPE_QCRYPTO_SECRET);
+    if (!secret) {
+        error_setg(errp, "Object with id '%s' is not a secret",
+                   secretid);
+        return -1;
+    }
+
+    if (!secret->rawdata) {
+        error_setg(errp, "Secret with id '%s' has no data",
+                   secretid);
+        return -1;
+    }
+
+    *data = g_new0(uint8, secret->rawlen + 1);
+    memcpy(*data, secret->rawdata, secret->rawlen);
+    (*data)[secret->rawlen] = '\0';
+    *datalen = secret->rawlen;
+
+    return 0;
+}
+
+
+char *qcrypto_secret_lookup_as_utf8(const char *secretid,
+                                    Error **errp)
+{
+    uint8_t *data;
+    size_t datalen;
+
+    if (qcrypto_secret_lookup(secretid,
+                              &data,
+                              &datalen,
+                              errp) < 0) {
+        return NULL;
+    }
+
+    if (!g_utf8_validate((const gchar*)data, datalen, NULL)) {
+        error_setg(errp,
+                   "Data from secret %s is not valid UTF-8",
+                   secretid);
+        g_free(data);
+        return NULL;
+    }
+
+    return (char *)data;
+}
+
+
+char *qcrypto_secret_lookup_as_base64(const char *secretid,
+                                      Error **errp)
+{
+    uint8_t *data;
+    size_t datalen;
+    char *ret;
+
+    if (qcrypto_secret_lookup(secretid,
+                              &data,
+                              &datalen,
+                              errp) < 0) {
+        return NULL;
+    }
+
+    ret = g_base64_encode(data, datalen);
+    g_free(data);
+    return ret;
+}
+
+
+static const TypeInfo qcrypto_secret_info = {
+    .parent = TYPE_OBJECT,
+    .name = TYPE_QCRYPTO_SECRET,
+    .instance_size = sizeof(QCryptoSecret),
+    .instance_init = qcrypto_secret_init,
+    .instance_finalize = qcrypto_secret_finalize,
+    .class_size = sizeof(QCryptoSecretClass),
+    .class_init = qcrypto_secret_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_USER_CREATABLE },
+        { }
+    }
+};
+
+
+static void
+qcrypto_secret_register_types(void)
+{
+    type_register_static(&qcrypto_secret_info);
+}
+
+
+type_init(qcrypto_secret_register_types);
--- a/crypto/tlscreds.c
+++ b/crypto/tlscreds.c
@@ -123,10 +123,10 @@ qcrypto_tls_creds_get_path(QCryptoTLSCreds *creds,
        goto cleanup;
    }

-    trace_qcrypto_tls_creds_get_path(creds, filename,
-                                     *cred ? *cred : "<none>");
    ret = 0;
 cleanup:
+    trace_qcrypto_tls_creds_get_path(creds, filename,
+                                     *cred ? *cred : "<none>");
    return ret;
 }

--- a/crypto/tlscredsx509.c
+++ b/crypto/tlscredsx509.c
@@ -20,6 +20,7 @@

 #include "crypto/tlscredsx509.h"
 #include "crypto/tlscredspriv.h"
+#include "crypto/secret.h"
 #include "qom/object_interfaces.h"
 #include "trace.h"

@@ -255,6 +256,7 @@ qcrypto_tls_creds_check_cert_key_purpose(QCryptoTLSCredsX509 *creds,
        }

        g_free(buffer);
+        buffer = NULL;
    }

    if (isServer) {
@@ -485,7 +487,8 @@ qcrypto_tls_creds_x509_sanity_check(QCryptoTLSCredsX509 *creds,
    int ret = -1;

    memset(cacerts, 0, sizeof(cacerts));
-    if (access(certFile, R_OK) == 0) {
+    if (certFile &&
+        access(certFile, R_OK) == 0) {
        cert = qcrypto_tls_creds_load_cert(creds,
                                           certFile, isServer,
                                           errp);
@@ -605,9 +608,30 @@ qcrypto_tls_creds_x509_load(QCryptoTLSCredsX509 *creds,
    }

    if (cert != NULL && key != NULL) {
+#if GNUTLS_VERSION_NUMBER >= 0x030111
+        char *password = NULL;
+        if (creds->passwordid) {
+            password = qcrypto_secret_lookup_as_utf8(creds->passwordid,
+                                                     errp);
+            if (!password) {
+                goto cleanup;
+            }
+        }
+        ret = gnutls_certificate_set_x509_key_file2(creds->data,
+                                                    cert, key,
+                                                    GNUTLS_X509_FMT_PEM,
+                                                    password,
+                                                    0);
+        g_free(password);
+#else /* GNUTLS_VERSION_NUMBER < 0x030111 */
+        if (creds->passwordid) {
+            error_setg(errp, "PKCS8 decryption requires GNUTLS >= 3.1.11");
+            goto cleanup;
+        }
        ret = gnutls_certificate_set_x509_key_file(creds->data,
                                                   cert, key,
                                                   GNUTLS_X509_FMT_PEM);
+#endif /* GNUTLS_VERSION_NUMBER < 0x030111 */
        if (ret < 0) {
            error_setg(errp, "Cannot load certificate '%s' & key '%s': %s",
                       cert, key, gnutls_strerror(ret));
@@ -654,6 +678,10 @@ qcrypto_tls_creds_x509_unload(QCryptoTLSCredsX509 *creds)
        gnutls_certificate_free_credentials(creds->data);
        creds->data = NULL;
    }
+    if (creds->parent_obj.dh_params) {
+        gnutls_dh_params_deinit(creds->parent_obj.dh_params);
+        creds->parent_obj.dh_params = NULL;
+    }
 }


@@ -731,6 +759,27 @@ qcrypto_tls_creds_x509_prop_set_sanity(Object *obj,
 }


+static void
+qcrypto_tls_creds_x509_prop_set_passwordid(Object *obj,
+                                           const char *value,
+                                           Error **errp G_GNUC_UNUSED)
+{
+    QCryptoTLSCredsX509 *creds = QCRYPTO_TLS_CREDS_X509(obj);
+
+    creds->passwordid = g_strdup(value);
+}
+
+
+static char *
+qcrypto_tls_creds_x509_prop_get_passwordid(Object *obj,
+                                           Error **errp G_GNUC_UNUSED)
+{
+    QCryptoTLSCredsX509 *creds = QCRYPTO_TLS_CREDS_X509(obj);
+
+    return g_strdup(creds->passwordid);
+}
+
+
 static bool
 qcrypto_tls_creds_x509_prop_get_sanity(Object *obj,
                                       Error **errp G_GNUC_UNUSED)
@@ -763,6 +812,10 @@ qcrypto_tls_creds_x509_init(Object *obj)
                             qcrypto_tls_creds_x509_prop_get_sanity,
                             qcrypto_tls_creds_x509_prop_set_sanity,
                             NULL);
+    object_property_add_str(obj, "passwordid",
+                            qcrypto_tls_creds_x509_prop_get_passwordid,
+                            qcrypto_tls_creds_x509_prop_set_passwordid,
+                            NULL);
 }


@@ -771,6 +824,7 @@ qcrypto_tls_creds_x509_finalize(Object *obj)
 {
    QCryptoTLSCredsX509 *creds = QCRYPTO_TLS_CREDS_X509(obj);

+    g_free(creds->passwordid);
    qcrypto_tls_creds_x509_unload(creds);
 }

--- a/crypto/tlssession.c
+++ b/crypto/tlssession.c
@@ -304,9 +304,9 @@ qcrypto_tls_session_check_certificate(QCryptoTLSSession *session,

                allow = qemu_acl_party_is_allowed(acl, session->peername);

-                error_setg(errp, "TLS x509 ACL check for %s is %s",
-                           session->peername, allow ? "allowed" : "denied");
                if (!allow) {
+                    error_setg(errp, "TLS x509 ACL check for %s is denied",
+                               session->peername);
                    goto error;
                }
            }
--- a/default-configs/aarch64-linux-user.mak
+++ b/default-configs/aarch64-linux-user.mak
@@ -1,3 +1 @@
 # Default configuration for aarch64-linux-user
-
-CONFIG_GDBSTUB_XML=y
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -9,6 +9,11 @@ CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VIRTIO_VGA=y
 CONFIG_VMMOUSE=y
+CONFIG_IPMI=y
+CONFIG_IPMI_LOCAL=y
+CONFIG_IPMI_EXTERN=y
+CONFIG_ISA_IPMI_KCS=y
+CONFIG_ISA_IPMI_BT=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
@@ -46,7 +51,10 @@ CONFIG_APIC=y
 CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
+CONFIG_NVDIMM=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
 CONFIG_SMBIOS=y
+CONFIG_HYPERV_TESTDEV=$(CONFIG_KVM)
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -35,5 +35,5 @@ CONFIG_SDHCI=y
 CONFIG_EDU=y
 CONFIG_VGA=y
 CONFIG_VGA_PCI=y
-CONFIG_IVSHMEM=$(CONFIG_KVM)
+CONFIG_IVSHMEM=$(CONFIG_POSIX)
 CONFIG_ROCKER=y
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -3,6 +3,7 @@
 include pci.mak
 include sound.mak
 include usb.mak
+CONFIG_VIRTIO_VGA=y
 CONFIG_ISA_MMIO=y
 CONFIG_ESCC=y
 CONFIG_M48T59=y
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -9,6 +9,11 @@ CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VIRTIO_VGA=y
 CONFIG_VMMOUSE=y
+CONFIG_IPMI=y
+CONFIG_IPMI_LOCAL=y
+CONFIG_IPMI_EXTERN=y
+CONFIG_ISA_IPMI_KCS=y
+CONFIG_ISA_IPMI_BT=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
@@ -46,7 +51,10 @@ CONFIG_APIC=y
 CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
+CONFIG_NVDIMM=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
 CONFIG_SMBIOS=y
+CONFIG_HYPERV_TESTDEV=$(CONFIG_KVM)
--- a/disas.c
+++ b/disas.c
@@ -214,11 +214,6 @@ void target_disas(FILE *out, CPUState *cpu, target_ulong code,
        s.info.mach = bfd_mach_i386_i386;
    }
    s.info.print_insn = print_insn_i386;
-#elif defined(TARGET_SPARC)
-    s.info.print_insn = print_insn_sparc;
-#ifdef TARGET_SPARC64
-    s.info.mach = bfd_mach_sparc_v9b;
-#endif
 #elif defined(TARGET_PPC)
    if ((flags >> 16) & 1) {
        s.info.endian = BFD_ENDIAN_LITTLE;
@@ -235,29 +230,6 @@ void target_disas(FILE *out, CPUState *cpu, target_ulong code,
    }
    s.info.disassembler_options = (char *)"any";
    s.info.print_insn = print_insn_ppc;
-#elif defined(TARGET_M68K)
-    s.info.print_insn = print_insn_m68k;
-#elif defined(TARGET_MIPS)
-#ifdef TARGET_WORDS_BIGENDIAN
-    s.info.print_insn = print_insn_big_mips;
-#else
-    s.info.print_insn = print_insn_little_mips;
-#endif
-#elif defined(TARGET_SH4)
-    s.info.mach = bfd_mach_sh4;
-    s.info.print_insn = print_insn_sh;
-#elif defined(TARGET_ALPHA)
-    s.info.mach = bfd_mach_alpha_ev6;
-    s.info.print_insn = print_insn_alpha;
-#elif defined(TARGET_S390X)
-    s.info.mach = bfd_mach_s390_64;
-    s.info.print_insn = print_insn_s390;
-#elif defined(TARGET_MOXIE)
-    s.info.mach = bfd_arch_moxie;
-    s.info.print_insn = print_insn_moxie;
-#elif defined(TARGET_LM32)
-    s.info.mach = bfd_mach_lm32;
-    s.info.print_insn = print_insn_lm32;
 #endif
    if (s.info.print_insn == NULL) {
        s.info.print_insn = print_insn_od_target;
@@ -429,13 +401,6 @@ void monitor_disas(Monitor *mon, CPUState *cpu,
        s.info.mach = bfd_mach_i386_i386;
    }
    s.info.print_insn = print_insn_i386;
-#elif defined(TARGET_ALPHA)
-    s.info.print_insn = print_insn_alpha;
-#elif defined(TARGET_SPARC)
-    s.info.print_insn = print_insn_sparc;
-#ifdef TARGET_SPARC64
-    s.info.mach = bfd_mach_sparc_v9b;
-#endif
 #elif defined(TARGET_PPC)
    if (flags & 0xFFFF) {
        /* If we have a precise definition of the instruction set, use it. */
@@ -451,26 +416,6 @@ void monitor_disas(Monitor *mon, CPUState *cpu,
        s.info.endian = BFD_ENDIAN_LITTLE;
    }
    s.info.print_insn = print_insn_ppc;
-#elif defined(TARGET_M68K)
-    s.info.print_insn = print_insn_m68k;
-#elif defined(TARGET_MIPS)
-#ifdef TARGET_WORDS_BIGENDIAN
-    s.info.print_insn = print_insn_big_mips;
-#else
-    s.info.print_insn = print_insn_little_mips;
-#endif
-#elif defined(TARGET_SH4)
-    s.info.mach = bfd_mach_sh4;
-    s.info.print_insn = print_insn_sh;
-#elif defined(TARGET_S390X)
-    s.info.mach = bfd_mach_s390_64;
-    s.info.print_insn = print_insn_s390;
-#elif defined(TARGET_MOXIE)
-    s.info.mach = bfd_arch_moxie;
-    s.info.print_insn = print_insn_moxie;
-#elif defined(TARGET_LM32)
-    s.info.mach = bfd_mach_lm32;
-    s.info.print_insn = print_insn_lm32;
 #endif
    if (!s.info.print_insn) {
        monitor_printf(mon, "0x" TARGET_FMT_lx
--- a/disas/arm.c
+++ b/disas/arm.c
@@ -1779,7 +1779,7 @@ print_insn_coprocessor (bfd_vma pc, struct disassemble_info *info, long given,

 			/* Is ``imm'' a negative number?  */
 			if (imm & 0x40)
-			  imm |= (-1 << 7);
+			  imm |= (~0u << 7);

 			func (stream, "%d", imm);
 		      }
--- a/disas/mips.c
+++ b/disas/mips.c
@@ -2420,9 +2420,11 @@ const struct mips_opcode mips_builtin_opcodes[] =
 {"hibernate","",        0x42000023, 0xffffffff,	0, 			0,		V1	},
 {"ins",     "t,r,+A,+B", 0x7c000004, 0xfc00003f, WR_t|RD_s,    		0,		I33	},
 {"jr",      "s",	0x00000008, 0xfc1fffff,	UBD|RD_s,		0,		I1	},
+{"jr",      "s",	0x00000009, 0xfc1fffff,	UBD|RD_s,		0,		I32R6	}, /* jalr */
 /* jr.hb is officially MIPS{32,64}R2, but it works on R1 as jr with
   the same hazard barrier effect.  */
 {"jr.hb",   "s",	0x00000408, 0xfc1fffff,	UBD|RD_s,		0,		I32	},
+{"jr.hb",   "s",	0x00000409, 0xfc1fffff,	UBD|RD_s,		0,		I32R6	}, /* jalr.hb */
 {"j",       "s",	0x00000008, 0xfc1fffff,	UBD|RD_s,		0,		I1	}, /* jr */
 /* SVR4 PIC code requires special handling for j, so it must be a
   macro.  */
--- a/docs/bitmaps.md
+++ b/docs/bitmaps.md
@@ -19,12 +19,20 @@ which is included at the end of this document.
 * A dirty bitmap's name is unique to the node, but bitmaps attached to different
  nodes can share the same name.

+* Dirty bitmaps created for internal use by QEMU may be anonymous and have no
+  name, but any user-created bitmaps may not be. There can be any number of
+  anonymous bitmaps per node.
+
+* The name of a user-created bitmap must not be empty ("").
+
 ## Bitmap Modes

 * A Bitmap can be "frozen," which means that it is currently in-use by a backup
  operation and cannot be deleted, renamed, written to, reset,
  etc.

+* The normal operating mode for a bitmap is "active."
+
 ## Basic QMP Usage

 ### Supported Commands ###
@@ -97,11 +105,7 @@ which is included at the end of this document.
 }
 ```

-## Transactions (Not yet implemented)
-
-* Transactional commands are forthcoming in a future version,
-  and are not yet available for use. This section serves as
-  documentation of intent for their design and usage.
+## Transactions

 ### Justification

@@ -323,6 +327,155 @@ full backup as a backing image.
      "event": "BLOCK_JOB_COMPLETED" }
    ```

+### Partial Transactional Failures
+
+* Sometimes, a transaction will succeed in launching and return success,
+  but then later the backup jobs themselves may fail. It is possible that
+  a management application may have to deal with a partial backup failure
+  after a successful transaction.
+
+* If multiple backup jobs are specified in a single transaction, when one of
+  them fails, it will not interact with the other backup jobs in any way.
+
+* The job(s) that succeeded will clear the dirty bitmap associated with the
+  operation, but the job(s) that failed will not. It is not "safe" to delete
+  any incremental backups that were created successfully in this scenario,
+  even though others failed.
+
+#### Example
+
+* QMP example highlighting two backup jobs:
+
+    ```json
+    { "execute": "transaction",
+      "arguments": {
+        "actions": [
+          { "type": "drive-backup",
+            "data": { "device": "drive0", "bitmap": "bitmap0",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+          { "type": "drive-backup",
+            "data": { "device": "drive1", "bitmap": "bitmap1",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
+        ]
+      }
+    }
+    ```
+
+* QMP example response, highlighting one success and one failure:
+    * Acknowledgement that the Transaction was accepted and jobs were launched:
+        ```json
+        { "return": {} }
+        ```
+
+    * Later, QEMU sends notice that the first job was completed:
+        ```json
+        { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
+          "data": { "device": "drive0", "type": "backup",
+                     "speed": 0, "len": 67108864, "offset": 67108864 },
+          "event": "BLOCK_JOB_COMPLETED"
+        }
+        ```
+
+    * Later yet, QEMU sends notice that the second job has failed:
+        ```json
+        { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
+          "data": { "device": "drive1", "action": "report",
+                    "operation": "read" },
+          "event": "BLOCK_JOB_ERROR" }
+        ```
+
+        ```json
+        { "timestamp": { "seconds": 1447192399, "microseconds": 685853 },
+          "data": { "speed": 0, "offset": 0, "len": 67108864,
+                    "error": "Input/output error",
+                    "device": "drive1", "type": "backup" },
+          "event": "BLOCK_JOB_COMPLETED" }
+
+* In the above example, "d0-incr-1.qcow2" is valid and must be kept,
+  but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide
+  incremental backup of all drives at a point-in-time is to be made,
+  new backups for both drives will need to be made, taking into account
+  that a new incremental backup for drive0 needs to be based on top of
+  "d0-incr-1.qcow2."
+
+### Grouped Completion Mode
+
+* While jobs launched by transactions normally complete or fail on their own,
+  it is possible to instruct them to complete or fail together as a group.
+
+* QMP transactions take an optional properties structure that can affect
+  the semantics of the transaction.
+
+* The "completion-mode" transaction property can be either "individual"
+  which is the default, legacy behavior described above, or "grouped,"
+  a new behavior detailed below.
+
+* Delayed Completion: In grouped completion mode, no jobs will report
+  success until all jobs are ready to report success.
+
+* Grouped failure: If any job fails in grouped completion mode, all remaining
+  jobs will be cancelled. Any incremental backups will restore their dirty
+  bitmap objects as if no backup command was ever issued.
+
+    * Regardless of if QEMU reports a particular incremental backup job as
+      CANCELLED or as an ERROR, the in-memory bitmap will be restored.
+
+#### Example
+
+* Here's the same example scenario from above with the new property:
+
+    ```json
+    { "execute": "transaction",
+      "arguments": {
+        "actions": [
+          { "type": "drive-backup",
+            "data": { "device": "drive0", "bitmap": "bitmap0",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+          { "type": "drive-backup",
+            "data": { "device": "drive1", "bitmap": "bitmap1",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
+        ],
+        "properties": {
+          "completion-mode": "grouped"
+        }
+      }
+    }
+    ```
+
+* QMP example response, highlighting a failure for drive2:
+    * Acknowledgement that the Transaction was accepted and jobs were launched:
+        ```json
+        { "return": {} }
+        ```
+
+    * Later, QEMU sends notice that the second job has errored out,
+      but that the first job was also cancelled:
+        ```json
+        { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
+          "data": { "device": "drive1", "action": "report",
+                    "operation": "read" },
+          "event": "BLOCK_JOB_ERROR" }
+        ```
+
+        ```json
+        { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
+          "data": { "speed": 0, "offset": 0, "len": 67108864,
+                    "error": "Input/output error",
+                    "device": "drive1", "type": "backup" },
+          "event": "BLOCK_JOB_COMPLETED" }
+        ```
+
+        ```json
+        { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
+          "data": { "device": "drive0", "type": "backup", "speed": 0,
+                    "len": 67108864, "offset": 16777216 },
+          "event": "BLOCK_JOB_CANCELLED" }
+        ```
+
 <!--
 The FreeBSD Documentation License

--- a/docs/blkdebug.txt
+++ b/docs/blkdebug.txt
@@ -1,6 +1,6 @@
 Block I/O error injection using blkdebug
 ----------------------------------------
-Copyright (C) 2014 Red Hat Inc
+Copyright (C) 2014-2015 Red Hat Inc

 This work is licensed under the terms of the GNU GPL, version 2 or later.  See
 the COPYING file in the top-level directory.
@@ -92,8 +92,9 @@ The core events are:

  flush_to_disk - flush the host block device's disk cache

-See block/blkdebug.c:event_names[] for the full list of events.  You may need
-to grep block driver source code to understand the meaning of specific events.
+See qapi/block-core.json:BlkdebugEvent for the full list of events.
+You may need to grep block driver source code to understand the
+meaning of specific events.

 State transitions
 -----------------
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -291,3 +291,194 @@ save/send this state when we are in the middle of a pio operation
 (that is what ide_drive_pio_state_needed() checks).  If DRQ_STAT is
 not enabled, the values on that fields are garbage and don't need to
 be sent.
+
+= Return path =
+
+In most migration scenarios there is only a single data path that runs
+from the source VM to the destination, typically along a single fd (although
+possibly with another fd or similar for some fast way of throwing pages across).
+
+However, some uses need two way communication; in particular the Postcopy
+destination needs to be able to request pages on demand from the source.
+
+For these scenarios there is a 'return path' from the destination to the source;
+qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return
+path.
+
+  Source side
+     Forward path - written by migration thread
+     Return path  - opened by main thread, read by return-path thread
+
+  Destination side
+     Forward path - read by main thread
+     Return path  - opened by main thread, written by main thread AND postcopy
+                    thread (protected by rp_mutex)
+
+= Postcopy =
+'Postcopy' migration is a way to deal with migrations that refuse to converge
+(or take too long to converge) its plus side is that there is an upper bound on
+the amount of migration traffic and time it takes, the down side is that during
+the postcopy phase, a failure of *either* side or the network connection causes
+the guest to be lost.
+
+In postcopy the destination CPUs are started before all the memory has been
+transferred, and accesses to pages that are yet to be transferred cause
+a fault that's translated by QEMU into a request to the source QEMU.
+
+Postcopy can be combined with precopy (i.e. normal migration) so that if precopy
+doesn't finish in a given time the switch is made to postcopy.
+
+=== Enabling postcopy ===
+
+To enable postcopy, issue this command on the monitor prior to the
+start of migration:
+
+migrate_set_capability x-postcopy-ram on
+
+The normal commands are then used to start a migration, which is still
+started in precopy mode.  Issuing:
+
+migrate_start_postcopy
+
+will now cause the transition from precopy to postcopy.
+It can be issued immediately after migration is started or any
+time later on.  Issuing it after the end of a migration is harmless.
+
+Note: During the postcopy phase, the bandwidth limits set using
+migrate_set_speed is ignored (to avoid delaying requested pages that
+the destination is waiting for).
+
+=== Postcopy device transfer ===
+
+Loading of device data may cause the device emulation to access guest RAM
+that may trigger faults that have to be resolved by the source, as such
+the migration stream has to be able to respond with page data *during* the
+device load, and hence the device data has to be read from the stream completely
+before the device load begins to free the stream up.  This is achieved by
+'packaging' the device data into a blob that's read in one go.
+
+Source behaviour
+
+Until postcopy is entered the migration stream is identical to normal
+precopy, except for the addition of a 'postcopy advise' command at
+the beginning, to tell the destination that postcopy might happen.
+When postcopy starts the source sends the page discard data and then
+forms the 'package' containing:
+
+   Command: 'postcopy listen'
+   The device state
+      A series of sections, identical to the precopy streams device state stream
+      containing everything except postcopiable devices (i.e. RAM)
+   Command: 'postcopy run'
+
+The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the
+contents are formatted in the same way as the main migration stream.
+
+During postcopy the source scans the list of dirty pages and sends them
+to the destination without being requested (in much the same way as precopy),
+however when a page request is received from the destination, the dirty page
+scanning restarts from the requested location.  This causes requested pages
+to be sent quickly, and also causes pages directly after the requested page
+to be sent quickly in the hope that those pages are likely to be used
+by the destination soon.
+
+Destination behaviour
+
+Initially the destination looks the same as precopy, with a single thread
+reading the migration stream; the 'postcopy advise' and 'discard' commands
+are processed to change the way RAM is managed, but don't affect the stream
+processing.
+
+------------------------------------------------------------------------------
+                        1      2   3     4 5                      6   7
+main -----DISCARD-CMD_PACKAGED ( LISTEN  DEVICE     DEVICE DEVICE RUN )
+thread                             |       |
+                                   |     (page request)
+                                   |        \___
+                                   v            \
+listen thread:                     --- page -- page -- page -- page -- page --
+
+                                   a   b        c
+------------------------------------------------------------------------------
+
+On receipt of CMD_PACKAGED (1)
+   All the data associated with the package - the ( ... ) section in the
+diagram - is read into memory (into a QEMUSizedBuffer), and the main thread
+recurses into qemu_loadvm_state_main to process the contents of the package (2)
+which contains commands (3,6) and devices (4...)
+
+On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package)
+a new thread (a) is started that takes over servicing the migration stream,
+while the main thread carries on loading the package.   It loads normal
+background page data (b) but if during a device load a fault happens (5) the
+returned page (c) is loaded by the listen thread allowing the main threads
+device load to carry on.
+
+The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination
+CPUs start running.
+At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour
+and is no longer used by migration, while the listen thread carries
+on servicing page data until the end of migration.
+
+=== Postcopy states ===
+
+Postcopy moves through a series of states (see postcopy_state) from
+ADVISE->DISCARD->LISTEN->RUNNING->END
+
+  Advise:  Set at the start of migration if postcopy is enabled, even
+           if it hasn't had the start command; here the destination
+           checks that its OS has the support needed for postcopy, and performs
+           setup to ensure the RAM mappings are suitable for later postcopy.
+           The destination will fail early in migration at this point if the
+           required OS support is not present.
+           (Triggered by reception of POSTCOPY_ADVISE command)
+
+  Discard: Entered on receipt of the first 'discard' command; prior to
+           the first Discard being performed, hugepages are switched off
+           (using madvise) to ensure that no new huge pages are created
+           during the postcopy phase, and to cause any huge pages that
+           have discards on them to be broken.
+
+  Listen:  The first command in the package, POSTCOPY_LISTEN, switches
+           the destination state to Listen, and starts a new thread
+           (the 'listen thread') which takes over the job of receiving
+           pages off the migration stream, while the main thread carries
+           on processing the blob.  With this thread able to process page
+           reception, the destination now 'sensitises' the RAM to detect
+           any access to missing pages (on Linux using the 'userfault'
+           system).
+
+  Running: POSTCOPY_RUN causes the destination to synchronise all
+           state and start the CPUs and IO devices running.  The main
+           thread now finishes processing the migration package and
+           now carries on as it would for normal precopy migration
+           (although it can't do the cleanup it would do as it
+           finishes a normal migration).
+
+  End:     The listen thread can now quit, and perform the cleanup of migration
+           state, the migration is now complete.
+
+=== Source side page maps ===
+
+The source side keeps two bitmaps during postcopy; 'the migration bitmap'
+and 'unsent map'.  The 'migration bitmap' is basically the same as in
+the precopy case, and holds a bit to indicate that page is 'dirty' -
+i.e. needs sending.  During the precopy phase this is updated as the CPU
+dirties pages, however during postcopy the CPUs are stopped and nothing
+should dirty anything any more.
+
+The 'unsent map' is used for the transition to postcopy. It is a bitmap that
+has a bit cleared whenever a page is sent to the destination, however during
+the transition to postcopy mode it is combined with the migration bitmap
+to form a set of pages that:
+   a) Have been sent but then redirtied (which must be discarded)
+   b) Have not yet been sent - which also must be discarded to cause any
+      transparent huge pages built during precopy to be broken.
+
+Note that the contents of the unsentmap are sacrificed during the calculation
+of the discard set and thus aren't valid once in postcopy.  The dirtymap
+is still valid and is used to ensure that no page is sent more than once.  Any
+request for a page that has already been sent is ignored.  Duplicate requests
+such as this can happen as a page is sent at about the same time the
+destination accesses it.
+
--- a/docs/pci_expander_bridge.txt
+++ b/docs/pci_expander_bridge.txt
@@ -23,9 +23,9 @@ A detailed command line would be:
 -m 2G
 -object memory-backend-ram,size=1024M,policy=bind,host-nodes=0,id=ram-node0 -numa node,nodeid=0,cpus=0,memdev=ram-node0
 -object memory-backend-ram,size=1024M,policy=bind,host-nodes=1,id=ram-node1 -numa node,nodeid=1,cpus=1,memdev=ram-node1
-device pxb,id=bridge1,bus=pci.0,numa_node=1,bus_nr=4 -netdev user,id=nd-device e1000,bus=bridge1,addr=0x4,netdev=nd
-device pxb,id=bridge2,bus=pci.0,numa_node=0,bus_nr=8,bus=pci.0 -device e1000,bus=bridge2,addr=0x3
-device pxb,id=bridge3,bus=pci.0,bus_nr=40,bus=pci.0 -drive if=none,id=drive0,file=[img] -device virtio-blk-pci,drive=drive0,scsi=off,bus=bridge3,addr=1
+-device pxb,id=bridge1,bus=pci.0,numa_node=1,bus_nr=4 -netdev user,id=nd -device e1000,bus=bridge1,addr=0x4,netdev=nd
+-device pxb,id=bridge2,bus=pci.0,numa_node=0,bus_nr=8, -device e1000,bus=bridge2,addr=0x3
+-device pxb,id=bridge3,bus=pci.0,bus_nr=40, -drive if=none,id=drive0,file=[img] -device virtio-blk-pci,drive=drive0,scsi=off,bus=bridge3,addr=1

 Here you have:
 - 2 NUMA nodes for the guest, 0 and 1. (both mapped to the same NUMA node in host, but you can and should put it in different host NUMA nodes)
--- a/docs/qapi-code-gen.txt
+++ b/docs/qapi-code-gen.txt
@@ -106,24 +106,28 @@ Types, commands, and events share a common namespace.  Therefore,
 generally speaking, type definitions should always use CamelCase for
 user-defined type names, while built-in types are lowercase. Type
 definitions should not end in 'Kind', as this namespace is used for
-creating implicit C enums for visiting union types.  Command names,
+creating implicit C enums for visiting union types, or in 'List', as
+this namespace is used for creating array types.  Command names,
 and field names within a type, should be all lower case with words
 separated by a hyphen.  However, some existing older commands and
 complex types use underscore; when extending such expressions,
 consistency is preferred over blindly avoiding underscore.  Event
-names should be ALL_CAPS with words separated by underscore.
+names should be ALL_CAPS with words separated by underscore.  Field
+names cannot start with 'has-' or 'has_', as this is reserved for
+tracking optional fields.

 Any name (command, event, type, field, or enum value) beginning with
 "x-" is marked experimental, and may be withdrawn or changed
-incompatibly in a future release.  Downstream vendors may add
-extensions; such extensions should begin with a prefix matching
-"__RFQDN_" (for the reverse-fully-qualified-domain-name of the
-vendor), even if the rest of the name uses dash (example:
-__com.redhat_drive-mirror).  Other than downstream extensions (with
-leading underscore and the use of dots), all names should begin with a
-letter, and contain only ASCII letters, digits, dash, and underscore.
-It is okay to reuse names that match C keywords; the generator will
-rename a field named "default" in the QAPI to "q_default" in the
+incompatibly in a future release.  All names must begin with a letter,
+and contain only ASCII letters, digits, dash, and underscore.  There
+are two exceptions: enum values may start with a digit, and any
+extensions added by downstream vendors should start with a prefix
+matching "__RFQDN_" (for the reverse-fully-qualified-domain-name of
+the vendor), even if the rest of the name uses dash (example:
+__com.redhat_drive-mirror).  Names beginning with 'q_' are reserved
+for the generator: QMP names that resemble C keywords or other
+problematic strings will be munged in C to use this prefix.  For
+example, a field named "default" in qapi becomes "q_default" in the
 generated C code.

 In the rest of this document, usage lines are given for each
@@ -156,6 +160,7 @@ The following types are predefined, and map to C as follows:
                       accepts size suffixes
  bool      bool       JSON true or false
  any       QObject *  any JSON value
+  QType     QType      JSON string matching enum QType values


 === Includes ===
@@ -379,9 +384,6 @@ where each branch of the union names a QAPI type.  For example:
   'data': { 'definition': 'BlockdevOptions',
             'reference': 'str' } }

-Just like for a simple union, an implicit C enum 'NameKind' is created
-to enumerate the branches for the alternate 'Name'.
-
 Unlike a union, the discriminator string is never passed on the wire
 for the Client JSON Protocol.  Instead, the value's JSON type serves
 as an implicit discriminator, which in turn means that an alternate
@@ -510,8 +512,23 @@ exactly the server (QEMU) supports.
 For this purpose, QMP provides introspection via command
 query-qmp-schema.  QGA currently doesn't support introspection.

+While Client JSON Protocol wire compatibility should be maintained
+between qemu versions, we cannot make the same guarantees for
+introspection stability.  For example, one version of qemu may provide
+a non-variant optional member of a struct, and a later version rework
+the member to instead be non-optional and associated with a variant.
+Likewise, one version of qemu may list a member with open-ended type
+'str', and a later version could convert it to a finite set of strings
+via an enum type; or a member may be converted from a specific type to
+an alternate that represents a choice between the original type and
+something else.
+
 query-qmp-schema returns a JSON array of SchemaInfo objects.  These
 objects together describe the wire ABI, as defined in the QAPI schema.
+There is no specified order to the SchemaInfo objects returned; a
+client must search for a particular name throughout the entire array
+to learn more about that name, but is at least guaranteed that there
+will be no collisions between type, command, and event names.

 However, the SchemaInfo can't reflect all the rules and restrictions
 that apply to QMP.  It's interface introspection (figuring out what's
@@ -592,7 +609,9 @@ any.  Each element is a JSON object with members "name" (the member's
 name), "type" (the name of its type), and optionally "default".  The
 member is optional if "default" is present.  Currently, "default" can
 only have value null.  Other values are reserved for future
-extensions.
+extensions.  The "members" array is in no particular order; clients
+must search the entire object when learning whether a particular
+member is supported.

 Example: the SchemaInfo for MyType from section Struct types

@@ -606,7 +625,9 @@ Example: the SchemaInfo for MyType from section Struct types
 "variants" is a JSON array describing the object's variant members.
 Each element is a JSON object with members "case" (the value of type
 tag this element applies to) and "type" (the name of an object type
-that provides the variant members for this type tag value).
+that provides the variant members for this type tag value).  The
+"variants" array is in no particular order, and is not guaranteed to
+list cases in the same order as the corresponding "tag" enum type.

 Example: the SchemaInfo for flat union BlockdevOptions from section
 Union types
@@ -647,7 +668,8 @@ Union types
 The SchemaInfo for an alternate type has meta-type "alternate", and
 variant member "members".  "members" is a JSON array.  Each element is
 a JSON object with member "type", which names a type.  Values of the
-alternate type conform to exactly one of its member types.
+alternate type conform to exactly one of its member types.  There is
+no guarantee on the order in which "members" will be listed.

 Example: the SchemaInfo for BlockRef from section Alternate types

@@ -658,15 +680,20 @@ Example: the SchemaInfo for BlockRef from section Alternate types

 The SchemaInfo for an array type has meta-type "array", and variant
 member "element-type", which names the array's element type.  Array
-types are implicitly defined.
+types are implicitly defined.  For convenience, the array's name may
+resemble the element type; however, clients should examine member
+"element-type" instead of making assumptions based on parsing member
+"name".

 Example: the SchemaInfo for ['str']

-    { "name": "strList", "meta-type": "array",
+    { "name": "[str]", "meta-type": "array",
      "element-type": "str" }

 The SchemaInfo for an enumeration type has meta-type "enum" and
-variant member "values".
+variant member "values".  The values are listed in no particular
+order; clients must search the entire enum when learning whether a
+particular value is supported.

 Example: the SchemaInfo for MyEnum from section Enumeration types

@@ -1024,7 +1051,7 @@ Example:

    const char *const example_QAPIEvent_lookup[] = {
        [EXAMPLE_QAPI_EVENT_MY_EVENT] = "MY_EVENT",
-        [EXAMPLE_QAPI_EVENT_MAX] = NULL,
+        [EXAMPLE_QAPI_EVENT__MAX] = NULL,
    };
    $ cat qapi-generated/example-qapi-event.h
 [Uninteresting stuff omitted...]
@@ -1041,7 +1068,7 @@ Example:

    typedef enum example_QAPIEvent {
        EXAMPLE_QAPI_EVENT_MY_EVENT = 0,
-        EXAMPLE_QAPI_EVENT_MAX = 1,
+        EXAMPLE_QAPI_EVENT__MAX = 1,
    } example_QAPIEvent;

    extern const char *const example_QAPIEvent_lookup[];
--- a/docs/qmp-events.txt
+++ b/docs/qmp-events.txt
@@ -28,6 +28,8 @@ Example:
    "data": { "actual": 944766976 },
    "timestamp": { "seconds": 1267020223, "microseconds": 435656 } }

+Note: this event is rate-limited.
+
 BLOCK_IMAGE_CORRUPTED
 ---------------------

@@ -296,6 +298,8 @@ Example:
     "data": { "reference": "usr1", "sector-num": 345435, "sectors-count": 5 },
     "timestamp": { "seconds": 1344522075, "microseconds": 745528 } }

+Note: this event is rate-limited.
+
 QUORUM_REPORT_BAD
 -----------------

@@ -318,6 +322,8 @@ Example:
     "data": { "node-name": "1.raw", "sector-num": 345435, "sectors-count": 5 },
     "timestamp": { "seconds": 1344522075, "microseconds": 745528 } }

+Note: this event is rate-limited.
+
 RESET
 -----

@@ -358,6 +364,8 @@ Example:
    "data": { "offset": 78 },
    "timestamp": { "seconds": 1267020223, "microseconds": 435656 } }

+Note: this event is rate-limited.
+
 SHUTDOWN
 --------

@@ -632,6 +640,8 @@ Example:
    "data": { "id": "channel0", "open": true },
    "timestamp": { "seconds": 1401385907, "microseconds": 422329 } }

+Note: this event is rate-limited separately for each "id".
+
 WAKEUP
 ------

@@ -662,3 +672,5 @@ Example:

 Note: If action is "reset", "shutdown", or "pause" the WATCHDOG event is
 followed respectively by the RESET, SHUTDOWN, or STOP events.
+
+Note: this event is rate-limited.
--- a/docs/qmp-spec.txt
+++ b/docs/qmp-spec.txt
@@ -175,6 +175,11 @@ The format of asynchronous events is:
 For a listing of supported asynchronous events, please, refer to the
 qmp-events.txt file.

+Some events are rate-limited to at most one per second.  If additional
+"similar" events arrive within one second, all but the last one are
+dropped, and the last one is delayed.  "Similar" normally means same
+event type.  See qmp-events.txt for details.
+
 2.5 QGA Synchronization
 -----------------------

--- a/docs/replay.txt
+++ b/docs/replay.txt
@@ -0,0 +1,168 @@
+Copyright (c) 2010-2015 Institute for System Programming
+                        of the Russian Academy of Sciences.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+Record/replay
+-------------
+
+Record/replay functions are used for the reverse execution and deterministic
+replay of qemu execution. This implementation of deterministic replay can
+be used for deterministic debugging of guest code through a gdb remote
+interface.
+
+Execution recording writes a non-deterministic events log, which can be later
+used for replaying the execution anywhere and for unlimited number of times.
+It also supports checkpointing for faster rewinding during reverse debugging.
+Execution replaying reads the log and replays all non-deterministic events
+including external input, hardware clocks, and interrupts.
+
+Deterministic replay has the following features:
+ * Deterministically replays whole system execution and all contents of
+   the memory, state of the hardware devices, clocks, and screen of the VM.
+ * Writes execution log into the file for later replaying for multiple times
+   on different machines.
+ * Supports i386, x86_64, and ARM hardware platforms.
+ * Performs deterministic replay of all operations with keyboard and mouse
+   input devices.
+
+Usage of the record/replay:
+ * First, record the execution, by adding the following arguments to the command line:
+   '-icount shift=7,rr=record,rrfile=replay.bin -net none'.
+   Block devices' images are not actually changed in the recording mode,
+   because all of the changes are written to the temporary overlay file.
+ * Then you can replay it by using another command
+   line option: '-icount shift=7,rr=replay,rrfile=replay.bin -net none'
+ * '-net none' option should also be specified if network replay patches
+   are not applied.
+
+Papers with description of deterministic replay implementation:
+http://www.computer.org/csdl/proceedings/csmr/2012/4666/00/4666a553-abs.html
+http://dl.acm.org/citation.cfm?id=2786805.2803179
+
+Modifications of qemu include:
+ * wrappers for clock and time functions to save their return values in the log
+ * saving different asynchronous events (e.g. system shutdown) into the log
+ * synchronization of the bottom halves execution
+ * synchronization of the threads from thread pool
+ * recording/replaying user input (mouse and keyboard)
+ * adding internal checkpoints for cpu and io synchronization
+
+Non-deterministic events
+------------------------
+
+Our record/replay system is based on saving and replaying non-deterministic
+events (e.g. keyboard input) and simulating deterministic ones (e.g. reading
+from HDD or memory of the VM). Saving only non-deterministic events makes
+log file smaller, simulation faster, and allows using reverse debugging even
+for realtime applications.
+
+The following non-deterministic data from peripheral devices is saved into
+the log: mouse and keyboard input, network packets, audio controller input,
+USB packets, serial port input, and hardware clocks (they are non-deterministic
+too, because their values are taken from the host machine). Inputs from
+simulated hardware, memory of VM, software interrupts, and execution of
+instructions are not saved into the log, because they are deterministic and
+can be replayed by simulating the behavior of virtual machine starting from
+initial state.
+
+We had to solve three tasks to implement deterministic replay: recording
+non-deterministic events, replaying non-deterministic events, and checking
+that there is no divergence between record and replay modes.
+
+We changed several parts of QEMU to make event log recording and replaying.
+Devices' models that have non-deterministic input from external devices were
+changed to write every external event into the execution log immediately.
+E.g. network packets are written into the log when they arrive into the virtual
+network adapter.
+
+All non-deterministic events are coming from these devices. But to
+replay them we need to know at which moments they occur. We specify
+these moments by counting the number of instructions executed between
+every pair of consecutive events.
+
+Instruction counting
+--------------------
+
+QEMU should work in icount mode to use record/replay feature. icount was
+designed to allow deterministic execution in absence of external inputs
+of the virtual machine. We also use icount to control the occurrence of the
+non-deterministic events. The number of instructions elapsed from the last event
+is written to the log while recording the execution. In replay mode we
+can predict when to inject that event using the instruction counter.
+
+Timers
+------
+
+Timers are used to execute callbacks from different subsystems of QEMU
+at the specified moments of time. There are several kinds of timers:
+ * Real time clock. Based on host time and used only for callbacks that
+   do not change the virtual machine state. For this reason real time
+   clock and timers does not affect deterministic replay at all.
+ * Virtual clock. These timers run only during the emulation. In icount
+   mode virtual clock value is calculated using executed instructions counter.
+   That is why it is completely deterministic and does not have to be recorded.
+ * Host clock. This clock is used by device models that simulate real time
+   sources (e.g. real time clock chip). Host clock is the one of the sources
+   of non-determinism. Host clock read operations should be logged to
+   make the execution deterministic.
+ * Real time clock for icount. This clock is similar to real time clock but
+   it is used only for increasing virtual clock while virtual machine is
+   sleeping. Due to its nature it is also non-deterministic as the host clock
+   and has to be logged too.
+
+Checkpoints
+-----------
+
+Replaying of the execution of virtual machine is bound by sources of
+non-determinism. These are inputs from clock and peripheral devices,
+and QEMU thread scheduling. Thread scheduling affect on processing events
+from timers, asynchronous input-output, and bottom halves.
+
+Invocations of timers are coupled with clock reads and changing the state
+of the virtual machine. Reads produce non-deterministic data taken from
+host clock. And VM state changes should preserve their order. Their relative
+order in replay mode must replicate the order of callbacks in record mode.
+To preserve this order we use checkpoints. When a specific clock is processed
+in record mode we save to the log special "checkpoint" event.
+Checkpoints here do not refer to virtual machine snapshots. They are just
+record/replay events used for synchronization.
+
+QEMU in replay mode will try to invoke timers processing in random moment
+of time. That's why we do not process a group of timers until the checkpoint
+event will be read from the log. Such an event allows synchronizing CPU
+execution and timer events.
+
+Another checkpoints application in record/replay is instruction counting
+while the virtual machine is idle. This function (qemu_clock_warp) is called
+from the wait loop. It changes virtual machine state and must be deterministic
+then. That is why we added checkpoint to this function to prevent its
+operation in replay mode when it does not correspond to record mode.
+
+Bottom halves
+-------------
+
+Disk I/O events are completely deterministic in our model, because
+in both record and replay modes we start virtual machine from the same
+disk state. But callbacks that virtual disk controller uses for reading and
+writing the disk may occur at different moments of time in record and replay
+modes.
+
+Reading and writing requests are created by CPU thread of QEMU. Later these
+requests proceed to block layer which creates "bottom halves". Bottom
+halves consist of callback and its parameters. They are processed when
+main loop locks the global mutex. These locks are not synchronized with
+replaying process because main loop also processes the events that do not
+affect the virtual machine state (like user interaction with monitor).
+
+That is why we had to implement saving and replaying bottom halves callbacks
+synchronously to the CPU execution. When the callback is about to execute
+it is added to the queue in the replay module. This queue is written to the
+log when its callbacks are executed. In replay mode callbacks are not processed
+until the corresponding event is read from the events log file.
+
+Sometimes the block layer uses asynchronous callbacks for its internal purposes
+(like reading or writing VM snapshots or disk image cluster tables). In this
+case bottom halves are not marked as "replayable" and do not saved
+into the log.
--- a/docs/specs/fw_cfg.txt
+++ b/docs/specs/fw_cfg.txt
@@ -76,6 +76,13 @@ increasing address order, similar to memcpy().

 Selector Register IOport: 0x510
 Data Register IOport:     0x511
+DMA Address IOport:       0x514
+
+=== ARM Register Locations ===
+
+Selector Register address: Base + 8 (2 bytes)
+Data Register address:     Base + 0 (8 bytes)
+DMA Address address:       Base + 16 (8 bytes)

 == Firmware Configuration Items ==

@@ -86,11 +93,15 @@ by selecting the "signature" item using key 0x0000 (FW_CFG_SIGNATURE),
 and reading four bytes from the data register. If the fw_cfg device is
 present, the four bytes read will contain the characters "QEMU".

-=== Revision (Key 0x0001, FW_CFG_ID) ===
+If the DMA interface is available, then reading the DMA Address
+Register returns 0x51454d5520434647 ("QEMU CFG" in big-endian format).

-A 32-bit little-endian unsigned int, this item is used as an interface
-revision number, and is currently set to 1 by QEMU when fw_cfg is
-initialized.
+=== Revision / feature bitmap (Key 0x0001, FW_CFG_ID) ===
+
+A 32-bit little-endian unsigned int, this item is used to check for enabled
+features.
+ - Bit 0: traditional interface. Always set.
+ - Bit 1: DMA interface.

 === File Directory (Key 0x0019, FW_CFG_FILE_DIR) ===

@@ -132,79 +143,56 @@ Selector Reg.    Range Usage
 In practice, the number of allowed firmware configuration items is given
 by the value of FW_CFG_MAX_ENTRY (see fw_cfg.h).

-= Host-side API =
+= Guest-side DMA Interface =

-The following functions are available to the QEMU programmer for adding
-data to a fw_cfg device during guest initialization (see fw_cfg.h for
-each function's complete prototype):
+If bit 1 of the feature bitmap is set, the DMA interface is present. This does
+not replace the existing fw_cfg interface, it is an add-on. This interface
+can be used through the 64-bit wide address register.

-== fw_cfg_add_bytes() ==
+The address register is in big-endian format. The value for the register is 0
+at startup and after an operation. A write to the least significant half (at
+offset 4) triggers an operation. This means that operations with 32-bit
+addresses can be triggered with just one write, whereas operations with
+64-bit addresses can be triggered with one 64-bit write or two 32-bit writes,
+starting with the most significant half (at offset 0).

-Given a selector key value, starting pointer, and size, create an item
-as a raw "blob" of the given size, available by selecting the given key.
-The data referenced by the starting pointer is only linked, NOT copied,
-into the data structure of the fw_cfg device.
+In this register, the physical address of a FWCfgDmaAccess structure in RAM
+should be written. This is the format of the FWCfgDmaAccess structure:

-== fw_cfg_add_string() ==
+typedef struct FWCfgDmaAccess {
+    uint32_t control;
+    uint32_t length;
+    uint64_t address;
+} FWCfgDmaAccess;

-Instead of a starting pointer and size, this function accepts a pointer
-to a NUL-terminated ascii string, and inserts a newly allocated copy of
-the string (including the NUL terminator) into the fw_cfg device data
-structure.
+The fields of the structure are in big endian mode, and the field at the lowest
+address is the "control" field.

-== fw_cfg_add_iXX() ==
+The "control" field has the following bits:
+ - Bit 0: Error
+ - Bit 1: Read
+ - Bit 2: Skip
+ - Bit 3: Select. The upper 16 bits are the selected index.

-Insert an XX-bit item, where XX may be 16, 32, or 64. These functions
-will convert a 16-, 32-, or 64-bit integer to little-endian, then add
-a dynamically allocated copy of the appropriately sized item to fw_cfg
-under the given selector key value.
+When an operation is triggered, if the "control" field has bit 3 set, the
+upper 16 bits are interpreted as an index of a firmware configuration item.
+This has the same effect as writing the selector register.

-== fw_cfg_add_file() ==
+If the "control" field has bit 1 set, a read operation will be performed.
+"length" bytes for the current selector and offset will be copied into the
+physical RAM address specified by the "address" field.

-Given a filename (i.e., fw_cfg item name), starting pointer, and size,
-create an item as a raw "blob" of the given size. Unlike fw_cfg_add_bytes()
-above, the next available selector key (above 0x0020, FW_CFG_FILE_FIRST)
-will be used, and a new entry will be added to the file directory structure
-(at key 0x0019), containing the item name, blob size, and automatically
-assigned selector key value. The data referenced by the starting pointer
-is only linked, NOT copied, into the fw_cfg data structure.
+If the "control" field has bit 2 set (and not bit 1), a skip operation will be
+performed. The offset for the current selector will be advanced "length" bytes.

-== fw_cfg_add_file_callback() ==
+To check the result, read the "control" field:
+   error bit set        ->  something went wrong.
+   all bits cleared     ->  transfer finished successfully.
+   otherwise            ->  transfer still in progress (doesn't happen
+                            today due to implementation not being async,
+                            but may in the future).

-Like fw_cfg_add_file(), but additionally sets pointers to a callback
-function (and opaque argument), which will be executed host-side by
-QEMU each time a byte is read by the guest from this particular item.
-
-NOTE: The callback function is given the opaque argument set by
-fw_cfg_add_file_callback(), but also the current data offset,
-allowing it the option of only acting upon specific offset values
-(e.g., 0, before the first data byte of the selected item is
-returned to the guest).
-
-== fw_cfg_modify_file() ==
-
-Given a filename (i.e., fw_cfg item name), starting pointer, and size,
-completely replace the configuration item referenced by the given item
-name with the new given blob. If an existing blob is found, its
-callback information is removed, and a pointer to the old data is
-returned to allow the caller to free it, helping avoid memory leaks.
-If a configuration item does not already exist under the given item
-name, a new item will be created as with fw_cfg_add_file(), and NULL
-is returned to the caller. In any case, the data referenced by the
-starting pointer is only linked, NOT copied, into the fw_cfg data
-structure.
-
-== fw_cfg_add_callback() ==
-
-Like fw_cfg_add_bytes(), but additionally sets pointers to a callback
-function (and opaque argument), which will be executed host-side by
-QEMU each time a guest-side write operation to this particular item
-completes fully overwriting the item's data.
-
-NOTE: This function is deprecated, and will be completely removed
-starting with QEMU v2.4.
-
-== Externally Provided Items ==
+= Externally Provided Items =

 As of v2.4, "file" fw_cfg items (i.e., items with selector keys above
 FW_CFG_FILE_FIRST, and with a corresponding entry in the fw_cfg file
@@ -216,6 +204,21 @@ the following syntax:
 where <item_name> is the fw_cfg item name, and <path> is the location
 on the host file system of a file containing the data to be inserted.

+Small enough items may be provided directly as strings on the command
+line, using the syntax:
+
+    -fw_cfg [name=]<item_name>,string=<string>
+
+The terminating NUL character of the content <string> will NOT be
+included as part of the fw_cfg item data, which is consistent with
+the absence of a NUL terminator for items inserted via the file option.
+
+Both <item_name> and, if applicable, the content <string> are passed
+through by QEMU without any interpretation, expansion, or further
+processing. Any such processing (potentially performed e.g., by the shell)
+is outside of QEMU's responsibility; as such, using plain ASCII characters
+is recommended.
+
 NOTE: Users *SHOULD* choose item names beginning with the prefix "opt/"
 when using the "-fw_cfg" command line option, to avoid conflicting with
 item names used internally by QEMU. For instance:
--- a/docs/specs/ivshmem_device_spec.txt
+++ b/docs/specs/ivshmem_device_spec.txt
@@ -2,30 +2,106 @@
 Device Specification for Inter-VM shared memory device
 ------------------------------------------------------

-The Inter-VM shared memory device is designed to share a region of memory to
-userspace in multiple virtual guests.  The memory region does not belong to any
-guest, but is a POSIX memory object on the host.  Optionally, the device may
-support sending interrupts to other guests sharing the same memory region.
+The Inter-VM shared memory device is designed to share a memory region (created
+on the host via the POSIX shared memory API) between multiple QEMU processes
+running different guests. In order for all guests to be able to pick up the
+shared memory area, it is modeled by QEMU as a PCI device exposing said memory
+to the guest as a PCI BAR.
+The memory region does not belong to any guest, but is a POSIX memory object on
+the host. The host can access this shared memory if needed.
+
+The device also provides an optional communication mechanism between guests
+sharing the same memory object. More details about that in the section 'Guest to
+guest communication' section.


 The Inter-VM PCI device
 -----------------------

-*BARs*
+From the VM point of view, the ivshmem PCI device supports three BARs.

-The device supports three BARs.  BAR0 is a 1 Kbyte MMIO region to support
-registers.  BAR1 is used for MSI-X when it is enabled in the device.  BAR2 is
-used to map the shared memory object from the host.  The size of BAR2 is
-specified when the guest is started and must be a power of 2 in size.
+- BAR0 is a 1 Kbyte MMIO region to support registers and interrupts when MSI is
+  not used.
+- BAR1 is used for MSI-X when it is enabled in the device.
+- BAR2 is used to access the shared memory object.

-*Registers*
+It is your choice how to use the device but you must choose between two
+behaviors :

-The device currently supports 4 registers of 32-bits each.  Registers
-are used for synchronization between guests sharing the same memory object when
-interrupts are supported (this requires using the shared memory server).
+- basically, if you only need the shared memory part, you will map BAR2.
+  This way, you have access to the shared memory in guest and can use it as you
+  see fit (memnic, for example, uses it in userland
+  http://dpdk.org/browse/memnic).

-The server assigns each VM an ID number and sends this ID number to the QEMU
-process when the guest starts.
+- BAR0 and BAR1 are used to implement an optional communication mechanism
+  through interrupts in the guests. If you need an event mechanism between the
+  guests accessing the shared memory, you will most likely want to write a
+  kernel driver that will handle interrupts. See details in the section 'Guest
+  to guest communication' section.
+
+The behavior is chosen when starting your QEMU processes:
+- no communication mechanism needed, the first QEMU to start creates the shared
+  memory on the host, subsequent QEMU processes will use it.
+
+- communication mechanism needed, an ivshmem server must be started before any
+  QEMU processes, then each QEMU process connects to the server unix socket.
+
+For more details on the QEMU ivshmem parameters, see qemu-doc documentation.
+
+
+Guest to guest communication
+----------------------------
+
+This section details the communication mechanism between the guests accessing
+the ivhsmem shared memory.
+
+*ivshmem server*
+
+This server code is available in qemu.git/contrib/ivshmem-server.
+
+The server must be started on the host before any guest.
+It creates a shared memory object then waits for clients to connect on a unix
+socket. All the messages are little-endian int64_t integer.
+
+For each client (QEMU process) that connects to the server:
+- the server sends a protocol version, if client does not support it, the client
+  closes the communication,
+- the server assigns an ID for this client and sends this ID to him as the first
+  message,
+- the server sends a fd to the shared memory object to this client,
+- the server creates a new set of host eventfds associated to the new client and
+  sends this set to all already connected clients,
+- finally, the server sends all the eventfds sets for all clients to the new
+  client.
+
+The server signals all clients when one of them disconnects.
+
+The client IDs are limited to 16 bits because of the current implementation (see
+Doorbell register in 'PCI device registers' subsection). Hence only 65536
+clients are supported.
+
+All the file descriptors (fd to the shared memory, eventfds for each client)
+are passed to clients using SCM_RIGHTS over the server unix socket.
+
+Apart from the current ivshmem implementation in QEMU, an ivshmem client has
+been provided in qemu.git/contrib/ivshmem-client for debug.
+
+*QEMU as an ivshmem client*
+
+At initialisation, when creating the ivshmem device, QEMU first receives a
+protocol version and closes communication with server if it does not match.
+Then, QEMU gets its ID from the server then makes it available through BAR0
+IVPosition register for the VM to use (see 'PCI device registers' subsection).
+QEMU then uses the fd to the shared memory to map it to BAR2.
+eventfds for all other clients received from the server are stored to implement
+BAR0 Doorbell register (see 'PCI device registers' subsection).
+Finally, eventfds assigned to this QEMU process are used to send interrupts in
+this VM.
+
+*PCI device registers*
+
+From the VM point of view, the ivshmem PCI device supports 4 registers of
+32-bits each.

 enum ivshmem_registers {
    IntrMask = 0,
@@ -49,8 +125,8 @@ bit to 0 and unmasked by setting the first bit to 1.
 IVPosition Register: The IVPosition register is read-only and reports the
 guest's ID number.  The guest IDs are non-negative integers.  When using the
 server, since the server is a separate process, the VM ID will only be set when
-the device is ready (shared memory is received from the server and accessible via
-the device).  If the device is not ready, the IVPosition will return -1.
+the device is ready (shared memory is received from the server and accessible
+via the device).  If the device is not ready, the IVPosition will return -1.
 Applications should ensure that they have a valid VM ID before accessing the
 shared memory.

@@ -59,8 +135,8 @@ Doorbell register.  The doorbell register is 32-bits, logically divided into
 two 16-bit fields.  The high 16-bits are the guest ID to interrupt and the low
 16-bits are the interrupt vector to trigger.  The semantics of the value
 written to the doorbell depends on whether the device is using MSI or a regular
-pin-based interrupt.  In short, MSI uses vectors while regular interrupts set the
-status register.
+pin-based interrupt.  In short, MSI uses vectors while regular interrupts set
+the status register.

 Regular Interrupts

@@ -71,7 +147,7 @@ interrupt in the destination guest.

 Message Signalled Interrupts

-A ivshmem device may support multiple MSI vectors.  If so, the lower 16-bits
+An ivshmem device may support multiple MSI vectors.  If so, the lower 16-bits
 written to the Doorbell register must be between 0 and the maximum number of
 vectors the guest supports.  The lower 16 bits written to the doorbell is the
 MSI vector that will be raised in the destination guest.  The number of MSI
@@ -83,14 +159,3 @@ interrupt itself should be communicated via the shared memory region.  Devices
 supporting multiple MSI vectors can use different vectors to indicate different
 events have occurred.  The semantics of interrupt vectors are left to the
 user's discretion.
-
-
-Usage in the Guest
------------------
-
-The shared memory device is intended to be used with the provided UIO driver.
-Very little configuration is needed.  The guest should map BAR0 to access the
-registers (an array of 32-bit ints allows simple writing) and map BAR2 to
-access the shared memory region itself.  The size of the shared memory region
-is specified when the guest (or shared memory server) is started.  A guest may
-map the whole shared memory region or only part of it.
--- a/docs/specs/parallels.txt
+++ b/docs/specs/parallels.txt
@@ -0,0 +1,228 @@
+= License =
+
+Copyright (c) 2015 Denis Lunev
+Copyright (c) 2015 Vladimir Sementsov-Ogievskiy
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+= Parallels Expandable Image File Format =
+
+A Parallels expandable image file consists of three consecutive parts:
+    * header
+    * BAT
+    * data area
+
+All numbers in a Parallels expandable image are stored in little-endian byte
+order.
+
+
+== Definitions ==
+
+    Sector    A 512-byte data chunk.
+
+    Cluster   A data chunk of the size specified in the image header.
+              Currently, the default size is 1MiB (2048 sectors). In previous
+              versions, cluster sizes of 63 sectors, 256 and 252 kilobytes were
+              used.
+
+    BAT       Block Allocation Table, an entity that contains information for
+              guest-to-host I/O data address translation.
+
+
+== Header ==
+
+The header is placed at the start of an image and contains the following
+fields:
+
+Bytes:
+   0 - 15:    magic
+              Must contain "WithoutFreeSpace" or "WithouFreSpacExt".
+
+  16 - 19:    version
+              Must be 2.
+
+  20 - 23:    heads
+              Disk geometry parameter for guest.
+
+  24 - 27:    cylinders
+              Disk geometry parameter for guest.
+
+  28 - 31:    tracks
+              Cluster size, in sectors.
+
+  32 - 35:    nb_bat_entries
+              Disk size, in clusters (BAT size).
+
+  36 - 43:    nb_sectors
+              Disk size, in sectors.
+
+              For "WithoutFreeSpace" images:
+              Only the lowest 4 bytes are used. The highest 4 bytes must be
+              cleared in this case.
+
+              For "WithouFreSpacExt" images, there are no such
+              restrictions.
+
+  44 - 47:    in_use
+              Set to 0x746F6E59 when the image is opened by software in R/W
+              mode; set to 0x312e3276 when the image is closed.
+
+              A zero in this field means that the image was opened by an old
+              version of the software that doesn't support Format Extension
+              (see below).
+
+              Other values are not allowed.
+
+  48 - 51:    data_off
+              An offset, in sectors, from the start of the file to the start of
+              the data area.
+
+              For "WithoutFreeSpace" images:
+              - If data_off is zero, the offset is calculated as the end of BAT
+                table plus some padding to ensure sector size alignment.
+              - If data_off is non-zero, the offset should be aligned to sector
+                size. However it is recommended to align it to cluster size for
+                newly created images.
+
+              For "WithouFreSpacExt" images:
+              data_off must be non-zero and aligned to cluster size.
+
+  52 - 55:    flags
+              Miscellaneous flags.
+
+              Bit 0: Empty Image bit. If set, the image should be
+                     considered clear.
+
+              Bits 2-31: Unused.
+
+  56 - 63:    ext_off
+              Format Extension offset, an offset, in sectors, from the start of
+              the file to the start of the Format Extension Cluster.
+
+              ext_off must meet the same requirements as cluster offsets
+              defined by BAT entries (see below).
+
+
+== BAT ==
+
+BAT is placed immediately after the image header. In the file, BAT is a
+contiguous array of 32-bit unsigned little-endian integers with
+(bat_entries * 4) bytes size.
+
+Each BAT entry contains an offset from the start of the file to the
+corresponding cluster. The offset set in clusters for "WithouFreSpacExt" images
+and in sectors for "WithoutFreeSpace" images.
+
+If a BAT entry is zero, the corresponding cluster is not allocated and should
+be considered as filled with zeroes.
+
+Cluster offsets specified by BAT entries must meet the following requirements:
+    - the value must not be lower than data offset (provided by header.data_off
+      or calculated as specified above),
+    - the value must be lower than the desired file size,
+    - the value must be unique among all BAT entries,
+    - the result of (cluster offset - data offset) must be aligned to cluster
+      size.
+
+
+== Data Area ==
+
+The data area is an area from the data offset (provided by header.data_off or
+calculated as specified above) to the end of the file. It represents a
+contiguous array of clusters. Most of them are allocated by the BAT, some may
+be allocated by the ext_off field in the header while other may be allocated by
+extensions. All clusters allocated by ext_off and extensions should meet the
+same requirements as clusters specified by BAT entries.
+
+
+== Format Extension ==
+
+The Format Extension is an area 1 cluster in size that provides additional
+format features. This cluster is addressed by the ext_off field in the header.
+The format of the Format Extension area is the following:
+
+   0 -  7:    magic
+              Must be 0xAB234CEF23DCEA87
+
+   8 - 23:    m_CheckSum
+              The MD5 checksum of the entire Header Extension cluster except
+              the first 24 bytes.
+
+    The above are followed by feature sections or "extensions". The last
+    extension must be "End of features" (see below).
+
+Each feature section has the following format:
+
+   0 -  7:    magic
+              The identifier of the feature:
+              0x0000000000000000 - End of features
+              0x20385FAE252CB34A - Dirty bitmap
+
+   8 - 15:    flags
+              External flags for extension:
+
+              Bit 0: NECESSARY
+                     If the software cannot load the extension (due to an
+                     unknown magic number or error), the file should not be
+                     changed. If this flag is unset and there is an error on
+                     loading the extension, said extension should be dropped.
+
+              Bit 1: TRANSIT
+                     If there is an unknown extension with this flag set,
+                     said extension should be left as is.
+
+              If neither NECESSARY nor TRANSIT are set, the extension should be
+              dropped.
+
+  16 - 19:    data_size
+              The size of the following feature data, in bytes.
+
+  20 - 23:    unused32
+              Align header to 8 bytes boundary.
+
+  variable:   data (data_size bytes)
+
+    The above is followed by padding to the next 8 bytes boundary, then the
+    next extension starts.
+
+    The last extension must be "End of features" with all the fields set to 0.
+
+
+=== Dirty bitmaps feature ===
+
+This feature provides a way of storing dirty bitmaps in the image. The fields
+of its data area are:
+
+   0 -  7:    size
+              The bitmap size, should be equal to disk size in sectors.
+
+   8 - 23:    id
+              An identifier for backup consistency checking.
+
+  24 - 27:    granularity
+              Bitmap granularity, in sectors. I.e., the number of sectors
+              corresponding to one bit of the bitmap. Granularity must be
+              a power of 2.
+
+  28 - 31:    l1_size
+              The number of entries in the L1 table of the bitmap.
+
+  variable:   l1 (64 * l1_size bytes)
+              L1 offset table (in bytes)
+
+A dirty bitmap is stored using a one-level structure for the mapping to host
+clusters - an L1 table.
+
+Given an offset in bytes into the bitmap data, the offset in bytes into the
+image file can be obtained as follows:
+
+    offset = l1_table[offset / cluster_size] + (offset % cluster_size)
+
+If an L1 table entry is 0, the corresponding cluster of the bitmap is assumed
+to be zero.
+
+If an L1 table entry is 1, the corresponding cluster of the bitmap is assumed
+to have all bits set.
+
+If an L1 table entry is not 0 or 1, it allocates a cluster from the data area.
--- a/docs/specs/vhost-user.txt
+++ b/docs/specs/vhost-user.txt
@@ -87,6 +87,14 @@ Depending on the request type, payload can be:
   User address: a 64-bit user address
   mmap offset: 64-bit offset where region starts in the mapped memory

+* Log description
+   ---------------------------
+   | log size | log offset |
+   ---------------------------
+   log size: size of area used for logging
+   log offset: offset from start of supplied file descriptor
+       where logging starts (i.e. where guest address 0 would be logged)
+
 In QEMU the vhost-user message is implemented with the following struct:

 typedef struct VhostUserMsg {
@@ -98,6 +106,7 @@ typedef struct VhostUserMsg {
        struct vhost_vring_state state;
        struct vhost_vring_addr addr;
        VhostUserMemory memory;
+        VhostUserLog log;
    };
 } QEMU_PACKED VhostUserMsg;

@@ -115,11 +124,13 @@ the ones that do:
 * VHOST_GET_FEATURES
 * VHOST_GET_PROTOCOL_FEATURES
 * VHOST_GET_VRING_BASE
+ * VHOST_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD)

 There are several messages that the master sends with file descriptors passed
 in the ancillary data:

 * VHOST_SET_MEM_TABLE
+ * VHOST_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD)
 * VHOST_SET_LOG_FD
 * VHOST_SET_VRING_KICK
 * VHOST_SET_VRING_CALL
@@ -135,13 +146,45 @@ As older slaves don't support negotiating protocol features,
 a feature bit was dedicated for this purpose:
 #define VHOST_USER_F_PROTOCOL_FEATURES 30

+Starting and stopping rings
+----------------------
+Client must only process each ring when it is started.
+
+Client must only pass data between the ring and the
+backend, when the ring is enabled.
+
+If ring is started but disabled, client must process the
+ring without talking to the backend.
+
+For example, for a networking device, in the disabled state
+client must not supply any new RX packets, but must process
+and discard any TX packets.
+
+If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, the ring is initialized
+in an enabled state.
+
+If VHOST_USER_F_PROTOCOL_FEATURES has been negotiated, the ring is initialized
+in a disabled state. Client must not pass data to/from the backend until ring is enabled by
+VHOST_USER_SET_VRING_ENABLE with parameter 1, or after it has been disabled by
+VHOST_USER_SET_VRING_ENABLE with parameter 0.
+
+Each ring is initialized in a stopped state, client must not process it until
+ring is started, or after it has been stopped.
+
+Client must start ring upon receiving a kick (that is, detecting that file
+descriptor is readable) on the descriptor specified by
+VHOST_USER_SET_VRING_KICK, and stop ring upon receiving
+VHOST_USER_GET_VRING_BASE.
+
+While processing the rings (whether they are enabled or not), client must
+support changing some configuration aspects on the fly.
+
 Multiple queue support
 ----------------------

 Multiple queue is treated as a protocol extension, hence the slave has to
 implement protocol features first. The multiple queues feature is supported
-only when the protocol feature VHOST_USER_PROTOCOL_F_MQ (bit 0) is set:
-#define VHOST_USER_PROTOCOL_F_MQ    0
+only when the protocol feature VHOST_USER_PROTOCOL_F_MQ (bit 0) is set.

 The max number of queues the slave supports can be queried with message
 VHOST_USER_GET_PROTOCOL_FEATURES. Master should stop when the number of
@@ -152,6 +195,66 @@ queue in the sent message to identify a specified queue. One queue pair
 is enabled initially. More queues are enabled dynamically, by sending
 message VHOST_USER_SET_VRING_ENABLE.

+Migration
+---------
+
+During live migration, the master may need to track the modifications
+the slave makes to the memory mapped regions. The client should mark
+the dirty pages in a log. Once it complies to this logging, it may
+declare the VHOST_F_LOG_ALL vhost feature.
+
+To start/stop logging of data/used ring writes, server may send messages
+VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and VHOST_USER_SET_VRING_ADDR with
+VHOST_VRING_F_LOG in ring's flags set to 1/0, respectively.
+
+All the modifications to memory pointed by vring "descriptor" should
+be marked. Modifications to "used" vring should be marked if
+VHOST_VRING_F_LOG is part of ring's flags.
+
+Dirty pages are of size:
+#define VHOST_LOG_PAGE 0x1000
+
+The log memory fd is provided in the ancillary data of
+VHOST_USER_SET_LOG_BASE message when the slave has
+VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol feature.
+
+The size of the log is supplied as part of VhostUserMsg
+which should be large enough to cover all known guest
+addresses. Log starts at the supplied offset in the
+supplied file descriptor.
+The log covers from address 0 to the maximum of guest
+regions. In pseudo-code, to mark page at "addr" as dirty:
+
+page = addr / VHOST_LOG_PAGE
+log[page / 8] |= 1 << page % 8
+
+Where addr is the guest physical address.
+
+Use atomic operations, as the log may be concurrently manipulated.
+
+Note that when logging modifications to the used ring (when VHOST_VRING_F_LOG
+is set for this ring), log_guest_addr should be used to calculate the log
+offset: the write to first byte of the used ring is logged at this offset from
+log start. Also note that this value might be outside the legal guest physical
+address range (i.e. does not have to be covered by the VhostUserMemory table),
+but the bit offset of the last byte of the ring must fall within
+the size supplied by VhostUserLog.
+
+VHOST_USER_SET_LOG_FD is an optional message with an eventfd in
+ancillary data, it may be used to inform the master that the log has
+been modified.
+
+Once the source has finished migration, rings will be stopped by
+the source. No further update must be done before rings are
+restarted.
+
+Protocol features
+-----------------
+
+#define VHOST_USER_PROTOCOL_F_MQ             0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD      1
+#define VHOST_USER_PROTOCOL_F_RARP           2
+
 Message types
 -------------

@@ -211,14 +314,16 @@ Message types
      as an owner of the session. This can be used on the Slave as a
      "session start" flag.

- * VHOST_USER_RESET_DEVICE
+ * VHOST_USER_RESET_OWNER

      Id: 4
-      Equivalent ioctl: VHOST_RESET_DEVICE
      Master payload: N/A

-      Issued when a new connection is about to be closed. The Master will no
-      longer own this connection (and will usually close it).
+      This is no longer used. Used to be sent to request disabling
+      all rings, but some clients interpreted it to also discard
+      connection state (this interpretation would lead to bugs).
+      It is recommended that clients either ignore this message,
+      or use it to disable all rings.

 * VHOST_USER_SET_MEM_TABLE

@@ -236,8 +341,14 @@ Message types
      Id: 6
      Equivalent ioctl: VHOST_SET_LOG_BASE
      Master payload: u64
+      Slave payload: N/A
+
+      Sets logging shared memory space.
+      When slave has VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol
+      feature, the log memory fd is provided in the ancillary data of
+      VHOST_USER_SET_LOG_BASE message, the size and offset of shared
+      memory area provided in the message.

-      Sets the logging base address.

 * VHOST_USER_SET_LOG_FD

@@ -337,3 +448,19 @@ Message types
      Master payload: vring state description

      Signal slave to enable or disable corresponding vring.
+      This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES
+      has been negotiated.
+
+ * VHOST_USER_SEND_RARP
+
+      Id: 19
+      Equivalent ioctl: N/A
+      Master payload: u64
+
+      Ask vhost user backend to broadcast a fake RARP to notify the migration
+      is terminated for guest that does not support GUEST_ANNOUNCE.
+      Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in
+      VHOST_USER_GET_FEATURES and protocol feature bit VHOST_USER_PROTOCOL_F_RARP
+      is present in VHOST_USER_GET_PROTOCOL_FEATURES.
+      The first 6 bytes of the payload contain the mac address of the guest to
+      allow the vhost user backend to construct and broadcast the fake RARP.
--- a/docs/virtio-migration.txt
+++ b/docs/virtio-migration.txt
@@ -0,0 +1,106 @@
+Virtio devices and migration
+============================
+
+Copyright 2015 IBM Corp.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.  See
+the COPYING file in the top-level directory.
+
+Saving and restoring the state of virtio devices is a bit of a twisty maze,
+for several reasons:
+- state is distributed between several parts:
+  - virtio core, for common fields like features, number of queues, ...
+  - virtio transport (pci, ccw, ...), for the different proxy devices and
+    transport specific state (msix vectors, indicators, ...)
+  - virtio device (net, blk, ...), for the different device types and their
+    state (mac address, request queue, ...)
+- most fields are saved via the stream interface; subsequently, subsections
+  have been added to make cross-version migration possible
+
+This file attempts to document the current procedure and point out some
+caveats.
+
+
+Save state procedure
+====================
+
+virtio core               virtio transport          virtio device
+-----------               ----------------          -------------
+
+                                                    save() function registered
+                                                    via register_savevm()
+virtio_save()                                       <----------
+             ------>      save_config()
+                          - save proxy device
+                          - save transport-specific
+                            device fields
+- save common device
+  fields
+- save common virtqueue
+  fields
+             ------>      save_queue()
+                          - save transport-specific
+                            virtqueue fields
+             ------>                               save_device()
+                                                   - save device-specific
+                                                     fields
+- save subsections
+  - device endianness,
+    if changed from
+    default endianness
+  - 64 bit features, if
+    any high feature bit
+    is set
+  - virtio-1 virtqueue
+    fields, if VERSION_1
+    is set
+
+
+Load state procedure
+====================
+
+virtio core               virtio transport          virtio device
+-----------               ----------------          -------------
+
+                                                    load() function registered
+                                                    via register_savevm()
+virtio_load()                                       <----------
+             ------>      load_config()
+                          - load proxy device
+                          - load transport-specific
+                            device fields
+- load common device
+  fields
+- load common virtqueue
+  fields
+             ------>      load_queue()
+                          - load transport-specific
+                            virtqueue fields
+- notify guest
+             ------>                               load_device()
+                                                   - load device-specific
+                                                     fields
+- load subsections
+  - device endianness
+  - 64 bit features
+  - virtio-1 virtqueue
+    fields
+- sanitize endianness
+- sanitize features
+- virtqueue index sanity
+  check
+                                                   - feature-dependent setup
+
+
+Implications of this setup
+==========================
+
+Devices need to be careful in their state processing during load: The
+load_device() procedure is invoked by the core before subsections have
+been loaded. Any code that depends on information transmitted in subsections
+therefore has to be invoked in the device's load() function _after_
+virtio_load() returned (like e.g. code depending on features).
+
+Any extension of the state being migrated should be done in subsections
+added to the core for compatibility reasons. If transport or device specific
+state is added, core needs to invoke a callback from the new subsection.
--- a/docs/writing-qmp-commands.txt
+++ b/docs/writing-qmp-commands.txt
@@ -210,7 +210,7 @@ if you don't see these strings, then something went wrong.
 === Errors ===

 QMP commands should use the error interface exported by the error.h header
-file. Basically, errors are set by calling the error_set() function.
+file. Basically, most errors are set by calling the error_setg() function.

 Let's say we don't accept the string "message" to contain the word "love". If
 it does contain it, we want the "hello-world" command to return an error:
@@ -219,8 +219,7 @@ void qmp_hello_world(bool has_message, const char *message, Error **errp)
 {
    if (has_message) {
        if (strstr(message, "love")) {
-            error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                      "the word 'love' is not allowed");
+            error_setg(errp, "the word 'love' is not allowed");
            return;
        }
        printf("%s\n", message);
@@ -229,10 +228,8 @@ void qmp_hello_world(bool has_message, const char *message, Error **errp)
    }
 }

-The first argument to the error_set() function is the Error pointer to pointer,
-which is passed to all QMP functions. The second argument is a ErrorClass
-value, which should be ERROR_CLASS_GENERIC_ERROR most of the time (more
-details about error classes are given below). The third argument is a human
+The first argument to the error_setg() function is the Error pointer
+to pointer, which is passed to all QMP functions. The next argument is a human
 description of the error, this is a free-form printf-like string.

 Let's test the example above. Build qemu, run it as defined in the "Testing"
@@ -249,8 +246,9 @@ The QMP server's response should be:
    }
 }

-As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR. There
-are two exceptions to this rule:
+As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR
+(done by default when using error_setg()). There are two exceptions to
+this rule:

 1. A non-generic ErrorClass value exists* for the failure you want to report
    (eg. DeviceNotFound)
@@ -259,8 +257,8 @@ are two exceptions to this rule:
    want to report, hence you have to add a new ErrorClass value so that they
    can check for it

-If the failure you want to report doesn't fall in one of the two cases above,
-just report ERROR_CLASS_GENERIC_ERROR.
+If the failure you want to report falls into one of the two cases above,
+use error_set() with a second argument of an ErrorClass value.

 * All existing ErrorClass values are defined in the qapi-schema.json file

--- a/exec.c
+++ b/exec.c
@@ -50,11 +50,15 @@
 #include "qemu/rcu_queue.h"
 #include "qemu/main-loop.h"
 #include "translate-all.h"
+#include "sysemu/replay.h"

 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"

 #include "qemu/range.h"
+#ifndef _WIN32
+#include "qemu/mmap-alloc.h"
+#endif

 //#define DEBUG_SUBPAGE

@@ -84,9 +88,6 @@ static MemoryRegion io_mem_unassigned;
 */
 #define RAM_RESIZEABLE (1 << 2)

-/* An extra page is mapped on top of this RAM.
- */
-#define RAM_EXTRA (1 << 3)
 #endif

 struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
@@ -389,18 +390,6 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x
    return section;
 }

-static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
-{
-    if (memory_region_is_ram(mr)) {
-        return !(is_write && mr->readonly);
-    }
-    if (memory_region_is_romd(mr)) {
-        return !is_write;
-    }
-
-    return false;
-}
-
 /* Called from RCU critical section */
 MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
                                      hwaddr *xlat, hwaddr *plen,
@@ -869,7 +858,7 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...)
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
-    if (qemu_log_enabled()) {
+    if (qemu_log_separate()) {
        qemu_log("qemu: fatal: ");
        qemu_log_vprintf(fmt, ap2);
        qemu_log("\n");
@@ -879,6 +868,7 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...)
    }
    va_end(ap2);
    va_end(ap);
+    replay_finish();
 #if defined(CONFIG_USER_ONLY)
    {
        struct sigaction act;
@@ -898,7 +888,7 @@ static RAMBlock *qemu_get_ram_block(ram_addr_t addr)

    block = atomic_rcu_read(&ram_list.mru_block);
    if (block && addr - block->offset < block->max_length) {
-        goto found;
+        return block;
    }
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (addr - block->offset < block->max_length) {
@@ -1059,9 +1049,11 @@ static uint16_t phys_section_add(PhysPageMap *map,

 static void phys_section_destroy(MemoryRegion *mr)
 {
+    bool have_sub_page = mr->subpage;
+
    memory_region_unref(mr);

-    if (mr->subpage) {
+    if (have_sub_page) {
        subpage_t *subpage = container_of(mr, subpage_t, iomem);
        object_unref(OBJECT(&subpage->iomem));
        g_free(subpage);
@@ -1191,9 +1183,6 @@ static long gethugepagesize(const char *path, Error **errp)
        return 0;
    }

-    if (fs.f_type != HUGETLBFS_MAGIC)
-        fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
-
    return fs.f_bsize;
 }

@@ -1202,16 +1191,14 @@ static void *file_ram_alloc(RAMBlock *block,
                            const char *path,
                            Error **errp)
 {
+    struct stat st;
    char *filename;
    char *sanitized_name;
    char *c;
-    void *ptr;
-    void *area = NULL;
+    void *area;
    int fd;
    uint64_t hpagesize;
-    uint64_t total;
    Error *local_err = NULL;
-    size_t offset;

    hpagesize = gethugepagesize(path, &local_err);
    if (local_err) {
@@ -1233,29 +1220,35 @@ static void *file_ram_alloc(RAMBlock *block,
        goto error;
    }

-    /* Make name safe to use with mkstemp by replacing '/' with '_'. */
-    sanitized_name = g_strdup(memory_region_name(block->mr));
-    for (c = sanitized_name; *c != '\0'; c++) {
-        if (*c == '/')
-            *c = '_';
+    if (!stat(path, &st) && S_ISDIR(st.st_mode)) {
+        /* Make name safe to use with mkstemp by replacing '/' with '_'. */
+        sanitized_name = g_strdup(memory_region_name(block->mr));
+        for (c = sanitized_name; *c != '\0'; c++) {
+            if (*c == '/') {
+                *c = '_';
+            }
+        }
+
+        filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
+                                   sanitized_name);
+        g_free(sanitized_name);
+
+        fd = mkstemp(filename);
+        if (fd >= 0) {
+            unlink(filename);
+        }
+        g_free(filename);
+    } else {
+        fd = open(path, O_RDWR | O_CREAT, 0644);
    }

-    filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
-                               sanitized_name);
-    g_free(sanitized_name);
-
-    fd = mkstemp(filename);
    if (fd < 0) {
        error_setg_errno(errp, errno,
                         "unable to create backing store for hugepages");
-        g_free(filename);
        goto error;
    }
-    unlink(filename);
-    g_free(filename);

    memory = ROUND_UP(memory, hpagesize);
-    total = memory + hpagesize;

    /*
     * ftruncate is not supported by hugetlbfs in older
@@ -1267,40 +1260,14 @@ static void *file_ram_alloc(RAMBlock *block,
        perror("ftruncate");
    }

-    ptr = mmap(0, total, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS,
-                -1, 0);
-    if (ptr == MAP_FAILED) {
-        error_setg_errno(errp, errno,
-                         "unable to allocate memory range for hugepages");
-        close(fd);
-        goto error;
-    }
-
-    offset = QEMU_ALIGN_UP((uintptr_t)ptr, hpagesize) - (uintptr_t)ptr;
-
-    area = mmap(ptr + offset, memory, PROT_READ | PROT_WRITE,
-                (block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE) |
-                MAP_FIXED,
-                fd, 0);
+    area = qemu_ram_mmap(fd, memory, hpagesize, block->flags & RAM_SHARED);
    if (area == MAP_FAILED) {
        error_setg_errno(errp, errno,
                         "unable to map backing store for hugepages");
-        munmap(ptr, total);
        close(fd);
        goto error;
    }

-    if (offset > 0) {
-        munmap(ptr, offset);
-    }
-    ptr += offset;
-    total -= offset;
-
-    if (total > memory + getpagesize()) {
-        munmap(ptr + memory + getpagesize(),
-               total - memory - getpagesize());
-    }
-
    if (mem_prealloc) {
        os_mem_prealloc(fd, area, memory);
    }
@@ -1309,10 +1276,6 @@ static void *file_ram_alloc(RAMBlock *block,
    return area;

 error:
-    if (mem_prealloc) {
-        error_report("%s", error_get_pretty(*errp));
-        exit(1);
-    }
    return NULL;
 }
 #endif
@@ -1398,6 +1361,11 @@ static RAMBlock *find_ram_block(ram_addr_t addr)
    return NULL;
 }

+const char *qemu_ram_get_idstr(RAMBlock *rb)
+{
+    return rb->idstr;
+}
+
 /* Called with iothread lock held.  */
 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
 {
@@ -1468,7 +1436,7 @@ int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)

    assert(block);

-    newsize = TARGET_PAGE_ALIGN(newsize);
+    newsize = HOST_PAGE_ALIGN(newsize);

    if (block->used_length == newsize) {
        return 0;
@@ -1612,13 +1580,12 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
        return -1;
    }

-    size = TARGET_PAGE_ALIGN(size);
+    size = HOST_PAGE_ALIGN(size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->used_length = size;
    new_block->max_length = size;
    new_block->flags = share ? RAM_SHARED : 0;
-    new_block->flags |= RAM_EXTRA;
    new_block->host = file_ram_alloc(new_block, size,
                                     mem_path, errp);
    if (!new_block->host) {
@@ -1648,8 +1615,8 @@ ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
    ram_addr_t addr;
    Error *local_err = NULL;

-    size = TARGET_PAGE_ALIGN(size);
-    max_size = TARGET_PAGE_ALIGN(max_size);
+    size = HOST_PAGE_ALIGN(size);
+    max_size = HOST_PAGE_ALIGN(max_size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->resized = resized;
@@ -1693,25 +1660,6 @@ ram_addr_t qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
 }

-void qemu_ram_free_from_ptr(ram_addr_t addr)
-{
-    RAMBlock *block;
-
-    qemu_mutex_lock_ramlist();
-    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-        if (addr == block->offset) {
-            QLIST_REMOVE_RCU(block, next);
-            ram_list.mru_block = NULL;
-            /* Write list before version */
-            smp_wmb();
-            ram_list.version++;
-            g_free_rcu(block, rcu);
-            break;
-        }
-    }
-    qemu_mutex_unlock_ramlist();
-}
-
 static void reclaim_ramblock(RAMBlock *block)
 {
    if (block->flags & RAM_PREALLOC) {
@@ -1720,11 +1668,7 @@ static void reclaim_ramblock(RAMBlock *block)
        xen_invalidate_map_cache_entry(block->host);
 #ifndef _WIN32
    } else if (block->fd >= 0) {
-        if (block->flags & RAM_EXTRA) {
-            munmap(block->host, block->max_length + getpagesize());
-        } else {
-            munmap(block->host, block->max_length);
-        }
+        qemu_ram_munmap(block->host, block->max_length);
        close(block->fd);
 #endif
    } else {
@@ -1830,19 +1774,11 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
 * or address_space_rw instead. For local memory (e.g. video ram) that the
 * device owns, use memory_region_get_ram_ptr.
 *
- * By the time this function returns, the returned pointer is not protected
- * by RCU anymore.  If the caller is not within an RCU critical section and
- * does not hold the iothread lock, it must have other means of protecting the
- * pointer, such as a reference to the region that includes the incoming
- * ram_addr_t.
+ * Called within RCU critical section.
 */
 void *qemu_get_ram_ptr(ram_addr_t addr)
 {
-    RAMBlock *block;
-    void *ptr;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
+    RAMBlock *block = qemu_get_ram_block(addr);

    if (xen_enabled() && block->host == NULL) {
        /* We need to check if the requested address is in the RAM
@@ -1850,56 +1786,56 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
         * In that case just map until the end of the page.
         */
        if (block->offset == 0) {
-            ptr = xen_map_cache(addr, 0, 0);
-            goto unlock;
+            return xen_map_cache(addr, 0, 0);
        }

        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }
-    ptr = ramblock_ptr(block, addr - block->offset);
-
-unlock:
-    rcu_read_unlock();
-    return ptr;
+    return ramblock_ptr(block, addr - block->offset);
 }

 /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
 * but takes a size argument.
 *
- * By the time this function returns, the returned pointer is not protected
- * by RCU anymore.  If the caller is not within an RCU critical section and
- * does not hold the iothread lock, it must have other means of protecting the
- * pointer, such as a reference to the region that includes the incoming
- * ram_addr_t.
+ * Called within RCU critical section.
 */
 static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
 {
-    void *ptr;
+    RAMBlock *block;
+    ram_addr_t offset_inside_block;
    if (*size == 0) {
        return NULL;
    }
-    if (xen_enabled()) {
-        return xen_map_cache(addr, *size, 1);
-    } else {
-        RAMBlock *block;
-        rcu_read_lock();
-        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-            if (addr - block->offset < block->max_length) {
-                if (addr - block->offset + *size > block->max_length)
-                    *size = block->max_length - addr + block->offset;
-                ptr = ramblock_ptr(block, addr - block->offset);
-                rcu_read_unlock();
-                return ptr;
-            }
+
+    block = qemu_get_ram_block(addr);
+    offset_inside_block = addr - block->offset;
+    *size = MIN(*size, block->max_length - offset_inside_block);
+
+    if (xen_enabled() && block->host == NULL) {
+        /* We need to check if the requested address is in the RAM
+         * because we don't want to map the entire memory in QEMU.
+         * In that case just map the requested area.
+         */
+        if (block->offset == 0) {
+            return xen_map_cache(addr, *size, 1);
        }

-        fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
-        abort();
+        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }
+
+    return ramblock_ptr(block, offset_inside_block);
 }

-/* Some of the softmmu routines need to translate from a host pointer
- * (typically a TLB entry) back to a ram offset.
+/*
+ * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
+ * in that RAMBlock.
+ *
+ * ptr: Host pointer to look up
+ * round_offset: If true round the result offset down to a page boundary
+ * *ram_addr: set to result ram_addr
+ * *offset: set to result offset within the RAMBlock
+ *
+ * Returns: RAMBlock (or NULL if not found)
 *
 * By the time this function returns, the returned pointer is not protected
 * by RCU anymore.  If the caller is not within an RCU critical section and
@@ -1907,18 +1843,22 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
 * pointer, such as a reference to the region that includes the incoming
 * ram_addr_t.
 */
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
+                                   ram_addr_t *ram_addr,
+                                   ram_addr_t *offset)
 {
    RAMBlock *block;
    uint8_t *host = ptr;
-    MemoryRegion *mr;

    if (xen_enabled()) {
        rcu_read_lock();
        *ram_addr = xen_ram_addr_from_mapcache(ptr);
-        mr = qemu_get_ram_block(*ram_addr)->mr;
+        block = qemu_get_ram_block(*ram_addr);
+        if (block) {
+            *offset = (host - block->host);
+        }
        rcu_read_unlock();
-        return mr;
+        return block;
    }

    rcu_read_lock();
@@ -1941,12 +1881,52 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
    return NULL;

 found:
-    *ram_addr = block->offset + (host - block->host);
-    mr = block->mr;
+    *offset = (host - block->host);
+    if (round_offset) {
+        *offset &= TARGET_PAGE_MASK;
+    }
+    *ram_addr = block->offset + *offset;
    rcu_read_unlock();
-    return mr;
+    return block;
 }

+/*
+ * Finds the named RAMBlock
+ *
+ * name: The name of RAMBlock to find
+ *
+ * Returns: RAMBlock (or NULL if not found)
+ */
+RAMBlock *qemu_ram_block_by_name(const char *name)
+{
+    RAMBlock *block;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        if (!strcmp(name, block->idstr)) {
+            return block;
+        }
+    }
+
+    return NULL;
+}
+
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+{
+    RAMBlock *block;
+    ram_addr_t offset; /* Not used */
+
+    block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset);
+
+    if (!block) {
+        return NULL;
+    }
+
+    return block->mr;
+}
+
+/* Called within RCU critical section.  */
 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
                               uint64_t val, unsigned size)
 {
@@ -2477,101 +2457,58 @@ static bool prepare_mmio_access(MemoryRegion *mr)
    return release_lock;
 }

-MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
-                             uint8_t *buf, int len, bool is_write)
+/* Called within RCU critical section.  */
+static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
+                                                MemTxAttrs attrs,
+                                                const uint8_t *buf,
+                                                int len, hwaddr addr1,
+                                                hwaddr l, MemoryRegion *mr)
 {
-    hwaddr l;
    uint8_t *ptr;
    uint64_t val;
-    hwaddr addr1;
-    MemoryRegion *mr;
    MemTxResult result = MEMTX_OK;
    bool release_lock = false;

-    rcu_read_lock();
-    while (len > 0) {
-        l = len;
-        mr = address_space_translate(as, addr, &addr1, &l, is_write);
-
-        if (is_write) {
-            if (!memory_access_is_direct(mr, is_write)) {
-                release_lock |= prepare_mmio_access(mr);
-                l = memory_access_size(mr, l, addr1);
-                /* XXX: could force current_cpu to NULL to avoid
-                   potential bugs */
-                switch (l) {
-                case 8:
-                    /* 64 bit write access */
-                    val = ldq_p(buf);
-                    result |= memory_region_dispatch_write(mr, addr1, val, 8,
-                                                           attrs);
-                    break;
-                case 4:
-                    /* 32 bit write access */
-                    val = ldl_p(buf);
-                    result |= memory_region_dispatch_write(mr, addr1, val, 4,
-                                                           attrs);
-                    break;
-                case 2:
-                    /* 16 bit write access */
-                    val = lduw_p(buf);
-                    result |= memory_region_dispatch_write(mr, addr1, val, 2,
-                                                           attrs);
-                    break;
-                case 1:
-                    /* 8 bit write access */
-                    val = ldub_p(buf);
-                    result |= memory_region_dispatch_write(mr, addr1, val, 1,
-                                                           attrs);
-                    break;
-                default:
-                    abort();
-                }
-            } else {
-                addr1 += memory_region_get_ram_addr(mr);
-                /* RAM case */
-                ptr = qemu_get_ram_ptr(addr1);
-                memcpy(ptr, buf, l);
-                invalidate_and_set_dirty(mr, addr1, l);
+    for (;;) {
+        if (!memory_access_is_direct(mr, true)) {
+            release_lock |= prepare_mmio_access(mr);
+            l = memory_access_size(mr, l, addr1);
+            /* XXX: could force current_cpu to NULL to avoid
+               potential bugs */
+            switch (l) {
+            case 8:
+                /* 64 bit write access */
+                val = ldq_p(buf);
+                result |= memory_region_dispatch_write(mr, addr1, val, 8,
+                                                       attrs);
+                break;
+            case 4:
+                /* 32 bit write access */
+                val = ldl_p(buf);
+                result |= memory_region_dispatch_write(mr, addr1, val, 4,
+                                                       attrs);
+                break;
+            case 2:
+                /* 16 bit write access */
+                val = lduw_p(buf);
+                result |= memory_region_dispatch_write(mr, addr1, val, 2,
+                                                       attrs);
+                break;
+            case 1:
+                /* 8 bit write access */
+                val = ldub_p(buf);
+                result |= memory_region_dispatch_write(mr, addr1, val, 1,
+                                                       attrs);
+                break;
+            default:
+                abort();
            }
        } else {
-            if (!memory_access_is_direct(mr, is_write)) {
-                /* I/O case */
-                release_lock |= prepare_mmio_access(mr);
-                l = memory_access_size(mr, l, addr1);
-                switch (l) {
-                case 8:
-                    /* 64 bit read access */
-                    result |= memory_region_dispatch_read(mr, addr1, &val, 8,
-                                                          attrs);
-                    stq_p(buf, val);
-                    break;
-                case 4:
-                    /* 32 bit read access */
-                    result |= memory_region_dispatch_read(mr, addr1, &val, 4,
-                                                          attrs);
-                    stl_p(buf, val);
-                    break;
-                case 2:
-                    /* 16 bit read access */
-                    result |= memory_region_dispatch_read(mr, addr1, &val, 2,
-                                                          attrs);
-                    stw_p(buf, val);
-                    break;
-                case 1:
-                    /* 8 bit read access */
-                    result |= memory_region_dispatch_read(mr, addr1, &val, 1,
-                                                          attrs);
-                    stb_p(buf, val);
-                    break;
-                default:
-                    abort();
-                }
-            } else {
-                /* RAM case */
-                ptr = qemu_get_ram_ptr(mr->ram_addr + addr1);
-                memcpy(buf, ptr, l);
-            }
+            addr1 += memory_region_get_ram_addr(mr);
+            /* RAM case */
+            ptr = qemu_get_ram_ptr(addr1);
+            memcpy(ptr, buf, l);
+            invalidate_and_set_dirty(mr, addr1, l);
        }

        if (release_lock) {
@@ -2582,8 +2519,14 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
        len -= l;
        buf += l;
        addr += l;
+
+        if (!len) {
+            break;
+        }
+
+        l = len;
+        mr = address_space_translate(as, addr, &addr1, &l, true);
    }
-    rcu_read_unlock();

    return result;
 }
@@ -2591,15 +2534,122 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
 MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                                const uint8_t *buf, int len)
 {
-    return address_space_rw(as, addr, attrs, (uint8_t *)buf, len, true);
+    hwaddr l;
+    hwaddr addr1;
+    MemoryRegion *mr;
+    MemTxResult result = MEMTX_OK;
+
+    if (len > 0) {
+        rcu_read_lock();
+        l = len;
+        mr = address_space_translate(as, addr, &addr1, &l, true);
+        result = address_space_write_continue(as, addr, attrs, buf, len,
+                                              addr1, l, mr);
+        rcu_read_unlock();
+    }
+
+    return result;
 }

-MemTxResult address_space_read(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
-                               uint8_t *buf, int len)
+/* Called within RCU critical section.  */
+MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
+                                        MemTxAttrs attrs, uint8_t *buf,
+                                        int len, hwaddr addr1, hwaddr l,
+                                        MemoryRegion *mr)
 {
-    return address_space_rw(as, addr, attrs, buf, len, false);
+    uint8_t *ptr;
+    uint64_t val;
+    MemTxResult result = MEMTX_OK;
+    bool release_lock = false;
+
+    for (;;) {
+        if (!memory_access_is_direct(mr, false)) {
+            /* I/O case */
+            release_lock |= prepare_mmio_access(mr);
+            l = memory_access_size(mr, l, addr1);
+            switch (l) {
+            case 8:
+                /* 64 bit read access */
+                result |= memory_region_dispatch_read(mr, addr1, &val, 8,
+                                                      attrs);
+                stq_p(buf, val);
+                break;
+            case 4:
+                /* 32 bit read access */
+                result |= memory_region_dispatch_read(mr, addr1, &val, 4,
+                                                      attrs);
+                stl_p(buf, val);
+                break;
+            case 2:
+                /* 16 bit read access */
+                result |= memory_region_dispatch_read(mr, addr1, &val, 2,
+                                                      attrs);
+                stw_p(buf, val);
+                break;
+            case 1:
+                /* 8 bit read access */
+                result |= memory_region_dispatch_read(mr, addr1, &val, 1,
+                                                      attrs);
+                stb_p(buf, val);
+                break;
+            default:
+                abort();
+            }
+        } else {
+            /* RAM case */
+            ptr = qemu_get_ram_ptr(mr->ram_addr + addr1);
+            memcpy(buf, ptr, l);
+        }
+
+        if (release_lock) {
+            qemu_mutex_unlock_iothread();
+            release_lock = false;
+        }
+
+        len -= l;
+        buf += l;
+        addr += l;
+
+        if (!len) {
+            break;
+        }
+
+        l = len;
+        mr = address_space_translate(as, addr, &addr1, &l, false);
+    }
+
+    return result;
 }

+MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
+                                    MemTxAttrs attrs, uint8_t *buf, int len)
+{
+    hwaddr l;
+    hwaddr addr1;
+    MemoryRegion *mr;
+    MemTxResult result = MEMTX_OK;
+
+    if (len > 0) {
+        rcu_read_lock();
+        l = len;
+        mr = address_space_translate(as, addr, &addr1, &l, false);
+        result = address_space_read_continue(as, addr, attrs, buf, len,
+                                             addr1, l, mr);
+        rcu_read_unlock();
+    }
+
+    return result;
+}
+
+MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
+                             uint8_t *buf, int len, bool is_write)
+{
+    if (is_write) {
+        return address_space_write(as, addr, attrs, (uint8_t *)buf, len);
+    } else {
+        return address_space_read(as, addr, attrs, (uint8_t *)buf, len);
+    }
+}

 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                            int len, int is_write)
@@ -2725,8 +2775,8 @@ void cpu_register_map_client(QEMUBH *bh)
 void cpu_exec_init_all(void)
 {
    qemu_mutex_init(&ram_list.mutex);
-    memory_map_init();
    io_mem_init();
+    memory_map_init();
    qemu_mutex_init(&map_client_list_lock);
 }

@@ -2791,6 +2841,7 @@ void *address_space_map(AddressSpace *as,
    hwaddr l, xlat, base;
    MemoryRegion *mr, *this_mr;
    ram_addr_t raddr;
+    void *ptr;

    if (len == 0) {
        return NULL;
@@ -2842,9 +2893,11 @@ void *address_space_map(AddressSpace *as,
    }

    memory_region_ref(mr);
-    rcu_read_unlock();
    *plen = done;
-    return qemu_ram_ptr_length(raddr + base, plen);
+    ptr = qemu_ram_ptr_length(raddr + base, plen);
+    rcu_read_unlock();
+
+    return ptr;
 }

 /* Unmaps a memory region previously mapped by address_space_map().
@@ -3523,6 +3576,16 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
    }
    return 0;
 }
+
+/*
+ * Allows code that needs to deal with migration bitmaps etc to still be built
+ * target independent.
+ */
+size_t qemu_target_page_bits(void)
+{
+    return TARGET_PAGE_BITS;
+}
+
 #endif

 /*
--- a/fsdev/virtfs-proxy-helper.c
+++ b/fsdev/virtfs-proxy-helper.c
@@ -1128,10 +1128,19 @@ int main(int argc, char **argv)
        }
    }

+    if (chdir("/") < 0) {
+        do_perror("chdir");
+        goto error;
+    }
+    if (chroot(rpath) < 0) {
+        do_perror("chroot");
+        goto error;
+    }
+
    get_version = false;
 #ifdef FS_IOC_GETVERSION
    /* check whether underlying FS support IOC_GETVERSION */
-    retval = statfs(rpath, &st_fs);
+    retval = statfs("/", &st_fs);
    if (!retval) {
        switch (st_fs.f_type) {
        case EXT2_SUPER_MAGIC:
@@ -1144,16 +1153,7 @@ int main(int argc, char **argv)
    }
 #endif

-    if (chdir("/") < 0) {
-        do_perror("chdir");
-        goto error;
-    }
-    if (chroot(rpath) < 0) {
-        do_perror("chroot");
-        goto error;
-    }
    umask(0);
-
    if (init_capabilities() < 0) {
        goto error;
    }
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -956,6 +956,13 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
        if (*p == ',')
            p++;
        len = strtoull(p, NULL, 16);
+
+        /* memtohex() doubles the required space */
+        if (len > MAX_PACKET_LENGTH / 2) {
+            put_packet (s, "E22");
+            break;
+        }
+
        if (target_memory_rw_debug(s->g_cpu, addr, mem_buf, len, false) != 0) {
            put_packet (s, "E14");
        } else {
@@ -970,6 +977,12 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
        len = strtoull(p, (char **)&p, 16);
        if (*p == ':')
            p++;
+
+        /* hextomem() reads 2*len bytes */
+        if (len > strlen(p) / 2) {
+            put_packet (s, "E22");
+            break;
+        }
        hextomem(mem_buf, p, len);
        if (target_memory_rw_debug(s->g_cpu, addr, mem_buf, len,
                                   true) != 0) {
@@ -1107,7 +1120,8 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
            cpu = find_cpu(thread);
            if (cpu != NULL) {
                cpu_synchronize_state(cpu);
-                len = snprintf((char *)mem_buf, sizeof(mem_buf),
+                /* memtohex() doubles the required space */
+                len = snprintf((char *)mem_buf, sizeof(buf) / 2,
                               "CPU#%d [%s]", cpu->cpu_index,
                               cpu->halted ? "halted " : "running");
                memtohex(buf, mem_buf, len);
@@ -1136,8 +1150,8 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
                put_packet(s, "E01");
                break;
            }
-            hextomem(mem_buf, p + 5, len);
            len = len / 2;
+            hextomem(mem_buf, p + 5, len);
            mem_buf[len++] = 0;
            qemu_chr_be_write(s->mon_chr, mem_buf, len);
            put_packet(s, "OK");
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -194,8 +194,8 @@ ETEXI

    {
        .name       = "change",
-        .args_type  = "device:B,target:F,arg:s?",
-        .params     = "device filename [format]",
+        .args_type  = "device:B,target:F,arg:s?,read-only-mode:s?",
+        .params     = "device filename [format [read-only-mode]]",
        .help       = "change a removable medium, optional format",
        .mhandler.cmd = hmp_change,
    },
@@ -206,7 +206,7 @@ STEXI
 Change the configuration of a device.

@table @option
-@item change @var{diskdevice} @var{filename} [@var{format}]
+@item change @var{diskdevice} @var{filename} [@var{format} [@var{read-only-mode}]]
 Change the medium for a removable disk device to point to @var{filename}. eg

@example
@@ -215,6 +215,20 @@ Change the medium for a removable disk device to point to @var{filename}. eg

@var{format} is optional.

+@var{read-only-mode} may be used to change the read-only status of the device.
+It accepts the following values:
+
+@table @var
+@item retain
+Retains the current status; this is the default.
+
+@item read-only
+Makes the device read-only.
+
+@item read-write
+Makes the device writable.
+@end table
+
@item change vnc @var{display},@var{options}
 Change the configuration of the VNC server. The valid syntax for @var{display}
 and @var{options} are described at @ref{sec_invocation}. eg
@@ -1005,6 +1019,23 @@ STEXI
@item migrate_set_parameter @var{parameter} @var{value}
@findex migrate_set_parameter
 Set the parameter @var{parameter} for migration.
+ETEXI
+
+    {
+        .name       = "migrate_start_postcopy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "Followup to a migration command to switch the migration"
+                      " to postcopy mode. The x-postcopy-ram capability must "
+                      "be set before the original migration command.",
+        .mhandler.cmd = hmp_migrate_start_postcopy,
+    },
+
+STEXI
+@item migrate_start_postcopy
+@findex migrate_start_postcopy
+Switch in-progress migration to postcopy mode. Ignored after the end of
+migration (or once already in postcopy).
 ETEXI

    {
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .4.50
 .5.50
				`@@ -0,0 +1 @@`
				`ivshmem-client-obj-y = ivshmem-client.o main.o`
				`@@ -0,0 +1 @@`
				`ivshmem-server-obj-y = ivshmem-server.o main.o`