Update version for v2.5.0-rc4 release

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
blockdev: Mark {insert, remove}-medium experimental
2015-12-11 16:37:55 +00:00 · 2015-12-11 15:39:29 +00:00 · 2015-12-11 12:51:27 +00:00 · 2015-12-10 13:50:45 +00:00 · 2015-12-10 11:19:18 +00:00 · 2015-12-10 11:17:25 +00:00
684 changed files with 35013 additions and 7731 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,8 @@
 /*-darwin-user
 /*-linux-user
 /*-bsd-user
+/ivshmem-client
+/ivshmem-server
 /libdis*
 /libuser
 /linux-headers/asm
--- a/76
+++ b/76
@@ -62,14 +62,22 @@ Guest CPU cores (TCG):
 ----------------------
 Overall
 L: qemu-devel@nongnu.org
-S: Odd fixes
+M: Paolo Bonzini <pbonzini@redhat.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
+M: Richard Henderson <rth@twiddle.net>
+S: Maintained
 F: cpu-exec.c
+F: cpu-exec-common.c
+F: cpus.c
 F: cputlb.c
+F: exec.c
 F: softmmu_template.h
-F: translate-all.c
-F: include/exec/cpu_ldst.h
-F: include/exec/cpu_ldst_template.h
+F: translate-all.*
+F: translate-common.c
+F: include/exec/cpu*.h
+F: include/exec/exec-all.h
 F: include/exec/helper*.h
+F: include/exec/tb-hash.h

 Alpha
 M: Richard Henderson <rth@twiddle.net>
@@ -81,6 +89,7 @@ F: disas/alpha.c

 ARM
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: target-arm/
 F: hw/arm/
@@ -216,6 +225,7 @@ F: */kvm.*

 ARM
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: target-arm/kvm.c

@@ -235,9 +245,14 @@ M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Alexander Graf <agraf@suse.de>
 S: Maintained
 F: target-s390x/kvm.c
+F: target-s390x/ioinst.[ch]
+F: target-s390x/machine.c
 F: hw/intc/s390_flic.c
 F: hw/intc/s390_flic_kvm.c
 F: include/hw/s390x/s390_flic.h
+F: gdb-xml/s390*.xml
+T: git git://github.com/cohuck/qemu.git s390-next
+T: git git://github.com/borntraeger/qemu.git s390-next

 X86
 M: Paolo Bonzini <pbonzini@redhat.com>
@@ -287,6 +302,7 @@ ARM Machines
 ------------
 Allwinner-a10
 M: Beniamino Galvani <b.galvani@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/allwinner*
 F: include/hw/*/allwinner*
@@ -294,6 +310,7 @@ F: hw/arm/cubieboard.c

 ARM PrimeCell
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/char/pl011.c
 F: hw/display/pl110*
@@ -308,6 +325,7 @@ F: include/hw/arm/primecell.h

 ARM cores
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/intc/arm*
 F: hw/intc/gic_internal.h
@@ -327,54 +345,64 @@ M: Evgeny Voevodin <e.voevodin@samsung.com>
 M: Maksim Kozlov <m.kozlov@samsung.com>
 M: Igor Mitsyanko <i.mitsyanko@gmail.com>
 M: Dmitry Solodkiy <d.solodkiy@samsung.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/exynos*

 Calxeda Highbank
 M: Rob Herring <robh@kernel.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/highbank.c
 F: hw/net/xgmac.c

 Canon DIGIC
 M: Antony Pavlov <antonynpavlov@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: include/hw/arm/digic.h
 F: hw/*/digic*

 Gumstix
 L: qemu-devel@nongnu.org
+L: qemu-arm@nongnu.org
 S: Orphan
 F: hw/arm/gumstix.c

 i.MX31
 M: Peter Chubb <peter.chubb@nicta.com.au>
+L: qemu-arm@nongnu.org
 S: Odd fixes
 F: hw/*/imx*
 F: hw/arm/kzm.c

 Integrator CP
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/integratorcp.c

 Musicpal
 M: Jan Kiszka <jan.kiszka@web.de>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/musicpal.c

 nSeries
 M: Andrzej Zaborowski <balrogg@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/nseries.c

 Palm
 M: Andrzej Zaborowski <balrogg@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/palm.c

 Real View
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/realview*
 F: hw/intc/realview_gic.c
@@ -382,6 +410,7 @@ F: include/hw/intc/realview_gic.h

 PXA2XX
 M: Andrzej Zaborowski <balrogg@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/mainstone.c
 F: hw/arm/spitz.c
@@ -391,17 +420,20 @@ F: hw/*/pxa2xx*

 Stellaris
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/stellaris*

 Versatile PB
 M: Peter Maydell <peter.maydell@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/versatile*

 Xilinx Zynq
 M: Alistair Francis <alistair.francis@xilinx.com>
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/xilinx_zynq.c
 F: hw/misc/zynq_slcr.c
@@ -411,6 +443,7 @@ F: hw/ssi/xilinx_spips.c
 Xilinx ZynqMP
 M: Alistair Francis <alistair.francis@xilinx.com>
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/xlnx-zynqmp.c
 F: hw/arm/xlnx-ep108.c
@@ -419,6 +452,7 @@ F: include/hw/arm/xlnx-zynqmp.h
 ARM ACPI Subsystem
 M: Shannon Zhao <zhaoshenglong@huawei.com>
 M: Shannon Zhao <shannon.zhao@linaro.org>
+L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/virt-acpi-build.c
 F: include/hw/arm/virt-acpi-build.h
@@ -616,15 +650,13 @@ M: Christian Borntraeger <borntraeger@de.ibm.com>
 M: Alexander Graf <agraf@suse.de>
 S: Supported
 F: hw/char/sclp*.[hc]
-F: hw/s390x/s390-virtio-ccw.c
-F: hw/s390x/css.[hc]
-F: hw/s390x/sclp*.[hc]
-F: hw/s390x/ipl*.[hc]
-F: hw/s390x/*pci*.[hc]
-F: hw/s390x/s390-skeys*.c
+F: hw/s390x/
+X: hw/s390x/s390-virtio-bus.[ch]
 F: include/hw/s390x/
 F: pc-bios/s390-ccw/
-T: git git://github.com/cohuck/qemu virtio-ccw-upstr
+F: hw/watchdog/wdt_diag288.c
+T: git git://github.com/cohuck/qemu.git s390-next
+T: git git://github.com/borntraeger/qemu.git s390-next

 UniCore32 Machines
 -------------
@@ -831,6 +863,7 @@ F: net/vhost-user.c

 virtio-9p
 M: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+M: Greg Kurz <gkurz@linux.vnet.ibm.com>
 S: Supported
 F: hw/9pfs/
 F: fsdev/
@@ -851,7 +884,8 @@ M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Christian Borntraeger <borntraeger@de.ibm.com>
 S: Supported
 F: hw/s390x/virtio-ccw.[hc]
-T: git git://github.com/cohuck/qemu virtio-ccw-upstr
+T: git git://github.com/cohuck/qemu.git s390-next
+T: git git://github.com/borntraeger/qemu.git s390-next

 virtio-input
 M: Gerd Hoffmann <kraxel@redhat.com>
@@ -1017,6 +1051,7 @@ S: Supported
 F: include/exec/ioport.h
 F: ioport.c
 F: include/exec/memory.h
+F: include/exec/ram_addr.h
 F: memory.c
 F: include/exec/memory-internal.h
 F: exec.c
@@ -1139,6 +1174,8 @@ F: include/qom/
 X: include/qom/cpu.h
 F: qom/
 X: qom/cpu.c
+F: tests/check-qom-interface.c
+F: tests/check-qom-proplist.c
 F: tests/qom-test.c

 QMP
@@ -1193,6 +1230,19 @@ F: crypto/
 F: include/crypto/
 F: tests/test-crypto-*

+Coroutines
+M: Stefan Hajnoczi <stefanha@redhat.com>
+M: Kevin Wolf <kwolf@redhat.com>
+F: util/*coroutine*
+F: include/qemu/coroutine*
+F: tests/test-coroutine.c
+
+Buffers
+M: Daniel P. Berrange <berrange@redhat.com>
+S: Odd fixes
+F: util/buffer.c
+F: include/qemu/buffer.h
+
 Usermode Emulation
 ------------------
 Overall
@@ -1222,6 +1272,7 @@ AArch64 target
 M: Claudio Fontana <claudio.fontana@huawei.com>
 M: Claudio Fontana <claudio.fontana@gmail.com>
 S: Maintained
+L: qemu-arm@nongnu.org
 F: tcg/aarch64/
 F: disas/arm-a64.cc
 F: disas/libvixl/
@@ -1229,6 +1280,7 @@ F: disas/libvixl/
 ARM target
 M: Andrzej Zaborowski <balrogg@gmail.com>
 S: Maintained
+L: qemu-arm@nongnu.org
 F: tcg/arm/
 F: disas/arm.c

--- a/23
+++ b/23
@@ -151,6 +151,8 @@ dummy := $(call unnest-vars,, \
                stub-obj-y \
                util-obj-y \
                qga-obj-y \
+                ivshmem-client-obj-y \
+                ivshmem-server-obj-y \
                qga-vss-dll-obj-y \
                block-obj-y \
                block-obj-m \
@@ -298,18 +300,15 @@ $(qapi-modules) $(SRC_PATH)/scripts/qapi-introspect.py $(qapi-py)
 QGALIB_GEN=$(addprefix qga/qapi-generated/, qga-qapi-types.h qga-qapi-visit.h qga-qmp-commands.h)
 $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)

-# we require QGA_VSS_PROVIDER files to be built alongside qemu-ga
-# executable since they are shipped together, but we don't want to actually
-# link against them
-qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a $(QGA_VSS_PROVIDER)
-	$(call LINK, $(filter-out $(QGA_VSS_PROVIDER), $^))
+qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a
+	$(call LINK, $^)

 ifdef QEMU_GA_MSI_ENABLED
 QEMU_GA_MSI=qemu-ga-$(ARCH).msi

 msi: $(QEMU_GA_MSI)

-$(QEMU_GA_MSI): qemu-ga.exe
+$(QEMU_GA_MSI): qemu-ga.exe $(QGA_VSS_PROVIDER)

 $(QEMU_GA_MSI): config-host.mak

@@ -321,6 +320,16 @@ msi:
 	@echo "MSI build not configured or dependency resolution failed (reconfigure with --enable-guest-agent-msi option)"
 endif

+ifneq ($(EXESUF),)
+.PHONY: qemu-ga
+qemu-ga: qemu-ga$(EXESUF) $(QGA_VSS_PROVIDER) $(QEMU_GA_MSI)
+endif
+
+ivshmem-client$(EXESUF): $(ivshmem-client-obj-y)
+	$(call LINK, $^)
+ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) libqemuutil.a libqemustub.a
+	$(call LINK, $^)
+
 clean:
 # avoid old build problems by removing potentially incorrect old files
 	rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h gen-op-arm.h
@@ -431,7 +440,7 @@ endif
 install: all $(if $(BUILD_DOCS),install-doc) \
 install-datadir install-localstatedir
 ifneq ($(TOOLS),)
-	$(call install-prog,$(TOOLS),$(DESTDIR)$(bindir))
+	$(call install-prog,$(subst qemu-ga,qemu-ga$(EXESUF),$(TOOLS)),$(DESTDIR)$(bindir))
 endif
 ifneq ($(CONFIG_MODULES),)
 	$(INSTALL_DIR) "$(DESTDIR)$(qemu_moddir)"
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -15,10 +15,6 @@ block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
 block-obj-y += qemu-io-cmds.o

-block-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
-block-obj-y += qemu-coroutine-sleep.o
-block-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
-
 block-obj-m = block/

 #######################################################################
@@ -58,6 +54,8 @@ common-obj-y += audio/
 common-obj-y += hw/
 common-obj-y += accel.o

+common-obj-y += replay/
+
 common-obj-y += ui/
 common-obj-y += bt-host.o bt-vhci.o
 bt-host.o-cflags := $(BLUEZ_CFLAGS)
@@ -108,3 +106,8 @@ target-obj-y += trace/
 # by libqemuutil.a.  These should be moved to a separate .json schema.
 qga-obj-y = qga/
 qga-vss-dll-obj-y = qga/
+
+######################################################################
+# contrib
+ivshmem-client-obj-y = contrib/ivshmem-client/
+ivshmem-server-obj-y = contrib/ivshmem-server/
--- a/2
+++ b/2
@@ -1 +1 @@
-2.4.50
+2.4.94
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif

 struct AioHandler
 {
@@ -25,9 +28,166 @@ struct AioHandler
    IOHandler *io_write;
    int deleted;
    void *opaque;
+    bool is_external;
    QLIST_ENTRY(AioHandler) node;
 };

+#ifdef CONFIG_EPOLL
+
+/* The fd number threashold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+static void aio_epoll_disable(AioContext *ctx)
+{
+    ctx->epoll_available = false;
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    ctx->epoll_enabled = false;
+    close(ctx->epollfd);
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
+static bool aio_epoll_try_enable(AioContext *ctx)
+{
+    AioHandler *node;
+    struct epoll_event event;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        int r;
+        if (node->deleted || !node->pfd.events) {
+            continue;
+        }
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        event.data.ptr = node;
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+        if (r) {
+            return false;
+        }
+    }
+    ctx->epoll_enabled = true;
+    return true;
+}
+
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    struct epoll_event event;
+    int r;
+
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    if (!node->pfd.events) {
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
+        if (r) {
+            aio_epoll_disable(ctx);
+        }
+    } else {
+        event.data.ptr = node;
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        if (is_new) {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+            if (r) {
+                aio_epoll_disable(ctx);
+            }
+        } else {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, node->pfd.fd, &event);
+            if (r) {
+                aio_epoll_disable(ctx);
+            }
+        }
+    }
+}
+
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    AioHandler *node;
+    int i, ret = 0;
+    struct epoll_event events[128];
+
+    assert(npfd == 1);
+    assert(pfds[0].fd == ctx->epollfd);
+    if (timeout > 0) {
+        ret = qemu_poll_ns(pfds, npfd, timeout);
+    }
+    if (timeout <= 0 || ret > 0) {
+        ret = epoll_wait(ctx->epollfd, events,
+                         sizeof(events) / sizeof(events[0]),
+                         timeout);
+        if (ret <= 0) {
+            goto out;
+        }
+        for (i = 0; i < ret; i++) {
+            int ev = events[i].events;
+            node = events[i].data.ptr;
+            node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+                (ev & EPOLLOUT ? G_IO_OUT : 0) |
+                (ev & EPOLLHUP ? G_IO_HUP : 0) |
+                (ev & EPOLLERR ? G_IO_ERR : 0);
+        }
+    }
+out:
+    return ret;
+}
+
+static bool aio_epoll_enabled(AioContext *ctx)
+{
+    /* Fall back to ppoll when external clients are disabled. */
+    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
+}
+
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                                 unsigned npfd, int64_t timeout)
+{
+    if (!ctx->epoll_available) {
+        return false;
+    }
+    if (aio_epoll_enabled(ctx)) {
+        return true;
+    }
+    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+        if (aio_epoll_try_enable(ctx)) {
+            return true;
+        } else {
+            aio_epoll_disable(ctx);
+        }
+    }
+    return false;
+}
+
+#else
+
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+}
+
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    assert(false);
+}
+
+static bool aio_epoll_enabled(AioContext *ctx)
+{
+    return false;
+}
+
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                          unsigned npfd, int64_t timeout)
+{
+    return false;
+}
+
+#endif
+
 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 {
    AioHandler *node;
@@ -43,11 +203,14 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)

 void aio_set_fd_handler(AioContext *ctx,
                        int fd,
+                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
 {
    AioHandler *node;
+    bool is_new = false;
+    bool deleted = false;

    node = find_aio_handler(ctx, fd);

@@ -66,7 +229,7 @@ void aio_set_fd_handler(AioContext *ctx,
                 * releasing the walking_handlers lock.
                 */
                QLIST_REMOVE(node, node);
-                g_free(node);
+                deleted = true;
            }
        }
    } else {
@@ -77,25 +240,32 @@ void aio_set_fd_handler(AioContext *ctx,
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
+            is_new = true;
        }
        /* Update handler with latest information */
        node->io_read = io_read;
        node->io_write = io_write;
        node->opaque = opaque;
+        node->is_external = is_external;

        node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
    }

+    aio_epoll_update(ctx, node, is_new);
    aio_notify(ctx);
+    if (deleted) {
+        g_free(node);
+    }
 }

 void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
+                            bool is_external,
                            EventNotifierHandler *io_read)
 {
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
-                       (IOHandler *)io_read, NULL, notifier);
+                       is_external, (IOHandler *)io_read, NULL, notifier);
 }

 bool aio_prepare(AioContext *ctx)
@@ -257,7 +427,9 @@ bool aio_poll(AioContext *ctx, bool blocking)

    /* fill pollfds */
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
-        if (!node->deleted && node->pfd.events) {
+        if (!node->deleted && node->pfd.events
+            && !aio_epoll_enabled(ctx)
+            && aio_node_check(ctx, node->is_external)) {
            add_pollfd(node);
        }
    }
@@ -268,7 +440,17 @@ bool aio_poll(AioContext *ctx, bool blocking)
    if (timeout) {
        aio_context_release(ctx);
    }
-    ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout);
+    if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
+        AioHandler epoll_handler;
+
+        epoll_handler.pfd.fd = ctx->epollfd;
+        epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
+        npfd = 0;
+        add_pollfd(&epoll_handler);
+        ret = aio_epoll(ctx, pollfds, npfd, timeout);
+    } else  {
+        ret = qemu_poll_ns(pollfds, npfd, timeout);
+    }
    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
    }
@@ -297,3 +479,16 @@ bool aio_poll(AioContext *ctx, bool blocking)

    return progress;
 }
+
+void aio_context_setup(AioContext *ctx, Error **errp)
+{
+#ifdef CONFIG_EPOLL
+    assert(!ctx->epollfd);
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd == -1) {
+        ctx->epoll_available = false;
+    } else {
+        ctx->epoll_available = true;
+    }
+#endif
+}
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -28,11 +28,13 @@ struct AioHandler {
    GPollFD pfd;
    int deleted;
    void *opaque;
+    bool is_external;
    QLIST_ENTRY(AioHandler) node;
 };

 void aio_set_fd_handler(AioContext *ctx,
                        int fd,
+                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
@@ -86,6 +88,7 @@ void aio_set_fd_handler(AioContext *ctx,
        node->opaque = opaque;
        node->io_read = io_read;
        node->io_write = io_write;
+        node->is_external = is_external;

        event = event_notifier_get_handle(&ctx->notifier);
        WSAEventSelect(node->pfd.fd, event,
@@ -98,6 +101,7 @@ void aio_set_fd_handler(AioContext *ctx,

 void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *e,
+                            bool is_external,
                            EventNotifierHandler *io_notify)
 {
    AioHandler *node;
@@ -133,6 +137,7 @@ void aio_set_event_notifier(AioContext *ctx,
            node->e = e;
            node->pfd.fd = (uintptr_t)event_notifier_get_handle(e);
            node->pfd.events = G_IO_IN;
+            node->is_external = is_external;
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
@@ -304,7 +309,8 @@ bool aio_poll(AioContext *ctx, bool blocking)
    /* fill fd sets */
    count = 0;
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
-        if (!node->deleted && node->io_notify) {
+        if (!node->deleted && node->io_notify
+            && aio_node_check(ctx, node->is_external)) {
            events[count++] = event_notifier_get_handle(node->e);
        }
    }
@@ -363,3 +369,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
    aio_context_release(ctx);
    return progress;
 }
+
+void aio_context_setup(AioContext *ctx, Error **errp)
+{
+}
--- a/async.c
+++ b/async.c
@@ -59,6 +59,11 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
    return bh;
 }

+void aio_bh_call(QEMUBH *bh)
+{
+    bh->cb(bh->opaque);
+}
+
 /* Multiple occurrences of aio_bh_poll cannot be called concurrently */
 int aio_bh_poll(AioContext *ctx)
 {
@@ -84,7 +89,7 @@ int aio_bh_poll(AioContext *ctx)
                ret = 1;
            }
            bh->idle = 0;
-            bh->cb(bh->opaque);
+            aio_bh_call(bh);
        }
    }

@@ -247,7 +252,7 @@ aio_ctx_finalize(GSource     *source)
    }
    qemu_mutex_unlock(&ctx->bh_lock);

-    aio_set_event_notifier(ctx, &ctx->notifier, NULL);
+    aio_set_event_notifier(ctx, &ctx->notifier, false, NULL);
    event_notifier_cleanup(&ctx->notifier);
    rfifolock_destroy(&ctx->lock);
    qemu_mutex_destroy(&ctx->bh_lock);
@@ -320,15 +325,22 @@ AioContext *aio_context_new(Error **errp)
 {
    int ret;
    AioContext *ctx;
+    Error *local_err = NULL;
+
    ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
+    aio_context_setup(ctx, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        goto fail;
+    }
    ret = event_notifier_init(&ctx->notifier, false);
    if (ret < 0) {
-        g_source_destroy(&ctx->source);
        error_setg_errno(errp, -ret, "Failed to initialize event notifier");
-        return NULL;
+        goto fail;
    }
    g_source_set_can_recurse(&ctx->source, true);
    aio_set_event_notifier(ctx, &ctx->notifier,
+                           false,
                           (EventNotifierHandler *)
                           event_notifier_dummy_cb);
    ctx->thread_pool = NULL;
@@ -339,6 +351,9 @@ AioContext *aio_context_new(Error **errp)
    ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);

    return ctx;
+fail:
+    g_source_destroy(&ctx->source);
+    return NULL;
 }

 void aio_context_ref(AioContext *ctx)
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -313,9 +313,11 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
        assert(maxnode <= MAX_NODES);
        if (mbind(ptr, sz, backend->policy,
                  maxnode ? backend->host_nodes : NULL, maxnode + 1, flags)) {
-            error_setg_errno(errp, errno,
-                             "cannot bind memory to host NUMA nodes");
-            return;
+            if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
+                error_setg_errno(errp, errno,
+                                 "cannot bind memory to host NUMA nodes");
+                return;
+            }
        }
 #endif
        /* Preallocate memory after the NUMA policy has been instantiated.
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,17 @@
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
 static void *balloon_opaque;
+static bool balloon_inhibited;
+
+bool qemu_balloon_is_inhibited(void)
+{
+    return balloon_inhibited;
+}
+
+void qemu_balloon_inhibit(bool state)
+{
+    balloon_inhibited = state;
+}

 static bool have_balloon(Error **errp)
 {
--- a/block.c
+++ b/block.c
@@ -33,7 +33,7 @@
 #include "sysemu/block-backend.h"
 #include "sysemu/sysemu.h"
 #include "qemu/notify.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "block/qapi.h"
 #include "qmp-commands.h"
 #include "qemu/timer.h"
@@ -73,8 +73,7 @@ struct BdrvDirtyBitmap {

 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

-static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
-    QTAILQ_HEAD_INITIALIZER(bdrv_states);
+struct BdrvStates bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states);

 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
@@ -257,7 +256,6 @@ BlockDriverState *bdrv_new(void)
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
-    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
@@ -857,7 +855,6 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
        goto fail_opts;
    }

-    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
@@ -1081,6 +1078,10 @@ static int bdrv_fill_options(QDict **options, const char **pfilename,
        }
    }

+    if (runstate_check(RUN_STATE_INMIGRATE)) {
+        *flags |= BDRV_O_INCOMING;
+    }
+
    return 0;
 }

@@ -1392,6 +1393,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
    BlockDriverState *bs;
    BlockDriver *drv = NULL;
    const char *drvname;
+    const char *backing;
    Error *local_err = NULL;
    int snapshot_flags = 0;

@@ -1459,6 +1461,12 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,

    assert(drvname || !(flags & BDRV_O_PROTOCOL));

+    backing = qdict_get_try_str(options, "backing");
+    if (backing && *backing == '\0') {
+        flags |= BDRV_O_NO_BACKING;
+        qdict_del(options, "backing");
+    }
+
    bs->open_flags = flags;
    bs->options = options;
    options = qdict_clone_shallow(options);
@@ -1793,8 +1801,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
-        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
-                  strerror(-ret));
+        error_setg_errno(errp, -ret, "Error flushing drive");
        goto error;
    }

@@ -1899,7 +1906,7 @@ void bdrv_close(BlockDriverState *bs)
    }

    /* Disable I/O limits and drain all pending throttled requests */
-    if (bs->io_limits_enabled) {
+    if (bs->throttle_state) {
        bdrv_io_limits_disable(bs);
    }

@@ -1908,6 +1915,10 @@ void bdrv_close(BlockDriverState *bs)
    bdrv_drain(bs); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

+    if (bs->blk) {
+        blk_dev_change_media_cb(bs->blk, false);
+    }
+
    if (bs->drv) {
        BdrvChild *child, *next;

@@ -1946,10 +1957,6 @@ void bdrv_close(BlockDriverState *bs)
        bs->full_open_options = NULL;
    }

-    if (bs->blk) {
-        blk_dev_change_media_cb(bs->blk, false);
-    }
-
    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
@@ -1998,19 +2005,10 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
    /* move some fields that need to stay attached to the device */

    /* dev info */
-    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

-    /* r/w error */
-    bs_dest->on_read_error      = bs_src->on_read_error;
-    bs_dest->on_write_error     = bs_src->on_write_error;
-
-    /* i/o status */
-    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
-    bs_dest->iostatus           = bs_src->iostatus;
-
    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
 }
@@ -2497,82 +2495,6 @@ void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
    *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
 }

-void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
-                       BlockdevOnError on_write_error)
-{
-    bs->on_read_error = on_read_error;
-    bs->on_write_error = on_write_error;
-}
-
-BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
-{
-    return is_read ? bs->on_read_error : bs->on_write_error;
-}
-
-BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
-{
-    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
-
-    switch (on_err) {
-    case BLOCKDEV_ON_ERROR_ENOSPC:
-        return (error == ENOSPC) ?
-               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
-    case BLOCKDEV_ON_ERROR_STOP:
-        return BLOCK_ERROR_ACTION_STOP;
-    case BLOCKDEV_ON_ERROR_REPORT:
-        return BLOCK_ERROR_ACTION_REPORT;
-    case BLOCKDEV_ON_ERROR_IGNORE:
-        return BLOCK_ERROR_ACTION_IGNORE;
-    default:
-        abort();
-    }
-}
-
-static void send_qmp_error_event(BlockDriverState *bs,
-                                 BlockErrorAction action,
-                                 bool is_read, int error)
-{
-    IoOperationType optype;
-
-    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
-    qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
-                                   bdrv_iostatus_is_enabled(bs),
-                                   error == ENOSPC, strerror(error),
-                                   &error_abort);
-}
-
-/* This is done by device models because, while the block layer knows
- * about the error, it does not know whether an operation comes from
- * the device or the block layer (from a job, for example).
- */
-void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
-                       bool is_read, int error)
-{
-    assert(error >= 0);
-
-    if (action == BLOCK_ERROR_ACTION_STOP) {
-        /* First set the iostatus, so that "info block" returns an iostatus
-         * that matches the events raised so far (an additional error iostatus
-         * is fine, but not a lost one).
-         */
-        bdrv_iostatus_set_err(bs, error);
-
-        /* Then raise the request to stop the VM and the event.
-         * qemu_system_vmstop_request_prepare has two effects.  First,
-         * it ensures that the STOP event always comes after the
-         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
-         * can observe the STOP event and do a "cont" before the STOP
-         * event is issued, the VM will not stop.  In this case, vm_start()
-         * also ensures that the STOP/RESUME pair of events is emitted.
-         */
-        qemu_system_vmstop_request_prepare();
-        send_qmp_error_event(bs, action, is_read, error);
-        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
-    } else {
-        send_qmp_error_event(bs, action, is_read, error);
-    }
-}
-
 int bdrv_is_read_only(BlockDriverState *bs)
 {
    return bs->read_only;
@@ -2766,7 +2688,12 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
        blk = blk_by_name(device);

        if (blk) {
-            return blk_bs(blk);
+            bs = blk_bs(blk);
+            if (!bs) {
+                error_setg(errp, "Device '%s' has no medium", device);
+            }
+
+            return bs;
        }
    }

@@ -3136,15 +3063,23 @@ void bdrv_invalidate_cache_all(Error **errp)
 /**
 * Return TRUE if the media is present
 */
-int bdrv_is_inserted(BlockDriverState *bs)
+bool bdrv_is_inserted(BlockDriverState *bs)
 {
    BlockDriver *drv = bs->drv;
+    BdrvChild *child;

-    if (!drv)
-        return 0;
-    if (!drv->bdrv_is_inserted)
-        return 1;
-    return drv->bdrv_is_inserted(bs);
+    if (!drv) {
+        return false;
+    }
+    if (drv->bdrv_is_inserted) {
+        return drv->bdrv_is_inserted(bs);
+    }
+    QLIST_FOREACH(child, &bs->children, next) {
+        if (!bdrv_is_inserted(child->bs)) {
+            return false;
+        }
+    }
+    return true;
 }

 /**
@@ -3195,11 +3130,6 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked)
    }
 }

-void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
-{
-    bs->guest_block_size = align;
-}
-
 BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
 {
    BdrvDirtyBitmap *bm;
@@ -3474,10 +3404,25 @@ void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
 }

-void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out)
 {
    assert(bdrv_dirty_bitmap_enabled(bitmap));
-    hbitmap_reset_all(bitmap->bitmap);
+    if (!out) {
+        hbitmap_reset_all(bitmap->bitmap);
+    } else {
+        HBitmap *backup = bitmap->bitmap;
+        bitmap->bitmap = hbitmap_alloc(bitmap->size,
+                                       hbitmap_granularity(backup));
+        *out = backup;
+    }
+}
+
+void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
+{
+    HBitmap *tmp = bitmap->bitmap;
+    assert(bdrv_dirty_bitmap_enabled(bitmap));
+    bitmap->bitmap = in;
+    hbitmap_free(tmp);
 }

 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
@@ -3597,46 +3542,6 @@ bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
    return true;
 }

-void bdrv_iostatus_enable(BlockDriverState *bs)
-{
-    bs->iostatus_enabled = true;
-    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
-}
-
-/* The I/O status is only enabled if the drive explicitly
- * enables it _and_ the VM is configured to stop on errors */
-bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
-{
-    return (bs->iostatus_enabled &&
-           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
-            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
-            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
-}
-
-void bdrv_iostatus_disable(BlockDriverState *bs)
-{
-    bs->iostatus_enabled = false;
-}
-
-void bdrv_iostatus_reset(BlockDriverState *bs)
-{
-    if (bdrv_iostatus_is_enabled(bs)) {
-        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
-        if (bs->job) {
-            block_job_iostatus_reset(bs->job);
-        }
-    }
-}
-
-void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
-{
-    assert(bdrv_iostatus_is_enabled(bs));
-    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
-        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
-                                         BLOCK_DEVICE_IO_STATUS_FAILED;
-    }
-}
-
 void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
@@ -3821,7 +3726,7 @@ void bdrv_detach_aio_context(BlockDriverState *bs)
        baf->detach_aio_context(baf->opaque);
    }

-    if (bs->io_limits_enabled) {
+    if (bs->throttle_state) {
        throttle_timers_detach_aio_context(&bs->throttle_timers);
    }
    if (bs->drv->bdrv_detach_aio_context) {
@@ -3857,7 +3762,7 @@ void bdrv_attach_aio_context(BlockDriverState *bs,
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
-    if (bs->io_limits_enabled) {
+    if (bs->throttle_state) {
        throttle_timers_attach_aio_context(&bs->throttle_timers, new_context);
    }

@@ -4148,14 +4053,3 @@ void bdrv_refresh_filename(BlockDriverState *bs)
        QDECREF(json);
    }
 }
-
-/* This accessor function purpose is to allow the device models to access the
- * BlockAcctStats structure embedded inside a BlockDriverState without being
- * aware of the BlockDriverState structure layout.
- * It will go away when the BlockAcctStats structure will be moved inside
- * the device models.
- */
-BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
-{
-    return &bs->stats;
-}
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -2,6 +2,7 @@
 * QEMU System Emulator block accounting
 *
 * Copyright (c) 2011 Christoph Hellwig
+ * Copyright (c) 2015 Igalia, S.L.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +26,54 @@
 #include "block/accounting.h"
 #include "block/block_int.h"
 #include "qemu/timer.h"
+#include "sysemu/qtest.h"
+
+static QEMUClockType clock_type = QEMU_CLOCK_REALTIME;
+static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000;
+
+void block_acct_init(BlockAcctStats *stats, bool account_invalid,
+                     bool account_failed)
+{
+    stats->account_invalid = account_invalid;
+    stats->account_failed = account_failed;
+
+    if (qtest_enabled()) {
+        clock_type = QEMU_CLOCK_VIRTUAL;
+    }
+}
+
+void block_acct_cleanup(BlockAcctStats *stats)
+{
+    BlockAcctTimedStats *s, *next;
+    QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) {
+        g_free(s);
+    }
+}
+
+void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length)
+{
+    BlockAcctTimedStats *s;
+    unsigned i;
+
+    s = g_new0(BlockAcctTimedStats, 1);
+    s->interval_length = interval_length;
+    QSLIST_INSERT_HEAD(&stats->intervals, s, entries);
+
+    for (i = 0; i < BLOCK_MAX_IOTYPE; i++) {
+        timed_average_init(&s->latency[i], clock_type,
+                           (uint64_t) interval_length * NANOSECONDS_PER_SECOND);
+    }
+}
+
+BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats,
+                                              BlockAcctTimedStats *s)
+{
+    if (s == NULL) {
+        return QSLIST_FIRST(&stats->intervals);
+    } else {
+        return QSLIST_NEXT(s, entries);
+    }
+}

 void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
                      int64_t bytes, enum BlockAcctType type)
@@ -32,26 +81,69 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
    assert(type < BLOCK_MAX_IOTYPE);

    cookie->bytes = bytes;
-    cookie->start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    cookie->start_time_ns = qemu_clock_get_ns(clock_type);
    cookie->type = type;
 }

 void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie)
 {
+    BlockAcctTimedStats *s;
+    int64_t time_ns = qemu_clock_get_ns(clock_type);
+    int64_t latency_ns = time_ns - cookie->start_time_ns;
+
+    if (qtest_enabled()) {
+        latency_ns = qtest_latency_ns;
+    }
+
    assert(cookie->type < BLOCK_MAX_IOTYPE);

    stats->nr_bytes[cookie->type] += cookie->bytes;
    stats->nr_ops[cookie->type]++;
-    stats->total_time_ns[cookie->type] +=
-        qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - cookie->start_time_ns;
+    stats->total_time_ns[cookie->type] += latency_ns;
+    stats->last_access_time_ns = time_ns;
+
+    QSLIST_FOREACH(s, &stats->intervals, entries) {
+        timed_average_account(&s->latency[cookie->type], latency_ns);
+    }
 }

-
-void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
-                               unsigned int nb_sectors)
+void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie)
 {
-    if (stats->wr_highest_sector < sector_num + nb_sectors - 1) {
-        stats->wr_highest_sector = sector_num + nb_sectors - 1;
+    assert(cookie->type < BLOCK_MAX_IOTYPE);
+
+    stats->failed_ops[cookie->type]++;
+
+    if (stats->account_failed) {
+        BlockAcctTimedStats *s;
+        int64_t time_ns = qemu_clock_get_ns(clock_type);
+        int64_t latency_ns = time_ns - cookie->start_time_ns;
+
+        if (qtest_enabled()) {
+            latency_ns = qtest_latency_ns;
+        }
+
+        stats->total_time_ns[cookie->type] += latency_ns;
+        stats->last_access_time_ns = time_ns;
+
+        QSLIST_FOREACH(s, &stats->intervals, entries) {
+            timed_average_account(&s->latency[cookie->type], latency_ns);
+        }
+    }
+}
+
+void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    /* block_acct_done() and block_acct_failed() update
+     * total_time_ns[], but this one does not. The reason is that
+     * invalid requests are accounted during their submission,
+     * therefore there's no actual I/O involved. */
+
+    stats->invalid_ops[type]++;
+
+    if (stats->account_invalid) {
+        stats->last_access_time_ns = qemu_clock_get_ns(clock_type);
    }
 }

@@ -61,3 +153,20 @@ void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
    assert(type < BLOCK_MAX_IOTYPE);
    stats->merged[type] += num_requests;
 }
+
+int64_t block_acct_idle_time_ns(BlockAcctStats *stats)
+{
+    return qemu_clock_get_ns(clock_type) - stats->last_access_time_ns;
+}
+
+double block_acct_queue_depth(BlockAcctTimedStats *stats,
+                              enum BlockAcctType type)
+{
+    uint64_t sum, elapsed;
+
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    sum = timed_average_sum(&stats->latency[type], &elapsed);
+
+    return (double) sum / elapsed;
+}
--- a/block/backup.c
+++ b/block/backup.c
@@ -21,6 +21,7 @@
 #include "block/blockjob.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
+#include "sysemu/block-backend.h"

 #define BACKUP_CLUSTER_BITS 16
 #define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
@@ -131,7 +132,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        if (is_write_notifier) {
-            ret = bdrv_co_no_copy_on_readv(bs,
+            ret = bdrv_co_readv_no_serialising(bs,
                                           start * BACKUP_SECTORS_PER_CLUSTER,
                                           n, &bounce_qiov);
        } else {
@@ -215,7 +216,41 @@ static void backup_iostatus_reset(BlockJob *job)
 {
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

-    bdrv_iostatus_reset(s->target);
+    if (s->target->blk) {
+        blk_iostatus_reset(s->target->blk);
+    }
+}
+
+static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
+{
+    BdrvDirtyBitmap *bm;
+    BlockDriverState *bs = job->common.bs;
+
+    if (ret < 0 || block_job_is_cancelled(&job->common)) {
+        /* Merge the successor back into the parent, delete nothing. */
+        bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
+        assert(bm);
+    } else {
+        /* Everything is fine, delete this bitmap and install the backup. */
+        bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
+        assert(bm);
+    }
+}
+
+static void backup_commit(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    if (s->sync_bitmap) {
+        backup_cleanup_sync_bitmap(s, 0);
+    }
+}
+
+static void backup_abort(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    if (s->sync_bitmap) {
+        backup_cleanup_sync_bitmap(s, -1);
+    }
 }

 static const BlockJobDriver backup_job_driver = {
@@ -223,6 +258,8 @@ static const BlockJobDriver backup_job_driver = {
    .job_type       = BLOCK_JOB_TYPE_BACKUP,
    .set_speed      = backup_set_speed,
    .iostatus_reset = backup_iostatus_reset,
+    .commit         = backup_commit,
+    .abort          = backup_abort,
 };

 static BlockErrorAction backup_error_action(BackupBlockJob *job,
@@ -360,8 +397,10 @@ static void coroutine_fn backup_run(void *opaque)
    job->bitmap = hbitmap_alloc(end, 0);

    bdrv_set_enable_write_cache(target, true);
-    bdrv_set_on_error(target, on_target_error, on_target_error);
-    bdrv_iostatus_enable(target);
+    if (target->blk) {
+        blk_set_on_error(target->blk, on_target_error, on_target_error);
+        blk_iostatus_enable(target->blk);
+    }

    bdrv_add_before_write_notifier(bs, &before_write);

@@ -436,22 +475,11 @@ static void coroutine_fn backup_run(void *opaque)
    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
-
-    if (job->sync_bitmap) {
-        BdrvDirtyBitmap *bm;
-        if (ret < 0 || block_job_is_cancelled(&job->common)) {
-            /* Merge the successor back into the parent, delete nothing. */
-            bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
-            assert(bm);
-        } else {
-            /* Everything is fine, delete this bitmap and install the backup. */
-            bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
-            assert(bm);
-        }
-    }
    hbitmap_free(job->bitmap);

-    bdrv_iostatus_disable(target);
+    if (target->blk) {
+        blk_iostatus_disable(target->blk);
+    }
    bdrv_op_unblock_all(target, job->common.blocker);

    data = g_malloc(sizeof(*data));
@@ -465,7 +493,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockCompletionFunc *cb, void *opaque,
-                  Error **errp)
+                  BlockJobTxn *txn, Error **errp)
 {
    int64_t len;

@@ -480,7 +508,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        !bdrv_iostatus_is_enabled(bs)) {
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }
@@ -547,6 +575,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
                       sync_bitmap : NULL;
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
+    block_job_txn_add_job(txn, &job->common);
    qemu_coroutine_enter(job->common.co, job);
    return;

--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -30,6 +30,7 @@
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qstring.h"
+#include "sysemu/qtest.h"

 typedef struct BDRVBlkdebugState {
    int state;
@@ -583,9 +584,13 @@ static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule)
    remove_rule(rule);
    QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next);

-    printf("blkdebug: Suspended request '%s'\n", r.tag);
+    if (!qtest_enabled()) {
+        printf("blkdebug: Suspended request '%s'\n", r.tag);
+    }
    qemu_coroutine_yield();
-    printf("blkdebug: Resuming request '%s'\n", r.tag);
+    if (!qtest_enabled()) {
+        printf("blkdebug: Resuming request '%s'\n", r.tag);
+    }

    QLIST_REMOVE(&r, next);
    g_free(r.tag);
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -12,12 +12,17 @@

 #include "sysemu/block-backend.h"
 #include "block/block_int.h"
+#include "block/blockjob.h"
+#include "block/throttle-groups.h"
 #include "sysemu/blockdev.h"
+#include "sysemu/sysemu.h"
 #include "qapi-event.h"

 /* Number of coroutines to reserve per attached device model */
 #define COROUTINE_POOL_RESERVATION 64

+static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
+
 struct BlockBackend {
    char *name;
    int refcnt;
@@ -29,15 +34,31 @@ struct BlockBackend {
    /* TODO change to DeviceState when all users are qdevified */
    const BlockDevOps *dev_ops;
    void *dev_opaque;
+
+    /* the block size for which the guest device expects atomicity */
+    int guest_block_size;
+
+    /* If the BDS tree is removed, some of its options are stored here (which
+     * can be used to restore those options in the new BDS on insert) */
+    BlockBackendRootState root_state;
+
+    /* I/O stats (display with "info blockstats"). */
+    BlockAcctStats stats;
+
+    BlockdevOnError on_read_error, on_write_error;
+    bool iostatus_enabled;
+    BlockDeviceIoStatus iostatus;
 };

 typedef struct BlockBackendAIOCB {
    BlockAIOCB common;
    QEMUBH *bh;
+    BlockBackend *blk;
    int ret;
 } BlockBackendAIOCB;

 static const AIOCBInfo block_backend_aiocb_info = {
+    .get_aio_context = blk_aiocb_get_aio_context,
    .aiocb_size = sizeof(BlockBackendAIOCB),
 };

@@ -145,12 +166,17 @@ static void blk_delete(BlockBackend *blk)
        bdrv_unref(blk->bs);
        blk->bs = NULL;
    }
+    if (blk->root_state.throttle_state) {
+        g_free(blk->root_state.throttle_group);
+        throttle_group_unref(blk->root_state.throttle_state);
+    }
    /* Avoid double-remove after blk_hide_on_behalf_of_hmp_drive_del() */
    if (blk->name[0]) {
        QTAILQ_REMOVE(&blk_backends, blk, link);
    }
    g_free(blk->name);
    drive_info_del(blk->legacy_dinfo);
+    block_acct_cleanup(&blk->stats);
    g_free(blk);
 }

@@ -164,6 +190,11 @@ static void drive_info_del(DriveInfo *dinfo)
    g_free(dinfo);
 }

+int blk_get_refcnt(BlockBackend *blk)
+{
+    return blk ? blk->refcnt : 0;
+}
+
 /*
 * Increment @blk's reference count.
 * @blk must not be null.
@@ -308,6 +339,29 @@ void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend *blk)
    }
 }

+/*
+ * Disassociates the currently associated BlockDriverState from @blk.
+ */
+void blk_remove_bs(BlockBackend *blk)
+{
+    blk_update_root_state(blk);
+
+    blk->bs->blk = NULL;
+    bdrv_unref(blk->bs);
+    blk->bs = NULL;
+}
+
+/*
+ * Associates a new BlockDriverState with @blk.
+ */
+void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
+{
+    assert(!blk->bs && !bs->blk);
+    bdrv_ref(bs);
+    blk->bs = bs;
+    bs->blk = blk;
+}
+
 /*
 * Attach device model @dev to @blk.
 * Return 0 on success, -EBUSY when a device model is attached already.
@@ -320,7 +374,7 @@ int blk_attach_dev(BlockBackend *blk, void *dev)
    }
    blk_ref(blk);
    blk->dev = dev;
-    bdrv_iostatus_reset(blk->bs);
+    blk_iostatus_reset(blk);
    return 0;
 }

@@ -347,7 +401,7 @@ void blk_detach_dev(BlockBackend *blk, void *dev)
    blk->dev = NULL;
    blk->dev_ops = NULL;
    blk->dev_opaque = NULL;
-    bdrv_set_guest_block_size(blk->bs, 512);
+    blk->guest_block_size = 512;
    blk_unref(blk);
 }

@@ -381,18 +435,15 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
 void blk_dev_change_media_cb(BlockBackend *blk, bool load)
 {
    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
-        bool tray_was_closed = !blk_dev_is_tray_open(blk);
+        bool tray_was_open, tray_is_open;

+        tray_was_open = blk_dev_is_tray_open(blk);
        blk->dev_ops->change_media_cb(blk->dev_opaque, load);
-        if (tray_was_closed) {
-            /* tray open */
-            qapi_event_send_device_tray_moved(blk_name(blk),
-                                              true, &error_abort);
-        }
-        if (load) {
-            /* tray close */
-            qapi_event_send_device_tray_moved(blk_name(blk),
-                                              false, &error_abort);
+        tray_is_open = blk_dev_is_tray_open(blk);
+
+        if (tray_was_open != tray_is_open) {
+            qapi_event_send_device_tray_moved(blk_name(blk), tray_is_open,
+                                              &error_abort);
        }
    }
 }
@@ -452,7 +503,47 @@ void blk_dev_resize_cb(BlockBackend *blk)

 void blk_iostatus_enable(BlockBackend *blk)
 {
-    bdrv_iostatus_enable(blk->bs);
+    blk->iostatus_enabled = true;
+    blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+}
+
+/* The I/O status is only enabled if the drive explicitly
+ * enables it _and_ the VM is configured to stop on errors */
+bool blk_iostatus_is_enabled(const BlockBackend *blk)
+{
+    return (blk->iostatus_enabled &&
+           (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
+            blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
+            blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
+}
+
+BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
+{
+    return blk->iostatus;
+}
+
+void blk_iostatus_disable(BlockBackend *blk)
+{
+    blk->iostatus_enabled = false;
+}
+
+void blk_iostatus_reset(BlockBackend *blk)
+{
+    if (blk_iostatus_is_enabled(blk)) {
+        blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+        if (blk->bs && blk->bs->job) {
+            block_job_iostatus_reset(blk->bs->job);
+        }
+    }
+}
+
+void blk_iostatus_set_err(BlockBackend *blk, int error)
+{
+    assert(blk_iostatus_is_enabled(blk));
+    if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
+        blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
+                                          BLOCK_DEVICE_IO_STATUS_FAILED;
+    }
 }

 static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
@@ -464,7 +555,7 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
        return -EIO;
    }

-    if (!blk_is_inserted(blk)) {
+    if (!blk_is_available(blk)) {
        return -ENOMEDIUM;
    }

@@ -551,13 +642,15 @@ static void error_callback_bh(void *opaque)
    qemu_aio_unref(acb);
 }

-static BlockAIOCB *abort_aio_request(BlockBackend *blk, BlockCompletionFunc *cb,
-                                     void *opaque, int ret)
+BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
+                                  BlockCompletionFunc *cb,
+                                  void *opaque, int ret)
 {
    struct BlockBackendAIOCB *acb;
    QEMUBH *bh;

    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
+    acb->blk = blk;
    acb->ret = ret;

    bh = aio_bh_new(blk_get_aio_context(blk), error_callback_bh, acb);
@@ -573,7 +666,7 @@ BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
 {
    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
    }

    return bdrv_aio_write_zeroes(blk->bs, sector_num, nb_sectors, flags,
@@ -602,16 +695,28 @@ int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count)

 int64_t blk_getlength(BlockBackend *blk)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_getlength(blk->bs);
 }

 void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
 {
-    bdrv_get_geometry(blk->bs, nb_sectors_ptr);
+    if (!blk->bs) {
+        *nb_sectors_ptr = 0;
+    } else {
+        bdrv_get_geometry(blk->bs, nb_sectors_ptr);
+    }
 }

 int64_t blk_nb_sectors(BlockBackend *blk)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_nb_sectors(blk->bs);
 }

@@ -621,7 +726,7 @@ BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
 {
    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
    }

    return bdrv_aio_readv(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
@@ -633,7 +738,7 @@ BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
 {
    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
    }

    return bdrv_aio_writev(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
@@ -642,6 +747,10 @@ BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
                          BlockCompletionFunc *cb, void *opaque)
 {
+    if (!blk_is_available(blk)) {
+        return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
+    }
+
    return bdrv_aio_flush(blk->bs, cb, opaque);
 }

@@ -651,7 +760,7 @@ BlockAIOCB *blk_aio_discard(BlockBackend *blk,
 {
    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
    }

    return bdrv_aio_discard(blk->bs, sector_num, nb_sectors, cb, opaque);
@@ -683,12 +792,20 @@ int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)

 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_ioctl(blk->bs, req, buf);
 }

 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
                          BlockCompletionFunc *cb, void *opaque)
 {
+    if (!blk_is_available(blk)) {
+        return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
+    }
+
    return bdrv_aio_ioctl(blk->bs, req, buf, cb, opaque);
 }

@@ -704,11 +821,19 @@ int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)

 int blk_co_flush(BlockBackend *blk)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_co_flush(blk->bs);
 }

 int blk_flush(BlockBackend *blk)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_flush(blk->bs);
 }

@@ -719,7 +844,9 @@ int blk_flush_all(void)

 void blk_drain(BlockBackend *blk)
 {
-    bdrv_drain(blk->bs);
+    if (blk->bs) {
+        bdrv_drain(blk->bs);
+    }
 }

 void blk_drain_all(void)
@@ -727,76 +854,178 @@ void blk_drain_all(void)
    bdrv_drain_all();
 }

+void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
+                      BlockdevOnError on_write_error)
+{
+    blk->on_read_error = on_read_error;
+    blk->on_write_error = on_write_error;
+}
+
 BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
 {
-    return bdrv_get_on_error(blk->bs, is_read);
+    return is_read ? blk->on_read_error : blk->on_write_error;
 }

 BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
                                      int error)
 {
-    return bdrv_get_error_action(blk->bs, is_read, error);
+    BlockdevOnError on_err = blk_get_on_error(blk, is_read);
+
+    switch (on_err) {
+    case BLOCKDEV_ON_ERROR_ENOSPC:
+        return (error == ENOSPC) ?
+               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
+    case BLOCKDEV_ON_ERROR_STOP:
+        return BLOCK_ERROR_ACTION_STOP;
+    case BLOCKDEV_ON_ERROR_REPORT:
+        return BLOCK_ERROR_ACTION_REPORT;
+    case BLOCKDEV_ON_ERROR_IGNORE:
+        return BLOCK_ERROR_ACTION_IGNORE;
+    default:
+        abort();
+    }
 }

+static void send_qmp_error_event(BlockBackend *blk,
+                                 BlockErrorAction action,
+                                 bool is_read, int error)
+{
+    IoOperationType optype;
+
+    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
+    qapi_event_send_block_io_error(blk_name(blk), optype, action,
+                                   blk_iostatus_is_enabled(blk),
+                                   error == ENOSPC, strerror(error),
+                                   &error_abort);
+}
+
+/* This is done by device models because, while the block layer knows
+ * about the error, it does not know whether an operation comes from
+ * the device or the block layer (from a job, for example).
+ */
 void blk_error_action(BlockBackend *blk, BlockErrorAction action,
                      bool is_read, int error)
 {
-    bdrv_error_action(blk->bs, action, is_read, error);
+    assert(error >= 0);
+
+    if (action == BLOCK_ERROR_ACTION_STOP) {
+        /* First set the iostatus, so that "info block" returns an iostatus
+         * that matches the events raised so far (an additional error iostatus
+         * is fine, but not a lost one).
+         */
+        blk_iostatus_set_err(blk, error);
+
+        /* Then raise the request to stop the VM and the event.
+         * qemu_system_vmstop_request_prepare has two effects.  First,
+         * it ensures that the STOP event always comes after the
+         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
+         * can observe the STOP event and do a "cont" before the STOP
+         * event is issued, the VM will not stop.  In this case, vm_start()
+         * also ensures that the STOP/RESUME pair of events is emitted.
+         */
+        qemu_system_vmstop_request_prepare();
+        send_qmp_error_event(blk, action, is_read, error);
+        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
+    } else {
+        send_qmp_error_event(blk, action, is_read, error);
+    }
 }

 int blk_is_read_only(BlockBackend *blk)
 {
-    return bdrv_is_read_only(blk->bs);
+    if (blk->bs) {
+        return bdrv_is_read_only(blk->bs);
+    } else {
+        return blk->root_state.read_only;
+    }
 }

 int blk_is_sg(BlockBackend *blk)
 {
+    if (!blk->bs) {
+        return 0;
+    }
+
    return bdrv_is_sg(blk->bs);
 }

 int blk_enable_write_cache(BlockBackend *blk)
 {
-    return bdrv_enable_write_cache(blk->bs);
+    if (blk->bs) {
+        return bdrv_enable_write_cache(blk->bs);
+    } else {
+        return !!(blk->root_state.open_flags & BDRV_O_CACHE_WB);
+    }
 }

 void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
 {
-    bdrv_set_enable_write_cache(blk->bs, wce);
+    if (blk->bs) {
+        bdrv_set_enable_write_cache(blk->bs, wce);
+    } else {
+        if (wce) {
+            blk->root_state.open_flags |= BDRV_O_CACHE_WB;
+        } else {
+            blk->root_state.open_flags &= ~BDRV_O_CACHE_WB;
+        }
+    }
 }

 void blk_invalidate_cache(BlockBackend *blk, Error **errp)
 {
+    if (!blk->bs) {
+        error_setg(errp, "Device '%s' has no medium", blk->name);
+        return;
+    }
+
    bdrv_invalidate_cache(blk->bs, errp);
 }

-int blk_is_inserted(BlockBackend *blk)
+bool blk_is_inserted(BlockBackend *blk)
 {
-    return bdrv_is_inserted(blk->bs);
+    return blk->bs && bdrv_is_inserted(blk->bs);
+}
+
+bool blk_is_available(BlockBackend *blk)
+{
+    return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
 }

 void blk_lock_medium(BlockBackend *blk, bool locked)
 {
-    bdrv_lock_medium(blk->bs, locked);
+    if (blk->bs) {
+        bdrv_lock_medium(blk->bs, locked);
+    }
 }

 void blk_eject(BlockBackend *blk, bool eject_flag)
 {
-    bdrv_eject(blk->bs, eject_flag);
+    if (blk->bs) {
+        bdrv_eject(blk->bs, eject_flag);
+    }
 }

 int blk_get_flags(BlockBackend *blk)
 {
-    return bdrv_get_flags(blk->bs);
+    if (blk->bs) {
+        return bdrv_get_flags(blk->bs);
+    } else {
+        return blk->root_state.open_flags;
+    }
 }

 int blk_get_max_transfer_length(BlockBackend *blk)
 {
-    return blk->bs->bl.max_transfer_length;
+    if (blk->bs) {
+        return blk->bs->bl.max_transfer_length;
+    } else {
+        return 0;
+    }
 }

 void blk_set_guest_block_size(BlockBackend *blk, int align)
 {
-    bdrv_set_guest_block_size(blk->bs, align);
+    blk->guest_block_size = align;
 }

 void *blk_blockalign(BlockBackend *blk, size_t size)
@@ -806,40 +1035,64 @@ void *blk_blockalign(BlockBackend *blk, size_t size)

 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
 {
+    if (!blk->bs) {
+        return false;
+    }
+
    return bdrv_op_is_blocked(blk->bs, op, errp);
 }

 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
 {
-    bdrv_op_unblock(blk->bs, op, reason);
+    if (blk->bs) {
+        bdrv_op_unblock(blk->bs, op, reason);
+    }
 }

 void blk_op_block_all(BlockBackend *blk, Error *reason)
 {
-    bdrv_op_block_all(blk->bs, reason);
+    if (blk->bs) {
+        bdrv_op_block_all(blk->bs, reason);
+    }
 }

 void blk_op_unblock_all(BlockBackend *blk, Error *reason)
 {
-    bdrv_op_unblock_all(blk->bs, reason);
+    if (blk->bs) {
+        bdrv_op_unblock_all(blk->bs, reason);
+    }
 }

 AioContext *blk_get_aio_context(BlockBackend *blk)
 {
-    return bdrv_get_aio_context(blk->bs);
+    if (blk->bs) {
+        return bdrv_get_aio_context(blk->bs);
+    } else {
+        return qemu_get_aio_context();
+    }
+}
+
+static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
+{
+    BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
+    return blk_get_aio_context(blk_acb->blk);
 }

 void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
 {
-    bdrv_set_aio_context(blk->bs, new_context);
+    if (blk->bs) {
+        bdrv_set_aio_context(blk->bs, new_context);
+    }
 }

 void blk_add_aio_context_notifier(BlockBackend *blk,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque)
 {
-    bdrv_add_aio_context_notifier(blk->bs, attached_aio_context,
-                                  detach_aio_context, opaque);
+    if (blk->bs) {
+        bdrv_add_aio_context_notifier(blk->bs, attached_aio_context,
+                                      detach_aio_context, opaque);
+    }
 }

 void blk_remove_aio_context_notifier(BlockBackend *blk,
@@ -848,28 +1101,36 @@ void blk_remove_aio_context_notifier(BlockBackend *blk,
                                     void (*detach_aio_context)(void *),
                                     void *opaque)
 {
-    bdrv_remove_aio_context_notifier(blk->bs, attached_aio_context,
-                                     detach_aio_context, opaque);
+    if (blk->bs) {
+        bdrv_remove_aio_context_notifier(blk->bs, attached_aio_context,
+                                         detach_aio_context, opaque);
+    }
 }

 void blk_add_close_notifier(BlockBackend *blk, Notifier *notify)
 {
-    bdrv_add_close_notifier(blk->bs, notify);
+    if (blk->bs) {
+        bdrv_add_close_notifier(blk->bs, notify);
+    }
 }

 void blk_io_plug(BlockBackend *blk)
 {
-    bdrv_io_plug(blk->bs);
+    if (blk->bs) {
+        bdrv_io_plug(blk->bs);
+    }
 }

 void blk_io_unplug(BlockBackend *blk)
 {
-    bdrv_io_unplug(blk->bs);
+    if (blk->bs) {
+        bdrv_io_unplug(blk->bs);
+    }
 }

 BlockAcctStats *blk_get_stats(BlockBackend *blk)
 {
-    return bdrv_get_stats(blk->bs);
+    return &blk->stats;
 }

 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
@@ -902,6 +1163,10 @@ int blk_write_compressed(BlockBackend *blk, int64_t sector_num,

 int blk_truncate(BlockBackend *blk, int64_t offset)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_truncate(blk->bs, offset);
 }

@@ -918,20 +1183,94 @@ int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
                     int64_t pos, int size)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_save_vmstate(blk->bs, buf, pos, size);
 }

 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_load_vmstate(blk->bs, buf, pos, size);
 }

 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_probe_blocksizes(blk->bs, bsz);
 }

 int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
 {
+    if (!blk_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
    return bdrv_probe_geometry(blk->bs, geo);
 }
+
+/*
+ * Updates the BlockBackendRootState object with data from the currently
+ * attached BlockDriverState.
+ */
+void blk_update_root_state(BlockBackend *blk)
+{
+    assert(blk->bs);
+
+    blk->root_state.open_flags    = blk->bs->open_flags;
+    blk->root_state.read_only     = blk->bs->read_only;
+    blk->root_state.detect_zeroes = blk->bs->detect_zeroes;
+
+    if (blk->root_state.throttle_group) {
+        g_free(blk->root_state.throttle_group);
+        throttle_group_unref(blk->root_state.throttle_state);
+    }
+    if (blk->bs->throttle_state) {
+        const char *name = throttle_group_get_name(blk->bs);
+        blk->root_state.throttle_group = g_strdup(name);
+        blk->root_state.throttle_state = throttle_group_incref(name);
+    } else {
+        blk->root_state.throttle_group = NULL;
+        blk->root_state.throttle_state = NULL;
+    }
+}
+
+/*
+ * Applies the information in the root state to the given BlockDriverState. This
+ * does not include the flags which have to be specified for bdrv_open(), use
+ * blk_get_open_flags_from_root_state() to inquire them.
+ */
+void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
+{
+    bs->detect_zeroes = blk->root_state.detect_zeroes;
+    if (blk->root_state.throttle_group) {
+        bdrv_io_limits_enable(bs, blk->root_state.throttle_group);
+    }
+}
+
+/*
+ * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
+ * supposed to inherit the root state.
+ */
+int blk_get_open_flags_from_root_state(BlockBackend *blk)
+{
+    int bs_flags;
+
+    bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR;
+    bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR;
+
+    return bs_flags;
+}
+
+BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
+{
+    return &blk->root_state;
+}
--- a/block/commit.c
+++ b/block/commit.c
@@ -17,6 +17,7 @@
 #include "block/blockjob.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
+#include "sysemu/block-backend.h"

 enum {
    /*
@@ -213,7 +214,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,

    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        !bdrv_iostatus_is_enabled(bs)) {
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, "Invalid parameter combination");
        return;
    }
@@ -235,14 +236,14 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
    orig_overlay_flags = bdrv_get_flags(overlay_bs);

    /* convert base & overlay_bs to r/w, if necessary */
-    if (!(orig_base_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
-                                         orig_base_flags | BDRV_O_RDWR);
-    }
    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL,
                                         orig_overlay_flags | BDRV_O_RDWR);
    }
+    if (!(orig_base_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
+                                         orig_base_flags | BDRV_O_RDWR);
+    }
    if (reopen_queue) {
        bdrv_reopen_multiple(reopen_queue, &local_err);
        if (local_err != NULL) {
--- a/block/curl.c
+++ b/block/curl.c
@@ -154,18 +154,20 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
    DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd);
    switch (action) {
        case CURL_POLL_IN:
-            aio_set_fd_handler(s->aio_context, fd, curl_multi_read,
-                               NULL, state);
+            aio_set_fd_handler(s->aio_context, fd, false,
+                               curl_multi_read, NULL, state);
            break;
        case CURL_POLL_OUT:
-            aio_set_fd_handler(s->aio_context, fd, NULL, curl_multi_do, state);
+            aio_set_fd_handler(s->aio_context, fd, false,
+                               NULL, curl_multi_do, state);
            break;
        case CURL_POLL_INOUT:
-            aio_set_fd_handler(s->aio_context, fd, curl_multi_read,
-                               curl_multi_do, state);
+            aio_set_fd_handler(s->aio_context, fd, false,
+                               curl_multi_read, curl_multi_do, state);
            break;
        case CURL_POLL_REMOVE:
-            aio_set_fd_handler(s->aio_context, fd, NULL, NULL, NULL);
+            aio_set_fd_handler(s->aio_context, fd, false,
+                               NULL, NULL, NULL);
            break;
    }

--- a/block/gluster.c
+++ b/block/gluster.c
@@ -429,28 +429,23 @@ static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
    int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    off_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

-    acb->size = size;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = size;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);

-    ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
    }

    qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }

 static inline bool gluster_supports_zerofill(void)
@@ -541,35 +536,30 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
 {
    int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

-    acb->size = size;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = size;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);

    if (write) {
        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
-            &gluster_finish_aiocb, acb);
+            gluster_finish_aiocb, &acb);
    } else {
        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
-            &gluster_finish_aiocb, acb);
+            gluster_finish_aiocb, &acb);
    }

    if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
    }

    qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }

 static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
@@ -600,26 +590,21 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
 static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
 {
    int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;

-    acb->size = 0;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = 0;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);

-    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
+    ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
    if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
    }

    qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }

 #ifdef CONFIG_GLUSTERFS_DISCARD
@@ -627,28 +612,23 @@ static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
 {
    int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

-    acb->size = 0;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = 0;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);

-    ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
    }

    qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }
 #endif

--- a/block/io.c
+++ b/block/io.c
@@ -23,6 +23,7 @@
 */

 #include "trace.h"
+#include "sysemu/block-backend.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
 #include "block/throttle-groups.h"
@@ -215,6 +216,8 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
 /* Check if any requests are in-flight (including throttled requests) */
 bool bdrv_requests_pending(BlockDriverState *bs)
 {
+    BdrvChild *child;
+
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
@@ -224,17 +227,31 @@ bool bdrv_requests_pending(BlockDriverState *bs)
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
-    if (bs->file && bdrv_requests_pending(bs->file->bs)) {
-        return true;
-    }
-    if (bs->backing && bdrv_requests_pending(bs->backing->bs)) {
-        return true;
+
+    QLIST_FOREACH(child, &bs->children, next) {
+        if (bdrv_requests_pending(child->bs)) {
+            return true;
+        }
    }
+
    return false;
 }

+static void bdrv_drain_recurse(BlockDriverState *bs)
+{
+    BdrvChild *child;
+
+    if (bs->drv && bs->drv->bdrv_drain) {
+        bs->drv->bdrv_drain(bs);
+    }
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_drain_recurse(child->bs);
+    }
+}
+
 /*
- * Wait for pending requests to complete on a single BlockDriverState subtree
+ * Wait for pending requests to complete on a single BlockDriverState subtree,
+ * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
@@ -247,6 +264,7 @@ void bdrv_drain(BlockDriverState *bs)
 {
    bool busy = true;

+    bdrv_drain_recurse(bs);
    while (busy) {
        /* Keep iterating */
         bdrv_flush_io_queue(bs);
@@ -344,13 +362,14 @@ static void tracked_request_end(BdrvTrackedRequest *req)
 static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
-                                  unsigned int bytes, bool is_write)
+                                  unsigned int bytes,
+                                  enum BdrvTrackedRequestType type)
 {
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
-        .is_write       = is_write,
+        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
@@ -844,7 +863,9 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

-    wait_serialising_requests(req);
+    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
+        wait_serialising_requests(req);
+    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;
@@ -933,7 +954,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    }

    /* Don't do copy-on-read if we read data before write operation */
-    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_COPY_ON_READ)) {
+    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

@@ -967,7 +988,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
        bytes = ROUND_UP(bytes, align);
    }

-    tracked_request_begin(&req, bs, offset, bytes, false);
+    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
@@ -1002,13 +1023,13 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
 }

-int coroutine_fn bdrv_co_no_copy_on_readv(BlockDriverState *bs,
+int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
-    trace_bdrv_co_no_copy_on_readv(bs, sector_num, nb_sectors);
+    trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
-                            BDRV_REQ_NO_COPY_ON_READ);
+                            BDRV_REQ_NO_SERIALISING);
 }

 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
@@ -1151,7 +1172,9 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,

    bdrv_set_dirty(bs, sector_num, nb_sectors);

-    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
+    if (bs->wr_highest_offset < offset + bytes) {
+        bs->wr_highest_offset = offset + bytes;
+    }

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
@@ -1286,7 +1309,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
-    tracked_request_begin(&req, bs, offset, bytes, true);
+    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (!qiov) {
        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
@@ -1903,7 +1926,10 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
        }
    }

-    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+    if (bs->blk) {
+        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
+                              num_reqs - outidx - 1);
+    }

    return outidx + 1;
 }
@@ -2308,18 +2334,20 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
    int ret;
+    BdrvTrackedRequest req;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

+    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
-            return ret;
+            goto out;
        }
    }

@@ -2359,14 +2387,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
        ret = 0;
    }
    if (ret < 0) {
-        return ret;
+        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
 flush_parent:
-    return bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+out:
+    tracked_request_end(&req);
+    return ret;
 }

 int bdrv_flush(BlockDriverState *bs)
@@ -2409,6 +2440,7 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
 {
+    BdrvTrackedRequest req;
    int max_discard, ret;

    if (!bs->drv) {
@@ -2431,6 +2463,8 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
        return 0;
    }

+    tracked_request_begin(&req, bs, sector_num, nb_sectors,
+                          BDRV_TRACKED_DISCARD);
    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
@@ -2464,20 +2498,24 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
-                return -EIO;
+                ret = -EIO;
+                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
-            return ret;
+            goto out;
        }

        sector_num += num;
        nb_sectors -= num;
    }
-    return 0;
+    ret = 0;
+out:
+    tracked_request_end(&req);
+    return ret;
 }

 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
@@ -2506,26 +2544,109 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
    return rwco.ret;
 }

-/* needed for generic scsi interface */
+typedef struct {
+    CoroutineIOCompletion *co;
+    QEMUBH *bh;
+} BdrvIoctlCompletionData;

-int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+static void bdrv_ioctl_bh_cb(void *opaque)
+{
+    BdrvIoctlCompletionData *data = opaque;
+
+    bdrv_co_io_em_complete(data->co, -ENOTSUP);
+    qemu_bh_delete(data->bh);
+}
+
+static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
 {
    BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest tracked_req;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    BlockAIOCB *acb;

-    if (drv && drv->bdrv_ioctl)
-        return drv->bdrv_ioctl(bs, req, buf);
-    return -ENOTSUP;
+    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
+    if (!drv || !drv->bdrv_aio_ioctl) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
+    if (!acb) {
+        BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
+        data->bh = aio_bh_new(bdrv_get_aio_context(bs),
+                                bdrv_ioctl_bh_cb, data);
+        data->co = &co;
+        qemu_bh_schedule(data->bh);
+    }
+    qemu_coroutine_yield();
+out:
+    tracked_request_end(&tracked_req);
+    return co.ret;
+}
+
+typedef struct {
+    BlockDriverState *bs;
+    int req;
+    void *buf;
+    int ret;
+} BdrvIoctlCoData;
+
+static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
+{
+    BdrvIoctlCoData *data = opaque;
+    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
+}
+
+/* needed for generic scsi interface */
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    BdrvIoctlCoData data = {
+        .bs = bs,
+        .req = req,
+        .buf = buf,
+        .ret = -EINPROGRESS,
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_co_ioctl_entry(&data);
+    } else {
+        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
+        qemu_coroutine_enter(co, &data);
+    }
+    while (data.ret == -EINPROGRESS) {
+        aio_poll(bdrv_get_aio_context(bs), true);
+    }
+    return data.ret;
+}
+
+static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
+{
+    BlockAIOCBCoroutine *acb = opaque;
+    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
+                                      acb->req.req, acb->req.buf);
+    bdrv_co_complete(acb);
 }

 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
 {
-    BlockDriver *drv = bs->drv;
+    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
+                                            bs, cb, opaque);
+    Coroutine *co;

-    if (drv && drv->bdrv_aio_ioctl)
-        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
-    return NULL;
+    acb->need_bh = true;
+    acb->req.error = -EINPROGRESS;
+    acb->req.req = req;
+    acb->req.buf = buf;
+    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
+    qemu_coroutine_enter(co, acb);
+
+    bdrv_co_maybe_schedule_bh(acb);
+    return &acb->common;
 }

 void *qemu_blockalign(BlockDriverState *bs, size_t size)
@@ -2618,3 +2739,20 @@ void bdrv_flush_io_queue(BlockDriverState *bs)
    }
    bdrv_start_throttled_reqs(bs);
 }
+
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+    if (!bs->quiesce_counter++) {
+        aio_disable_external(bdrv_get_aio_context(bs));
+    }
+    bdrv_drain(bs);
+}
+
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    assert(bs->quiesce_counter > 0);
+    if (--bs->quiesce_counter > 0) {
+        return;
+    }
+    aio_enable_external(bdrv_get_aio_context(bs));
+}
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -84,6 +84,7 @@ typedef struct IscsiTask {
    IscsiLun *iscsilun;
    QEMUTimer retry_timer;
    bool force_next_flush;
+    int err_code;
 } IscsiTask;

 typedef struct IscsiAIOCB {
@@ -96,6 +97,7 @@ typedef struct IscsiAIOCB {
    int status;
    int64_t sector_num;
    int nb_sectors;
+    int ret;
 #ifdef __linux__
    sg_io_hdr_t *ioh;
 #endif
@@ -169,19 +171,70 @@ static inline unsigned exp_random(double mean)
    return -mean * log((double)rand() / RAND_MAX);
 }

-/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced
- * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION
- * macro was introduced in 1.11.0. So use the API_VERSION macro as
- * a hint that the macros are defined and define them ourselves
- * otherwise to keep the required libiscsi version at 1.9.0 */
-#if !defined(LIBISCSI_API_VERSION)
-#define QEMU_SCSI_STATUS_TASK_SET_FULL  0x28
-#define QEMU_SCSI_STATUS_TIMEOUT        0x0f000002
-#else
-#define QEMU_SCSI_STATUS_TASK_SET_FULL  SCSI_STATUS_TASK_SET_FULL
-#define QEMU_SCSI_STATUS_TIMEOUT        SCSI_STATUS_TIMEOUT
+/* SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST was introduced in
+ * libiscsi 1.10.0, together with other constants we need.  Use it as
+ * a hint that we have to define them ourselves if needed, to keep the
+ * minimum required libiscsi version at 1.9.0.  We use an ASCQ macro for
+ * the test because SCSI_STATUS_* is an enum.
+ *
+ * To guard against future changes where SCSI_SENSE_ASCQ_* also becomes
+ * an enum, check against the LIBISCSI_API_VERSION macro, which was
+ * introduced in 1.11.0.  If it is present, there is no need to define
+ * anything.
+ */
+#if !defined(SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST) && \
+    !defined(LIBISCSI_API_VERSION)
+#define SCSI_STATUS_TASK_SET_FULL                          0x28
+#define SCSI_STATUS_TIMEOUT                                0x0f000002
+#define SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST    0x2600
+#define SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR        0x1a00
 #endif

+static int iscsi_translate_sense(struct scsi_sense *sense)
+{
+    int ret;
+
+    switch (sense->key) {
+    case SCSI_SENSE_NOT_READY:
+        return -EBUSY;
+    case SCSI_SENSE_DATA_PROTECTION:
+        return -EACCES;
+    case SCSI_SENSE_COMMAND_ABORTED:
+        return -ECANCELED;
+    case SCSI_SENSE_ILLEGAL_REQUEST:
+        /* Parse ASCQ */
+        break;
+    default:
+        return -EIO;
+    }
+    switch (sense->ascq) {
+    case SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR:
+    case SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE:
+    case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB:
+    case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST:
+        ret = -EINVAL;
+        break;
+    case SCSI_SENSE_ASCQ_LBA_OUT_OF_RANGE:
+        ret = -ENOSPC;
+        break;
+    case SCSI_SENSE_ASCQ_LOGICAL_UNIT_NOT_SUPPORTED:
+        ret = -ENOTSUP;
+        break;
+    case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT:
+    case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_CLOSED:
+    case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_OPEN:
+        ret = -ENOMEDIUM;
+        break;
+    case SCSI_SENSE_ASCQ_WRITE_PROTECTED:
+        ret = -EACCES;
+        break;
+    default:
+        ret = -EIO;
+        break;
+    }
+    return ret;
+}
+
 static void
 iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                        void *command_data, void *opaque)
@@ -203,11 +256,11 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                goto out;
            }
            if (status == SCSI_STATUS_BUSY ||
-                status == QEMU_SCSI_STATUS_TIMEOUT ||
-                status == QEMU_SCSI_STATUS_TASK_SET_FULL) {
+                status == SCSI_STATUS_TIMEOUT ||
+                status == SCSI_STATUS_TASK_SET_FULL) {
                unsigned retry_time =
                    exp_random(iscsi_retry_times[iTask->retries - 1]);
-                if (status == QEMU_SCSI_STATUS_TIMEOUT) {
+                if (status == SCSI_STATUS_TIMEOUT) {
                    /* make sure the request is rescheduled AFTER the
                     * reconnect is initiated */
                    retry_time = EVENT_INTERVAL * 2;
@@ -226,6 +279,7 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                return;
            }
        }
+        iTask->err_code = iscsi_translate_sense(&task->sense);
        error_report("iSCSI Failure: %s", iscsi_get_error(iscsi));
    } else {
        iTask->iscsilun->force_next_flush |= iTask->force_next_flush;
@@ -291,8 +345,8 @@ iscsi_set_events(IscsiLun *iscsilun)
    int ev = iscsi_which_events(iscsi);

    if (ev != iscsilun->events) {
-        aio_set_fd_handler(iscsilun->aio_context,
-                           iscsi_get_fd(iscsi),
+        aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsi),
+                           false,
                           (ev & POLLIN) ? iscsi_process_read : NULL,
                           (ev & POLLOUT) ? iscsi_process_write : NULL,
                           iscsilun);
@@ -455,7 +509,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
@@ -644,7 +698,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    return 0;
@@ -683,7 +737,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    return 0;
@@ -703,7 +757,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    if (status < 0) {
        error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s",
                     iscsi_get_error(iscsi));
-        acb->status = -EIO;
+        acb->status = iscsi_translate_sense(&acb->task->sense);
    }

    acb->ioh->driver_status = 0;
@@ -726,6 +780,38 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    iscsi_schedule_bh(acb);
 }

+static void iscsi_ioctl_bh_completion(void *opaque)
+{
+    IscsiAIOCB *acb = opaque;
+
+    qemu_bh_delete(acb->bh);
+    acb->common.cb(acb->common.opaque, acb->ret);
+    qemu_aio_unref(acb);
+}
+
+static void iscsi_ioctl_handle_emulated(IscsiAIOCB *acb, int req, void *buf)
+{
+    BlockDriverState *bs = acb->common.bs;
+    IscsiLun *iscsilun = bs->opaque;
+    int ret = 0;
+
+    switch (req) {
+    case SG_GET_VERSION_NUM:
+        *(int *)buf = 30000;
+        break;
+    case SG_GET_SCSI_ID:
+        ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
+        break;
+    default:
+        ret = -EINVAL;
+    }
+    assert(!acb->bh);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs),
+                         iscsi_ioctl_bh_completion, acb);
+    acb->ret = ret;
+    qemu_bh_schedule(acb->bh);
+}
+
 static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
@@ -735,8 +821,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    struct iscsi_data data;
    IscsiAIOCB *acb;

-    assert(req == SG_IO);
-
    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);

    acb->iscsilun = iscsilun;
@@ -745,6 +829,11 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    acb->buf         = NULL;
    acb->ioh         = buf;

+    if (req != SG_IO) {
+        iscsi_ioctl_handle_emulated(acb, req, buf);
+        return &acb->common;
+    }
+
    acb->task = malloc(sizeof(struct scsi_task));
    if (acb->task == NULL) {
        error_report("iSCSI: Failed to allocate task for scsi command. %s",
@@ -809,38 +898,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    return &acb->common;
 }

-static void ioctl_cb(void *opaque, int status)
-{
-    int *p_status = opaque;
-    *p_status = status;
-}
-
-static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    IscsiLun *iscsilun = bs->opaque;
-    int status;
-
-    switch (req) {
-    case SG_GET_VERSION_NUM:
-        *(int *)buf = 30000;
-        break;
-    case SG_GET_SCSI_ID:
-        ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
-        break;
-    case SG_IO:
-        status = -EINPROGRESS;
-        iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status);
-
-        while (status == -EINPROGRESS) {
-            aio_poll(iscsilun->aio_context, true);
-        }
-
-        return 0;
-    default:
-        return -1;
-    }
-    return 0;
-}
 #endif

 static int64_t
@@ -905,7 +962,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
@@ -999,7 +1056,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
@@ -1280,9 +1337,8 @@ static void iscsi_detach_aio_context(BlockDriverState *bs)
 {
    IscsiLun *iscsilun = bs->opaque;

-    aio_set_fd_handler(iscsilun->aio_context,
-                       iscsi_get_fd(iscsilun->iscsi),
-                       NULL, NULL, NULL);
+    aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsilun->iscsi),
+                       false, NULL, NULL, NULL);
    iscsilun->events = 0;

    if (iscsilun->nop_timer) {
@@ -1772,7 +1828,6 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_co_flush_to_disk = iscsi_co_flush,

 #ifdef __linux__
-    .bdrv_ioctl       = iscsi_ioctl,
    .bdrv_aio_ioctl   = iscsi_aio_ioctl,
 #endif

--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -287,7 +287,7 @@ void laio_detach_aio_context(void *s_, AioContext *old_context)
 {
    struct qemu_laio_state *s = s_;

-    aio_set_event_notifier(old_context, &s->e, NULL);
+    aio_set_event_notifier(old_context, &s->e, false, NULL);
    qemu_bh_delete(s->completion_bh);
 }

@@ -296,7 +296,8 @@ void laio_attach_aio_context(void *s_, AioContext *new_context)
    struct qemu_laio_state *s = s_;

    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
-    aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb);
+    aio_set_event_notifier(new_context, &s->e, false,
+                           qemu_laio_completion_cb);
 }

 void *laio_init(void)
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -14,6 +14,7 @@
 #include "trace.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
@@ -383,9 +384,11 @@ static void mirror_exit(BlockJob *job, void *opaque)
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
+    bdrv_op_unblock_all(s->target, s->common.blocker);
    bdrv_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
+    bdrv_drained_end(src);
    bdrv_unref(src);
 }

@@ -599,10 +602,15 @@ immediate_exit:
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
-    bdrv_iostatus_disable(s->target);
+    if (s->target->blk) {
+        blk_iostatus_disable(s->target->blk);
+    }

    data = g_malloc(sizeof(*data));
    data->ret = ret;
+    /* Before we switch to target in mirror_exit, make sure data doesn't
+     * change. */
+    bdrv_drained_begin(s->common.bs);
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
 }

@@ -621,7 +629,9 @@ static void mirror_iostatus_reset(BlockJob *job)
 {
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

-    bdrv_iostatus_reset(s->target);
+    if (s->target->blk) {
+        blk_iostatus_reset(s->target->blk);
+    }
 }

 static void mirror_complete(BlockJob *job, Error **errp)
@@ -704,7 +714,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        !bdrv_iostatus_is_enabled(bs)) {
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }
@@ -736,12 +746,17 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
-        block_job_release(bs);
+        block_job_unref(&s->common);
        return;
    }
+
+    bdrv_op_block_all(s->target, s->common.blocker);
+
    bdrv_set_enable_write_cache(s->target, true);
-    bdrv_set_on_error(s->target, on_target_error, on_target_error);
-    bdrv_iostatus_enable(s->target);
+    if (s->target->blk) {
+        blk_set_on_error(s->target->blk, on_target_error, on_target_error);
+        blk_iostatus_enable(s->target->blk);
+    }
    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co, s);
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -124,7 +124,7 @@ static int nbd_co_send_request(BlockDriverState *bs,
    s->send_coroutine = qemu_coroutine_self();
    aio_context = bdrv_get_aio_context(bs);

-    aio_set_fd_handler(aio_context, s->sock,
+    aio_set_fd_handler(aio_context, s->sock, false,
                       nbd_reply_ready, nbd_restart_write, bs);
    if (qiov) {
        if (!s->is_unix) {
@@ -144,7 +144,8 @@ static int nbd_co_send_request(BlockDriverState *bs,
    } else {
        rc = nbd_send_request(s->sock, request);
    }
-    aio_set_fd_handler(aio_context, s->sock, nbd_reply_ready, NULL, bs);
+    aio_set_fd_handler(aio_context, s->sock, false,
+                       nbd_reply_ready, NULL, bs);
    s->send_coroutine = NULL;
    qemu_co_mutex_unlock(&s->send_mutex);
    return rc;
@@ -348,14 +349,15 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
 void nbd_client_detach_aio_context(BlockDriverState *bs)
 {
    aio_set_fd_handler(bdrv_get_aio_context(bs),
-                       nbd_get_client_session(bs)->sock, NULL, NULL, NULL);
+                       nbd_get_client_session(bs)->sock,
+                       false, NULL, NULL, NULL);
 }

 void nbd_client_attach_aio_context(BlockDriverState *bs,
                                   AioContext *new_context)
 {
    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sock,
-                       nbd_reply_ready, NULL, bs);
+                       false, nbd_reply_ready, NULL, bs);
 }

 void nbd_client_close(BlockDriverState *bs)
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -206,24 +206,24 @@ static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export,
    saddr = g_new0(SocketAddress, 1);

    if (qdict_haskey(options, "path")) {
-        saddr->kind = SOCKET_ADDRESS_KIND_UNIX;
-        saddr->q_unix = g_new0(UnixSocketAddress, 1);
-        saddr->q_unix->path = g_strdup(qdict_get_str(options, "path"));
+        saddr->type = SOCKET_ADDRESS_KIND_UNIX;
+        saddr->u.q_unix = g_new0(UnixSocketAddress, 1);
+        saddr->u.q_unix->path = g_strdup(qdict_get_str(options, "path"));
        qdict_del(options, "path");
    } else {
-        saddr->kind = SOCKET_ADDRESS_KIND_INET;
-        saddr->inet = g_new0(InetSocketAddress, 1);
-        saddr->inet->host = g_strdup(qdict_get_str(options, "host"));
+        saddr->type = SOCKET_ADDRESS_KIND_INET;
+        saddr->u.inet = g_new0(InetSocketAddress, 1);
+        saddr->u.inet->host = g_strdup(qdict_get_str(options, "host"));
        if (!qdict_get_try_str(options, "port")) {
-            saddr->inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
+            saddr->u.inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
        } else {
-            saddr->inet->port = g_strdup(qdict_get_str(options, "port"));
+            saddr->u.inet->port = g_strdup(qdict_get_str(options, "port"));
        }
        qdict_del(options, "host");
        qdict_del(options, "port");
    }

-    s->client.is_unix = saddr->kind == SOCKET_ADDRESS_KIND_UNIX;
+    s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX;

    *export = g_strdup(qdict_get_try_str(options, "export"));
    if (*export) {
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -63,11 +63,10 @@ static void nfs_set_events(NFSClient *client)
 {
    int ev = nfs_which_events(client->context);
    if (ev != client->events) {
-        aio_set_fd_handler(client->aio_context,
-                           nfs_get_fd(client->context),
+        aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context),
+                           false,
                           (ev & POLLIN) ? nfs_process_read : NULL,
-                           (ev & POLLOUT) ? nfs_process_write : NULL,
-                           client);
+                           (ev & POLLOUT) ? nfs_process_write : NULL, client);

    }
    client->events = ev;
@@ -242,9 +241,8 @@ static void nfs_detach_aio_context(BlockDriverState *bs)
 {
    NFSClient *client = bs->opaque;

-    aio_set_fd_handler(client->aio_context,
-                       nfs_get_fd(client->context),
-                       NULL, NULL, NULL);
+    aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context),
+                       false, NULL, NULL, NULL);
    client->events = 0;
 }

@@ -263,9 +261,8 @@ static void nfs_client_close(NFSClient *client)
        if (client->fh) {
            nfs_close(client->context, client->fh);
        }
-        aio_set_fd_handler(client->aio_context,
-                           nfs_get_fd(client->context),
-                           NULL, NULL, NULL);
+        aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context),
+                           false, NULL, NULL, NULL);
        nfs_destroy_context(client->context);
    }
    memset(client, 0, sizeof(NFSClient));
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -220,7 +220,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
        s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
        s->data_end += s->tracks;
        bitmap_set(s->bat_dirty_bmap,
-                   bat_entry_off(idx) / s->bat_dirty_block, 1);
+                   bat_entry_off(idx + i) / s->bat_dirty_block, 1);
    }

    return bat2sect(s, idx) + sector_num % s->tracks;
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -64,7 +64,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
    info->backing_file_depth = bdrv_get_backing_file_depth(bs);
    info->detect_zeroes = bs->detect_zeroes;

-    if (bs->io_limits_enabled) {
+    if (bs->throttle_state) {
        ThrottleConfig cfg;

        throttle_group_get_config(bs, &cfg);
@@ -301,17 +301,17 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
        info->tray_open = blk_dev_is_tray_open(blk);
    }

-    if (bdrv_iostatus_is_enabled(bs)) {
+    if (blk_iostatus_is_enabled(blk)) {
        info->has_io_status = true;
-        info->io_status = bs->iostatus;
+        info->io_status = blk_iostatus(blk);
    }

-    if (!QLIST_EMPTY(&bs->dirty_bitmaps)) {
+    if (bs && !QLIST_EMPTY(&bs->dirty_bitmaps)) {
        info->has_dirty_bitmaps = true;
        info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs);
    }

-    if (bs->drv) {
+    if (bs && bs->drv) {
        info->has_inserted = true;
        info->inserted = bdrv_block_device_info(bs, errp);
        if (info->inserted == NULL) {
@@ -344,18 +344,73 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs,
    }

    s->stats = g_malloc0(sizeof(*s->stats));
-    s->stats->rd_bytes = bs->stats.nr_bytes[BLOCK_ACCT_READ];
-    s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE];
-    s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ];
-    s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE];
-    s->stats->rd_merged = bs->stats.merged[BLOCK_ACCT_READ];
-    s->stats->wr_merged = bs->stats.merged[BLOCK_ACCT_WRITE];
-    s->stats->wr_highest_offset =
-        bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE;
-    s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH];
-    s->stats->wr_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_WRITE];
-    s->stats->rd_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_READ];
-    s->stats->flush_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_FLUSH];
+    if (bs->blk) {
+        BlockAcctStats *stats = blk_get_stats(bs->blk);
+        BlockAcctTimedStats *ts = NULL;
+
+        s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
+        s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
+        s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
+        s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
+
+        s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
+        s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
+        s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
+
+        s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
+        s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
+        s->stats->invalid_flush_operations =
+            stats->invalid_ops[BLOCK_ACCT_FLUSH];
+
+        s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ];
+        s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
+        s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
+        s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
+        s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
+        s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
+
+        s->stats->has_idle_time_ns = stats->last_access_time_ns > 0;
+        if (s->stats->has_idle_time_ns) {
+            s->stats->idle_time_ns = block_acct_idle_time_ns(stats);
+        }
+
+        s->stats->account_invalid = stats->account_invalid;
+        s->stats->account_failed = stats->account_failed;
+
+        while ((ts = block_acct_interval_next(stats, ts))) {
+            BlockDeviceTimedStatsList *timed_stats =
+                g_malloc0(sizeof(*timed_stats));
+            BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats));
+            timed_stats->next = s->stats->timed_stats;
+            timed_stats->value = dev_stats;
+            s->stats->timed_stats = timed_stats;
+
+            TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
+            TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
+            TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
+
+            dev_stats->interval_length = ts->interval_length;
+
+            dev_stats->min_rd_latency_ns = timed_average_min(rd);
+            dev_stats->max_rd_latency_ns = timed_average_max(rd);
+            dev_stats->avg_rd_latency_ns = timed_average_avg(rd);
+
+            dev_stats->min_wr_latency_ns = timed_average_min(wr);
+            dev_stats->max_wr_latency_ns = timed_average_max(wr);
+            dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
+
+            dev_stats->min_flush_latency_ns = timed_average_min(fl);
+            dev_stats->max_flush_latency_ns = timed_average_max(fl);
+            dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
+
+            dev_stats->avg_rd_queue_depth =
+                block_acct_queue_depth(ts, BLOCK_ACCT_READ);
+            dev_stats->avg_wr_queue_depth =
+                block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
+        }
+    }
+
+    s->stats->wr_highest_offset = bs->wr_highest_offset;

    if (bs->file) {
        s->has_parent = true;
@@ -381,7 +436,9 @@ BlockInfoList *qmp_query_block(Error **errp)
        bdrv_query_info(blk, &info->value, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
-            goto err;
+            g_free(info);
+            qapi_free_BlockInfoList(head);
+            return NULL;
        }

        *p_next = info;
@@ -389,10 +446,6 @@ BlockInfoList *qmp_query_block(Error **errp)
    }

    return head;
-
- err:
-    qapi_free_BlockInfoList(head);
-    return NULL;
 }

 BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -312,7 +312,7 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
    if (!offset)
        return 0;

-    assert(qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_COMPRESSED);
+    assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
@@ -324,14 +324,16 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
 	return i;
 }

-static int count_contiguous_free_clusters(int nb_clusters, uint64_t *l2_table)
+static int count_contiguous_clusters_by_type(int nb_clusters,
+                                             uint64_t *l2_table,
+                                             int wanted_type)
 {
    int i;

    for (i = 0; i < nb_clusters; i++) {
        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));

-        if (type != QCOW2_CLUSTER_UNALLOCATED) {
+        if (type != wanted_type) {
            break;
        }
    }
@@ -554,13 +556,14 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
            ret = -EIO;
            goto fail;
        }
-        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
-                &l2_table[l2_index], QCOW_OFLAG_ZERO);
+        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
+                                              QCOW2_CLUSTER_ZERO);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        /* how many empty clusters ? */
-        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
+        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
+                                              QCOW2_CLUSTER_UNALLOCATED);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_NORMAL:
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -560,13 +560,16 @@ static int alloc_refcount_block(BlockDriverState *bs,
    }

    /* Hook up the new refcount table in the qcow2 header */
-    uint8_t data[12];
-    cpu_to_be64w((uint64_t*)data, table_offset);
-    cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
+    struct QEMU_PACKED {
+        uint64_t d64;
+        uint32_t d32;
+    } data;
+    cpu_to_be64w(&data.d64, table_offset);
+    cpu_to_be32w(&data.d32, table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file->bs,
                           offsetof(QCowHeader, refcount_table_offset),
-                           data, sizeof(data));
+                           &data, sizeof(data));
    if (ret < 0) {
        goto fail_table;
    }
@@ -1241,7 +1244,7 @@ fail:
 /* refcount checking functions */


-static size_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
+static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
 {
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2738,18 +2738,16 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
    ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1);

    *spec_info = (ImageInfoSpecific){
-        .kind  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
-        {
-            .qcow2 = g_new(ImageInfoSpecificQCow2, 1),
-        },
+        .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
+        .u.qcow2 = g_new(ImageInfoSpecificQCow2, 1),
    };
    if (s->qcow_version == 2) {
-        *spec_info->qcow2 = (ImageInfoSpecificQCow2){
+        *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("0.10"),
            .refcount_bits      = s->refcount_bits,
        };
    } else if (s->qcow_version == 3) {
-        *spec_info->qcow2 = (ImageInfoSpecificQCow2){
+        *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("1.1"),
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -26,7 +26,7 @@
 #define BLOCK_QCOW2_H

 #include "crypto/cipher.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"

 //#define DEBUG_ALLOC
 //#define DEBUG_ALLOC2
--- a/block/qed.c
+++ b/block/qed.c
@@ -375,6 +375,18 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
    }
 }

+static void bdrv_qed_drain(BlockDriverState *bs)
+{
+    BDRVQEDState *s = bs->opaque;
+
+    /* Cancel timer and start doing I/O that were meant to happen as if it
+     * fired, that way we get bdrv_drain() taking care of the ongoing requests
+     * correctly. */
+    qed_cancel_need_check_timer(s);
+    qed_plug_allocating_write_reqs(s);
+    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+}
+
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
 {
@@ -1676,6 +1688,7 @@ static BlockDriver bdrv_qed = {
    .bdrv_check               = bdrv_qed_check,
    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
+    .bdrv_drain               = bdrv_qed_drain,
 };

 static void bdrv_qed_init(void)
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -127,11 +127,6 @@ do { \

 #define FTYPE_FILE   0
 #define FTYPE_CD     1
-#define FTYPE_FD     2
-
-/* if the FD is not accessed during that time (in ns), we try to
-   reopen it to see if the disk has been changed */
-#define FD_OPEN_TIMEOUT (1000000000)

 #define MAX_BLOCKSIZE	4096

@@ -141,13 +136,6 @@ typedef struct BDRVRawState {
    int open_flags;
    size_t buf_align;

-#if defined(__linux__)
-    /* linux floppy specific */
-    int64_t fd_open_time;
-    int64_t fd_error_time;
-    int fd_got_error;
-    int fd_media_changed;
-#endif
 #ifdef CONFIG_LINUX_AIO
    int use_aio;
    void *aio_ctx;
@@ -635,7 +623,7 @@ static int raw_reopen_prepare(BDRVReopenState *state,
    }
 #endif

-    if (s->type == FTYPE_FD || s->type == FTYPE_CD) {
+    if (s->type == FTYPE_CD) {
        raw_s->open_flags |= O_NONBLOCK;
    }

@@ -1988,8 +1976,8 @@ BlockDriver bdrv_file = {

 #if defined(__APPLE__) && defined(__MACH__)
 static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator );
-static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize );
-
+static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
+                                CFIndex maxPathSize, int flags);
 kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
 {
    kern_return_t       kernResult;
@@ -2016,7 +2004,8 @@ kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
    return kernResult;
 }

-kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize )
+kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
+                         CFIndex maxPathSize, int flags)
 {
    io_object_t     nextMedia;
    kern_return_t   kernResult = KERN_FAILURE;
@@ -2029,7 +2018,9 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma
        if ( bsdPathAsCFString ) {
            size_t devPathLength;
            strcpy( bsdPath, _PATH_DEV );
-            strcat( bsdPath, "r" );
+            if (flags & BDRV_O_NOCACHE) {
+                strcat(bsdPath, "r");
+            }
            devPathLength = strlen( bsdPath );
            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
                kernResult = KERN_SUCCESS;
@@ -2141,8 +2132,8 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
        int fd;

        kernResult = FindEjectableCDMedia( &mediaIterator );
-        kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) );
-
+        kernResult = GetBSDPath(mediaIterator, bsdPath, sizeof(bsdPath),
+                                flags);
        if ( bsdPath[ 0 ] != '\0' ) {
            strcat(bsdPath,"s0");
            /* some CDs don't have a partition 0 */
@@ -2187,53 +2178,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
 }

 #if defined(__linux__)
-/* Note: we do not have a reliable method to detect if the floppy is
-   present. The current method is to try to open the floppy at every
-   I/O and to keep it opened during a few hundreds of ms. */
-static int fd_open(BlockDriverState *bs)
-{
-    BDRVRawState *s = bs->opaque;
-    int last_media_present;
-
-    if (s->type != FTYPE_FD)
-        return 0;
-    last_media_present = (s->fd >= 0);
-    if (s->fd >= 0 &&
-        (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
-        qemu_close(s->fd);
-        s->fd = -1;
-        DPRINTF("Floppy closed\n");
-    }
-    if (s->fd < 0) {
-        if (s->fd_got_error &&
-            (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_error_time) < FD_OPEN_TIMEOUT) {
-            DPRINTF("No floppy (open delayed)\n");
-            return -EIO;
-        }
-        s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK);
-        if (s->fd < 0) {
-            s->fd_error_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-            s->fd_got_error = 1;
-            if (last_media_present)
-                s->fd_media_changed = 1;
-            DPRINTF("No floppy\n");
-            return -EIO;
-        }
-        DPRINTF("Floppy opened\n");
-    }
-    if (!last_media_present)
-        s->fd_media_changed = 1;
-    s->fd_open_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-    s->fd_got_error = 0;
-    return 0;
-}
-
-static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    BDRVRawState *s = bs->opaque;
-
-    return ioctl(s->fd, req, buf);
-}

 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
@@ -2256,8 +2200,8 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
+#endif /* linux */

-#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static int fd_open(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
@@ -2267,14 +2211,6 @@ static int fd_open(BlockDriverState *bs)
        return 0;
    return -EIO;
 }
-#else /* !linux && !FreeBSD */
-
-static int fd_open(BlockDriverState *bs)
-{
-    return 0;
-}
-
-#endif /* !linux && !FreeBSD */

 static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors,
@@ -2318,14 +2254,13 @@ static int hdev_create(const char *filename, QemuOpts *opts,
    int64_t total_size = 0;
    bool has_prefix;

-    /* This function is used by all three protocol block drivers and therefore
-     * any of these three prefixes may be given.
+    /* This function is used by both protocol block drivers and therefore either
+     * of these prefixes may be given.
     * The return value has to be stored somewhere, otherwise this is an error
     * due to -Werror=unused-value. */
    has_prefix =
        strstart(filename, "host_device:", &filename) ||
-        strstart(filename, "host_cdrom:" , &filename) ||
-        strstart(filename, "host_floppy:", &filename);
+        strstart(filename, "host_cdrom:" , &filename);

    (void)has_prefix;

@@ -2400,160 +2335,10 @@ static BlockDriver bdrv_host_device = {

    /* generic scsi device */
 #ifdef __linux__
-    .bdrv_ioctl         = hdev_ioctl,
    .bdrv_aio_ioctl     = hdev_aio_ioctl,
 #endif
 };

-#ifdef __linux__
-static void floppy_parse_filename(const char *filename, QDict *options,
-                                  Error **errp)
-{
-    /* The prefix is optional, just as for "file". */
-    strstart(filename, "host_floppy:", &filename);
-
-    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
-}
-
-static int floppy_open(BlockDriverState *bs, QDict *options, int flags,
-                       Error **errp)
-{
-    BDRVRawState *s = bs->opaque;
-    Error *local_err = NULL;
-    int ret;
-
-    s->type = FTYPE_FD;
-
-    /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */
-    ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err);
-    if (ret) {
-        if (local_err) {
-            error_propagate(errp, local_err);
-        }
-        return ret;
-    }
-
-    /* close fd so that we can reopen it as needed */
-    qemu_close(s->fd);
-    s->fd = -1;
-    s->fd_media_changed = 1;
-
-    error_report("Host floppy pass-through is deprecated");
-    error_printf("Support for it will be removed in a future release.\n");
-    return 0;
-}
-
-static int floppy_probe_device(const char *filename)
-{
-    int fd, ret;
-    int prio = 0;
-    struct floppy_struct fdparam;
-    struct stat st;
-
-    if (strstart(filename, "/dev/fd", NULL) &&
-        !strstart(filename, "/dev/fdset/", NULL) &&
-        !strstart(filename, "/dev/fd/", NULL)) {
-        prio = 50;
-    }
-
-    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
-    if (fd < 0) {
-        goto out;
-    }
-    ret = fstat(fd, &st);
-    if (ret == -1 || !S_ISBLK(st.st_mode)) {
-        goto outc;
-    }
-
-    /* Attempt to detect via a floppy specific ioctl */
-    ret = ioctl(fd, FDGETPRM, &fdparam);
-    if (ret >= 0)
-        prio = 100;
-
-outc:
-    qemu_close(fd);
-out:
-    return prio;
-}
-
-
-static int floppy_is_inserted(BlockDriverState *bs)
-{
-    return fd_open(bs) >= 0;
-}
-
-static int floppy_media_changed(BlockDriverState *bs)
-{
-    BDRVRawState *s = bs->opaque;
-    int ret;
-
-    /*
-     * XXX: we do not have a true media changed indication.
-     * It does not work if the floppy is changed without trying to read it.
-     */
-    fd_open(bs);
-    ret = s->fd_media_changed;
-    s->fd_media_changed = 0;
-    DPRINTF("Floppy changed=%d\n", ret);
-    return ret;
-}
-
-static void floppy_eject(BlockDriverState *bs, bool eject_flag)
-{
-    BDRVRawState *s = bs->opaque;
-    int fd;
-
-    if (s->fd >= 0) {
-        qemu_close(s->fd);
-        s->fd = -1;
-    }
-    fd = qemu_open(bs->filename, s->open_flags | O_NONBLOCK);
-    if (fd >= 0) {
-        if (ioctl(fd, FDEJECT, 0) < 0)
-            perror("FDEJECT");
-        qemu_close(fd);
-    }
-}
-
-static BlockDriver bdrv_host_floppy = {
-    .format_name        = "host_floppy",
-    .protocol_name      = "host_floppy",
-    .instance_size      = sizeof(BDRVRawState),
-    .bdrv_needs_filename = true,
-    .bdrv_probe_device	= floppy_probe_device,
-    .bdrv_parse_filename = floppy_parse_filename,
-    .bdrv_file_open     = floppy_open,
-    .bdrv_close         = raw_close,
-    .bdrv_reopen_prepare = raw_reopen_prepare,
-    .bdrv_reopen_commit  = raw_reopen_commit,
-    .bdrv_reopen_abort   = raw_reopen_abort,
-    .bdrv_create         = hdev_create,
-    .create_opts         = &raw_create_opts,
-
-    .bdrv_aio_readv     = raw_aio_readv,
-    .bdrv_aio_writev    = raw_aio_writev,
-    .bdrv_aio_flush	= raw_aio_flush,
-    .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_io_plug = raw_aio_plug,
-    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,
-
-    .bdrv_truncate      = raw_truncate,
-    .bdrv_getlength      = raw_getlength,
-    .has_variable_length = true,
-    .bdrv_get_allocated_file_size
-                        = raw_get_allocated_file_size,
-
-    .bdrv_detach_aio_context = raw_detach_aio_context,
-    .bdrv_attach_aio_context = raw_attach_aio_context,
-
-    /* removable device support */
-    .bdrv_is_inserted   = floppy_is_inserted,
-    .bdrv_media_changed = floppy_media_changed,
-    .bdrv_eject         = floppy_eject,
-};
-#endif
-
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static void cdrom_parse_filename(const char *filename, QDict *options,
                                 Error **errp)
@@ -2609,15 +2394,13 @@ out:
    return prio;
 }

-static int cdrom_is_inserted(BlockDriverState *bs)
+static bool cdrom_is_inserted(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
    int ret;

    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
-    if (ret == CDS_DISC_OK)
-        return 1;
-    return 0;
+    return ret == CDS_DISC_OK;
 }

 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
@@ -2684,7 +2467,6 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_lock_medium   = cdrom_lock_medium,

    /* generic scsi device */
-    .bdrv_ioctl         = hdev_ioctl,
    .bdrv_aio_ioctl     = hdev_aio_ioctl,
 };
 #endif /* __linux__ */
@@ -2743,7 +2525,7 @@ static int cdrom_reopen(BlockDriverState *bs)
    return 0;
 }

-static int cdrom_is_inserted(BlockDriverState *bs)
+static bool cdrom_is_inserted(BlockDriverState *bs)
 {
    return raw_getlength(bs) > 0;
 }
@@ -2831,7 +2613,6 @@ static void bdrv_file_init(void)
    bdrv_register(&bdrv_file);
    bdrv_register(&bdrv_host_device);
 #ifdef __linux__
-    bdrv_register(&bdrv_host_floppy);
    bdrv_register(&bdrv_host_cdrom);
 #endif
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -154,11 +154,6 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset)
    return bdrv_truncate(bs->file->bs, offset);
 }

-static int raw_is_inserted(BlockDriverState *bs)
-{
-    return bdrv_is_inserted(bs->file->bs);
-}
-
 static int raw_media_changed(BlockDriverState *bs)
 {
    return bdrv_media_changed(bs->file->bs);
@@ -174,11 +169,6 @@ static void raw_lock_medium(BlockDriverState *bs, bool locked)
    bdrv_lock_medium(bs->file->bs, locked);
 }

-static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    return bdrv_ioctl(bs->file->bs, req, buf);
-}
-
 static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs,
                                 unsigned long int req, void *buf,
                                 BlockCompletionFunc *cb,
@@ -264,11 +254,9 @@ BlockDriver bdrv_raw = {
    .bdrv_refresh_limits  = &raw_refresh_limits,
    .bdrv_probe_blocksizes = &raw_probe_blocksizes,
    .bdrv_probe_geometry  = &raw_probe_geometry,
-    .bdrv_is_inserted     = &raw_is_inserted,
    .bdrv_media_changed   = &raw_media_changed,
    .bdrv_eject           = &raw_eject,
    .bdrv_lock_medium     = &raw_lock_medium,
-    .bdrv_ioctl           = &raw_ioctl,
    .bdrv_aio_ioctl       = &raw_aio_ioctl,
    .create_opts          = &raw_create_opts,
    .bdrv_has_zero_init   = &raw_has_zero_init
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -651,14 +651,16 @@ static coroutine_fn void do_co_req(void *opaque)
    unsigned int *rlen = srco->rlen;

    co = qemu_coroutine_self();
-    aio_set_fd_handler(srco->aio_context, sockfd, NULL, restart_co_req, co);
+    aio_set_fd_handler(srco->aio_context, sockfd, false,
+                       NULL, restart_co_req, co);

    ret = send_co_req(sockfd, hdr, data, wlen);
    if (ret < 0) {
        goto out;
    }

-    aio_set_fd_handler(srco->aio_context, sockfd, restart_co_req, NULL, co);
+    aio_set_fd_handler(srco->aio_context, sockfd, false,
+                       restart_co_req, NULL, co);

    ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
    if (ret != sizeof(*hdr)) {
@@ -683,7 +685,8 @@ static coroutine_fn void do_co_req(void *opaque)
 out:
    /* there is at most one request for this sockfd, so it is safe to
     * set each handler to NULL. */
-    aio_set_fd_handler(srco->aio_context, sockfd, NULL, NULL, NULL);
+    aio_set_fd_handler(srco->aio_context, sockfd, false,
+                       NULL, NULL, NULL);

    srco->ret = ret;
    srco->finished = true;
@@ -735,7 +738,8 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
    BDRVSheepdogState *s = opaque;
    AIOReq *aio_req, *next;

-    aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
+    aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
+                       NULL, NULL);
    close(s->fd);
    s->fd = -1;

@@ -938,7 +942,8 @@ static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
        return fd;
    }

-    aio_set_fd_handler(s->aio_context, fd, co_read_response, NULL, s);
+    aio_set_fd_handler(s->aio_context, fd, false,
+                       co_read_response, NULL, s);
    return fd;
 }

@@ -1199,7 +1204,7 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,

    qemu_co_mutex_lock(&s->lock);
    s->co_send = qemu_coroutine_self();
-    aio_set_fd_handler(s->aio_context, s->fd,
+    aio_set_fd_handler(s->aio_context, s->fd, false,
                       co_read_response, co_write_request, s);
    socket_set_cork(s->fd, 1);

@@ -1218,7 +1223,8 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
    }
 out:
    socket_set_cork(s->fd, 0);
-    aio_set_fd_handler(s->aio_context, s->fd, co_read_response, NULL, s);
+    aio_set_fd_handler(s->aio_context, s->fd, false,
+                       co_read_response, NULL, s);
    s->co_send = NULL;
    qemu_co_mutex_unlock(&s->lock);
 }
@@ -1368,7 +1374,8 @@ static void sd_detach_aio_context(BlockDriverState *bs)
 {
    BDRVSheepdogState *s = bs->opaque;

-    aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
+    aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
+                       NULL, NULL);
 }

 static void sd_attach_aio_context(BlockDriverState *bs,
@@ -1377,7 +1384,8 @@ static void sd_attach_aio_context(BlockDriverState *bs,
    BDRVSheepdogState *s = bs->opaque;

    s->aio_context = new_context;
-    aio_set_fd_handler(new_context, s->fd, co_read_response, NULL, s);
+    aio_set_fd_handler(new_context, s->fd, false,
+                       co_read_response, NULL, s);
 }

 /* TODO Convert to fine grained options */
@@ -1490,7 +1498,8 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
    g_free(buf);
    return 0;
 out:
-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
+                       false, NULL, NULL, NULL);
    if (s->fd >= 0) {
        closesocket(s->fd);
    }
@@ -1528,7 +1537,8 @@ static void sd_reopen_commit(BDRVReopenState *state)
    BDRVSheepdogState *s = state->bs->opaque;

    if (s->fd) {
-        aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
+        aio_set_fd_handler(s->aio_context, s->fd, false,
+                           NULL, NULL, NULL);
        closesocket(s->fd);
    }

@@ -1551,7 +1561,8 @@ static void sd_reopen_abort(BDRVReopenState *state)
    }

    if (re_s->fd) {
-        aio_set_fd_handler(s->aio_context, re_s->fd, NULL, NULL, NULL);
+        aio_set_fd_handler(s->aio_context, re_s->fd, false,
+                           NULL, NULL, NULL);
        closesocket(re_s->fd);
    }

@@ -1935,7 +1946,8 @@ static void sd_close(BlockDriverState *bs)
        error_report("%s, %s", sd_strerror(rsp->result), s->name);
    }

-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
+                       false, NULL, NULL, NULL);
    closesocket(s->fd);
    g_free(s->host_spec);
 }
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -253,9 +253,9 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
    return -ENOTSUP;
 }

-void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs,
-                                        const char *id_or_name,
-                                        Error **errp)
+int bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs,
+                                       const char *id_or_name,
+                                       Error **errp)
 {
    int ret;
    Error *local_err = NULL;
@@ -270,6 +270,7 @@ void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs,
    if (ret < 0) {
        error_propagate(errp, local_err);
    }
+    return ret;
 }

 int bdrv_snapshot_list(BlockDriverState *bs,
@@ -356,3 +357,130 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,

    return ret;
 }
+
+
+/* Group operations. All block drivers are involved.
+ * These functions will properly handle dataplane (take aio_context_acquire
+ * when appropriate for appropriate block drivers) */
+
+bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
+{
+    bool ok = true;
+    BlockDriverState *bs = NULL;
+
+    while (ok && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bdrv_is_inserted(bs) && !bdrv_is_read_only(bs)) {
+            ok = bdrv_can_snapshot(bs);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return ok;
+}
+
+int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
+                             Error **err)
+{
+    int ret = 0;
+    BlockDriverState *bs = NULL;
+    QEMUSnapshotInfo sn1, *snapshot = &sn1;
+
+    while (ret == 0 && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bdrv_can_snapshot(bs) &&
+                bdrv_snapshot_find(bs, snapshot, name) >= 0) {
+            ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return ret;
+}
+
+
+int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
+{
+    int err = 0;
+    BlockDriverState *bs = NULL;
+
+    while (err == 0 && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bdrv_can_snapshot(bs)) {
+            err = bdrv_snapshot_goto(bs, name);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return err;
+}
+
+int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
+{
+    QEMUSnapshotInfo sn;
+    int err = 0;
+    BlockDriverState *bs = NULL;
+
+    while (err == 0 && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bdrv_can_snapshot(bs)) {
+            err = bdrv_snapshot_find(bs, &sn, name);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return err;
+}
+
+int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
+                             BlockDriverState *vm_state_bs,
+                             uint64_t vm_state_size,
+                             BlockDriverState **first_bad_bs)
+{
+    int err = 0;
+    BlockDriverState *bs = NULL;
+
+    while (err == 0 && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        if (bs == vm_state_bs) {
+            sn->vm_state_size = vm_state_size;
+            err = bdrv_snapshot_create(bs, sn);
+        } else if (bdrv_can_snapshot(bs)) {
+            sn->vm_state_size = 0;
+            err = bdrv_snapshot_create(bs, sn);
+        }
+        aio_context_release(ctx);
+    }
+
+    *first_bad_bs = bs;
+    return err;
+}
+
+BlockDriverState *bdrv_all_find_vmstate_bs(void)
+{
+    bool not_found = true;
+    BlockDriverState *bs = NULL;
+
+    while (not_found && (bs = bdrv_next(bs))) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        not_found = !bdrv_can_snapshot(bs);
+        aio_context_release(ctx);
+    }
+    return bs;
+}
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -800,14 +800,15 @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
            rd_handler, wr_handler);

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
-                       rd_handler, wr_handler, co);
+                       false, rd_handler, wr_handler, co);
 }

 static coroutine_fn void clear_fd_handler(BDRVSSHState *s,
                                          BlockDriverState *bs)
 {
    DPRINTF("s->sock=%d", s->sock);
-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, NULL, NULL, NULL);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
+                       false, NULL, NULL, NULL);
 }

 /* A non-blocking call returned EAGAIN, so yield, ensuring the
--- a/block/stream.c
+++ b/block/stream.c
@@ -16,6 +16,7 @@
 #include "block/blockjob.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
+#include "sysemu/block-backend.h"

 enum {
    /*
@@ -222,7 +223,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,

    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        !bdrv_iostatus_is_enabled(bs)) {
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-error");
        return;
    }
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -33,8 +33,7 @@
 * its own locking.
 *
 * This locking is however handled internally in this file, so it's
- * mostly transparent to outside users (but see the documentation in
- * throttle_groups_lock()).
+ * transparent to outside users.
 *
 * The whole ThrottleGroup structure is private and invisible to
 * outside users, that only use it through its ThrottleState.
@@ -76,9 +75,9 @@ static QTAILQ_HEAD(, ThrottleGroup) throttle_groups =
 * created.
 *
 * @name: the name of the ThrottleGroup
- * @ret:  the ThrottleGroup
+ * @ret:  the ThrottleState member of the ThrottleGroup
 */
-static ThrottleGroup *throttle_group_incref(const char *name)
+ThrottleState *throttle_group_incref(const char *name)
 {
    ThrottleGroup *tg = NULL;
    ThrottleGroup *iter;
@@ -108,7 +107,7 @@ static ThrottleGroup *throttle_group_incref(const char *name)

    qemu_mutex_unlock(&throttle_groups_lock);

-    return tg;
+    return &tg->ts;
 }

 /* Decrease the reference count of a ThrottleGroup.
@@ -116,10 +115,12 @@ static ThrottleGroup *throttle_group_incref(const char *name)
 * When the reference count reaches zero the ThrottleGroup is
 * destroyed.
 *
- * @tg:  The ThrottleGroup to unref
+ * @ts:  The ThrottleGroup to unref, given by its ThrottleState member
 */
-static void throttle_group_unref(ThrottleGroup *tg)
+void throttle_group_unref(ThrottleState *ts)
 {
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+
    qemu_mutex_lock(&throttle_groups_lock);
    if (--tg->refcount == 0) {
        QTAILQ_REMOVE(&throttle_groups, tg, list);
@@ -401,7 +402,8 @@ static void write_timer_cb(void *opaque)
 void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
 {
    int i;
-    ThrottleGroup *tg = throttle_group_incref(groupname);
+    ThrottleState *ts = throttle_group_incref(groupname);
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    int clock_type = QEMU_CLOCK_REALTIME;

    if (qtest_enabled()) {
@@ -409,7 +411,7 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
        clock_type = QEMU_CLOCK_VIRTUAL;
    }

-    bs->throttle_state = &tg->ts;
+    bs->throttle_state = ts;

    qemu_mutex_lock(&tg->lock);
    /* If the ThrottleGroup is new set this BlockDriverState as the token */
@@ -435,6 +437,9 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
 * list, destroying the timers and setting the throttle_state pointer
 * to NULL.
 *
+ * The BlockDriverState must not have pending throttled requests, so
+ * the caller has to drain them first.
+ *
 * The group will be destroyed if it's empty after this operation.
 *
 * @bs: the BlockDriverState to remove
@@ -444,6 +449,10 @@ void throttle_group_unregister_bs(BlockDriverState *bs)
    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
    int i;

+    assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0);
+    assert(qemu_co_queue_empty(&bs->throttled_reqs[0]));
+    assert(qemu_co_queue_empty(&bs->throttled_reqs[1]));
+
    qemu_mutex_lock(&tg->lock);
    for (i = 0; i < 2; i++) {
        if (tg->tokens[i] == bs) {
@@ -461,38 +470,10 @@ void throttle_group_unregister_bs(BlockDriverState *bs)
    throttle_timers_destroy(&bs->throttle_timers);
    qemu_mutex_unlock(&tg->lock);

-    throttle_group_unref(tg);
+    throttle_group_unref(&tg->ts);
    bs->throttle_state = NULL;
 }

-/* Acquire the lock of this throttling group.
- *
- * You won't normally need to use this. None of the functions from the
- * ThrottleGroup API require you to acquire the lock since all of them
- * deal with it internally.
- *
- * This should only be used in exceptional cases when you want to
- * access the protected fields of a BlockDriverState directly
- * (e.g. bdrv_swap()).
- *
- * @bs: a BlockDriverState that is member of the group
- */
-void throttle_group_lock(BlockDriverState *bs)
-{
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
-    qemu_mutex_lock(&tg->lock);
-}
-
-/* Release the lock of this throttling group.
- *
- * See the comments in throttle_group_lock().
- */
-void throttle_group_unlock(BlockDriverState *bs)
-{
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
-    qemu_mutex_unlock(&tg->lock);
-}
-
 static void throttle_groups_init(void)
 {
    qemu_mutex_init(&throttle_groups_lock);
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -53,7 +53,7 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"

 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2161,19 +2161,19 @@ static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
    ImageInfoList **next;

    *spec_info = (ImageInfoSpecific){
-        .kind = IMAGE_INFO_SPECIFIC_KIND_VMDK,
+        .type = IMAGE_INFO_SPECIFIC_KIND_VMDK,
        {
            .vmdk = g_new0(ImageInfoSpecificVmdk, 1),
        },
    };

-    *spec_info->vmdk = (ImageInfoSpecificVmdk) {
+    *spec_info->u.vmdk = (ImageInfoSpecificVmdk) {
        .create_type = g_strdup(s->create_type),
        .cid = s->cid,
        .parent_cid = s->parent_cid,
    };

-    next = &spec_info->vmdk->extents;
+    next = &spec_info->u.vmdk->extents;
    for (i = 0; i < s->num_extents; i++) {
        *next = g_new0(ImageInfoList, 1);
        (*next)->value = vmdk_get_extent_info(&s->extents[i]);
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -174,7 +174,7 @@ int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile)
 void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *old_context)
 {
-    aio_set_event_notifier(old_context, &aio->e, NULL);
+    aio_set_event_notifier(old_context, &aio->e, false, NULL);
    aio->is_aio_context_attached = false;
 }

@@ -182,7 +182,8 @@ void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *new_context)
 {
    aio->is_aio_context_attached = true;
-    aio_set_event_notifier(new_context, &aio->e, win32_aio_completion_cb);
+    aio_set_event_notifier(new_context, &aio->e, false,
+                           win32_aio_completion_cb);
 }

 QEMUWin32AIOState *win32_aio_init(void)
--- a/block/write-threshold.c
+++ b/block/write-threshold.c
@@ -11,7 +11,7 @@
 */

 #include "block/block_int.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "block/write-threshold.h"
 #include "qemu/notify.h"
 #include "qapi-event.h"
--- a/blockdev.c
+++ b/blockdev.c
--- a/blockjob.c
+++ b/blockjob.c
@@ -29,13 +29,27 @@
 #include "block/block.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qapi/qmp/qerror.h"
 #include "qapi/qmp/qjson.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "qmp-commands.h"
 #include "qemu/timer.h"
 #include "qapi-event.h"

+/* Transactional group of block jobs */
+struct BlockJobTxn {
+
+    /* Is this txn being cancelled? */
+    bool aborting;
+
+    /* List of jobs */
+    QLIST_HEAD(, BlockJob) jobs;
+
+    /* Reference count */
+    int refcnt;
+};
+
 void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
                       int64_t speed, BlockCompletionFunc *cb,
                       void *opaque, Error **errp)
@@ -59,6 +73,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
    job->cb            = cb;
    job->opaque        = opaque;
    job->busy          = true;
+    job->refcnt        = 1;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
@@ -67,7 +82,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,

        block_job_set_speed(job, speed, &local_err);
        if (local_err) {
-            block_job_release(bs);
+            block_job_unref(job);
            error_propagate(errp, local_err);
            return NULL;
        }
@@ -75,15 +90,101 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
    return job;
 }

-void block_job_release(BlockDriverState *bs)
+void block_job_ref(BlockJob *job)
 {
-    BlockJob *job = bs->job;
+    ++job->refcnt;
+}

-    bs->job = NULL;
-    bdrv_op_unblock_all(bs, job->blocker);
-    error_free(job->blocker);
-    g_free(job->id);
-    g_free(job);
+void block_job_unref(BlockJob *job)
+{
+    if (--job->refcnt == 0) {
+        job->bs->job = NULL;
+        bdrv_op_unblock_all(job->bs, job->blocker);
+        bdrv_unref(job->bs);
+        error_free(job->blocker);
+        g_free(job->id);
+        g_free(job);
+    }
+}
+
+static void block_job_completed_single(BlockJob *job)
+{
+    if (!job->ret) {
+        if (job->driver->commit) {
+            job->driver->commit(job);
+        }
+    } else {
+        if (job->driver->abort) {
+            job->driver->abort(job);
+        }
+    }
+    job->cb(job->opaque, job->ret);
+    if (job->txn) {
+        block_job_txn_unref(job->txn);
+    }
+    block_job_unref(job);
+}
+
+static void block_job_completed_txn_abort(BlockJob *job)
+{
+    AioContext *ctx;
+    BlockJobTxn *txn = job->txn;
+    BlockJob *other_job, *next;
+
+    if (txn->aborting) {
+        /*
+         * We are cancelled by another job, which will handle everything.
+         */
+        return;
+    }
+    txn->aborting = true;
+    /* We are the first failed job. Cancel other jobs. */
+    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
+        ctx = bdrv_get_aio_context(other_job->bs);
+        aio_context_acquire(ctx);
+    }
+    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
+        if (other_job == job || other_job->completed) {
+            /* Other jobs are "effectively" cancelled by us, set the status for
+             * them; this job, however, may or may not be cancelled, depending
+             * on the caller, so leave it. */
+            if (other_job != job) {
+                other_job->cancelled = true;
+            }
+            continue;
+        }
+        block_job_cancel_sync(other_job);
+        assert(other_job->completed);
+    }
+    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
+        ctx = bdrv_get_aio_context(other_job->bs);
+        block_job_completed_single(other_job);
+        aio_context_release(ctx);
+    }
+}
+
+static void block_job_completed_txn_success(BlockJob *job)
+{
+    AioContext *ctx;
+    BlockJobTxn *txn = job->txn;
+    BlockJob *other_job, *next;
+    /*
+     * Successful completion, see if there are other running jobs in this
+     * txn.
+     */
+    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
+        if (!other_job->completed) {
+            return;
+        }
+    }
+    /* We are the last completed job, commit the transaction. */
+    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
+        ctx = bdrv_get_aio_context(other_job->bs);
+        aio_context_acquire(ctx);
+        assert(other_job->ret == 0);
+        block_job_completed_single(other_job);
+        aio_context_release(ctx);
+    }
 }

 void block_job_completed(BlockJob *job, int ret)
@@ -91,8 +192,16 @@ void block_job_completed(BlockJob *job, int ret)
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
-    job->cb(job->opaque, ret);
-    block_job_release(bs);
+    assert(!job->completed);
+    job->completed = true;
+    job->ret = ret;
+    if (!job->txn) {
+        block_job_completed_single(job);
+    } else if (ret < 0 || block_job_is_cancelled(job)) {
+        block_job_completed_txn_abort(job);
+    } else {
+        block_job_completed_txn_success(job);
+    }
 }

 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -177,43 +286,29 @@ struct BlockFinishData {
    int ret;
 };

-static void block_job_finish_cb(void *opaque, int ret)
-{
-    struct BlockFinishData *data = opaque;
-
-    data->cancelled = block_job_is_cancelled(data->job);
-    data->ret = ret;
-    data->cb(data->opaque, ret);
-}
-
 static int block_job_finish_sync(BlockJob *job,
                                 void (*finish)(BlockJob *, Error **errp),
                                 Error **errp)
 {
-    struct BlockFinishData data;
    BlockDriverState *bs = job->bs;
    Error *local_err = NULL;
+    int ret;

    assert(bs->job == job);

-    /* Set up our own callback to store the result and chain to
-     * the original callback.
-     */
-    data.job = job;
-    data.cb = job->cb;
-    data.opaque = job->opaque;
-    data.ret = -EINPROGRESS;
-    job->cb = block_job_finish_cb;
-    job->opaque = &data;
+    block_job_ref(job);
    finish(job, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
+        block_job_unref(job);
        return -EBUSY;
    }
-    while (data.ret == -EINPROGRESS) {
+    while (!job->completed) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }
-    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
+    ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
+    block_job_unref(job);
+    return ret;
 }

 /* A wrapper around block_job_cancel() taking an Error ** parameter so it may be
@@ -354,8 +449,8 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
        job->user_paused = true;
        block_job_pause(job);
        block_job_iostatus_set_err(job, error);
-        if (bs != job->bs) {
-            bdrv_iostatus_set_err(bs, error);
+        if (bs->blk && bs != job->bs) {
+            blk_iostatus_set_err(bs->blk, error);
        }
    }
    return action;
@@ -405,3 +500,36 @@ void block_job_defer_to_main_loop(BlockJob *job,

    qemu_bh_schedule(data->bh);
 }
+
+BlockJobTxn *block_job_txn_new(void)
+{
+    BlockJobTxn *txn = g_new0(BlockJobTxn, 1);
+    QLIST_INIT(&txn->jobs);
+    txn->refcnt = 1;
+    return txn;
+}
+
+static void block_job_txn_ref(BlockJobTxn *txn)
+{
+    txn->refcnt++;
+}
+
+void block_job_txn_unref(BlockJobTxn *txn)
+{
+    if (txn && --txn->refcnt == 0) {
+        g_free(txn);
+    }
+}
+
+void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job)
+{
+    if (!txn) {
+        return;
+    }
+
+    assert(!job->txn);
+    job->txn = txn;
+
+    QLIST_INSERT_HEAD(&txn->jobs, job, txn_list);
+    block_job_txn_ref(txn);
+}
--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@@ -740,8 +740,7 @@ static void padzero(abi_ulong elf_bss, abi_ulong last_bss)
           size must be known */
        if (qemu_real_host_page_size < qemu_host_page_size) {
            abi_ulong end_addr, end_addr1;
-            end_addr1 = (elf_bss + qemu_real_host_page_size - 1) &
-                ~(qemu_real_host_page_size - 1);
+            end_addr1 = REAL_HOST_PAGE_ALIGN(elf_bss);
            end_addr = HOST_PAGE_ALIGN(elf_bss);
            if (end_addr1 < end_addr) {
                mmap((void *)g2h(end_addr1), end_addr - end_addr1,
--- a/246
+++ b/246
@@ -8,6 +8,9 @@
 CLICOLOR_FORCE= GREP_OPTIONS=
 unset CLICOLOR_FORCE GREP_OPTIONS

+# Don't allow CCACHE, if present, to use cached results of compile tests!
+export CCACHE_RECACHE=yes
+
 # Temporary directory used for files created while
 # configure runs. Since it is in the build directory
 # we can safely blow away any previous version of it
@@ -261,6 +264,7 @@ rdma=""
 gprof="no"
 debug_tcg="no"
 debug="no"
+fortify_source=""
 strip_opt="yes"
 tcg_interpreter="no"
 bigendian="no"
@@ -331,6 +335,8 @@ gtkabi=""
 gtk_gl="no"
 gnutls=""
 gnutls_hash=""
+nettle=""
+gcrypt=""
 vte=""
 virglrenderer=""
 tpm="yes"
@@ -417,9 +423,6 @@ if test "$debug_info" = "yes"; then
    LDFLAGS="-g $LDFLAGS"
 fi

-test_cflags=""
-test_libs=""
-
 # make source path absolute
 source_path=`cd "$source_path"; pwd`

@@ -724,6 +727,8 @@ if test "$mingw32" = "yes" ; then
  QEMU_CFLAGS="-DWIN32_LEAN_AND_MEAN -DWINVER=0x501 $QEMU_CFLAGS"
  # enable C99/POSIX format strings (needs mingw32-runtime 3.15 or later)
  QEMU_CFLAGS="-D__USE_MINGW_ANSI_STDIO=1 $QEMU_CFLAGS"
+  # MinGW needs -mthreads for TLS and macro _MT.
+  QEMU_CFLAGS="-mthreads $QEMU_CFLAGS"
  LIBS="-lwinmm -lws2_32 -liphlpapi $LIBS"
  write_c_skeleton;
  if compile_prog "" "-liberty" ; then
@@ -788,6 +793,9 @@ for opt do
  --enable-modules)
      modules="yes"
  ;;
+  --disable-modules)
+      modules="no"
+  ;;
  --cpu=*)
  ;;
  --target-list=*) target_list="$optarg"
@@ -877,6 +885,7 @@ for opt do
      debug_tcg="yes"
      debug="yes"
      strip_opt="no"
+      fortify_source="no"
  ;;
  --enable-sparse) sparse="yes"
  ;;
@@ -1114,6 +1123,14 @@ for opt do
  ;;
  --enable-gnutls) gnutls="yes"
  ;;
+  --disable-nettle) nettle="no"
+  ;;
+  --enable-nettle) nettle="yes"
+  ;;
+  --disable-gcrypt) gcrypt="no"
+  ;;
+  --enable-gcrypt) gcrypt="yes"
+  ;;
  --enable-rdma) rdma="yes"
  ;;
  --disable-rdma) rdma="no"
@@ -1324,6 +1341,8 @@ disabled with --disable-FEATURE, default is enabled if available:
  sparse          sparse checker

  gnutls          GNUTLS cryptography support
+  nettle          nettle cryptography support
+  gcrypt          libgcrypt cryptography support
  sdl             SDL UI
  --with-sdlabi     select preferred SDL ABI 1.2 or 2.0
  gtk             gtk UI
@@ -1331,7 +1350,6 @@ disabled with --disable-FEATURE, default is enabled if available:
  vte             vte support for the gtk UI
  curses          curses UI
  vnc             VNC UI support
-  vnc-tls         TLS encryption for VNC server
  vnc-sasl        SASL encryption for VNC server
  vnc-jpeg        JPEG lossy compression for VNC server
  vnc-png         PNG compression for VNC server
@@ -1410,6 +1428,9 @@ if compile_object ; then
 else
    error_exit "\"$cc\" either does not exist or does not work"
 fi
+if ! compile_prog ; then
+    error_exit "\"$cc\" cannot build an executable (is your linker broken?)"
+fi

 # Check that the C++ compiler exists and works with the C compiler
 if has $cxx; then
@@ -1470,6 +1491,16 @@ for flag in $gcc_flags; do
 done

 if test "$stack_protector" != "no"; then
+  cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+    char arr[64], *p = arr, *c = argv[0];
+    while (*c) {
+        *p++ = *c++;
+    }
+    return 0;
+}
+EOF
  gcc_flags="-fstack-protector-strong -fstack-protector-all"
  sp_on=0
  for flag in $gcc_flags; do
@@ -1872,16 +1903,34 @@ fi
 # libseccomp check

 if test "$seccomp" != "no" ; then
-    if test "$cpu" = "i386" || test "$cpu" = "x86_64" &&
-        $pkg_config --atleast-version=2.1.1 libseccomp; then
+    case "$cpu" in
+    i386|x86_64)
+        libseccomp_minver="2.1.0"
+        ;;
+    arm|aarch64)
+        libseccomp_minver="2.2.3"
+        ;;
+    *)
+        libseccomp_minver=""
+        ;;
+    esac
+
+    if test "$libseccomp_minver" != "" &&
+       $pkg_config --atleast-version=$libseccomp_minver libseccomp ; then
        libs_softmmu="$libs_softmmu `$pkg_config --libs libseccomp`"
        QEMU_CFLAGS="$QEMU_CFLAGS `$pkg_config --cflags libseccomp`"
-	seccomp="yes"
+        seccomp="yes"
    else
-	if test "$seccomp" = "yes"; then
-            feature_not_found "libseccomp" "Install libseccomp devel >= 2.1.1"
-	fi
-	seccomp="no"
+        if test "$seccomp" = "yes" ; then
+            if test "$libseccomp_minver" != "" ; then
+                feature_not_found "libseccomp" \
+                    "Install libseccomp devel >= $libseccomp_minver"
+            else
+                feature_not_found "libseccomp" \
+                    "libseccomp is not supported for host cpu $cpu"
+            fi
+        fi
+        seccomp="no"
    fi
 fi
 ##########################################
@@ -1912,6 +1961,23 @@ EOF
  elif
      cat > $TMPC <<EOF &&
 #include <xenctrl.h>
+#include <stdint.h>
+int main(void) {
+  xc_interface *xc = NULL;
+  xen_domain_handle_t handle;
+  xc_domain_create(xc, 0, handle, 0, NULL, NULL);
+  return 0;
+}
+EOF
+      compile_prog "" "$xen_libs"
+    then
+    xen_ctrl_version=470
+    xen=yes
+
+  # Xen 4.6
+  elif
+      cat > $TMPC <<EOF &&
+#include <xenctrl.h>
 #include <xenstore.h>
 #include <stdint.h>
 #include <xen/hvm/hvm_info_table.h>
@@ -2254,20 +2320,76 @@ else
    gnutls_hash="no"
 fi

-if test "$gnutls_gcrypt" != "no"; then
-    if has "libgcrypt-config"; then
+
+# If user didn't give a --disable/enable-gcrypt flag,
+# then mark as disabled if user requested nettle
+# explicitly, or if gnutls links to nettle
+if test -z "$gcrypt"
+then
+    if test "$nettle" = "yes" || test "$gnutls_nettle" = "yes"
+    then
+        gcrypt="no"
+    fi
+fi
+
+# If user didn't give a --disable/enable-nettle flag,
+# then mark as disabled if user requested gcrypt
+# explicitly, or if gnutls links to gcrypt
+if test -z "$nettle"
+then
+    if test "$gcrypt" = "yes" || test "$gnutls_gcrypt" = "yes"
+    then
+        nettle="no"
+    fi
+fi
+
+has_libgcrypt_config() {
+    if ! has "libgcrypt-config"
+    then
+	return 1
+    fi
+
+    if test -n "$cross_prefix"
+    then
+	host=`libgcrypt-config --host`
+	if test "$host-" != $cross_prefix
+	then
+	    return 1
+	fi
+    fi
+
+    return 0
+}
+
+if test "$gcrypt" != "no"; then
+    if has_libgcrypt_config; then
        gcrypt_cflags=`libgcrypt-config --cflags`
        gcrypt_libs=`libgcrypt-config --libs`
+        # Debian has remove -lgpg-error from libgcrypt-config
+        # as it "spreads unnecessary dependencies" which in
+        # turn breaks static builds...
+        if test "$static" = "yes"
+        then
+            gcrypt_libs="$gcrypt_libs -lgpg-error"
+        fi
        libs_softmmu="$gcrypt_libs $libs_softmmu"
        libs_tools="$gcrypt_libs $libs_tools"
        QEMU_CFLAGS="$QEMU_CFLAGS $gcrypt_cflags"
+        gcrypt="yes"
+        if test -z "$nettle"; then
+           nettle="no"
+        fi
    else
-        feature_not_found "gcrypt" "Install gcrypt devel"
+        if test "$gcrypt" = "yes"; then
+            feature_not_found "gcrypt" "Install gcrypt devel"
+        else
+            gcrypt="no"
+        fi
    fi
 fi


-if test "$gnutls_nettle" != "no"; then
+if test "$nettle" != "no"; then
    if $pkg_config --exists "nettle"; then
        nettle_cflags=`$pkg_config --cflags nettle`
        nettle_libs=`$pkg_config --libs nettle`
@@ -2275,20 +2397,30 @@ if test "$gnutls_nettle" != "no"; then
        libs_softmmu="$nettle_libs $libs_softmmu"
        libs_tools="$nettle_libs $libs_tools"
        QEMU_CFLAGS="$QEMU_CFLAGS $nettle_cflags"
+        nettle="yes"
    else
-        feature_not_found "nettle" "Install nettle devel"
+        if test "$nettle" = "yes"; then
+            feature_not_found "nettle" "Install nettle devel"
+        else
+            nettle="no"
+        fi
    fi
 fi

+if test "$gcrypt" = "yes" && test "$nettle" = "yes"
+then
+    error_exit "Only one of gcrypt & nettle can be enabled"
+fi
+
 ##########################################
 # libtasn1 - only for the TLS creds/session test suite

 tasn1=yes
+tasn1_cflags=""
+tasn1_libs=""
 if $pkg_config --exists "libtasn1"; then
    tasn1_cflags=`$pkg_config --cflags libtasn1`
    tasn1_libs=`$pkg_config --libs libtasn1`
-    test_cflags="$test_cflags $tasn1_cflags"
-    test_libs="$test_libs $tasn1_libs"
 else
    tasn1=no
 fi
@@ -3211,25 +3343,11 @@ fi
 libs_softmmu="$libs_softmmu $fdt_libs"

 ##########################################
-# opengl probe (for sdl2, milkymist-tmu2)
-
-# GLX probe, used by milkymist-tmu2
-# this is temporary, code will be switched to egl mid-term.
-cat > $TMPC << EOF
-#include <X11/Xlib.h>
-#include <GL/gl.h>
-#include <GL/glx.h>
-int main(void) { glBegin(0); glXQueryVersion(0,0,0); return 0; }
-EOF
-if compile_prog "" "-lGL -lX11" ; then
-  have_glx=yes
-else
-  have_glx=no
-fi
+# opengl probe (for sdl2, gtk, milkymist-tmu2)

 if test "$opengl" != "no" ; then
-  opengl_pkgs="gl glesv2 epoxy egl"
-  if $pkg_config $opengl_pkgs x11 && test "$have_glx" = "yes"; then
+  opengl_pkgs="epoxy"
+  if $pkg_config $opengl_pkgs x11; then
    opengl_cflags="$($pkg_config --cflags $opengl_pkgs) $x11_cflags"
    opengl_libs="$($pkg_config --libs $opengl_pkgs) $x11_libs"
    opengl=yes
@@ -3491,6 +3609,22 @@ if compile_prog "" "" ; then
  eventfd=yes
 fi

+# check if memfd is supported
+memfd=no
+cat > $TMPC << EOF
+#include <sys/memfd.h>
+
+int main(void)
+{
+    return memfd_create("foo", MFD_ALLOW_SEALING);
+}
+EOF
+if compile_prog "" "" ; then
+  memfd=yes
+fi
+
+
+
 # check for fallocate
 fallocate=no
 cat > $TMPC << EOF
@@ -4321,6 +4455,7 @@ fi
 # check if ccache is interfering with
 # semantic analysis of macros

+unset CCACHE_CPP2
 ccache_cpp2=no
 cat > $TMPC << EOF
 static const int Z = 1;
@@ -4344,6 +4479,20 @@ if ! compile_object "-Werror"; then
    ccache_cpp2=yes
 fi

+#################################################
+# clang does not support glibc + FORTIFY_SOURCE.
+
+if test "$fortify_source" != "no"; then
+  if echo | $cc -dM -E - | grep __clang__ > /dev/null 2>&1 ; then
+    fortify_source="no";
+  elif test -n "$cxx" &&
+       echo | $cxx -dM -E - | grep __clang__ >/dev/null 2>&1 ; then
+    fortify_source="no";
+  else
+    fortify_source="yes"
+  fi
+fi
+
 ##########################################
 # End of CC checks
 # After here, no more $cc or $ld runs
@@ -4351,8 +4500,10 @@ fi
 if test "$gcov" = "yes" ; then
  CFLAGS="-fprofile-arcs -ftest-coverage -g $CFLAGS"
  LDFLAGS="-fprofile-arcs -ftest-coverage $LDFLAGS"
-elif test "$debug" = "no" ; then
+elif test "$fortify_source" = "yes" ; then
  CFLAGS="-O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 $CFLAGS"
+elif test "$debug" = "no"; then
+  CFLAGS="-O2 $CFLAGS"
 fi

 ##########################################
@@ -4417,6 +4568,7 @@ if test "$want_tools" = "yes" ; then
  tools="qemu-img\$(EXESUF) qemu-io\$(EXESUF) $tools"
  if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" ] ; then
    tools="qemu-nbd\$(EXESUF) $tools"
+    tools="ivshmem-client\$(EXESUF) ivshmem-server\$(EXESUF) $tools"
  fi
 fi
 if test "$softmmu" = yes ; then
@@ -4437,7 +4589,7 @@ fi

 if [ "$guest_agent" != "no" ]; then
  if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" -o "$mingw32" = "yes" ] ; then
-      tools="qemu-ga\$(EXESUF) $tools"
+      tools="qemu-ga $tools"
      guest_agent=yes
  elif [ "$guest_agent" != yes ]; then
      guest_agent=no
@@ -4605,8 +4757,8 @@ echo "GTK support       $gtk"
 echo "GTK GL support    $gtk_gl"
 echo "GNUTLS support    $gnutls"
 echo "GNUTLS hash       $gnutls_hash"
-echo "GNUTLS gcrypt     $gnutls_gcrypt"
-echo "GNUTLS nettle     $gnutls_nettle ${gnutls_nettle+($nettle_version)}"
+echo "libgcrypt         $gcrypt"
+echo "nettle            $nettle ${nettle+($nettle_version)}"
 echo "libtasn1          $tasn1"
 echo "VTE support       $vte"
 echo "curses support    $curses"
@@ -4885,6 +5037,9 @@ fi
 if test "$eventfd" = "yes" ; then
  echo "CONFIG_EVENTFD=y" >> $config_host_mak
 fi
+if test "$memfd" = "yes" ; then
+  echo "CONFIG_MEMFD=y" >> $config_host_mak
+fi
 if test "$fallocate" = "yes" ; then
  echo "CONFIG_FALLOCATE=y" >> $config_host_mak
 fi
@@ -4972,11 +5127,11 @@ fi
 if test "$gnutls_hash" = "yes" ; then
  echo "CONFIG_GNUTLS_HASH=y" >> $config_host_mak
 fi
-if test "$gnutls_gcrypt" = "yes" ; then
-  echo "CONFIG_GNUTLS_GCRYPT=y" >> $config_host_mak
+if test "$gcrypt" = "yes" ; then
+  echo "CONFIG_GCRYPT=y" >> $config_host_mak
 fi
-if test "$gnutls_nettle" = "yes" ; then
-  echo "CONFIG_GNUTLS_NETTLE=y" >> $config_host_mak
+if test "$nettle" = "yes" ; then
+  echo "CONFIG_NETTLE=y" >> $config_host_mak
  echo "CONFIG_NETTLE_VERSION_MAJOR=${nettle_version%%.*}" >> $config_host_mak
 fi
 if test "$tasn1" = "yes" ; then
@@ -5311,8 +5466,8 @@ echo "EXESUF=$EXESUF" >> $config_host_mak
 echo "DSOSUF=$DSOSUF" >> $config_host_mak
 echo "LDFLAGS_SHARED=$LDFLAGS_SHARED" >> $config_host_mak
 echo "LIBS_QGA+=$libs_qga" >> $config_host_mak
-echo "TEST_LIBS=$test_libs" >> $config_host_mak
-echo "TEST_CFLAGS=$test_cflags" >> $config_host_mak
+echo "TASN1_LIBS=$tasn1_libs" >> $config_host_mak
+echo "TASN1_CFLAGS=$tasn1_cflags" >> $config_host_mak
 echo "POD2MAN=$POD2MAN" >> $config_host_mak
 echo "TRANSLATE_OPT_CFLAGS=$TRANSLATE_OPT_CFLAGS" >> $config_host_mak
 if test "$gcov" = "yes" ; then
@@ -5558,6 +5713,7 @@ case "$target_name" in
      echo "CONFIG_KVM=y" >> $config_target_mak
      if test "$vhost_net" = "yes" ; then
        echo "CONFIG_VHOST_NET=y" >> $config_target_mak
+        echo "CONFIG_VHOST_NET_TEST_$target_name=y" >> $config_host_mak
      fi
    fi
 esac
--- a/contrib/ivshmem-client/Makefile.objs
+++ b/contrib/ivshmem-client/Makefile.objs
@@ -0,0 +1 @@
+ivshmem-client-obj-y = ivshmem-client.o main.o
--- a/contrib/ivshmem-client/ivshmem-client.c
+++ b/contrib/ivshmem-client/ivshmem-client.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "qemu-common.h"
+#include "qemu/queue.h"
+
+#include "ivshmem-client.h"
+
+/* log a message on stdout if verbose=1 */
+#define IVSHMEM_CLIENT_DEBUG(client, fmt, ...) do { \
+        if ((client)->verbose) {         \
+            printf(fmt, ## __VA_ARGS__); \
+        }                                \
+    } while (0)
+
+/* read message from the unix socket */
+static int
+ivshmem_client_read_one_msg(IvshmemClient *client, int64_t *index, int *fd)
+{
+    int ret;
+    struct msghdr msg;
+    struct iovec iov[1];
+    union {
+        struct cmsghdr cmsg;
+        char control[CMSG_SPACE(sizeof(int))];
+    } msg_control;
+    struct cmsghdr *cmsg;
+
+    iov[0].iov_base = index;
+    iov[0].iov_len = sizeof(*index);
+
+    memset(&msg, 0, sizeof(msg));
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 1;
+    msg.msg_control = &msg_control;
+    msg.msg_controllen = sizeof(msg_control);
+
+    ret = recvmsg(client->sock_fd, &msg, 0);
+    if (ret < sizeof(*index)) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot read message: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+    if (ret == 0) {
+        IVSHMEM_CLIENT_DEBUG(client, "lost connection to server\n");
+        return -1;
+    }
+
+    *index = GINT64_FROM_LE(*index);
+    *fd = -1;
+
+    for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+
+        if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)) ||
+            cmsg->cmsg_level != SOL_SOCKET ||
+            cmsg->cmsg_type != SCM_RIGHTS) {
+            continue;
+        }
+
+        memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd));
+    }
+
+    return 0;
+}
+
+/* free a peer when the server advertises a disconnection or when the
+ * client is freed */
+static void
+ivshmem_client_free_peer(IvshmemClient *client, IvshmemClientPeer *peer)
+{
+    unsigned vector;
+
+    QTAILQ_REMOVE(&client->peer_list, peer, next);
+    for (vector = 0; vector < peer->vectors_count; vector++) {
+        close(peer->vectors[vector]);
+    }
+
+    g_free(peer);
+}
+
+/* handle message coming from server (new peer, new vectors) */
+static int
+ivshmem_client_handle_server_msg(IvshmemClient *client)
+{
+    IvshmemClientPeer *peer;
+    int64_t peer_id;
+    int ret, fd;
+
+    ret = ivshmem_client_read_one_msg(client, &peer_id, &fd);
+    if (ret < 0) {
+        return -1;
+    }
+
+    /* can return a peer or the local client */
+    peer = ivshmem_client_search_peer(client, peer_id);
+
+    /* delete peer */
+    if (fd == -1) {
+
+        if (peer == NULL || peer == &client->local) {
+            IVSHMEM_CLIENT_DEBUG(client, "receive delete for invalid "
+                                 "peer %" PRId64 "\n", peer_id);
+            return -1;
+        }
+
+        IVSHMEM_CLIENT_DEBUG(client, "delete peer id = %" PRId64 "\n", peer_id);
+        ivshmem_client_free_peer(client, peer);
+        return 0;
+    }
+
+    /* new peer */
+    if (peer == NULL) {
+        peer = g_malloc0(sizeof(*peer));
+        peer->id = peer_id;
+        peer->vectors_count = 0;
+        QTAILQ_INSERT_TAIL(&client->peer_list, peer, next);
+        IVSHMEM_CLIENT_DEBUG(client, "new peer id = %" PRId64 "\n", peer_id);
+    }
+
+    /* new vector */
+    IVSHMEM_CLIENT_DEBUG(client, "  new vector %d (fd=%d) for peer id %"
+                         PRId64 "\n", peer->vectors_count, fd, peer->id);
+    if (peer->vectors_count >= G_N_ELEMENTS(peer->vectors)) {
+        IVSHMEM_CLIENT_DEBUG(client, "Too many vectors received, failing");
+        return -1;
+    }
+
+    peer->vectors[peer->vectors_count] = fd;
+    peer->vectors_count++;
+
+    return 0;
+}
+
+/* init a new ivshmem client */
+int
+ivshmem_client_init(IvshmemClient *client, const char *unix_sock_path,
+                    IvshmemClientNotifCb notif_cb, void *notif_arg,
+                    bool verbose)
+{
+    int ret;
+    unsigned i;
+
+    memset(client, 0, sizeof(*client));
+
+    ret = snprintf(client->unix_sock_path, sizeof(client->unix_sock_path),
+                   "%s", unix_sock_path);
+
+    if (ret < 0 || ret >= sizeof(client->unix_sock_path)) {
+        IVSHMEM_CLIENT_DEBUG(client, "could not copy unix socket path\n");
+        return -1;
+    }
+
+    for (i = 0; i < IVSHMEM_CLIENT_MAX_VECTORS; i++) {
+        client->local.vectors[i] = -1;
+    }
+
+    QTAILQ_INIT(&client->peer_list);
+    client->local.id = -1;
+
+    client->notif_cb = notif_cb;
+    client->notif_arg = notif_arg;
+    client->verbose = verbose;
+    client->shm_fd = -1;
+    client->sock_fd = -1;
+
+    return 0;
+}
+
+/* create and connect to the unix socket */
+int
+ivshmem_client_connect(IvshmemClient *client)
+{
+    struct sockaddr_un sun;
+    int fd, ret;
+    int64_t tmp;
+
+    IVSHMEM_CLIENT_DEBUG(client, "connect to client %s\n",
+                         client->unix_sock_path);
+
+    client->sock_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (client->sock_fd < 0) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot create socket: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+
+    sun.sun_family = AF_UNIX;
+    ret = snprintf(sun.sun_path, sizeof(sun.sun_path), "%s",
+                   client->unix_sock_path);
+    if (ret < 0 || ret >= sizeof(sun.sun_path)) {
+        IVSHMEM_CLIENT_DEBUG(client, "could not copy unix socket path\n");
+        goto err_close;
+    }
+
+    if (connect(client->sock_fd, (struct sockaddr *)&sun, sizeof(sun)) < 0) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot connect to %s: %s\n", sun.sun_path,
+                             strerror(errno));
+        goto err_close;
+    }
+
+    /* first, we expect a protocol version */
+    if (ivshmem_client_read_one_msg(client, &tmp, &fd) < 0 ||
+        (tmp != IVSHMEM_PROTOCOL_VERSION) || fd != -1) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot read from server\n");
+        goto err_close;
+    }
+
+    /* then, we expect our index + a fd == -1 */
+    if (ivshmem_client_read_one_msg(client, &client->local.id, &fd) < 0 ||
+        client->local.id < 0 || fd != -1) {
+        IVSHMEM_CLIENT_DEBUG(client, "cannot read from server (2)\n");
+        goto err_close;
+    }
+    IVSHMEM_CLIENT_DEBUG(client, "our_id=%" PRId64 "\n", client->local.id);
+
+    /* now, we expect shared mem fd + a -1 index, note that shm fd
+     * is not used */
+    if (ivshmem_client_read_one_msg(client, &tmp, &fd) < 0 ||
+        tmp != -1 || fd < 0) {
+        if (fd >= 0) {
+            close(fd);
+        }
+        IVSHMEM_CLIENT_DEBUG(client, "cannot read from server (3)\n");
+        goto err_close;
+    }
+    client->shm_fd = fd;
+    IVSHMEM_CLIENT_DEBUG(client, "shm_fd=%d\n", fd);
+
+    return 0;
+
+err_close:
+    close(client->sock_fd);
+    client->sock_fd = -1;
+    return -1;
+}
+
+/* close connection to the server, and free all peer structures */
+void
+ivshmem_client_close(IvshmemClient *client)
+{
+    IvshmemClientPeer *peer;
+    unsigned i;
+
+    IVSHMEM_CLIENT_DEBUG(client, "close client\n");
+
+    while ((peer = QTAILQ_FIRST(&client->peer_list)) != NULL) {
+        ivshmem_client_free_peer(client, peer);
+    }
+
+    close(client->shm_fd);
+    client->shm_fd = -1;
+    close(client->sock_fd);
+    client->sock_fd = -1;
+    client->local.id = -1;
+    for (i = 0; i < IVSHMEM_CLIENT_MAX_VECTORS; i++) {
+        close(client->local.vectors[i]);
+        client->local.vectors[i] = -1;
+    }
+    client->local.vectors_count = 0;
+}
+
+/* get the fd_set according to the unix socket and peer list */
+void
+ivshmem_client_get_fds(const IvshmemClient *client, fd_set *fds, int *maxfd)
+{
+    int fd;
+    unsigned vector;
+
+    FD_SET(client->sock_fd, fds);
+    if (client->sock_fd >= *maxfd) {
+        *maxfd = client->sock_fd + 1;
+    }
+
+    for (vector = 0; vector < client->local.vectors_count; vector++) {
+        fd = client->local.vectors[vector];
+        FD_SET(fd, fds);
+        if (fd >= *maxfd) {
+            *maxfd = fd + 1;
+        }
+    }
+}
+
+/* handle events from eventfd: just print a message on notification */
+static int
+ivshmem_client_handle_event(IvshmemClient *client, const fd_set *cur, int maxfd)
+{
+    IvshmemClientPeer *peer;
+    uint64_t kick;
+    unsigned i;
+    int ret;
+
+    peer = &client->local;
+
+    for (i = 0; i < peer->vectors_count; i++) {
+        if (peer->vectors[i] >= maxfd || !FD_ISSET(peer->vectors[i], cur)) {
+            continue;
+        }
+
+        ret = read(peer->vectors[i], &kick, sizeof(kick));
+        if (ret < 0) {
+            return ret;
+        }
+        if (ret != sizeof(kick)) {
+            IVSHMEM_CLIENT_DEBUG(client, "invalid read size = %d\n", ret);
+            errno = EINVAL;
+            return -1;
+        }
+        IVSHMEM_CLIENT_DEBUG(client, "received event on fd %d vector %d: %"
+                             PRIu64 "\n", peer->vectors[i], i, kick);
+        if (client->notif_cb != NULL) {
+            client->notif_cb(client, peer, i, client->notif_arg);
+        }
+    }
+
+    return 0;
+}
+
+/* read and handle new messages on the given fd_set */
+int
+ivshmem_client_handle_fds(IvshmemClient *client, fd_set *fds, int maxfd)
+{
+    if (client->sock_fd < maxfd && FD_ISSET(client->sock_fd, fds) &&
+        ivshmem_client_handle_server_msg(client) < 0 && errno != EINTR) {
+        IVSHMEM_CLIENT_DEBUG(client, "ivshmem_client_handle_server_msg() "
+                             "failed\n");
+        return -1;
+    } else if (ivshmem_client_handle_event(client, fds, maxfd) < 0 &&
+               errno != EINTR) {
+        IVSHMEM_CLIENT_DEBUG(client, "ivshmem_client_handle_event() failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* send a notification on a vector of a peer */
+int
+ivshmem_client_notify(const IvshmemClient *client,
+                      const IvshmemClientPeer *peer, unsigned vector)
+{
+    uint64_t kick;
+    int fd;
+
+    if (vector >= peer->vectors_count) {
+        IVSHMEM_CLIENT_DEBUG(client, "invalid vector %u on peer %" PRId64 "\n",
+                             vector, peer->id);
+        return -1;
+    }
+    fd = peer->vectors[vector];
+    IVSHMEM_CLIENT_DEBUG(client, "notify peer %" PRId64
+                         " on vector %d, fd %d\n", peer->id, vector, fd);
+
+    kick = 1;
+    if (write(fd, &kick, sizeof(kick)) != sizeof(kick)) {
+        fprintf(stderr, "could not write to %d: %s\n", peer->vectors[vector],
+                strerror(errno));
+        return -1;
+    }
+    return 0;
+}
+
+/* send a notification to all vectors of a peer */
+int
+ivshmem_client_notify_all_vects(const IvshmemClient *client,
+                                const IvshmemClientPeer *peer)
+{
+    unsigned vector;
+    int ret = 0;
+
+    for (vector = 0; vector < peer->vectors_count; vector++) {
+        if (ivshmem_client_notify(client, peer, vector) < 0) {
+            ret = -1;
+        }
+    }
+
+    return ret;
+}
+
+/* send a notification to all peers */
+int
+ivshmem_client_notify_broadcast(const IvshmemClient *client)
+{
+    IvshmemClientPeer *peer;
+    int ret = 0;
+
+    QTAILQ_FOREACH(peer, &client->peer_list, next) {
+        if (ivshmem_client_notify_all_vects(client, peer) < 0) {
+            ret = -1;
+        }
+    }
+
+    return ret;
+}
+
+/* lookup peer from its id */
+IvshmemClientPeer *
+ivshmem_client_search_peer(IvshmemClient *client, int64_t peer_id)
+{
+    IvshmemClientPeer *peer;
+
+    if (peer_id == client->local.id) {
+        return &client->local;
+    }
+
+    QTAILQ_FOREACH(peer, &client->peer_list, next) {
+        if (peer->id == peer_id) {
+            return peer;
+        }
+    }
+    return NULL;
+}
+
+/* dump our info, the list of peers their vectors on stdout */
+void
+ivshmem_client_dump(const IvshmemClient *client)
+{
+    const IvshmemClientPeer *peer;
+    unsigned vector;
+
+    /* dump local infos */
+    peer = &client->local;
+    printf("our_id = %" PRId64 "\n", peer->id);
+    for (vector = 0; vector < peer->vectors_count; vector++) {
+        printf("  vector %d is enabled (fd=%d)\n", vector,
+               peer->vectors[vector]);
+    }
+
+    /* dump peers */
+    QTAILQ_FOREACH(peer, &client->peer_list, next) {
+        printf("peer_id = %" PRId64 "\n", peer->id);
+
+        for (vector = 0; vector < peer->vectors_count; vector++) {
+            printf("  vector %d is enabled (fd=%d)\n", vector,
+                   peer->vectors[vector]);
+        }
+    }
+}
--- a/contrib/ivshmem-client/ivshmem-client.h
+++ b/contrib/ivshmem-client/ivshmem-client.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#ifndef _IVSHMEM_CLIENT_H_
+#define _IVSHMEM_CLIENT_H_
+
+/**
+ * This file provides helper to implement an ivshmem client. It is used
+ * on the host to ask QEMU to send an interrupt to an ivshmem PCI device in a
+ * guest. QEMU also implements an ivshmem client similar to this one, they both
+ * connect to an ivshmem server.
+ *
+ * A standalone ivshmem client based on this file is provided for debug/test
+ * purposes.
+ */
+
+#include <limits.h>
+#include <sys/select.h>
+
+#include "qemu/queue.h"
+#include "hw/misc/ivshmem.h"
+
+/**
+ * Maximum number of notification vectors supported by the client
+ */
+#define IVSHMEM_CLIENT_MAX_VECTORS 64
+
+/**
+ * Structure storing a peer
+ *
+ * Each time a client connects to an ivshmem server, it is advertised to
+ * all connected clients through the unix socket. When our ivshmem
+ * client receives a notification, it creates a IvshmemClientPeer
+ * structure to store the infos of this peer.
+ *
+ * This structure is also used to store the information of our own
+ * client in (IvshmemClient)->local.
+ */
+typedef struct IvshmemClientPeer {
+    QTAILQ_ENTRY(IvshmemClientPeer) next;    /**< next in list*/
+    int64_t id;                              /**< the id of the peer */
+    int vectors[IVSHMEM_CLIENT_MAX_VECTORS]; /**< one fd per vector */
+    unsigned vectors_count;                  /**< number of vectors */
+} IvshmemClientPeer;
+QTAILQ_HEAD(IvshmemClientPeerList, IvshmemClientPeer);
+
+typedef struct IvshmemClientPeerList IvshmemClientPeerList;
+typedef struct IvshmemClient IvshmemClient;
+
+/**
+ * Typedef of callback function used when our IvshmemClient receives a
+ * notification from a peer.
+ */
+typedef void (*IvshmemClientNotifCb)(
+    const IvshmemClient *client,
+    const IvshmemClientPeer *peer,
+    unsigned vect, void *arg);
+
+/**
+ * Structure describing an ivshmem client
+ *
+ * This structure stores all information related to our client: the name
+ * of the server unix socket, the list of peers advertised by the
+ * server, our own client information, and a pointer the notification
+ * callback function used when we receive a notification from a peer.
+ */
+struct IvshmemClient {
+    char unix_sock_path[PATH_MAX];      /**< path to unix sock */
+    int sock_fd;                        /**< unix sock filedesc */
+    int shm_fd;                         /**< shm file descriptor */
+
+    IvshmemClientPeerList peer_list;    /**< list of peers */
+    IvshmemClientPeer local;            /**< our own infos */
+
+    IvshmemClientNotifCb notif_cb;      /**< notification callback */
+    void *notif_arg;                    /**< notification argument */
+
+    bool verbose;                       /**< true to enable debug */
+};
+
+/**
+ * Initialize an ivshmem client
+ *
+ * @client:         A pointer to an uninitialized IvshmemClient structure
+ * @unix_sock_path: The pointer to the unix socket file name
+ * @notif_cb:       If not NULL, the pointer to the function to be called when
+ *                  our IvshmemClient receives a notification from a peer
+ * @notif_arg:      Opaque pointer given as-is to the notification callback
+ *                  function
+ * @verbose:        True to enable debug
+ *
+ * Returns:         0 on success, or a negative value on error
+ */
+int ivshmem_client_init(IvshmemClient *client, const char *unix_sock_path,
+                        IvshmemClientNotifCb notif_cb, void *notif_arg,
+                        bool verbose);
+
+/**
+ * Connect to the server
+ *
+ * Connect to the server unix socket, and read the first initial
+ * messages sent by the server, giving the ID of the client and the file
+ * descriptor of the shared memory.
+ *
+ * @client: The ivshmem client
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_client_connect(IvshmemClient *client);
+
+/**
+ * Close connection to the server and free all peer structures
+ *
+ * @client: The ivshmem client
+ */
+void ivshmem_client_close(IvshmemClient *client);
+
+/**
+ * Fill a fd_set with file descriptors to be monitored
+ *
+ * This function will fill a fd_set with all file descriptors
+ * that must be polled (unix server socket and peers eventfd). The
+ * function will not initialize the fd_set, it is up to the caller
+ * to do this.
+ *
+ * @client: The ivshmem client
+ * @fds:    The fd_set to be updated
+ * @maxfd:  Must be set to the max file descriptor + 1 in fd_set. This value is
+ *          updated if this function adds a greater fd in fd_set.
+ */
+void ivshmem_client_get_fds(const IvshmemClient *client, fd_set *fds,
+                            int *maxfd);
+
+/**
+ * Read and handle new messages
+ *
+ * Given a fd_set filled by select(), handle incoming messages from
+ * server or peers.
+ *
+ * @client: The ivshmem client
+ * @fds:    The fd_set containing the file descriptors to be checked. Note
+ *          that file descriptors that are not related to our client are
+ *          ignored.
+ * @maxfd:  The maximum fd in fd_set, plus one.
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_client_handle_fds(IvshmemClient *client, fd_set *fds, int maxfd);
+
+/**
+ * Send a notification to a vector of a peer
+ *
+ * @client: The ivshmem client
+ * @peer:   The peer to be notified
+ * @vector: The number of the vector
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_client_notify(const IvshmemClient *client,
+                          const IvshmemClientPeer *peer, unsigned vector);
+
+/**
+ * Send a notification to all vectors of a peer
+ *
+ * @client: The ivshmem client
+ * @peer:   The peer to be notified
+ *
+ * Returns: 0 on success, or a negative value on error (at least one
+ *          notification failed)
+ */
+int ivshmem_client_notify_all_vects(const IvshmemClient *client,
+                                    const IvshmemClientPeer *peer);
+
+/**
+ * Broadcat a notification to all vectors of all peers
+ *
+ * @client: The ivshmem client
+ *
+ * Returns: 0 on success, or a negative value on error (at least one
+ *          notification failed)
+ */
+int ivshmem_client_notify_broadcast(const IvshmemClient *client);
+
+/**
+ * Search a peer from its identifier
+ *
+ * Return the peer structure from its peer_id. If the given peer_id is
+ * the local id, the function returns the local peer structure.
+ *
+ * @client:  The ivshmem client
+ * @peer_id: The identifier of the peer structure
+ *
+ * Returns:  The peer structure, or NULL if not found
+ */
+IvshmemClientPeer *
+ivshmem_client_search_peer(IvshmemClient *client, int64_t peer_id);
+
+/**
+ * Dump information of this ivshmem client on stdout
+ *
+ * Dump the id and the vectors of the given ivshmem client and the list
+ * of its peers and their vectors on stdout.
+ *
+ * @client: The ivshmem client
+ */
+void ivshmem_client_dump(const IvshmemClient *client);
+
+#endif /* _IVSHMEM_CLIENT_H_ */
--- a/contrib/ivshmem-client/main.c
+++ b/contrib/ivshmem-client/main.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu-common.h"
+
+#include "ivshmem-client.h"
+
+#define IVSHMEM_CLIENT_DEFAULT_VERBOSE        0
+#define IVSHMEM_CLIENT_DEFAULT_UNIX_SOCK_PATH "/tmp/ivshmem_socket"
+
+typedef struct IvshmemClientArgs {
+    bool verbose;
+    const char *unix_sock_path;
+} IvshmemClientArgs;
+
+/* show ivshmem_client_usage and exit with given error code */
+static void
+ivshmem_client_usage(const char *name, int code)
+{
+    fprintf(stderr, "%s [opts]\n", name);
+    fprintf(stderr, "  -h: show this help\n");
+    fprintf(stderr, "  -v: verbose mode\n");
+    fprintf(stderr, "  -S <unix_sock_path>: path to the unix socket\n"
+                    "     to connect to.\n"
+                    "     default=%s\n", IVSHMEM_CLIENT_DEFAULT_UNIX_SOCK_PATH);
+    exit(code);
+}
+
+/* parse the program arguments, exit on error */
+static void
+ivshmem_client_parse_args(IvshmemClientArgs *args, int argc, char *argv[])
+{
+    int c;
+
+    while ((c = getopt(argc, argv,
+                       "h"  /* help */
+                       "v"  /* verbose */
+                       "S:" /* unix_sock_path */
+                      )) != -1) {
+
+        switch (c) {
+        case 'h': /* help */
+            ivshmem_client_usage(argv[0], 0);
+            break;
+
+        case 'v': /* verbose */
+            args->verbose = 1;
+            break;
+
+        case 'S': /* unix_sock_path */
+            args->unix_sock_path = optarg;
+            break;
+
+        default:
+            ivshmem_client_usage(argv[0], 1);
+            break;
+        }
+    }
+}
+
+/* show command line help */
+static void
+ivshmem_client_cmdline_help(void)
+{
+    printf("dump: dump peers (including us)\n"
+           "int <peer> <vector>: notify one vector on a peer\n"
+           "int <peer> all: notify all vectors of a peer\n"
+           "int all: notify all vectors of all peers (excepting us)\n");
+}
+
+/* read stdin and handle commands */
+static int
+ivshmem_client_handle_stdin_command(IvshmemClient *client)
+{
+    IvshmemClientPeer *peer;
+    char buf[128];
+    char *s, *token;
+    int ret;
+    int peer_id, vector;
+
+    memset(buf, 0, sizeof(buf));
+    ret = read(0, buf, sizeof(buf) - 1);
+    if (ret < 0) {
+        return -1;
+    }
+
+    s = buf;
+    while ((token = strsep(&s, "\n\r;")) != NULL) {
+        if (!strcmp(token, "")) {
+            continue;
+        }
+        if (!strcmp(token, "?")) {
+            ivshmem_client_cmdline_help();
+        }
+        if (!strcmp(token, "help")) {
+            ivshmem_client_cmdline_help();
+        } else if (!strcmp(token, "dump")) {
+            ivshmem_client_dump(client);
+        } else if (!strcmp(token, "int all")) {
+            ivshmem_client_notify_broadcast(client);
+        } else if (sscanf(token, "int %d %d", &peer_id, &vector) == 2) {
+            peer = ivshmem_client_search_peer(client, peer_id);
+            if (peer == NULL) {
+                printf("cannot find peer_id = %d\n", peer_id);
+                continue;
+            }
+            ivshmem_client_notify(client, peer, vector);
+        } else if (sscanf(token, "int %d all", &peer_id) == 1) {
+            peer = ivshmem_client_search_peer(client, peer_id);
+            if (peer == NULL) {
+                printf("cannot find peer_id = %d\n", peer_id);
+                continue;
+            }
+            ivshmem_client_notify_all_vects(client, peer);
+        } else {
+            printf("invalid command, type help\n");
+        }
+    }
+
+    printf("cmd> ");
+    fflush(stdout);
+    return 0;
+}
+
+/* listen on stdin (command line), on unix socket (notifications of new
+ * and dead peers), and on eventfd (IRQ request) */
+static int
+ivshmem_client_poll_events(IvshmemClient *client)
+{
+    fd_set fds;
+    int ret, maxfd;
+
+    while (1) {
+
+        FD_ZERO(&fds);
+        FD_SET(0, &fds); /* add stdin in fd_set */
+        maxfd = 1;
+
+        ivshmem_client_get_fds(client, &fds, &maxfd);
+
+        ret = select(maxfd, &fds, NULL, NULL, NULL);
+        if (ret < 0) {
+            if (errno == EINTR) {
+                continue;
+            }
+
+            fprintf(stderr, "select error: %s\n", strerror(errno));
+            break;
+        }
+        if (ret == 0) {
+            continue;
+        }
+
+        if (FD_ISSET(0, &fds) &&
+            ivshmem_client_handle_stdin_command(client) < 0 && errno != EINTR) {
+            fprintf(stderr, "ivshmem_client_handle_stdin_command() failed\n");
+            break;
+        }
+
+        if (ivshmem_client_handle_fds(client, &fds, maxfd) < 0) {
+            fprintf(stderr, "ivshmem_client_handle_fds() failed\n");
+            break;
+        }
+    }
+
+    return ret;
+}
+
+/* callback when we receive a notification (just display it) */
+static void
+ivshmem_client_notification_cb(const IvshmemClient *client,
+                               const IvshmemClientPeer *peer,
+                               unsigned vect, void *arg)
+{
+    (void)client;
+    (void)arg;
+    printf("receive notification from peer_id=%" PRId64 " vector=%u\n",
+           peer->id, vect);
+}
+
+int
+main(int argc, char *argv[])
+{
+    struct sigaction sa;
+    IvshmemClient client;
+    IvshmemClientArgs args = {
+        .verbose = IVSHMEM_CLIENT_DEFAULT_VERBOSE,
+        .unix_sock_path = IVSHMEM_CLIENT_DEFAULT_UNIX_SOCK_PATH,
+    };
+
+    /* parse arguments, will exit on error */
+    ivshmem_client_parse_args(&args, argc, argv);
+
+    /* Ignore SIGPIPE, see this link for more info:
+     * http://www.mail-archive.com/libevent-users@monkey.org/msg01606.html */
+    sa.sa_handler = SIG_IGN;
+    sa.sa_flags = 0;
+    if (sigemptyset(&sa.sa_mask) == -1 ||
+        sigaction(SIGPIPE, &sa, 0) == -1) {
+        perror("failed to ignore SIGPIPE; sigaction");
+        return 1;
+    }
+
+    ivshmem_client_cmdline_help();
+    printf("cmd> ");
+    fflush(stdout);
+
+    if (ivshmem_client_init(&client, args.unix_sock_path,
+                            ivshmem_client_notification_cb, NULL,
+                            args.verbose) < 0) {
+        fprintf(stderr, "cannot init client\n");
+        return 1;
+    }
+
+    while (1) {
+        if (ivshmem_client_connect(&client) < 0) {
+            fprintf(stderr, "cannot connect to server, retry in 1 second\n");
+            sleep(1);
+            continue;
+        }
+
+        fprintf(stdout, "listen on server socket %d\n", client.sock_fd);
+
+        if (ivshmem_client_poll_events(&client) == 0) {
+            continue;
+        }
+
+        /* disconnected from server, reset all peers */
+        fprintf(stdout, "disconnected from server\n");
+
+        ivshmem_client_close(&client);
+    }
+
+    return 0;
+}
--- a/contrib/ivshmem-server/Makefile.objs
+++ b/contrib/ivshmem-server/Makefile.objs
@@ -0,0 +1 @@
+ivshmem-server-obj-y = ivshmem-server.o main.o
--- a/contrib/ivshmem-server/ivshmem-server.c
+++ b/contrib/ivshmem-server/ivshmem-server.c
@@ -0,0 +1,493 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+#include "qemu-common.h"
+#include "qemu/sockets.h"
+
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#ifdef CONFIG_LINUX
+#include <sys/vfs.h>
+#endif
+
+#include "ivshmem-server.h"
+
+/* log a message on stdout if verbose=1 */
+#define IVSHMEM_SERVER_DEBUG(server, fmt, ...) do { \
+        if ((server)->verbose) {         \
+            printf(fmt, ## __VA_ARGS__); \
+        }                                \
+    } while (0)
+
+/** maximum size of a huge page, used by ivshmem_server_ftruncate() */
+#define IVSHMEM_SERVER_MAX_HUGEPAGE_SIZE (1024 * 1024 * 1024)
+
+/** default listen backlog (number of sockets not accepted) */
+#define IVSHMEM_SERVER_LISTEN_BACKLOG 10
+
+/* send message to a client unix socket */
+static int
+ivshmem_server_send_one_msg(int sock_fd, int64_t peer_id, int fd)
+{
+    int ret;
+    struct msghdr msg;
+    struct iovec iov[1];
+    union {
+        struct cmsghdr cmsg;
+        char control[CMSG_SPACE(sizeof(int))];
+    } msg_control;
+    struct cmsghdr *cmsg;
+
+    peer_id = GINT64_TO_LE(peer_id);
+    iov[0].iov_base = &peer_id;
+    iov[0].iov_len = sizeof(peer_id);
+
+    memset(&msg, 0, sizeof(msg));
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 1;
+
+    /* if fd is specified, add it in a cmsg */
+    if (fd >= 0) {
+        memset(&msg_control, 0, sizeof(msg_control));
+        msg.msg_control = &msg_control;
+        msg.msg_controllen = sizeof(msg_control);
+        cmsg = CMSG_FIRSTHDR(&msg);
+        cmsg->cmsg_level = SOL_SOCKET;
+        cmsg->cmsg_type = SCM_RIGHTS;
+        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+        memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
+    }
+
+    ret = sendmsg(sock_fd, &msg, 0);
+    if (ret <= 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/* free a peer when the server advertises a disconnection or when the
+ * server is freed */
+static void
+ivshmem_server_free_peer(IvshmemServer *server, IvshmemServerPeer *peer)
+{
+    unsigned vector;
+    IvshmemServerPeer *other_peer;
+
+    IVSHMEM_SERVER_DEBUG(server, "free peer %" PRId64 "\n", peer->id);
+    close(peer->sock_fd);
+    QTAILQ_REMOVE(&server->peer_list, peer, next);
+
+    /* advertise the deletion to other peers */
+    QTAILQ_FOREACH(other_peer, &server->peer_list, next) {
+        ivshmem_server_send_one_msg(other_peer->sock_fd, peer->id, -1);
+    }
+
+    for (vector = 0; vector < peer->vectors_count; vector++) {
+        event_notifier_cleanup(&peer->vectors[vector]);
+    }
+
+    g_free(peer);
+}
+
+/* send the peer id and the shm_fd just after a new client connection */
+static int
+ivshmem_server_send_initial_info(IvshmemServer *server, IvshmemServerPeer *peer)
+{
+    int ret;
+
+    /* send our protocol version first */
+    ret = ivshmem_server_send_one_msg(peer->sock_fd, IVSHMEM_PROTOCOL_VERSION,
+                                      -1);
+    if (ret < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot send version: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+
+    /* send the peer id to the client */
+    ret = ivshmem_server_send_one_msg(peer->sock_fd, peer->id, -1);
+    if (ret < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot send peer id: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+
+    /* send the shm_fd */
+    ret = ivshmem_server_send_one_msg(peer->sock_fd, -1, server->shm_fd);
+    if (ret < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot send shm fd: %s\n",
+                             strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/* handle message on listening unix socket (new client connection) */
+static int
+ivshmem_server_handle_new_conn(IvshmemServer *server)
+{
+    IvshmemServerPeer *peer, *other_peer;
+    struct sockaddr_un unaddr;
+    socklen_t unaddr_len;
+    int newfd;
+    unsigned i;
+
+    /* accept the incoming connection */
+    unaddr_len = sizeof(unaddr);
+    newfd = qemu_accept(server->sock_fd,
+                        (struct sockaddr *)&unaddr, &unaddr_len);
+
+    if (newfd < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot accept() %s\n", strerror(errno));
+        return -1;
+    }
+
+    qemu_set_nonblock(newfd);
+    IVSHMEM_SERVER_DEBUG(server, "accept()=%d\n", newfd);
+
+    /* allocate new structure for this peer */
+    peer = g_malloc0(sizeof(*peer));
+    peer->sock_fd = newfd;
+
+    /* get an unused peer id */
+    /* XXX: this could use id allocation such as Linux IDA, or simply
+     * a free-list */
+    for (i = 0; i < G_MAXUINT16; i++) {
+        if (ivshmem_server_search_peer(server, server->cur_id) == NULL) {
+            break;
+        }
+        server->cur_id++;
+    }
+    if (i == G_MAXUINT16) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot allocate new client id\n");
+        close(newfd);
+        g_free(peer);
+        return -1;
+    }
+    peer->id = server->cur_id++;
+
+    /* create eventfd, one per vector */
+    peer->vectors_count = server->n_vectors;
+    for (i = 0; i < peer->vectors_count; i++) {
+        if (event_notifier_init(&peer->vectors[i], FALSE) < 0) {
+            IVSHMEM_SERVER_DEBUG(server, "cannot create eventfd\n");
+            goto fail;
+        }
+    }
+
+    /* send peer id and shm fd */
+    if (ivshmem_server_send_initial_info(server, peer) < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot send initial info\n");
+        goto fail;
+    }
+
+    /* advertise the new peer to others */
+    QTAILQ_FOREACH(other_peer, &server->peer_list, next) {
+        for (i = 0; i < peer->vectors_count; i++) {
+            ivshmem_server_send_one_msg(other_peer->sock_fd, peer->id,
+                                        peer->vectors[i].wfd);
+        }
+    }
+
+    /* advertise the other peers to the new one */
+    QTAILQ_FOREACH(other_peer, &server->peer_list, next) {
+        for (i = 0; i < peer->vectors_count; i++) {
+            ivshmem_server_send_one_msg(peer->sock_fd, other_peer->id,
+                                        other_peer->vectors[i].wfd);
+        }
+    }
+
+    /* advertise the new peer to itself */
+    for (i = 0; i < peer->vectors_count; i++) {
+        ivshmem_server_send_one_msg(peer->sock_fd, peer->id,
+                                    event_notifier_get_fd(&peer->vectors[i]));
+    }
+
+    QTAILQ_INSERT_TAIL(&server->peer_list, peer, next);
+    IVSHMEM_SERVER_DEBUG(server, "new peer id = %" PRId64 "\n",
+                         peer->id);
+    return 0;
+
+fail:
+    while (i--) {
+        event_notifier_cleanup(&peer->vectors[i]);
+    }
+    close(newfd);
+    g_free(peer);
+    return -1;
+}
+
+/* Try to ftruncate a file to next power of 2 of shmsize.
+ * If it fails; all power of 2 above shmsize are tested until
+ * we reach the maximum huge page size. This is useful
+ * if the shm file is in a hugetlbfs that cannot be truncated to the
+ * shm_size value. */
+static int
+ivshmem_server_ftruncate(int fd, unsigned shmsize)
+{
+    int ret;
+    struct stat mapstat;
+
+    /* align shmsize to next power of 2 */
+    shmsize = pow2ceil(shmsize);
+
+    if (fstat(fd, &mapstat) != -1 && mapstat.st_size == shmsize) {
+        return 0;
+    }
+
+    while (shmsize <= IVSHMEM_SERVER_MAX_HUGEPAGE_SIZE) {
+        ret = ftruncate(fd, shmsize);
+        if (ret == 0) {
+            return ret;
+        }
+        shmsize *= 2;
+    }
+
+    return -1;
+}
+
+/* Init a new ivshmem server */
+int
+ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
+                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    bool verbose)
+{
+    int ret;
+
+    memset(server, 0, sizeof(*server));
+    server->verbose = verbose;
+
+    ret = snprintf(server->unix_sock_path, sizeof(server->unix_sock_path),
+                   "%s", unix_sock_path);
+    if (ret < 0 || ret >= sizeof(server->unix_sock_path)) {
+        IVSHMEM_SERVER_DEBUG(server, "could not copy unix socket path\n");
+        return -1;
+    }
+    ret = snprintf(server->shm_path, sizeof(server->shm_path),
+                   "%s", shm_path);
+    if (ret < 0 || ret >= sizeof(server->shm_path)) {
+        IVSHMEM_SERVER_DEBUG(server, "could not copy shm path\n");
+        return -1;
+    }
+
+    server->shm_size = shm_size;
+    server->n_vectors = n_vectors;
+
+    QTAILQ_INIT(&server->peer_list);
+
+    return 0;
+}
+
+#ifdef CONFIG_LINUX
+
+#define HUGETLBFS_MAGIC       0x958458f6
+
+static long gethugepagesize(const char *path)
+{
+    struct statfs fs;
+    int ret;
+
+    do {
+        ret = statfs(path, &fs);
+    } while (ret != 0 && errno == EINTR);
+
+    if (ret != 0) {
+        return -1;
+    }
+
+    if (fs.f_type != HUGETLBFS_MAGIC) {
+        return -1;
+    }
+
+    return fs.f_bsize;
+}
+#endif
+
+/* open shm, create and bind to the unix socket */
+int
+ivshmem_server_start(IvshmemServer *server)
+{
+    struct sockaddr_un sun;
+    int shm_fd, sock_fd, ret;
+
+    /* open shm file */
+#ifdef CONFIG_LINUX
+    long hpagesize;
+
+    hpagesize = gethugepagesize(server->shm_path);
+    if (hpagesize < 0 && errno != ENOENT) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot stat shm file %s: %s\n",
+                             server->shm_path, strerror(errno));
+    }
+
+    if (hpagesize > 0) {
+        gchar *filename = g_strdup_printf("%s/ivshmem.XXXXXX", server->shm_path);
+        IVSHMEM_SERVER_DEBUG(server, "Using hugepages: %s\n", server->shm_path);
+        shm_fd = mkstemp(filename);
+        unlink(filename);
+        g_free(filename);
+    } else
+#endif
+    {
+        IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n",
+                             server->shm_path);
+        shm_fd = shm_open(server->shm_path, O_CREAT|O_RDWR, S_IRWXU);
+    }
+
+    if (shm_fd < 0) {
+        fprintf(stderr, "cannot open shm file %s: %s\n", server->shm_path,
+                strerror(errno));
+        return -1;
+    }
+    if (ivshmem_server_ftruncate(shm_fd, server->shm_size) < 0) {
+        fprintf(stderr, "ftruncate(%s) failed: %s\n", server->shm_path,
+                strerror(errno));
+        goto err_close_shm;
+    }
+
+    IVSHMEM_SERVER_DEBUG(server, "create & bind socket %s\n",
+                         server->unix_sock_path);
+
+    /* create the unix listening socket */
+    sock_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (sock_fd < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot create socket: %s\n",
+                             strerror(errno));
+        goto err_close_shm;
+    }
+
+    sun.sun_family = AF_UNIX;
+    ret = snprintf(sun.sun_path, sizeof(sun.sun_path), "%s",
+                   server->unix_sock_path);
+    if (ret < 0 || ret >= sizeof(sun.sun_path)) {
+        IVSHMEM_SERVER_DEBUG(server, "could not copy unix socket path\n");
+        goto err_close_sock;
+    }
+    if (bind(sock_fd, (struct sockaddr *)&sun, sizeof(sun)) < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "cannot connect to %s: %s\n", sun.sun_path,
+                             strerror(errno));
+        goto err_close_sock;
+    }
+
+    if (listen(sock_fd, IVSHMEM_SERVER_LISTEN_BACKLOG) < 0) {
+        IVSHMEM_SERVER_DEBUG(server, "listen() failed: %s\n", strerror(errno));
+        goto err_close_sock;
+    }
+
+    server->sock_fd = sock_fd;
+    server->shm_fd = shm_fd;
+
+    return 0;
+
+err_close_sock:
+    close(sock_fd);
+err_close_shm:
+    close(shm_fd);
+    return -1;
+}
+
+/* close connections to clients, the unix socket and the shm fd */
+void
+ivshmem_server_close(IvshmemServer *server)
+{
+    IvshmemServerPeer *peer, *npeer;
+
+    IVSHMEM_SERVER_DEBUG(server, "close server\n");
+
+    QTAILQ_FOREACH_SAFE(peer, &server->peer_list, next, npeer) {
+        ivshmem_server_free_peer(server, peer);
+    }
+
+    unlink(server->unix_sock_path);
+    close(server->sock_fd);
+    close(server->shm_fd);
+    server->sock_fd = -1;
+    server->shm_fd = -1;
+}
+
+/* get the fd_set according to the unix socket and the peer list */
+void
+ivshmem_server_get_fds(const IvshmemServer *server, fd_set *fds, int *maxfd)
+{
+    IvshmemServerPeer *peer;
+
+    if (server->sock_fd == -1) {
+        return;
+    }
+
+    FD_SET(server->sock_fd, fds);
+    if (server->sock_fd >= *maxfd) {
+        *maxfd = server->sock_fd + 1;
+    }
+
+    QTAILQ_FOREACH(peer, &server->peer_list, next) {
+        FD_SET(peer->sock_fd, fds);
+        if (peer->sock_fd >= *maxfd) {
+            *maxfd = peer->sock_fd + 1;
+        }
+    }
+}
+
+/* process incoming messages on the sockets in fd_set */
+int
+ivshmem_server_handle_fds(IvshmemServer *server, fd_set *fds, int maxfd)
+{
+    IvshmemServerPeer *peer, *peer_next;
+
+    if (server->sock_fd < maxfd && FD_ISSET(server->sock_fd, fds) &&
+        ivshmem_server_handle_new_conn(server) < 0 && errno != EINTR) {
+        IVSHMEM_SERVER_DEBUG(server, "ivshmem_server_handle_new_conn() "
+                             "failed\n");
+        return -1;
+    }
+
+    QTAILQ_FOREACH_SAFE(peer, &server->peer_list, next, peer_next) {
+        /* any message from a peer socket result in a close() */
+        IVSHMEM_SERVER_DEBUG(server, "peer->sock_fd=%d\n", peer->sock_fd);
+        if (peer->sock_fd < maxfd && FD_ISSET(peer->sock_fd, fds)) {
+            ivshmem_server_free_peer(server, peer);
+        }
+    }
+
+    return 0;
+}
+
+/* lookup peer from its id */
+IvshmemServerPeer *
+ivshmem_server_search_peer(IvshmemServer *server, int64_t peer_id)
+{
+    IvshmemServerPeer *peer;
+
+    QTAILQ_FOREACH(peer, &server->peer_list, next) {
+        if (peer->id == peer_id) {
+            return peer;
+        }
+    }
+    return NULL;
+}
+
+/* dump our info, the list of peers their vectors on stdout */
+void
+ivshmem_server_dump(const IvshmemServer *server)
+{
+    const IvshmemServerPeer *peer;
+    unsigned vector;
+
+    /* dump peers */
+    QTAILQ_FOREACH(peer, &server->peer_list, next) {
+        printf("peer_id = %" PRId64 "\n", peer->id);
+
+        for (vector = 0; vector < peer->vectors_count; vector++) {
+            printf("  vector %d is enabled (fd=%d)\n", vector,
+                   event_notifier_get_fd(&peer->vectors[vector]));
+        }
+    }
+}
--- a/contrib/ivshmem-server/ivshmem-server.h
+++ b/contrib/ivshmem-server/ivshmem-server.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#ifndef _IVSHMEM_SERVER_H_
+#define _IVSHMEM_SERVER_H_
+
+/**
+ * The ivshmem server is a daemon that creates a unix socket in listen
+ * mode. The ivshmem clients (qemu or ivshmem-client) connect to this
+ * unix socket. For each client, the server will create some eventfd
+ * (see EVENTFD(2)), one per vector. These fd are transmitted to all
+ * clients using the SCM_RIGHTS cmsg message. Therefore, each client is
+ * able to send a notification to another client without beeing
+ * "profixied" by the server.
+ *
+ * We use this mechanism to send interruptions between guests.
+ * qemu is able to transform an event on a eventfd into a PCI MSI-x
+ * interruption in the guest.
+ *
+ * The ivshmem server is also able to share the file descriptor
+ * associated to the ivshmem shared memory.
+ */
+
+#include <limits.h>
+#include <sys/select.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "qemu/event_notifier.h"
+#include "qemu/queue.h"
+#include "hw/misc/ivshmem.h"
+
+/**
+ * Maximum number of notification vectors supported by the server
+ */
+#define IVSHMEM_SERVER_MAX_VECTORS 64
+
+/**
+ * Structure storing a peer
+ *
+ * Each time a client connects to an ivshmem server, a new
+ * IvshmemServerPeer structure is created. This peer and all its
+ * vectors are advertised to all connected clients through the connected
+ * unix sockets.
+ */
+typedef struct IvshmemServerPeer {
+    QTAILQ_ENTRY(IvshmemServerPeer) next;    /**< next in list*/
+    int sock_fd;                             /**< connected unix sock */
+    int64_t id;                              /**< the id of the peer */
+    EventNotifier vectors[IVSHMEM_SERVER_MAX_VECTORS]; /**< one per vector */
+    unsigned vectors_count;                  /**< number of vectors */
+} IvshmemServerPeer;
+QTAILQ_HEAD(IvshmemServerPeerList, IvshmemServerPeer);
+
+typedef struct IvshmemServerPeerList IvshmemServerPeerList;
+
+/**
+ * Structure describing an ivshmem server
+ *
+ * This structure stores all information related to our server: the name
+ * of the server unix socket and the list of connected peers.
+ */
+typedef struct IvshmemServer {
+    char unix_sock_path[PATH_MAX];   /**< path to unix socket */
+    int sock_fd;                     /**< unix sock file descriptor */
+    char shm_path[PATH_MAX];         /**< path to shm */
+    size_t shm_size;                 /**< size of shm */
+    int shm_fd;                      /**< shm file descriptor */
+    unsigned n_vectors;              /**< number of vectors */
+    uint16_t cur_id;                 /**< id to be given to next client */
+    bool verbose;                    /**< true in verbose mode */
+    IvshmemServerPeerList peer_list; /**< list of peers */
+} IvshmemServer;
+
+/**
+ * Initialize an ivshmem server
+ *
+ * @server:         A pointer to an uninitialized IvshmemServer structure
+ * @unix_sock_path: The pointer to the unix socket file name
+ * @shm_path:       Path to the shared memory. The path corresponds to a POSIX
+ *                  shm name or a hugetlbfs mount point.
+ * @shm_size:       Size of shared memory
+ * @n_vectors:      Number of interrupt vectors per client
+ * @verbose:        True to enable verbose mode
+ *
+ * Returns:         0 on success, or a negative value on error
+ */
+int
+ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
+                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    bool verbose);
+
+/**
+ * Open the shm, then create and bind to the unix socket
+ *
+ * @server: The pointer to the initialized IvshmemServer structure
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_server_start(IvshmemServer *server);
+
+/**
+ * Close the server
+ *
+ * Close connections to all clients, close the unix socket and the
+ * shared memory file descriptor. The structure remains initialized, so
+ * it is possible to call ivshmem_server_start() again after a call to
+ * ivshmem_server_close().
+ *
+ * @server: The ivshmem server
+ */
+void ivshmem_server_close(IvshmemServer *server);
+
+/**
+ * Fill a fd_set with file descriptors to be monitored
+ *
+ * This function will fill a fd_set with all file descriptors that must
+ * be polled (unix server socket and peers unix socket). The function
+ * will not initialize the fd_set, it is up to the caller to do it.
+ *
+ * @server: The ivshmem server
+ * @fds:    The fd_set to be updated
+ * @maxfd:  Must be set to the max file descriptor + 1 in fd_set. This value is
+ *          updated if this function adds a greater fd in fd_set.
+ */
+void
+ivshmem_server_get_fds(const IvshmemServer *server, fd_set *fds, int *maxfd);
+
+/**
+ * Read and handle new messages
+ *
+ * Given a fd_set (for instance filled by a call to select()), handle
+ * incoming messages from peers.
+ *
+ * @server: The ivshmem server
+ * @fds:    The fd_set containing the file descriptors to be checked. Note that
+ *          file descriptors that are not related to our server are ignored.
+ * @maxfd:  The maximum fd in fd_set, plus one.
+ *
+ * Returns: 0 on success, or a negative value on error
+ */
+int ivshmem_server_handle_fds(IvshmemServer *server, fd_set *fds, int maxfd);
+
+/**
+ * Search a peer from its identifier
+ *
+ * @server:  The ivshmem server
+ * @peer_id: The identifier of the peer structure
+ *
+ * Returns:  The peer structure, or NULL if not found
+ */
+IvshmemServerPeer *
+ivshmem_server_search_peer(IvshmemServer *server, int64_t peer_id);
+
+/**
+ * Dump information of this ivshmem server and its peers on stdout
+ *
+ * @server: The ivshmem server
+ */
+void ivshmem_server_dump(const IvshmemServer *server);
+
+#endif /* _IVSHMEM_SERVER_H_ */
--- a/contrib/ivshmem-server/main.c
+++ b/contrib/ivshmem-server/main.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu-common.h"
+
+#include "ivshmem-server.h"
+
+#define IVSHMEM_SERVER_DEFAULT_VERBOSE        0
+#define IVSHMEM_SERVER_DEFAULT_FOREGROUND     0
+#define IVSHMEM_SERVER_DEFAULT_PID_FILE       "/var/run/ivshmem-server.pid"
+#define IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH "/tmp/ivshmem_socket"
+#define IVSHMEM_SERVER_DEFAULT_SHM_PATH       "ivshmem"
+#define IVSHMEM_SERVER_DEFAULT_SHM_SIZE       (4*1024*1024)
+#define IVSHMEM_SERVER_DEFAULT_N_VECTORS      1
+
+/* used to quit on signal SIGTERM */
+static int ivshmem_server_quit;
+
+/* arguments given by the user */
+typedef struct IvshmemServerArgs {
+    bool verbose;
+    bool foreground;
+    const char *pid_file;
+    const char *unix_socket_path;
+    const char *shm_path;
+    uint64_t shm_size;
+    unsigned n_vectors;
+} IvshmemServerArgs;
+
+/* show ivshmem_server_usage and exit with given error code */
+static void
+ivshmem_server_usage(const char *name, int code)
+{
+    fprintf(stderr, "%s [opts]\n", name);
+    fprintf(stderr, "  -h: show this help\n");
+    fprintf(stderr, "  -v: verbose mode\n");
+    fprintf(stderr, "  -F: foreground mode (default is to daemonize)\n");
+    fprintf(stderr, "  -p <pid_file>: path to the PID file (used in daemon\n"
+                    "     mode only).\n"
+                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
+    fprintf(stderr, "  -S <unix_socket_path>: path to the unix socket\n"
+                    "     to listen to.\n"
+                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH);
+    fprintf(stderr, "  -m <shm_path>: path to the shared memory.\n"
+                    "     The path corresponds to a POSIX shm name or a\n"
+                    "     hugetlbfs mount point.\n"
+                    "     default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
+    fprintf(stderr, "  -l <size>: size of shared memory in bytes. The suffix\n"
+                    "     K, M and G can be used (ex: 1K means 1024).\n"
+                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_SHM_SIZE);
+    fprintf(stderr, "  -n <n_vects>: number of vectors.\n"
+                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_N_VECTORS);
+
+    exit(code);
+}
+
+/* parse the program arguments, exit on error */
+static void
+ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
+{
+    int c;
+    unsigned long long v;
+    Error *errp = NULL;
+
+    while ((c = getopt(argc, argv,
+                       "h"  /* help */
+                       "v"  /* verbose */
+                       "F"  /* foreground */
+                       "p:" /* pid_file */
+                       "S:" /* unix_socket_path */
+                       "m:" /* shm_path */
+                       "l:" /* shm_size */
+                       "n:" /* n_vectors */
+                      )) != -1) {
+
+        switch (c) {
+        case 'h': /* help */
+            ivshmem_server_usage(argv[0], 0);
+            break;
+
+        case 'v': /* verbose */
+            args->verbose = 1;
+            break;
+
+        case 'F': /* foreground */
+            args->foreground = 1;
+            break;
+
+        case 'p': /* pid_file */
+            args->pid_file = optarg;
+            break;
+
+        case 'S': /* unix_socket_path */
+            args->unix_socket_path = optarg;
+            break;
+
+        case 'm': /* shm_path */
+            args->shm_path = optarg;
+            break;
+
+        case 'l': /* shm_size */
+            parse_option_size("shm_size", optarg, &args->shm_size, &errp);
+            if (errp) {
+                fprintf(stderr, "cannot parse shm size: %s\n",
+                        error_get_pretty(errp));
+                error_free(errp);
+                ivshmem_server_usage(argv[0], 1);
+            }
+            break;
+
+        case 'n': /* n_vectors */
+            if (parse_uint_full(optarg, &v, 0) < 0) {
+                fprintf(stderr, "cannot parse n_vectors\n");
+                ivshmem_server_usage(argv[0], 1);
+            }
+            args->n_vectors = v;
+            break;
+
+        default:
+            ivshmem_server_usage(argv[0], 1);
+            break;
+        }
+    }
+
+    if (args->n_vectors > IVSHMEM_SERVER_MAX_VECTORS) {
+        fprintf(stderr, "too many requested vectors (max is %d)\n",
+                IVSHMEM_SERVER_MAX_VECTORS);
+        ivshmem_server_usage(argv[0], 1);
+    }
+
+    if (args->verbose == 1 && args->foreground == 0) {
+        fprintf(stderr, "cannot use verbose in daemon mode\n");
+        ivshmem_server_usage(argv[0], 1);
+    }
+}
+
+/* wait for events on listening server unix socket and connected client
+ * sockets */
+static int
+ivshmem_server_poll_events(IvshmemServer *server)
+{
+    fd_set fds;
+    int ret = 0, maxfd;
+
+    while (!ivshmem_server_quit) {
+
+        FD_ZERO(&fds);
+        maxfd = 0;
+        ivshmem_server_get_fds(server, &fds, &maxfd);
+
+        ret = select(maxfd, &fds, NULL, NULL, NULL);
+
+        if (ret < 0) {
+            if (errno == EINTR) {
+                continue;
+            }
+
+            fprintf(stderr, "select error: %s\n", strerror(errno));
+            break;
+        }
+        if (ret == 0) {
+            continue;
+        }
+
+        if (ivshmem_server_handle_fds(server, &fds, maxfd) < 0) {
+            fprintf(stderr, "ivshmem_server_handle_fds() failed\n");
+            break;
+        }
+    }
+
+    return ret;
+}
+
+static void
+ivshmem_server_quit_cb(int signum)
+{
+    ivshmem_server_quit = 1;
+}
+
+int
+main(int argc, char *argv[])
+{
+    IvshmemServer server;
+    struct sigaction sa, sa_quit;
+    IvshmemServerArgs args = {
+        .verbose = IVSHMEM_SERVER_DEFAULT_VERBOSE,
+        .foreground = IVSHMEM_SERVER_DEFAULT_FOREGROUND,
+        .pid_file = IVSHMEM_SERVER_DEFAULT_PID_FILE,
+        .unix_socket_path = IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH,
+        .shm_path = IVSHMEM_SERVER_DEFAULT_SHM_PATH,
+        .shm_size = IVSHMEM_SERVER_DEFAULT_SHM_SIZE,
+        .n_vectors = IVSHMEM_SERVER_DEFAULT_N_VECTORS,
+    };
+    int ret = 1;
+
+    /* parse arguments, will exit on error */
+    ivshmem_server_parse_args(&args, argc, argv);
+
+    /* Ignore SIGPIPE, see this link for more info:
+     * http://www.mail-archive.com/libevent-users@monkey.org/msg01606.html */
+    sa.sa_handler = SIG_IGN;
+    sa.sa_flags = 0;
+    if (sigemptyset(&sa.sa_mask) == -1 ||
+        sigaction(SIGPIPE, &sa, 0) == -1) {
+        perror("failed to ignore SIGPIPE; sigaction");
+        goto err;
+    }
+
+    sa_quit.sa_handler = ivshmem_server_quit_cb;
+    sa_quit.sa_flags = 0;
+    if (sigemptyset(&sa_quit.sa_mask) == -1 ||
+        sigaction(SIGTERM, &sa_quit, 0) == -1) {
+        perror("failed to add SIGTERM handler; sigaction");
+        goto err;
+    }
+
+    /* init the ivshms structure */
+    if (ivshmem_server_init(&server, args.unix_socket_path, args.shm_path,
+                            args.shm_size, args.n_vectors, args.verbose) < 0) {
+        fprintf(stderr, "cannot init server\n");
+        goto err;
+    }
+
+    /* start the ivshmem server (open shm & unix socket) */
+    if (ivshmem_server_start(&server) < 0) {
+        fprintf(stderr, "cannot bind\n");
+        goto err;
+    }
+
+    /* daemonize if asked to */
+    if (!args.foreground) {
+        FILE *fp;
+
+        if (qemu_daemon(1, 1) < 0) {
+            fprintf(stderr, "cannot daemonize: %s\n", strerror(errno));
+            goto err_close;
+        }
+
+        /* write pid file */
+        fp = fopen(args.pid_file, "w");
+        if (fp == NULL) {
+            fprintf(stderr, "cannot write pid file: %s\n", strerror(errno));
+            goto err_close;
+        }
+
+        fprintf(fp, "%d\n", (int) getpid());
+        fclose(fp);
+    }
+
+    ivshmem_server_poll_events(&server);
+    fprintf(stdout, "server disconnected\n");
+    ret = 0;
+
+err_close:
+    ivshmem_server_close(&server);
+err:
+    return ret;
+}
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -30,6 +30,7 @@
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
 #include "hw/i386/apic.h"
 #endif
+#include "sysemu/replay.h"

 /* -icount align implementation. */

@@ -184,7 +185,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, uint8_t *tb_ptr)
 /* Execute the code without caching the generated code. An interpreter
   could be used if available. */
 static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
-                             TranslationBlock *orig_tb)
+                             TranslationBlock *orig_tb, bool ignore_icount)
 {
    TranslationBlock *tb;

@@ -194,7 +195,8 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
        max_cycles = CF_COUNT_MASK;

    tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
-                     max_cycles | CF_NOCACHE);
+                     max_cycles | CF_NOCACHE
+                         | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
    tb->orig_tb = tcg_ctx.tb_ctx.tb_invalidated_flag ? NULL : orig_tb;
    cpu->current_tb = tb;
    /* execute the generated code */
@@ -345,21 +347,25 @@ int cpu_exec(CPUState *cpu)
    uintptr_t next_tb;
    SyncClocks sc;

+    /* replay_interrupt may need current_cpu */
+    current_cpu = cpu;
+
    if (cpu->halted) {
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
-        if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
+        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
+            && replay_interrupt()) {
            apic_poll_irq(x86_cpu->apic_state);
            cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
        }
 #endif
        if (!cpu_has_work(cpu)) {
+            current_cpu = NULL;
            return EXCP_HALTED;
        }

        cpu->halted = 0;
    }

-    current_cpu = cpu;
    atomic_mb_set(&tcg_current_cpu, cpu);
    rcu_read_lock();

@@ -401,10 +407,22 @@ int cpu_exec(CPUState *cpu)
                    cpu->exception_index = -1;
                    break;
 #else
-                    cc->do_interrupt(cpu);
-                    cpu->exception_index = -1;
+                    if (replay_exception()) {
+                        cc->do_interrupt(cpu);
+                        cpu->exception_index = -1;
+                    } else if (!replay_has_interrupt()) {
+                        /* give a chance to iothread in replay mode */
+                        ret = EXCP_INTERRUPT;
+                        break;
+                    }
 #endif
                }
+            } else if (replay_has_exception()
+                       && cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
+                /* try to cause an exception pending in the log */
+                cpu_exec_nocache(cpu, 1, tb_find_fast(cpu), true);
+                ret = -1;
+                break;
            }

            next_tb = 0; /* force lookup of first TB */
@@ -420,30 +438,40 @@ int cpu_exec(CPUState *cpu)
                        cpu->exception_index = EXCP_DEBUG;
                        cpu_loop_exit(cpu);
                    }
-                    if (interrupt_request & CPU_INTERRUPT_HALT) {
+                    if (replay_mode == REPLAY_MODE_PLAY
+                        && !replay_has_interrupt()) {
+                        /* Do nothing */
+                    } else if (interrupt_request & CPU_INTERRUPT_HALT) {
+                        replay_interrupt();
                        cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
                        cpu->halted = 1;
                        cpu->exception_index = EXCP_HLT;
                        cpu_loop_exit(cpu);
                    }
 #if defined(TARGET_I386)
-                    if (interrupt_request & CPU_INTERRUPT_INIT) {
+                    else if (interrupt_request & CPU_INTERRUPT_INIT) {
+                        replay_interrupt();
                        cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
                        do_cpu_init(x86_cpu);
                        cpu->exception_index = EXCP_HALTED;
                        cpu_loop_exit(cpu);
                    }
 #else
-                    if (interrupt_request & CPU_INTERRUPT_RESET) {
+                    else if (interrupt_request & CPU_INTERRUPT_RESET) {
+                        replay_interrupt();
                        cpu_reset(cpu);
+                        cpu_loop_exit(cpu);
                    }
 #endif
                    /* The target hook has 3 exit conditions:
                       False when the interrupt isn't processed,
                       True when it is, and we should restart on a new TB,
                       and via longjmp via cpu_loop_exit.  */
-                    if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
-                        next_tb = 0;
+                    else {
+                        replay_interrupt();
+                        if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
+                            next_tb = 0;
+                        }
                    }
                    /* Don't use the cached interrupt_request value,
                       do_interrupt may have updated the EXITTB flag. */
@@ -454,7 +482,8 @@ int cpu_exec(CPUState *cpu)
                        next_tb = 0;
                    }
                }
-                if (unlikely(cpu->exit_request)) {
+                if (unlikely(cpu->exit_request
+                             || replay_has_interrupt())) {
                    cpu->exit_request = 0;
                    cpu->exception_index = EXCP_INTERRUPT;
                    cpu_loop_exit(cpu);
@@ -477,7 +506,8 @@ int cpu_exec(CPUState *cpu)
                /* see if we can patch the calling TB. When the TB
                   spans two pages, we cannot safely do a direct
                   jump. */
-                if (next_tb != 0 && tb->page_addr[1] == -1) {
+                if (next_tb != 0 && tb->page_addr[1] == -1
+                    && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
                    tb_add_jump((TranslationBlock *)(next_tb & ~TB_EXIT_MASK),
                                next_tb & TB_EXIT_MASK, tb);
                }
@@ -518,7 +548,7 @@ int cpu_exec(CPUState *cpu)
                            if (insns_left > 0) {
                                /* Execute remaining instructions.  */
                                tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
-                                cpu_exec_nocache(cpu, insns_left, tb);
+                                cpu_exec_nocache(cpu, insns_left, tb, false);
                                align_clocks(&sc, cpu);
                            }
                            cpu->exception_index = EXCP_INTERRUPT;
@@ -538,15 +568,27 @@ int cpu_exec(CPUState *cpu)
                   only be set by a memory fault) */
            } /* for(;;) */
        } else {
-            /* Reload env after longjmp - the compiler may have smashed all
-             * local variables as longjmp is marked 'noreturn'. */
+#if defined(__clang__) || !QEMU_GNUC_PREREQ(4, 6)
+            /* Some compilers wrongly smash all local variables after
+             * siglongjmp. There were bug reports for gcc 4.5.0 and clang.
+             * Reload essential local variables here for those compilers.
+             * Newer versions of gcc would complain about this code (-Wclobbered). */
            cpu = current_cpu;
            cc = CPU_GET_CLASS(cpu);
-            cpu->can_do_io = 1;
 #ifdef TARGET_I386
            x86_cpu = X86_CPU(cpu);
            env = &x86_cpu->env;
 #endif
+#else /* buggy compiler */
+            /* Assert that the compiler does not smash local variables. */
+            g_assert(cpu == current_cpu);
+            g_assert(cc == CPU_GET_CLASS(cpu));
+#ifdef TARGET_I386
+            g_assert(x86_cpu == X86_CPU(cpu));
+            g_assert(env == &x86_cpu->env);
+#endif
+#endif /* buggy compiler */
+            cpu->can_do_io = 1;
            tb_lock_reset();
        }
    } /* for(;;) */
--- a/cpus.c
+++ b/cpus.c
@@ -42,6 +42,7 @@
 #include "qemu/seqlock.h"
 #include "qapi-event.h"
 #include "hw/nmi.h"
+#include "sysemu/replay.h"

 #ifndef _WIN32
 #include "qemu/compatfd.h"
@@ -334,7 +335,7 @@ static int64_t qemu_icount_round(int64_t count)
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 }

-static void icount_warp_rt(void *opaque)
+static void icount_warp_rt(void)
 {
    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
@@ -345,7 +346,8 @@ static void icount_warp_rt(void *opaque)

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
-        int64_t clock = cpu_get_clock_locked();
+        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
+                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
@@ -368,6 +370,11 @@ static void icount_warp_rt(void *opaque)
    }
 }

+static void icount_dummy_timer(void *opaque)
+{
+    (void)opaque;
+}
+
 void qtest_clock_warp(int64_t dest)
 {
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
@@ -403,6 +410,18 @@ void qemu_clock_warp(QEMUClockType type)
        return;
    }

+    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
+     * do not fire, so computing the deadline does not make sense.
+     */
+    if (!runstate_is_running()) {
+        return;
+    }
+
+    /* warp clock deterministically in record/replay mode */
+    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP)) {
+        return;
+    }
+
    if (icount_sleep) {
        /*
         * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
@@ -412,7 +431,7 @@ void qemu_clock_warp(QEMUClockType type)
         * the CPU starts running, in case the CPU is woken by an event other
         * than the earliest QEMU_CLOCK_VIRTUAL timer.
         */
-        icount_warp_rt(NULL);
+        icount_warp_rt();
        timer_del(icount_warp_timer);
    }
    if (!all_cpu_threads_idle()) {
@@ -605,7 +624,7 @@ void configure_icount(QemuOpts *opts, Error **errp)
    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
-                                         icount_warp_rt, NULL);
+                                         icount_dummy_timer, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);
@@ -694,15 +713,6 @@ void cpu_synchronize_all_post_init(void)
    }
 }

-void cpu_clean_all_dirty(void)
-{
-    CPUState *cpu;
-
-    CPU_FOREACH(cpu) {
-        cpu_clean_state(cpu);
-    }
-}
-
 static int do_vm_stop(RunState state)
 {
    int ret = 0;
@@ -1405,12 +1415,36 @@ int vm_stop_force_state(RunState state)
        return vm_stop(state);
    } else {
        runstate_set(state);
+
+        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
 }

+static int64_t tcg_get_icount_limit(void)
+{
+    int64_t deadline;
+
+    if (replay_mode != REPLAY_MODE_PLAY) {
+        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+
+        /* Maintain prior (possibly buggy) behaviour where if no deadline
+         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
+         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
+         * nanoseconds.
+         */
+        if ((deadline < 0) || (deadline > INT32_MAX)) {
+            deadline = INT32_MAX;
+        }
+
+        return qemu_icount_round(deadline);
+    } else {
+        return replay_get_instructions();
+    }
+}
+
 static int tcg_cpu_exec(CPUState *cpu)
 {
    int ret;
@@ -1423,24 +1457,12 @@ static int tcg_cpu_exec(CPUState *cpu)
 #endif
    if (use_icount) {
        int64_t count;
-        int64_t deadline;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
-        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
-
-        /* Maintain prior (possibly buggy) behaviour where if no deadline
-         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
-         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
-         * nanoseconds.
-         */
-        if ((deadline < 0) || (deadline > INT32_MAX)) {
-            deadline = INT32_MAX;
-        }
-
-        count = qemu_icount_round(deadline);
+        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
@@ -1458,6 +1480,7 @@ static int tcg_cpu_exec(CPUState *cpu)
                        + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
+        replay_account_executed_instructions();
    }
    return ret;
 }
--- a/crypto/cipher-builtin.c
+++ b/crypto/cipher-builtin.c
@@ -25,8 +25,7 @@ typedef struct QCryptoCipherBuiltinAES QCryptoCipherBuiltinAES;
 struct QCryptoCipherBuiltinAES {
    AES_KEY encrypt_key;
    AES_KEY decrypt_key;
-    uint8_t *iv;
-    size_t niv;
+    uint8_t iv[AES_BLOCK_SIZE];
 };
 typedef struct QCryptoCipherBuiltinDESRFB QCryptoCipherBuiltinDESRFB;
 struct QCryptoCipherBuiltinDESRFB {
@@ -40,6 +39,7 @@ struct QCryptoCipherBuiltin {
        QCryptoCipherBuiltinAES aes;
        QCryptoCipherBuiltinDESRFB desrfb;
    } state;
+    size_t blocksize;
    void (*free)(QCryptoCipher *cipher);
    int (*setiv)(QCryptoCipher *cipher,
                 const uint8_t *iv, size_t niv,
@@ -61,7 +61,6 @@ static void qcrypto_cipher_free_aes(QCryptoCipher *cipher)
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

-    g_free(ctxt->state.aes.iv);
    g_free(ctxt);
    cipher->opaque = NULL;
 }
@@ -145,15 +144,13 @@ static int qcrypto_cipher_setiv_aes(QCryptoCipher *cipher,
                                     Error **errp)
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;
-    if (niv != 16) {
-        error_setg(errp, "IV must be 16 bytes not %zu", niv);
+    if (niv != AES_BLOCK_SIZE) {
+        error_setg(errp, "IV must be %d bytes not %zu",
+                   AES_BLOCK_SIZE, niv);
        return -1;
    }

-    g_free(ctxt->state.aes.iv);
-    ctxt->state.aes.iv = g_new0(uint8_t, niv);
-    memcpy(ctxt->state.aes.iv, iv, niv);
-    ctxt->state.aes.niv = niv;
+    memcpy(ctxt->state.aes.iv, iv, AES_BLOCK_SIZE);

    return 0;
 }
@@ -185,6 +182,7 @@ static int qcrypto_cipher_init_aes(QCryptoCipher *cipher,
        goto error;
    }

+    ctxt->blocksize = AES_BLOCK_SIZE;
    ctxt->free = qcrypto_cipher_free_aes;
    ctxt->setiv = qcrypto_cipher_setiv_aes;
    ctxt->encrypt = qcrypto_cipher_encrypt_aes;
@@ -286,6 +284,7 @@ static int qcrypto_cipher_init_des_rfb(QCryptoCipher *cipher,
    memcpy(ctxt->state.desrfb.key, key, nkey);
    ctxt->state.desrfb.nkey = nkey;

+    ctxt->blocksize = 8;
    ctxt->free = qcrypto_cipher_free_des_rfb;
    ctxt->setiv = qcrypto_cipher_setiv_des_rfb;
    ctxt->encrypt = qcrypto_cipher_encrypt_des_rfb;
@@ -374,6 +373,12 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

+    if (len % ctxt->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctxt->blocksize);
+        return -1;
+    }
+
    return ctxt->encrypt(cipher, in, out, len, errp);
 }

@@ -386,6 +391,12 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

+    if (len % ctxt->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctxt->blocksize);
+        return -1;
+    }
+
    return ctxt->decrypt(cipher, in, out, len, errp);
 }

--- a/crypto/cipher-gcrypt.c
+++ b/crypto/cipher-gcrypt.c
@@ -34,6 +34,11 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
    }
 }

+typedef struct QCryptoCipherGcrypt QCryptoCipherGcrypt;
+struct QCryptoCipherGcrypt {
+    gcry_cipher_hd_t handle;
+    size_t blocksize;
+};

 QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
                                  QCryptoCipherMode mode,
@@ -41,7 +46,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
                                  Error **errp)
 {
    QCryptoCipher *cipher;
-    gcry_cipher_hd_t handle;
+    QCryptoCipherGcrypt *ctx;
    gcry_error_t err;
    int gcryalg, gcrymode;

@@ -87,7 +92,9 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    cipher->alg = alg;
    cipher->mode = mode;

-    err = gcry_cipher_open(&handle, gcryalg, gcrymode, 0);
+    ctx = g_new0(QCryptoCipherGcrypt, 1);
+
+    err = gcry_cipher_open(&ctx->handle, gcryalg, gcrymode, 0);
    if (err != 0) {
        error_setg(errp, "Cannot initialize cipher: %s",
                   gcry_strerror(err));
@@ -100,10 +107,12 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
         * bizarre RFB variant of DES :-)
         */
        uint8_t *rfbkey = qcrypto_cipher_munge_des_rfb_key(key, nkey);
-        err = gcry_cipher_setkey(handle, rfbkey, nkey);
+        err = gcry_cipher_setkey(ctx->handle, rfbkey, nkey);
        g_free(rfbkey);
+        ctx->blocksize = 8;
    } else {
-        err = gcry_cipher_setkey(handle, key, nkey);
+        err = gcry_cipher_setkey(ctx->handle, key, nkey);
+        ctx->blocksize = 16;
    }
    if (err != 0) {
        error_setg(errp, "Cannot set key: %s",
@@ -111,11 +120,12 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        goto error;
    }

-    cipher->opaque = handle;
+    cipher->opaque = ctx;
    return cipher;

 error:
-    gcry_cipher_close(handle);
+    gcry_cipher_close(ctx->handle);
+    g_free(ctx);
    g_free(cipher);
    return NULL;
 }
@@ -123,12 +133,13 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,

 void qcrypto_cipher_free(QCryptoCipher *cipher)
 {
-    gcry_cipher_hd_t handle;
+    QCryptoCipherGcrypt *ctx;
    if (!cipher) {
        return;
    }
-    handle = cipher->opaque;
-    gcry_cipher_close(handle);
+    ctx = cipher->opaque;
+    gcry_cipher_close(ctx->handle);
+    g_free(ctx);
    g_free(cipher);
 }

@@ -139,10 +150,16 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
                           size_t len,
                           Error **errp)
 {
-    gcry_cipher_hd_t handle = cipher->opaque;
+    QCryptoCipherGcrypt *ctx = cipher->opaque;
    gcry_error_t err;

-    err = gcry_cipher_encrypt(handle,
+    if (len % ctx->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctx->blocksize);
+        return -1;
+    }
+
+    err = gcry_cipher_encrypt(ctx->handle,
                              out, len,
                              in, len);
    if (err != 0) {
@@ -161,10 +178,16 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,
                           size_t len,
                           Error **errp)
 {
-    gcry_cipher_hd_t handle = cipher->opaque;
+    QCryptoCipherGcrypt *ctx = cipher->opaque;
    gcry_error_t err;

-    err = gcry_cipher_decrypt(handle,
+    if (len % ctx->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctx->blocksize);
+        return -1;
+    }
+
+    err = gcry_cipher_decrypt(ctx->handle,
                              out, len,
                              in, len);
    if (err != 0) {
@@ -180,11 +203,17 @@ int qcrypto_cipher_setiv(QCryptoCipher *cipher,
                         const uint8_t *iv, size_t niv,
                         Error **errp)
 {
-    gcry_cipher_hd_t handle = cipher->opaque;
+    QCryptoCipherGcrypt *ctx = cipher->opaque;
    gcry_error_t err;

-    gcry_cipher_reset(handle);
-    err = gcry_cipher_setiv(handle, iv, niv);
+    if (niv != ctx->blocksize) {
+        error_setg(errp, "Expected IV size %zu not %zu",
+                   ctx->blocksize, niv);
+        return -1;
+    }
+
+    gcry_cipher_reset(ctx->handle);
+    err = gcry_cipher_setiv(ctx->handle, iv, niv);
    if (err != 0) {
        error_setg(errp, "Cannot set IV: %s",
                   gcry_strerror(err));
--- a/crypto/cipher-nettle.c
+++ b/crypto/cipher-nettle.c
@@ -69,7 +69,7 @@ struct QCryptoCipherNettle {
    nettle_cipher_func *alg_encrypt;
    nettle_cipher_func *alg_decrypt;
    uint8_t *iv;
-    size_t niv;
+    size_t blocksize;
 };

 bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
@@ -125,7 +125,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        ctx->alg_encrypt = des_encrypt_wrapper;
        ctx->alg_decrypt = des_decrypt_wrapper;

-        ctx->niv = DES_BLOCK_SIZE;
+        ctx->blocksize = DES_BLOCK_SIZE;
        break;

    case QCRYPTO_CIPHER_ALG_AES_128:
@@ -140,14 +140,14 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        ctx->alg_encrypt = aes_encrypt_wrapper;
        ctx->alg_decrypt = aes_decrypt_wrapper;

-        ctx->niv = AES_BLOCK_SIZE;
+        ctx->blocksize = AES_BLOCK_SIZE;
        break;
    default:
        error_setg(errp, "Unsupported cipher algorithm %d", alg);
        goto error;
    }

-    ctx->iv = g_new0(uint8_t, ctx->niv);
+    ctx->iv = g_new0(uint8_t, ctx->blocksize);
    cipher->opaque = ctx;

    return cipher;
@@ -184,6 +184,12 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
 {
    QCryptoCipherNettle *ctx = cipher->opaque;

+    if (len % ctx->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctx->blocksize);
+        return -1;
+    }
+
    switch (cipher->mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
        ctx->alg_encrypt(ctx->ctx_encrypt, len, out, in);
@@ -191,7 +197,7 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,

    case QCRYPTO_CIPHER_MODE_CBC:
        cbc_encrypt(ctx->ctx_encrypt, ctx->alg_encrypt,
-                    ctx->niv, ctx->iv,
+                    ctx->blocksize, ctx->iv,
                    len, out, in);
        break;
    default:
@@ -211,6 +217,12 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,
 {
    QCryptoCipherNettle *ctx = cipher->opaque;

+    if (len % ctx->blocksize) {
+        error_setg(errp, "Length %zu must be a multiple of block size %zu",
+                   len, ctx->blocksize);
+        return -1;
+    }
+
    switch (cipher->mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
        ctx->alg_decrypt(ctx->ctx_decrypt ? ctx->ctx_decrypt : ctx->ctx_encrypt,
@@ -219,7 +231,7 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,

    case QCRYPTO_CIPHER_MODE_CBC:
        cbc_decrypt(ctx->ctx_decrypt ? ctx->ctx_decrypt : ctx->ctx_encrypt,
-                    ctx->alg_decrypt, ctx->niv, ctx->iv,
+                    ctx->alg_decrypt, ctx->blocksize, ctx->iv,
                    len, out, in);
        break;
    default:
@@ -235,9 +247,9 @@ int qcrypto_cipher_setiv(QCryptoCipher *cipher,
                         Error **errp)
 {
    QCryptoCipherNettle *ctx = cipher->opaque;
-    if (niv != ctx->niv) {
+    if (niv != ctx->blocksize) {
        error_setg(errp, "Expected IV size %zu not %zu",
-                   ctx->niv, niv);
+                   ctx->blocksize, niv);
        return -1;
    }
    memcpy(ctx->iv, iv, niv);
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -47,7 +47,7 @@ qcrypto_cipher_validate_key_length(QCryptoCipherAlgorithm alg,
    return true;
 }

-#if defined(CONFIG_GNUTLS_GCRYPT) || defined(CONFIG_GNUTLS_NETTLE)
+#if defined(CONFIG_GCRYPT) || defined(CONFIG_NETTLE)
 static uint8_t *
 qcrypto_cipher_munge_des_rfb_key(const uint8_t *key,
                                 size_t nkey)
@@ -63,11 +63,11 @@ qcrypto_cipher_munge_des_rfb_key(const uint8_t *key,
    }
    return ret;
 }
-#endif /* CONFIG_GNUTLS_GCRYPT || CONFIG_GNUTLS_NETTLE */
+#endif /* CONFIG_GCRYPT || CONFIG_NETTLE */

-#ifdef CONFIG_GNUTLS_GCRYPT
+#ifdef CONFIG_GCRYPT
 #include "crypto/cipher-gcrypt.c"
-#elif defined CONFIG_GNUTLS_NETTLE
+#elif defined CONFIG_NETTLE
 #include "crypto/cipher-nettle.c"
 #else
 #include "crypto/cipher-builtin.c"
--- a/crypto/init.c
+++ b/crypto/init.c
@@ -24,8 +24,9 @@
 #ifdef CONFIG_GNUTLS
 #include <gnutls/gnutls.h>
 #include <gnutls/crypto.h>
+#endif

-#ifdef CONFIG_GNUTLS_GCRYPT
+#ifdef CONFIG_GCRYPT
 #include <gcrypt.h>
 #endif

@@ -37,6 +38,7 @@
 *  - When GNUTLS >= 2.12, we must not initialize gcrypt threading
 *    because GNUTLS will do that itself
 *  - When GNUTLS < 2.12 we must always initialize gcrypt threading
+ *  - When GNUTLS is disabled we must always initialize gcrypt threading
 *
 * But....
 *
@@ -47,12 +49,15 @@
 *
 *   - gcrypt < 1.6.0
 * AND
- *   - gnutls < 2.12
+ *      - gnutls < 2.12
+ *   OR
+ *      - gnutls is disabled
 *
 */

-#if (defined(CONFIG_GNUTLS_GCRYPT) &&           \
-     (!defined(GNUTLS_VERSION_NUMBER) ||        \
+#if (defined(CONFIG_GCRYPT) &&                  \
+     (!defined(CONFIG_GNUTLS) ||                \
+      !defined(GNUTLS_VERSION_NUMBER) ||       \
      (GNUTLS_VERSION_NUMBER < 0x020c00)) &&    \
     (!defined(GCRYPT_VERSION_NUMBER) ||        \
      (GCRYPT_VERSION_NUMBER < 0x010600)))
@@ -113,6 +118,7 @@ static struct gcry_thread_cbs qcrypto_gcrypt_thread_impl = {

 int qcrypto_init(Error **errp)
 {
+#ifdef CONFIG_GNUTLS
    int ret;
    ret = gnutls_global_init();
    if (ret < 0) {
@@ -125,8 +131,9 @@ int qcrypto_init(Error **errp)
    gnutls_global_set_log_level(10);
    gnutls_global_set_log_function(qcrypto_gnutls_log);
 #endif
+#endif

-#ifdef CONFIG_GNUTLS_GCRYPT
+#ifdef CONFIG_GCRYPT
    if (!gcry_check_version(GCRYPT_VERSION)) {
        error_setg(errp, "Unable to initialize gcrypt");
        return -1;
@@ -139,12 +146,3 @@ int qcrypto_init(Error **errp)

    return 0;
 }
-
-#else /* ! CONFIG_GNUTLS */
-
-int qcrypto_init(Error **errp G_GNUC_UNUSED)
-{
-    return 0;
-}
-
-#endif /* ! CONFIG_GNUTLS */
--- a/crypto/tlscreds.c
+++ b/crypto/tlscreds.c
@@ -123,10 +123,10 @@ qcrypto_tls_creds_get_path(QCryptoTLSCreds *creds,
        goto cleanup;
    }

-    trace_qcrypto_tls_creds_get_path(creds, filename,
-                                     *cred ? *cred : "<none>");
    ret = 0;
 cleanup:
+    trace_qcrypto_tls_creds_get_path(creds, filename,
+                                     *cred ? *cred : "<none>");
    return ret;
 }

--- a/crypto/tlscredsx509.c
+++ b/crypto/tlscredsx509.c
@@ -255,6 +255,7 @@ qcrypto_tls_creds_check_cert_key_purpose(QCryptoTLSCredsX509 *creds,
        }

        g_free(buffer);
+        buffer = NULL;
    }

    if (isServer) {
@@ -485,7 +486,8 @@ qcrypto_tls_creds_x509_sanity_check(QCryptoTLSCredsX509 *creds,
    int ret = -1;

    memset(cacerts, 0, sizeof(cacerts));
-    if (access(certFile, R_OK) == 0) {
+    if (certFile &&
+        access(certFile, R_OK) == 0) {
        cert = qcrypto_tls_creds_load_cert(creds,
                                           certFile, isServer,
                                           errp);
@@ -654,6 +656,10 @@ qcrypto_tls_creds_x509_unload(QCryptoTLSCredsX509 *creds)
        gnutls_certificate_free_credentials(creds->data);
        creds->data = NULL;
    }
+    if (creds->parent_obj.dh_params) {
+        gnutls_dh_params_deinit(creds->parent_obj.dh_params);
+        creds->parent_obj.dh_params = NULL;
+    }
 }


--- a/crypto/tlssession.c
+++ b/crypto/tlssession.c
@@ -304,9 +304,9 @@ qcrypto_tls_session_check_certificate(QCryptoTLSSession *session,

                allow = qemu_acl_party_is_allowed(acl, session->peername);

-                error_setg(errp, "TLS x509 ACL check for %s is %s",
-                           session->peername, allow ? "allowed" : "denied");
                if (!allow) {
+                    error_setg(errp, "TLS x509 ACL check for %s is denied",
+                               session->peername);
                    goto error;
                }
            }
--- a/default-configs/aarch64-linux-user.mak
+++ b/default-configs/aarch64-linux-user.mak
@@ -1,3 +1 @@
 # Default configuration for aarch64-linux-user
-
-CONFIG_GDBSTUB_XML=y
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -35,5 +35,5 @@ CONFIG_SDHCI=y
 CONFIG_EDU=y
 CONFIG_VGA=y
 CONFIG_VGA_PCI=y
-CONFIG_IVSHMEM=$(CONFIG_KVM)
+CONFIG_IVSHMEM=$(CONFIG_POSIX)
 CONFIG_ROCKER=y
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -3,6 +3,7 @@
 include pci.mak
 include sound.mak
 include usb.mak
+CONFIG_VIRTIO_VGA=y
 CONFIG_ISA_MMIO=y
 CONFIG_ESCC=y
 CONFIG_M48T59=y
--- a/disas.c
+++ b/disas.c
@@ -214,11 +214,6 @@ void target_disas(FILE *out, CPUState *cpu, target_ulong code,
        s.info.mach = bfd_mach_i386_i386;
    }
    s.info.print_insn = print_insn_i386;
-#elif defined(TARGET_SPARC)
-    s.info.print_insn = print_insn_sparc;
-#ifdef TARGET_SPARC64
-    s.info.mach = bfd_mach_sparc_v9b;
-#endif
 #elif defined(TARGET_PPC)
    if ((flags >> 16) & 1) {
        s.info.endian = BFD_ENDIAN_LITTLE;
@@ -235,29 +230,6 @@ void target_disas(FILE *out, CPUState *cpu, target_ulong code,
    }
    s.info.disassembler_options = (char *)"any";
    s.info.print_insn = print_insn_ppc;
-#elif defined(TARGET_M68K)
-    s.info.print_insn = print_insn_m68k;
-#elif defined(TARGET_MIPS)
-#ifdef TARGET_WORDS_BIGENDIAN
-    s.info.print_insn = print_insn_big_mips;
-#else
-    s.info.print_insn = print_insn_little_mips;
-#endif
-#elif defined(TARGET_SH4)
-    s.info.mach = bfd_mach_sh4;
-    s.info.print_insn = print_insn_sh;
-#elif defined(TARGET_ALPHA)
-    s.info.mach = bfd_mach_alpha_ev6;
-    s.info.print_insn = print_insn_alpha;
-#elif defined(TARGET_S390X)
-    s.info.mach = bfd_mach_s390_64;
-    s.info.print_insn = print_insn_s390;
-#elif defined(TARGET_MOXIE)
-    s.info.mach = bfd_arch_moxie;
-    s.info.print_insn = print_insn_moxie;
-#elif defined(TARGET_LM32)
-    s.info.mach = bfd_mach_lm32;
-    s.info.print_insn = print_insn_lm32;
 #endif
    if (s.info.print_insn == NULL) {
        s.info.print_insn = print_insn_od_target;
@@ -429,13 +401,6 @@ void monitor_disas(Monitor *mon, CPUState *cpu,
        s.info.mach = bfd_mach_i386_i386;
    }
    s.info.print_insn = print_insn_i386;
-#elif defined(TARGET_ALPHA)
-    s.info.print_insn = print_insn_alpha;
-#elif defined(TARGET_SPARC)
-    s.info.print_insn = print_insn_sparc;
-#ifdef TARGET_SPARC64
-    s.info.mach = bfd_mach_sparc_v9b;
-#endif
 #elif defined(TARGET_PPC)
    if (flags & 0xFFFF) {
        /* If we have a precise definition of the instruction set, use it. */
@@ -451,26 +416,6 @@ void monitor_disas(Monitor *mon, CPUState *cpu,
        s.info.endian = BFD_ENDIAN_LITTLE;
    }
    s.info.print_insn = print_insn_ppc;
-#elif defined(TARGET_M68K)
-    s.info.print_insn = print_insn_m68k;
-#elif defined(TARGET_MIPS)
-#ifdef TARGET_WORDS_BIGENDIAN
-    s.info.print_insn = print_insn_big_mips;
-#else
-    s.info.print_insn = print_insn_little_mips;
-#endif
-#elif defined(TARGET_SH4)
-    s.info.mach = bfd_mach_sh4;
-    s.info.print_insn = print_insn_sh;
-#elif defined(TARGET_S390X)
-    s.info.mach = bfd_mach_s390_64;
-    s.info.print_insn = print_insn_s390;
-#elif defined(TARGET_MOXIE)
-    s.info.mach = bfd_arch_moxie;
-    s.info.print_insn = print_insn_moxie;
-#elif defined(TARGET_LM32)
-    s.info.mach = bfd_mach_lm32;
-    s.info.print_insn = print_insn_lm32;
 #endif
    if (!s.info.print_insn) {
        monitor_printf(mon, "0x" TARGET_FMT_lx
--- a/disas/arm.c
+++ b/disas/arm.c
@@ -1779,7 +1779,7 @@ print_insn_coprocessor (bfd_vma pc, struct disassemble_info *info, long given,

 			/* Is ``imm'' a negative number?  */
 			if (imm & 0x40)
-			  imm |= (-1 << 7);
+			  imm |= (~0u << 7);

 			func (stream, "%d", imm);
 		      }
--- a/disas/mips.c
+++ b/disas/mips.c
@@ -2420,9 +2420,11 @@ const struct mips_opcode mips_builtin_opcodes[] =
 {"hibernate","",        0x42000023, 0xffffffff,	0, 			0,		V1	},
 {"ins",     "t,r,+A,+B", 0x7c000004, 0xfc00003f, WR_t|RD_s,    		0,		I33	},
 {"jr",      "s",	0x00000008, 0xfc1fffff,	UBD|RD_s,		0,		I1	},
+{"jr",      "s",	0x00000009, 0xfc1fffff,	UBD|RD_s,		0,		I32R6	}, /* jalr */
 /* jr.hb is officially MIPS{32,64}R2, but it works on R1 as jr with
   the same hazard barrier effect.  */
 {"jr.hb",   "s",	0x00000408, 0xfc1fffff,	UBD|RD_s,		0,		I32	},
+{"jr.hb",   "s",	0x00000409, 0xfc1fffff,	UBD|RD_s,		0,		I32R6	}, /* jalr.hb */
 {"j",       "s",	0x00000008, 0xfc1fffff,	UBD|RD_s,		0,		I1	}, /* jr */
 /* SVR4 PIC code requires special handling for j, so it must be a
   macro.  */
--- a/docs/bitmaps.md
+++ b/docs/bitmaps.md
@@ -19,12 +19,20 @@ which is included at the end of this document.
 * A dirty bitmap's name is unique to the node, but bitmaps attached to different
  nodes can share the same name.

+* Dirty bitmaps created for internal use by QEMU may be anonymous and have no
+  name, but any user-created bitmaps may not be. There can be any number of
+  anonymous bitmaps per node.
+
+* The name of a user-created bitmap must not be empty ("").
+
 ## Bitmap Modes

 * A Bitmap can be "frozen," which means that it is currently in-use by a backup
  operation and cannot be deleted, renamed, written to, reset,
  etc.

+* The normal operating mode for a bitmap is "active."
+
 ## Basic QMP Usage

 ### Supported Commands ###
@@ -97,11 +105,7 @@ which is included at the end of this document.
 }
 ```

-## Transactions (Not yet implemented)
-
-* Transactional commands are forthcoming in a future version,
-  and are not yet available for use. This section serves as
-  documentation of intent for their design and usage.
+## Transactions

 ### Justification

@@ -323,6 +327,155 @@ full backup as a backing image.
      "event": "BLOCK_JOB_COMPLETED" }
    ```

+### Partial Transactional Failures
+
+* Sometimes, a transaction will succeed in launching and return success,
+  but then later the backup jobs themselves may fail. It is possible that
+  a management application may have to deal with a partial backup failure
+  after a successful transaction.
+
+* If multiple backup jobs are specified in a single transaction, when one of
+  them fails, it will not interact with the other backup jobs in any way.
+
+* The job(s) that succeeded will clear the dirty bitmap associated with the
+  operation, but the job(s) that failed will not. It is not "safe" to delete
+  any incremental backups that were created successfully in this scenario,
+  even though others failed.
+
+#### Example
+
+* QMP example highlighting two backup jobs:
+
+    ```json
+    { "execute": "transaction",
+      "arguments": {
+        "actions": [
+          { "type": "drive-backup",
+            "data": { "device": "drive0", "bitmap": "bitmap0",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+          { "type": "drive-backup",
+            "data": { "device": "drive1", "bitmap": "bitmap1",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
+        ]
+      }
+    }
+    ```
+
+* QMP example response, highlighting one success and one failure:
+    * Acknowledgement that the Transaction was accepted and jobs were launched:
+        ```json
+        { "return": {} }
+        ```
+
+    * Later, QEMU sends notice that the first job was completed:
+        ```json
+        { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
+          "data": { "device": "drive0", "type": "backup",
+                     "speed": 0, "len": 67108864, "offset": 67108864 },
+          "event": "BLOCK_JOB_COMPLETED"
+        }
+        ```
+
+    * Later yet, QEMU sends notice that the second job has failed:
+        ```json
+        { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
+          "data": { "device": "drive1", "action": "report",
+                    "operation": "read" },
+          "event": "BLOCK_JOB_ERROR" }
+        ```
+
+        ```json
+        { "timestamp": { "seconds": 1447192399, "microseconds": 685853 },
+          "data": { "speed": 0, "offset": 0, "len": 67108864,
+                    "error": "Input/output error",
+                    "device": "drive1", "type": "backup" },
+          "event": "BLOCK_JOB_COMPLETED" }
+
+* In the above example, "d0-incr-1.qcow2" is valid and must be kept,
+  but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide
+  incremental backup of all drives at a point-in-time is to be made,
+  new backups for both drives will need to be made, taking into account
+  that a new incremental backup for drive0 needs to be based on top of
+  "d0-incr-1.qcow2."
+
+### Grouped Completion Mode
+
+* While jobs launched by transactions normally complete or fail on their own,
+  it is possible to instruct them to complete or fail together as a group.
+
+* QMP transactions take an optional properties structure that can affect
+  the semantics of the transaction.
+
+* The "completion-mode" transaction property can be either "individual"
+  which is the default, legacy behavior described above, or "grouped,"
+  a new behavior detailed below.
+
+* Delayed Completion: In grouped completion mode, no jobs will report
+  success until all jobs are ready to report success.
+
+* Grouped failure: If any job fails in grouped completion mode, all remaining
+  jobs will be cancelled. Any incremental backups will restore their dirty
+  bitmap objects as if no backup command was ever issued.
+
+    * Regardless of if QEMU reports a particular incremental backup job as
+      CANCELLED or as an ERROR, the in-memory bitmap will be restored.
+
+#### Example
+
+* Here's the same example scenario from above with the new property:
+
+    ```json
+    { "execute": "transaction",
+      "arguments": {
+        "actions": [
+          { "type": "drive-backup",
+            "data": { "device": "drive0", "bitmap": "bitmap0",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+          { "type": "drive-backup",
+            "data": { "device": "drive1", "bitmap": "bitmap1",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
+        ],
+        "properties": {
+          "completion-mode": "grouped"
+        }
+      }
+    }
+    ```
+
+* QMP example response, highlighting a failure for drive2:
+    * Acknowledgement that the Transaction was accepted and jobs were launched:
+        ```json
+        { "return": {} }
+        ```
+
+    * Later, QEMU sends notice that the second job has errored out,
+      but that the first job was also cancelled:
+        ```json
+        { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
+          "data": { "device": "drive1", "action": "report",
+                    "operation": "read" },
+          "event": "BLOCK_JOB_ERROR" }
+        ```
+
+        ```json
+        { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
+          "data": { "speed": 0, "offset": 0, "len": 67108864,
+                    "error": "Input/output error",
+                    "device": "drive1", "type": "backup" },
+          "event": "BLOCK_JOB_COMPLETED" }
+        ```
+
+        ```json
+        { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
+          "data": { "device": "drive0", "type": "backup", "speed": 0,
+                    "len": 67108864, "offset": 16777216 },
+          "event": "BLOCK_JOB_CANCELLED" }
+        ```
+
 <!--
 The FreeBSD Documentation License

--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -291,3 +291,194 @@ save/send this state when we are in the middle of a pio operation
 (that is what ide_drive_pio_state_needed() checks).  If DRQ_STAT is
 not enabled, the values on that fields are garbage and don't need to
 be sent.
+
+= Return path =
+
+In most migration scenarios there is only a single data path that runs
+from the source VM to the destination, typically along a single fd (although
+possibly with another fd or similar for some fast way of throwing pages across).
+
+However, some uses need two way communication; in particular the Postcopy
+destination needs to be able to request pages on demand from the source.
+
+For these scenarios there is a 'return path' from the destination to the source;
+qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return
+path.
+
+  Source side
+     Forward path - written by migration thread
+     Return path  - opened by main thread, read by return-path thread
+
+  Destination side
+     Forward path - read by main thread
+     Return path  - opened by main thread, written by main thread AND postcopy
+                    thread (protected by rp_mutex)
+
+= Postcopy =
+'Postcopy' migration is a way to deal with migrations that refuse to converge
+(or take too long to converge) its plus side is that there is an upper bound on
+the amount of migration traffic and time it takes, the down side is that during
+the postcopy phase, a failure of *either* side or the network connection causes
+the guest to be lost.
+
+In postcopy the destination CPUs are started before all the memory has been
+transferred, and accesses to pages that are yet to be transferred cause
+a fault that's translated by QEMU into a request to the source QEMU.
+
+Postcopy can be combined with precopy (i.e. normal migration) so that if precopy
+doesn't finish in a given time the switch is made to postcopy.
+
+=== Enabling postcopy ===
+
+To enable postcopy, issue this command on the monitor prior to the
+start of migration:
+
+migrate_set_capability x-postcopy-ram on
+
+The normal commands are then used to start a migration, which is still
+started in precopy mode.  Issuing:
+
+migrate_start_postcopy
+
+will now cause the transition from precopy to postcopy.
+It can be issued immediately after migration is started or any
+time later on.  Issuing it after the end of a migration is harmless.
+
+Note: During the postcopy phase, the bandwidth limits set using
+migrate_set_speed is ignored (to avoid delaying requested pages that
+the destination is waiting for).
+
+=== Postcopy device transfer ===
+
+Loading of device data may cause the device emulation to access guest RAM
+that may trigger faults that have to be resolved by the source, as such
+the migration stream has to be able to respond with page data *during* the
+device load, and hence the device data has to be read from the stream completely
+before the device load begins to free the stream up.  This is achieved by
+'packaging' the device data into a blob that's read in one go.
+
+Source behaviour
+
+Until postcopy is entered the migration stream is identical to normal
+precopy, except for the addition of a 'postcopy advise' command at
+the beginning, to tell the destination that postcopy might happen.
+When postcopy starts the source sends the page discard data and then
+forms the 'package' containing:
+
+   Command: 'postcopy listen'
+   The device state
+      A series of sections, identical to the precopy streams device state stream
+      containing everything except postcopiable devices (i.e. RAM)
+   Command: 'postcopy run'
+
+The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the
+contents are formatted in the same way as the main migration stream.
+
+During postcopy the source scans the list of dirty pages and sends them
+to the destination without being requested (in much the same way as precopy),
+however when a page request is received from the destination, the dirty page
+scanning restarts from the requested location.  This causes requested pages
+to be sent quickly, and also causes pages directly after the requested page
+to be sent quickly in the hope that those pages are likely to be used
+by the destination soon.
+
+Destination behaviour
+
+Initially the destination looks the same as precopy, with a single thread
+reading the migration stream; the 'postcopy advise' and 'discard' commands
+are processed to change the way RAM is managed, but don't affect the stream
+processing.
+
+------------------------------------------------------------------------------
+                        1      2   3     4 5                      6   7
+main -----DISCARD-CMD_PACKAGED ( LISTEN  DEVICE     DEVICE DEVICE RUN )
+thread                             |       |
+                                   |     (page request)
+                                   |        \___
+                                   v            \
+listen thread:                     --- page -- page -- page -- page -- page --
+
+                                   a   b        c
+------------------------------------------------------------------------------
+
+On receipt of CMD_PACKAGED (1)
+   All the data associated with the package - the ( ... ) section in the
+diagram - is read into memory (into a QEMUSizedBuffer), and the main thread
+recurses into qemu_loadvm_state_main to process the contents of the package (2)
+which contains commands (3,6) and devices (4...)
+
+On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package)
+a new thread (a) is started that takes over servicing the migration stream,
+while the main thread carries on loading the package.   It loads normal
+background page data (b) but if during a device load a fault happens (5) the
+returned page (c) is loaded by the listen thread allowing the main threads
+device load to carry on.
+
+The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination
+CPUs start running.
+At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour
+and is no longer used by migration, while the listen thread carries
+on servicing page data until the end of migration.
+
+=== Postcopy states ===
+
+Postcopy moves through a series of states (see postcopy_state) from
+ADVISE->DISCARD->LISTEN->RUNNING->END
+
+  Advise:  Set at the start of migration if postcopy is enabled, even
+           if it hasn't had the start command; here the destination
+           checks that its OS has the support needed for postcopy, and performs
+           setup to ensure the RAM mappings are suitable for later postcopy.
+           The destination will fail early in migration at this point if the
+           required OS support is not present.
+           (Triggered by reception of POSTCOPY_ADVISE command)
+
+  Discard: Entered on receipt of the first 'discard' command; prior to
+           the first Discard being performed, hugepages are switched off
+           (using madvise) to ensure that no new huge pages are created
+           during the postcopy phase, and to cause any huge pages that
+           have discards on them to be broken.
+
+  Listen:  The first command in the package, POSTCOPY_LISTEN, switches
+           the destination state to Listen, and starts a new thread
+           (the 'listen thread') which takes over the job of receiving
+           pages off the migration stream, while the main thread carries
+           on processing the blob.  With this thread able to process page
+           reception, the destination now 'sensitises' the RAM to detect
+           any access to missing pages (on Linux using the 'userfault'
+           system).
+
+  Running: POSTCOPY_RUN causes the destination to synchronise all
+           state and start the CPUs and IO devices running.  The main
+           thread now finishes processing the migration package and
+           now carries on as it would for normal precopy migration
+           (although it can't do the cleanup it would do as it
+           finishes a normal migration).
+
+  End:     The listen thread can now quit, and perform the cleanup of migration
+           state, the migration is now complete.
+
+=== Source side page maps ===
+
+The source side keeps two bitmaps during postcopy; 'the migration bitmap'
+and 'unsent map'.  The 'migration bitmap' is basically the same as in
+the precopy case, and holds a bit to indicate that page is 'dirty' -
+i.e. needs sending.  During the precopy phase this is updated as the CPU
+dirties pages, however during postcopy the CPUs are stopped and nothing
+should dirty anything any more.
+
+The 'unsent map' is used for the transition to postcopy. It is a bitmap that
+has a bit cleared whenever a page is sent to the destination, however during
+the transition to postcopy mode it is combined with the migration bitmap
+to form a set of pages that:
+   a) Have been sent but then redirtied (which must be discarded)
+   b) Have not yet been sent - which also must be discarded to cause any
+      transparent huge pages built during precopy to be broken.
+
+Note that the contents of the unsentmap are sacrificed during the calculation
+of the discard set and thus aren't valid once in postcopy.  The dirtymap
+is still valid and is used to ensure that no page is sent more than once.  Any
+request for a page that has already been sent is ignored.  Duplicate requests
+such as this can happen as a page is sent at about the same time the
+destination accesses it.
+
--- a/docs/qapi-code-gen.txt
+++ b/docs/qapi-code-gen.txt
@@ -106,12 +106,15 @@ Types, commands, and events share a common namespace.  Therefore,
 generally speaking, type definitions should always use CamelCase for
 user-defined type names, while built-in types are lowercase. Type
 definitions should not end in 'Kind', as this namespace is used for
-creating implicit C enums for visiting union types.  Command names,
+creating implicit C enums for visiting union types, or in 'List', as
+this namespace is used for creating array types.  Command names,
 and field names within a type, should be all lower case with words
 separated by a hyphen.  However, some existing older commands and
 complex types use underscore; when extending such expressions,
 consistency is preferred over blindly avoiding underscore.  Event
-names should be ALL_CAPS with words separated by underscore.
+names should be ALL_CAPS with words separated by underscore.  Field
+names cannot start with 'has-' or 'has_', as this is reserved for
+tracking optional fields.

 Any name (command, event, type, field, or enum value) beginning with
 "x-" is marked experimental, and may be withdrawn or changed
@@ -122,9 +125,10 @@ vendor), even if the rest of the name uses dash (example:
 __com.redhat_drive-mirror).  Other than downstream extensions (with
 leading underscore and the use of dots), all names should begin with a
 letter, and contain only ASCII letters, digits, dash, and underscore.
-It is okay to reuse names that match C keywords; the generator will
-rename a field named "default" in the QAPI to "q_default" in the
-generated C code.
+Names beginning with 'q_' are reserved for the generator: QMP names
+that resemble C keywords or other problematic strings will be munged
+in C to use this prefix.  For example, a field named "default" in
+qapi becomes "q_default" in the generated C code.

 In the rest of this document, usage lines are given for each
 expression type, with literal strings written in lower case and
@@ -510,8 +514,23 @@ exactly the server (QEMU) supports.
 For this purpose, QMP provides introspection via command
 query-qmp-schema.  QGA currently doesn't support introspection.

+While Client JSON Protocol wire compatibility should be maintained
+between qemu versions, we cannot make the same guarantees for
+introspection stability.  For example, one version of qemu may provide
+a non-variant optional member of a struct, and a later version rework
+the member to instead be non-optional and associated with a variant.
+Likewise, one version of qemu may list a member with open-ended type
+'str', and a later version could convert it to a finite set of strings
+via an enum type; or a member may be converted from a specific type to
+an alternate that represents a choice between the original type and
+something else.
+
 query-qmp-schema returns a JSON array of SchemaInfo objects.  These
 objects together describe the wire ABI, as defined in the QAPI schema.
+There is no specified order to the SchemaInfo objects returned; a
+client must search for a particular name throughout the entire array
+to learn more about that name, but is at least guaranteed that there
+will be no collisions between type, command, and event names.

 However, the SchemaInfo can't reflect all the rules and restrictions
 that apply to QMP.  It's interface introspection (figuring out what's
@@ -592,7 +611,9 @@ any.  Each element is a JSON object with members "name" (the member's
 name), "type" (the name of its type), and optionally "default".  The
 member is optional if "default" is present.  Currently, "default" can
 only have value null.  Other values are reserved for future
-extensions.
+extensions.  The "members" array is in no particular order; clients
+must search the entire object when learning whether a particular
+member is supported.

 Example: the SchemaInfo for MyType from section Struct types

@@ -606,7 +627,9 @@ Example: the SchemaInfo for MyType from section Struct types
 "variants" is a JSON array describing the object's variant members.
 Each element is a JSON object with members "case" (the value of type
 tag this element applies to) and "type" (the name of an object type
-that provides the variant members for this type tag value).
+that provides the variant members for this type tag value).  The
+"variants" array is in no particular order, and is not guaranteed to
+list cases in the same order as the corresponding "tag" enum type.

 Example: the SchemaInfo for flat union BlockdevOptions from section
 Union types
@@ -647,7 +670,8 @@ Union types
 The SchemaInfo for an alternate type has meta-type "alternate", and
 variant member "members".  "members" is a JSON array.  Each element is
 a JSON object with member "type", which names a type.  Values of the
-alternate type conform to exactly one of its member types.
+alternate type conform to exactly one of its member types.  There is
+no guarantee on the order in which "members" will be listed.

 Example: the SchemaInfo for BlockRef from section Alternate types

@@ -658,15 +682,20 @@ Example: the SchemaInfo for BlockRef from section Alternate types

 The SchemaInfo for an array type has meta-type "array", and variant
 member "element-type", which names the array's element type.  Array
-types are implicitly defined.
+types are implicitly defined.  For convenience, the array's name may
+resemble the element type; however, clients should examine member
+"element-type" instead of making assumptions based on parsing member
+"name".

 Example: the SchemaInfo for ['str']

-    { "name": "strList", "meta-type": "array",
+    { "name": "[str]", "meta-type": "array",
      "element-type": "str" }

 The SchemaInfo for an enumeration type has meta-type "enum" and
-variant member "values".
+variant member "values".  The values are listed in no particular
+order; clients must search the entire enum when learning whether a
+particular value is supported.

 Example: the SchemaInfo for MyEnum from section Enumeration types

--- a/docs/qmp-events.txt
+++ b/docs/qmp-events.txt
@@ -28,6 +28,8 @@ Example:
    "data": { "actual": 944766976 },
    "timestamp": { "seconds": 1267020223, "microseconds": 435656 } }

+Note: this event is rate-limited.
+
 BLOCK_IMAGE_CORRUPTED
 ---------------------

@@ -296,6 +298,8 @@ Example:
     "data": { "reference": "usr1", "sector-num": 345435, "sectors-count": 5 },
     "timestamp": { "seconds": 1344522075, "microseconds": 745528 } }

+Note: this event is rate-limited.
+
 QUORUM_REPORT_BAD
 -----------------

@@ -318,6 +322,8 @@ Example:
     "data": { "node-name": "1.raw", "sector-num": 345435, "sectors-count": 5 },
     "timestamp": { "seconds": 1344522075, "microseconds": 745528 } }

+Note: this event is rate-limited.
+
 RESET
 -----

@@ -358,6 +364,8 @@ Example:
    "data": { "offset": 78 },
    "timestamp": { "seconds": 1267020223, "microseconds": 435656 } }

+Note: this event is rate-limited.
+
 SHUTDOWN
 --------

@@ -632,6 +640,8 @@ Example:
    "data": { "id": "channel0", "open": true },
    "timestamp": { "seconds": 1401385907, "microseconds": 422329 } }

+Note: this event is rate-limited separately for each "id".
+
 WAKEUP
 ------

@@ -662,3 +672,5 @@ Example:

 Note: If action is "reset", "shutdown", or "pause" the WATCHDOG event is
 followed respectively by the RESET, SHUTDOWN, or STOP events.
+
+Note: this event is rate-limited.
--- a/docs/qmp-spec.txt
+++ b/docs/qmp-spec.txt
@@ -175,6 +175,11 @@ The format of asynchronous events is:
 For a listing of supported asynchronous events, please, refer to the
 qmp-events.txt file.

+Some events are rate-limited to at most one per second.  If additional
+"similar" events arrive within one second, all but the last one are
+dropped, and the last one is delayed.  "Similar" normally means same
+event type.  See qmp-events.txt for details.
+
 2.5 QGA Synchronization
 -----------------------

--- a/docs/replay.txt
+++ b/docs/replay.txt
@@ -0,0 +1,168 @@
+Copyright (c) 2010-2015 Institute for System Programming
+                        of the Russian Academy of Sciences.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+Record/replay
+-------------
+
+Record/replay functions are used for the reverse execution and deterministic
+replay of qemu execution. This implementation of deterministic replay can
+be used for deterministic debugging of guest code through a gdb remote
+interface.
+
+Execution recording writes a non-deterministic events log, which can be later
+used for replaying the execution anywhere and for unlimited number of times.
+It also supports checkpointing for faster rewinding during reverse debugging.
+Execution replaying reads the log and replays all non-deterministic events
+including external input, hardware clocks, and interrupts.
+
+Deterministic replay has the following features:
+ * Deterministically replays whole system execution and all contents of
+   the memory, state of the hardware devices, clocks, and screen of the VM.
+ * Writes execution log into the file for later replaying for multiple times
+   on different machines.
+ * Supports i386, x86_64, and ARM hardware platforms.
+ * Performs deterministic replay of all operations with keyboard and mouse
+   input devices.
+
+Usage of the record/replay:
+ * First, record the execution, by adding the following arguments to the command line:
+   '-icount shift=7,rr=record,rrfile=replay.bin -net none'.
+   Block devices' images are not actually changed in the recording mode,
+   because all of the changes are written to the temporary overlay file.
+ * Then you can replay it by using another command
+   line option: '-icount shift=7,rr=replay,rrfile=replay.bin -net none'
+ * '-net none' option should also be specified if network replay patches
+   are not applied.
+
+Papers with description of deterministic replay implementation:
+http://www.computer.org/csdl/proceedings/csmr/2012/4666/00/4666a553-abs.html
+http://dl.acm.org/citation.cfm?id=2786805.2803179
+
+Modifications of qemu include:
+ * wrappers for clock and time functions to save their return values in the log
+ * saving different asynchronous events (e.g. system shutdown) into the log
+ * synchronization of the bottom halves execution
+ * synchronization of the threads from thread pool
+ * recording/replaying user input (mouse and keyboard)
+ * adding internal checkpoints for cpu and io synchronization
+
+Non-deterministic events
+------------------------
+
+Our record/replay system is based on saving and replaying non-deterministic
+events (e.g. keyboard input) and simulating deterministic ones (e.g. reading
+from HDD or memory of the VM). Saving only non-deterministic events makes
+log file smaller, simulation faster, and allows using reverse debugging even
+for realtime applications.
+
+The following non-deterministic data from peripheral devices is saved into
+the log: mouse and keyboard input, network packets, audio controller input,
+USB packets, serial port input, and hardware clocks (they are non-deterministic
+too, because their values are taken from the host machine). Inputs from
+simulated hardware, memory of VM, software interrupts, and execution of
+instructions are not saved into the log, because they are deterministic and
+can be replayed by simulating the behavior of virtual machine starting from
+initial state.
+
+We had to solve three tasks to implement deterministic replay: recording
+non-deterministic events, replaying non-deterministic events, and checking
+that there is no divergence between record and replay modes.
+
+We changed several parts of QEMU to make event log recording and replaying.
+Devices' models that have non-deterministic input from external devices were
+changed to write every external event into the execution log immediately.
+E.g. network packets are written into the log when they arrive into the virtual
+network adapter.
+
+All non-deterministic events are coming from these devices. But to
+replay them we need to know at which moments they occur. We specify
+these moments by counting the number of instructions executed between
+every pair of consecutive events.
+
+Instruction counting
+--------------------
+
+QEMU should work in icount mode to use record/replay feature. icount was
+designed to allow deterministic execution in absence of external inputs
+of the virtual machine. We also use icount to control the occurrence of the
+non-deterministic events. The number of instructions elapsed from the last event
+is written to the log while recording the execution. In replay mode we
+can predict when to inject that event using the instruction counter.
+
+Timers
+------
+
+Timers are used to execute callbacks from different subsystems of QEMU
+at the specified moments of time. There are several kinds of timers:
+ * Real time clock. Based on host time and used only for callbacks that
+   do not change the virtual machine state. For this reason real time
+   clock and timers does not affect deterministic replay at all.
+ * Virtual clock. These timers run only during the emulation. In icount
+   mode virtual clock value is calculated using executed instructions counter.
+   That is why it is completely deterministic and does not have to be recorded.
+ * Host clock. This clock is used by device models that simulate real time
+   sources (e.g. real time clock chip). Host clock is the one of the sources
+   of non-determinism. Host clock read operations should be logged to
+   make the execution deterministic.
+ * Real time clock for icount. This clock is similar to real time clock but
+   it is used only for increasing virtual clock while virtual machine is
+   sleeping. Due to its nature it is also non-deterministic as the host clock
+   and has to be logged too.
+
+Checkpoints
+-----------
+
+Replaying of the execution of virtual machine is bound by sources of
+non-determinism. These are inputs from clock and peripheral devices,
+and QEMU thread scheduling. Thread scheduling affect on processing events
+from timers, asynchronous input-output, and bottom halves.
+
+Invocations of timers are coupled with clock reads and changing the state
+of the virtual machine. Reads produce non-deterministic data taken from
+host clock. And VM state changes should preserve their order. Their relative
+order in replay mode must replicate the order of callbacks in record mode.
+To preserve this order we use checkpoints. When a specific clock is processed
+in record mode we save to the log special "checkpoint" event.
+Checkpoints here do not refer to virtual machine snapshots. They are just
+record/replay events used for synchronization.
+
+QEMU in replay mode will try to invoke timers processing in random moment
+of time. That's why we do not process a group of timers until the checkpoint
+event will be read from the log. Such an event allows synchronizing CPU
+execution and timer events.
+
+Another checkpoints application in record/replay is instruction counting
+while the virtual machine is idle. This function (qemu_clock_warp) is called
+from the wait loop. It changes virtual machine state and must be deterministic
+then. That is why we added checkpoint to this function to prevent its
+operation in replay mode when it does not correspond to record mode.
+
+Bottom halves
+-------------
+
+Disk I/O events are completely deterministic in our model, because
+in both record and replay modes we start virtual machine from the same
+disk state. But callbacks that virtual disk controller uses for reading and
+writing the disk may occur at different moments of time in record and replay
+modes.
+
+Reading and writing requests are created by CPU thread of QEMU. Later these
+requests proceed to block layer which creates "bottom halves". Bottom
+halves consist of callback and its parameters. They are processed when
+main loop locks the global mutex. These locks are not synchronized with
+replaying process because main loop also processes the events that do not
+affect the virtual machine state (like user interaction with monitor).
+
+That is why we had to implement saving and replaying bottom halves callbacks
+synchronously to the CPU execution. When the callback is about to execute
+it is added to the queue in the replay module. This queue is written to the
+log when its callbacks are executed. In replay mode callbacks are not processed
+until the corresponding event is read from the events log file.
+
+Sometimes the block layer uses asynchronous callbacks for its internal purposes
+(like reading or writing VM snapshots or disk image cluster tables). In this
+case bottom halves are not marked as "replayable" and do not saved
+into the log.
--- a/docs/specs/fw_cfg.txt
+++ b/docs/specs/fw_cfg.txt
@@ -76,6 +76,13 @@ increasing address order, similar to memcpy().

 Selector Register IOport: 0x510
 Data Register IOport:     0x511
+DMA Address IOport:       0x514
+
+=== ARM Register Locations ===
+
+Selector Register address: Base + 8 (2 bytes)
+Data Register address:     Base + 0 (8 bytes)
+DMA Address address:       Base + 16 (8 bytes)

 == Firmware Configuration Items ==

@@ -86,11 +93,15 @@ by selecting the "signature" item using key 0x0000 (FW_CFG_SIGNATURE),
 and reading four bytes from the data register. If the fw_cfg device is
 present, the four bytes read will contain the characters "QEMU".

-=== Revision (Key 0x0001, FW_CFG_ID) ===
+If the DMA interface is available, then reading the DMA Address
+Register returns 0x51454d5520434647 ("QEMU CFG" in big-endian format).

-A 32-bit little-endian unsigned int, this item is used as an interface
-revision number, and is currently set to 1 by QEMU when fw_cfg is
-initialized.
+=== Revision / feature bitmap (Key 0x0001, FW_CFG_ID) ===
+
+A 32-bit little-endian unsigned int, this item is used to check for enabled
+features.
+ - Bit 0: traditional interface. Always set.
+ - Bit 1: DMA interface.

 === File Directory (Key 0x0019, FW_CFG_FILE_DIR) ===

@@ -132,6 +143,55 @@ Selector Reg.    Range Usage
 In practice, the number of allowed firmware configuration items is given
 by the value of FW_CFG_MAX_ENTRY (see fw_cfg.h).

+= Guest-side DMA Interface =
+
+If bit 1 of the feature bitmap is set, the DMA interface is present. This does
+not replace the existing fw_cfg interface, it is an add-on. This interface
+can be used through the 64-bit wide address register.
+
+The address register is in big-endian format. The value for the register is 0
+at startup and after an operation. A write to the least significant half (at
+offset 4) triggers an operation. This means that operations with 32-bit
+addresses can be triggered with just one write, whereas operations with
+64-bit addresses can be triggered with one 64-bit write or two 32-bit writes,
+starting with the most significant half (at offset 0).
+
+In this register, the physical address of a FWCfgDmaAccess structure in RAM
+should be written. This is the format of the FWCfgDmaAccess structure:
+
+typedef struct FWCfgDmaAccess {
+    uint32_t control;
+    uint32_t length;
+    uint64_t address;
+} FWCfgDmaAccess;
+
+The fields of the structure are in big endian mode, and the field at the lowest
+address is the "control" field.
+
+The "control" field has the following bits:
+ - Bit 0: Error
+ - Bit 1: Read
+ - Bit 2: Skip
+ - Bit 3: Select. The upper 16 bits are the selected index.
+
+When an operation is triggered, if the "control" field has bit 3 set, the
+upper 16 bits are interpreted as an index of a firmware configuration item.
+This has the same effect as writing the selector register.
+
+If the "control" field has bit 1 set, a read operation will be performed.
+"length" bytes for the current selector and offset will be copied into the
+physical RAM address specified by the "address" field.
+
+If the "control" field has bit 2 set (and not bit 1), a skip operation will be
+performed. The offset for the current selector will be advanced "length" bytes.
+
+To check the result, read the "control" field:
+   error bit set        ->  something went wrong.
+   all bits cleared     ->  transfer finished successfully.
+   otherwise            ->  transfer still in progress (doesn't happen
+                            today due to implementation not being async,
+                            but may in the future).
+
 = Host-side API =

 The following functions are available to the QEMU programmer for adding
@@ -159,6 +219,17 @@ will convert a 16-, 32-, or 64-bit integer to little-endian, then add
 a dynamically allocated copy of the appropriately sized item to fw_cfg
 under the given selector key value.

+== fw_cfg_modify_iXX() ==
+
+Modify the value of an XX-bit item (where XX may be 16, 32, or 64).
+Similarly to the corresponding fw_cfg_add_iXX() function set, convert
+a 16-, 32-, or 64-bit integer to little endian, create a dynamically
+allocated copy of the required size, and replace the existing item at
+the given selector key value with the newly allocated one. The previous
+item, assumed to have been allocated during an earlier call to
+fw_cfg_add_iXX() or fw_cfg_modify_iXX() (of the same width XX), is freed
+before the function returns.
+
 == fw_cfg_add_file() ==

 Given a filename (i.e., fw_cfg item name), starting pointer, and size,
@@ -216,6 +287,21 @@ the following syntax:
 where <item_name> is the fw_cfg item name, and <path> is the location
 on the host file system of a file containing the data to be inserted.

+Small enough items may be provided directly as strings on the command
+line, using the syntax:
+
+    -fw_cfg [name=]<item_name>,string=<string>
+
+The terminating NUL character of the content <string> will NOT be
+included as part of the fw_cfg item data, which is consistent with
+the absence of a NUL terminator for items inserted via the file option.
+
+Both <item_name> and, if applicable, the content <string> are passed
+through by QEMU without any interpretation, expansion, or further
+processing. Any such processing (potentially performed e.g., by the shell)
+is outside of QEMU's responsibility; as such, using plain ASCII characters
+is recommended.
+
 NOTE: Users *SHOULD* choose item names beginning with the prefix "opt/"
 when using the "-fw_cfg" command line option, to avoid conflicting with
 item names used internally by QEMU. For instance:
--- a/docs/specs/ivshmem_device_spec.txt
+++ b/docs/specs/ivshmem_device_spec.txt
@@ -2,30 +2,106 @@
 Device Specification for Inter-VM shared memory device
 ------------------------------------------------------

-The Inter-VM shared memory device is designed to share a region of memory to
-userspace in multiple virtual guests.  The memory region does not belong to any
-guest, but is a POSIX memory object on the host.  Optionally, the device may
-support sending interrupts to other guests sharing the same memory region.
+The Inter-VM shared memory device is designed to share a memory region (created
+on the host via the POSIX shared memory API) between multiple QEMU processes
+running different guests. In order for all guests to be able to pick up the
+shared memory area, it is modeled by QEMU as a PCI device exposing said memory
+to the guest as a PCI BAR.
+The memory region does not belong to any guest, but is a POSIX memory object on
+the host. The host can access this shared memory if needed.
+
+The device also provides an optional communication mechanism between guests
+sharing the same memory object. More details about that in the section 'Guest to
+guest communication' section.


 The Inter-VM PCI device
 -----------------------

-*BARs*
+From the VM point of view, the ivshmem PCI device supports three BARs.

-The device supports three BARs.  BAR0 is a 1 Kbyte MMIO region to support
-registers.  BAR1 is used for MSI-X when it is enabled in the device.  BAR2 is
-used to map the shared memory object from the host.  The size of BAR2 is
-specified when the guest is started and must be a power of 2 in size.
+- BAR0 is a 1 Kbyte MMIO region to support registers and interrupts when MSI is
+  not used.
+- BAR1 is used for MSI-X when it is enabled in the device.
+- BAR2 is used to access the shared memory object.

-*Registers*
+It is your choice how to use the device but you must choose between two
+behaviors :

-The device currently supports 4 registers of 32-bits each.  Registers
-are used for synchronization between guests sharing the same memory object when
-interrupts are supported (this requires using the shared memory server).
+- basically, if you only need the shared memory part, you will map BAR2.
+  This way, you have access to the shared memory in guest and can use it as you
+  see fit (memnic, for example, uses it in userland
+  http://dpdk.org/browse/memnic).

-The server assigns each VM an ID number and sends this ID number to the QEMU
-process when the guest starts.
+- BAR0 and BAR1 are used to implement an optional communication mechanism
+  through interrupts in the guests. If you need an event mechanism between the
+  guests accessing the shared memory, you will most likely want to write a
+  kernel driver that will handle interrupts. See details in the section 'Guest
+  to guest communication' section.
+
+The behavior is chosen when starting your QEMU processes:
+- no communication mechanism needed, the first QEMU to start creates the shared
+  memory on the host, subsequent QEMU processes will use it.
+
+- communication mechanism needed, an ivshmem server must be started before any
+  QEMU processes, then each QEMU process connects to the server unix socket.
+
+For more details on the QEMU ivshmem parameters, see qemu-doc documentation.
+
+
+Guest to guest communication
+----------------------------
+
+This section details the communication mechanism between the guests accessing
+the ivhsmem shared memory.
+
+*ivshmem server*
+
+This server code is available in qemu.git/contrib/ivshmem-server.
+
+The server must be started on the host before any guest.
+It creates a shared memory object then waits for clients to connect on a unix
+socket. All the messages are little-endian int64_t integer.
+
+For each client (QEMU process) that connects to the server:
+- the server sends a protocol version, if client does not support it, the client
+  closes the communication,
+- the server assigns an ID for this client and sends this ID to him as the first
+  message,
+- the server sends a fd to the shared memory object to this client,
+- the server creates a new set of host eventfds associated to the new client and
+  sends this set to all already connected clients,
+- finally, the server sends all the eventfds sets for all clients to the new
+  client.
+
+The server signals all clients when one of them disconnects.
+
+The client IDs are limited to 16 bits because of the current implementation (see
+Doorbell register in 'PCI device registers' subsection). Hence only 65536
+clients are supported.
+
+All the file descriptors (fd to the shared memory, eventfds for each client)
+are passed to clients using SCM_RIGHTS over the server unix socket.
+
+Apart from the current ivshmem implementation in QEMU, an ivshmem client has
+been provided in qemu.git/contrib/ivshmem-client for debug.
+
+*QEMU as an ivshmem client*
+
+At initialisation, when creating the ivshmem device, QEMU first receives a
+protocol version and closes communication with server if it does not match.
+Then, QEMU gets its ID from the server then makes it available through BAR0
+IVPosition register for the VM to use (see 'PCI device registers' subsection).
+QEMU then uses the fd to the shared memory to map it to BAR2.
+eventfds for all other clients received from the server are stored to implement
+BAR0 Doorbell register (see 'PCI device registers' subsection).
+Finally, eventfds assigned to this QEMU process are used to send interrupts in
+this VM.
+
+*PCI device registers*
+
+From the VM point of view, the ivshmem PCI device supports 4 registers of
+32-bits each.

 enum ivshmem_registers {
    IntrMask = 0,
@@ -49,8 +125,8 @@ bit to 0 and unmasked by setting the first bit to 1.
 IVPosition Register: The IVPosition register is read-only and reports the
 guest's ID number.  The guest IDs are non-negative integers.  When using the
 server, since the server is a separate process, the VM ID will only be set when
-the device is ready (shared memory is received from the server and accessible via
-the device).  If the device is not ready, the IVPosition will return -1.
+the device is ready (shared memory is received from the server and accessible
+via the device).  If the device is not ready, the IVPosition will return -1.
 Applications should ensure that they have a valid VM ID before accessing the
 shared memory.

@@ -59,8 +135,8 @@ Doorbell register.  The doorbell register is 32-bits, logically divided into
 two 16-bit fields.  The high 16-bits are the guest ID to interrupt and the low
 16-bits are the interrupt vector to trigger.  The semantics of the value
 written to the doorbell depends on whether the device is using MSI or a regular
-pin-based interrupt.  In short, MSI uses vectors while regular interrupts set the
-status register.
+pin-based interrupt.  In short, MSI uses vectors while regular interrupts set
+the status register.

 Regular Interrupts

@@ -71,7 +147,7 @@ interrupt in the destination guest.

 Message Signalled Interrupts

-A ivshmem device may support multiple MSI vectors.  If so, the lower 16-bits
+An ivshmem device may support multiple MSI vectors.  If so, the lower 16-bits
 written to the Doorbell register must be between 0 and the maximum number of
 vectors the guest supports.  The lower 16 bits written to the doorbell is the
 MSI vector that will be raised in the destination guest.  The number of MSI
@@ -83,14 +159,3 @@ interrupt itself should be communicated via the shared memory region.  Devices
 supporting multiple MSI vectors can use different vectors to indicate different
 events have occurred.  The semantics of interrupt vectors are left to the
 user's discretion.
-
-
-Usage in the Guest
------------------
-
-The shared memory device is intended to be used with the provided UIO driver.
-Very little configuration is needed.  The guest should map BAR0 to access the
-registers (an array of 32-bit ints allows simple writing) and map BAR2 to
-access the shared memory region itself.  The size of the shared memory region
-is specified when the guest (or shared memory server) is started.  A guest may
-map the whole shared memory region or only part of it.
--- a/docs/specs/vhost-user.txt
+++ b/docs/specs/vhost-user.txt
@@ -87,6 +87,14 @@ Depending on the request type, payload can be:
   User address: a 64-bit user address
   mmap offset: 64-bit offset where region starts in the mapped memory

+* Log description
+   ---------------------------
+   | log size | log offset |
+   ---------------------------
+   log size: size of area used for logging
+   log offset: offset from start of supplied file descriptor
+       where logging starts (i.e. where guest address 0 would be logged)
+
 In QEMU the vhost-user message is implemented with the following struct:

 typedef struct VhostUserMsg {
@@ -98,6 +106,7 @@ typedef struct VhostUserMsg {
        struct vhost_vring_state state;
        struct vhost_vring_addr addr;
        VhostUserMemory memory;
+        VhostUserLog log;
    };
 } QEMU_PACKED VhostUserMsg;

@@ -115,11 +124,13 @@ the ones that do:
 * VHOST_GET_FEATURES
 * VHOST_GET_PROTOCOL_FEATURES
 * VHOST_GET_VRING_BASE
+ * VHOST_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD)

 There are several messages that the master sends with file descriptors passed
 in the ancillary data:

 * VHOST_SET_MEM_TABLE
+ * VHOST_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD)
 * VHOST_SET_LOG_FD
 * VHOST_SET_VRING_KICK
 * VHOST_SET_VRING_CALL
@@ -135,13 +146,45 @@ As older slaves don't support negotiating protocol features,
 a feature bit was dedicated for this purpose:
 #define VHOST_USER_F_PROTOCOL_FEATURES 30

+Starting and stopping rings
+----------------------
+Client must only process each ring when it is started.
+
+Client must only pass data between the ring and the
+backend, when the ring is enabled.
+
+If ring is started but disabled, client must process the
+ring without talking to the backend.
+
+For example, for a networking device, in the disabled state
+client must not supply any new RX packets, but must process
+and discard any TX packets.
+
+If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, the ring is initialized
+in an enabled state.
+
+If VHOST_USER_F_PROTOCOL_FEATURES has been negotiated, the ring is initialized
+in a disabled state. Client must not pass data to/from the backend until ring is enabled by
+VHOST_USER_SET_VRING_ENABLE with parameter 1, or after it has been disabled by
+VHOST_USER_SET_VRING_ENABLE with parameter 0.
+
+Each ring is initialized in a stopped state, client must not process it until
+ring is started, or after it has been stopped.
+
+Client must start ring upon receiving a kick (that is, detecting that file
+descriptor is readable) on the descriptor specified by
+VHOST_USER_SET_VRING_KICK, and stop ring upon receiving
+VHOST_USER_GET_VRING_BASE.
+
+While processing the rings (whether they are enabled or not), client must
+support changing some configuration aspects on the fly.
+
 Multiple queue support
 ----------------------

 Multiple queue is treated as a protocol extension, hence the slave has to
 implement protocol features first. The multiple queues feature is supported
-only when the protocol feature VHOST_USER_PROTOCOL_F_MQ (bit 0) is set:
-#define VHOST_USER_PROTOCOL_F_MQ    0
+only when the protocol feature VHOST_USER_PROTOCOL_F_MQ (bit 0) is set.

 The max number of queues the slave supports can be queried with message
 VHOST_USER_GET_PROTOCOL_FEATURES. Master should stop when the number of
@@ -152,6 +195,66 @@ queue in the sent message to identify a specified queue. One queue pair
 is enabled initially. More queues are enabled dynamically, by sending
 message VHOST_USER_SET_VRING_ENABLE.

+Migration
+---------
+
+During live migration, the master may need to track the modifications
+the slave makes to the memory mapped regions. The client should mark
+the dirty pages in a log. Once it complies to this logging, it may
+declare the VHOST_F_LOG_ALL vhost feature.
+
+To start/stop logging of data/used ring writes, server may send messages
+VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and VHOST_USER_SET_VRING_ADDR with
+VHOST_VRING_F_LOG in ring's flags set to 1/0, respectively.
+
+All the modifications to memory pointed by vring "descriptor" should
+be marked. Modifications to "used" vring should be marked if
+VHOST_VRING_F_LOG is part of ring's flags.
+
+Dirty pages are of size:
+#define VHOST_LOG_PAGE 0x1000
+
+The log memory fd is provided in the ancillary data of
+VHOST_USER_SET_LOG_BASE message when the slave has
+VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol feature.
+
+The size of the log is supplied as part of VhostUserMsg
+which should be large enough to cover all known guest
+addresses. Log starts at the supplied offset in the
+supplied file descriptor.
+The log covers from address 0 to the maximum of guest
+regions. In pseudo-code, to mark page at "addr" as dirty:
+
+page = addr / VHOST_LOG_PAGE
+log[page / 8] |= 1 << page % 8
+
+Where addr is the guest physical address.
+
+Use atomic operations, as the log may be concurrently manipulated.
+
+Note that when logging modifications to the used ring (when VHOST_VRING_F_LOG
+is set for this ring), log_guest_addr should be used to calculate the log
+offset: the write to first byte of the used ring is logged at this offset from
+log start. Also note that this value might be outside the legal guest physical
+address range (i.e. does not have to be covered by the VhostUserMemory table),
+but the bit offset of the last byte of the ring must fall within
+the size supplied by VhostUserLog.
+
+VHOST_USER_SET_LOG_FD is an optional message with an eventfd in
+ancillary data, it may be used to inform the master that the log has
+been modified.
+
+Once the source has finished migration, rings will be stopped by
+the source. No further update must be done before rings are
+restarted.
+
+Protocol features
+-----------------
+
+#define VHOST_USER_PROTOCOL_F_MQ             0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD      1
+#define VHOST_USER_PROTOCOL_F_RARP           2
+
 Message types
 -------------

@@ -211,14 +314,16 @@ Message types
      as an owner of the session. This can be used on the Slave as a
      "session start" flag.

- * VHOST_USER_RESET_DEVICE
+ * VHOST_USER_RESET_OWNER

      Id: 4
-      Equivalent ioctl: VHOST_RESET_DEVICE
      Master payload: N/A

-      Issued when a new connection is about to be closed. The Master will no
-      longer own this connection (and will usually close it).
+      This is no longer used. Used to be sent to request disabling
+      all rings, but some clients interpreted it to also discard
+      connection state (this interpretation would lead to bugs).
+      It is recommended that clients either ignore this message,
+      or use it to disable all rings.

 * VHOST_USER_SET_MEM_TABLE

@@ -236,8 +341,14 @@ Message types
      Id: 6
      Equivalent ioctl: VHOST_SET_LOG_BASE
      Master payload: u64
+      Slave payload: N/A
+
+      Sets logging shared memory space.
+      When slave has VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol
+      feature, the log memory fd is provided in the ancillary data of
+      VHOST_USER_SET_LOG_BASE message, the size and offset of shared
+      memory area provided in the message.

-      Sets the logging base address.

 * VHOST_USER_SET_LOG_FD

@@ -337,3 +448,19 @@ Message types
      Master payload: vring state description

      Signal slave to enable or disable corresponding vring.
+      This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES
+      has been negotiated.
+
+ * VHOST_USER_SEND_RARP
+
+      Id: 19
+      Equivalent ioctl: N/A
+      Master payload: u64
+
+      Ask vhost user backend to broadcast a fake RARP to notify the migration
+      is terminated for guest that does not support GUEST_ANNOUNCE.
+      Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in
+      VHOST_USER_GET_FEATURES and protocol feature bit VHOST_USER_PROTOCOL_F_RARP
+      is present in VHOST_USER_GET_PROTOCOL_FEATURES.
+      The first 6 bytes of the payload contain the mac address of the guest to
+      allow the vhost user backend to construct and broadcast the fake RARP.
--- a/docs/virtio-migration.txt
+++ b/docs/virtio-migration.txt
@@ -0,0 +1,106 @@
+Virtio devices and migration
+============================
+
+Copyright 2015 IBM Corp.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.  See
+the COPYING file in the top-level directory.
+
+Saving and restoring the state of virtio devices is a bit of a twisty maze,
+for several reasons:
+- state is distributed between several parts:
+  - virtio core, for common fields like features, number of queues, ...
+  - virtio transport (pci, ccw, ...), for the different proxy devices and
+    transport specific state (msix vectors, indicators, ...)
+  - virtio device (net, blk, ...), for the different device types and their
+    state (mac address, request queue, ...)
+- most fields are saved via the stream interface; subsequently, subsections
+  have been added to make cross-version migration possible
+
+This file attempts to document the current procedure and point out some
+caveats.
+
+
+Save state procedure
+====================
+
+virtio core               virtio transport          virtio device
+-----------               ----------------          -------------
+
+                                                    save() function registered
+                                                    via register_savevm()
+virtio_save()                                       <----------
+             ------>      save_config()
+                          - save proxy device
+                          - save transport-specific
+                            device fields
+- save common device
+  fields
+- save common virtqueue
+  fields
+             ------>      save_queue()
+                          - save transport-specific
+                            virtqueue fields
+             ------>                               save_device()
+                                                   - save device-specific
+                                                     fields
+- save subsections
+  - device endianness,
+    if changed from
+    default endianness
+  - 64 bit features, if
+    any high feature bit
+    is set
+  - virtio-1 virtqueue
+    fields, if VERSION_1
+    is set
+
+
+Load state procedure
+====================
+
+virtio core               virtio transport          virtio device
+-----------               ----------------          -------------
+
+                                                    load() function registered
+                                                    via register_savevm()
+virtio_load()                                       <----------
+             ------>      load_config()
+                          - load proxy device
+                          - load transport-specific
+                            device fields
+- load common device
+  fields
+- load common virtqueue
+  fields
+             ------>      load_queue()
+                          - load transport-specific
+                            virtqueue fields
+- notify guest
+             ------>                               load_device()
+                                                   - load device-specific
+                                                     fields
+- load subsections
+  - device endianness
+  - 64 bit features
+  - virtio-1 virtqueue
+    fields
+- sanitize endianness
+- sanitize features
+- virtqueue index sanity
+  check
+                                                   - feature-dependent setup
+
+
+Implications of this setup
+==========================
+
+Devices need to be careful in their state processing during load: The
+load_device() procedure is invoked by the core before subsections have
+been loaded. Any code that depends on information transmitted in subsections
+therefore has to be invoked in the device's load() function _after_
+virtio_load() returned (like e.g. code depending on features).
+
+Any extension of the state being migrated should be done in subsections
+added to the core for compatibility reasons. If transport or device specific
+state is added, core needs to invoke a callback from the new subsection.
--- a/docs/writing-qmp-commands.txt
+++ b/docs/writing-qmp-commands.txt
@@ -210,7 +210,7 @@ if you don't see these strings, then something went wrong.
 === Errors ===

 QMP commands should use the error interface exported by the error.h header
-file. Basically, errors are set by calling the error_set() function.
+file. Basically, most errors are set by calling the error_setg() function.

 Let's say we don't accept the string "message" to contain the word "love". If
 it does contain it, we want the "hello-world" command to return an error:
@@ -219,8 +219,7 @@ void qmp_hello_world(bool has_message, const char *message, Error **errp)
 {
    if (has_message) {
        if (strstr(message, "love")) {
-            error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                      "the word 'love' is not allowed");
+            error_setg(errp, "the word 'love' is not allowed");
            return;
        }
        printf("%s\n", message);
@@ -229,10 +228,8 @@ void qmp_hello_world(bool has_message, const char *message, Error **errp)
    }
 }

-The first argument to the error_set() function is the Error pointer to pointer,
-which is passed to all QMP functions. The second argument is a ErrorClass
-value, which should be ERROR_CLASS_GENERIC_ERROR most of the time (more
-details about error classes are given below). The third argument is a human
+The first argument to the error_setg() function is the Error pointer
+to pointer, which is passed to all QMP functions. The next argument is a human
 description of the error, this is a free-form printf-like string.

 Let's test the example above. Build qemu, run it as defined in the "Testing"
@@ -249,8 +246,9 @@ The QMP server's response should be:
    }
 }

-As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR. There
-are two exceptions to this rule:
+As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR
+(done by default when using error_setg()). There are two exceptions to
+this rule:

 1. A non-generic ErrorClass value exists* for the failure you want to report
    (eg. DeviceNotFound)
@@ -259,8 +257,8 @@ are two exceptions to this rule:
    want to report, hence you have to add a new ErrorClass value so that they
    can check for it

-If the failure you want to report doesn't fall in one of the two cases above,
-just report ERROR_CLASS_GENERIC_ERROR.
+If the failure you want to report falls into one of the two cases above,
+use error_set() with a second argument of an ErrorClass value.

 * All existing ErrorClass values are defined in the qapi-schema.json file

--- a/exec.c
+++ b/exec.c
@@ -50,11 +50,15 @@
 #include "qemu/rcu_queue.h"
 #include "qemu/main-loop.h"
 #include "translate-all.h"
+#include "sysemu/replay.h"

 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"

 #include "qemu/range.h"
+#ifndef _WIN32
+#include "qemu/mmap-alloc.h"
+#endif

 //#define DEBUG_SUBPAGE

@@ -84,9 +88,9 @@ static MemoryRegion io_mem_unassigned;
 */
 #define RAM_RESIZEABLE (1 << 2)

-/* An extra page is mapped on top of this RAM.
+/* RAM is backed by an mmapped file.
 */
-#define RAM_EXTRA (1 << 3)
+#define RAM_FILE (1 << 3)
 #endif

 struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
@@ -879,6 +883,7 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...)
    }
    va_end(ap2);
    va_end(ap);
+    replay_finish();
 #if defined(CONFIG_USER_ONLY)
    {
        struct sigaction act;
@@ -898,7 +903,7 @@ static RAMBlock *qemu_get_ram_block(ram_addr_t addr)

    block = atomic_rcu_read(&ram_list.mru_block);
    if (block && addr - block->offset < block->max_length) {
-        goto found;
+        return block;
    }
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (addr - block->offset < block->max_length) {
@@ -1059,9 +1064,11 @@ static uint16_t phys_section_add(PhysPageMap *map,

 static void phys_section_destroy(MemoryRegion *mr)
 {
+    bool have_sub_page = mr->subpage;
+
    memory_region_unref(mr);

-    if (mr->subpage) {
+    if (have_sub_page) {
        subpage_t *subpage = container_of(mr, subpage_t, iomem);
        object_unref(OBJECT(&subpage->iomem));
        g_free(subpage);
@@ -1191,9 +1198,6 @@ static long gethugepagesize(const char *path, Error **errp)
        return 0;
    }

-    if (fs.f_type != HUGETLBFS_MAGIC)
-        fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
-
    return fs.f_bsize;
 }

@@ -1202,16 +1206,14 @@ static void *file_ram_alloc(RAMBlock *block,
                            const char *path,
                            Error **errp)
 {
+    struct stat st;
    char *filename;
    char *sanitized_name;
    char *c;
-    void *ptr;
-    void *area = NULL;
+    void *area;
    int fd;
    uint64_t hpagesize;
-    uint64_t total;
    Error *local_err = NULL;
-    size_t offset;

    hpagesize = gethugepagesize(path, &local_err);
    if (local_err) {
@@ -1233,29 +1235,35 @@ static void *file_ram_alloc(RAMBlock *block,
        goto error;
    }

-    /* Make name safe to use with mkstemp by replacing '/' with '_'. */
-    sanitized_name = g_strdup(memory_region_name(block->mr));
-    for (c = sanitized_name; *c != '\0'; c++) {
-        if (*c == '/')
-            *c = '_';
+    if (!stat(path, &st) && S_ISDIR(st.st_mode)) {
+        /* Make name safe to use with mkstemp by replacing '/' with '_'. */
+        sanitized_name = g_strdup(memory_region_name(block->mr));
+        for (c = sanitized_name; *c != '\0'; c++) {
+            if (*c == '/') {
+                *c = '_';
+            }
+        }
+
+        filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
+                                   sanitized_name);
+        g_free(sanitized_name);
+
+        fd = mkstemp(filename);
+        if (fd >= 0) {
+            unlink(filename);
+        }
+        g_free(filename);
+    } else {
+        fd = open(path, O_RDWR | O_CREAT, 0644);
    }

-    filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
-                               sanitized_name);
-    g_free(sanitized_name);
-
-    fd = mkstemp(filename);
    if (fd < 0) {
        error_setg_errno(errp, errno,
                         "unable to create backing store for hugepages");
-        g_free(filename);
        goto error;
    }
-    unlink(filename);
-    g_free(filename);

    memory = ROUND_UP(memory, hpagesize);
-    total = memory + hpagesize;

    /*
     * ftruncate is not supported by hugetlbfs in older
@@ -1267,40 +1275,14 @@ static void *file_ram_alloc(RAMBlock *block,
        perror("ftruncate");
    }

-    ptr = mmap(0, total, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS,
-                -1, 0);
-    if (ptr == MAP_FAILED) {
-        error_setg_errno(errp, errno,
-                         "unable to allocate memory range for hugepages");
-        close(fd);
-        goto error;
-    }
-
-    offset = QEMU_ALIGN_UP((uintptr_t)ptr, hpagesize) - (uintptr_t)ptr;
-
-    area = mmap(ptr + offset, memory, PROT_READ | PROT_WRITE,
-                (block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE) |
-                MAP_FIXED,
-                fd, 0);
+    area = qemu_ram_mmap(fd, memory, hpagesize, block->flags & RAM_SHARED);
    if (area == MAP_FAILED) {
        error_setg_errno(errp, errno,
                         "unable to map backing store for hugepages");
-        munmap(ptr, total);
        close(fd);
        goto error;
    }

-    if (offset > 0) {
-        munmap(ptr, offset);
-    }
-    ptr += offset;
-    total -= offset;
-
-    if (total > memory + getpagesize()) {
-        munmap(ptr + memory + getpagesize(),
-               total - memory - getpagesize());
-    }
-
    if (mem_prealloc) {
        os_mem_prealloc(fd, area, memory);
    }
@@ -1309,10 +1291,6 @@ static void *file_ram_alloc(RAMBlock *block,
    return area;

 error:
-    if (mem_prealloc) {
-        error_report("%s", error_get_pretty(*errp));
-        exit(1);
-    }
    return NULL;
 }
 #endif
@@ -1398,6 +1376,11 @@ static RAMBlock *find_ram_block(ram_addr_t addr)
    return NULL;
 }

+const char *qemu_ram_get_idstr(RAMBlock *rb)
+{
+    return rb->idstr;
+}
+
 /* Called with iothread lock held.  */
 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
 {
@@ -1468,7 +1451,7 @@ int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)

    assert(block);

-    newsize = TARGET_PAGE_ALIGN(newsize);
+    newsize = HOST_PAGE_ALIGN(newsize);

    if (block->used_length == newsize) {
        return 0;
@@ -1612,13 +1595,13 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
        return -1;
    }

-    size = TARGET_PAGE_ALIGN(size);
+    size = HOST_PAGE_ALIGN(size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->used_length = size;
    new_block->max_length = size;
    new_block->flags = share ? RAM_SHARED : 0;
-    new_block->flags |= RAM_EXTRA;
+    new_block->flags |= RAM_FILE;
    new_block->host = file_ram_alloc(new_block, size,
                                     mem_path, errp);
    if (!new_block->host) {
@@ -1648,8 +1631,8 @@ ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
    ram_addr_t addr;
    Error *local_err = NULL;

-    size = TARGET_PAGE_ALIGN(size);
-    max_size = TARGET_PAGE_ALIGN(max_size);
+    size = HOST_PAGE_ALIGN(size);
+    max_size = HOST_PAGE_ALIGN(max_size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->resized = resized;
@@ -1720,8 +1703,8 @@ static void reclaim_ramblock(RAMBlock *block)
        xen_invalidate_map_cache_entry(block->host);
 #ifndef _WIN32
    } else if (block->fd >= 0) {
-        if (block->flags & RAM_EXTRA) {
-            munmap(block->host, block->max_length + getpagesize());
+        if (block->flags & RAM_FILE) {
+            qemu_ram_munmap(block->host, block->max_length);
        } else {
            munmap(block->host, block->max_length);
        }
@@ -1898,8 +1881,16 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
    }
 }

-/* Some of the softmmu routines need to translate from a host pointer
- * (typically a TLB entry) back to a ram offset.
+/*
+ * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
+ * in that RAMBlock.
+ *
+ * ptr: Host pointer to look up
+ * round_offset: If true round the result offset down to a page boundary
+ * *ram_addr: set to result ram_addr
+ * *offset: set to result offset within the RAMBlock
+ *
+ * Returns: RAMBlock (or NULL if not found)
 *
 * By the time this function returns, the returned pointer is not protected
 * by RCU anymore.  If the caller is not within an RCU critical section and
@@ -1907,18 +1898,22 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
 * pointer, such as a reference to the region that includes the incoming
 * ram_addr_t.
 */
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
+                                   ram_addr_t *ram_addr,
+                                   ram_addr_t *offset)
 {
    RAMBlock *block;
    uint8_t *host = ptr;
-    MemoryRegion *mr;

    if (xen_enabled()) {
        rcu_read_lock();
        *ram_addr = xen_ram_addr_from_mapcache(ptr);
-        mr = qemu_get_ram_block(*ram_addr)->mr;
+        block = qemu_get_ram_block(*ram_addr);
+        if (block) {
+            *offset = (host - block->host);
+        }
        rcu_read_unlock();
-        return mr;
+        return block;
    }

    rcu_read_lock();
@@ -1941,10 +1936,49 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
    return NULL;

 found:
-    *ram_addr = block->offset + (host - block->host);
-    mr = block->mr;
+    *offset = (host - block->host);
+    if (round_offset) {
+        *offset &= TARGET_PAGE_MASK;
+    }
+    *ram_addr = block->offset + *offset;
    rcu_read_unlock();
-    return mr;
+    return block;
+}
+
+/*
+ * Finds the named RAMBlock
+ *
+ * name: The name of RAMBlock to find
+ *
+ * Returns: RAMBlock (or NULL if not found)
+ */
+RAMBlock *qemu_ram_block_by_name(const char *name)
+{
+    RAMBlock *block;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        if (!strcmp(name, block->idstr)) {
+            return block;
+        }
+    }
+
+    return NULL;
+}
+
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+{
+    RAMBlock *block;
+    ram_addr_t offset; /* Not used */
+
+    block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset);
+
+    if (!block) {
+        return NULL;
+    }
+
+    return block->mr;
 }

 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
@@ -2725,8 +2759,8 @@ void cpu_register_map_client(QEMUBH *bh)
 void cpu_exec_init_all(void)
 {
    qemu_mutex_init(&ram_list.mutex);
-    memory_map_init();
    io_mem_init();
+    memory_map_init();
    qemu_mutex_init(&map_client_list_lock);
 }

@@ -3523,6 +3557,16 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
    }
    return 0;
 }
+
+/*
+ * Allows code that needs to deal with migration bitmaps etc to still be built
+ * target independent.
+ */
+size_t qemu_target_page_bits(void)
+{
+    return TARGET_PAGE_BITS;
+}
+
 #endif

 /*
--- a/fsdev/virtfs-proxy-helper.c
+++ b/fsdev/virtfs-proxy-helper.c
@@ -1128,10 +1128,19 @@ int main(int argc, char **argv)
        }
    }

+    if (chdir("/") < 0) {
+        do_perror("chdir");
+        goto error;
+    }
+    if (chroot(rpath) < 0) {
+        do_perror("chroot");
+        goto error;
+    }
+
    get_version = false;
 #ifdef FS_IOC_GETVERSION
    /* check whether underlying FS support IOC_GETVERSION */
-    retval = statfs(rpath, &st_fs);
+    retval = statfs("/", &st_fs);
    if (!retval) {
        switch (st_fs.f_type) {
        case EXT2_SUPER_MAGIC:
@@ -1144,16 +1153,7 @@ int main(int argc, char **argv)
    }
 #endif

-    if (chdir("/") < 0) {
-        do_perror("chdir");
-        goto error;
-    }
-    if (chroot(rpath) < 0) {
-        do_perror("chroot");
-        goto error;
-    }
    umask(0);
-
    if (init_capabilities() < 0) {
        goto error;
    }
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -956,6 +956,13 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
        if (*p == ',')
            p++;
        len = strtoull(p, NULL, 16);
+
+        /* memtohex() doubles the required space */
+        if (len > MAX_PACKET_LENGTH / 2) {
+            put_packet (s, "E22");
+            break;
+        }
+
        if (target_memory_rw_debug(s->g_cpu, addr, mem_buf, len, false) != 0) {
            put_packet (s, "E14");
        } else {
@@ -970,6 +977,12 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
        len = strtoull(p, (char **)&p, 16);
        if (*p == ':')
            p++;
+
+        /* hextomem() reads 2*len bytes */
+        if (len > strlen(p) / 2) {
+            put_packet (s, "E22");
+            break;
+        }
        hextomem(mem_buf, p, len);
        if (target_memory_rw_debug(s->g_cpu, addr, mem_buf, len,
                                   true) != 0) {
@@ -1107,7 +1120,8 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
            cpu = find_cpu(thread);
            if (cpu != NULL) {
                cpu_synchronize_state(cpu);
-                len = snprintf((char *)mem_buf, sizeof(mem_buf),
+                /* memtohex() doubles the required space */
+                len = snprintf((char *)mem_buf, sizeof(buf) / 2,
                               "CPU#%d [%s]", cpu->cpu_index,
                               cpu->halted ? "halted " : "running");
                memtohex(buf, mem_buf, len);
@@ -1136,8 +1150,8 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
                put_packet(s, "E01");
                break;
            }
-            hextomem(mem_buf, p + 5, len);
            len = len / 2;
+            hextomem(mem_buf, p + 5, len);
            mem_buf[len++] = 0;
            qemu_chr_be_write(s->mon_chr, mem_buf, len);
            put_packet(s, "OK");
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -194,8 +194,8 @@ ETEXI

    {
        .name       = "change",
-        .args_type  = "device:B,target:F,arg:s?",
-        .params     = "device filename [format]",
+        .args_type  = "device:B,target:F,arg:s?,read-only-mode:s?",
+        .params     = "device filename [format [read-only-mode]]",
        .help       = "change a removable medium, optional format",
        .mhandler.cmd = hmp_change,
    },
@@ -206,7 +206,7 @@ STEXI
 Change the configuration of a device.

@table @option
-@item change @var{diskdevice} @var{filename} [@var{format}]
+@item change @var{diskdevice} @var{filename} [@var{format} [@var{read-only-mode}]]
 Change the medium for a removable disk device to point to @var{filename}. eg

@example
@@ -215,6 +215,20 @@ Change the medium for a removable disk device to point to @var{filename}. eg

@var{format} is optional.

+@var{read-only-mode} may be used to change the read-only status of the device.
+It accepts the following values:
+
+@table @var
+@item retain
+Retains the current status; this is the default.
+
+@item read-only
+Makes the device read-only.
+
+@item read-write
+Makes the device writable.
+@end table
+
@item change vnc @var{display},@var{options}
 Change the configuration of the VNC server. The valid syntax for @var{display}
 and @var{options} are described at @ref{sec_invocation}. eg
@@ -1005,6 +1019,23 @@ STEXI
@item migrate_set_parameter @var{parameter} @var{value}
@findex migrate_set_parameter
 Set the parameter @var{parameter} for migration.
+ETEXI
+
+    {
+        .name       = "migrate_start_postcopy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "Followup to a migration command to switch the migration"
+                      " to postcopy mode. The x-postcopy-ram capability must "
+                      "be set before the original migration command.",
+        .mhandler.cmd = hmp_migrate_start_postcopy,
+    },
+
+STEXI
+@item migrate_start_postcopy
+@findex migrate_start_postcopy
+Switch in-progress migration to postcopy mode. Ignored after the end of
+migration (or once already in postcopy).
 ETEXI

    {
--- a/hmp.c
+++ b/hmp.c
@@ -27,6 +27,7 @@
 #include "qapi/opts-visitor.h"
 #include "qapi/qmp/qerror.h"
 #include "qapi/string-output-visitor.h"
+#include "qapi/util.h"
 #include "qapi-visit.h"
 #include "ui/console.h"
 #include "block/qapi.h"
@@ -521,6 +522,7 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                       " flush_total_time_ns=%" PRId64
                       " rd_merged=%" PRId64
                       " wr_merged=%" PRId64
+                       " idle_time_ns=%" PRId64
                       "\n",
                       stats->value->stats->rd_bytes,
                       stats->value->stats->wr_bytes,
@@ -531,7 +533,8 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                       stats->value->stats->rd_total_time_ns,
                       stats->value->stats->flush_total_time_ns,
                       stats->value->stats->rd_merged,
-                       stats->value->stats->wr_merged);
+                       stats->value->stats->wr_merged,
+                       stats->value->stats->idle_time_ns);
    }

    qapi_free_BlockStatsList(stats_list);
@@ -569,8 +572,8 @@ void hmp_info_vnc(Monitor *mon, const QDict *qdict)
        for (client = info->clients; client; client = client->next) {
            monitor_printf(mon, "Client:\n");
            monitor_printf(mon, "     address: %s:%s\n",
-                           client->value->base->host,
-                           client->value->base->service);
+                           client->value->host,
+                           client->value->service);
            monitor_printf(mon, "  x509_dname: %s\n",
                           client->value->x509_dname ?
                           client->value->x509_dname : "none");
@@ -638,7 +641,7 @@ void hmp_info_spice(Monitor *mon, const QDict *qdict)
        for (chan = info->channels; chan; chan = chan->next) {
            monitor_printf(mon, "Channel:\n");
            monitor_printf(mon, "     address: %s:%s%s\n",
-                           chan->value->base->host, chan->value->base->port,
+                           chan->value->host, chan->value->port,
                           chan->value->tls ? " [tls]" : "");
            monitor_printf(mon, "     session: %" PRId64 "\n",
                           chan->value->connection_id);
@@ -841,11 +844,11 @@ void hmp_info_tpm(Monitor *mon, const QDict *qdict)
                       c, TpmModel_lookup[ti->model]);

        monitor_printf(mon, "  \\ %s: type=%s",
-                       ti->id, TpmTypeOptionsKind_lookup[ti->options->kind]);
+                       ti->id, TpmTypeOptionsKind_lookup[ti->options->type]);

-        switch (ti->options->kind) {
+        switch (ti->options->type) {
        case TPM_TYPE_OPTIONS_KIND_PASSTHROUGH:
-            tpo = ti->options->passthrough;
+            tpo = ti->options->u.passthrough;
            monitor_printf(mon, "%s%s%s%s",
                           tpo->has_path ? ",path=" : "",
                           tpo->has_path ? tpo->path : "",
@@ -1293,6 +1296,13 @@ void hmp_client_migrate_info(Monitor *mon, const QDict *qdict)
    hmp_handle_error(mon, &err);
 }

+void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict)
+{
+    Error *err = NULL;
+    qmp_migrate_start_postcopy(&err);
+    hmp_handle_error(mon, &err);
+}
+
 void hmp_set_password(Monitor *mon, const QDict *qdict)
 {
    const char *protocol  = qdict_get_str(qdict, "protocol");
@@ -1336,24 +1346,46 @@ void hmp_change(Monitor *mon, const QDict *qdict)
    const char *device = qdict_get_str(qdict, "device");
    const char *target = qdict_get_str(qdict, "target");
    const char *arg = qdict_get_try_str(qdict, "arg");
+    const char *read_only = qdict_get_try_str(qdict, "read-only-mode");
+    BlockdevChangeReadOnlyMode read_only_mode = 0;
    Error *err = NULL;

-    if (strcmp(device, "vnc") == 0 &&
-            (strcmp(target, "passwd") == 0 ||
-             strcmp(target, "password") == 0)) {
-        if (!arg) {
-            monitor_read_password(mon, hmp_change_read_arg, NULL);
+    if (strcmp(device, "vnc") == 0) {
+        if (read_only) {
+            monitor_printf(mon,
+                           "Parameter 'read-only-mode' is invalid for VNC\n");
+            return;
+        }
+        if (strcmp(target, "passwd") == 0 ||
+            strcmp(target, "password") == 0) {
+            if (!arg) {
+                monitor_read_password(mon, hmp_change_read_arg, NULL);
+                return;
+            }
+        }
+        qmp_change("vnc", target, !!arg, arg, &err);
+    } else {
+        if (read_only) {
+            read_only_mode =
+                qapi_enum_parse(BlockdevChangeReadOnlyMode_lookup,
+                                read_only, BLOCKDEV_CHANGE_READ_ONLY_MODE_MAX,
+                                BLOCKDEV_CHANGE_READ_ONLY_MODE_RETAIN, &err);
+            if (err) {
+                hmp_handle_error(mon, &err);
+                return;
+            }
+        }
+
+        qmp_blockdev_change_medium(device, target, !!arg, arg,
+                                   !!read_only, read_only_mode, &err);
+        if (err &&
+            error_get_class(err) == ERROR_CLASS_DEVICE_ENCRYPTED) {
+            error_free(err);
+            monitor_read_block_device_key(mon, device, NULL, NULL);
            return;
        }
    }

-    qmp_change(device, target, !!arg, arg, &err);
-    if (err &&
-        error_get_class(err) == ERROR_CLASS_DEVICE_ENCRYPTED) {
-        error_free(err);
-        monitor_read_block_device_key(mon, device, NULL, NULL);
-        return;
-    }
    hmp_handle_error(mon, &err);
 }

@@ -1735,15 +1767,15 @@ void hmp_sendkey(Monitor *mon, const QDict *qdict)
            if (*endp != '\0') {
                goto err_out;
            }
-            keylist->value->kind = KEY_VALUE_KIND_NUMBER;
-            keylist->value->number = value;
+            keylist->value->type = KEY_VALUE_KIND_NUMBER;
+            keylist->value->u.number = value;
        } else {
            int idx = index_from_key(keyname_buf);
            if (idx == Q_KEY_CODE_MAX) {
                goto err_out;
            }
-            keylist->value->kind = KEY_VALUE_KIND_QCODE;
-            keylist->value->qcode = idx;
+            keylist->value->type = KEY_VALUE_KIND_QCODE;
+            keylist->value->u.qcode = idx;
        }

        if (!separator) {
@@ -1958,12 +1990,12 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
        value = info->value;

        if (value) {
-            switch (value->kind) {
+            switch (value->type) {
            case MEMORY_DEVICE_INFO_KIND_DIMM:
-                di = value->dimm;
+                di = value->u.dimm;

                monitor_printf(mon, "Memory device [%s]: \"%s\"\n",
-                               MemoryDeviceInfoKind_lookup[value->kind],
+                               MemoryDeviceInfoKind_lookup[value->type],
                               di->id ? di->id : "");
                monitor_printf(mon, "  addr: 0x%" PRIx64 "\n", di->addr);
                monitor_printf(mon, "  slot: %" PRId64 "\n", di->slot);
--- a/hmp.h
+++ b/hmp.h
@@ -69,6 +69,7 @@ void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict);
 void hmp_client_migrate_info(Monitor *mon, const QDict *qdict);
+void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict);
 void hmp_set_password(Monitor *mon, const QDict *qdict);
 void hmp_expire_password(Monitor *mon, const QDict *qdict);
 void hmp_eject(Monitor *mon, const QDict *qdict);
--- a/hw/9pfs/codir.c
+++ b/hw/9pfs/codir.c
@@ -14,7 +14,7 @@

 #include "fsdev/qemu-fsdev.h"
 #include "qemu/thread.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "virtio-9p-coth.h"

 int v9fs_co_readdir_r(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent *dent,
--- a/hw/9pfs/cofile.c
+++ b/hw/9pfs/cofile.c
@@ -14,7 +14,7 @@

 #include "fsdev/qemu-fsdev.h"
 #include "qemu/thread.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "virtio-9p-coth.h"

 int v9fs_co_st_gen(V9fsPDU *pdu, V9fsPath *path, mode_t st_mode,
--- a/hw/9pfs/cofs.c
+++ b/hw/9pfs/cofs.c
@@ -14,7 +14,7 @@

 #include "fsdev/qemu-fsdev.h"
 #include "qemu/thread.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "virtio-9p-coth.h"

 static ssize_t __readlink(V9fsState *s, V9fsPath *path, V9fsString *buf)
--- a/hw/9pfs/coxattr.c
+++ b/hw/9pfs/coxattr.c
@@ -14,7 +14,7 @@

 #include "fsdev/qemu-fsdev.h"
 #include "qemu/thread.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "virtio-9p-coth.h"

 int v9fs_co_llistxattr(V9fsPDU *pdu, V9fsPath *path, void *value, size_t size)
--- a/hw/9pfs/virtio-9p-coth.c
+++ b/hw/9pfs/virtio-9p-coth.c
@@ -12,71 +12,30 @@
 *
 */

-#include "fsdev/qemu-fsdev.h"
-#include "qemu/thread.h"
-#include "qemu/event_notifier.h"
-#include "block/coroutine.h"
+#include "qemu-common.h"
+#include "block/thread-pool.h"
+#include "qemu/coroutine.h"
+#include "qemu/main-loop.h"
 #include "virtio-9p-coth.h"

-/* v9fs glib thread pool */
-static V9fsThPool v9fs_pool;
+/* Called from QEMU I/O thread.  */
+static void coroutine_enter_cb(void *opaque, int ret)
+{
+    Coroutine *co = opaque;
+    qemu_coroutine_enter(co, NULL);
+}
+
+/* Called from worker thread.  */
+static int coroutine_enter_func(void *arg)
+{
+    Coroutine *co = arg;
+    qemu_coroutine_enter(co, NULL);
+    return 0;
+}

 void co_run_in_worker_bh(void *opaque)
 {
    Coroutine *co = opaque;
-    g_thread_pool_push(v9fs_pool.pool, co, NULL);
-}
-
-static void v9fs_qemu_process_req_done(EventNotifier *e)
-{
-    Coroutine *co;
-
-    event_notifier_test_and_clear(e);
-
-    while ((co = g_async_queue_try_pop(v9fs_pool.completed)) != NULL) {
-        qemu_coroutine_enter(co, NULL);
-    }
-}
-
-static void v9fs_thread_routine(gpointer data, gpointer user_data)
-{
-    Coroutine *co = data;
-
-    qemu_coroutine_enter(co, NULL);
-
-    g_async_queue_push(v9fs_pool.completed, co);
-
-    event_notifier_set(&v9fs_pool.e);
-}
-
-int v9fs_init_worker_threads(void)
-{
-    int ret = 0;
-    V9fsThPool *p = &v9fs_pool;
-    sigset_t set, oldset;
-
-    sigfillset(&set);
-    /* Leave signal handling to the iothread.  */
-    pthread_sigmask(SIG_SETMASK, &set, &oldset);
-
-    p->pool = g_thread_pool_new(v9fs_thread_routine, p, -1, FALSE, NULL);
-    if (!p->pool) {
-        ret = -1;
-        goto err_out;
-    }
-    p->completed = g_async_queue_new();
-    if (!p->completed) {
-        /*
-         * We are going to terminate.
-         * So don't worry about cleanup
-         */
-        ret = -1;
-        goto err_out;
-    }
-    event_notifier_init(&p->e, 0);
-
-    event_notifier_set_handler(&p->e, v9fs_qemu_process_req_done);
-err_out:
-    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
-    return ret;
+    thread_pool_submit_aio(qemu_get_aio_context()->thread_pool,
+                           coroutine_enter_func, co, coroutine_enter_cb, co);
 }
--- a/hw/9pfs/virtio-9p-coth.h
+++ b/hw/9pfs/virtio-9p-coth.h
@@ -16,16 +16,8 @@
 #define _QEMU_VIRTIO_9P_COTH_H

 #include "qemu/thread.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
 #include "virtio-9p.h"
-#include <glib.h>
-
-typedef struct V9fsThPool {
-    EventNotifier e;
-
-    GThreadPool *pool;
-    GAsyncQueue *completed;
-} V9fsThPool;

 /*
 * we want to use bottom half because we want to make sure the below
@@ -45,7 +37,7 @@ typedef struct V9fsThPool {
        qemu_bh_schedule(co_bh);                                        \
        /*                                                              \
         * yield in qemu thread and re-enter back                       \
-         * in glib worker thread                                        \
+         * in worker thread                                             \
         */                                                             \
        qemu_coroutine_yield();                                         \
        qemu_bh_delete(co_bh);                                          \
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -43,6 +43,16 @@ static void virtio_9p_get_config(VirtIODevice *vdev, uint8_t *config)
    g_free(cfg);
 }

+static void virtio_9p_save(QEMUFile *f, void *opaque)
+{
+    virtio_save(VIRTIO_DEVICE(opaque), f);
+}
+
+static int virtio_9p_load(QEMUFile *f, void *opaque, int version_id)
+{
+    return virtio_load(VIRTIO_DEVICE(opaque), f, version_id);
+}
+
 static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
 {
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -106,10 +116,6 @@ static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
                   " and export path:%s", s->fsconf.fsdev_id, s->ctx.fs_root);
        goto out;
    }
-    if (v9fs_init_worker_threads() < 0) {
-        error_setg(errp, "worker thread initialization failed");
-        goto out;
-    }

    /*
     * Check details of export path, We need to use fs driver
@@ -130,6 +136,7 @@ static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
    }
    v9fs_path_free(&path);

+    register_savevm(dev, "virtio-9p", -1, 1, virtio_9p_save, virtio_9p_load, s);
    return;
 out:
    g_free(s->ctx.fs_root);
@@ -138,6 +145,17 @@ out:
    v9fs_path_free(&path);
 }

+static void virtio_9p_device_unrealize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    V9fsState *s = VIRTIO_9P(dev);
+
+    virtio_cleanup(vdev);
+    unregister_savevm(dev, "virtio-9p", s);
+    g_free(s->ctx.fs_root);
+    g_free(s->tag);
+}
+
 /* virtio-9p device */

 static Property virtio_9p_properties[] = {
@@ -154,6 +172,7 @@ static void virtio_9p_class_init(ObjectClass *klass, void *data)
    dc->props = virtio_9p_properties;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    vdc->realize = virtio_9p_device_realize;
+    vdc->unrealize = virtio_9p_device_unrealize;
    vdc->get_features = virtio_9p_get_features;
    vdc->get_config = virtio_9p_get_config;
 }
--- a/hw/9pfs/virtio-9p.h
+++ b/hw/9pfs/virtio-9p.h
@@ -13,7 +13,7 @@
 #include "fsdev/file-op-9p.h"
 #include "fsdev/virtio-9p-marshal.h"
 #include "qemu/thread.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"

 enum {
    P9_TLERROR = 6,
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1163,9 +1163,7 @@ void *acpi_data_push(GArray *table_data, unsigned size)

 unsigned acpi_data_len(GArray *table)
 {
-#if GLIB_CHECK_VERSION(2, 22, 0)
    assert(g_array_get_element_size(table) == 1);
-#endif
    return table->len;
 }

--- a/hw/acpi/core.c
+++ b/hw/acpi/core.c
@@ -625,8 +625,12 @@ void acpi_pm1_cnt_reset(ACPIREGS *ar)
 void acpi_gpe_init(ACPIREGS *ar, uint8_t len)
 {
    ar->gpe.len = len;
-    ar->gpe.sts = g_malloc0(len / 2);
-    ar->gpe.en = g_malloc0(len / 2);
+    /* Only first len / 2 bytes are ever used,
+     * but the caller in ich9.c migrates full len bytes.
+     * TODO: fix ich9.c and drop the extra allocation.
+     */
+    ar->gpe.sts = g_malloc0(len);
+    ar->gpe.en = g_malloc0(len);
 }

 void acpi_gpe_reset(ACPIREGS *ar)
--- a/hw/acpi/memory_hotplug.c
+++ b/hw/acpi/memory_hotplug.c
@@ -155,6 +155,7 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data,
                qapi_event_send_mem_unplug_error(dev->id,
                                                 error_get_pretty(local_err),
                                                 &error_abort);
+                error_free(local_err);
                break;
            }
            trace_mhp_acpi_pc_dimm_deleted(mem_st->selector);
@@ -238,10 +239,12 @@ void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, MemHotplugState *mem_st,

    mdev->dimm = dev;
    mdev->is_enabled = true;
-    mdev->is_inserting = true;
+    if (dev->hotplugged) {
+        mdev->is_inserting = true;

-    /* do ACPI magic */
-    acpi_send_gpe_event(ar, irq, ACPI_MEMORY_HOTPLUG_STATUS);
+        /* do ACPI magic */
+        acpi_send_gpe_event(ar, irq, ACPI_MEMORY_HOTPLUG_STATUS);
+    }
    return;
 }

--- a/hw/arm/allwinner-a10.c
+++ b/hw/arm/allwinner-a10.c
@@ -39,6 +39,9 @@ static void aw_a10_init(Object *obj)
        qemu_check_nic_model(&nd_table[0], TYPE_AW_EMAC);
        qdev_set_nic_properties(DEVICE(&s->emac), &nd_table[0]);
    }
+
+    object_initialize(&s->sata, sizeof(s->sata), TYPE_ALLWINNER_AHCI);
+    qdev_set_parent_bus(DEVICE(&s->sata), sysbus_get_default());
 }

 static void aw_a10_realize(DeviceState *dev, Error **errp)
@@ -93,6 +96,14 @@ static void aw_a10_realize(DeviceState *dev, Error **errp)
    sysbus_mmio_map(sysbusdev, 0, AW_A10_EMAC_BASE);
    sysbus_connect_irq(sysbusdev, 0, s->irq[55]);

+    object_property_set_bool(OBJECT(&s->sata), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->sata), 0, AW_A10_SATA_BASE);
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->sata), 0, s->irq[56]);
+
    /* FIXME use a qdev chardev prop instead of serial_hds[] */
    serial_mm_init(get_system_memory(), AW_A10_UART0_REG_BASE, 2, s->irq[1],
                   115200, serial_hds[0], DEVICE_NATIVE_ENDIAN);
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .4.50
 .4.94
				`@@ -0,0 +1 @@`
				`ivshmem-client-obj-y = ivshmem-client.o main.o`
				`@@ -0,0 +1 @@`
				`ivshmem-server-obj-y = ivshmem-server.o main.o`