tests/qtest: migration-test: Add tests for file-based migration

Add basic tests for file-based migration. Signed-off-by: Fabiano Rosas <farosas@suse.de>
tests/qtest: migration: Add support for negative testing of qmp_migrate
2023-06-30 16:57:29 -03:00 · 2023-06-30 16:57:29 -03:00 · 2023-06-30 16:57:25 -03:00 · 2023-06-30 16:40:16 -03:00 · 2023-06-30 16:40:15 -03:00 · 2023-06-30 16:40:15 -03:00
196 changed files with 5054 additions and 1851 deletions
--- a/.gitlab-ci.d/crossbuilds.yml
+++ b/.gitlab-ci.d/crossbuilds.yml
@@ -57,7 +57,7 @@ cross-i386-tci:
  variables:
    IMAGE: fedora-i386-cross
    ACCEL: tcg-interpreter
-    EXTRA_CONFIGURE_OPTS: --target-list=i386-softmmu,i386-linux-user,aarch64-softmmu,aarch64-linux-user,ppc-softmmu,ppc-linux-user
+    EXTRA_CONFIGURE_OPTS: --target-list=i386-softmmu,i386-linux-user,aarch64-softmmu,aarch64-linux-user,ppc-softmmu,ppc-linux-user --disable-plugins
    MAKE_CHECK_ARGS: check check-tcg

 cross-mipsel-system:
--- a/.mailmap
+++ b/.mailmap
@@ -76,9 +76,10 @@ Paul Burton <paulburton@kernel.org> <pburton@wavecomp.com>
 Philippe Mathieu-Daudé <philmd@linaro.org> <f4bug@amsat.org>
 Philippe Mathieu-Daudé <philmd@linaro.org> <philmd@redhat.com>
 Philippe Mathieu-Daudé <philmd@linaro.org> <philmd@fungible.com>
+Roman Bolshakov <rbolshakov@ddn.com> <r.bolshakov@yadro.com>
 Stefan Brankovic <stefan.brankovic@syrmia.com> <stefan.brankovic@rt-rk.com.com>
-Yongbok Kim <yongbok.kim@mips.com> <yongbok.kim@imgtec.com>
 Taylor Simpson <ltaylorsimpson@gmail.com> <tsimpson@quicinc.com>
+Yongbok Kim <yongbok.kim@mips.com> <yongbok.kim@imgtec.com>

 # Also list preferred name forms where people have changed their
 # git author config, or had utf8/latin1 encoding issues.
--- a/7
+++ b/7
@@ -498,14 +498,14 @@ F: target/arm/hvf/

 X86 HVF CPUs
 M: Cameron Esfahani <dirty@apple.com>
-M: Roman Bolshakov <r.bolshakov@yadro.com>
+M: Roman Bolshakov <rbolshakov@ddn.com>
 W: https://wiki.qemu.org/Features/HVF
 S: Maintained
 F: target/i386/hvf/

 HVF
 M: Cameron Esfahani <dirty@apple.com>
-M: Roman Bolshakov <r.bolshakov@yadro.com>
+M: Roman Bolshakov <rbolshakov@ddn.com>
 W: https://wiki.qemu.org/Features/HVF
 S: Maintained
 F: accel/hvf/
@@ -2051,7 +2051,7 @@ F: hw/usb/dev-serial.c

 VFIO
 M: Alex Williamson <alex.williamson@redhat.com>
-R: Cédric Le Goater <clg@redhat.com>
+M: Cédric Le Goater <clg@redhat.com>
 S: Supported
 F: hw/vfio/*
 F: include/hw/vfio/
@@ -3202,6 +3202,7 @@ F: docs/interop/dbus*
 F: docs/sphinx/dbus*
 F: docs/sphinx/fakedbusdoc.py
 F: tests/qtest/dbus*
+F: scripts/xml-preprocess*

 Seccomp
 M: Daniel P. Berrange <berrange@redhat.com>
--- a/accel/hvf/hvf-accel-ops.c
+++ b/accel/hvf/hvf-accel-ops.c
@@ -304,7 +304,7 @@ static void hvf_region_del(MemoryListener *listener,

 static MemoryListener hvf_memory_listener = {
    .name = "hvf",
-    .priority = 10,
+    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
    .region_add = hvf_region_add,
    .region_del = hvf_region_del,
    .log_start = hvf_log_start,
@@ -372,19 +372,19 @@ type_init(hvf_type_init);

 static void hvf_vcpu_destroy(CPUState *cpu)
 {
-    hv_return_t ret = hv_vcpu_destroy(cpu->hvf->fd);
+    hv_return_t ret = hv_vcpu_destroy(cpu->accel->fd);
    assert_hvf_ok(ret);

    hvf_arch_vcpu_destroy(cpu);
-    g_free(cpu->hvf);
-    cpu->hvf = NULL;
+    g_free(cpu->accel);
+    cpu->accel = NULL;
 }

 static int hvf_init_vcpu(CPUState *cpu)
 {
    int r;

-    cpu->hvf = g_malloc0(sizeof(*cpu->hvf));
+    cpu->accel = g_new0(AccelCPUState, 1);

    /* init cpu signals */
    struct sigaction sigact;
@@ -393,18 +393,19 @@ static int hvf_init_vcpu(CPUState *cpu)
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

-    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
-    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
+    pthread_sigmask(SIG_BLOCK, NULL, &cpu->accel->unblock_ipi_mask);
+    sigdelset(&cpu->accel->unblock_ipi_mask, SIG_IPI);

 #ifdef __aarch64__
-    r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
+    r = hv_vcpu_create(&cpu->accel->fd,
+                       (hv_vcpu_exit_t **)&cpu->accel->exit, NULL);
 #else
-    r = hv_vcpu_create((hv_vcpuid_t *)&cpu->hvf->fd, HV_VCPU_DEFAULT);
+    r = hv_vcpu_create((hv_vcpuid_t *)&cpu->accel->fd, HV_VCPU_DEFAULT);
 #endif
    cpu->vcpu_dirty = 1;
    assert_hvf_ok(r);

-    cpu->hvf->guest_debug_enabled = false;
+    cpu->accel->guest_debug_enabled = false;

    return hvf_arch_init_vcpu(cpu);
 }
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1105,6 +1105,7 @@ static MemoryListener kvm_coalesced_pio_listener = {
    .name = "kvm-coalesced-pio",
    .coalesced_io_add = kvm_coalesce_pio_add,
    .coalesced_io_del = kvm_coalesce_pio_del,
+    .priority = MEMORY_LISTENER_PRIORITY_MIN,
 };

 int kvm_check_extension(KVMState *s, unsigned int extension)
@@ -1777,7 +1778,7 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
    kml->listener.commit = kvm_region_commit;
    kml->listener.log_start = kvm_log_start;
    kml->listener.log_stop = kvm_log_stop;
-    kml->listener.priority = 10;
+    kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
    kml->listener.name = name;

    if (s->kvm_dirty_ring_size) {
@@ -1802,7 +1803,7 @@ static MemoryListener kvm_io_listener = {
    .name = "kvm-io",
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
-    .priority = 10,
+    .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
 };

 int kvm_set_irq(KVMState *s, int irq, int level)
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -27,6 +27,7 @@ bool kvm_allowed;
 bool kvm_readonly_mem_allowed;
 bool kvm_ioeventfd_any_length_allowed;
 bool kvm_msi_use_devid;
+bool kvm_direct_msi_allowed;

 void kvm_flush_coalesced_mmio_buffer(void)
 {
--- a/accel/tcg/tcg-accel-ops-mttcg.c
+++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -152,8 +152,4 @@ void mttcg_start_vcpu_thread(CPUState *cpu)

    qemu_thread_create(cpu->thread, thread_name, mttcg_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
-
-#ifdef _WIN32
-    cpu->hThread = qemu_thread_get_handle(cpu->thread);
-#endif
 }
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -329,9 +329,6 @@ void rr_start_vcpu_thread(CPUState *cpu)

        single_tcg_halt_cond = cpu->halt_cond;
        single_tcg_cpu_thread = cpu->thread;
-#ifdef _WIN32
-        cpu->hThread = qemu_thread_get_handle(cpu->thread);
-#endif
    } else {
        /* we share the thread */
        cpu->thread = single_tcg_cpu_thread;
--- a/audio/dbusaudio.c
+++ b/audio/dbusaudio.c
@@ -29,7 +29,11 @@
 #include "qemu/timer.h"
 #include "qemu/dbus.h"

+#ifdef G_OS_UNIX
 #include <gio/gunixfdlist.h>
+#endif
+
+#include "ui/dbus.h"
 #include "ui/dbus-display1.h"

 #define AUDIO_CAP "dbus"
@@ -444,7 +448,9 @@ listener_in_vanished_cb(GDBusConnection *connection,
 static gboolean
 dbus_audio_register_listener(AudioState *s,
                             GDBusMethodInvocation *invocation,
+#ifdef G_OS_UNIX
                             GUnixFDList *fd_list,
+#endif
                             GVariant *arg_listener,
                             bool out)
 {
@@ -471,6 +477,11 @@ dbus_audio_register_listener(AudioState *s,
        return DBUS_METHOD_INVOCATION_HANDLED;
    }

+#ifdef G_OS_WIN32
+    if (!dbus_win32_import_socket(invocation, arg_listener, &fd)) {
+        return DBUS_METHOD_INVOCATION_HANDLED;
+    }
+#else
    fd = g_unix_fd_list_get(fd_list, g_variant_get_handle(arg_listener), &err);
    if (err) {
        g_dbus_method_invocation_return_error(invocation,
@@ -480,6 +491,7 @@ dbus_audio_register_listener(AudioState *s,
                                              err->message);
        return DBUS_METHOD_INVOCATION_HANDLED;
    }
+#endif

    socket = g_socket_new_from_fd(fd, &err);
    if (err) {
@@ -488,15 +500,28 @@ dbus_audio_register_listener(AudioState *s,
                                              DBUS_DISPLAY_ERROR_FAILED,
                                              "Couldn't make a socket: %s",
                                              err->message);
+#ifdef G_OS_WIN32
+        closesocket(fd);
+#else
+        close(fd);
+#endif
        return DBUS_METHOD_INVOCATION_HANDLED;
    }
    socket_conn = g_socket_connection_factory_create_connection(socket);
    if (out) {
        qemu_dbus_display1_audio_complete_register_out_listener(
-            da->iface, invocation, NULL);
+            da->iface, invocation
+#ifdef G_OS_UNIX
+            , NULL
+#endif
+            );
    } else {
        qemu_dbus_display1_audio_complete_register_in_listener(
-            da->iface, invocation, NULL);
+            da->iface, invocation
+#ifdef G_OS_UNIX
+            , NULL
+#endif
+            );
    }

    listener_conn =
@@ -574,22 +599,32 @@ dbus_audio_register_listener(AudioState *s,
 static gboolean
 dbus_audio_register_out_listener(AudioState *s,
                                 GDBusMethodInvocation *invocation,
+#ifdef G_OS_UNIX
                                 GUnixFDList *fd_list,
+#endif
                                 GVariant *arg_listener)
 {
    return dbus_audio_register_listener(s, invocation,
-                                        fd_list, arg_listener, true);
+#ifdef G_OS_UNIX
+                                        fd_list,
+#endif
+                                        arg_listener, true);

 }

 static gboolean
 dbus_audio_register_in_listener(AudioState *s,
                                GDBusMethodInvocation *invocation,
+#ifdef G_OS_UNIX
                                GUnixFDList *fd_list,
+#endif
                                GVariant *arg_listener)
 {
    return dbus_audio_register_listener(s, invocation,
-                                        fd_list, arg_listener, false);
+#ifdef G_OS_UNIX
+                                        fd_list,
+#endif
+                                        arg_listener, false);
 }

 static void
--- a/block.c
+++ b/block.c
@@ -555,8 +555,9 @@ int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename,
 * On success, return @blk's actual length.
 * Otherwise, return -errno.
 */
-static int64_t create_file_fallback_truncate(BlockBackend *blk,
-                                             int64_t minimum_size, Error **errp)
+static int64_t coroutine_fn GRAPH_UNLOCKED
+create_file_fallback_truncate(BlockBackend *blk, int64_t minimum_size,
+                              Error **errp)
 {
    Error *local_err = NULL;
    int64_t size;
@@ -564,14 +565,14 @@ static int64_t create_file_fallback_truncate(BlockBackend *blk,

    GLOBAL_STATE_CODE();

-    ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
-                       &local_err);
+    ret = blk_co_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
+                          &local_err);
    if (ret < 0 && ret != -ENOTSUP) {
        error_propagate(errp, local_err);
        return ret;
    }

-    size = blk_getlength(blk);
+    size = blk_co_getlength(blk);
    if (size < 0) {
        error_free(local_err);
        error_setg_errno(errp, -size,
@@ -2854,7 +2855,7 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
 * Replaces the node that a BdrvChild points to without updating permissions.
 *
 * If @new_bs is non-NULL, the parent of @child must already be drained through
- * @child.
+ * @child and the caller must hold the AioContext lock for @new_bs.
 */
 static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs)
@@ -2893,7 +2894,7 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
    }

    /* TODO Pull this up into the callers to avoid polling here */
-    bdrv_graph_wrlock();
+    bdrv_graph_wrlock(new_bs);
    if (old_bs) {
        if (child->klass->detach) {
            child->klass->detach(child);
@@ -2989,6 +2990,10 @@ static TransactionActionDrv bdrv_attach_child_common_drv = {
 * Function doesn't update permissions, caller is responsible for this.
 *
 * Returns new created child.
+ *
+ * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
+ * @child_bs can move to a different AioContext in this function. Callers must
+ * make sure that their AioContext locking is still correct after this.
 */
 static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
                                           const char *child_name,
@@ -2999,7 +3004,7 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
                                           Transaction *tran, Error **errp)
 {
    BdrvChild *new_child;
-    AioContext *parent_ctx;
+    AioContext *parent_ctx, *new_child_ctx;
    AioContext *child_ctx = bdrv_get_aio_context(child_bs);

    assert(child_class->get_parent_desc);
@@ -3050,6 +3055,12 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
        }
    }

+    new_child_ctx = bdrv_get_aio_context(child_bs);
+    if (new_child_ctx != child_ctx) {
+        aio_context_release(child_ctx);
+        aio_context_acquire(new_child_ctx);
+    }
+
    bdrv_ref(child_bs);
    /*
     * Let every new BdrvChild start with a drained parent. Inserting the child
@@ -3079,11 +3090,20 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
    };
    tran_add(tran, &bdrv_attach_child_common_drv, s);

+    if (new_child_ctx != child_ctx) {
+        aio_context_release(new_child_ctx);
+        aio_context_acquire(child_ctx);
+    }
+
    return new_child;
 }

 /*
 * Function doesn't update permissions, caller is responsible for this.
+ *
+ * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
+ * @child_bs can move to a different AioContext in this function. Callers must
+ * make sure that their AioContext locking is still correct after this.
 */
 static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs,
                                           BlockDriverState *child_bs,
@@ -3347,6 +3367,10 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
 * callers which don't need their own reference any more must call bdrv_unref().
 *
 * Function doesn't update permissions, caller is responsible for this.
+ *
+ * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
+ * @child_bs can move to a different AioContext in this function. Callers must
+ * make sure that their AioContext locking is still correct after this.
 */
 static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
                                           BlockDriverState *child_bs,
@@ -3435,6 +3459,11 @@ out:
    return 0;
 }

+/*
+ * The caller must hold the AioContext lock for @backing_hd. Both @bs and
+ * @backing_hd can move to a different AioContext in this function. Callers must
+ * make sure that their AioContext locking is still correct after this.
+ */
 static int bdrv_set_backing_noperm(BlockDriverState *bs,
                                   BlockDriverState *backing_hd,
                                   Transaction *tran, Error **errp)
@@ -3498,6 +3527,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
    int ret = 0;
    bool implicit_backing = false;
    BlockDriverState *backing_hd;
+    AioContext *backing_hd_ctx;
    QDict *options;
    QDict *tmp_parent_options = NULL;
    Error *local_err = NULL;
@@ -3582,8 +3612,12 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,

    /* Hook up the backing file link; drop our reference, bs owns the
     * backing_hd reference now */
+    backing_hd_ctx = bdrv_get_aio_context(backing_hd);
+    aio_context_acquire(backing_hd_ctx);
    ret = bdrv_set_backing_hd(bs, backing_hd, errp);
    bdrv_unref(backing_hd);
+    aio_context_release(backing_hd_ctx);
+
    if (ret < 0) {
        goto free_exit;
    }
@@ -3654,6 +3688,7 @@ done:
 *
 * The BlockdevRef will be removed from the options QDict.
 *
+ * The caller must hold the lock of the main AioContext and no other AioContext.
 * @parent can move to a different AioContext in this function. Callers must
 * make sure that their AioContext locking is still correct after this.
 */
@@ -3665,6 +3700,8 @@ BdrvChild *bdrv_open_child(const char *filename,
                           bool allow_none, Error **errp)
 {
    BlockDriverState *bs;
+    BdrvChild *child;
+    AioContext *ctx;

    GLOBAL_STATE_CODE();

@@ -3674,13 +3711,19 @@ BdrvChild *bdrv_open_child(const char *filename,
        return NULL;
    }

-    return bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
-                             errp);
+    ctx = bdrv_get_aio_context(bs);
+    aio_context_acquire(ctx);
+    child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
+                              errp);
+    aio_context_release(ctx);
+
+    return child;
 }

 /*
 * Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
 *
+ * The caller must hold the lock of the main AioContext and no other AioContext.
 * @parent can move to a different AioContext in this function. Callers must
 * make sure that their AioContext locking is still correct after this.
 */
@@ -3757,6 +3800,7 @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
    int64_t total_size;
    QemuOpts *opts = NULL;
    BlockDriverState *bs_snapshot = NULL;
+    AioContext *ctx = bdrv_get_aio_context(bs);
    int ret;

    GLOBAL_STATE_CODE();
@@ -3765,7 +3809,10 @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
       instead of opening 'filename' directly */

    /* Get the required size from the image */
+    aio_context_acquire(ctx);
    total_size = bdrv_getlength(bs);
+    aio_context_release(ctx);
+
    if (total_size < 0) {
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
@@ -3799,7 +3846,10 @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
        goto out;
    }

+    aio_context_acquire(ctx);
    ret = bdrv_append(bs_snapshot, bs, errp);
+    aio_context_release(ctx);
+
    if (ret < 0) {
        bs_snapshot = NULL;
        goto out;
@@ -3843,6 +3893,7 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
    Error *local_err = NULL;
    QDict *snapshot_options = NULL;
    int snapshot_flags = 0;
+    AioContext *ctx = qemu_get_aio_context();

    assert(!child_class || !flags);
    assert(!child_class == !parent);
@@ -3980,9 +4031,13 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
            /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
             * looking at the header to guess the image format. This works even
             * in cases where a guest would not see a consistent state. */
-            file = blk_new(bdrv_get_aio_context(file_bs), 0, BLK_PERM_ALL);
+            ctx = bdrv_get_aio_context(file_bs);
+            aio_context_acquire(ctx);
+            file = blk_new(ctx, 0, BLK_PERM_ALL);
            blk_insert_bs(file, file_bs, &local_err);
            bdrv_unref(file_bs);
+            aio_context_release(ctx);
+
            if (local_err) {
                goto fail;
            }
@@ -4028,8 +4083,13 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
        goto fail;
    }

+    /* The AioContext could have changed during bdrv_open_common() */
+    ctx = bdrv_get_aio_context(bs);
+
    if (file) {
+        aio_context_acquire(ctx);
        blk_unref(file);
+        aio_context_release(ctx);
        file = NULL;
    }

@@ -4087,13 +4147,16 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
         * (snapshot_bs); thus, we have to drop the strong reference to bs
         * (which we obtained by calling bdrv_new()). bs will not be deleted,
         * though, because the overlay still has a reference to it. */
+        aio_context_acquire(ctx);
        bdrv_unref(bs);
+        aio_context_release(ctx);
        bs = snapshot_bs;
    }

    return bs;

 fail:
+    aio_context_acquire(ctx);
    blk_unref(file);
    qobject_unref(snapshot_options);
    qobject_unref(bs->explicit_options);
@@ -4102,11 +4165,14 @@ fail:
    bs->options = NULL;
    bs->explicit_options = NULL;
    bdrv_unref(bs);
+    aio_context_release(ctx);
    error_propagate(errp, local_err);
    return NULL;

 close_and_fail:
+    aio_context_acquire(ctx);
    bdrv_unref(bs);
+    aio_context_release(ctx);
    qobject_unref(snapshot_options);
    qobject_unref(options);
    error_propagate(errp, local_err);
@@ -4578,6 +4644,11 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
 * backing BlockDriverState (or NULL).
 *
 * Return 0 on success, otherwise return < 0 and set @errp.
+ *
+ * The caller must hold the AioContext lock of @reopen_state->bs.
+ * @reopen_state->bs can move to a different AioContext in this function.
+ * Callers must make sure that their AioContext locking is still correct after
+ * this.
 */
 static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
                                             bool is_backing, Transaction *tran,
@@ -4590,6 +4661,8 @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
    const char *child_name = is_backing ? "backing" : "file";
    QObject *value;
    const char *str;
+    AioContext *ctx, *old_ctx;
+    int ret;

    GLOBAL_STATE_CODE();

@@ -4654,8 +4727,22 @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
        reopen_state->old_file_bs = old_child_bs;
    }

-    return bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
-                                           tran, errp);
+    old_ctx = bdrv_get_aio_context(bs);
+    ctx = bdrv_get_aio_context(new_child_bs);
+    if (old_ctx != ctx) {
+        aio_context_release(old_ctx);
+        aio_context_acquire(ctx);
+    }
+
+    ret = bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
+                                          tran, errp);
+
+    if (old_ctx != ctx) {
+        aio_context_release(ctx);
+        aio_context_acquire(old_ctx);
+    }
+
+    return ret;
 }

 /*
@@ -4674,6 +4761,7 @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
+ * The caller must hold the AioContext lock of @reopen_state->bs.
 */
 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue,
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -203,7 +203,8 @@ static void bochs_refresh_limits(BlockDriverState *bs, Error **errp)
    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
 }

-static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+static int64_t coroutine_fn GRAPH_RDLOCK
+seek_to_sector(BlockDriverState *bs, int64_t sector_num)
 {
    BDRVBochsState *s = bs->opaque;
    uint64_t offset = sector_num * 512;
@@ -224,8 +225,8 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
        (s->extent_blocks + s->bitmap_blocks));

    /* read in bitmap for current extent */
-    ret = bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), 1,
-                     &bitmap_entry, 0);
+    ret = bdrv_co_pread(bs->file, bitmap_offset + (extent_offset / 8), 1,
+                        &bitmap_entry, 0);
    if (ret < 0) {
        return ret;
    }
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -212,7 +212,8 @@ static void cloop_refresh_limits(BlockDriverState *bs, Error **errp)
    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
 }

-static inline int cloop_read_block(BlockDriverState *bs, int block_num)
+static int coroutine_fn GRAPH_RDLOCK
+cloop_read_block(BlockDriverState *bs, int block_num)
 {
    BDRVCloopState *s = bs->opaque;

@@ -220,8 +221,8 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
        int ret;
        uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num];

-        ret = bdrv_pread(bs->file, s->offsets[block_num], bytes,
-                         s->compressed_block, 0);
+        ret = bdrv_co_pread(bs->file, s->offsets[block_num], bytes,
+                            s->compressed_block, 0);
        if (ret < 0) {
            return -1;
        }
@@ -244,7 +245,7 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
    return 0;
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 cloop_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -616,7 +616,8 @@ err:
    return s->n_chunks; /* error */
 }

-static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
+static int coroutine_fn GRAPH_RDLOCK
+dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
 {
    BDRVDMGState *s = bs->opaque;

@@ -633,8 +634,8 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
        case UDZO: { /* zlib compressed */
            /* we need to buffer, because only the chunk as whole can be
             * inflated. */
-            ret = bdrv_pread(bs->file, s->offsets[chunk], s->lengths[chunk],
-                             s->compressed_chunk, 0);
+            ret = bdrv_co_pread(bs->file, s->offsets[chunk], s->lengths[chunk],
+                                s->compressed_chunk, 0);
            if (ret < 0) {
                return -1;
            }
@@ -659,8 +660,8 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
            }
            /* we need to buffer, because only the chunk as whole can be
             * inflated. */
-            ret = bdrv_pread(bs->file, s->offsets[chunk], s->lengths[chunk],
-                             s->compressed_chunk, 0);
+            ret = bdrv_co_pread(bs->file, s->offsets[chunk], s->lengths[chunk],
+                                s->compressed_chunk, 0);
            if (ret < 0) {
                return -1;
            }
@@ -680,8 +681,8 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
            }
            /* we need to buffer, because only the chunk as whole can be
             * inflated. */
-            ret = bdrv_pread(bs->file, s->offsets[chunk], s->lengths[chunk],
-                             s->compressed_chunk, 0);
+            ret = bdrv_co_pread(bs->file, s->offsets[chunk], s->lengths[chunk],
+                                s->compressed_chunk, 0);
            if (ret < 0) {
                return -1;
            }
@@ -696,8 +697,8 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
            }
            break;
        case UDRW: /* copy */
-            ret = bdrv_pread(bs->file, s->offsets[chunk], s->lengths[chunk],
-                             s->uncompressed_chunk, 0);
+            ret = bdrv_co_pread(bs->file, s->offsets[chunk], s->lengths[chunk],
+                                s->uncompressed_chunk, 0);
            if (ret < 0) {
                return -1;
            }
@@ -713,7 +714,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
    return 0;
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 dmg_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
              QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -193,7 +193,7 @@ static int fd_open(BlockDriverState *bs)
    return -EIO;
 }

-static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs);
+static int64_t raw_getlength(BlockDriverState *bs);

 typedef struct RawPosixAIOData {
    BlockDriverState *bs;
@@ -1974,7 +1974,7 @@ static int handle_aiocb_write_zeroes(void *opaque)
 #ifdef CONFIG_FALLOCATE
    /* Last resort: we are trying to extend the file with zeroed data. This
     * can be done via fallocate(fd, 0) */
-    len = raw_co_getlength(aiocb->bs);
+    len = raw_getlength(aiocb->bs);
    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
        if (ret == 0 || ret != -ENOTSUP) {
@@ -2666,7 +2666,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
    }

    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
-        int64_t cur_length = raw_co_getlength(bs);
+        int64_t cur_length = raw_getlength(bs);

        if (offset != cur_length && exact) {
            error_setg(errp, "Cannot resize device files");
@@ -2684,7 +2684,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
 }

 #ifdef __OpenBSD__
-static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
+static int64_t raw_getlength(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
@@ -2703,7 +2703,7 @@ static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
        return st.st_size;
 }
 #elif defined(__NetBSD__)
-static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
+static int64_t raw_getlength(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
@@ -2728,7 +2728,7 @@ static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
        return st.st_size;
 }
 #elif defined(__sun__)
-static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
+static int64_t raw_getlength(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
    struct dk_minfo minfo;
@@ -2759,7 +2759,7 @@ static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
    return size;
 }
 #elif defined(CONFIG_BSD)
-static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
+static int64_t raw_getlength(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
@@ -2831,7 +2831,7 @@ again:
    return size;
 }
 #else
-static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
+static int64_t raw_getlength(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
    int ret;
@@ -2850,6 +2850,11 @@ static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
 }
 #endif

+static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
+{
+    return raw_getlength(bs);
+}
+
 static int64_t coroutine_fn raw_co_get_allocated_file_size(BlockDriverState *bs)
 {
    struct stat st;
@@ -3215,7 +3220,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
         * round up if necessary.
         */
        if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
-            int64_t file_length = raw_co_getlength(bs);
+            int64_t file_length = raw_getlength(bs);
            if (file_length > 0) {
                /* Ignore errors, this is just a safeguard */
                assert(hole == file_length);
@@ -3237,7 +3242,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,

 #if defined(__linux__)
 /* Verify that the file is not in the page cache */
-static void coroutine_fn check_cache_dropped(BlockDriverState *bs, Error **errp)
+static void check_cache_dropped(BlockDriverState *bs, Error **errp)
 {
    const size_t window_size = 128 * 1024 * 1024;
    BDRVRawState *s = bs->opaque;
@@ -3252,7 +3257,7 @@ static void coroutine_fn check_cache_dropped(BlockDriverState *bs, Error **errp)
    page_size = sysconf(_SC_PAGESIZE);
    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));

-    end = raw_co_getlength(bs);
+    end = raw_getlength(bs);

    for (offset = 0; offset < end; offset += window_size) {
        void *new_window;
@@ -4468,7 +4473,7 @@ static int cdrom_reopen(BlockDriverState *bs)

 static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs)
 {
-    return raw_co_getlength(bs) > 0;
+    return raw_getlength(bs) > 0;
 }

 static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag)
--- a/block/graph-lock.c
+++ b/block/graph-lock.c
@@ -30,10 +30,8 @@ BdrvGraphLock graph_lock;
 /* Protects the list of aiocontext and orphaned_reader_count */
 static QemuMutex aio_context_list_lock;

-#if 0
 /* Written and read with atomic operations. */
 static int has_writer;
-#endif

 /*
 * A reader coroutine could move from an AioContext to another.
@@ -90,7 +88,6 @@ void unregister_aiocontext(AioContext *ctx)
    g_free(ctx->bdrv_graph);
 }

-#if 0
 static uint32_t reader_count(void)
 {
    BdrvGraphRWlock *brdv_graph;
@@ -108,19 +105,27 @@ static uint32_t reader_count(void)
    assert((int32_t)rd >= 0);
    return rd;
 }
-#endif

-void bdrv_graph_wrlock(void)
+void bdrv_graph_wrlock(BlockDriverState *bs)
 {
+    AioContext *ctx = NULL;
+
    GLOBAL_STATE_CODE();
-    /*
-     * TODO Some callers hold an AioContext lock when this is called, which
-     * causes deadlocks. Reenable once the AioContext locking is cleaned up (or
-     * AioContext locks are gone).
-     */
-#if 0
    assert(!qatomic_read(&has_writer));

+    /*
+     * Release only non-mainloop AioContext. The mainloop often relies on the
+     * BQL and doesn't lock the main AioContext before doing things.
+     */
+    if (bs) {
+        ctx = bdrv_get_aio_context(bs);
+        if (ctx != qemu_get_aio_context()) {
+            aio_context_release(ctx);
+        } else {
+            ctx = NULL;
+        }
+    }
+
    /* Make sure that constantly arriving new I/O doesn't cause starvation */
    bdrv_drain_all_begin_nopoll();

@@ -149,13 +154,15 @@ void bdrv_graph_wrlock(void)
    } while (reader_count() >= 1);

    bdrv_drain_all_end();
-#endif
+
+    if (ctx) {
+        aio_context_acquire(bdrv_get_aio_context(bs));
+    }
 }

 void bdrv_graph_wrunlock(void)
 {
    GLOBAL_STATE_CODE();
-#if 0
    QEMU_LOCK_GUARD(&aio_context_list_lock);
    assert(qatomic_read(&has_writer));

@@ -167,13 +174,10 @@ void bdrv_graph_wrunlock(void)

    /* Wake up all coroutine that are waiting to read the graph */
    qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
-#endif
 }

 void coroutine_fn bdrv_graph_co_rdlock(void)
 {
-    /* TODO Reenable when wrlock is reenabled */
-#if 0
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

@@ -233,12 +237,10 @@ void coroutine_fn bdrv_graph_co_rdlock(void)
            qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
        }
    }
-#endif
 }

 void coroutine_fn bdrv_graph_co_rdunlock(void)
 {
-#if 0
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

@@ -256,7 +258,6 @@ void coroutine_fn bdrv_graph_co_rdunlock(void)
    if (qatomic_read(&has_writer)) {
        aio_wait_kick();
    }
-#endif
 }

 void bdrv_graph_rdlock_main_loop(void)
@@ -274,19 +275,13 @@ void bdrv_graph_rdunlock_main_loop(void)
 void assert_bdrv_graph_readable(void)
 {
    /* reader_count() is slow due to aio_context_list_lock lock contention */
-    /* TODO Reenable when wrlock is reenabled */
-#if 0
 #ifdef CONFIG_DEBUG_GRAPH_LOCK
    assert(qemu_in_main_thread() || reader_count());
 #endif
-#endif
 }

 void assert_bdrv_graph_writable(void)
 {
    assert(qemu_in_main_thread());
-    /* TODO Reenable when wrlock is reenabled */
-#if 0
    assert(qatomic_read(&has_writer));
-#endif
 }
--- a/block/io.c
+++ b/block/io.c
@@ -1379,7 +1379,7 @@ bdrv_aligned_preadv(BdrvChild *child, BdrvTrackedRequest *req,
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
-    total_bytes = bdrv_getlength(bs);
+    total_bytes = bdrv_co_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
@@ -2388,7 +2388,7 @@ bdrv_co_block_status(BlockDriverState *bs, bool want_zero,
    assert(pnum);
    assert_bdrv_graph_readable();
    *pnum = 0;
-    total_size = bdrv_getlength(bs);
+    total_size = bdrv_co_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
@@ -2408,7 +2408,7 @@ bdrv_co_block_status(BlockDriverState *bs, bool want_zero,
        bytes = n;
    }

-    /* Must be non-NULL or bdrv_getlength() would have failed */
+    /* Must be non-NULL or bdrv_co_getlength() would have failed */
    assert(bs->drv);
    has_filtered_child = bdrv_filter_child(bs);
    if (!bs->drv->bdrv_co_block_status && !has_filtered_child) {
@@ -2546,7 +2546,7 @@ bdrv_co_block_status(BlockDriverState *bs, bool want_zero,
        if (!cow_bs) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (want_zero) {
-            int64_t size2 = bdrv_getlength(cow_bs);
+            int64_t size2 = bdrv_co_getlength(cow_bs);

            if (size2 >= 0 && offset >= size2) {
                ret |= BDRV_BLOCK_ZERO;
@@ -3011,7 +3011,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
    }

    /* Write back cached data to the OS even with cache=unsafe */
-    BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS);
+    BLKDBG_CO_EVENT(primary_child, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
@@ -3029,7 +3029,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
        goto flush_children;
    }

-    BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK);
+    BLKDBG_CO_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
@@ -3592,7 +3592,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
        return ret;
    }

-    old_size = bdrv_getlength(bs);
+    old_size = bdrv_co_getlength(bs);
    if (old_size < 0) {
        error_setg_errno(errp, -old_size, "Failed to get old image size");
        return old_size;
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -200,7 +200,7 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
    assert(idx < s->bat_size && idx + to_allocate <= s->bat_size);

    space = to_allocate * s->tracks;
-    len = bdrv_getlength(bs->file->bs);
+    len = bdrv_co_getlength(bs->file->bs);
    if (len < 0) {
        return len;
    }
@@ -448,7 +448,7 @@ parallels_check_outside_image(BlockDriverState *bs, BdrvCheckResult *res,
    uint32_t i;
    int64_t off, high_off, size;

-    size = bdrv_getlength(bs->file->bs);
+    size = bdrv_co_getlength(bs->file->bs);
    if (size < 0) {
        res->check_errors++;
        return size;
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -370,7 +370,7 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
        if (!allocate)
            return 0;
        /* allocate a new l2 entry */
-        l2_offset = bdrv_getlength(bs->file->bs);
+        l2_offset = bdrv_co_getlength(bs->file->bs);
        if (l2_offset < 0) {
            return l2_offset;
        }
@@ -379,7 +379,7 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
        /* update the L1 entry */
        s->l1_table[l1_index] = l2_offset;
        tmp = cpu_to_be64(l2_offset);
-        BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_UPDATE);
        ret = bdrv_co_pwrite_sync(bs->file,
                                  s->l1_table_offset + l1_index * sizeof(tmp),
                                  sizeof(tmp), &tmp, 0);
@@ -410,7 +410,7 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
        }
    }
    l2_table = s->l2_cache + (min_index << s->l2_bits);
-    BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_LOAD);
    if (new_l2_table) {
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
        ret = bdrv_co_pwrite_sync(bs->file, l2_offset,
@@ -434,7 +434,7 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
        if (!allocate)
            return 0;
-        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
        assert(QEMU_IS_ALIGNED(n_start | n_end, BDRV_SECTOR_SIZE));
        /* allocate a new cluster */
        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
@@ -445,20 +445,20 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
            if (decompress_cluster(bs, cluster_offset) < 0) {
                return -EIO;
            }
-            cluster_offset = bdrv_getlength(bs->file->bs);
+            cluster_offset = bdrv_co_getlength(bs->file->bs);
            if ((int64_t) cluster_offset < 0) {
                return cluster_offset;
            }
            cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size);
            /* write the cluster content */
-            BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+            BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
            ret = bdrv_co_pwrite(bs->file, cluster_offset, s->cluster_size,
                                 s->cluster_cache, 0);
            if (ret < 0) {
                return ret;
            }
        } else {
-            cluster_offset = bdrv_getlength(bs->file->bs);
+            cluster_offset = bdrv_co_getlength(bs->file->bs);
            if ((int64_t) cluster_offset < 0) {
                return cluster_offset;
            }
@@ -491,7 +491,7 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
                                                      NULL) < 0) {
                                return -EIO;
                            }
-                            BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+                            BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
                            ret = bdrv_co_pwrite(bs->file, cluster_offset + i,
                                                 BDRV_SECTOR_SIZE,
                                                 s->cluster_data, 0);
@@ -510,9 +510,9 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
        tmp = cpu_to_be64(cluster_offset);
        l2_table[l2_index] = tmp;
        if (allocate == 2) {
-            BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
+            BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
        } else {
-            BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
+            BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE);
        }
        ret = bdrv_co_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
                                  sizeof(tmp), &tmp, 0);
@@ -595,7 +595,7 @@ decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
    if (s->cluster_cache_offset != coffset) {
        csize = cluster_offset >> (63 - s->cluster_bits);
        csize &= (s->cluster_size - 1);
-        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_co_pread(bs->file, coffset, csize, s->cluster_data, 0);
        if (ret < 0)
            return -1;
@@ -657,7 +657,7 @@ qcow_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                /* read from the base image */
                qemu_co_mutex_unlock(&s->lock);
                /* qcow2 emits this on bs->file instead of bs->backing */
-                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+                BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                ret = bdrv_co_pread(bs->backing, offset, n, buf, 0);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
@@ -680,7 +680,7 @@ qcow_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                break;
            }
            qemu_co_mutex_unlock(&s->lock);
-            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+            BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
            ret = bdrv_co_pread(bs->file, cluster_offset + offset_in_cluster,
                                n, buf, 0);
            qemu_co_mutex_lock(&s->lock);
@@ -765,7 +765,7 @@ qcow_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
        }

        qemu_co_mutex_unlock(&s->lock);
-        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
        ret = bdrv_co_pwrite(bs->file, cluster_offset + offset_in_cluster,
                             n, buf, 0);
        qemu_co_mutex_lock(&s->lock);
@@ -1114,7 +1114,7 @@ qcow_co_pwritev_compressed(BlockDriverState *bs, int64_t offset, int64_t bytes,
    }
    cluster_offset &= s->cluster_offset_mask;

-    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
    ret = bdrv_co_pwrite(bs->file, cluster_offset, out_len, out_buf, 0);
    if (ret < 0) {
        goto fail;
--- a/block/qcow2-bitmap.c
+++ b/block/qcow2-bitmap.c
@@ -283,10 +283,9 @@ static int free_bitmap_clusters(BlockDriverState *bs, Qcow2BitmapTable *tb)
 /* load_bitmap_data
 * @bitmap_table entries must satisfy specification constraints.
 * @bitmap must be cleared */
-static int load_bitmap_data(BlockDriverState *bs,
-                            const uint64_t *bitmap_table,
-                            uint32_t bitmap_table_size,
-                            BdrvDirtyBitmap *bitmap)
+static int coroutine_fn GRAPH_RDLOCK
+load_bitmap_data(BlockDriverState *bs, const uint64_t *bitmap_table,
+                 uint32_t bitmap_table_size, BdrvDirtyBitmap *bitmap)
 {
    int ret = 0;
    BDRVQcow2State *s = bs->opaque;
@@ -319,7 +318,7 @@ static int load_bitmap_data(BlockDriverState *bs,
                 * already cleared */
            }
        } else {
-            ret = bdrv_pread(bs->file, data_offset, s->cluster_size, buf, 0);
+            ret = bdrv_co_pread(bs->file, data_offset, s->cluster_size, buf, 0);
            if (ret < 0) {
                goto finish;
            }
@@ -337,8 +336,9 @@ finish:
    return ret;
 }

-static BdrvDirtyBitmap *load_bitmap(BlockDriverState *bs,
-                                    Qcow2Bitmap *bm, Error **errp)
+static coroutine_fn GRAPH_RDLOCK
+BdrvDirtyBitmap *load_bitmap(BlockDriverState *bs,
+                             Qcow2Bitmap *bm, Error **errp)
 {
    int ret;
    uint64_t *bitmap_table = NULL;
@@ -649,9 +649,10 @@ fail:
    return NULL;
 }

-int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                                  void **refcount_table,
-                                  int64_t *refcount_table_size)
+int coroutine_fn
+qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+                              void **refcount_table,
+                              int64_t *refcount_table_size)
 {
    int ret;
    BDRVQcow2State *s = bs->opaque;
@@ -957,8 +958,9 @@ static void set_readonly_helper(gpointer bitmap, gpointer value)
 * If header_updated is not NULL then it is set appropriately regardless of
 * the return value.
 */
-bool coroutine_fn qcow2_load_dirty_bitmaps(BlockDriverState *bs,
-                                           bool *header_updated, Error **errp)
+bool coroutine_fn GRAPH_RDLOCK
+qcow2_load_dirty_bitmaps(BlockDriverState *bs,
+                         bool *header_updated, Error **errp)
 {
    BDRVQcow2State *s = bs->opaque;
    Qcow2BitmapList *bm_list;
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -48,7 +48,7 @@ int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs,
    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
 #endif

-    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
    ret = bdrv_co_pwrite_zeroes(bs->file,
                                s->l1_table_offset + new_l1_size * L1E_SIZE,
                                (s->l1_size - new_l1_size) * L1E_SIZE, 0);
@@ -61,7 +61,7 @@ int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs,
        goto fail;
    }

-    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
            continue;
@@ -501,7 +501,7 @@ do_perform_cow_read(BlockDriverState *bs, uint64_t src_cluster_offset,
        return 0;
    }

-    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_READ);

    if (!bs->drv) {
        return -ENOMEDIUM;
@@ -551,7 +551,7 @@ do_perform_cow_write(BlockDriverState *bs, uint64_t cluster_offset,
        return ret;
    }

-    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_pwritev(s->data_file, cluster_offset + offset_in_cluster,
                          qiov->size, qiov, 0);
    if (ret < 0) {
@@ -823,10 +823,9 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
 *
 * Return 0 on success and -errno in error cases
 */
-int coroutine_fn qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
-                                                       uint64_t offset,
-                                                       int compressed_size,
-                                                       uint64_t *host_offset)
+int coroutine_fn GRAPH_RDLOCK
+qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset,
+                                      int compressed_size, uint64_t *host_offset)
 {
    BDRVQcow2State *s = bs->opaque;
    int l2_index, ret;
@@ -872,7 +871,7 @@ int coroutine_fn qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,

    /* compressed clusters never have the copied flag */

-    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
    set_l2_entry(s, l2_slice, l2_index, cluster_offset);
    if (has_subclusters(s)) {
@@ -992,7 +991,7 @@ perform_cow(BlockDriverState *bs, QCowL2Meta *m)
        /* NOTE: we have a write_aio blkdebug event here followed by
         * a cow_write one in do_perform_cow_write(), but there's only
         * one single I/O operation */
-        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
    } else {
        /* If there's no guest data then write both COW regions separately */
@@ -2038,8 +2037,9 @@ fail:
 * all clusters in the same L2 slice) and returns the number of zeroed
 * clusters.
 */
-static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
-                            uint64_t nb_clusters, int flags)
+static int coroutine_fn
+zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
+                 uint64_t nb_clusters, int flags)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_slice;
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -118,7 +118,7 @@ int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
            ret = -ENOMEM;
            goto fail;
        }
-        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
                            refcount_table_size2, s->refcount_table, 0);
        if (ret < 0) {
@@ -1069,14 +1069,14 @@ int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offs

 /* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
-int64_t coroutine_fn qcow2_alloc_bytes(BlockDriverState *bs, int size)
+int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int size)
 {
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

-    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

@@ -1524,10 +1524,11 @@ static int realloc_refcount_array(BDRVQcow2State *s, void **array,
 *
 * Modifies the number of errors in res.
 */
-int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
-                             void **refcount_table,
-                             int64_t *refcount_table_size,
-                             int64_t offset, int64_t size)
+int coroutine_fn GRAPH_RDLOCK
+qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
+                         void **refcount_table,
+                         int64_t *refcount_table_size,
+                         int64_t offset, int64_t size)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, last, cluster_offset, k, refcount;
@@ -1538,7 +1539,7 @@ int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
        return 0;
    }

-    file_len = bdrv_getlength(bs->file->bs);
+    file_len = bdrv_co_getlength(bs->file->bs);
    if (file_len < 0) {
        return file_len;
    }
@@ -1600,10 +1601,11 @@ enum {
 *
 * On failure in-memory @l2_table may be modified.
 */
-static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
-                                uint64_t l2_offset,
-                                uint64_t *l2_table, int l2_index, bool active,
-                                bool *metadata_overlap)
+static int coroutine_fn GRAPH_RDLOCK
+fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
+                     uint64_t l2_offset, uint64_t *l2_table,
+                     int l2_index, bool active,
+                     bool *metadata_overlap)
 {
    BDRVQcow2State *s = bs->opaque;
    int ret;
@@ -1634,8 +1636,8 @@ static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
        goto fail;
    }

-    ret = bdrv_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
-                           &l2_table[idx], 0);
+    ret = bdrv_co_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
+                              &l2_table[idx], 0);
    if (ret < 0) {
        fprintf(stderr, "ERROR: Failed to overwrite L2 "
                "table entry: %s\n", strerror(-ret));
@@ -1659,10 +1661,11 @@ fail:
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
-static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
-                              void **refcount_table,
-                              int64_t *refcount_table_size, int64_t l2_offset,
-                              int flags, BdrvCheckMode fix, bool active)
+static int coroutine_fn GRAPH_RDLOCK
+check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
+                   void **refcount_table,
+                   int64_t *refcount_table_size, int64_t l2_offset,
+                   int flags, BdrvCheckMode fix, bool active)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t l2_entry, l2_bitmap;
@@ -1673,7 +1676,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
    bool metadata_overlap;

    /* Read L2 table from disk */
-    ret = bdrv_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
+    ret = bdrv_co_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
@@ -1858,12 +1861,11 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
-static int check_refcounts_l1(BlockDriverState *bs,
-                              BdrvCheckResult *res,
-                              void **refcount_table,
-                              int64_t *refcount_table_size,
-                              int64_t l1_table_offset, int l1_size,
-                              int flags, BdrvCheckMode fix, bool active)
+static int coroutine_fn GRAPH_RDLOCK
+check_refcounts_l1(BlockDriverState *bs, BdrvCheckResult *res,
+                   void **refcount_table, int64_t *refcount_table_size,
+                   int64_t l1_table_offset, int l1_size,
+                   int flags, BdrvCheckMode fix, bool active)
 {
    BDRVQcow2State *s = bs->opaque;
    size_t l1_size_bytes = l1_size * L1E_SIZE;
@@ -1889,7 +1891,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
    }

    /* Read L1 table entries from disk */
-    ret = bdrv_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
+    ret = bdrv_co_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
        res->check_errors++;
@@ -1949,8 +1951,8 @@ static int check_refcounts_l1(BlockDriverState *bs,
 * have been already detected and sufficiently signaled by the calling function
 * (qcow2_check_refcounts) by the time this function is called).
 */
-static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
-                              BdrvCheckMode fix)
+static int coroutine_fn GRAPH_RDLOCK
+check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
@@ -2005,8 +2007,8 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
            }
        }

-        ret = bdrv_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
-                         l2_table, 0);
+        ret = bdrv_co_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
+                            l2_table, 0);
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
@@ -2059,8 +2061,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                goto fail;
            }

-            ret = bdrv_pwrite(bs->file, l2_offset, s->cluster_size, l2_table,
-                              0);
+            ret = bdrv_co_pwrite(bs->file, l2_offset, s->cluster_size, l2_table, 0);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
@@ -2083,9 +2084,10 @@ fail:
 * Checks consistency of refblocks and accounts for each refblock in
 * *refcount_table.
 */
-static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
-                           BdrvCheckMode fix, bool *rebuild,
-                           void **refcount_table, int64_t *nb_clusters)
+static int coroutine_fn GRAPH_RDLOCK
+check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
+                BdrvCheckMode fix, bool *rebuild,
+                void **refcount_table, int64_t *nb_clusters)
 {
    BDRVQcow2State *s = bs->opaque;
    int64_t i, size;
@@ -2127,13 +2129,13 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                    goto resize_fail;
                }

-                ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
-                                    PREALLOC_MODE_OFF, 0, &local_err);
+                ret = bdrv_co_truncate(bs->file, offset + s->cluster_size, false,
+                                       PREALLOC_MODE_OFF, 0, &local_err);
                if (ret < 0) {
                    error_report_err(local_err);
                    goto resize_fail;
                }
-                size = bdrv_getlength(bs->file->bs);
+                size = bdrv_co_getlength(bs->file->bs);
                if (size < 0) {
                    ret = size;
                    goto resize_fail;
@@ -2197,9 +2199,10 @@ resize_fail:
 /*
 * Calculates an in-memory refcount table.
 */
-static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                               BdrvCheckMode fix, bool *rebuild,
-                               void **refcount_table, int64_t *nb_clusters)
+static int coroutine_fn GRAPH_RDLOCK
+calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+                    BdrvCheckMode fix, bool *rebuild,
+                    void **refcount_table, int64_t *nb_clusters)
 {
    BDRVQcow2State *s = bs->opaque;
    int64_t i;
@@ -2299,10 +2302,11 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 * Compares the actual reference count for each cluster in the image against the
 * refcount as reported by the refcount structures on-disk.
 */
-static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                              BdrvCheckMode fix, bool *rebuild,
-                              int64_t *highest_cluster,
-                              void *refcount_table, int64_t nb_clusters)
+static void coroutine_fn
+compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+                  BdrvCheckMode fix, bool *rebuild,
+                  int64_t *highest_cluster,
+                  void *refcount_table, int64_t nb_clusters)
 {
    BDRVQcow2State *s = bs->opaque;
    int64_t i;
@@ -2463,7 +2467,8 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
 * Return whether the on-disk reftable array was resized (true/false),
 * or -errno on error.
 */
-static int rebuild_refcounts_write_refblocks(
+static int coroutine_fn GRAPH_RDLOCK
+rebuild_refcounts_write_refblocks(
        BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
        int64_t first_cluster, int64_t end_cluster,
        uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
@@ -2578,8 +2583,8 @@ static int rebuild_refcounts_write_refblocks(
        on_disk_refblock = (void *)((char *) *refcount_table +
                                    refblock_index * s->cluster_size);

-        ret = bdrv_pwrite(bs->file, refblock_offset, s->cluster_size,
-                          on_disk_refblock, 0);
+        ret = bdrv_co_pwrite(bs->file, refblock_offset, s->cluster_size,
+                             on_disk_refblock, 0);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ERROR writing refblock");
            return ret;
@@ -2601,11 +2606,10 @@ static int rebuild_refcounts_write_refblocks(
 * On success, the old refcount structure is leaked (it will be covered by the
 * new refcount structure).
 */
-static int rebuild_refcount_structure(BlockDriverState *bs,
-                                      BdrvCheckResult *res,
-                                      void **refcount_table,
-                                      int64_t *nb_clusters,
-                                      Error **errp)
+static int coroutine_fn GRAPH_RDLOCK
+rebuild_refcount_structure(BlockDriverState *bs, BdrvCheckResult *res,
+                           void **refcount_table, int64_t *nb_clusters,
+                           Error **errp)
 {
    BDRVQcow2State *s = bs->opaque;
    int64_t reftable_offset = -1;
@@ -2734,8 +2738,8 @@ static int rebuild_refcount_structure(BlockDriverState *bs,
    }

    assert(reftable_length < INT_MAX);
-    ret = bdrv_pwrite(bs->file, reftable_offset, reftable_length,
-                      on_disk_reftable, 0);
+    ret = bdrv_co_pwrite(bs->file, reftable_offset, reftable_length,
+                         on_disk_reftable, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "ERROR writing reftable");
        goto fail;
@@ -2745,10 +2749,10 @@ static int rebuild_refcount_structure(BlockDriverState *bs,
    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
    reftable_offset_and_clusters.reftable_clusters =
        cpu_to_be32(reftable_clusters);
-    ret = bdrv_pwrite_sync(bs->file,
-                           offsetof(QCowHeader, refcount_table_offset),
-                           sizeof(reftable_offset_and_clusters),
-                           &reftable_offset_and_clusters, 0);
+    ret = bdrv_co_pwrite_sync(bs->file,
+                              offsetof(QCowHeader, refcount_table_offset),
+                              sizeof(reftable_offset_and_clusters),
+                              &reftable_offset_and_clusters, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "ERROR setting reftable");
        goto fail;
@@ -2777,8 +2781,8 @@ fail:
 * Returns 0 if no errors are found, the number of errors in case the image is
 * detected as corrupted, and -errno when an internal error occurred.
 */
-int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                          BdrvCheckMode fix)
+int coroutine_fn GRAPH_RDLOCK
+qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
 {
    BDRVQcow2State *s = bs->opaque;
    BdrvCheckResult pre_compare_res;
@@ -2787,7 +2791,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
    bool rebuild = false;
    int ret;

-    size = bdrv_getlength(bs->file->bs);
+    size = bdrv_co_getlength(bs->file->bs);
    if (size < 0) {
        res->check_errors++;
        return size;
@@ -3541,7 +3545,8 @@ done:
    return ret;
 }

-static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
+static int64_t coroutine_fn get_refblock_offset(BlockDriverState *bs,
+                                                uint64_t offset)
 {
    BDRVQcow2State *s = bs->opaque;
    uint32_t index = offset_to_reftable_index(s, offset);
@@ -3707,7 +3712,8 @@ int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
    return -EIO;
 }

-int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs)
+int coroutine_fn GRAPH_RDLOCK
+qcow2_detect_metadata_preallocation(BlockDriverState *bs)
 {
    BDRVQcow2State *s = bs->opaque;
    int64_t i, end_cluster, cluster_count = 0, threshold;
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -570,7 +570,7 @@ int qcow2_mark_corrupt(BlockDriverState *bs)
 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
 * before if necessary.
 */
-int qcow2_mark_consistent(BlockDriverState *bs)
+static int coroutine_fn qcow2_mark_consistent(BlockDriverState *bs)
 {
    BDRVQcow2State *s = bs->opaque;

@@ -2225,7 +2225,7 @@ qcow2_co_preadv_encrypted(BlockDriverState *bs,
        return -ENOMEM;
    }

-    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
    ret = bdrv_co_pread(s->data_file, host_offset, bytes, buf, 0);
    if (ret < 0) {
        goto fail;
@@ -2315,7 +2315,7 @@ qcow2_co_preadv_task(BlockDriverState *bs, QCow2SubclusterType subc_type,
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
        assert(bs->backing); /* otherwise handled in qcow2_co_preadv_part */

-        BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
        return bdrv_co_preadv_part(bs->backing, offset, bytes,
                                   qiov, qiov_offset, 0);

@@ -2329,7 +2329,7 @@ qcow2_co_preadv_task(BlockDriverState *bs, QCow2SubclusterType subc_type,
                                             offset, bytes, qiov, qiov_offset);
        }

-        BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
        return bdrv_co_preadv_part(s->data_file, host_offset,
                                   bytes, qiov, qiov_offset, 0);

@@ -2539,7 +2539,7 @@ handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
            return ret;
        }

-        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
        ret = bdrv_co_pwrite_zeroes(s->data_file, start_offset, nb_bytes,
                                    BDRV_REQ_NO_FALLBACK);
        if (ret < 0) {
@@ -2604,7 +2604,7 @@ int qcow2_co_pwritev_task(BlockDriverState *bs, uint64_t host_offset,
     * guest data now.
     */
    if (!merge_cow(offset, bytes, qiov, qiov_offset, l2meta)) {
-        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
        trace_qcow2_writev_data(qemu_coroutine_self(), host_offset);
        ret = bdrv_co_pwritev_part(s->data_file, host_offset,
                                   bytes, qiov, qiov_offset, 0);
@@ -4678,7 +4678,7 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
        goto fail;
    }

-    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+    BLKDBG_CO_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
    if (ret < 0) {
        goto fail;
@@ -4797,7 +4797,7 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,

    out_buf = qemu_blockalign(bs, s->cluster_size);

-    BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
    ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
    if (ret < 0) {
        goto fail;
@@ -5344,7 +5344,7 @@ qcow2_co_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
        return offset;
    }

-    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
    return bs->drv->bdrv_co_pwritev_part(bs, offset, qiov->size, qiov, 0, 0);
 }

@@ -5356,7 +5356,7 @@ qcow2_co_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
        return offset;
    }

-    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
    return bs->drv->bdrv_co_preadv_part(bs, offset, qiov->size, qiov, 0, 0);
 }

--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -836,7 +836,6 @@ int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,

 int qcow2_mark_dirty(BlockDriverState *bs);
 int qcow2_mark_corrupt(BlockDriverState *bs);
-int qcow2_mark_consistent(BlockDriverState *bs);
 int qcow2_update_header(BlockDriverState *bs);

 void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
@@ -867,7 +866,7 @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t offset,
 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size);
 int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                             int64_t nb_clusters);
-int64_t coroutine_fn qcow2_alloc_bytes(BlockDriverState *bs, int size);
+int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int size);
 void qcow2_free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size,
                          enum qcow2_discard_type type);
@@ -879,8 +878,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,

 int qcow2_flush_caches(BlockDriverState *bs);
 int qcow2_write_caches(BlockDriverState *bs);
-int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                          BdrvCheckMode fix);
+int coroutine_fn qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+                                       BdrvCheckMode fix);

 void qcow2_process_discards(BlockDriverState *bs, int ret);

@@ -888,10 +887,10 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
                                 int64_t size);
 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
                                  int64_t size, bool data_file);
-int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
-                             void **refcount_table,
-                             int64_t *refcount_table_size,
-                             int64_t offset, int64_t size);
+int coroutine_fn qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
+                                          void **refcount_table,
+                                          int64_t *refcount_table_size,
+                                          int64_t offset, int64_t size);

 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
@@ -919,10 +918,9 @@ int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset,
 int coroutine_fn qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
                                         unsigned int *bytes,
                                         uint64_t *host_offset, QCowL2Meta **m);
-int coroutine_fn qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
-                                                       uint64_t offset,
-                                                       int compressed_size,
-                                                       uint64_t *host_offset);
+int coroutine_fn GRAPH_RDLOCK
+qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset,
+                                      int compressed_size, uint64_t *host_offset);
 void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
                                     uint64_t *coffset, int *csize);

@@ -992,11 +990,12 @@ void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset);
 void qcow2_cache_discard(Qcow2Cache *c, void *table);

 /* qcow2-bitmap.c functions */
-int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                                  void **refcount_table,
-                                  int64_t *refcount_table_size);
-bool coroutine_fn qcow2_load_dirty_bitmaps(BlockDriverState *bs,
-                                           bool *header_updated, Error **errp);
+int coroutine_fn
+qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+                              void **refcount_table,
+                              int64_t *refcount_table_size);
+bool coroutine_fn GRAPH_RDLOCK
+qcow2_load_dirty_bitmaps(BlockDriverState *bs, bool *header_updated, Error **errp);
 bool qcow2_get_bitmap_info_list(BlockDriverState *bs,
                                Qcow2BitmapInfoList **info_list, Error **errp);
 int qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp);
--- a/block/qed-check.c
+++ b/block/qed-check.c
@@ -200,7 +200,8 @@ static void qed_check_for_leaks(QEDCheck *check)
 /**
 * Mark an image clean once it passes check or has been repaired
 */
-static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result)
+static void coroutine_fn GRAPH_RDLOCK
+qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result)
 {
    /* Skip if there were unfixable corruptions or I/O errors */
    if (result->corruptions > 0 || result->check_errors > 0) {
@@ -213,7 +214,7 @@ static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result)
    }

    /* Ensure fixes reach storage before clearing check bit */
-    bdrv_flush(s->bs);
+    bdrv_co_flush(s->bs);

    s->header.features &= ~QED_F_NEED_CHECK;
    qed_write_header_sync(s);
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -122,7 +122,7 @@ int coroutine_fn qed_read_l1_table_sync(BDRVQEDState *s)
 int coroutine_fn qed_write_l1_table(BDRVQEDState *s, unsigned int index,
                                    unsigned int n)
 {
-    BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
+    BLKDBG_CO_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
    return qed_write_table(s, s->header.l1_table_offset,
                           s->l1_table, index, n, false);
 }
@@ -150,7 +150,7 @@ int coroutine_fn qed_read_l2_table(BDRVQEDState *s, QEDRequest *request,
    request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
    request->l2_table->table = qed_alloc_table(s);

-    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD);
+    BLKDBG_CO_EVENT(s->bs->file, BLKDBG_L2_LOAD);
    ret = qed_read_table(s, offset, request->l2_table->table);

    if (ret) {
@@ -183,7 +183,7 @@ int coroutine_fn qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
                                    unsigned int index, unsigned int n,
                                    bool flush)
 {
-    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
+    BLKDBG_CO_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
    return qed_write_table(s, request->l2_table->offset,
                           request->l2_table->table, index, n, flush);
 }
--- a/block/qed.c
+++ b/block/qed.c
@@ -195,14 +195,15 @@ static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
 *
 * The string is NUL-terminated.
 */
-static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n,
-                           char *buf, size_t buflen)
+static int coroutine_fn GRAPH_RDLOCK
+qed_read_string(BdrvChild *file, uint64_t offset,
+                size_t n, char *buf, size_t buflen)
 {
    int ret;
    if (n >= buflen) {
        return -EINVAL;
    }
-    ret = bdrv_pread(file, offset, n, buf, 0);
+    ret = bdrv_co_pread(file, offset, n, buf, 0);
    if (ret < 0) {
        return ret;
    }
@@ -882,7 +883,7 @@ static int coroutine_fn GRAPH_RDLOCK
 qed_read_backing_file(BDRVQEDState *s, uint64_t pos, QEMUIOVector *qiov)
 {
    if (s->bs->backing) {
-        BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
+        BLKDBG_CO_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
        return bdrv_co_preadv(s->bs->backing, pos, qiov->size, qiov, 0);
    }
    qemu_iovec_memset(qiov, 0, 0, qiov->size);
@@ -917,7 +918,7 @@ qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, uint64_t len,
        goto out;
    }

-    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
+    BLKDBG_CO_EVENT(s->bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
    if (ret < 0) {
        goto out;
@@ -1069,7 +1070,7 @@ static int coroutine_fn GRAPH_RDLOCK qed_aio_write_main(QEDAIOCB *acb)

    trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);

-    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
+    BLKDBG_CO_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
    return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
                           &acb->cur_qiov, 0);
 }
@@ -1323,7 +1324,7 @@ qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
    } else if (ret != QED_CLUSTER_FOUND) {
        r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov);
    } else {
-        BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
        r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
                           &acb->cur_qiov, 0);
    }
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -214,7 +214,7 @@ raw_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
        return ret;
    }

-    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
    return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }

@@ -268,7 +268,7 @@ raw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
        goto fail;
    }

-    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
    ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);

 fail:
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -169,9 +169,10 @@ exit:
 * It is assumed that 'buffer' is at least 4096*num_sectors large.
 *
 * 0 is returned on success, -errno otherwise */
-static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log,
-                                  uint32_t *sectors_written, void *buffer,
-                                  uint32_t num_sectors)
+static int coroutine_fn GRAPH_RDLOCK
+vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log,
+                       uint32_t *sectors_written, void *buffer,
+                       uint32_t num_sectors)
 {
    int ret = 0;
    uint64_t offset;
@@ -195,8 +196,7 @@ static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log,
            /* full */
            break;
        }
-        ret = bdrv_pwrite(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer_tmp,
-                          0);
+        ret = bdrv_co_pwrite(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer_tmp, 0);
        if (ret < 0) {
            goto exit;
        }
@@ -853,8 +853,9 @@ static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc,
 }


-static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
-                          void *data, uint32_t length, uint64_t offset)
+static int coroutine_fn GRAPH_RDLOCK
+vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
+               void *data, uint32_t length, uint64_t offset)
 {
    int ret = 0;
    void *buffer = NULL;
@@ -924,7 +925,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,

    sectors += partial_sectors;

-    file_length = bdrv_getlength(bs->file->bs);
+    file_length = bdrv_co_getlength(bs->file->bs);
    if (file_length < 0) {
        ret = file_length;
        goto exit;
@@ -971,8 +972,8 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,

        if (i == 0 && leading_length) {
            /* partial sector at the front of the buffer */
-            ret = bdrv_pread(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE,
-                             merged_sector, 0);
+            ret = bdrv_co_pread(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE,
+                                merged_sector, 0);
            if (ret < 0) {
                goto exit;
            }
@@ -981,9 +982,9 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
            sector_write = merged_sector;
        } else if (i == sectors - 1 && trailing_length) {
            /* partial sector at the end of the buffer */
-            ret = bdrv_pread(bs->file, file_offset + trailing_length,
-                             VHDX_LOG_SECTOR_SIZE - trailing_length,
-                             merged_sector + trailing_length, 0);
+            ret = bdrv_co_pread(bs->file, file_offset + trailing_length,
+                                VHDX_LOG_SECTOR_SIZE - trailing_length,
+                                merged_sector + trailing_length, 0);
            if (ret < 0) {
                goto exit;
            }
@@ -1036,8 +1037,9 @@ exit:
 }

 /* Perform a log write, and then immediately flush the entire log */
-int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
-                             void *data, uint32_t length, uint64_t offset)
+int coroutine_fn
+vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
+                         void *data, uint32_t length, uint64_t offset)
 {
    int ret = 0;
    VHDXLogSequence logs = { .valid = true,
@@ -1047,7 +1049,7 @@ int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,

    /* Make sure data written (new and/or changed blocks) is stable
     * on disk, before creating log entry */
-    ret = bdrv_flush(bs);
+    ret = bdrv_co_flush(bs);
    if (ret < 0) {
        goto exit;
    }
@@ -1059,7 +1061,7 @@ int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
    logs.log = s->log;

    /* Make sure log is stable on disk */
-    ret = bdrv_flush(bs);
+    ret = bdrv_co_flush(bs);
    if (ret < 0) {
        goto exit;
    }
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1250,12 +1250,13 @@ exit:
 *
 * Returns the file offset start of the new payload block
 */
-static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
-                               uint64_t *new_offset, bool *need_zero)
+static int coroutine_fn GRAPH_RDLOCK
+vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
+                    uint64_t *new_offset, bool *need_zero)
 {
    int64_t current_len;

-    current_len = bdrv_getlength(bs->file->bs);
+    current_len = bdrv_co_getlength(bs->file->bs);
    if (current_len < 0) {
        return current_len;
    }
@@ -1271,16 +1272,16 @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
    if (*need_zero) {
        int ret;

-        ret = bdrv_truncate(bs->file, *new_offset + s->block_size, false,
-                            PREALLOC_MODE_OFF, BDRV_REQ_ZERO_WRITE, NULL);
+        ret = bdrv_co_truncate(bs->file, *new_offset + s->block_size, false,
+                               PREALLOC_MODE_OFF, BDRV_REQ_ZERO_WRITE, NULL);
        if (ret != -ENOTSUP) {
            *need_zero = false;
            return ret;
        }
    }

-    return bdrv_truncate(bs->file, *new_offset + s->block_size, false,
-                         PREALLOC_MODE_OFF, 0, NULL);
+    return bdrv_co_truncate(bs->file, *new_offset + s->block_size, false,
+                            PREALLOC_MODE_OFF, 0, NULL);
 }

 /*
@@ -1572,12 +1573,10 @@ exit:
 * The first 64KB of the Metadata section is reserved for the metadata
 * header and entries; beyond that, the metadata items themselves reside.
 */
-static int vhdx_create_new_metadata(BlockBackend *blk,
-                                    uint64_t image_size,
-                                    uint32_t block_size,
-                                    uint32_t sector_size,
-                                    uint64_t metadata_offset,
-                                    VHDXImageType type)
+static int coroutine_fn
+vhdx_create_new_metadata(BlockBackend *blk, uint64_t image_size,
+                         uint32_t block_size, uint32_t sector_size,
+                         uint64_t metadata_offset, VHDXImageType type)
 {
    int ret = 0;
    uint32_t offset = 0;
@@ -1668,13 +1667,13 @@ static int vhdx_create_new_metadata(BlockBackend *blk,
                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
    vhdx_metadata_entry_le_export(&md_table_entry[4]);

-    ret = blk_pwrite(blk, metadata_offset, VHDX_HEADER_BLOCK_SIZE, buffer, 0);
+    ret = blk_co_pwrite(blk, metadata_offset, VHDX_HEADER_BLOCK_SIZE, buffer, 0);
    if (ret < 0) {
        goto exit;
    }

-    ret = blk_pwrite(blk, metadata_offset + (64 * KiB),
-                     VHDX_METADATA_ENTRY_BUFFER_SIZE, entry_buffer, 0);
+    ret = blk_co_pwrite(blk, metadata_offset + (64 * KiB),
+                        VHDX_METADATA_ENTRY_BUFFER_SIZE, entry_buffer, 0);
    if (ret < 0) {
        goto exit;
    }
@@ -1694,10 +1693,11 @@ exit:
 *  Fixed images: default state of the BAT is fully populated, with
 *                file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT.
 */
-static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
-                           uint64_t image_size, VHDXImageType type,
-                           bool use_zero_blocks, uint64_t file_offset,
-                           uint32_t length, Error **errp)
+static int coroutine_fn
+vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
+                uint64_t image_size, VHDXImageType type,
+                bool use_zero_blocks, uint64_t file_offset,
+                uint32_t length, Error **errp)
 {
    int ret = 0;
    uint64_t data_file_offset;
@@ -1718,14 +1718,14 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
    if (type == VHDX_TYPE_DYNAMIC) {
        /* All zeroes, so we can just extend the file - the end of the BAT
         * is the furthest thing we have written yet */
-        ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF,
-                           0, errp);
+        ret = blk_co_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF,
+                              0, errp);
        if (ret < 0) {
            goto exit;
        }
    } else if (type == VHDX_TYPE_FIXED) {
-        ret = blk_truncate(blk, data_file_offset + image_size, false,
-                           PREALLOC_MODE_OFF, 0, errp);
+        ret = blk_co_truncate(blk, data_file_offset + image_size, false,
+                              PREALLOC_MODE_OFF, 0, errp);
        if (ret < 0) {
            goto exit;
        }
@@ -1759,7 +1759,7 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
            s->bat[sinfo.bat_idx] = cpu_to_le64(s->bat[sinfo.bat_idx]);
            sector_num += s->sectors_per_block;
        }
-        ret = blk_pwrite(blk, file_offset, length, s->bat, 0);
+        ret = blk_co_pwrite(blk, file_offset, length, s->bat, 0);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to write the BAT");
            goto exit;
@@ -1780,15 +1780,12 @@ exit:
 * to create the BAT itself, we will also cause the BAT to be
 * created.
 */
-static int vhdx_create_new_region_table(BlockBackend *blk,
-                                        uint64_t image_size,
-                                        uint32_t block_size,
-                                        uint32_t sector_size,
-                                        uint32_t log_size,
-                                        bool use_zero_blocks,
-                                        VHDXImageType type,
-                                        uint64_t *metadata_offset,
-                                        Error **errp)
+static int coroutine_fn
+vhdx_create_new_region_table(BlockBackend *blk, uint64_t image_size,
+                             uint32_t block_size, uint32_t sector_size,
+                             uint32_t log_size, bool use_zero_blocks,
+                             VHDXImageType type, uint64_t *metadata_offset,
+                             Error **errp)
 {
    int ret = 0;
    uint32_t offset = 0;
@@ -1863,15 +1860,15 @@ static int vhdx_create_new_region_table(BlockBackend *blk,
    }

    /* Now write out the region headers to disk */
-    ret = blk_pwrite(blk, VHDX_REGION_TABLE_OFFSET, VHDX_HEADER_BLOCK_SIZE,
-                     buffer, 0);
+    ret = blk_co_pwrite(blk, VHDX_REGION_TABLE_OFFSET, VHDX_HEADER_BLOCK_SIZE,
+                        buffer, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write first region table");
        goto exit;
    }

-    ret = blk_pwrite(blk, VHDX_REGION_TABLE2_OFFSET, VHDX_HEADER_BLOCK_SIZE,
-                     buffer, 0);
+    ret = blk_co_pwrite(blk, VHDX_REGION_TABLE2_OFFSET, VHDX_HEADER_BLOCK_SIZE,
+                        buffer, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write second region table");
        goto exit;
--- a/block/vhdx.h
+++ b/block/vhdx.h
@@ -413,8 +413,9 @@ bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset);
 int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed,
                   Error **errp);

-int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
-                             void *data, uint32_t length, uint64_t offset);
+int coroutine_fn GRAPH_RDLOCK
+vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
+                         void *data, uint32_t length, uint64_t offset);

 static inline void leguid_to_cpus(MSGUID *guid)
 {
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -339,7 +339,8 @@ out:
    return ret;
 }

-static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
+static int coroutine_fn GRAPH_RDLOCK
+vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
 {
    char *desc, *tmp_desc;
    char *p_name, *tmp_str;
@@ -348,7 +349,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)

    desc = g_malloc0(DESC_SIZE);
    tmp_desc = g_malloc0(DESC_SIZE);
-    ret = bdrv_pread(bs->file, s->desc_offset, DESC_SIZE, desc, 0);
+    ret = bdrv_co_pread(bs->file, s->desc_offset, DESC_SIZE, desc, 0);
    if (ret < 0) {
        goto out;
    }
@@ -368,7 +369,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
        pstrcat(desc, DESC_SIZE, tmp_desc);
    }

-    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, DESC_SIZE, desc, 0);
+    ret = bdrv_co_pwrite_sync(bs->file, s->desc_offset, DESC_SIZE, desc, 0);

 out:
    g_free(desc);
@@ -1437,7 +1438,7 @@ get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent,
    if (skip_start_bytes > 0) {
        if (copy_from_backing) {
            /* qcow2 emits this on bs->file instead of bs->backing */
-            BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
+            BLKDBG_CO_EVENT(extent->file, BLKDBG_COW_READ);
            ret = bdrv_co_pread(bs->backing, offset, skip_start_bytes,
                                whole_grain, 0);
            if (ret < 0) {
@@ -1445,7 +1446,7 @@ get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent,
                goto exit;
            }
        }
-        BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
+        BLKDBG_CO_EVENT(extent->file, BLKDBG_COW_WRITE);
        ret = bdrv_co_pwrite(extent->file, cluster_offset, skip_start_bytes,
                             whole_grain, 0);
        if (ret < 0) {
@@ -1457,7 +1458,7 @@ get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent,
    if (skip_end_bytes < cluster_bytes) {
        if (copy_from_backing) {
            /* qcow2 emits this on bs->file instead of bs->backing */
-            BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
+            BLKDBG_CO_EVENT(extent->file, BLKDBG_COW_READ);
            ret = bdrv_co_pread(bs->backing, offset + skip_end_bytes,
                                cluster_bytes - skip_end_bytes,
                                whole_grain + skip_end_bytes, 0);
@@ -1466,7 +1467,7 @@ get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent,
                goto exit;
            }
        }
-        BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
+        BLKDBG_CO_EVENT(extent->file, BLKDBG_COW_WRITE);
        ret = bdrv_co_pwrite(extent->file, cluster_offset + skip_end_bytes,
                             cluster_bytes - skip_end_bytes,
                             whole_grain + skip_end_bytes, 0);
@@ -1487,7 +1488,7 @@ vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, uint32_t offset)
 {
    offset = cpu_to_le32(offset);
    /* update L2 table */
-    BLKDBG_EVENT(extent->file, BLKDBG_L2_UPDATE);
+    BLKDBG_CO_EVENT(extent->file, BLKDBG_L2_UPDATE);
    if (bdrv_co_pwrite(extent->file,
                       ((int64_t)m_data->l2_offset * 512)
                           + (m_data->l2_index * sizeof(offset)),
@@ -1617,7 +1618,7 @@ get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent,
        }
    }
    l2_table = (char *)extent->l2_cache + (min_index * l2_size_bytes);
-    BLKDBG_EVENT(extent->file, BLKDBG_L2_LOAD);
+    BLKDBG_CO_EVENT(extent->file, BLKDBG_L2_LOAD);
    if (bdrv_co_pread(extent->file,
                (int64_t)l2_offset * 512,
                l2_size_bytes,
@@ -1828,12 +1829,12 @@ vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
        n_bytes = buf_len + sizeof(VmdkGrainMarker);
        qemu_iovec_init_buf(&local_qiov, data, n_bytes);

-        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_COMPRESSED);
+        BLKDBG_CO_EVENT(extent->file, BLKDBG_WRITE_COMPRESSED);
    } else {
        qemu_iovec_init(&local_qiov, qiov->niov);
        qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes);

-        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_AIO);
+        BLKDBG_CO_EVENT(extent->file, BLKDBG_WRITE_AIO);
    }

    write_offset = cluster_offset + offset_in_cluster;
@@ -1875,7 +1876,7 @@ vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,


    if (!extent->compressed) {
-        BLKDBG_EVENT(extent->file, BLKDBG_READ_AIO);
+        BLKDBG_CO_EVENT(extent->file, BLKDBG_READ_AIO);
        ret = bdrv_co_preadv(extent->file,
                             cluster_offset + offset_in_cluster, bytes,
                             qiov, 0);
@@ -1889,7 +1890,7 @@ vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
    buf_bytes = cluster_bytes * 2;
    cluster_buf = g_malloc(buf_bytes);
    uncomp_buf = g_malloc(cluster_bytes);
-    BLKDBG_EVENT(extent->file, BLKDBG_READ_COMPRESSED);
+    BLKDBG_CO_EVENT(extent->file, BLKDBG_READ_COMPRESSED);
    ret = bdrv_co_pread(extent->file, cluster_offset, buf_bytes, cluster_buf,
                        0);
    if (ret < 0) {
@@ -1967,7 +1968,7 @@ vmdk_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);

                /* qcow2 emits this on bs->file instead of bs->backing */
-                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+                BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                ret = bdrv_co_preadv(bs->backing, offset, n_bytes,
                                     &local_qiov, 0);
                if (ret < 0) {
@@ -2131,7 +2132,7 @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, int64_t offset, int64_t bytes,
        int64_t length;

        for (i = 0; i < s->num_extents; i++) {
-            length = bdrv_getlength(s->extents[i].file->bs);
+            length = bdrv_co_getlength(s->extents[i].file->bs);
            if (length < 0) {
                return length;
            }
@@ -2165,7 +2166,7 @@ vmdk_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
    return ret;
 }

-static int GRAPH_UNLOCKED
+static int coroutine_fn GRAPH_UNLOCKED
 vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress,
                 bool zeroed_grain, Error **errp)
 {
@@ -2176,7 +2177,7 @@ vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress,
    int gd_buf_size;

    if (flat) {
-        ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp);
+        ret = blk_co_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp);
        goto exit;
    }
    magic = cpu_to_be32(VMDK4_MAGIC);
@@ -2228,19 +2229,19 @@ vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress,
    header.check_bytes[3] = 0xa;

    /* write all the data */
-    ret = blk_pwrite(blk, 0, sizeof(magic), &magic, 0);
+    ret = blk_co_pwrite(blk, 0, sizeof(magic), &magic, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }
-    ret = blk_pwrite(blk, sizeof(magic), sizeof(header), &header, 0);
+    ret = blk_co_pwrite(blk, sizeof(magic), sizeof(header), &header, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }

-    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
-                       PREALLOC_MODE_OFF, 0, errp);
+    ret = blk_co_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
+                          PREALLOC_MODE_OFF, 0, errp);
    if (ret < 0) {
        goto exit;
    }
@@ -2252,8 +2253,8 @@ vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress,
         i < gt_count; i++, tmp += gt_size) {
        gd_buf[i] = cpu_to_le32(tmp);
    }
-    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                     gd_buf_size, gd_buf, 0);
+    ret = blk_co_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
+                        gd_buf_size, gd_buf, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -2264,8 +2265,8 @@ vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress,
         i < gt_count; i++, tmp += gt_size) {
        gd_buf[i] = cpu_to_le32(tmp);
    }
-    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                     gd_buf_size, gd_buf, 0);
+    ret = blk_co_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                        gd_buf_size, gd_buf, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
    }
@@ -2908,7 +2909,7 @@ vmdk_co_check(BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix)
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int64_t sector_num = 0;
-    int64_t total_sectors = bdrv_nb_sectors(bs);
+    int64_t total_sectors = bdrv_co_nb_sectors(bs);
    int ret;
    uint64_t cluster_offset;

@@ -2938,7 +2939,7 @@ vmdk_co_check(BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix)
            break;
        }
        if (ret == VMDK_OK) {
-            int64_t extent_len = bdrv_getlength(extent->file->bs);
+            int64_t extent_len = bdrv_co_getlength(extent->file->bs);
            if (extent_len < 0) {
                fprintf(stderr,
                        "ERROR: could not get extent file length for sector %"
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -486,8 +486,8 @@ static int vpc_reopen_prepare(BDRVReopenState *state,
 * operation (the block bitmaps is updated then), 0 otherwise.
 * If write is true then err must not be NULL.
 */
-static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
-                                       bool write, int *err)
+static int64_t coroutine_fn GRAPH_RDLOCK
+get_image_offset(BlockDriverState *bs, uint64_t offset, bool write, int *err)
 {
    BDRVVPCState *s = bs->opaque;
    uint64_t bitmap_offset, block_offset;
@@ -515,8 +515,7 @@ static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,

        s->last_bitmap_offset = bitmap_offset;
        memset(bitmap, 0xff, s->bitmap_size);
-        r = bdrv_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap,
-                             0);
+        r = bdrv_co_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap, 0);
        if (r < 0) {
            *err = r;
            return -2;
@@ -532,13 +531,13 @@ static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
 *
 * Returns 0 on success and < 0 on error
 */
-static int rewrite_footer(BlockDriverState *bs)
+static int coroutine_fn GRAPH_RDLOCK rewrite_footer(BlockDriverState *bs)
 {
    int ret;
    BDRVVPCState *s = bs->opaque;
    int64_t offset = s->free_data_block_offset;

-    ret = bdrv_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0);
+    ret = bdrv_co_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0);
    if (ret < 0)
        return ret;

@@ -552,7 +551,8 @@ static int rewrite_footer(BlockDriverState *bs)
 *
 * Returns the sectors' offset in the image file on success and < 0 on error
 */
-static int64_t alloc_block(BlockDriverState *bs, int64_t offset)
+static int64_t coroutine_fn GRAPH_RDLOCK
+alloc_block(BlockDriverState *bs, int64_t offset)
 {
    BDRVVPCState *s = bs->opaque;
    int64_t bat_offset;
@@ -572,8 +572,8 @@ static int64_t alloc_block(BlockDriverState *bs, int64_t offset)

    /* Initialize the block's bitmap */
    memset(bitmap, 0xff, s->bitmap_size);
-    ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset,
-                           s->bitmap_size, bitmap, 0);
+    ret = bdrv_co_pwrite_sync(bs->file, s->free_data_block_offset,
+                              s->bitmap_size, bitmap, 0);
    if (ret < 0) {
        return ret;
    }
@@ -587,7 +587,7 @@ static int64_t alloc_block(BlockDriverState *bs, int64_t offset)
    /* Write BAT entry to disk */
    bat_offset = s->bat_offset + (4 * index);
    bat_value = cpu_to_be32(s->pagetable[index]);
-    ret = bdrv_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0);
+    ret = bdrv_co_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0);
    if (ret < 0)
        goto fail;

@@ -718,11 +718,11 @@ fail:
    return ret;
 }

-static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
-                                            bool want_zero,
-                                            int64_t offset, int64_t bytes,
-                                            int64_t *pnum, int64_t *map,
-                                            BlockDriverState **file)
+static int coroutine_fn GRAPH_RDLOCK
+vpc_co_block_status(BlockDriverState *bs, bool want_zero,
+                    int64_t offset, int64_t bytes,
+                    int64_t *pnum, int64_t *map,
+                    BlockDriverState **file)
 {
    BDRVVPCState *s = bs->opaque;
    int64_t image_offset;
@@ -820,8 +820,8 @@ static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
    return 0;
 }

-static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
-                               int64_t total_sectors)
+static int coroutine_fn create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
+                                            int64_t total_sectors)
 {
    VHDDynDiskHeader dyndisk_header;
    uint8_t bat_sector[512];
@@ -834,13 +834,13 @@ static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
    block_size = 0x200000;
    num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);

-    ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0);
+    ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0);
    if (ret < 0) {
        goto fail;
    }

    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
-    ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0);
+    ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0);
    if (ret < 0) {
        goto fail;
    }
@@ -850,7 +850,7 @@ static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,

    memset(bat_sector, 0xFF, 512);
    for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
-        ret = blk_pwrite(blk, offset, 512, bat_sector, 0);
+        ret = blk_co_pwrite(blk, offset, 512, bat_sector, 0);
        if (ret < 0) {
            goto fail;
        }
@@ -878,7 +878,7 @@ static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
    /* Write the header */
    offset = 512;

-    ret = blk_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0);
+    ret = blk_co_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0);
    if (ret < 0) {
        goto fail;
    }
@@ -888,21 +888,21 @@ static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
    return ret;
 }

-static int create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
-                             int64_t total_size, Error **errp)
+static int coroutine_fn create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
+                                          int64_t total_size, Error **errp)
 {
    int ret;

    /* Add footer to total size */
    total_size += sizeof(*footer);

-    ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
+    ret = blk_co_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
    if (ret < 0) {
        return ret;
    }

-    ret = blk_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer),
-                     footer, 0);
+    ret = blk_co_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer),
+                        footer, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Unable to write VHD header");
        return ret;
--- a/blockjob.c
+++ b/blockjob.c
@@ -230,20 +230,27 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
                       uint64_t perm, uint64_t shared_perm, Error **errp)
 {
    BdrvChild *c;
+    AioContext *ctx = bdrv_get_aio_context(bs);
    bool need_context_ops;
    GLOBAL_STATE_CODE();

    bdrv_ref(bs);

-    need_context_ops = bdrv_get_aio_context(bs) != job->job.aio_context;
+    need_context_ops = ctx != job->job.aio_context;

-    if (need_context_ops && job->job.aio_context != qemu_get_aio_context()) {
-        aio_context_release(job->job.aio_context);
+    if (need_context_ops) {
+        if (job->job.aio_context != qemu_get_aio_context()) {
+            aio_context_release(job->job.aio_context);
+        }
+        aio_context_acquire(ctx);
    }
    c = bdrv_root_attach_child(bs, name, &child_job, 0, perm, shared_perm, job,
                               errp);
-    if (need_context_ops && job->job.aio_context != qemu_get_aio_context()) {
-        aio_context_acquire(job->job.aio_context);
+    if (need_context_ops) {
+        aio_context_release(ctx);
+        if (job->job.aio_context != qemu_get_aio_context()) {
+            aio_context_acquire(job->job.aio_context);
+        }
    }
    if (c == NULL) {
        return -EPERM;
--- a/chardev/char-win-stdio.c
+++ b/chardev/char-win-stdio.c
@@ -190,7 +190,7 @@ static void qemu_chr_open_stdio(Chardev *chr,
        }
    }

-    dwMode |= ENABLE_LINE_INPUT;
+    dwMode |= ENABLE_LINE_INPUT | ENABLE_VIRTUAL_TERMINAL_INPUT;

    if (is_console) {
        /* set the terminal in raw mode */
--- a/docs/devel/testing.rst
+++ b/docs/devel/testing.rst
@@ -558,7 +558,7 @@ When CI tasks, maintainers or yourself report a Docker test failure, follow the
 below steps to debug it:

 1. Locally reproduce the failure with the reported command line. E.g. run
-   ``make docker-test-mingw@fedora J=8``.
+   ``make docker-test-mingw@fedora-win64-cross J=8``.
 2. Add "V=1" to the command line, try again, to see the verbose output.
 3. Further add "DEBUG=1" to the command line. This will pause in a shell prompt
   in the container right before testing starts. You could either manually
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/vfio-migration.rst
@@ -7,12 +7,21 @@ the guest is running on source host and restoring this saved state on the
 destination host. This document details how saving and restoring of VFIO
 devices is done in QEMU.

-Migration of VFIO devices currently consists of a single stop-and-copy phase.
-During the stop-and-copy phase the guest is stopped and the entire VFIO device
-data is transferred to the destination.
+Migration of VFIO devices consists of two phases: the optional pre-copy phase,
+and the stop-and-copy phase. The pre-copy phase is iterative and allows to
+accommodate VFIO devices that have a large amount of data that needs to be
+transferred. The iterative pre-copy phase of migration allows for the guest to
+continue whilst the VFIO device state is transferred to the destination, this
+helps to reduce the total downtime of the VM. VFIO devices opt-in to pre-copy
+support by reporting the VFIO_MIGRATION_PRE_COPY flag in the
+VFIO_DEVICE_FEATURE_MIGRATION ioctl.

-The pre-copy phase of migration is currently not supported for VFIO devices.
-Support for VFIO pre-copy will be added later on.
+When pre-copy is supported, it's possible to further reduce downtime by
+enabling "switchover-ack" migration capability.
+VFIO migration uAPI defines "initial bytes" as part of its pre-copy data stream
+and recommends that the initial bytes are sent and loaded in the destination
+before stopping the source VM. Enabling this migration capability will
+guarantee that and thus, can potentially reduce downtime even further.

 Note that currently VFIO migration is supported only for a single device. This
 is due to VFIO migration's lack of P2P support. However, P2P support is planned
@@ -29,10 +38,23 @@ VFIO implements the device hooks for the iterative approach as follows:
 * A ``load_setup`` function that sets the VFIO device on the destination in
  _RESUMING state.

+* A ``state_pending_estimate`` function that reports an estimate of the
+  remaining pre-copy data that the vendor driver has yet to save for the VFIO
+  device.
+
 * A ``state_pending_exact`` function that reads pending_bytes from the vendor
  driver, which indicates the amount of data that the vendor driver has yet to
  save for the VFIO device.

+* An ``is_active_iterate`` function that indicates ``save_live_iterate`` is
+  active only when the VFIO device is in pre-copy states.
+
+* A ``save_live_iterate`` function that reads the VFIO device's data from the
+  vendor driver during iterative pre-copy phase.
+
+* A ``switchover_ack_needed`` function that checks if the VFIO device uses
+  "switchover-ack" migration capability when this capability is enabled.
+
 * A ``save_state`` function to save the device config space if it is present.

 * A ``save_live_complete_precopy`` function that sets the VFIO device in
@@ -111,8 +133,10 @@ Flow of state changes during Live migration
 ===========================================

 Below is the flow of state change during live migration.
-The values in the brackets represent the VM state, the migration state, and
+The values in the parentheses represent the VM state, the migration state, and
 the VFIO device state, respectively.
+The text in the square brackets represents the flow if the VFIO device supports
+pre-copy.

 Live migration save path
 ------------------------
@@ -124,11 +148,12 @@ Live migration save path
                                  |
                     migrate_init spawns migration_thread
                Migration thread then calls each device's .save_setup()
-                       (RUNNING, _SETUP, _RUNNING)
+                  (RUNNING, _SETUP, _RUNNING [_PRE_COPY])
                                  |
-                      (RUNNING, _ACTIVE, _RUNNING)
-             If device is active, get pending_bytes by .state_pending_exact()
+                  (RUNNING, _ACTIVE, _RUNNING [_PRE_COPY])
+      If device is active, get pending_bytes by .state_pending_{estimate,exact}()
          If total pending_bytes >= threshold_size, call .save_live_iterate()
+                  [Data of VFIO device for pre-copy phase is copied]
        Iterate till total pending bytes converge and are less than threshold
                                  |
  On migration completion, vCPU stops and calls .save_live_complete_precopy for
--- a/docs/system/device-emulation.rst
+++ b/docs/system/device-emulation.rst
@@ -86,6 +86,7 @@ Emulated Devices
   devices/ccid.rst
   devices/cxl.rst
   devices/ivshmem.rst
+   devices/keyboard.rst
   devices/net.rst
   devices/nvme.rst
   devices/usb.rst
--- a/docs/system/devices/keyboard.rst
+++ b/docs/system/devices/keyboard.rst
@@ -0,0 +1,129 @@
+.. _keyboard:
+
+Sparc32 keyboard
+----------------
+SUN Type 4, 5 and 5c keyboards have dip switches to choose the language layout
+of the keyboard. Solaris makes an ioctl to query the value of the dipswitches
+and uses that value to select keyboard layout. Also the SUN bios like the one
+in the file ss5.bin uses this value to support at least some keyboard layouts.
+However, the OpenBIOS provided with qemu is hardcoded to always use an
+US keyboard layout.
+
+With the escc.chnA-sunkbd-layout driver property it is possible to select
+keyboard layout. Example:
+
+-global escc.chnA-sunkbd-layout=de
+
+Depending on type of keyboard, the keyboard can have 6 or 5 dip-switches to
+select keyboard layout, giving up to 64 different layouts. Not all
+combinations are supported by Solaris and even less by Sun OpenBoot BIOS.
+
+The dip switch settings can be given as hexadecimal number, decimal number
+or in some cases as a language string. Examples:
+
+-global escc.chnA-sunkbd-layout=0x2b
+
+-global escc.chnA-sunkbd-layout=43
+
+-global escc.chnA-sunkbd-layout=sv
+
+The above 3 examples all select a swedish keyboard layout. Table 3-15 at
+https://docs.oracle.com/cd/E19683-01/806-6642/new-43/index.html explains which
+keytable file is used for different dip switch settings. The information
+in that table can be summarized in this table:
+
+.. list-table:: Language selection values for escc.chnA-sunkbd-layout
+   :widths: 10 10 10
+   :header-rows: 1
+
+   * - Hexadecimal value
+     - Decimal value
+     - Language code
+   * - 0x21
+     - 33
+     - en-us
+   * - 0x23
+     - 35
+     - fr
+   * - 0x24
+     - 36
+     - da
+   * - 0x25
+     - 37
+     - de
+   * - 0x26
+     - 38
+     - it
+   * - 0x27
+     - 39
+     - nl
+   * - 0x28
+     - 40
+     - no
+   * - 0x29
+     - 41
+     - pt
+   * - 0x2a
+     - 42
+     - es
+   * - 0x2b
+     - 43
+     - sv
+   * - 0x2c
+     - 44
+     - fr-ch
+   * - 0x2d
+     - 45
+     - de-ch
+   * - 0x2e
+     - 46
+     - en-gb
+   * - 0x2f
+     - 47
+     - ko
+   * - 0x30
+     - 48
+     - tw
+   * - 0x31
+     - 49
+     - ja
+   * - 0x32
+     - 50
+     - fr-ca
+   * - 0x33
+     - 51
+     - hu
+   * - 0x34
+     - 52
+     - pl
+   * - 0x35
+     - 53
+     - cz
+   * - 0x36
+     - 54
+     - ru
+   * - 0x37
+     - 55
+     - lv
+   * - 0x38
+     - 56
+     - tr
+   * - 0x39
+     - 57
+     - gr
+   * - 0x3a
+     - 58
+     - ar
+   * - 0x3b
+     - 59
+     - lt
+   * - 0x3c
+     - 60
+     - nl-be
+   * - 0x3c
+     - 60
+     - be
+
+Not all dip switch values have a corresponding language code and both "be" and
+"nl-be" correspond to the same dip switch value. By default, if no value is
+given to escc.chnA-sunkbd-layout 0x21 (en-us) will be used.
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -212,6 +212,41 @@ The namespace may be configured with additional parameters
  the minimum memory page size (CAP.MPSMIN). The default value (``0``)
  has this property inherit the ``mdts`` value.

+Flexible Data Placement
+-----------------------
+
+The device may be configured to support TP4146 ("Flexible Data Placement") by
+configuring it (``fdp=on``) on the subsystem::
+
+    -device nvme-subsys,id=nvme-subsys-0,nqn=subsys0,fdp=on,fdp.nruh=16
+
+The subsystem emulates a single Endurance Group, on which Flexible Data
+Placement will be supported. Also note that the device emulation deviates
+slightly from the specification, by always enabling the "FDP Mode" feature on
+the controller if the subsystems is configured for Flexible Data Placement.
+
+Enabling Flexible Data Placement on the subsyste enables the following
+parameters:
+
+``fdp.nrg`` (default: ``1``)
+  Set the number of Reclaim Groups.
+
+``fdp.nruh`` (default: ``0``)
+  Set the number of Reclaim Unit Handles. This is a mandatory paramater and
+  must be non-zero.
+
+``fdp.runs`` (default: ``96M``)
+  Set the Reclaim Unit Nominal Size. Defaults to 96 MiB.
+
+Namespaces within this subsystem may requests Reclaim Unit Handles::
+
+    -device nvme-ns,drive=nvm-1,fdp.ruhs=RUHLIST
+
+The ``RUHLIST`` is a semicolon separated list (i.e. ``0;1;2;3``) and may
+include ranges (i.e. ``0;8-15``). If no reclaim unit handle list is specified,
+the controller will assign the controller-specified reclaim unit handle to
+placement handle identifier 0.
+
 Metadata
 --------

@@ -320,4 +355,4 @@ controller are:

 .. code-block:: console

-   echo 0000:01:00.1 > /sys/bus/pci/drivers/nvme/bind
+   echo 0000:01:00.1 > /sys/bus/pci/drivers/nvme/bind
--- a/docs/system/target-sparc.rst
+++ b/docs/system/target-sparc.rst
@@ -38,7 +38,7 @@ QEMU emulates the following sun4m peripherals:
 -  Non Volatile RAM M48T02/M48T08

 -  Slave I/O: timers, interrupt controllers, Zilog serial ports,
-   keyboard and power/reset logic
+   :ref:`keyboard` and power/reset logic

 -  ESP SCSI controller with hard disk and CD-ROM support

--- a/hw/arm/sbsa-ref.c
+++ b/hw/arm/sbsa-ref.c
@@ -23,6 +23,7 @@
 #include "qemu/error-report.h"
 #include "qemu/units.h"
 #include "sysemu/device_tree.h"
+#include "sysemu/kvm.h"
 #include "sysemu/numa.h"
 #include "sysemu/runstate.h"
 #include "sysemu/sysemu.h"
@@ -36,6 +37,7 @@
 #include "hw/ide/internal.h"
 #include "hw/ide/ahci_internal.h"
 #include "hw/intc/arm_gicv3_common.h"
+#include "hw/intc/arm_gicv3_its_common.h"
 #include "hw/loader.h"
 #include "hw/pci-host/gpex.h"
 #include "hw/qdev-properties.h"
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -48,12 +48,12 @@
 #include "hw/pci/pci_bus.h"
 #include "hw/pci-host/gpex.h"
 #include "hw/arm/virt.h"
+#include "hw/intc/arm_gicv3_its_common.h"
 #include "hw/mem/nvdimm.h"
 #include "hw/platform-bus.h"
 #include "sysemu/numa.h"
 #include "sysemu/reset.h"
 #include "sysemu/tpm.h"
-#include "kvm_arm.h"
 #include "migration/vmstate.h"
 #include "hw/acpi/ghes.h"
 #include "hw/acpi/viot.h"
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -63,6 +63,7 @@
 #include "hw/arm/fdt.h"
 #include "hw/intc/arm_gic.h"
 #include "hw/intc/arm_gicv3_common.h"
+#include "hw/intc/arm_gicv3_its_common.h"
 #include "hw/irq.h"
 #include "kvm_arm.h"
 #include "hw/firmware/smbios.h"
--- a/hw/arm/xen_arm.c
+++ b/hw/arm/xen_arm.c
@@ -45,7 +45,7 @@ static MemoryListener xen_memory_listener = {
    .log_sync = NULL,
    .log_global_start = NULL,
    .log_global_stop = NULL,
-    .priority = 10,
+    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };

 struct XenArmState {
--- a/hw/char/escc.c
+++ b/hw/char/escc.c
@@ -31,6 +31,8 @@
 #include "qemu/module.h"
 #include "hw/char/escc.h"
 #include "ui/console.h"
+
+#include "qemu/cutils.h"
 #include "trace.h"

 /*
@@ -190,6 +192,7 @@
 #define R_MISC1I 14
 #define R_EXTINT 15

+static uint8_t sunkbd_layout_dip_switch(const char *sunkbd_layout);
 static void handle_kbd_command(ESCCChannelState *s, int val);
 static int serial_can_receive(void *opaque);
 static void serial_receive_byte(ESCCChannelState *s, int ch);
@@ -846,6 +849,79 @@ static QemuInputHandler sunkbd_handler = {
    .event = sunkbd_handle_event,
 };

+static uint8_t sunkbd_layout_dip_switch(const char *kbd_layout)
+{
+    /* Return the value of the dip-switches in a SUN Type 5 keyboard */
+    static uint8_t ret = 0xff;
+
+    if ((ret == 0xff) && kbd_layout) {
+        int i;
+        struct layout_values {
+            const char *lang;
+            uint8_t dip;
+        } languages[] =
+            /*
+             * Dip values from table 3-16 Layouts for Type 4, 5 and 5c Keyboards
+             */
+            {
+                {"en-us", 0x21}, /* U.S.A. (US5.kt) */
+                                 /* 0x22 is some other US (US_UNIX5.kt) */
+                {"fr",    0x23}, /* France (France5.kt) */
+                {"da",    0x24}, /* Denmark (Denmark5.kt) */
+                {"de",    0x25}, /* Germany (Germany5.kt) */
+                {"it",    0x26}, /* Italy (Italy5.kt) */
+                {"nl",    0x27}, /* The Netherlands (Netherland5.kt) */
+                {"no",    0x28}, /* Norway (Norway.kt) */
+                {"pt",    0x29}, /* Portugal (Portugal5.kt) */
+                {"es",    0x2a}, /* Spain (Spain5.kt) */
+                {"sv",    0x2b}, /* Sweden (Sweden5.kt) */
+                {"fr-ch", 0x2c}, /* Switzerland/French (Switzer_Fr5.kt) */
+                {"de-ch", 0x2d}, /* Switzerland/German (Switzer_Ge5.kt) */
+                {"en-gb", 0x2e}, /* Great Britain (UK5.kt) */
+                {"ko",    0x2f}, /* Korea (Korea5.kt) */
+                {"tw",    0x30}, /* Taiwan (Taiwan5.kt) */
+                {"ja",    0x31}, /* Japan (Japan5.kt) */
+                {"fr-ca", 0x32}, /* Canada/French (Canada_Fr5.kt) */
+                {"hu",    0x33}, /* Hungary (Hungary5.kt) */
+                {"pl",    0x34}, /* Poland (Poland5.kt) */
+                {"cz",    0x35}, /* Czech (Czech5.kt) */
+                {"ru",    0x36}, /* Russia (Russia5.kt) */
+                {"lv",    0x37}, /* Latvia (Latvia5.kt) */
+                {"tr",    0x38}, /* Turkey-Q5 (TurkeyQ5.kt) */
+                {"gr",    0x39}, /* Greece (Greece5.kt) */
+                {"ar",    0x3a}, /* Arabic (Arabic5.kt) */
+                {"lt",    0x3b}, /* Lithuania (Lithuania5.kt) */
+                {"nl-be", 0x3c}, /* Belgium (Belgian5.kt) */
+                {"be",    0x3c}, /* Belgium (Belgian5.kt) */
+            };
+
+        for (i = 0;
+             i < sizeof(languages) / sizeof(struct layout_values);
+             i++) {
+            if (!strcmp(kbd_layout, languages[i].lang)) {
+                ret = languages[i].dip;
+                return ret;
+            }
+        }
+
+        /* Found no known language code */
+        if ((kbd_layout[0] >= '0') && (kbd_layout[0] <= '9')) {
+            unsigned int tmp;
+
+            /* As a fallback we also accept numeric dip switch value */
+            if (!qemu_strtoui(kbd_layout, NULL, 0, &tmp)) {
+                ret = tmp;
+            }
+        }
+    }
+
+    if (ret == 0xff) {
+        /* Final fallback if keyboard_layout was not set or recognized */
+        ret = 0x21; /* en-us layout */
+    }
+    return ret;
+}
+
 static void handle_kbd_command(ESCCChannelState *s, int val)
 {
    trace_escc_kbd_command(val);
@@ -867,7 +943,7 @@ static void handle_kbd_command(ESCCChannelState *s, int val)
    case 0xf:
        clear_queue(s);
        put_queue(s, 0xfe);
-        put_queue(s, 0x21); /*  en-us layout */
+        put_queue(s, sunkbd_layout_dip_switch(s->sunkbd_layout));
        break;
    default:
        break;
@@ -976,6 +1052,7 @@ static Property escc_properties[] = {
    DEFINE_PROP_UINT32("chnAtype",  ESCCState, chn[1].type, 0),
    DEFINE_PROP_CHR("chrB", ESCCState, chn[0].chr),
    DEFINE_PROP_CHR("chrA", ESCCState, chn[1].chr),
+    DEFINE_PROP_STRING("chnA-sunkbd-layout", ESCCState, chn[1].sunkbd_layout),
    DEFINE_PROP_END_OF_LIST(),
 };

--- a/hw/core/qdev-properties-system.c
+++ b/hw/core/qdev-properties-system.c
@@ -143,11 +143,15 @@ static void set_drive_helper(Object *obj, Visitor *v, const char *name,
             * aware of iothreads require their BlockBackends to be in the main
             * AioContext.
             */
-            ctx = iothread ? bdrv_get_aio_context(bs) : qemu_get_aio_context();
-            blk = blk_new(ctx, 0, BLK_PERM_ALL);
+            ctx = bdrv_get_aio_context(bs);
+            blk = blk_new(iothread ? ctx : qemu_get_aio_context(),
+                          0, BLK_PERM_ALL);
            blk_created = true;

+            aio_context_acquire(ctx);
            ret = blk_insert_bs(blk, bs, errp);
+            aio_context_release(ctx);
+
            if (ret < 0) {
                goto fail;
            }
--- a/hw/display/virtio-gpu-udmabuf.c
+++ b/hw/display/virtio-gpu-udmabuf.c
@@ -132,7 +132,8 @@ void virtio_gpu_init_udmabuf(struct virtio_gpu_simple_resource *res)
    void *pdata = NULL;

    res->dmabuf_fd = -1;
-    if (res->iov_cnt == 1) {
+    if (res->iov_cnt == 1 &&
+        res->iov[0].iov_len < 4096) {
        pdata = res->iov[0].iov_base;
    } else {
        virtio_gpu_create_udmabuf(res);
--- a/hw/display/virtio-gpu-virgl.c
+++ b/hw/display/virtio-gpu-virgl.c
@@ -18,9 +18,17 @@
 #include "hw/virtio/virtio.h"
 #include "hw/virtio/virtio-gpu.h"

+#include "ui/egl-helpers.h"
+
 #include <virglrenderer.h>

-static struct virgl_renderer_callbacks virtio_gpu_3d_cbs;
+#if VIRGL_RENDERER_CALLBACKS_VERSION >= 4
+static void *
+virgl_get_egl_display(G_GNUC_UNUSED void *cookie)
+{
+    return qemu_egl_display;
+}
+#endif

 static void virgl_cmd_create_resource_2d(VirtIOGPU *g,
                                         struct virtio_gpu_ctrl_command *cmd)
@@ -145,7 +153,6 @@ static void virgl_cmd_set_scanout(VirtIOGPU *g,
                                  struct virtio_gpu_ctrl_command *cmd)
 {
    struct virtio_gpu_set_scanout ss;
-    struct virgl_renderer_resource_info info;
    int ret;

    VIRTIO_GPU_FILL_CMD(ss);
@@ -160,10 +167,20 @@ static void virgl_cmd_set_scanout(VirtIOGPU *g,
    }
    g->parent_obj.enable = 1;

-    memset(&info, 0, sizeof(info));
-
    if (ss.resource_id && ss.r.width && ss.r.height) {
+        struct virgl_renderer_resource_info info;
+        void *d3d_tex2d = NULL;
+
+#ifdef HAVE_VIRGL_D3D_INFO_EXT
+        struct virgl_renderer_resource_info_ext ext;
+        memset(&ext, 0, sizeof(ext));
+        ret = virgl_renderer_resource_get_info_ext(ss.resource_id, &ext);
+        info = ext.base;
+        d3d_tex2d = ext.d3d_tex2d;
+#else
+        memset(&info, 0, sizeof(info));
        ret = virgl_renderer_resource_get_info(ss.resource_id, &info);
+#endif
        if (ret == -1) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: illegal resource specified %d\n",
@@ -178,7 +195,8 @@ static void virgl_cmd_set_scanout(VirtIOGPU *g,
            g->parent_obj.scanout[ss.scanout_id].con, info.tex_id,
            info.flags & VIRTIO_GPU_RESOURCE_FLAG_Y_0_TOP,
            info.width, info.height,
-            ss.r.x, ss.r.y, ss.r.width, ss.r.height);
+            ss.r.x, ss.r.y, ss.r.width, ss.r.height,
+            d3d_tex2d);
    } else {
        dpy_gfx_replace_surface(
            g->parent_obj.scanout[ss.scanout_id].con, NULL);
@@ -607,8 +625,21 @@ void virtio_gpu_virgl_reset(VirtIOGPU *g)
 int virtio_gpu_virgl_init(VirtIOGPU *g)
 {
    int ret;
+    uint32_t flags = 0;

-    ret = virgl_renderer_init(g, 0, &virtio_gpu_3d_cbs);
+#if VIRGL_RENDERER_CALLBACKS_VERSION >= 4
+    if (qemu_egl_display) {
+        virtio_gpu_3d_cbs.version = 4;
+        virtio_gpu_3d_cbs.get_egl_display = virgl_get_egl_display;
+    }
+#endif
+#ifdef VIRGL_RENDERER_D3D11_SHARE_TEXTURE
+    if (qemu_egl_angle_d3d) {
+        flags |= VIRGL_RENDERER_D3D11_SHARE_TEXTURE;
+    }
+#endif
+
+    ret = virgl_renderer_init(g, flags, &virtio_gpu_3d_cbs);
    if (ret != 0) {
        error_report("virgl could not be initialized: %d", ret);
        return ret;
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -258,6 +258,16 @@ static uint32_t calc_image_hostmem(pixman_format_code_t pformat,
    return height * stride;
 }

+#ifdef WIN32
+static void
+win32_pixman_image_destroy(pixman_image_t *image, void *data)
+{
+    HANDLE handle = data;
+
+    qemu_win32_map_free(pixman_image_get_data(image), handle, &error_warn);
+}
+#endif
+
 static void virtio_gpu_resource_create_2d(VirtIOGPU *g,
                                          struct virtio_gpu_ctrl_command *cmd)
 {
@@ -304,12 +314,27 @@ static void virtio_gpu_resource_create_2d(VirtIOGPU *g,

    res->hostmem = calc_image_hostmem(pformat, c2d.width, c2d.height);
    if (res->hostmem + g->hostmem < g->conf_max_hostmem) {
+        void *bits = NULL;
+#ifdef WIN32
+        bits = qemu_win32_map_alloc(res->hostmem, &res->handle, &error_warn);
+        if (!bits) {
+            goto end;
+        }
+#endif
        res->image = pixman_image_create_bits(pformat,
                                              c2d.width,
                                              c2d.height,
-                                              NULL, 0);
+                                              bits, res->hostmem / c2d.height);
+#ifdef WIN32
+        if (res->image) {
+            pixman_image_set_destroy_function(res->image, win32_pixman_image_destroy, res->handle);
+        }
+#endif
    }

+#ifdef WIN32
+end:
+#endif
    if (!res->image) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "%s: resource creation failed %d %d %d\n",
@@ -438,11 +463,11 @@ static void virtio_gpu_transfer_to_host_2d(VirtIOGPU *g,
                                           struct virtio_gpu_ctrl_command *cmd)
 {
    struct virtio_gpu_simple_resource *res;
-    int h;
+    int h, bpp;
    uint32_t src_offset, dst_offset, stride;
-    int bpp;
    pixman_format_code_t format;
    struct virtio_gpu_transfer_to_host_2d t2d;
+    void *img_data;

    VIRTIO_GPU_FILL_CMD(t2d);
    virtio_gpu_t2d_bswap(&t2d);
@@ -471,23 +496,23 @@ static void virtio_gpu_transfer_to_host_2d(VirtIOGPU *g,
    format = pixman_image_get_format(res->image);
    bpp = DIV_ROUND_UP(PIXMAN_FORMAT_BPP(format), 8);
    stride = pixman_image_get_stride(res->image);
+    img_data = pixman_image_get_data(res->image);

-    if (t2d.offset || t2d.r.x || t2d.r.y ||
-        t2d.r.width != pixman_image_get_width(res->image)) {
-        void *img_data = pixman_image_get_data(res->image);
+    if (t2d.r.x || t2d.r.width != pixman_image_get_width(res->image)) {
        for (h = 0; h < t2d.r.height; h++) {
            src_offset = t2d.offset + stride * h;
            dst_offset = (t2d.r.y + h) * stride + (t2d.r.x * bpp);

            iov_to_buf(res->iov, res->iov_cnt, src_offset,
-                       (uint8_t *)img_data
-                       + dst_offset, t2d.r.width * bpp);
+                       (uint8_t *)img_data + dst_offset,
+                       t2d.r.width * bpp);
        }
    } else {
-        iov_to_buf(res->iov, res->iov_cnt, 0,
-                   pixman_image_get_data(res->image),
-                   pixman_image_get_stride(res->image)
-                   * pixman_image_get_height(res->image));
+        src_offset = t2d.offset;
+        dst_offset = t2d.r.y * stride + t2d.r.x * bpp;
+        iov_to_buf(res->iov, res->iov_cnt, src_offset,
+                   (uint8_t *)img_data + dst_offset,
+                   stride * t2d.r.height);
    }
 }

@@ -498,6 +523,8 @@ static void virtio_gpu_resource_flush(VirtIOGPU *g,
    struct virtio_gpu_resource_flush rf;
    struct virtio_gpu_scanout *scanout;
    pixman_region16_t flush_region;
+    bool within_bounds = false;
+    bool update_submitted = false;
    int i;

    VIRTIO_GPU_FILL_CMD(rf);
@@ -518,13 +545,28 @@ static void virtio_gpu_resource_flush(VirtIOGPU *g,
                rf.r.x < scanout->x + scanout->width &&
                rf.r.x + rf.r.width >= scanout->x &&
                rf.r.y < scanout->y + scanout->height &&
-                rf.r.y + rf.r.height >= scanout->y &&
-                console_has_gl(scanout->con)) {
-                dpy_gl_update(scanout->con, 0, 0, scanout->width,
-                              scanout->height);
+                rf.r.y + rf.r.height >= scanout->y) {
+                within_bounds = true;
+
+                if (console_has_gl(scanout->con)) {
+                    dpy_gl_update(scanout->con, 0, 0, scanout->width,
+                                  scanout->height);
+                    update_submitted = true;
+                }
            }
        }
-        return;
+
+        if (update_submitted) {
+            return;
+        }
+        if (!within_bounds) {
+            qemu_log_mask(LOG_GUEST_ERROR, "%s: flush bounds outside scanouts"
+                          " bounds for flush %d: %d %d %d %d\n",
+                          __func__, rf.resource_id, rf.r.x, rf.r.y,
+                          rf.r.width, rf.r.height);
+            cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER;
+            return;
+        }
    }

    if (!res->blob &&
@@ -634,8 +676,10 @@ static void virtio_gpu_do_set_scanout(VirtIOGPU *g,
        if (console_has_gl(scanout->con)) {
            if (!virtio_gpu_update_dmabuf(g, scanout_id, res, fb, r)) {
                virtio_gpu_update_scanout(g, scanout_id, res, r);
-                return;
+            } else {
+                *error = VIRTIO_GPU_RESP_ERR_OUT_OF_MEMORY;
            }
+            return;
        }

        data = res->blob;
@@ -666,6 +710,9 @@ static void virtio_gpu_do_set_scanout(VirtIOGPU *g,
            *error = VIRTIO_GPU_RESP_ERR_UNSPEC;
            return;
        }
+#ifdef WIN32
+        qemu_displaysurface_win32_set_handle(scanout->ds, res->handle, fb->offset);
+#endif

        pixman_image_unref(rect);
        dpy_gfx_replace_surface(g->parent_obj.scanout[scanout_id].con,
@@ -1209,6 +1256,7 @@ static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size,
    struct virtio_gpu_simple_resource *res;
    struct virtio_gpu_scanout *scanout;
    uint32_t resource_id, pformat;
+    void *bits = NULL;
    int i;

    g->hostmem = 0;
@@ -1233,15 +1281,23 @@ static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size,
            g_free(res);
            return -EINVAL;
        }
+
+        res->hostmem = calc_image_hostmem(pformat, res->width, res->height);
+#ifdef WIN32
+        bits = qemu_win32_map_alloc(res->hostmem, &res->handle, &error_warn);
+        if (!bits) {
+            g_free(res);
+            return -EINVAL;
+        }
+#endif
        res->image = pixman_image_create_bits(pformat,
                                              res->width, res->height,
-                                              NULL, 0);
+                                              bits, res->hostmem / res->height);
        if (!res->image) {
            g_free(res);
            return -EINVAL;
        }

-        res->hostmem = calc_image_hostmem(pformat, res->width, res->height);

        res->addrs = g_new(uint64_t, res->iov_cnt);
        res->iov = g_new(struct iovec, res->iov_cnt);
@@ -1302,6 +1358,9 @@ static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size,
        if (!scanout->ds) {
            return -EINVAL;
        }
+#ifdef WIN32
+        qemu_displaysurface_win32_set_handle(scanout->ds, res->handle, 0);
+#endif

        dpy_gfx_replace_surface(scanout->con, scanout->ds);
        dpy_gfx_update_full(scanout->con);
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -467,7 +467,7 @@ static MemoryListener xen_memory_listener = {
    .log_sync = xen_log_sync,
    .log_global_start = xen_log_global_start,
    .log_global_stop = xen_log_global_stop,
-    .priority = 10,
+    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };

 static void regs_to_cpu(vmware_regs_t *vmport_regs, ioreq_t *req)
--- a/hw/intc/arm_gic_common.c
+++ b/hw/intc/arm_gic_common.c
@@ -21,10 +21,12 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/module.h"
+#include "qemu/error-report.h"
 #include "gic_internal.h"
 #include "hw/arm/linux-boot-if.h"
 #include "hw/qdev-properties.h"
 #include "migration/vmstate.h"
+#include "sysemu/kvm.h"

 static int gic_pre_save(void *opaque)
 {
@@ -233,12 +235,12 @@ static void arm_gic_common_realize(DeviceState *dev, Error **errp)
    }
 }

-static inline void arm_gic_common_reset_irq_state(GICState *s, int first_cpu,
+static inline void arm_gic_common_reset_irq_state(GICState *s, int cidx,
                                                  int resetprio)
 {
    int i, j;

-    for (i = first_cpu; i < first_cpu + s->num_cpu; i++) {
+    for (i = cidx; i < cidx + s->num_cpu; i++) {
        if (s->revision == REV_11MPCORE) {
            s->priority_mask[i] = 0xf0;
        } else {
@@ -393,3 +395,8 @@ static void register_types(void)
 }

 type_init(register_types)
+
+const char *gic_class_name(void)
+{
+    return kvm_irqchip_in_kernel() ? "kvm-arm-gic" : "arm_gic";
+}
--- a/hw/intc/arm_gicv3_common.c
+++ b/hw/intc/arm_gicv3_common.c
@@ -24,6 +24,7 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/module.h"
+#include "qemu/error-report.h"
 #include "hw/core/cpu.h"
 #include "hw/intc/arm_gicv3_common.h"
 #include "hw/qdev-properties.h"
@@ -608,3 +609,16 @@ static void register_types(void)
 }

 type_init(register_types)
+
+const char *gicv3_class_name(void)
+{
+    if (kvm_irqchip_in_kernel()) {
+        return "kvm-arm-gicv3";
+    } else {
+        if (kvm_enabled()) {
+            error_report("Userspace GICv3 is not supported with KVM");
+            exit(1);
+        }
+        return "arm-gicv3";
+    }
+}
--- a/hw/intc/arm_gicv3_its_common.c
+++ b/hw/intc/arm_gicv3_its_common.c
@@ -24,6 +24,7 @@
 #include "hw/intc/arm_gicv3_its_common.h"
 #include "qemu/log.h"
 #include "qemu/module.h"
+#include "sysemu/kvm.h"

 static int gicv3_its_pre_save(void *opaque)
 {
@@ -158,3 +159,14 @@ static void gicv3_its_common_register_types(void)
 }

 type_init(gicv3_its_common_register_types)
+
+const char *its_class_name(void)
+{
+    if (kvm_irqchip_in_kernel()) {
+        /* KVM implementation requires this capability */
+        return kvm_direct_msi_enabled() ? "arm-its-kvm" : NULL;
+    } else {
+        /* Software emulation based model */
+        return "arm-gicv3-its";
+    }
+}
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -507,7 +507,12 @@ VHostNetState *get_vhost_net(NetClientState *nc)
    switch (nc->info->type) {
    case NET_CLIENT_DRIVER_TAP:
        vhost_net = tap_get_vhost_net(nc);
-        assert(vhost_net);
+        /*
+         * tap_get_vhost_net() can return NULL if a tap net-device backend is
+         * created with 'vhost=off' option, 'vhostforce=off' or no vhost or
+         * vhostforce or vhostfd options at all. Please see net_init_tap_one().
+         * Hence, we omit the assertion here.
+         */
        break;
 #ifdef CONFIG_VHOST_NET_USER
    case NET_CLIENT_DRIVER_VHOST_USER:
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -43,7 +43,14 @@
 *              subsys=<subsys_id>
 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
 *              zoned=<true|false[optional]>, \
- *              subsys=<subsys_id>,detached=<true|false[optional]>
+ *              subsys=<subsys_id>,shared=<true|false[optional]>, \
+ *              detached=<true|false[optional]>, \
+ *              zoned.zone_size=<N[optional]>, \
+ *              zoned.zone_capacity=<N[optional]>, \
+ *              zoned.descr_ext_size=<N[optional]>, \
+ *              zoned.max_active=<N[optional]>, \
+ *              zoned.max_open=<N[optional]>, \
+ *              zoned.cross_read=<true|false[optional]>
 *
 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
@@ -1748,6 +1755,7 @@ static void nvme_aio_err(NvmeRequest *req, int ret)
    case NVME_CMD_WRITE:
    case NVME_CMD_WRITE_ZEROES:
    case NVME_CMD_ZONE_APPEND:
+    case NVME_CMD_COPY:
        status = NVME_WRITE_FAULT;
        break;
    default:
@@ -2847,6 +2855,25 @@ static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
    }
 }

+static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
+                                           NvmeCopyAIOCB *iocb, uint16_t nr)
+{
+    uint32_t copy_len = 0;
+
+    for (int idx = 0; idx < nr; idx++) {
+        uint32_t nlb;
+        nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
+                                     &nlb, NULL, NULL, NULL);
+        copy_len += nlb + 1;
+    }
+
+    if (copy_len > ns->id_ns.mcl) {
+        return NVME_CMD_SIZE_LIMIT | NVME_DNR;
+    }
+
+    return NVME_SUCCESS;
+}
+
 static void nvme_copy_out_completed_cb(void *opaque, int ret)
 {
    NvmeCopyAIOCB *iocb = opaque;
@@ -3159,6 +3186,11 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
        }
    }

+    status = nvme_check_copy_mcl(ns, iocb, nr);
+    if (status) {
+        goto invalid;
+    }
+
    iocb->req = req;
    iocb->ret = 0;
    iocb->nr = nr;
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -400,8 +400,9 @@ static bool nvme_ns_init_fdp(NvmeNamespace *ns, Error **errp)
    NvmeRuHandle *ruh;
    uint8_t lbafi = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
    g_autofree unsigned int *ruhids = NULL;
-    unsigned int *ruhid;
-    char *r, *p, *token;
+    unsigned int n, m, *ruhid;
+    const char *endptr, *token;
+    char *r, *p;
    uint16_t *ph;

    if (!ns->params.fdp.ruhs) {
@@ -438,23 +439,55 @@ static bool nvme_ns_init_fdp(NvmeNamespace *ns, Error **errp)

    /* parse the placement handle identifiers */
    while ((token = qemu_strsep(&p, ";")) != NULL) {
-        ns->fdp.nphs += 1;
-        if (ns->fdp.nphs > NVME_FDP_MAXPIDS ||
-            ns->fdp.nphs == endgrp->fdp.nruh) {
-            error_setg(errp, "too many placement handles");
-            free(r);
-            return false;
-        }
-
-        if (qemu_strtoui(token, NULL, 0, ruhid++) < 0) {
+        if (qemu_strtoui(token, &endptr, 0, &n) < 0) {
            error_setg(errp, "cannot parse reclaim unit handle identifier");
            free(r);
            return false;
        }
+
+        m = n;
+
+        /* parse range */
+        if (*endptr == '-') {
+            token = endptr + 1;
+
+            if (qemu_strtoui(token, NULL, 0, &m) < 0) {
+                error_setg(errp, "cannot parse reclaim unit handle identifier");
+                free(r);
+                return false;
+            }
+
+            if (m < n) {
+                error_setg(errp, "invalid reclaim unit handle identifier range");
+                free(r);
+                return false;
+            }
+        }
+
+        for (; n <= m; n++) {
+            if (ns->fdp.nphs++ == endgrp->fdp.nruh) {
+                error_setg(errp, "too many placement handles");
+                free(r);
+                return false;
+            }
+
+            *ruhid++ = n;
+        }
    }

    free(r);

+    /* verify that the ruhids are unique */
+    for (unsigned int i = 0; i < ns->fdp.nphs; i++) {
+        for (unsigned int j = i + 1; j < ns->fdp.nphs; j++) {
+            if (ruhids[i] == ruhids[j]) {
+                error_setg(errp, "duplicate reclaim unit handle identifier: %u",
+                           ruhids[i]);
+                return false;
+            }
+        }
+    }
+
    ph = ns->fdp.phs = g_new(uint16_t, ns->fdp.nphs);

    ruhid = ruhids;
--- a/hw/nvme/subsys.c
+++ b/hw/nvme/subsys.c
@@ -158,8 +158,10 @@ static bool nvme_subsys_setup_fdp(NvmeSubsystem *subsys, Error **errp)

    endgrp->fdp.nrg = subsys->params.fdp.nrg;

-    if (!subsys->params.fdp.nruh) {
-        error_setg(errp, "fdp.nruh must be non-zero");
+    if (!subsys->params.fdp.nruh ||
+        subsys->params.fdp.nruh > NVME_FDP_MAXPIDS) {
+        error_setg(errp, "fdp.nruh must be non-zero and less than %u",
+                   NVME_FDP_MAXPIDS);
        return false;
    }

--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -765,7 +765,9 @@ static void mmubooke_create_initial_mapping(CPUPPCState *env)
    tlb->mas7_3 = 0;
    tlb->mas7_3 |= MAS3_UR | MAS3_UW | MAS3_UX | MAS3_SR | MAS3_SW | MAS3_SX;

+#ifdef CONFIG_KVM
    env->tlb_dirty = true;
+#endif
 }

 static void ppce500_cpu_reset_sec(void *opaque)
--- a/hw/ppc/ppce500_spin.c
+++ b/hw/ppc/ppce500_spin.c
@@ -83,7 +83,9 @@ static void mmubooke_create_initial_mapping(CPUPPCState *env,
    tlb->mas2 = (va & TARGET_PAGE_MASK) | MAS2_M;
    tlb->mas7_3 = pa & TARGET_PAGE_MASK;
    tlb->mas7_3 |= MAS3_UR | MAS3_UW | MAS3_UX | MAS3_SR | MAS3_SW | MAS3_SX;
+#ifdef CONFIG_KVM
    env->tlb_dirty = true;
+#endif
 }

 static void spin_kick(CPUState *cs, run_on_cpu_data data)
--- a/hw/remote/proxy-memory-listener.c
+++ b/hw/remote/proxy-memory-listener.c
@@ -217,7 +217,7 @@ void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
    proxy_listener->listener.commit = proxy_memory_listener_commit;
    proxy_listener->listener.region_add = proxy_memory_listener_region_addnop;
    proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop;
-    proxy_listener->listener.priority = 10;
+    proxy_listener->listener.priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND;
    proxy_listener->listener.name = "proxy";

    memory_listener_register(&proxy_listener->listener,
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -289,38 +289,11 @@ static void s390_pci_read_pfip(S390PCIBusDevice *pbdev,
    memcpy(pbdev->zpci_fn.pfip, cap->pfip, CLP_PFIP_NR_SEGMENTS);
 }

-static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev,
-                                                uint32_t argsz)
+static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev)
 {
-    struct vfio_device_info *info = g_malloc0(argsz);
-    VFIOPCIDevice *vfio_pci;
-    int fd;
+    VFIOPCIDevice *vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);

-    vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
-    fd = vfio_pci->vbasedev.fd;
-
-    /*
-     * If the specified argsz is not large enough to contain all capabilities
-     * it will be updated upon return from the ioctl.  Retry until we have
-     * a big enough buffer to hold the entire capability chain.  On error,
-     * just exit and rely on CLP defaults.
-     */
-retry:
-    info->argsz = argsz;
-
-    if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
-        trace_s390_pci_clp_dev_info(vfio_pci->vbasedev.name);
-        g_free(info);
-        return NULL;
-    }
-
-    if (info->argsz > argsz) {
-        argsz = info->argsz;
-        info = g_realloc(info, argsz);
-        goto retry;
-    }
-
-    return info;
+    return vfio_get_device_info(vfio_pci->vbasedev.fd);
 }

 /*
@@ -335,7 +308,7 @@ bool s390_pci_get_host_fh(S390PCIBusDevice *pbdev, uint32_t *fh)

    assert(fh);

-    info = get_device_info(pbdev, sizeof(*info));
+    info = get_device_info(pbdev);
    if (!info) {
        return false;
    }
@@ -356,7 +329,7 @@ void s390_pci_get_clp_info(S390PCIBusDevice *pbdev)
 {
    g_autofree struct vfio_device_info *info = NULL;

-    info = get_device_info(pbdev, sizeof(*info));
+    info = get_device_info(pbdev);
    if (!info) {
        return;
    }
--- a/hw/sparc64/niagara.c
+++ b/hw/sparc64/niagara.c
@@ -23,6 +23,7 @@
 */

 #include "qemu/osdep.h"
+#include "block/block_int-common.h"
 #include "qemu/units.h"
 #include "cpu.h"
 #include "hw/boards.h"
@@ -143,9 +144,10 @@ static void niagara_init(MachineState *machine)
            memory_region_add_subregion(get_system_memory(),
                                        NIAGARA_VDISK_BASE, &s->vdisk_ram);
            dinfo->is_default = 1;
-            rom_add_file_fixed(blk_name(blk), NIAGARA_VDISK_BASE, -1);
+            rom_add_file_fixed(blk_bs(blk)->filename, NIAGARA_VDISK_BASE, -1);
        } else {
-            error_report("could not load ram disk '%s'", blk_name(blk));
+            error_report("could not load ram disk '%s'",
+                         blk_bs(blk)->filename);
            exit(1);
        }
    }
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -381,7 +381,7 @@ static unsigned int vfio_migratable_device_num(void)
    return device_num;
 }

-int vfio_block_multiple_devices_migration(Error **errp)
+int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
 {
    int ret;

@@ -390,6 +390,12 @@ int vfio_block_multiple_devices_migration(Error **errp)
        return 0;
    }

+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_setg(errp, "Migration is currently not supported with multiple "
+                         "VFIO devices");
+        return -EINVAL;
+    }
+
    error_setg(&multiple_devices_migration_blocker,
               "Migration is currently not supported with multiple "
               "VFIO devices");
@@ -427,7 +433,7 @@ static bool vfio_viommu_preset(void)
    return false;
 }

-int vfio_block_giommu_migration(Error **errp)
+int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
 {
    int ret;

@@ -436,6 +442,12 @@ int vfio_block_giommu_migration(Error **errp)
        return 0;
    }

+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_setg(errp,
+                   "Migration is currently not supported with vIOMMU enabled");
+        return -EINVAL;
+    }
+
    error_setg(&giommu_migration_blocker,
               "Migration is currently not supported with vIOMMU enabled");
    ret = migrate_add_blocker(giommu_migration_blocker, errp);
@@ -492,7 +504,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
            }

            if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
-                migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
+                (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+                 migration->device_state == VFIO_DEVICE_STATE_PRE_COPY)) {
                return false;
            }
        }
@@ -537,7 +550,8 @@ static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
                return false;
            }

-            if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
+            if (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+                migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
                continue;
            } else {
                return false;
@@ -2844,11 +2858,35 @@ void vfio_put_group(VFIOGroup *group)
    }
 }

+struct vfio_device_info *vfio_get_device_info(int fd)
+{
+    struct vfio_device_info *info;
+    uint32_t argsz = sizeof(*info);
+
+    info = g_malloc0(argsz);
+
+retry:
+    info->argsz = argsz;
+
+    if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
+        g_free(info);
+        return NULL;
+    }
+
+    if (info->argsz > argsz) {
+        argsz = info->argsz;
+        info = g_realloc(info, argsz);
+        goto retry;
+    }
+
+    return info;
+}
+
 int vfio_get_device(VFIOGroup *group, const char *name,
                    VFIODevice *vbasedev, Error **errp)
 {
-    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
-    int ret, fd;
+    g_autofree struct vfio_device_info *info = NULL;
+    int fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
@@ -2860,11 +2898,11 @@ int vfio_get_device(VFIOGroup *group, const char *name,
        return fd;
    }

-    ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
-    if (ret) {
+    info = vfio_get_device_info(fd);
+    if (!info) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
-        return ret;
+        return -1;
    }

    /*
@@ -2892,14 +2930,14 @@ int vfio_get_device(VFIOGroup *group, const char *name,
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

-    vbasedev->num_irqs = dev_info.num_irqs;
-    vbasedev->num_regions = dev_info.num_regions;
-    vbasedev->flags = dev_info.flags;
+    vbasedev->num_irqs = info->num_irqs;
+    vbasedev->num_regions = info->num_regions;
+    vbasedev->flags = info->flags;

-    trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
-                          dev_info.num_irqs);
+    trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);
+
+    vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);

-    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
    return 0;
 }

--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -18,6 +18,8 @@
 #include "sysemu/runstate.h"
 #include "hw/vfio/vfio-common.h"
 #include "migration/migration.h"
+#include "migration/options.h"
+#include "migration/savevm.h"
 #include "migration/vmstate.h"
 #include "migration/qemu-file.h"
 #include "migration/register.h"
@@ -45,6 +47,7 @@
 #define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
 #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
 #define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
+#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)

 /*
 * This is an arbitrary size based on migration of mlx5 devices, where typically
@@ -68,6 +71,8 @@ static const char *mig_state_to_str(enum vfio_device_mig_state state)
        return "STOP_COPY";
    case VFIO_DEVICE_STATE_RESUMING:
        return "RESUMING";
+    case VFIO_DEVICE_STATE_PRE_COPY:
+        return "PRE_COPY";
    default:
        return "UNKNOWN STATE";
    }
@@ -241,18 +246,45 @@ static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
    return 0;
 }

-/* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */
-static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
+static int vfio_query_precopy_size(VFIOMigration *migration)
+{
+    struct vfio_precopy_info precopy = {
+        .argsz = sizeof(precopy),
+    };
+
+    migration->precopy_init_size = 0;
+    migration->precopy_dirty_size = 0;
+
+    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
+        return -errno;
+    }
+
+    migration->precopy_init_size = precopy.initial_bytes;
+    migration->precopy_dirty_size = precopy.dirty_bytes;
+
+    return 0;
+}
+
+/* Returns the size of saved data on success and -errno on error */
+static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
 {
    ssize_t data_size;

    data_size = read(migration->data_fd, migration->data_buffer,
                     migration->data_buffer_size);
    if (data_size < 0) {
+        /*
+         * Pre-copy emptied all the device state for now. For more information,
+         * please refer to the Linux kernel VFIO uAPI.
+         */
+        if (errno == ENOMSG) {
+            return 0;
+        }
+
        return -errno;
    }
    if (data_size == 0) {
-        return 1;
+        return 0;
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
@@ -262,7 +294,39 @@ static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)

    trace_vfio_save_block(migration->vbasedev->name, data_size);

-    return qemu_file_get_error(f);
+    return qemu_file_get_error(f) ?: data_size;
+}
+
+static void vfio_update_estimated_pending_data(VFIOMigration *migration,
+                                               uint64_t data_size)
+{
+    if (!data_size) {
+        /*
+         * Pre-copy emptied all the device state for now, update estimated sizes
+         * accordingly.
+         */
+        migration->precopy_init_size = 0;
+        migration->precopy_dirty_size = 0;
+
+        return;
+    }
+
+    if (migration->precopy_init_size) {
+        uint64_t init_size = MIN(migration->precopy_init_size, data_size);
+
+        migration->precopy_init_size -= init_size;
+        data_size -= init_size;
+    }
+
+    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
+                                         data_size);
+}
+
+static bool vfio_precopy_supported(VFIODevice *vbasedev)
+{
+    VFIOMigration *migration = vbasedev->migration;
+
+    return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
 }

 /* ---------------------------------------------------------------------- */
@@ -285,6 +349,28 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
        return -ENOMEM;
    }

+    if (vfio_precopy_supported(vbasedev)) {
+        int ret;
+
+        switch (migration->device_state) {
+        case VFIO_DEVICE_STATE_RUNNING:
+            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
+                                           VFIO_DEVICE_STATE_RUNNING);
+            if (ret) {
+                return ret;
+            }
+
+            vfio_query_precopy_size(migration);
+
+            break;
+        case VFIO_DEVICE_STATE_STOP:
+            /* vfio_save_complete_precopy() will go to STOP_COPY */
+            break;
+        default:
+            return -EINVAL;
+        }
+    }
+
    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
@@ -299,26 +385,43 @@ static void vfio_save_cleanup(void *opaque)

    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
+    migration->precopy_init_size = 0;
+    migration->precopy_dirty_size = 0;
+    migration->initial_data_sent = false;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
 }

+static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
+                                        uint64_t *can_postcopy)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+
+    if (migration->device_state != VFIO_DEVICE_STATE_PRE_COPY) {
+        return;
+    }
+
+    *must_precopy +=
+        migration->precopy_init_size + migration->precopy_dirty_size;
+
+    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
+                                      *can_postcopy,
+                                      migration->precopy_init_size,
+                                      migration->precopy_dirty_size);
+}
+
 /*
 * Migration size of VFIO devices can be as little as a few KBs or as big as
 * many GBs. This value should be big enough to cover the worst case.
 */
 #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)

-/*
- * Only exact function is implemented and not estimate function. The reason is
- * that during pre-copy phase of migration the estimate function is called
- * repeatedly while pending RAM size is over the threshold, thus migration
- * can't converge and querying the VFIO device pending data size is useless.
- */
 static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
 {
    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
@@ -328,16 +431,64 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

+    if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
+        vfio_query_precopy_size(migration);
+
+        *must_precopy +=
+            migration->precopy_init_size + migration->precopy_dirty_size;
+    }
+
    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
-                                   stop_copy_size);
+                                   stop_copy_size, migration->precopy_init_size,
+                                   migration->precopy_dirty_size);
+}
+
+static bool vfio_is_active_iterate(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+
+    return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY;
+}
+
+static int vfio_save_iterate(QEMUFile *f, void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+    ssize_t data_size;
+
+    data_size = vfio_save_block(f, migration);
+    if (data_size < 0) {
+        return data_size;
+    }
+
+    vfio_update_estimated_pending_data(migration, data_size);
+
+    if (migrate_switchover_ack() && !migration->precopy_init_size &&
+        !migration->initial_data_sent) {
+        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
+        migration->initial_data_sent = true;
+    } else {
+        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+    }
+
+    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
+                            migration->precopy_dirty_size);
+
+    /*
+     * A VFIO device's pre-copy dirty_bytes is not guaranteed to reach zero.
+     * Return 1 so following handlers will not be potentially blocked.
+     */
+    return 1;
 }

 static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
 {
    VFIODevice *vbasedev = opaque;
+    ssize_t data_size;
    int ret;

-    /* We reach here with device state STOP only */
+    /* We reach here with device state STOP or STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP);
    if (ret) {
@@ -345,11 +496,11 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
    }

    do {
-        ret = vfio_save_block(f, vbasedev->migration);
-        if (ret < 0) {
-            return ret;
+        data_size = vfio_save_block(f, vbasedev->migration);
+        if (data_size < 0) {
+            return data_size;
        }
-    } while (!ret);
+    } while (data_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);
@@ -439,6 +590,24 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
            }
            break;
        }
+        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
+        {
+            if (!vfio_precopy_supported(vbasedev) ||
+                !migrate_switchover_ack()) {
+                error_report("%s: Received INIT_DATA_SENT but switchover ack "
+                             "is not used", vbasedev->name);
+                return -EINVAL;
+            }
+
+            ret = qemu_loadvm_approve_switchover();
+            if (ret) {
+                error_report(
+                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
+                    vbasedev->name, ret, strerror(-ret));
+            }
+
+            return ret;
+        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
@@ -453,15 +622,26 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
    return ret;
 }

+static bool vfio_switchover_ack_needed(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+
+    return vfio_precopy_supported(vbasedev);
+}
+
 static const SaveVMHandlers savevm_vfio_handlers = {
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
+    .state_pending_estimate = vfio_state_pending_estimate,
    .state_pending_exact = vfio_state_pending_exact,
+    .is_active_iterate = vfio_is_active_iterate,
+    .save_live_iterate = vfio_save_iterate,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
+    .switchover_ack_needed = vfio_switchover_ack_needed,
 };

 /* ---------------------------------------------------------------------- */
@@ -469,13 +649,18 @@ static const SaveVMHandlers savevm_vfio_handlers = {
 static void vfio_vmstate_change(void *opaque, bool running, RunState state)
 {
    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state new_state;
    int ret;

    if (running) {
        new_state = VFIO_DEVICE_STATE_RUNNING;
    } else {
-        new_state = VFIO_DEVICE_STATE_STOP;
+        new_state =
+            (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY &&
+             (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
+                VFIO_DEVICE_STATE_STOP_COPY :
+                VFIO_DEVICE_STATE_STOP;
    }

    /*
@@ -512,7 +697,6 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_FAILED:
-        bytes_transferred = 0;
        /*
         * If setting the device in RUNNING state fails, the device should
         * be reset. To do so, use ERROR state as a recover state.
@@ -540,14 +724,6 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
-        if (errno == ENOTTY) {
-            error_report("%s: VFIO migration is not supported in kernel",
-                         vbasedev->name);
-        } else {
-            error_report("%s: Failed to query VFIO migration support, err: %s",
-                         vbasedev->name, strerror(errno));
-        }
-
        return -errno;
    }

@@ -602,6 +778,7 @@ static int vfio_migration_init(VFIODevice *vbasedev)
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;
+    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

@@ -625,6 +802,27 @@ static int vfio_migration_init(VFIODevice *vbasedev)
    return 0;
 }

+static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
+{
+    int ret;
+
+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_propagate(errp, err);
+        return -EINVAL;
+    }
+
+    vbasedev->migration_blocker = error_copy(err);
+    error_free(err);
+
+    ret = migrate_add_blocker(vbasedev->migration_blocker, errp);
+    if (ret < 0) {
+        error_free(vbasedev->migration_blocker);
+        vbasedev->migration_blocker = NULL;
+    }
+
+    return ret;
+}
+
 /* ---------------------------------------------------------------------- */

 int64_t vfio_mig_bytes_transferred(void)
@@ -632,42 +830,61 @@ int64_t vfio_mig_bytes_transferred(void)
    return bytes_transferred;
 }

+void vfio_reset_bytes_transferred(void)
+{
+    bytes_transferred = 0;
+}
+
 int vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
 {
-    int ret = -ENOTSUP;
+    Error *err = NULL;
+    int ret;

-    if (!vbasedev->enable_migration) {
-        goto add_blocker;
+    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
+        error_setg(&err, "%s: Migration is disabled for VFIO device",
+                   vbasedev->name);
+        return vfio_block_migration(vbasedev, err, errp);
    }

    ret = vfio_migration_init(vbasedev);
    if (ret) {
-        goto add_blocker;
+        if (ret == -ENOTTY) {
+            error_setg(&err, "%s: VFIO migration is not supported in kernel",
+                       vbasedev->name);
+        } else {
+            error_setg(&err,
+                       "%s: Migration couldn't be initialized for VFIO device, "
+                       "err: %d (%s)",
+                       vbasedev->name, ret, strerror(-ret));
+        }
+
+        return vfio_block_migration(vbasedev, err, errp);
    }

-    ret = vfio_block_multiple_devices_migration(errp);
+    if (!vbasedev->dirty_pages_supported) {
+        if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
+            error_setg(&err,
+                       "%s: VFIO device doesn't support device dirty tracking",
+                       vbasedev->name);
+            return vfio_block_migration(vbasedev, err, errp);
+        }
+
+        warn_report("%s: VFIO device doesn't support device dirty tracking",
+                    vbasedev->name);
+    }
+
+    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
    if (ret) {
        return ret;
    }

-    ret = vfio_block_giommu_migration(errp);
+    ret = vfio_block_giommu_migration(vbasedev, errp);
    if (ret) {
        return ret;
    }

-    trace_vfio_migration_probe(vbasedev->name);
+    trace_vfio_migration_realize(vbasedev->name);
    return 0;
-
-add_blocker:
-    error_setg(&vbasedev->migration_blocker,
-               "VFIO device doesn't support migration");
-
-    ret = migrate_add_blocker(vbasedev->migration_blocker, errp);
-    if (ret < 0) {
-        error_free(vbasedev->migration_blocker);
-        vbasedev->migration_blocker = NULL;
-    }
-    return ret;
 }

 void vfio_migration_exit(VFIODevice *vbasedev)
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -1490,6 +1490,9 @@ void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
 * +---------------------------------+---------------------------------+
 *
 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
+ *
+ * Specification for Turning and later GPU architectures:
+ * https://lists.gnu.org/archive/html/qemu-devel/2023-06/pdf142OR4O4c2.pdf
 */
 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
                                       const char *name, void *opaque,
@@ -1530,7 +1533,9 @@ const PropertyInfo qdev_prop_nv_gpudirect_clique = {
 static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
 {
    PCIDevice *pdev = &vdev->pdev;
-    int ret, pos = 0xC8;
+    int ret, pos;
+    bool c8_conflict = false, d4_conflict = false;
+    uint8_t tmp;

    if (vdev->nv_gpudirect_clique == 0xFF) {
        return 0;
@@ -1547,6 +1552,40 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
        return -EINVAL;
    }

+    /*
+     * Per the updated specification above, it's recommended to use offset
+     * D4h for Turing and later GPU architectures due to a conflict of the
+     * MSI-X capability at C8h.  We don't know how to determine the GPU
+     * architecture, instead we walk the capability chain to mark conflicts
+     * and choose one or error based on the result.
+     *
+     * NB. Cap list head in pdev->config is already cleared, read from device.
+     */
+    ret = pread(vdev->vbasedev.fd, &tmp, 1,
+                vdev->config_offset + PCI_CAPABILITY_LIST);
+    if (ret != 1 || !tmp) {
+        error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list");
+        return -EINVAL;
+    }
+
+    do {
+        if (tmp == 0xC8) {
+            c8_conflict = true;
+        } else if (tmp == 0xD4) {
+            d4_conflict = true;
+        }
+        tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT];
+    } while (tmp);
+
+    if (!c8_conflict) {
+        pos = 0xC8;
+    } else if (!d4_conflict) {
+        pos = 0xD4;
+    } else {
+        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid config space");
+        return -EINVAL;
+    }
+
    ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -663,6 +663,8 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)

    vfio_disable_interrupts(vdev);

+    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
+retry:
    /*
     * Setting vector notifiers needs to enable route for each vector.
     * Deferring to commit the KVM routes once rather than per vector
@@ -670,8 +672,6 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)
     */
    vfio_prepare_kvm_msi_virq_batch(vdev);

-    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
-retry:
    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);

    for (i = 0; i < vdev->nr_vectors; i++) {
@@ -3221,7 +3221,12 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)

 out_deregister:
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
-    kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+    if (vdev->irqchip_change_notifier.notify) {
+        kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+    }
+    if (vdev->intx.mmap_timer) {
+        timer_free(vdev->intx.mmap_timer);
+    }
 out_teardown:
    vfio_teardown_msi(vdev);
    vfio_bars_exit(vdev);
@@ -3347,8 +3352,8 @@ static Property vfio_pci_dev_properties[] = {
                    VFIO_FEATURE_ENABLE_REQ_BIT, true),
    DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
                    VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
-    DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice,
-                     vbasedev.enable_migration, false),
+    DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
+                            vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
    DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
    DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
                     vbasedev.ram_block_discard_allowed, false),
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -155,13 +155,15 @@ vfio_load_cleanup(const char *name) " (%s)"
 vfio_load_device_config_state(const char *name) " (%s)"
 vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
 vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size 0x%"PRIx64" ret %d"
-vfio_migration_probe(const char *name) " (%s)"
+vfio_migration_realize(const char *name) " (%s)"
 vfio_migration_set_state(const char *name, const char *state) " (%s) state %s"
 vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s"
 vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
 vfio_save_cleanup(const char *name) " (%s)"
 vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
 vfio_save_device_config_state(const char *name) " (%s)"
+vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
 vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size 0x%"PRIx64
-vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64
+vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
+vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
 vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1444,7 +1444,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
-        .priority = 10
+        .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND
    };

    hdev->iommu_listener = (MemoryListener) {
--- a/hw/xen/xen-hvm-common.c
+++ b/hw/xen/xen-hvm-common.c
@@ -155,7 +155,7 @@ MemoryListener xen_io_listener = {
    .name = "xen-io",
    .region_add = xen_io_add,
    .region_del = xen_io_del,
-    .priority = 10,
+    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };

 DeviceListener xen_device_listener = {
--- a/hw/xen/xen_pt.c
+++ b/hw/xen/xen_pt.c
@@ -691,14 +691,14 @@ static const MemoryListener xen_pt_memory_listener = {
    .name = "xen-pt-mem",
    .region_add = xen_pt_region_add,
    .region_del = xen_pt_region_del,
-    .priority = 10,
+    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };

 static const MemoryListener xen_pt_io_listener = {
    .name = "xen-pt-io",
    .region_add = xen_pt_io_region_add,
    .region_del = xen_pt_io_region_del,
-    .priority = 10,
+    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };

 /* destroy. */
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -224,6 +224,13 @@ bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent event);
 void co_wrapper_mixed_bdrv_rdlock
 bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event);

+#define BLKDBG_CO_EVENT(child, evt) \
+    do { \
+        if (child) { \
+            bdrv_co_debug_event(child->bs, evt); \
+        } \
+    } while (0)
+
 #define BLKDBG_EVENT(child, evt) \
    do { \
        if (child) { \
--- a/include/block/graph-lock.h
+++ b/include/block/graph-lock.h
@@ -111,10 +111,12 @@ void unregister_aiocontext(AioContext *ctx);
 * The wrlock can only be taken from the main loop, with BQL held, as only the
 * main loop is allowed to modify the graph.
 *
+ * If @bs is non-NULL, its AioContext is temporarily released.
+ *
 * This function polls. Callers must not hold the lock of any AioContext other
- * than the current one.
+ * than the current one and the one of @bs.
 */
-void bdrv_graph_wrlock(void) TSA_ACQUIRE(graph_lock) TSA_NO_TSA;
+void bdrv_graph_wrlock(BlockDriverState *bs) TSA_ACQUIRE(graph_lock) TSA_NO_TSA;

 /*
 * bdrv_graph_wrunlock:
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -811,6 +811,10 @@ struct IOMMUMemoryRegion {
 #define IOMMU_NOTIFIER_FOREACH(n, mr) \
    QLIST_FOREACH((n), &(mr)->iommu_notify, node)

+#define MEMORY_LISTENER_PRIORITY_MIN            0
+#define MEMORY_LISTENER_PRIORITY_ACCEL          10
+#define MEMORY_LISTENER_PRIORITY_DEV_BACKEND    10
+
 /**
 * struct MemoryListener: callbacks structure for updates to the physical memory map
 *
--- a/include/hw/char/escc.h
+++ b/include/hw/char/escc.h
@@ -45,6 +45,7 @@ typedef struct ESCCChannelState {
    ESCCChnType type;
    uint8_t rx, tx;
    QemuInputHandlerState *hs;
+    char *sunkbd_layout;
 } ESCCChannelState;

 struct ESCCState {
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -241,9 +241,6 @@ typedef struct SavedIOTLB {
 struct KVMState;
 struct kvm_run;

-struct hax_vcpu_state;
-struct hvf_vcpu_state;
-
 /* work queue */

 /* The union type allows passing of 64 bit target pointers on 32 bit
@@ -309,6 +306,7 @@ struct qemu_work_item;
 * @next_cpu: Next CPU sharing TB cache.
 * @opaque: User data.
 * @mem_io_pc: Host Program Counter at which the memory was accessed.
+ * @accel: Pointer to accelerator specific state.
 * @kvm_fd: vCPU file descriptor for KVM.
 * @work_mutex: Lock to prevent multiple access to @work_list.
 * @work_list: List of pending asynchronous work.
@@ -338,7 +336,6 @@ struct CPUState {

    struct QemuThread *thread;
 #ifdef _WIN32
-    HANDLE hThread;
    QemuSemaphore sem;
 #endif
    int thread_id;
@@ -424,6 +421,7 @@ struct CPUState {
    uint32_t can_do_io;
    int32_t exception_index;

+    AccelCPUState *accel;
    /* shared by kvm, hax and hvf */
    bool vcpu_dirty;

@@ -443,10 +441,6 @@ struct CPUState {
    /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
    bool prctl_unalign_sigbus;

-    struct hax_vcpu_state *hax_vcpu;
-
-    struct hvf_vcpu_state *hvf;
-
    /* track IOMMUs whose translations we've cached in the TCG TLB */
    GArray *iommu_notifiers;
 };
--- a/include/hw/intc/arm_gic.h
+++ b/include/hw/intc/arm_gic.h
@@ -86,4 +86,6 @@ struct ARMGICClass {
    DeviceRealize parent_realize;
 };

+const char *gic_class_name(void);
+
 #endif
--- a/include/hw/intc/arm_gicv3_common.h
+++ b/include/hw/intc/arm_gicv3_common.h
@@ -329,4 +329,14 @@ struct ARMGICv3CommonClass {
 void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler,
                              const MemoryRegionOps *ops);

+/**
+ * gicv3_class_name
+ *
+ * Return name of GICv3 class to use depending on whether KVM acceleration is
+ * in use. May throw an error if the chosen implementation is not available.
+ *
+ * Returns: class name to use
+ */
+const char *gicv3_class_name(void);
+
 #endif
--- a/include/hw/intc/arm_gicv3_its_common.h
+++ b/include/hw/intc/arm_gicv3_its_common.h
@@ -122,5 +122,14 @@ struct GICv3ITSCommonClass {
    void (*post_load)(GICv3ITSState *s);
 };

+/**
+ * its_class_name:
+ *
+ * Return the ITS class name to use depending on whether KVM acceleration
+ * and KVM CAP_SIGNAL_MSI are supported
+ *
+ * Returns: class name to use or NULL
+ */
+const char *its_class_name(void);

 #endif
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -66,6 +66,10 @@ typedef struct VFIOMigration {
    int data_fd;
    void *data_buffer;
    size_t data_buffer_size;
+    uint64_t mig_flags;
+    uint64_t precopy_init_size;
+    uint64_t precopy_dirty_size;
+    bool initial_data_sent;
 } VFIOMigration;

 typedef struct VFIOAddressSpace {
@@ -135,7 +139,7 @@ typedef struct VFIODevice {
    bool needs_reset;
    bool no_mmap;
    bool ram_block_discard_allowed;
-    bool enable_migration;
+    OnOffAuto enable_migration;
    VFIODeviceOps *ops;
    unsigned int num_irqs;
    unsigned int num_regions;
@@ -212,6 +216,7 @@ void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
 void vfio_put_group(VFIOGroup *group);
+struct vfio_device_info *vfio_get_device_info(int fd);
 int vfio_get_device(VFIOGroup *group, const char *name,
                    VFIODevice *vbasedev, Error **errp);

@@ -220,10 +225,11 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
 extern VFIOGroupList vfio_group_list;

 bool vfio_mig_active(void);
-int vfio_block_multiple_devices_migration(Error **errp);
+int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp);
 void vfio_unblock_multiple_devices_migration(void);
-int vfio_block_giommu_migration(Error **errp);
+int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp);
 int64_t vfio_mig_bytes_transferred(void);
+void vfio_reset_bytes_transferred(void);

 #ifdef CONFIG_LINUX
 int vfio_get_region_info(VFIODevice *vbasedev, int index,
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -48,6 +48,9 @@ struct virtio_gpu_simple_resource {
    unsigned int iov_cnt;
    uint32_t scanout_bitmask;
    pixman_image_t *image;
+#ifdef WIN32
+    HANDLE handle;
+#endif
    uint64_t hostmem;

    uint64_t blob_size;
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -71,6 +71,8 @@ typedef struct SaveVMHandlers {
    int (*load_cleanup)(void *opaque);
    /* Called when postcopy migration wants to resume from failure */
    int (*resume_prepare)(MigrationState *s, void *opaque);
+    /* Checks if switchover ack should be used. Called only in dest */
+    bool (*switchover_ack_needed)(void *opaque);
 } SaveVMHandlers;

 int register_savevm_live(const char *idstr,
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -21,6 +21,7 @@
 * Incomplete struct types
 * Please keep this list in case-insensitive alphabetical order.
 */
+typedef struct AccelCPUState AccelCPUState;
 typedef struct AccelState AccelState;
 typedef struct AdapterInfo AdapterInfo;
 typedef struct AddressSpace AddressSpace;
--- a/include/sysemu/hax.h
+++ b/include/sysemu/hax.h
@@ -19,6 +19,8 @@
 *
 */

+/* header to be included in non-HAX-specific code */
+
 #ifndef QEMU_HAX_H
 #define QEMU_HAX_H

--- a/include/sysemu/hvf_int.h
+++ b/include/sysemu/hvf_int.h
@@ -49,7 +49,7 @@ struct HVFState {
 };
 extern HVFState *hvf_state;

-struct hvf_vcpu_state {
+struct AccelCPUState {
    uint64_t fd;
    void *exit;
    bool vtimer_masked;
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -11,9 +11,12 @@
 *
 */

+/* header to be included in non-KVM-specific code */
+
 #ifndef QEMU_KVM_H
 #define QEMU_KVM_H

+#include "exec/memattrs.h"
 #include "qemu/accel.h"
 #include "qom/object.h"

--- a/include/sysemu/nvmm.h
+++ b/include/sysemu/nvmm.h
@@ -7,6 +7,8 @@
 * See the COPYING file in the top-level directory.
 */

+/* header to be included in non-NVMM-specific code */
+
 #ifndef QEMU_NVMM_H
 #define QEMU_NVMM_H

--- a/include/sysemu/os-win32.h
+++ b/include/sysemu/os-win32.h
@@ -263,6 +263,9 @@ EXCEPTION_DISPOSITION
 win32_close_exception_handler(struct _EXCEPTION_RECORD*, void*,
                              struct _CONTEXT*, void*);

+void *qemu_win32_map_alloc(size_t size, HANDLE *h, Error **errp);
+void qemu_win32_map_free(void *ptr, HANDLE h, Error **errp);
+
 #ifdef __cplusplus
 }
 #endif
--- a/include/sysemu/tcg.h
+++ b/include/sysemu/tcg.h
@@ -5,6 +5,8 @@
 * See the COPYING file in the top-level directory.
 */

+/* header to be included in non-TCG-specific code */
+
 #ifndef SYSEMU_TCG_H
 #define SYSEMU_TCG_H

--- a/include/sysemu/whpx.h
+++ b/include/sysemu/whpx.h
@@ -10,6 +10,8 @@
 *
 */

+/* header to be included in non-WHPX-specific code */
+
 #ifndef QEMU_WHPX_H
 #define QEMU_WHPX_H

--- a/include/sysemu/xen.h
+++ b/include/sysemu/xen.h
@@ -5,6 +5,8 @@
 * See the COPYING file in the top-level directory.
 */

+/* header to be included in non-Xen-specific code */
+
 #ifndef SYSEMU_XEN_H
 #define SYSEMU_XEN_H

--- a/include/ui/console.h
+++ b/include/ui/console.h
@@ -5,6 +5,7 @@
 #include "qom/object.h"
 #include "qemu/notify.h"
 #include "qapi/qapi-types-ui.h"
+#include "ui/input.h"

 #ifdef CONFIG_OPENGL
 # include <epoxy/gl.h>
@@ -95,6 +96,20 @@ bool kbd_put_qcode_console(QemuConsole *s, int qcode, bool ctrl);
 void kbd_put_string_console(QemuConsole *s, const char *str, int len);
 void kbd_put_keysym(int keysym);

+/* Touch devices */
+typedef struct touch_slot {
+    int x;
+    int y;
+    int tracking_id;
+} touch_slot;
+
+void console_handle_touch_event(QemuConsole *con,
+                                struct touch_slot touch_slots[INPUT_EVENT_SLOTS_MAX],
+                                uint64_t num_slot,
+                                int width, int height,
+                                double x, double y,
+                                InputMultiTouchType type,
+                                Error **errp);
 /* consoles */

 #define TYPE_QEMU_CONSOLE "qemu-console"
@@ -117,6 +132,7 @@ typedef struct ScanoutTexture {
    uint32_t y;
    uint32_t width;
    uint32_t height;
+    void *d3d_tex2d;
 } ScanoutTexture;

 typedef struct DisplaySurface {
@@ -128,6 +144,10 @@ typedef struct DisplaySurface {
    GLenum gltype;
    GLuint texture;
 #endif
+#ifdef WIN32
+    HANDLE handle;
+    uint32_t handle_offset;
+#endif
 } DisplaySurface;

 typedef struct QemuUIInfo {
@@ -251,7 +271,8 @@ typedef struct DisplayChangeListenerOps {
                                   uint32_t backing_width,
                                   uint32_t backing_height,
                                   uint32_t x, uint32_t y,
-                                   uint32_t w, uint32_t h);
+                                   uint32_t w, uint32_t h,
+                                   void *d3d_tex2d);
    /* optional (default to true if has dpy_gl_scanout_dmabuf) */
    bool (*dpy_has_dmabuf)(DisplayChangeListener *dcl);
    /* optional */
@@ -314,6 +335,10 @@ DisplaySurface *qemu_create_displaysurface_from(int width, int height,
 DisplaySurface *qemu_create_displaysurface_pixman(pixman_image_t *image);
 DisplaySurface *qemu_create_placeholder_surface(int w, int h,
                                                const char *msg);
+#ifdef WIN32
+void qemu_displaysurface_win32_set_handle(DisplaySurface *surface,
+                                          HANDLE h, uint32_t offset);
+#endif
 PixelFormat qemu_default_pixelformat(int bpp);

 DisplaySurface *qemu_create_displaysurface(int width, int height);
@@ -355,7 +380,8 @@ void dpy_gl_scanout_disable(QemuConsole *con);
 void dpy_gl_scanout_texture(QemuConsole *con,
                            uint32_t backing_id, bool backing_y_0_top,
                            uint32_t backing_width, uint32_t backing_height,
-                            uint32_t x, uint32_t y, uint32_t w, uint32_t h);
+                            uint32_t x, uint32_t y, uint32_t w, uint32_t h,
+                            void *d3d_tex2d);
 void dpy_gl_scanout_dmabuf(QemuConsole *con,
                           QemuDmaBuf *dmabuf);
 void dpy_gl_cursor_dmabuf(QemuConsole *con, QemuDmaBuf *dmabuf,
--- a/include/ui/egl-helpers.h
+++ b/include/ui/egl-helpers.h
@@ -12,6 +12,7 @@
 extern EGLDisplay *qemu_egl_display;
 extern EGLConfig qemu_egl_config;
 extern DisplayGLMode qemu_egl_mode;
+extern bool qemu_egl_angle_d3d;

 typedef struct egl_fb {
    int width;
@@ -31,16 +32,18 @@ void egl_fb_setup_for_tex(egl_fb *fb, int width, int height,
 void egl_fb_setup_new_tex(egl_fb *fb, int width, int height);
 void egl_fb_blit(egl_fb *dst, egl_fb *src, bool flip);
 void egl_fb_read(DisplaySurface *dst, egl_fb *src);
+void egl_fb_read_rect(DisplaySurface *dst, egl_fb *src, int x, int y, int w, int h);

 void egl_texture_blit(QemuGLShader *gls, egl_fb *dst, egl_fb *src, bool flip);
 void egl_texture_blend(QemuGLShader *gls, egl_fb *dst, egl_fb *src, bool flip,
                       int x, int y, double scale_x, double scale_y);

+extern EGLContext qemu_egl_rn_ctx;
+
 #ifdef CONFIG_GBM

 extern int qemu_egl_rn_fd;
 extern struct gbm_device *qemu_egl_rn_gbm_dev;
-extern EGLContext qemu_egl_rn_ctx;

 int egl_rendernode_init(const char *rendernode, DisplayGLMode mode);
 int egl_get_fd_for_texture(uint32_t tex_id, EGLint *stride, EGLint *fourcc,
@@ -62,9 +65,15 @@ int qemu_egl_init_dpy_mesa(EGLNativeDisplayType dpy, DisplayGLMode mode);

 #endif

+#ifdef WIN32
+int qemu_egl_init_dpy_win32(EGLNativeDisplayType dpy, DisplayGLMode mode);
+#endif
+
 EGLContext qemu_egl_init_ctx(void);
 bool qemu_egl_has_dmabuf(void);

 bool egl_init(const char *rendernode, DisplayGLMode mode, Error **errp);

+const char *qemu_egl_get_error_string(void);
+
 #endif /* EGL_HELPERS_H */
--- a/include/ui/gtk.h
+++ b/include/ui/gtk.h
@@ -175,7 +175,8 @@ void gd_egl_scanout_texture(DisplayChangeListener *dcl,
                            uint32_t backing_width,
                            uint32_t backing_height,
                            uint32_t x, uint32_t y,
-                            uint32_t w, uint32_t h);
+                            uint32_t w, uint32_t h,
+                            void *d3d_tex2d);
 void gd_egl_scanout_dmabuf(DisplayChangeListener *dcl,
                           QemuDmaBuf *dmabuf);
 void gd_egl_cursor_dmabuf(DisplayChangeListener *dcl,
@@ -211,7 +212,8 @@ void gd_gl_area_scanout_texture(DisplayChangeListener *dcl,
                                uint32_t backing_width,
                                uint32_t backing_height,
                                uint32_t x, uint32_t y,
-                                uint32_t w, uint32_t h);
+                                uint32_t w, uint32_t h,
+                                void *d3d_tex2d);
 void gd_gl_area_scanout_disable(DisplayChangeListener *dcl);
 void gd_gl_area_scanout_flush(DisplayChangeListener *dcl,
                              uint32_t x, uint32_t y, uint32_t w, uint32_t h);
--- a/include/ui/sdl2.h
+++ b/include/ui/sdl2.h
@@ -90,7 +90,8 @@ void sdl2_gl_scanout_texture(DisplayChangeListener *dcl,
                             uint32_t backing_width,
                             uint32_t backing_height,
                             uint32_t x, uint32_t y,
-                             uint32_t w, uint32_t h);
+                             uint32_t w, uint32_t h,
+                             void *d3d_tex2d);
 void sdl2_gl_scanout_flush(DisplayChangeListener *dcl,
                           uint32_t x, uint32_t y, uint32_t w, uint32_t h);

--- a/linux-user/i386/cpu_loop.c
+++ b/linux-user/i386/cpu_loop.c
@@ -47,7 +47,7 @@ static void write_dt(void *ptr, unsigned long addr, unsigned long limit,
 }

 static uint64_t *idt_table;
-#ifdef TARGET_X86_64
+
 static void set_gate64(void *ptr, unsigned int type, unsigned int dpl,
                       uint64_t addr, unsigned int sel)
 {
@@ -60,8 +60,10 @@ static void set_gate64(void *ptr, unsigned int type, unsigned int dpl,
    p[2] = tswap32(addr >> 32);
    p[3] = 0;
 }
+
+#ifdef TARGET_X86_64
 /* only dpl matters as we do only user space emulation */
-static void set_idt(int n, unsigned int dpl)
+static void set_idt(int n, unsigned int dpl, bool is64)
 {
    set_gate64(idt_table + n * 2, 0, dpl, 0, 0);
 }
@@ -78,9 +80,13 @@ static void set_gate(void *ptr, unsigned int type, unsigned int dpl,
 }

 /* only dpl matters as we do only user space emulation */
-static void set_idt(int n, unsigned int dpl)
+static void set_idt(int n, unsigned int dpl, bool is64)
 {
-    set_gate(idt_table + n, 0, dpl, 0, 0);
+    if (is64) {
+        set_gate64(idt_table + n * 2, 0, dpl, 0, 0);
+    } else {
+        set_gate(idt_table + n, 0, dpl, 0, 0);
+    }
 }
 #endif

@@ -325,6 +331,9 @@ static void target_cpu_free(void *obj)
 void target_cpu_copy_regs(CPUArchState *env, struct target_pt_regs *regs)
 {
    CPUState *cpu = env_cpu(env);
+    bool is64 = (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) != 0;
+    int i;
+
    OBJECT(cpu)->free = target_cpu_free;
    env->cr[0] = CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK;
    env->hflags |= HF_PE_MASK | HF_CPL_MASK;
@@ -332,15 +341,18 @@ void target_cpu_copy_regs(CPUArchState *env, struct target_pt_regs *regs)
        env->cr[4] |= CR4_OSFXSR_MASK;
        env->hflags |= HF_OSFXSR_MASK;
    }
-#ifndef TARGET_ABI32
+
    /* enable 64 bit mode if possible */
-    if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM)) {
+    if (is64) {
+        env->cr[4] |= CR4_PAE_MASK;
+        env->efer |= MSR_EFER_LMA | MSR_EFER_LME;
+        env->hflags |= HF_LMA_MASK;
+    }
+#ifndef TARGET_ABI32
+    else {
        fprintf(stderr, "The selected x86 CPU does not support 64 bit mode\n");
        exit(EXIT_FAILURE);
    }
-    env->cr[4] |= CR4_PAE_MASK;
-    env->efer |= MSR_EFER_LMA | MSR_EFER_LME;
-    env->hflags |= HF_LMA_MASK;
 #endif

    /* flags setup : we activate the IRQs by default as in user mode */
@@ -379,27 +391,12 @@ void target_cpu_copy_regs(CPUArchState *env, struct target_pt_regs *regs)
                                PROT_READ|PROT_WRITE,
                                MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
    idt_table = g2h_untagged(env->idt.base);
-    set_idt(0, 0);
-    set_idt(1, 0);
-    set_idt(2, 0);
-    set_idt(3, 3);
-    set_idt(4, 3);
-    set_idt(5, 0);
-    set_idt(6, 0);
-    set_idt(7, 0);
-    set_idt(8, 0);
-    set_idt(9, 0);
-    set_idt(10, 0);
-    set_idt(11, 0);
-    set_idt(12, 0);
-    set_idt(13, 0);
-    set_idt(14, 0);
-    set_idt(15, 0);
-    set_idt(16, 0);
-    set_idt(17, 0);
-    set_idt(18, 0);
-    set_idt(19, 0);
-    set_idt(0x80, 3);
+    for (i = 0; i < 20; i++) {
+        set_idt(i, 0, is64);
+    }
+    set_idt(3, 3, is64);
+    set_idt(4, 3, is64);
+    set_idt(0x80, 3, is64);

    /* linux segment setup */
    {
--- a/meson.build
+++ b/meson.build
@@ -661,8 +661,8 @@ endif
 if get_option('whpx').allowed() and targetos == 'windows'
  if get_option('whpx').enabled() and host_machine.cpu() != 'x86_64'
    error('WHPX requires 64-bit host')
-  elif cc.has_header('WinHvPlatform.h', required: get_option('whpx')) and \
-       cc.has_header('WinHvEmulation.h', required: get_option('whpx'))
+  elif cc.has_header('winhvplatform.h', required: get_option('whpx')) and \
+       cc.has_header('winhvemulation.h', required: get_option('whpx'))
    accelerators += 'CONFIG_WHPX'
  endif
 endif
@@ -838,6 +838,8 @@ if gdbus_codegen.found() and get_option('cfi')
  gdbus_codegen_error = '@0@ uses gdbus-codegen, which does not support control flow integrity'
 endif

+xml_pp = find_program('scripts/xml-preprocess.py')
+
 lttng = not_found
 if 'ust' in get_option('trace_backends')
  lttng = dependency('lttng-ust', required: true, version: '>= 2.1',
@@ -1070,6 +1072,12 @@ if not get_option('virglrenderer').auto() or have_system or have_vhost_user_gpu
  virgl = dependency('virglrenderer',
                     method: 'pkg-config',
                     required: get_option('virglrenderer'))
+  if virgl.found()
+    config_host_data.set('HAVE_VIRGL_D3D_INFO_EXT',
+                         cc.has_member('struct virgl_renderer_resource_info_ext', 'd3d_tex2d',
+                                       prefix: '#include <virglrenderer.h>',
+                                       dependencies: virgl))
+  endif
 endif
 blkio = not_found
 if not get_option('blkio').auto() or have_block
@@ -1985,8 +1993,6 @@ dbus_display = get_option('dbus_display') \
           error_message: '-display dbus requires glib>=2.64') \
  .require(gdbus_codegen.found(),
           error_message: gdbus_codegen_error.format('-display dbus')) \
-  .require(targetos != 'windows',
-           error_message: '-display dbus is not available on Windows') \
  .allowed()

 have_virtfs = get_option('virtfs') \
--- a/migration/file.c
+++ b/migration/file.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021-2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "channel.h"
+#include "file.h"
+#include "migration.h"
+#include "io/channel-file.h"
+#include "io/channel-util.h"
+#include "trace.h"
+
+#define OFFSET_OPTION ",offset="
+
+/* Remove the offset option from @filespec and return it in @offsetp. */
+
+static int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp)
+{
+    char *option = strstr(filespec, OFFSET_OPTION);
+    int ret;
+
+    if (option) {
+        *option = 0;
+        option += sizeof(OFFSET_OPTION) - 1;
+        ret = qemu_strtosz(option, NULL, offsetp);
+        if (ret) {
+            error_setg_errno(errp, -ret, "file URI has bad offset %s", option);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+void file_start_outgoing_migration(MigrationState *s, const char *filespec,
+                                   Error **errp)
+{
+    g_autofree char *filename = g_strdup(filespec);
+    g_autoptr(QIOChannelFile) fioc = NULL;
+    uint64_t offset = 0;
+    QIOChannel *ioc;
+
+    trace_migration_file_outgoing(filename);
+
+    if (file_parse_offset(filename, &offset, errp)) {
+        return;
+    }
+
+    fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | O_TRUNC,
+                                     0600, errp);
+    if (!fioc) {
+        return;
+    }
+
+    ioc = QIO_CHANNEL(fioc);
+    if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) {
+        return;
+    }
+    qio_channel_set_name(ioc, "migration-file-outgoing");
+    migration_channel_connect(s, ioc, NULL, NULL);
+}
+
+static gboolean file_accept_incoming_migration(QIOChannel *ioc,
+                                               GIOCondition condition,
+                                               gpointer opaque)
+{
+    migration_channel_process_incoming(ioc);
+    object_unref(OBJECT(ioc));
+    return G_SOURCE_REMOVE;
+}
+
+void file_start_incoming_migration(const char *filespec, Error **errp)
+{
+    g_autofree char *filename = g_strdup(filespec);
+    QIOChannelFile *fioc = NULL;
+    uint64_t offset = 0;
+    QIOChannel *ioc;
+
+    trace_migration_file_incoming(filename);
+
+    if (file_parse_offset(filename, &offset, errp)) {
+        return;
+    }
+
+    fioc = qio_channel_file_new_path(filename, O_RDONLY, 0, errp);
+    if (!fioc) {
+        return;
+    }
+
+    ioc = QIO_CHANNEL(fioc);
+    if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) {
+        return;
+    }
+    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-file-incoming");
+    qio_channel_add_watch_full(ioc, G_IO_IN,
+                               file_accept_incoming_migration,
+                               NULL, NULL,
+                               g_main_context_get_thread_default());
+}
--- a/migration/file.h
+++ b/migration/file.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2021-2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_MIGRATION_FILE_H
+#define QEMU_MIGRATION_FILE_H
+void file_start_incoming_migration(const char *filename, Error **errp);
+
+void file_start_outgoing_migration(MigrationState *s, const char *filename,
+                                   Error **errp);
+#endif
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -16,6 +16,7 @@ system_ss.add(files(
  'dirtyrate.c',
  'exec.c',
  'fd.c',
+  'file.c',
  'global_state.c',
  'migration-hmp-cmds.c',
  'migration.c',
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -20,6 +20,7 @@
 #include "migration/blocker.h"
 #include "exec.h"
 #include "fd.h"
+#include "file.h"
 #include "socket.h"
 #include "sysemu/runstate.h"
 #include "sysemu/sysemu.h"
@@ -78,6 +79,7 @@ enum mig_rp_message_type {
    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */
+    MIG_RP_MSG_SWITCHOVER_ACK, /* Tell source it's OK to do switchover */

    MIG_RP_MSG_MAX
 };
@@ -423,13 +425,16 @@ void migrate_add_address(SocketAddress *address)
 static void qemu_start_incoming_migration(const char *uri, Error **errp)
 {
    const char *p = NULL;
+    MigrationIncomingState *mis = migration_incoming_get_current();

    /* URI is not suitable for migration? */
    if (!migration_channels_and_uri_compatible(uri, errp)) {
        return;
    }

-    qapi_event_send_migration(MIGRATION_STATUS_SETUP);
+    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
+                      MIGRATION_STATUS_SETUP);
+
    if (strstart(uri, "tcp:", &p) ||
        strstart(uri, "unix:", NULL) ||
        strstart(uri, "vsock:", NULL)) {
@@ -442,6 +447,8 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp)
        exec_start_incoming_migration(p, errp);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_incoming_migration(p, errp);
+    } else if (strstart(uri, "file:", &p)) {
+        file_start_incoming_migration(p, errp);
    } else {
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
@@ -521,7 +528,7 @@ process_incoming_migration_co(void *opaque)

    mis->largest_page_size = qemu_ram_pagesize_largest();
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
-    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
+    migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
                      MIGRATION_STATUS_ACTIVE);

    mis->loadvm_co = qemu_coroutine_self();
@@ -760,6 +767,11 @@ bool migration_has_all_channels(void)
    return true;
 }

+int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
+{
+    return migrate_send_rp_message(mis, MIG_RP_MSG_SWITCHOVER_ACK, 0, NULL);
+}
+
 /*
 * Send a 'SHUT' message on the return channel with the given value
 * to indicate that we've finished with the RP.  Non-0 value indicates
@@ -1405,6 +1417,7 @@ void migrate_init(MigrationState *s)
    s->vm_old_state = -1;
    s->iteration_initial_bytes = 0;
    s->threshold_size = 0;
+    s->switchover_acked = false;
 }

 int migrate_add_blocker_internal(Error *reason, Error **errp)
@@ -1621,6 +1634,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
     */
    memset(&mig_stats, 0, sizeof(mig_stats));
    memset(&compression_counters, 0, sizeof(compression_counters));
+    reset_vfio_bytes_transferred();

    return true;
 }
@@ -1662,6 +1676,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
        exec_start_outgoing_migration(s, p, &local_err);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_outgoing_migration(s, p, &local_err);
+    } else if (strstart(uri, "file:", &p)) {
+        file_start_outgoing_migration(s, p, &local_err);
    } else {
        if (!(has_resume && resume)) {
            yank_unregister_instance(MIGRATION_YANK_INSTANCE);
@@ -1721,6 +1737,7 @@ static struct rp_cmd_args {
    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
    [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
    [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
+    [MIG_RP_MSG_SWITCHOVER_ACK] = { .len =  0, .name = "SWITCHOVER_ACK" },
    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
 };

@@ -1959,6 +1976,11 @@ retry:
            }
            break;

+        case MIG_RP_MSG_SWITCHOVER_ACK:
+            ms->switchover_acked = true;
+            trace_source_return_path_thread_switchover_acked();
+            break;
+
        default:
            break;
        }
@@ -2693,6 +2715,20 @@ static void migration_update_counters(MigrationState *s,
                              bandwidth, s->threshold_size);
 }

+static bool migration_can_switchover(MigrationState *s)
+{
+    if (!migrate_switchover_ack()) {
+        return true;
+    }
+
+    /* No reason to wait for switchover ACK if VM is stopped */
+    if (!runstate_is_running()) {
+        return true;
+    }
+
+    return s->switchover_acked;
+}
+
 /* Migration thread iteration status */
 typedef enum {
    MIG_ITERATE_RESUME,         /* Resume current iteration */
@@ -2708,6 +2744,7 @@ static MigIterateState migration_iteration_run(MigrationState *s)
 {
    uint64_t must_precopy, can_postcopy;
    bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
+    bool can_switchover = migration_can_switchover(s);

    qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
    uint64_t pending_size = must_precopy + can_postcopy;
@@ -2720,14 +2757,14 @@ static MigIterateState migration_iteration_run(MigrationState *s)
        trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
    }

-    if (!pending_size || pending_size < s->threshold_size) {
+    if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
        trace_migration_thread_low_pending(pending_size);
        migration_completion(s);
        return MIG_ITERATE_BREAK;
    }

    /* Still a significant amount to transfer */
-    if (!in_postcopy && must_precopy <= s->threshold_size &&
+    if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover &&
        qatomic_read(&s->start_postcopy)) {
        if (postcopy_start(s)) {
            error_report("%s: postcopy failed to start", __func__);
@@ -2922,7 +2959,7 @@ static void *migration_thread(void *opaque)
    MigThrError thr_error;
    bool urgent = false;

-    thread = MigrationThreadAdd("live_migration", qemu_get_thread_id());
+    thread = migration_threads_add("live_migration", qemu_get_thread_id());

    rcu_register_thread();

@@ -3000,7 +3037,7 @@ static void *migration_thread(void *opaque)
    migration_iteration_finish(s);
    object_unref(OBJECT(s));
    rcu_unregister_thread();
-    MigrationThreadDel(thread);
+    migration_threads_remove(thread);
    return NULL;
 }

--- a/Show More
+++ b/Show More