Compare commits

222 Commits

Author SHA1 Message Date
Steve Sistare
b12635ff08 migration: fix coverity migrate_mode finding
Coverity diagnoses a possible out-of-range array index here ...

    static GSList *migration_blockers[MIG_MODE__MAX];

    fill_source_migration_info() {
        GSList *cur_blocker = migration_blockers[migrate_mode()];

... because it does not know that MIG_MODE__MAX will never be returned as
a migration mode.  To fix, assert so in migrate_mode().
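A minimal sketch of that fix (assuming migrate_mode() reads the mode
from the current migration parameters):

    MigMode migrate_mode(void)
    {
        MigMode mode = migrate_get_current()->parameters.mode;

        /* tell static analyzers the mode is always a valid index */
        assert(mode >= 0 && mode < MIG_MODE__MAX);
        return mode;
    }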

Fixes: fa3673e497 ("migration: per-mode blockers")

Reported-by: Peter Maydell <peter.maydell@linaro.org>
Suggested-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/1699907025-215450-1-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
3fc58efa93 migration/multifd: Remove unnecessary usage of local Error
According to the Error API, ERRP_GUARD() or a local Error (instead of
errp) is needed if errp is passed to void functions, where it is later
dereferenced to see whether an error occurred.

There are several places in multifd.c that use a local Error although
it is not needed. Change these places to use errp directly.
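For illustration, a minimal sketch of the pattern being removed
(function names are hypothetical):

    /* Before: a local Error that is only ever propagated */
    static void multifd_do_thing(Error **errp)
    {
        Error *local_err = NULL;

        helper(&local_err);
        if (local_err) {
            error_propagate(errp, local_err);
        }
    }

    /* After: errp is never dereferenced here, so pass it through */
    static void multifd_do_thing(Error **errp)
    {
        helper(errp);
    }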

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Link: https://lore.kernel.org/r/20231231093016.14204-12-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
b6f4c0c788 migration: Remove unnecessary usage of local Error
According to the Error API, ERRP_GUARD() or a local Error (instead of
errp) is needed if errp is passed to void functions, where it is later
dereferenced to see whether an error occurred.

There are several places in migration.c that use a local Error although
it is not needed. Change these places to use errp directly.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20231231093016.14204-11-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
4f8cf323e8 migration: Fix migration_channel_read_peek() error path
migration_channel_read_peek() calls qio_channel_readv_full() and handles
both cases of return value == 0 and return value < 0 the same way, by
calling error_setg() with errp. However, if return value < 0, errp is
already set, so calling error_setg() with errp will lead to an assert.

Fix it by handling these cases separately, calling error_setg() with
errp only in the return value == 0 case.
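A sketch of the separated paths (assuming qio_channel_readv_full()
fills errp itself when it fails):

    ret = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
                                 QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
    if (ret < 0) {
        return -1;      /* errp was already set by the channel */
    }
    if (ret == 0) {
        /* EOF: errp is still clear, so it is safe to set it here */
        error_setg(errp, "Failed to peek at channel");
        return -1;
    }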

Fixes: 6720c2b327 ("migration: check magic value for deciding the mapping of channels")
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Link: https://lore.kernel.org/r/20231231093016.14204-10-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
1d3886f837 migration/multifd: Remove error_setg() in migration_ioc_process_incoming()
If multifd_load_setup() fails in migration_ioc_process_incoming(),
error_setg() is called with errp. This will lead to an assert because in
that case errp already contains an error.

Fix it by removing the redundant error_setg().
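The shape of the fix, sketched:

    if (multifd_load_setup(errp) != 0) {
        /* A redundant error_setg(errp, ...) used to sit here; errp is
         * already filled by multifd_load_setup(), so setting it again
         * trips the assertion in error_setv(). Just return. */
        return;
    }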

Fixes: 6720c2b327 ("migration: check magic value for deciding the mapping of channels")
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20231231093016.14204-9-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
6ae208ce96 migration/multifd: Fix leaking of Error in TLS error flow
If there is an error in multifd TLS handshake task,
multifd_tls_outgoing_handshake() retrieves the error with
qio_task_propagate_error() but never frees it.

Fix it by freeing the obtained Error.

In addition, the error is not reported at all, so report it with
migrate_set_error().
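Sketched against the handshake callback (surrounding code elided):

    Error *err = NULL;

    if (qio_task_propagate_error(task, &err)) {
        /* report the handshake failure instead of dropping it... */
        migrate_set_error(migrate_get_current(), err);
        /* ...and free the Error, whose ownership was transferred */
        error_free(err);
    }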

Fixes: 2964714015 ("migration/tls: add support for multifd tls-handshake")
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20231231093016.14204-8-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
a4395f5d3c migration/multifd: Simplify multifd_channel_connect() if else statement
The else branch in multifd_channel_connect() is redundant because when
the if branch is taken the function returns.

Simplify the code by removing the else branch.
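Schematically (helper names are hypothetical):

    /* Before: the else branch is redundant */
    if (use_tls) {
        start_tls_handshake(p);
        return true;
    } else {
        start_plain_connect(p);
    }
    return false;

    /* After */
    if (use_tls) {
        start_tls_handshake(p);
        return true;
    }
    start_plain_connect(p);
    return false;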

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Link: https://lore.kernel.org/r/20231231093016.14204-7-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
c77b40859a migration/multifd: Fix error message in multifd_recv_initial_packet()
In multifd_recv_initial_packet(), if MultiFDInit_t->id is greater than
the configured number of multifd channels, an irrelevant error message
about multifd version is printed.

Change the error message to a relevant one about the channel id.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20231231093016.14204-6-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
b0cf3bfc69 migration: Remove errp parameter in migration_fd_process_incoming()
The errp parameter in migration_fd_process_incoming() is unused.
Remove it.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20231231093016.14204-5-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
c606f3baa2 migration: Refactor migration_incoming_setup()
Commit 6720c2b327 ("migration: check magic value for deciding the
mapping of channels") extracted the only code that could fail in
migration_incoming_setup().

Now migration_incoming_setup() can't fail, so refactor it to return void
and remove the errp parameter.
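The resulting signature change, sketched (the previous return type is
assumed):

    /* Before: could report failure */
    static bool migration_incoming_setup(QEMUFile *f, Error **errp);

    /* After: nothing left that can fail */
    static void migration_incoming_setup(QEMUFile *f);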

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Link: https://lore.kernel.org/r/20231231093016.14204-4-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
80caba6955 migration: Remove nulling of hostname in migrate_init()
MigrationState->hostname is set to NULL in migrate_init(). This is
redundant because it is already freed and set to NULL in
migrate_fd_cleanup(). Remove it.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20231231093016.14204-3-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Avihai Horon
17b9483baa migration: Remove migrate_max_downtime() declaration
migrate_max_downtime() was removed long ago, but its declaration was
mistakenly left behind. Remove it.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20231231093016.14204-2-avihaih@nvidia.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
2b58a8b963 tests/qtest: postcopy migration with suspend
Add a test case to verify that the suspended state is handled correctly by
live migration postcopy.  The test suspends the src, migrates, then wakes
the dest.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-13-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
b1fdd21e8c tests/qtest: precopy migration with suspend
Add a test case to verify that the suspended state is handled correctly
during live migration precopy.  The test suspends the src, migrates, then
wakes the dest.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-12-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
5014478e0d tests/qtest: option to suspend during migration
Add an option to suspend the src in a-b-bootblock.S, which puts the guest
in S3 state after one round of writing to memory.  The option is enabled by
poking a 1 into the suspend_me word in the boot block prior to starting the
src vm.  Generate symbol offsets in a-b-bootblock.h so that the suspend_me
offset is known.  Generate the bootblock for each test, because suspend_me
may differ for each.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/1704312341-66640-11-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
f0649758be tests/qtest: migration events
Define a state object to capture events seen by migration tests, to allow
more events to be captured in a subsequent patch, and simplify event
checking in wait_for_migration_pass.  No functional change.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: "Daniel P. Berrangé" <berrange@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-10-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
49a5020697 migration: preserve suspended for bg_migration
Do not wake a suspended guest during bg_migration, and restore the prior
state at finish rather than unconditionally running.  Allow the additional
state transitions that occur.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-9-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
58b105703e migration: preserve suspended for snapshot
Restoring a snapshot can break a suspended guest.  Snapshots suffer from
the same suspended-state issues that affect live migration, plus they must
handle an additional problematic scenario, which is that a running vm must
remain running if it loads a suspended snapshot.

To save, the existing vm_stop call now completely stops a vm in the
suspended state.  Finish with vm_resume to leave the vm in the state it
had prior to the save, correctly restoring the suspended state.

To load, if the snapshot is not suspended, then vm_stop + vm_resume
correctly handles all states, and leaves the vm in the state it had prior
to the load.  However, if the snapshot is suspended, restoration is
trickier.  First, call vm_resume to restore the state to suspended so the
current state matches the saved state.  Then, if the pre-load state is
running, call wakeup to resume running.

Prior to these changes, the vm_stop to RUN_STATE_SAVE_VM and
RUN_STATE_RESTORE_VM did not change runstate if the current state was
suspended, but now it does, so allow these transitions.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-8-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
b4e9ddccd1 migration: preserve suspended runstate
A guest that is migrated in the suspended state automatically wakes and
continues execution.  This is wrong; the guest should end migration in
the same state it started.  The root cause is that the outgoing migration
code automatically wakes the guest, then saves the RUNNING runstate in
global_state_store(), hence the incoming migration code thinks the guest is
running and continues the guest if autostart is true.

On the outgoing side, delete the call to qemu_system_wakeup_request().
Now that vm_stop completely stops a vm in the suspended state (from the
preceding patches), the existing call to vm_stop_force_state is sufficient
to correctly migrate all vmstate.

On the incoming side, call vm_start if the pre-migration state was running
or suspended.  For the latter, vm_start correctly restores the suspended
state, and a future system_wakeup monitor request will cause the vm to
resume running.
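A sketch of the incoming-side decision (the exact hunk may differ):

    RunState state = global_state_get_runstate();

    if (autostart &&
        (state == RUN_STATE_RUNNING || state == RUN_STATE_SUSPENDED)) {
        /* vm_start() restores SUSPENDED; a later system_wakeup
         * monitor request makes the guest actually run */
        vm_start();
    }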

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-7-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
d3c86c99f3 migration: propagate suspended runstate
If the outgoing machine was previously suspended, propagate that to the
incoming side via global_state, so a subsequent vm_start restores the
suspended state.  To maintain backward and forward compatibility, reclaim
some space from the runstate member.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-6-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
9ff5e79f2e cpus: vm_resume
Define the vm_resume helper, for use in subsequent patches.
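One plausible shape for the helper (a sketch, not necessarily the exact
implementation):

    void vm_resume(RunState state)
    {
        if (state == RUN_STATE_RUNNING) {
            vm_start();
        } else {
            runstate_set(state);
        }
    }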

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
0f1db069b6 cpus: check running not RUN_STATE_RUNNING
When a vm transitions from running to suspended, runstate notifiers are
not called, so the notifiers still think the vm is running.  Hence, when
we call vm_start to restore the suspended state, we call vm_state_notify
with running=1.  However, some notifiers check for RUN_STATE_RUNNING.
They must check the running boolean instead.

No functional change.
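The pattern change in a notifier, sketched (handler name hypothetical):

    static void my_state_change(void *opaque, bool running, RunState state)
    {
        /* Before: checking state == RUN_STATE_RUNNING misses the
         * restore-from-suspend path, where the vcpus run while the
         * runstate is still RUN_STATE_SUSPENDED. */
        if (running) {
            /* ... resume-side work ... */
        }
    }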

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-4-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
b9ae473d80 cpus: stop vm in suspended runstate
Currently, a vm in the suspended state is not completely stopped.  The VCPUs
have been paused, but the cpu clock still runs, and runstate notifiers for
the transition to stopped have not been called.  This causes problems for
live migration.  Stale cpu timers_state is saved to the migration stream,
causing time errors in the guest when it wakes from suspend, and state that
would have been modified by runstate notifiers is wrong.

Modify vm_stop to completely stop the vm if the current state is suspended,
transition to RUN_STATE_PAUSED, and remember that the machine was suspended.
Modify vm_start to restore the suspended state.

This affects all callers of vm_stop and vm_start, notably, the qapi stop and
cont commands:

  old behavior:
    RUN_STATE_SUSPENDED --> stop --> RUN_STATE_SUSPENDED

  new behavior:
    RUN_STATE_SUSPENDED --> stop --> RUN_STATE_PAUSED
    RUN_STATE_PAUSED    --> cont --> RUN_STATE_SUSPENDED

For example:

    (qemu) info status
    VM status: paused (suspended)

    (qemu) stop
    (qemu) info status
    VM status: paused

    (qemu) system_wakeup
    Error: Unable to wake up: guest is not in suspended state

    (qemu) cont
    (qemu) info status
    VM status: paused (suspended)

    (qemu) system_wakeup
    (qemu) info status
    VM status: running

Suggested-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-3-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Steve Sistare
f06f316d3e cpus: vm_was_suspended
Add a state variable to remember if a vm previously transitioned into a
suspended state.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1704312341-66640-2-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Leonardo Bras
5d799717c2 MAINTAINERS: Remove myself as reviewer from Live Migration
I am currently focusing on kernel development, so I will probably not be
of much help in reviewing general Live Migration changes.

For the above reason I am removing my Reviewer status from Migration and
RDMA Migration.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
Link: https://lore.kernel.org/r/20231221170739.332378-1-leobras@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Juan Quintela
e1c2b0e133 MAINTAINERS: Leaving Migration
I am leaving Red Hat, and as part of that I am leaving Migration
maintainership.

You are left in good hands with Peter and Fabiano.

Thanks for all the fish.

Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Bin Meng <bmeng.cn@gmail.com>
Link: https://lore.kernel.org/r/20240102201908.1987-2-quintela@redhat.com
[peterx: prefix the subject]
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-01-04 09:52:42 +08:00
Stefan Hajnoczi
7425b6277f Merge tag 'tracing-pull-request' of https://gitlab.com/stefanha/qemu into staging
Pull request

# -----BEGIN PGP SIGNATURE-----
#
# iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAmWL9k0ACgkQnKSrs4Gr
# c8hf1wgAtQTvIdkZxr4cZgzfgS889H8sqw3ksjQlzsD1eaU3iEV9AMZu9gOScTlB
# n4SBjO4uG9nOUfAGhq5UuwVthynlqcPPJ21HD8XPlLIC1fNEaKVbUx1JcEIIjzdm
# Mg7PPgV4tQaVHwj5HyuYZox9H/r4aQeDZZL+oG8/jbal0RVJPVwCWi4v0viceBCq
# SKFQkWAo/kgFgmRpTbtLAhjKcW6TXbVKz+PGTcLyMdDq+SkIcX4kvDoygAZ5uAn7
# bO6QBZE5gVhqZ32rqAC28bJGC9L9hCD8e38KOpdfCfwVtjn2pSYGZxxSB/bxsruv
# ZlrKGfeNS8KravQZ/jHYE2+CvgTh4w==
# =uzd7
# -----END PGP SIGNATURE-----
# gpg: Signature made Wed 27 Dec 2023 05:02:53 EST
# gpg:                using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [ultimate]
# gpg:                 aka "Stefan Hajnoczi <stefanha@gmail.com>" [ultimate]
# Primary key fingerprint: 8695 A8BF D3F9 7CDA AC35  775A 9CA4 ABB3 81AB 73C8

* tag 'tracing-pull-request' of https://gitlab.com/stefanha/qemu:
  tracing: install trace events file only if necessary

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-27 05:15:32 -05:00
Carlos Santos
5db052306e tracing: install trace events file only if necessary
It is not useful when configuring with --enable-trace-backends=nop.

Signed-off-by: Carlos Santos <casantos@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230408010410.281263-1-casantos@redhat.com>
2023-12-27 05:01:55 -05:00
Stefan Hajnoczi
455f444068 Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into staging
virtio,pc,pci: features, cleanups, fixes

vhost-scsi support for worker ioctls

fixes, cleanups all over the place.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# -----BEGIN PGP SIGNATURE-----
#
# iQFDBAABCAAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAmWKohIPHG1zdEByZWRo
# YXQuY29tAAoJECgfDbjSjVRpG2YH/1rJGV8TQm4V8kcGP9wOknPAMFADnEFdFmrB
# V+JEDnyKrdcEZLPRh0b846peWRJhC13iL7Ks3VNjeVsfE9TyzNyNDpUzCJPfYFjR
# 3m8ChLDvE9tKBA5/hXMIcgDXaYcPIrPvHyl4HG8EQn7oaeMpS2uecKqDpDDvNXGq
# oNamNvqimFSqA+3ChzA+0Qt07Ts7xFEw4OEXSwfRXlsam/dhQG0SI+crRheHuvFb
# HR8EwmNydA1D/M51AuBNuvX36u3SnPWm7Anp5711SZ1b59unshI0ztIqIJnGkvYe
# qpUJSmxR6ulwWe4nQfb+GhBsuJ2j2ORC7YfXyAT7mw8rds8loaI=
# =cNy2
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 26 Dec 2023 04:51:14 EST
# gpg:                using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg:                issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg:                 aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17  0970 C350 3912 AFBE 8E67
#      Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA  8A0D 281F 0DB8 D28D 5469

* tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu: (21 commits)
  vdpa: move memory listener to vhost_vdpa_shared
  vdpa: use dev_shared in vdpa_iommu
  vdpa: use VhostVDPAShared in vdpa_dma_map and unmap
  vdpa: move iommu_list to vhost_vdpa_shared
  vdpa: remove msg type of vhost_vdpa
  vdpa: move backend_cap to vhost_vdpa_shared
  vdpa: move iotlb_batch_begin_sent to vhost_vdpa_shared
  vdpa: move file descriptor to vhost_vdpa_shared
  vdpa: use vdpa shared for tracing
  vdpa: move shadow_data to vhost_vdpa_shared
  vdpa: move iova_range to vhost_vdpa_shared
  vdpa: move iova tree to the shared struct
  vdpa: add VhostVDPAShared
  vdpa: do not set virtio status bits if unneeded
  Fix bugs when VM shutdown with virtio-gpu unplugged
  vhost-scsi: fix usage of error_reportf_err()
  hw/acpi: propagate vcpu hotplug after switch to modern interface
  vhost-scsi: Add support for a worker thread per virtqueue
  vhost: Add worker backend callouts
  tests: bios-tables-test: Rename smbios type 4 related test functions
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-26 06:07:16 -05:00
Stefan Hajnoczi
b0839e87af Merge tag 'dirtylimit-dirtyrate-pull-request-20231225' of https://github.com/newfriday/qemu into staging
dirtylimit dirtyrate pull request 20231225

Nitpick about an unused parameter
Please apply, thanks,
Yong

# -----BEGIN PGP SIGNATURE-----
#
# iQJKBAABCAA0FiEEaF0CINwmSCgVLlfC3/Ij1rP+y5wFAmWJVtkWHHlvbmcuaHVh
# bmdAc21hcnR4LmNvbQAKCRDf8iPWs/7LnLexEADjuQ7dtbv2vq1ksqDq6fct++FK
# sGPe6P9UAyJrB0qxW3jKRn2P4E/xiPHpmi+BbOOu8PppeZt86NFlzun681vMpkKu
# d3oAlwRtX8eklk3oub+yBXzkAfsGr+EHEWms/12ATsd6apItxi47zUUw/2aznujz
# TlZR7qOD1kjvPHOVnP07fQ3NYiPIITpzSqVgo3fxE+upmb+dnELIkyGTAvJY2Pjz
# Uh4CtG/sQq8OJJwUn1TFmhB1Ujw9oo8v/lOtIpsxfcKtQcAwyvgmyfndOMZ/h6O3
# ri3axypg8PyGyidYEh0JSoR3OVUIt5eoqPNp+sQTNeZsjTIwKjpBGUOLXbIA56nB
# Z/ZRbxSKj+JRD9qQZ2NODE1QurgEuT51aWkEeBEsx0OtUINcjZkwWeQmiasJQw1T
# 4z4Hez0ipuET9MmeTbWkVh9rO3Bsxi9QxiXEiN6YPNqTqLhwomXU+MIlrCsZQzjB
# l4Jij9byeWL9ntx3qwtSWJGHV6Qql16UkPo+c2Co/q2rdx/FqiYrMgCtLee1yo5n
# lSCKmrVMLaLyQv7jnLY8txhU7/CXNnwij4YSgfi7nujOgM6L3uwYc9Tp5kcQHE/b
# o/Mr6oOi79Z+rdByk4pPUKaJV7FnbWMoUgjsqWPJgedGt0K9StRsXCGR1wYMUVdU
# R8Dv0IZyyGuSf6MhNw==
# =Y00D
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 25 Dec 2023 05:18:01 EST
# gpg:                using RSA key 685D0220DC264828152E57C2DFF223D6B3FECB9C
# gpg:                issuer "yong.huang@smartx.com"
# gpg: Good signature from "Yong Huang <yong.huang@smartx.com>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 685D 0220 DC26 4828 152E  57C2 DFF2 23D6 B3FE CB9C

* tag 'dirtylimit-dirtyrate-pull-request-20231225' of https://github.com/newfriday/qemu:
  migration/dirtyrate: Remove an extra parameter

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-26 06:06:50 -05:00
Stefan Hajnoczi
0d138a9b53 Merge tag 'pull-trivial-patches' of https://gitlab.com/mjt0k/qemu into staging
trivial patches for 2023-12-25

# -----BEGIN PGP SIGNATURE-----
#
# iQFDBAABCAAtFiEEe3O61ovnosKJMUsicBtPaxppPlkFAmWJOBUPHG1qdEB0bHMu
# bXNrLnJ1AAoJEHAbT2saaT5ZigcIAL21an9qATltedufz/QH5gf21g54LBtiBfny
# LN6yyMYj0j+Kydwr1FErub4b2Ek7xIT69ywpXhW7bI3ultCsVf5wVG8Gaa8iVq8L
# j6wV7PHn4+BErv4ZZmHhS0Pb8zeNJ8sDX0pBAZnATzXsaRYXN726a0D70kRZCZi9
# MGPjzUeuS9VoBzYr3kiBILDw3NBArIJusdcFVs4cmoxCDRcAVtwLwQqcvS6XXyr8
# /IfN8NCm54qnwa+mlHWrY9MFM0t7htDL2HLPjiRevLbMMBiJ9bKzVoMCsSK7gn4U
# OFZX49vjtuR0KPflRmmkHGbadU1/QyG+Rj5Un9M+wHhsMEZjoRY=
# =G1GO
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 25 Dec 2023 03:06:45 EST
# gpg:                using RSA key 7B73BAD68BE7A2C289314B22701B4F6B1A693E59
# gpg:                issuer "mjt@tls.msk.ru"
# gpg: Good signature from "Michael Tokarev <mjt@tls.msk.ru>" [full]
# gpg:                 aka "Michael Tokarev <mjt@corpit.ru>" [full]
# gpg:                 aka "Michael Tokarev <mjt@debian.org>" [full]
# Primary key fingerprint: 6EE1 95D1 886E 8FFB 810D  4324 457C E0A0 8044 65C5
#      Subkey fingerprint: 7B73 BAD6 8BE7 A2C2 8931  4B22 701B 4F6B 1A69 3E59

* tag 'pull-trivial-patches' of https://gitlab.com/mjt0k/qemu:
  virtio-blk: Fix potential nullpointer read access in virtio_blk_data_plane_destroy
  include/ui/rect.h: fix qemu_rect_init() mis-assignment
  accel/kvm: Turn DPRINTF macro use into tracepoints
  docs/tools/qemu-img.rst: fix typo (sumarizes)
  qemu-options: Tell more for -display cocoa
  qemu-options: Unify the help entries for cocoa
  target/riscv/kvm: do not use non-portable strerrorname_np()

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-26 06:06:10 -05:00
Stefan Hajnoczi
400819c837 Merge tag 'm68k-pull-2023-12-22' of https://gitlab.com/huth/qemu into staging
* Add dummy next-cube ethernet register to allow diagnostic to timeout
* Don't pulse next-cube SCSI DMA IRQ upon reception of FLUSH command
* Rework next-cube mmio ops and system control register handling functions
* Embed next-cube MemoryRegions into NeXTState

# -----BEGIN PGP SIGNATURE-----
#
# iQJHBAABCAAxFiEEJ7iIR+7gJQEY8+q5LtnXdP5wLbUFAmWFkbYTHGh1dGhAdHV4
# ZmFtaWx5Lm9yZwAKCRAu2dd0/nAttczHD/9DJKhSFPw+4dLF9kTk7hhs1FMBlD+x
# rrrfiP5U33fvL/+3ZzriovKjPkt2M/2C9/bg2G+smugg56gcWKXfNRX+bB8nx1J9
# +rlFhKa3pHvG7EXDWkWjbzceMS3W3T2RBzx1RNs/TmLnCWAmx0J//ADGqfZm6Bxf
# 1C65S8uUFN4VRWKdiynSkCfOwIockFPVE6A8G1QkCfBdhzWfMORcm1AplwOQLzKo
# qF6zU9s5i4b0FAj3NWgnK2W3l8PhYYasn18wpQ6krmBuFIJlTvuNpbIAiLfZyaeh
# Svb9gN/41AAiGiq25NUkAcXmWehFWeBzZ5Dpnxw3HfVJkqiBmB2GMUGIvrBbSsvH
# mBqDX/0IGGSrvUM8B7Zij6wFhrjtrhhufpDmn9hqb+x8v/YOOLt/qT6QCHyrUJZs
# LGcEoL2B1NgVWflyvEzJaC6TbLm2zmzHSto2HA90vL9P9taKmqaxeXGS8mt3o4iN
# Ev4Q/oQnUulHKYyVGqpt0DGRxSO6C8+0uoSCKYSehqPnACGtCJXpa33fYZyFg6ax
# wsNqA8PY8FKJ3vTSazkDhIcWJvkat6U0EMWaeO21XOsZTw/I2bY7ztMnWeaV9UJh
# 76ZTOO3sxlZt4tDM5ddpW6PobFp4T+qA8KQOWp+ecVfi7xvzXp7zahxR+gYFtcsR
# qkJ8/X5SfiCtgg==
# =y0ez
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 22 Dec 2023 08:40:06 EST
# gpg:                using RSA key 27B88847EEE0250118F3EAB92ED9D774FE702DB5
# gpg:                issuer "huth@tuxfamily.org"
# gpg: Good signature from "Thomas Huth <th.huth@gmx.de>" [full]
# gpg:                 aka "Thomas Huth <thuth@redhat.com>" [full]
# gpg:                 aka "Thomas Huth <huth@tuxfamily.org>" [full]
# gpg:                 aka "Thomas Huth <th.huth@posteo.de>" [unknown]
# Primary key fingerprint: 27B8 8847 EEE0 2501 18F3  EAB9 2ED9 D774 FE70 2DB5

* tag 'm68k-pull-2023-12-22' of https://gitlab.com/huth/qemu:
  next-cube.c: move machine MemoryRegions into NeXTState
  next-cube.c: remove val and size arguments from nextscr2_write()
  next-cube.c: move LED logic to new next_scr2_led_update() function
  next-cube.c: move static old_scr2 variable to NeXTPC
  next-cube.c: move static phase variable to NextRtc
  next-cube.c: move static led variable to NeXTPC
  next-cube.c: update and improve dma_ops
  next-cube.c: update scr_ops to properly use modern memory API
  next-cube.c: update mmio_ops to properly use modern memory API
  next-cube.c: don't pulse SCSI DMA IRQ upon reception of FLUSH command
  next-cube.c: add dummy Ethernet register to allow diagnostic to timeout

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-26 06:05:56 -05:00
Eugenio Pérez
f6fe3e333f vdpa: move memory listener to vhost_vdpa_shared
Next patches will register the vhost_vdpa memory listener while the VM
is migrating at the destination, so we can map the memory to the device
before stopping the VM at the source.  The main goal is to reduce the
downtime.

However, the destination QEMU is unaware of which vhost_vdpa device will
register its memory_listener.  If the source guest has CVQ enabled, it
will be the CVQ device.  Otherwise, it will be the first one.

Move the memory listener to a common place rather than always in the
first / last vhost_vdpa.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-14-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
b06a38f2b0 vdpa: use dev_shared in vdpa_iommu
The memory listener functions can call these too.  Make vdpa_iommu work
with VhostVDPAShared.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-13-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
6f03d9ef8a vdpa: use VhostVDPAShared in vdpa_dma_map and unmap
The callers only have the shared information by the end of this series.
Start converting these functions.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-12-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
3c6d12a3b1 vdpa: move iommu_list to vhost_vdpa_shared
Next patches will register the vhost_vdpa memory listener while the VM
is migrating at the destination, so we can map the memory to the device
before stopping the VM at the source.  The main goal is to reduce the
downtime.

However, the destination QEMU is unaware of which vhost_vdpa device will
register its memory_listener.  If the source guest has CVQ enabled, it
will be the CVQ device.  Otherwise, it will be the first one.

Move the iommu_list member to VhostVDPAShared so all vhost_vdpa can use
it, rather than always in the first / last vhost_vdpa.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-11-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
74e76c7d5b vdpa: remove msg type of vhost_vdpa
It is always VHOST_IOTLB_MSG_V2. We can always bring it back per
vhost_dev if needed.

This change makes it easier for vhost_vdpa_map and unmap not to depend
on vhost_vdpa, but only on VhostVDPAShared.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-10-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
4da38d1a6d vdpa: move backend_cap to vhost_vdpa_shared
Next patches will register the vhost_vdpa memory listener while the VM
is migrating at the destination, so we can map the memory to the device
before stopping the VM at the source.  The main goal is to reduce the
downtime.

However, the destination QEMU is unaware of which vhost_vdpa device will
register its memory_listener.  If the source guest has CVQ enabled, it
will be the CVQ device.  Otherwise, it will be the first one.

Move the backend_cap member to VhostVDPAShared so all vhost_vdpa can use
it, rather than always in the first / last vhost_vdpa.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-9-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
7627f0a2de vdpa: move iotlb_batch_begin_sent to vhost_vdpa_shared
Next patches will register the vhost_vdpa memory listener while the VM
is migrating at the destination, so we can map the memory to the device
before stopping the VM at the source.  The main goal is to reduce the
downtime.

However, the destination QEMU is unaware of which vhost_vdpa device will
register its memory_listener.  If the source guest has CVQ enabled, it
will be the CVQ device.  Otherwise, it will be the first one.

Move the iotlb_batch_begin_sent member to VhostVDPAShared so all
vhost_vdpa can use it, rather than always in the first / last
vhost_vdpa.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-8-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
f12b2498e5 vdpa: move file descriptor to vhost_vdpa_shared
Next patches will register the vhost_vdpa memory listener while the VM
is migrating at the destination, so we can map the memory to the device
before stopping the VM at the source.  The main goal is to reduce the
downtime.

However, the destination QEMU is unaware of which vhost_vdpa device will
register its memory_listener.  If the source guest has CVQ enabled, it
will be the CVQ device.  Otherwise, it will be the first one.

Move the file descriptor to VhostVDPAShared so all vhost_vdpa can use
it, rather than always in the first / last vhost_vdpa.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-7-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
e36b9992fa vdpa: use vdpa shared for tracing
By the end of this series the dma_map and dma_unmap functions don't have
the vdpa device available for tracing.  Move the trace functions to the
shared member.  Also print it in the vdpa initialization so the log
reader can relate them.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-6-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
a6e823d40e vdpa: move shadow_data to vhost_vdpa_shared
Next patches will register the vhost_vdpa memory listener while the VM
is migrating at the destination, so we can map the memory to the device
before stopping the VM at the source.  The main goal is to reduce the
downtime.

However, the destination QEMU is unaware of which vhost_vdpa device will
register its memory_listener.  If the source guest has CVQ enabled, it
will be the CVQ device.  Otherwise, it will be the first one.

Move the shadow_data member to VhostVDPAShared so all vhost_vdpa can use
it, rather than always in the first or last vhost_vdpa.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-5-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
ae25ff41b7 vdpa: move iova_range to vhost_vdpa_shared
Next patches will register the vhost_vdpa memory listener while the VM
is migrating at the destination, so we can map the memory to the device
before stopping the VM at the source.  The main goal is to reduce the
downtime.

However, the destination QEMU is unaware of which vhost_vdpa device will
register its memory_listener.  If the source guest has CVQ enabled, it
will be the CVQ device.  Otherwise, it will be the first one.

Move the iova range to VhostVDPAShared so all vhost_vdpa can use it,
rather than always in the first or last vhost_vdpa.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-4-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
5edb02e800 vdpa: move iova tree to the shared struct
Next patches will register the vhost_vdpa memory listener while the VM
is migrating at the destination, so we can map the memory to the device
before stopping the VM at the source.  The main goal is to reduce the
downtime.

However, the destination QEMU is unaware of which vhost_vdpa device will
register its memory_listener.  If the source guest has CVQ enabled, it
will be the CVQ device.  Otherwise, it will be the first one.

Move the iova tree to VhostVDPAShared so all vhost_vdpa can use it,
rather than always in the first or last vhost_vdpa.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-3-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
8c5e980922 vdpa: add VhostVDPAShared
It will hold properties shared among all vhost_vdpa instances associated
with the same device.  For example, we just need one iova_tree or one
memory listener for the entire device.

Next patches will register the vhost_vdpa memory listener at the
beginning of the VM migration at the destination. This enables QEMU to
map the memory to the device before stopping the VM at the source,
instead of doing it while both source and destination are stopped, thus
minimizing the downtime.

However, the destination QEMU is unaware of which vhost_vdpa struct will
register its memory_listener.  If the source guest has CVQ enabled, it
will be the one associated with the CVQ.  Otherwise, it will be the
first one.

Save the memory operations related members in a common place rather than
always in the first / last vhost_vdpa.
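An illustrative shape for the shared struct (the exact field set follows
the later patches in this series):

    typedef struct VhostVDPAShared {
        int device_fd;                 /* the vhost-vdpa char device */
        MemoryListener listener;       /* one listener per device */
        struct vhost_vdpa_iova_range iova_range;
        VhostIOVATree *iova_tree;      /* SVQ IOVA -> HVA translations */
        bool iotlb_batch_begin_sent;
        bool shadow_data;
    } VhostVDPAShared;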

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231221174322.3130442-2-eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Eugenio Pérez
bc865bfe2d vdpa: do not set virtio status bits if unneeded
Next commits will set DRIVER and ACKNOWLEDGE flags repeatedly in the
case of a migration destination.  Let's save ioctls with this.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Message-Id: <20231215172830.2540987-2-eperezma@redhat.com>
2023-12-26 04:51:07 -05:00
wangmeiling
410cefbd5c Fix bugs when VM shutdown with virtio-gpu unplugged
Virtio-gpu allocates memory for its queues when it is realized, but the
queues are not released when it is unrealized, resulting in a memory
leak. In addition, vm_change_state_handler is not cleaned up; it refers
to vdev and will lead to a segmentation fault when the VM shuts down.

Signed-off-by: wangmeiling <wangmeiling21@huawei.com>
Signed-off-by: Binfeng Wu <wubinfeng@huawei.com>
Message-Id: <7bbbc0f3-2ad9-83ca-b39b-f976d0837daf@huawei.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:07 -05:00
Dongli Zhang
551bf7b4c1 vhost-scsi: fix usage of error_reportf_err()
error_report() must be used instead of error_reportf_err() when the
preceding function does not take local_err as an argument. Otherwise,
local_err is always NULL and a segmentation fault may happen.

vhost_scsi_start()
-> vhost_scsi_set_endpoint(s) --> does not allocate local_err
-> error_reportf_err()
   -> error_vprepend()
      -> g_string_append(newmsg, (*errp)->msg) --> (*errp) is NULL

In addition, add ": " at the end of other error_reportf_err() logs.
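A sketch of the shape of the fix (message text illustrative):

    ret = vhost_scsi_set_endpoint(s);
    if (ret < 0) {
        /* Before: error_reportf_err(local_err, ...) dereferenced a
         * NULL Error, since nothing ever set local_err. Report the
         * errno-style return value directly instead: */
        error_report("Error setting vhost-scsi endpoint: %s",
                     strerror(-ret));
    }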

Fixes: 7962e432b4 ("vhost-user-scsi: support reconnect to backend")
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
Message-Id: <20231214003117.43960-1-dongli.zhang@oracle.com>
Reviewed-by: Feng Li <fengli@smartx.com>
Reviewed-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-26 04:51:02 -05:00
Aaron Young
45e48809d3 hw/acpi: propagate vcpu hotplug after switch to modern interface
If a vcpu with an apic-id that is not supported by the legacy
interface (>255) is hot-plugged, the legacy code will dynamically switch
to the modern interface. However, the hotplug event is not forwarded to
the new interface resulting in the vcpu not being fully/properly added
to the machine config. This bug is evidenced by OVMF when it attempts
to count the vcpus and reports an inconsistency between the vcpu counts
given by the fw_cfg interface and the modern hotplug interface.

The fix is to propagate the hotplug event after making the switch from
the legacy interface to the modern interface.

Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Aaron Young <aaron.young@oracle.com>
Message-Id: <0e8a9baebbb29f2a6c87fd08e43dc2ac4019759a.1702398644.git.Aaron.Young@oracle.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-25 11:34:55 -05:00
Mike Christie
51396556f0 vhost-scsi: Add support for a worker thread per virtqueue
This adds support for vhost-scsi to be able to create a worker thread
per virtqueue. Right now for vhost-net we get a worker thread per
tx/rx virtqueue pair which scales nicely as we add more virtqueues and
CPUs, but for scsi we get the single worker thread that's shared by all
virtqueues. When trying to send IO to more than 2 virtqueues, the
single thread becomes a bottleneck.

This patch adds a new setting, worker_per_virtqueue, which can be set
to:

false: Existing behavior where we get the single worker thread.
true: Create a worker per IO virtqueue.
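As a usage sketch (other device options elided; the wwpn value is a
placeholder):

    -device vhost-scsi-pci,wwpn=naa.500140500000000a,worker_per_virtqueue=true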

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>

Message-Id: <20231204231618.21962-3-michael.christie@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2023-12-25 11:34:55 -05:00
Mike Christie
9aad781959 vhost: Add worker backend callouts
This adds the vhost backend callouts for the worker ioctls added in the
6.4 linux kernel commit:

c1ecd8e95007 ("vhost: allow userspace to create workers")

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>

Message-Id: <20231204231618.21962-2-michael.christie@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-25 11:34:55 -05:00
Zhao Liu
c40db4ba60 tests: bios-tables-test: Rename smbios type 4 related test functions
In fact, type4-count, core-count, core-count2, thread-count and
thread-count2 are tested with KVM, not TCG.

Rename these test functions to reflect KVM rather than TCG.

Signed-off-by: Zhao Liu <zhao1.liu@intel.com>
Message-Id: <20231127160202.1037290-1-zhao1.liu@linux.intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-25 11:34:55 -05:00
Mathieu Poirier
df72f01104 virtio: rng: Check notifier helpers for VIRTIO_CONFIG_IRQ_IDX
Since the driver doesn't support interrupts, we must return early when
index is set to VIRTIO_CONFIG_IRQ_IDX.  Basically the same thing Viresh
did for "91208dd297f2 virtio: i2c: Check notifier helpers for
VIRTIO_CONFIG_IRQ_IDX".

Fixes: 544f0278af ("virtio: introduce macro VIRTIO_CONFIG_IRQ_IDX")
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Message-Id: <20231025171841.3379663-1-mathieu.poirier@linaro.org>
Tested-by: Leo Yan <leo.yan@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-25 11:34:55 -05:00
Wafer
4918712fb1 migration/dirtyrate: Remove an extra parameter
vcpu_dirty_stat_collect() has an unused parameter, so remove it.

Signed-off-by: Wafer <wafer@jaguarmicro.com>
Reviewed-by: Hyman Huang <yong.huang@smartx.com>
Message-Id: <20231204012230.4123-1-wafer@jaguarmicro.com>
2023-12-25 18:05:47 +08:00
Stefan Weil via
d819fc9516 virtio-blk: Fix potential nullpointer read access in virtio_blk_data_plane_destroy
Fixes: CID 1532828
Fixes: b6948ab01d ("virtio-blk: add iothread-vq-mapping parameter")
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2023-12-25 11:01:01 +03:00
Elen Avan
9d5b42beb6 include/ui/rect.h: fix qemu_rect_init() mis-assignment
Signed-off-by: Elen Avan <elen.avan@bk.ru>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2051
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2050
Fixes: a200d53b1f "virtio-gpu: replace PIXMAN for region/rect test"
Cc: qemu-stable@nongnu.org
Reviewed-by: Michael Tokarev <mjt@tls.msk.ru>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2023-12-25 11:00:15 +03:00
Jai Arora
9cdfb1e3a5 accel/kvm: Turn DPRINTF macro use into tracepoints
This patch removes the DPRINTF macro and adds multiple tracepoints to
capture different KVM events.

We also drop the DPRINTFs that don't add any information beyond what
trace_kvm_run_exit already provides.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1827

Signed-off-by: Jai Arora <arorajai2798@gmail.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2023-12-23 19:39:35 +03:00
Samuel Tardieu
8a8be21dde docs/tools/qemu-img.rst: fix typo (sumarizes)
Signed-off-by: Samuel Tardieu <sam@rfc1149.net>
Reviewed-by: Zhao Liu <zhao1.liu@intel.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2023-12-23 19:35:47 +03:00
Akihiko Odaki
d502dfcdeb qemu-options: Tell more for -display cocoa
Some options for -display cocoa were not described or not listed at all.

Reported-by: BALATON Zoltan <balaton@eik.bme.hu>
Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2023-12-23 19:32:49 +03:00
Akihiko Odaki
9ab8715d86 qemu-options: Unify the help entries for cocoa
Apparently the help entries were not merged when the patches got in.

Fixes: f844cdb997 ("ui/cocoa: capture all keys and combos when mouse is grabbed")
Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2023-12-23 19:31:53 +03:00
Natanael Copa
d424db2354 target/riscv/kvm: do not use non-portable strerrorname_np()
strerrorname_np is non-portable and breaks building with musl libc.

Use strerror(errno) instead, like we do in other places.
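An illustrative call site (the message text is hypothetical):

    /* Before: glibc-only (strerrorname_np needs glibc >= 2.32) */
    error_report("Unable to read reg: %s", strerrorname_np(errno));

    /* After: portable across libcs, including musl */
    error_report("Unable to read reg: %s", strerror(errno));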

Cc: qemu-stable@nongnu.org
Fixes: commit 082e9e4a58 (target/riscv/kvm: improve 'init_multiext_cfg' error msg)
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2041
Buglink: https://gitlab.alpinelinux.org/alpine/aports/-/issues/15541
Signed-off-by: Natanael Copa <ncopa@alpinelinux.org>
Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2023-12-23 19:29:56 +03:00
Mark Cave-Ayland
0d23b1ef85 next-cube.c: move machine MemoryRegions into NeXTState
These static memory regions are contained within the machine and do not need to
be dynamically allocated.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-12-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
039b10ac98 next-cube.c: remove val and size arguments from nextscr2_write()
These are now redundant with the scr2 and old_scr2 fields in NeXTPC. Rename
the function from nextscr2_write() to next_scr2_rtc_update() to better
reflect its purpose. At the same time replace the manual bit manipulation with
the extract32() and deposit32() functions.
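For example (bit positions illustrative):

    /* open-coded masks and shifts... */
    old = (scr2 >> 8) & 0xff;
    scr2 = (scr2 & ~0xff00) | (val << 8);

    /* ...become the qemu/bitops.h helpers */
    old = extract32(scr2, 8, 8);
    scr2 = deposit32(scr2, 8, 8, val);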

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-11-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
3deafd18b0 next-cube.c: move LED logic to new next_scr2_led_update() function
Ensure that the LED status is updated by calling next_scr2_led_update() whenever
the SCR2 register is written.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-10-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
22cf5ee30f next-cube.c: move static old_scr2 variable to NeXTPC
Move the old_scr2 variable to NeXTPC so that the old SCR2 register state is
stored along with the current SCR2 state.

Since the SCR2 register is 32 bits wide, convert old_scr2 to uint32_t and
update the SCR2 register access code to allow unaligned writes.

Note that this is a migration break, but as nothing will currently boot,
we do not need to worry about this now.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-9-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
88d0c5b0df next-cube.c: move static phase variable to NextRtc
The phase variable represents part of the state machine used to clock data out
of the NextRtc device.

Note that this is a migration break for the NextRtc struct, but as
nothing will currently boot, we simply bump the migration version for
now.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-8-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
8220baa0cf next-cube.c: move static led variable to NeXTPC
The state of the LED is stored in the SCR2 register, which is part of
the NeXTPC device.

Note that this is a migration break for the NeXTPC device, but as
nothing will currently boot, we simply bump the migration version for
now.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-7-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
c0dedcf4c1 next-cube.c: update and improve dma_ops
Rename dma_ops to next_dma_ops and the read/write functions to next_dma_read()
and next_dma_write() respectively, mark next_dma_ops as DEVICE_BIG_ENDIAN and
also improve the consistency of the val variable in next_dma_read() and
next_dma_write().

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-6-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
0d60da3998 next-cube.c: update scr_ops to properly use modern memory API
The old QEMU memory accessors used in the original NextCube patch series had
separate functions for 1, 2 and 4 byte accesses. When the series was
finally merged, a simple wrapper function was written to dispatch the
memory accesses using the original functions.

Convert scr_ops to use the memory API directly renaming it to next_scr_ops,
marking it as DEVICE_BIG_ENDIAN, and handling any unaligned accesses.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-5-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
7e993d934a next-cube.c: update mmio_ops to properly use modern memory API
The old QEMU memory accessors used in the original NextCube patch series had
separate functions for 1, 2 and 4 byte accesses. When the series was
finally merged, a simple wrapper function was written to dispatch the
memory accesses using the original functions.

Convert mmio_ops to use the memory API directly renaming it to next_mmio_ops,
marking it as DEVICE_BIG_ENDIAN, and handling any unaligned accesses.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-4-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
6f2454be58 next-cube.c: don't pulse SCSI DMA IRQ upon reception of FLUSH command
Normally a DMA FLUSH command is used to ensure that data is completely written
to the device and/or memory, so remove the pulse of the SCSI DMA IRQ if a DMA
FLUSH command is received. This enables the NeXT ROM monitor to start to load
from a SCSI disk.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-3-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Mark Cave-Ayland
2a287eabb1 next-cube.c: add dummy Ethernet register to allow diagnostic to timeout
Add a dummy register at address 0x6000 in the MMIO memory region to allow the
initial diagnostic test to timeout rather than getting stuck in a loop
continuously writing "en_write: tx not ready" to the console.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Tested-by: Thomas Huth <huth@tuxfamily.org>
Message-ID: <20231220131641.592826-2-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Thomas Huth <huth@tuxfamily.org>
2023-12-22 14:08:26 +01:00
Stefan Hajnoczi
80f1709aa0 Merge tag 'pull-loongarch-20231221' of https://gitlab.com/gaosong/qemu into staging
pull-loongarch-20231221

# -----BEGIN PGP SIGNATURE-----
#
# iLMEAAEKAB0WIQS4/x2g0v3LLaCcbCxAov/yOSY+3wUCZYPyvQAKCRBAov/yOSY+
# 38/vBADT3b+Wo/2AeXOO3OXOM1VBhIvzDjY1OWytuJpkF3JGW45cMLqgtIgMj8h7
# NtzRS3JbFYbYuxITeeo1Ppl6dAD0pCZjIU6OCBxAJ6ADPsE/xD8nYWrMGqYVXg7E
# hN0Cno2sf6dmJ0QxUxn7G+cUuvNtnGaDSZE+RAkjtzq1nvx7CQ==
# =mte5
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 21 Dec 2023 03:09:33 EST
# gpg:                using RSA key B8FF1DA0D2FDCB2DA09C6C2C40A2FFF239263EDF
# gpg: Good signature from "Song Gao <m17746591750@163.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: B8FF 1DA0 D2FD CB2D A09C  6C2C 40A2 FFF2 3926 3EDF

* tag 'pull-loongarch-20231221' of https://gitlab.com/gaosong/qemu:
  target/loongarch: Add timer information dump support
  hw/loongarch/virt: Align high memory base address with super page size

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-21 19:44:19 -05:00
Stefan Hajnoczi
6370d13c62 Merge tag 'for-upstream' of https://repo.or.cz/qemu/kevin into staging
Block layer patches

- virtio-blk: Multiqueue support (configurable iothread per queue)
- Made NBD export and hw/scsi thread-safe without AioContext lock
- Fix crash when loading snapshot on inactive node

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEE3D3rFZqa+V09dFb+fwmycsiPL9YFAmWEw/8RHGt3b2xmQHJl
# ZGhhdC5jb20ACgkQfwmycsiPL9bX0Q/9G+Qx8mQGmbxJzvtW7/1eaeJ5CPCYT8w3
# 033S5hCil43mvX2aQKTFrh1Nz4aYlqMDyURvNu7nigyADY+kBpzzJ1MFr6WQrzYv
# QEk4jf/FOllfKn8+/A0z2NJDhtpVgqKKHBsFZl8FBUcxd79daTaoPPM3BNNsOHQD
# o7Z7hR/iEdG9dkAh/fpwctsgMO/CoN0BRRyN2OByj03zeu1TlDJ6lX0hxlcJl9Jw
# vLo81rWTCqKRu+SbjBsb0HfYE2hP54A4hvxn4I9vYGYDz8ElucluYyeqUEK+mdrX
# /DQBdb+Osl1FD6MuIaFR+Rgp9Mu5h6ZOdvUyCY0zuByti851hV8qjW9BtrTfqaMh
# LMOKoL6c5B8XJYWVGAGrJexIw1hHq5WKdXN9zp4FZA4tOyHUMRjHuR1+zScU6gnU
# WRSIQR46w75A13clWyJs9Hf/q5Fp/1KT4nfuZ/hmiXvxdsYY5x1w/W3s9tRNjYKL
# d6FVk17cFc6Ksb7lWvDCgg61BNZtGm4Clmw0kJ6V1reiQz7AvDLmduLUQbmrVt7G
# gWAY4b2L9YXJpEx5en0kE50KLAUw/E9ozbOq6ZT9nFUKeNAPC8PS5lK7vYVwebCk
# VA0t8pFzKhdB1bJaG5fMSRPBuqkvhsaDEEDABlSro8dyyjoQBaEdk5P9Kxe66hBc
# xhTmDPdv/JM=
# =E3Zh
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 21 Dec 2023 18:02:23 EST
# gpg:                using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6
# gpg:                issuer "kwolf@redhat.com"
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [full]
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74  56FE 7F09 B272 C88F 2FD6

* tag 'for-upstream' of https://repo.or.cz/qemu/kevin: (33 commits)
  virtio-blk: add iothread-vq-mapping parameter
  qdev: add IOThreadVirtQueueMappingList property type
  qdev-properties: alias all object class properties
  string-output-visitor: show structs as "<omitted>"
  block-coroutine-wrapper: use qemu_get_current_aio_context()
  block: remove outdated AioContext locking comments
  job: remove outdated AioContext locking comments
  scsi: remove outdated AioContext lock comment
  docs: remove AioContext lock from IOThread docs
  aio: remove aio_context_acquire()/aio_context_release() API
  aio-wait: draw equivalence between AIO_WAIT_WHILE() and AIO_WAIT_WHILE_UNLOCKED()
  scsi: remove AioContext locking
  block: remove bdrv_co_lock()
  block: remove AioContext locking
  graph-lock: remove AioContext locking
  aio: make aio_context_acquire()/aio_context_release() a no-op
  tests: remove aio_context_acquire() tests
  scsi: assert that callbacks run in the correct AioContext
  virtio-scsi: replace AioContext lock with tmf_bh_lock
  dma-helpers: don't lock AioContext in dma_blk_cb()
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-21 19:43:58 -05:00
Stefan Hajnoczi
b6948ab01d virtio-blk: add iothread-vq-mapping parameter
Add the iothread-vq-mapping parameter to assign virtqueues to IOThreads.
Store the vq:AioContext mapping in the new struct
VirtIOBlockDataPlane->vq_aio_context[] field and refactor the code to
use the per-vq AioContext instead of the BlockDriverState's AioContext.

Reimplement --device virtio-blk-pci,iothread= and non-IOThread mode by
assigning all virtqueues to the IOThread and main loop's AioContext in
vq_aio_context[], respectively.

The comment in struct VirtIOBlockDataPlane about EventNotifiers is
stale. Remove it.
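
Illustratively, request submission can then look up its AioContext by
virtqueue index; the following is a minimal sketch assuming the
vq_aio_context[] field above (the helper name is made up):

    /* Sketch: per-virtqueue AioContext lookup (helper name is made up) */
    static AioContext *vq_aio_context_get(VirtIOBlockDataPlane *s,
                                          uint16_t vq_idx)
    {
        /* Filled at setup time with either an IOThread's AioContext or
         * the main loop's, per the iothread-vq-mapping parameter. */
        return s->vq_aio_context[vq_idx];
    }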

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231220134755.814917-5-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:28 +01:00
Stefan Hajnoczi
cf03a152c5 qdev: add IOThreadVirtQueueMappingList property type
virtio-blk and virtio-scsi devices will need a way to specify the
mapping between IOThreads and virtqueues. At the moment all virtqueues
are assigned to a single IOThread or the main loop. This single thread
can be a CPU bottleneck, so it is necessary to allow finer-grained
assignment to spread the load.

Introduce DEFINE_PROP_IOTHREAD_VQ_MAPPING_LIST() so devices can take a
parameter that maps virtqueues to IOThreads. The command-line syntax for
this new property is as follows:

  --device '{"driver":"foo","iothread-vq-mapping":[{"iothread":"iothread0","vqs":[0,1,2]},...]}'

IOThreads are specified by name and virtqueues are specified by 0-based
index.

It will be common to simply assign virtqueues round-robin across a set
of IOThreads. A convenient syntax that does not require specifying
individual virtqueue indices is available:

  --device '{"driver":"foo","iothread-vq-mapping":[{"iothread":"iothread0"},{"iothread":"iothread1"},...]}'

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231220134755.814917-4-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:28 +01:00
Stefan Hajnoczi
350147a871 qdev-properties: alias all object class properties
qdev_alias_all_properties() aliases a DeviceState's qdev properties onto
an Object. This is used for VirtioPCIProxy types so that --device
virtio-blk-pci has properties of its embedded --device virtio-blk-device
object.

Currently this function is implemented using qdev properties. Change the
function to use QOM object class properties instead. This works because
qdev properties create QOM object class properties, but it also catches
any QOM object class-only properties that have no qdev properties.

This change ensures that properties of devices are shown with --device
foo,\? even if they are QOM object class properties.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231220134755.814917-2-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:28 +01:00
Stefan Hajnoczi
ff32bb5347 string-output-visitor: show structs as "<omitted>"
StringOutputVisitor crashes when it visits a struct because
->start_struct() is NULL.

Show "<omitted>" instead of crashing. This is necessary because the
virtio-blk-pci iothread-vq-mapping parameter that I'd like to introduce
soon is a list of IOThreadMapping structs.

This patch is a quick fix to solve the crash, but the long-term solution
is replacing StringOutputVisitor with something that can handle the full
gamut of values in QEMU.
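
A minimal sketch of the shape of the quick fix, assuming an internal
string_output_set() helper that is not necessarily the actual code:

    /* Sketch: instead of leaving ->start_struct() NULL, install a
     * callback that records a fixed placeholder. string_output_set()
     * is an assumed helper name. */
    static bool start_struct(Visitor *v, const char *name, void **obj,
                             size_t size, Error **errp)
    {
        string_output_set(v, "<omitted>");
        return true;
    }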

Cc: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231212134934.500289-1-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:28 +01:00
Stefan Hajnoczi
c12887e1b0 block-coroutine-wrapper: use qemu_get_current_aio_context()
Use qemu_get_current_aio_context() in mixed wrappers and coroutine
wrappers so that code runs in the caller's AioContext instead of moving
to the BlockDriverState's AioContext. This change is necessary for the
multi-queue block layer where any thread can call into the block layer.

Most wrappers are IO_CODE where it's safe to use the current AioContext
nowadays. BlockDrivers and the core block layer use their own locks and
no longer depend on the AioContext lock for thread-safety.

The bdrv_create() wrapper invokes GLOBAL_STATE code. Using the current
AioContext is safe because this code is only called with the BQL held
from the main loop thread.
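
Schematically, each generated wrapper changes along these lines (a
sketch of the shape of the change, not the exact generator output):

  -    AioContext *ctx = bdrv_get_aio_context(bs);
  +    AioContext *ctx = qemu_get_current_aio_context();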

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230912231037.826804-6-stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:28 +01:00
Stefan Hajnoczi
23c983c8f6 block: remove outdated AioContext locking comments
The AioContext lock no longer exists.

There is one noteworthy change:

  - * More specifically, these functions use BDRV_POLL_WHILE(bs), which
  - * requires the caller to be either in the main thread and hold
  - * the BlockdriverState (bs) AioContext lock, or directly in the
  - * home thread that runs the bs AioContext. Calling them from
  - * another thread in another AioContext would cause deadlocks.
  + * More specifically, these functions use BDRV_POLL_WHILE(bs), which requires
  + * the caller to be either in the main thread or directly in the home thread
  + * that runs the bs AioContext. Calling them from another thread in another
  + * AioContext would cause deadlocks.

I am not sure whether deadlocks are still possible. Maybe they have just
moved to the fine-grained locks that have replaced the AioContext lock. Since
I am not sure if the deadlocks are gone, I have kept the substance
unchanged and just removed mention of the AioContext.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-15-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
e91083cd3f job: remove outdated AioContext locking comments
The AioContext lock no longer exists.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-14-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
e7fc3c4a8c scsi: remove outdated AioContext lock comment
The SCSI subsystem no longer uses the AioContext lock. Request
processing runs exclusively in the BlockBackend's AioContext since
"scsi: only access SCSIDevice->requests from one thread" and hence the
lock is unnecessary.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-13-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
e0444c276a docs: remove AioContext lock from IOThread docs
Encourage the use of locking primitives and stop mentioning the
AioContext lock since it is being removed.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-12-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
9f8d2fdcce aio: remove aio_context_acquire()/aio_context_release() API
Delete these functions because nothing calls them anymore.

I introduced these APIs in commit 98563fc3ec ("aio: add
aio_context_acquire() and aio_context_release()") in 2014. It's with a
sigh of relief that I delete these APIs almost 10 years later.

Thanks to Paolo Bonzini's vision for multi-queue QEMU, we got an
understanding of where the code needed to go in order to remove the
limitations of the original dataplane and of the IOThread/AioContext
approach that followed it.

Emanuele Giuseppe Esposito had the splendid determination to convert
large parts of the codebase so that they no longer needed the AioContext
lock. This was a painstaking process, both in the actual code changes
required and the iterations of code review that Emanuele eked out of
Kevin and me over many months.

Kevin Wolf tackled multitudes of graph locking conversions to protect
in-flight I/O from run-time changes to the block graph as well as the
clang Thread Safety Analysis annotations that allow the compiler to
check whether the graph lock is being used correctly.

And me, well, I'm just here to add some pizzazz to the QEMU multi-queue
block layer :). Thank you to everyone who helped with this effort,
including Eric Blake, code reviewer extraordinaire, and others who I've
forgotten to mention.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-11-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
95bbddf9ad aio-wait: draw equivalence between AIO_WAIT_WHILE() and AIO_WAIT_WHILE_UNLOCKED()
Now that the AioContext lock no longer exists, AIO_WAIT_WHILE() and
AIO_WAIT_WHILE_UNLOCKED() are equivalent.

A future patch will get rid of AIO_WAIT_WHILE_UNLOCKED().
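
Conceptually, the equivalence could be spelled as follows (a sketch,
not the actual header change):

    /* Sketch: with no AioContext lock to drop and reacquire around the
     * poll, both macros may expand to the same loop. */
    #define AIO_WAIT_WHILE(ctx, cond) AIO_WAIT_WHILE_UNLOCKED(ctx, cond)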

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-10-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
4f36b13847 scsi: remove AioContext locking
The AioContext lock no longer has any effect. Remove it.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-9-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
c43d5bc858 block: remove bdrv_co_lock()
The bdrv_co_lock() and bdrv_co_unlock() functions are already no-ops.
Remove them.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231205182011.1976568-8-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
b49f4755c7 block: remove AioContext locking
This is the big patch that removes
aio_context_acquire()/aio_context_release() from the block layer and
affected block layer users.

There isn't a clean way to split this patch and the reviewers are likely
the same group of people, so I decided to do it in one patch.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Message-ID: <20231205182011.1976568-7-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
6bc30f1949 graph-lock: remove AioContext locking
Stop acquiring/releasing the AioContext lock in
bdrv_graph_wrlock()/bdrv_graph_wrunlock() since the lock no longer has any
effect.

The distinction between bdrv_graph_wrunlock() and
bdrv_graph_wrunlock_ctx() becomes meaningless and they can be collapsed
into one function.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231205182011.1976568-6-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
b5f4fda4fb aio: make aio_context_acquire()/aio_context_release() a no-op
aio_context_acquire()/aio_context_release() has been replaced by
fine-grained locking to protect state shared by multiple threads. The
AioContext lock still plays the role of balancing locking in
AIO_WAIT_WHILE(), and for this reason many functions in QEMU require
that the AioContext lock either is or is not held. In other words, the
AioContext lock is purely there for consistency with itself and serves
no real purpose anymore.

Stop actually acquiring/releasing the lock in
aio_context_acquire()/aio_context_release() so that subsequent patches
can remove callers across the codebase incrementally.

I have performed "make check" and qemu-iotests stress tests across
x86-64, ppc64le, and aarch64 to confirm that there are no failures as a
result of eliminating the lock.
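
The change itself amounts to emptying the function bodies, roughly:

    void aio_context_acquire(AioContext *ctx)
    {
        /* no-op: shared state is now protected by fine-grained locks */
    }

    void aio_context_release(AioContext *ctx)
    {
        /* no-op */
    }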

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231205182011.1976568-5-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
b3496d129b tests: remove aio_context_acquire() tests
The aio_context_acquire() API is being removed. Drop the test case that
calls the API.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231205182011.1976568-4-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
10bcb0d996 scsi: assert that callbacks run in the correct AioContext
Since the removal of AioContext locking, the correctness of the code
relies on running requests from a single AioContext at any given time.

Add assertions that verify that callbacks are invoked in the correct
AioContext.
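
The assertions follow a single pattern (a sketch; the exact call sites
and the spelling of the BlockBackend pointer vary per device):

    /* Sketch: each callback checks that it is running in the
     * BlockBackend's AioContext. */
    assert(blk_get_aio_context(s->conf.blk) ==
           qemu_get_current_aio_context());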

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231205182011.1976568-3-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
ed18b1ed4f virtio-scsi: replace AioContext lock with tmf_bh_lock
Protect the Task Management Function BH state with a lock. The TMF BH
runs in the main loop thread. An IOThread might process a TMF at the
same time as the TMF BH is running. Therefore tmf_bh_list and tmf_bh
must be protected by a lock.

Run TMF request completion in the IOThread using aio_wait_bh_oneshot().
This avoids more locking to protect the virtqueue and SCSI layer state.
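
Sketched with QEMU's lock-guard idiom (list handling simplified, and
virtio_scsi_do_tmf_bh is an assumed callback name):

    WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) {
        /* Queue the TMF and schedule the BH in the main loop if it is
         * not already pending. Entry/field names may differ. */
        QTAILQ_INSERT_TAIL(&s->tmf_bh_list, req, next);
        if (!s->tmf_bh) {
            s->tmf_bh = qemu_bh_new(virtio_scsi_do_tmf_bh, s);
            qemu_bh_schedule(s->tmf_bh);
        }
    }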

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231205182011.1976568-2-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
e661a24703 dma-helpers: don't lock AioContext in dma_blk_cb()
Commit abfcd2760b ("dma-helpers: prevent dma_blk_cb() vs
dma_aio_cancel() race") acquired the AioContext lock inside dma_blk_cb()
to avoid a race with scsi_device_purge_requests() running in the main
loop thread.

The SCSI code no longer calls dma_aio_cancel() from the main loop thread
while I/O is running in the IOThread AioContext. Therefore it is no
longer necessary to take this lock to protect DMAAIOCB fields. The
->cb() function also does not require the lock because blk_aio_*() and
friends do not need the AioContext lock.

Both hw/ide/core.c and hw/ide/macio.c also call dma_blk_io() but don't
rely on it taking the AioContext lock, so this change is safe.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231204164259.1515217-5-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
1404226804 scsi: don't lock AioContext in I/O code path
blk_aio_*() doesn't require the AioContext lock and the SCSI subsystem's
internal state also does not anymore.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231204164259.1515217-4-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
765ca51606 virtio-scsi: don't lock AioContext around virtio_queue_aio_attach_host_notifier()
virtio_queue_aio_attach_host_notifier() does not require the AioContext
lock. Stop taking the lock and add an explicit smp_wmb() because we were
relying on the implicit barrier in the AioContext lock before.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231204164259.1515217-3-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
eaad0fe260 scsi: only access SCSIDevice->requests from one thread
Stop depending on the AioContext lock and instead access
SCSIDevice->requests from only one thread at a time:
- When the VM is running only the BlockBackend's AioContext may access
  the requests list.
- When the VM is stopped only the main loop may access the requests
  list.

These constraints protect the requests list without the need for locking
in the I/O code path.

Note that multiple IOThreads are not supported yet because the code
assumes all SCSIRequests are executed from a single AioContext. Leave
that as future work.
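
When the main loop does need to touch the list while the VM runs, the
work is bounced into the owning AioContext instead of taking a lock; a
sketch, with an assumed BH function name:

    /* Sketch: run the list walk in the BlockBackend's AioContext so
     * only one thread ever touches SCSIDevice->requests. */
    aio_wait_bh_oneshot(blk_get_aio_context(s->conf.blk),
                        scsi_device_purge_requests_bh, s);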

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231204164259.1515217-2-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Kevin Wolf
bb6e2511eb iotests: Basic tests for internal snapshots
We have a few test cases that include tests for corner case aspects of
internal snapshots, but nothing that tests that they actually function
as snapshots or that involves deleting a snapshot. Add a test for this
kind of basic internal snapshot functionality.

The error cases include a regression test for the crash we just fixed
with snapshot operations on inactive images.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231201142520.32255-4-kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Kevin Wolf
5a7f21efaf vl: Improve error message for conflicting -incoming and -loadvm
Currently, the conflict between -incoming and -loadvm is only detected
when loading the snapshot fails because the image is still inactive for
the incoming migration. This results in a suboptimal error message:

$ ./qemu-system-x86_64 -hda /tmp/test.qcow2 -loadvm foo -incoming defer
qemu-system-x86_64: Device 'ide0-hd0' is writable but does not support snapshots

Catch the situation already in qemu_validate_options() to improve the
message:

$ ./qemu-system-x86_64 -hda /tmp/test.qcow2 -loadvm foo -incoming defer
qemu-system-x86_64: 'incoming' and 'loadvm' options are mutually exclusive
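
The validation itself boils down to a conflict check on the two options
(a sketch; the surrounding option bookkeeping is omitted):

    /* Sketch: reject the conflicting combination up front */
    if (incoming && loadvm) {
        error_report("'incoming' and 'loadvm' options are mutually exclusive");
        exit(EXIT_FAILURE);
    }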

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231201142520.32255-3-kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Kevin Wolf
d3007d348a block: Fix crash when loading snapshot on inactive node
bdrv_is_read_only() only checks if the node is configured to be
read-only eventually, but even if it returns false, writing to the node
may not be permitted at the moment (because it's inactive).

bdrv_is_writable() checks that the node can be written to right now, and
this is what the snapshot operations really need.

Change bdrv_can_snapshot() to use bdrv_is_writable() to fix crashes like
the following:

$ ./qemu-system-x86_64 -hda /tmp/test.qcow2 -loadvm foo -incoming defer
qemu-system-x86_64: ../block/io.c:1990: int bdrv_co_write_req_prepare(BdrvChild *, int64_t, int64_t, BdrvTrackedRequest *, int): Assertion `!(bs->open_flags & BDRV_O_INACTIVE)' failed.

The resulting error message after this patch isn't perfect yet, but at
least it doesn't crash any more:

$ ./qemu-system-x86_64 -hda /tmp/test.qcow2 -loadvm foo -incoming defer
qemu-system-x86_64: Device 'ide0-hd0' is writable but does not support snapshots
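
The essence of the fix is swapping the predicate in bdrv_can_snapshot()
(diff-style sketch, not the exact hunk):

  -    if (... || bdrv_is_read_only(bs)) {
  +    if (... || !bdrv_is_writable(bs)) {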

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231201142520.32255-2-kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
6b5aa3a0fd virtio-blk: don't lock AioContext in the submission code path
There is no need to acquire the AioContext lock around blk_aio_*() or
blk_get_geometry() anymore. I/O plugging (defer_call()) also does not
require the AioContext lock anymore.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230914140101.1065008-5-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
c1135913c6 virtio-blk: don't lock AioContext in the completion code path
Nothing in the completion code path relies on the AioContext lock
anymore. Virtqueues are only accessed from one thread at any moment and
the s->rq global state is protected by its own lock now.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230914140101.1065008-4-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
9c67f33fca virtio-blk: add lock to protect s->rq
s->rq is accessed from IO_CODE and GLOBAL_STATE_CODE. Introduce a lock
to protect s->rq and eliminate reliance on the AioContext lock.
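
The pattern, sketched with QEMU's lock-guard idiom (assuming a
QemuMutex named rq_lock alongside s->rq):

    /* Sketch: s->rq, the list of failed requests to be restarted, is
     * only touched with its dedicated lock held. */
    WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
        req->next = s->rq;
        s->rq = req;
    }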

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230914140101.1065008-3-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
cd0c0db0aa block/file-posix: set up Linux AIO and io_uring in the current thread
The file-posix block driver currently only sets up Linux AIO and
io_uring in the BDS's AioContext. In the multi-queue block layer we must
be able to submit I/O requests in AioContexts that do not have Linux AIO
and io_uring set up yet since any thread can call into the block driver.

Set up Linux AIO and io_uring for the current AioContext during request
submission. We lose the ability to return an error from
.bdrv_file_open() when Linux AIO and io_uring setup fails (e.g. due to
resource limits). Instead the user only gets warnings and we fall back
to aio=threads. This is still better than a fatal error after startup.
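
Sketched, the submission path gains a lazy-setup step; both helper
names below are assumptions, not the actual API:

    /* Sketch: ensure this AioContext has a Linux AIO context before
     * submitting; degrade to the thread-pool path on failure.
     * aio_ctx_has_linux_aio() and aio_ctx_setup_linux_aio() are
     * assumed names. */
    if (s->use_linux_aio && !aio_ctx_has_linux_aio(ctx)) {
        Error *local_err = NULL;
        if (!aio_ctx_setup_linux_aio(ctx, &local_err)) {
            warn_report_err(local_err);
            s->use_linux_aio = false;   /* fall back to aio=threads */
        }
    }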

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230914140101.1065008-2-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 22:49:27 +01:00
Stefan Hajnoczi
7075d23511 nbd/server: introduce NBDClient->lock to protect fields
NBDClient has a number of fields that are accessed by both the export
AioContext and the main loop thread. When the AioContext lock is removed
these fields will need another form of protection.

Add NBDClient->lock and protect fields that are accessed by both
threads. Also add assertions where possible, and otherwise add doc
comments stating which thread may access a field and which lock must be
held.

Note this patch moves the client->recv_coroutine assertion from
nbd_co_receive_request() to nbd_trip() where client->lock is held.
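
The access pattern, sketched (field names per the commit message):

    /* Sketch: fields shared between the export AioContext and the main
     * loop are only read or written with client->lock held. */
    WITH_QEMU_LOCK_GUARD(&client->lock) {
        client->recv_coroutine = qemu_coroutine_self();
    }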

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231221192452.1785567-7-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 21:59:10 +01:00
Stefan Hajnoczi
f816310d0c nbd/server: only traverse NBDExport->clients from main loop thread
The NBD clients list is currently accessed from both the export
AioContext and the main loop thread. When the AioContext lock is removed
there will be nothing protecting the clients list.

Adding a lock around the clients list is tricky because NBDClient
structs are refcounted and may be freed from the export AioContext or
the main loop thread. nbd_export_request_shutdown() -> client_close() ->
nbd_client_put() is also tricky because the list lock would be held
while indirectly dropping references to NBDClients.

A simpler approach is to only allow nbd_client_put() and client_close()
calls from the main loop thread. Then the NBD clients list is only
accessed from the main loop thread and no fancy locking is needed.

nbd_trip() just needs to reschedule itself in the main loop AioContext
before calling nbd_client_put() and client_close(). This costs more CPU
cycles per NBD request so add nbd_client_put_nonzero() to optimize the
common case where more references to NBDClient remain.

Note that nbd_client_get() can still be called from either thread, so
make NBDClient->refcount atomic.
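
The fast path can be sketched as a compare-and-swap loop that refuses
to drop the last reference (illustrative; the real helper may differ):

    /* Sketch: decrement the refcount unless it would hit zero; the
     * caller falls back to the main-loop slow path when this fails. */
    static bool nbd_client_put_nonzero(NBDClient *client)
    {
        int expected = qatomic_read(&client->refcount);
        int actual;

        for (;;) {
            if (expected == 1) {
                return false;  /* last ref: caller takes the slow path */
            }
            actual = qatomic_cmpxchg(&client->refcount, expected,
                                     expected - 1);
            if (actual == expected) {
                return true;   /* reference dropped, others remain */
            }
            expected = actual; /* raced with another thread; retry */
        }
    }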

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231221192452.1785567-6-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 21:59:07 +01:00
Stefan Hajnoczi
efade66d58 nbd/server: avoid per-NBDRequest nbd_client_get/put()
nbd_trip() processes a single NBD request from start to finish and holds
an NBDClient reference throughout. NBDRequest does not outlive the scope
of nbd_trip(). Therefore it is unnecessary to ref/unref NBDClient for
each NBDRequest.

Removing these nbd_client_get()/nbd_client_put() calls will make
thread-safety easier in the commits that follow.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20231221192452.1785567-5-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-12-21 21:59:03 +01:00
Bibo Mao
be45144bee target/loongarch: Add timer information dump support
Timer emulation is sometimes problematic, especially when the VM is
running in KVM mode. This patch adds support for dumping the registers
related to the timer hardware, so that such problems are easier to find.

Signed-off-by: Bibo Mao <maobibo@loongson.cn>
Reviewed-by: Song Gao <gaosong@loongson.cn>
Message-Id: <20231206081839.2290178-1-maobibo@loongson.cn>
Signed-off-by: Song Gao <gaosong@loongson.cn>
2023-12-21 16:07:47 +08:00
Bibo Mao
5786162996 hw/loongarch/virt: Align high memory base address with super page size
With the LoongArch virt machine, there is a low memory region at
0--0x10000000 and a high memory region starting at 0x90000000. The high
memory base is currently aligned to 256M; aligning it to 1G is better,
since that is the super page alignment for a 4K page size.

Currently the Linux kernel and the UEFI BIOS place no limitation on the
high memory base address, so it is fine to set the high memory base
address to 0x80000000.

Signed-off-by: Bibo Mao <maobibo@loongson.cn>
Reviewed-by: Song Gao <gaosong@loongson.cn>
Message-Id: <20231127040231.4123715-1-maobibo@loongson.cn>
Signed-off-by: Song Gao <gaosong@loongson.cn>
2023-12-21 16:07:47 +08:00
Stefan Hajnoczi
191710c221 Merge tag 'pull-request-2023-12-20' of https://gitlab.com/thuth/qemu into staging
* Add compat machines for QEMU 9.0
* Some header clean-ups by Philippe
* Restrict type names to alphanumerical range (and a few special characters)
* Fix analyze-migration.py script on s390x
* Clean up and improve some tests
* Document handling of commas in CLI options parameters

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEEJ7iIR+7gJQEY8+q5LtnXdP5wLbUFAmWCtYsRHHRodXRoQHJl
# ZGhhdC5jb20ACgkQLtnXdP5wLbWLnw//cNJrxG0V+j0iakX+C7HRumVrLBDI4KYY
# Cp2Hx92SyeQ0Kk8DJS6JueTV0SLjMsV77APu2YPH7ELmPlk+CB9gqmV7xVoYNvsm
# QbRPlIjFw8MHLekadc2A+C+pn48tWACoOdBEDIfazKrxybnf0B57RC/fIfMKHjbs
# 2ALCoFbbgphs7yWuzTHK8ayKaGMhUVkWfzHQwpnq899olHyZBhkl951uKJA6VmLx
# KvggePkpszLjmmXA8MH1hDCcizki31cB0ZKTbQFCyE42s2S3Hvg0GueU90O7Y1cj
# lS5tPVQxyEhUYMLL+/hudlf2OYqVn2BalB7ieUQIy6rG8yoc9zxfIKQi0ccl+2oA
# s8HRq5S0bSjtilQogU1LQL/Gk6W1/N9MmnhKvCGB+BTK5KX7s4EQk02y9gGZm/8s
# pMErMyaXTG4dLiTAK42VgMVDqCYvzBmE+Gj91OmoUR7fb+VMrsWxeBFxMPDn+VtL
# TMJegIFsjw2QCSitcU4v+nP0qtKgXGbuZtrGXKabrxH5PmeQFJDSM7TwpTK4qvjK
# QMIQKBbz8BfJnUzN8qAaaJEpp1T5tcMJClKtfcgxq/+VyaSaHLmD0cljqBC+g+y7
# FTo+fa7oYx44sAlqapdEXBSGn4T+J26iuCef13CCCiPfYBv/tk3b2E0AWHj4y58I
# +VpInjUaPBQ=
# =TA1/
# -----END PGP SIGNATURE-----
# gpg: Signature made Wed 20 Dec 2023 04:36:11 EST
# gpg:                using RSA key 27B88847EEE0250118F3EAB92ED9D774FE702DB5
# gpg:                issuer "thuth@redhat.com"
# gpg: Good signature from "Thomas Huth <th.huth@gmx.de>" [full]
# gpg:                 aka "Thomas Huth <thuth@redhat.com>" [full]
# gpg:                 aka "Thomas Huth <huth@tuxfamily.org>" [full]
# gpg:                 aka "Thomas Huth <th.huth@posteo.de>" [unknown]
# Primary key fingerprint: 27B8 8847 EEE0 2501 18F3  EAB9 2ED9 D774 FE70 2DB5

* tag 'pull-request-2023-12-20' of https://gitlab.com/thuth/qemu:
  tests/unit/test-qmp-event: Replace fixture by global variables
  tests/unit/test-qmp-event: Simplify event emission check
  tests/unit/test-qmp-event: Drop superfluous mutex
  tests/qtest/npcm7xx_pwm-test: Only do full testing in slow mode
  qemu-options: Clarify handling of commas in options parameters
  tests/qtest/migration-test: Fix analyze-migration.py for s390x
  qom/object: Limit type names to alphanumerical and some few special characters
  tests/unit/test-io-task: Rename "qemu:dummy" to avoid colon in the name
  memory: Remove "qemu:" prefix from the "qemu:ram-discard-manager" type name
  hw: Replace anti-social QOM type names (again)
  docs/system/arm: Fix for rename of type "xlnx.bbram-ctrl"
  target: Restrict 'sysemu/reset.h' to system emulation
  hw/s390x/ipl: Remove unused 'exec/exec-all.h' included header
  hw/misc/mips_itu: Remove unnecessary 'exec/exec-all.h' header
  hw/ppc/spapr_hcall: Remove unused 'exec/exec-all.h' included header
  system/qtest: Restrict QTest API to system emulation
  system/qtest: Include missing 'hw/core/cpu.h' header
  MAINTAINERS: Add some more vmware-related files to the corresponding section
  hw: Add compat machines for 9.0

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-20 09:40:16 -05:00
Stefan Hajnoczi
63d6632512 Merge tag 'pull-target-arm-20231219' of https://git.linaro.org/people/pmaydell/qemu-arm into staging
target-arm queue:
 * arm/kvm: drop the split between "common KVM support" and
   "64-bit KVM support", since 32-bit Arm KVM no longer exists
 * arm/kvm: clean up APIs to be consistent about CPU arguments
 * Don't implement *32_EL2 registers when EL1 is AArch64 only
 * Restrict DC CVAP & DC CVADP instructions to TCG accel
 * Restrict TCG specific helpers
 * Propagate MDCR_EL2.HPMN into PMCR_EL0.N
 * Include missing 'exec/exec-all.h' header
 * fsl-imx: add simple RTC emulation for i.MX6 and i.MX7 boards

# -----BEGIN PGP SIGNATURE-----
#
# iQJNBAABCAA3FiEE4aXFk81BneKOgxXPPCUl7RQ2DN4FAmWB6o0ZHHBldGVyLm1h
# eWRlbGxAbGluYXJvLm9yZwAKCRA8JSXtFDYM3mxMEACRpRxJ81pLs8fFYC5BgRhU
# BCxr+ZqarBygzsH9YWUN2TFFKlEZi7mLu6lzFsfN/qEmYCg8VslPbulQHqcGkx51
# kVxXFp/KuGlKt4zGRagZUJxgYAwwU5mnK6dTZT5/ZF6yWX67dXn8V7MP9lqqEPw5
# 5gut7Mu4f7MiAQbwZY1CWP+iu5uZmdsBuKxA6zkxOWJh/A1SfaqQRO6xVQttLAxS
# DPMTpQGmwPS4I+3gGNnqlSu6etp2tdy2K0cW3fhMp6hx70uNMHmFNzRhT/6TaKka
# 9AqXQsFHQiFXDGAm6PmCvfQI6KpLljDyNL/TuUkQWi72bGEHjUsJAdG0aXVOa30W
# uC7vuJkdZrP/t5P1AkZhWQUrlawDRV2YHNDD+gY4fxJL/STkGyU6M8R1nm1J+InN
# n0SeK0VHRC6DRPXCMQhC5QwKUH6ZjFZRs/r2opTu9p+ThQAQRmZBiVfdISCDMYnN
# DCiSb78gIFaUkwtiP44qq8MJQjsHnXtTD1Akqyo2fXSKs66jDK9Gnc8gENYdpghe
# 7V36bOp6scROHOB2a/r8gT42RKzSN6uh6xByaaToza63/bPgvHnn8vvQQbB01AgX
# zJC1xs3dwY8JMyqDefda0K0NDPS8TzNsXYmgxxxcQJpUvB4VVjet9VIMF3T+d8HO
# Pas41Z1gsQY+rcaRk/9mPA==
# =GWIA
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 19 Dec 2023 14:10:05 EST
# gpg:                using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE
# gpg:                issuer "peter.maydell@linaro.org"
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [full]
# gpg:                 aka "Peter Maydell <pmaydell@gmail.com>" [full]
# gpg:                 aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [full]
# gpg:                 aka "Peter Maydell <peter@archaic.org.uk>" [unknown]
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83  15CF 3C25 25ED 1436 0CDE

* tag 'pull-target-arm-20231219' of https://git.linaro.org/people/pmaydell/qemu-arm: (43 commits)
  fsl-imx: add simple RTC emulation for i.MX6 and i.MX7 boards
  target/arm/helper: Propagate MDCR_EL2.HPMN into PMCR_EL0.N
  target/arm/tcg: Including missing 'exec/exec-all.h' header
  target/arm: Restrict DC CVAP & DC CVADP instructions to TCG accel
  target/arm: Restrict TCG specific helpers
  target/arm: Don't implement *32_EL2 registers when EL1 is AArch64 only
  target/arm/kvm: Have kvm_arm_hw_debug_active take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_handle_debug take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_handle_dabt_nisv take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_verify_ext_dabt_pending take a ARMCPU arg
  target/arm/kvm: Have kvm_arm_[get|put]_virtual_time take ARMCPU argument
  target/arm/kvm: Have kvm_arm_vcpu_finalize take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_vcpu_init take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_pmu_set_irq take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_pmu_init take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_pvtime_init take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_set_device_attr take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_sve_get_vls take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_sve_set_vls take a ARMCPU argument
  target/arm/kvm: Have kvm_arm_add_vcpu_properties take a ARMCPU argument
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-20 09:39:45 -05:00
Stefan Hajnoczi
dd7d3e3540 Merge tag 'pull-vfio-20231219' of https://github.com/legoater/qemu into staging
vfio queue:

* Introduce an IOMMU interface backend for VFIO devices
* Convert IOMMU type1 and sPAPR IOMMU to respective backends
* Introduce a new IOMMUFD backend for ARM, x86_64 and s390x platforms

# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmWB34AACgkQUaNDx8/7
# 7KGOMxAAqXegvAneHqIlu4c8TzTuUR2rkYgev9RdfIHRDuY2XtaX14xlWn/rpTXZ
# qSgeta+iT8Cv4YV1POJeHWFDNs9E29p1w+R7nLcH1qTIIaZHtxwbVVQ3s7kAo1Vb
# 1S1G0/zIznzGVI50a0lj1gO2yQJnu/79nXpnICgA5REW0CscMssnvboQODlwq17V
# ZLNVM8CSAvKl6ppkmzRdfNXCfq6x7bf4MsvnuXsqda4TBbvyyTjAqdo/8sjKiGly
# gSDQqhgy6cvEXIF0UUHPJzFApf0YdXUDlL8hzH90hvRVu4W/t24dPmT7UkVIX9Ek
# TA7RVxv7iJlHtFDqfSTAJFr7nKO9Tm2V9N7xbD1OJUKrMoPZRT6+0R1hMKqsZ5z+
# nG6khqHGzuo/aI9n70YxYIPXt+vs/EHI4WUtslGLUTL0xv8lUzk6cxyIJupFRmDS
# ix6GM9TXOV8RyOveL2knHVymlFnAR6dekkMB+6ljUTuzDwG0oco4vno8z9bi7Vct
# j36bM56U3lhY+w+Ljoy0gPwgrw/FROnGG3mp1mwp1KRHqtEDnUQu8CaLbJOBsBGE
# JJDP6AKAYMczdmYVkd4CvE0WaeSxtOUxW5H5NCPjtaFQt0qEcght2lA2K15g521q
# jeojoJ/QK5949jnNCqm1Z66/YQVL79lPyL0E+mxEohwu+yTORk4=
# =U0x5
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 19 Dec 2023 13:22:56 EST
# gpg:                using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [unknown]
# gpg:                 aka "Cédric Le Goater <clg@kaod.org>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B  0B60 51A3 43C7 CFFB ECA1

* tag 'pull-vfio-20231219' of https://github.com/legoater/qemu: (47 commits)
  hw/ppc/Kconfig: Imply VFIO_PCI
  docs/devel: Add VFIO iommufd backend documentation
  vfio: Introduce a helper function to initialize VFIODevice
  vfio/ccw: Move VFIODevice initializations in vfio_ccw_instance_init
  vfio/ap: Move VFIODevice initializations in vfio_ap_instance_init
  vfio/platform: Move VFIODevice initializations in vfio_platform_instance_init
  vfio/pci: Move VFIODevice initializations in vfio_instance_init
  hw/i386: Activate IOMMUFD for q35 machines
  kconfig: Activate IOMMUFD for s390x machines
  hw/arm: Activate IOMMUFD for virt machines
  vfio: Make VFIOContainerBase pointer parameter const in VFIOIOMMUOps callbacks
  vfio/ccw: Make vfio cdev pre-openable by passing a file handle
  vfio/ccw: Allow the selection of a given iommu backend
  vfio/ap: Make vfio cdev pre-openable by passing a file handle
  vfio/ap: Allow the selection of a given iommu backend
  vfio/platform: Make vfio cdev pre-openable by passing a file handle
  vfio/platform: Allow the selection of a given iommu backend
  vfio/pci: Make vfio cdev pre-openable by passing a file handle
  vfio/pci: Allow the selection of a given iommu backend
  vfio/iommufd: Enable pci hot reset through iommufd cdev interface
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-20 09:39:18 -05:00
Markus Armbruster
17b2ecc331 tests/unit/test-qmp-event: Replace fixture by global variables
The fixture buys us exactly nothing, as we need a global variable
anyway, for test_qapi_event_emit().  Drop it.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-ID: <20231122072456.2518816-4-armbru@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Markus Armbruster
5712b7e4fb tests/unit/test-qmp-event: Simplify event emission check
The generated qapi_event_send_FOO() functions call an event emitter function.
It's test_qapi_event_emit() in this test.  It compares the actual
event to the expected event, and sets a flag to record it was called.
The test functions set expected data and clear the flag before calling
qapi_event_send_FOO(), and check the flag afterwards.

Make test_qapi_event_emit() consume expected data, and the test
functions check it was consumed.  Delete the flag.  This is simpler.
It also catches extraneous calls of test_qapi_event_emit().  Catching
that is not worthwhile, but since the cost is negative...

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-ID: <20231122072456.2518816-3-armbru@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Markus Armbruster
c363764a60 tests/unit/test-qmp-event: Drop superfluous mutex
Mutex @test_event_lock is held from fixture setup to teardown,
protecting global variable @test_event_data.  But tests always run one
after the other, so this is superfluous.  It also confuses Coverity.
Drop the mutex.

Fixes: CID 1527425
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-ID: <20231122072456.2518816-2-armbru@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Thomas Huth
71dc6ca2a8 tests/qtest/npcm7xx_pwm-test: Only do full testing in slow mode
The npcm7xx_pwm-test can take quite a while when running with
--enable-debug on a loaded system. The tests here are quite
repetitive - by default it should be fine if we only execute
some of them, and only execute all of them when running in slow testing mode.

Message-ID: <20231215143524.49241-1-thuth@redhat.com>
Reviewed-by: "Daniel P. Berrangé" <berrange@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Yihuan Pan
fd49b2153e qemu-options: Clarify handling of commas in options parameters
Provide explicit guidance on dealing with option parameters as arbitrary
strings containing commas, such as in "file=my,file" and "string=a,b". The
updated documentation emphasizes the need to double commas when they
appear within such parameters.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1839
Signed-off-by: Yihuan Pan <xun794@gmail.com>
Message-ID: <20231213141706.629833-2-xun794@gmail.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Thomas Huth
81c2c9dd5d tests/qtest/migration-test: Fix analyze-migration.py for s390x
The migration stream on s390x contains data for the storage_attributes
which the analyze-migration.py script cannot handle yet. Add the basic code
for handling this, so we can re-enable the check in the migration-test.

Message-ID: <20231120113951.162090-1-thuth@redhat.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Thomas Huth
b447378e12 qom/object: Limit type names to alphanumerical and some few special characters
QOM names currently don't have any enforced naming rules. This
can be problematic, e.g. when they are used on the command line
for the "-device" option (where the comma is used to separate
properties). To avoid that such problematic type names come in
again, let's restrict the set of acceptable characters during the
type registration.

Ideally, we'd apply here the same rules as for QAPI, i.e. all type
names should begin with a letter, and contain only ASCII letters,
digits, hyphen, and underscore. However, we already have so many
pre-existing types like:

    486-x86_64-cpu
    cfi.pflash01
    power5+_v2.1-spapr-cpu-core
    virt-2.6-machine
    pc-i440fx-3.0-machine

... so that we have to allow "." and "+" for now, too. While the
dot is used in a lot of places, the "+" can fortunately be limited
to two classes of legacy names ("power" and "Sun-UltraSparc" CPUs).

We also cannot enforce the rule that names must start with a letter
yet, since there are lot of types that start with a digit. Still,
at least limiting the first characters to the alphanumerical range
should be way better than nothing.
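
The shape of the check, as a self-contained sketch (the function name
is made up; the real check would live in qom/object.c):

    #include <ctype.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Sketch: first character alphanumerical; the rest may also be
     * '-', '_', '.' or '+' to grandfather the legacy names above. */
    static bool type_name_is_valid(const char *name)
    {
        if (name[0] == '\0' || !isalnum((unsigned char)name[0])) {
            return false;
        }
        for (size_t i = 1; name[i] != '\0'; i++) {
            unsigned char c = (unsigned char)name[i];
            if (!isalnum(c) && c != '-' && c != '_' && c != '.' && c != '+') {
                return false;
            }
        }
        return true;
    }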

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20231117114457.177308-6-thuth@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Thomas Huth
05f2320d40 tests/unit/test-io-task: Rename "qemu:dummy" to avoid colon in the name
Type names should not contain special characters like ":" (so that
they are easier to use with QAPI and other parts). We are going to
forbid such names in an upcoming patch. Thus let's replace the ":"
here with a "-".

Reviewed-by: "Daniel P. Berrangé" <berrange@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20231117114457.177308-5-thuth@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Thomas Huth
a36ea38abd memory: Remove "qemu:" prefix from the "qemu:ram-discard-manager" type name
Type names should not contain special characters like ":". Let's
remove the whole prefix here since it does not really seem to be
helpful to have such a prefix here. The type name is only used
internally for an interface type, so the renaming should not affect
the user interface or migration.

Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20231117114457.177308-4-thuth@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Markus Armbruster
c455e011c6 hw: Replace anti-social QOM type names (again)
QOM type names containing ',' result in awful UI.  We got rid of them
in v6.0.0 (commit e178113ff6 hw: Replace anti-social QOM type names).
A few have crept back since:

    xlnx,cframe-reg
    xlnx,efuse
    xlnx,pmc-efuse-cache
    xlnx,versal-cfu-apb
    xlnx,versal-cfu-fdro
    xlnx,versal-cfu-sfr
    xlnx,versal-crl
    xlnx,versal-efuse
    xlnx,zynqmp-efuse

These are all device types.  They can't be plugged with -device /
device_add, except for "xlnx,efuse" (I'm not sure that one is
intentional).

They *can* be used with -device / device_add to request help.
Usability is poor, though: you have to double the comma, like this:

    $ qemu-system-aarch64 -device xlnx,,pmc-efuse-cache,help

They can also be used with -global, where you must *not* double the
comma:

    $ qemu-system-aarch64 -global xlnx,efuse.drive-index=2

Trap for the unwary.

"xlnx,efuse", "xlnx,versal-efuse", "xlnx,pmc-efuse-cache",
"xlnx-zynqmp-efuse" are from v6.2.0, "xlnx,versal-crl" is from v7.1.0,
and the remainder are new.

Rename them all to "xlnx-FOO", like commit e178113ff6 did.

Reported-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Francisco Iglesias <francisco.iglesias@amd.com>
Message-ID: <20231117114457.177308-3-thuth@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Markus Armbruster
bce9bbc3c9 docs/system/arm: Fix for rename of type "xlnx.bbram-ctrl"
Fixes: b65b4b7ae3 (xlnx-bbram: hw/nvram: Use dot in device type name)
Signed-off-by: Markus Armbruster <armbru@redhat.com>
[thuth: Use longhand syntax to avoid problems with the "." in the name]
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Message-ID: <20231117114457.177308-2-thuth@redhat.com>
Acked-by: Alistair Francis <alistair.francis@wdc.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Philippe Mathieu-Daudé
c0f6cd9f67 target: Restrict 'sysemu/reset.h' to system emulation
vCPU "reset" is only possible with system emulation.

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Warner Losh <imp@bsdimp.com>
Reviewed-by: Song Gao <gaosong@loongson.cn>
Message-ID: <20231212113640.30287-5-philmd@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Zhao Liu <zhao1.liu@intel.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Philippe Mathieu-Daudé
e1c85e5f0f hw/s390x/ipl: Remove unused 'exec/exec-all.h' included header
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20231212113640.30287-4-philmd@linaro.org>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Philippe Mathieu-Daudé
b5570da734 hw/misc/mips_itu: Remove unnecessary 'exec/exec-all.h' header
mips_itu.c only requires declarations from "hw/core/cpu.h"
and "cpu.h". Avoid including the huge "exec/exec-all.h" header.

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20231212113640.30287-3-philmd@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Philippe Mathieu-Daudé
43562e1882 hw/ppc/spapr_hcall: Remove unused 'exec/exec-all.h' included header
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20231212113640.30287-2-philmd@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Philippe Mathieu-Daudé
60144cf509 system/qtest: Restrict QTest API to system emulation
Outside of system emulation, only qtest_enabled() can be used.

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20231212113016.29808-3-philmd@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Philippe Mathieu-Daudé
65eac5bd54 system/qtest: Include missing 'hw/core/cpu.h' header
"hw/core/cpu.h" declares 'first_cpu'. Include it to avoid
build failures when unrelated headers are refactored:

  system/qtest.c:548:33: error: use of undeclared identifier 'first_cpu'
            address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
                                ^

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20231212113016.29808-2-philmd@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Thomas Huth
6c4937245d MAINTAINERS: Add some more vmware-related files to the corresponding section
These files are obviously related to VMware emulation, so let's list
them in the corresponding section in the MAINTAINERS file.

Message-ID: <20231107102738.14797-1-thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Cornelia Huck
2b10a6760e hw: Add compat machines for 9.0
Add 9.0 machine types for arm/i440fx/m68k/q35/s390x/spapr.

Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Message-ID: <20231120094259.1191804-1-cohuck@redhat.com>
Acked-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Acked-by: Eric Farman <farman@linux.ibm.com>  # s390x
Signed-off-by: Thomas Huth <thuth@redhat.com>
2023-12-20 10:29:23 +01:00
Cédric Le Goater
4278df9d1d hw/ppc/Kconfig: Imply VFIO_PCI
When the legacy and iommufd backends were introduced, a set of common
vfio-pci routines were exported in pci.c for both backends to use :

  vfio_pci_pre_reset
  vfio_pci_get_pci_hot_reset_info
  vfio_pci_host_match
  vfio_pci_post_reset

This introduced a build failure on PPC when --without-default-devices
is used, because VFIO is always selected in ppc/Kconfig but VFIO_PCI is
not.

Use an 'imply VFIO_PCI' in ppc/Kconfig and bypass compilation of the
VFIO EEH hook routines defined in hw/ppc/spapr_pci_vfio.c with
CONFIG_VFIO_PCI.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
98dad2b019 docs/devel: Add VFIO iommufd backend documentation
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
6106a32914 vfio: Introduce a helper function to initialize VFIODevice
Introduce a helper function to replace the common code to initialize
VFIODevice in the pci, platform, ap and ccw VFIO devices.

No functional change intended.

Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
c12b55ad6f vfio/ccw: Move VFIODevice initializations in vfio_ccw_instance_init
Some of the VFIODevice initializations are in vfio_ccw_realize;
move all of them into vfio_ccw_instance_init.

No functional change intended.

Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
cbbcc2f170 vfio/ap: Move VFIODevice initializations in vfio_ap_instance_init
Some of the VFIODevice initializations are in vfio_ap_realize;
move all of them into vfio_ap_instance_init.

No functional change intended.

Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
a0cf44c8d6 vfio/platform: Move VFIODevice initializations in vfio_platform_instance_init
Some of the VFIODevice initializations are in vfio_platform_realize;
move all of them into vfio_platform_instance_init.

No functional change intended.

Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
dd2fcb1716 vfio/pci: Move VFIODevice initializations in vfio_instance_init
Some of the VFIODevice initializations are in vfio_realize;
move all of them into vfio_instance_init.

No functional change intended.

Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Cédric Le Goater
64ad06f6eb hw/i386: Activate IOMMUFD for q35 machines
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Cédric Le Goater
73e2df6693 kconfig: Activate IOMMUFD for s390x machines
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Cédric Le Goater
0970238343 hw/arm: Activate IOMMUFD for virt machines
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
4517c33c31 vfio: Make VFIOContainerBase pointer parameter const in VFIOIOMMUOps callbacks
Some of the callbacks in VFIOIOMMUOps take a VFIOContainerBase pointer,
but those callbacks only need read access to the sub-objects of
VFIOContainerBase. So make the VFIOContainerBase, VFIOContainer and
VFIOIOMMUFDContainer parameters const in these callbacks.

Local functions called by those callbacks need the same changes to
avoid build errors.

Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
909a6254ed vfio/ccw: Make vfio cdev pre-openable by passing a file handle
This gives management tools like libvirt a chance to open the vfio
cdev with privilege and pass the FD to QEMU. This way QEMU never needs
to have privileges to open a VFIO or iommu cdev node.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
e70f971a6c vfio/ccw: Allow the selection of a given iommu backend
Now that we support two types of IOMMU backends, add the capability to
select one of them. The selection depends on whether an iommufd object
has been linked with the vfio-ccw device:

If the user wants to use the legacy backend, they shall not
link the vfio-ccw device with any iommufd object:

 -device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX

This is called the legacy mode/backend.

If the user wants to use the iommufd backend (/dev/iommu) it
shall pass an iommufd object id in the vfio-ccw device options:

 -object iommufd,id=iommufd0
 -device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
5e7ba401b7 vfio/ap: Make vfio cdev pre-openable by passing a file handle
This gives management tools like libvirt a chance to open the vfio
cdev with privilege and pass the FD to QEMU. This way QEMU never needs
the privilege to open a VFIO or iommu cdev node.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
336f308958 vfio/ap: Allow the selection of a given iommu backend
Now that we support two types of iommu backends, let's add the
capability to select one of them. This depends on whether an iommufd
object has been linked with the vfio-ap device:

If the user wants to use the legacy backend, they shall not
link the vfio-ap device with any iommufd object:

 -device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX

This is called the legacy mode/backend.

If the user wants to use the iommufd backend (/dev/iommu), they
shall pass an iommufd object id in the vfio-ap device options:

 -object iommufd,id=iommufd0
 -device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
3016e60f8f vfio/platform: Make vfio cdev pre-openable by passing a file handle
This gives management tools like libvirt a chance to open the vfio
cdev with privilege and pass the FD to QEMU. This way QEMU never needs
the privilege to open a VFIO or iommu cdev node.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
a6c50e1c3f vfio/platform: Allow the selection of a given iommu backend
Now that we support two types of iommu backends, let's add the
capability to select one of them. This depends on whether an iommufd
object has been linked with the vfio-platform device:

If the user wants to use the legacy backend, they shall not
link the vfio-platform device with any iommufd object:

 -device vfio-platform,host=XXX

This is called the legacy mode/backend.

If the user wants to use the iommufd backend (/dev/iommu), they
shall pass an iommufd object id in the vfio-platform device options:

 -object iommufd,id=iommufd0
 -device vfio-platform,host=XXX,iommufd=iommufd0

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
da3e04b26f vfio/pci: Make vfio cdev pre-openable by passing a file handle
This gives management tools like libvirt a chance to open the vfio
cdev with privilege and pass the FD to QEMU. This way QEMU never needs
the privilege to open a VFIO or iommu cdev node.

Together with the earlier support for pre-opening the /dev/iommu
device, we now have full support for passing a vfio device to an
unprivileged QEMU via a management tool. This mode is not considered
for the legacy backend, so let's remove the "TODO" comment.

Add helper functions vfio_device_set_fd() and vfio_device_get_name()
to set the fd and get the device name; they will also be used by other
vfio devices.

There is no easy way to check if a device is an mdev with FD passing,
so fail the x-balloon-allowed check unconditionally in this case.

There is also no easy way to get the BDF as a name with FD passing, so
we fake a name of the form VFIO_FD[fd].
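
As a sketch, the resulting invocation could look like this (the fd
values are illustrative and would be supplied by the management tool):

 -object iommufd,id=iommufd0,fd=22
 -device vfio-pci,iommufd=iommufd0,fd=23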

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
ee42b261b0 vfio/pci: Allow the selection of a given iommu backend
Now that we support two types of iommu backends, let's add the
capability to select one of them. This depends on whether an iommufd
object has been linked with the vfio-pci device:

If the user wants to use the legacy backend, they shall not
link the vfio-pci device with any iommufd object:

 -device vfio-pci,host=0000:02:00.0

This is called the legacy mode/backend.

If the user wants to use the iommufd backend (/dev/iommu), they
shall pass an iommufd object id in the vfio-pci device options:

 -object iommufd,id=iommufd0
 -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
96d6f85ff0 vfio/iommufd: Enable pci hot reset through iommufd cdev interface
Implement the newly introduced pci_hot_reset callback, named
iommufd_cdev_pci_hot_reset, to do the iommufd-specific checks and
reset operation.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
c328e7e8ad vfio/pci: Introduce a vfio pci hot reset interface
Legacy vfio-pci and the iommufd cdev backend have different processes
for hot resetting a vfio device. Expand the current code to abstract
out a pci_hot_reset callback for legacy vfio; this same interface will
also be used by the iommufd cdev vfio device.

Rename vfio_pci_hot_reset to vfio_legacy_pci_hot_reset and move it
into container.c.

vfio_pci_[pre/post]_reset and vfio_pci_host_match are exported so
they can be called from the legacy and iommufd pci_hot_reset
callbacks.
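
A hedged sketch of the callback wiring described above (struct shape
and signature assumed from this message, not copied from the tree):

#include <stdbool.h>

typedef struct VFIODevice VFIODevice;

int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single);

typedef struct VFIOIOMMUOps {
    /* ... other callbacks elided ... */
    int (*pci_hot_reset)(VFIODevice *vbasedev, bool single);
} VFIOIOMMUOps;

/* The legacy container would wire in the renamed implementation: */
static const VFIOIOMMUOps vfio_legacy_ops_sketch = {
    .pci_hot_reset = vfio_legacy_pci_hot_reset,
};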

Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
4d36ec23a7 vfio/pci: Extract out a helper vfio_pci_get_pci_hot_reset_info
This helper will be used by both legacy and iommufd backends.

No functional changes intended.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
714e9affa8 vfio/iommufd: Add support for iova_ranges and pgsizes
Some vIOMMUs such as virtio-iommu use host-side IOVA ranges to set up
reserved ranges for a passthrough device, so that the guest will not
use an IOVA range beyond what the host supports.

Use an IOMMUFD uAPI to get the host-side IOVA ranges and pass them to
the vIOMMU just like the legacy backend does; if this fails, fall back
to the full 64-bit IOVA range.

Also use the out_iova_alignment returned by the uAPI as pgsizes instead
of falling back to qemu_real_host_page_size().
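
A hedged sketch of the uAPI interaction (struct and ioctl names as I
understand them from <linux/iommufd.h>; error handling trimmed):

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int query_iova_ranges_sketch(int iommufd, uint32_t ioas_id)
{
    struct iommu_iova_range *ranges;
    struct iommu_ioas_iova_ranges query = {
        .size = sizeof(query),
        .ioas_id = ioas_id,
    };

    /* First call with num_iovas == 0: the kernel reports how many
     * ranges exist (and fails with EMSGSIZE since none of them fit). */
    if (ioctl(iommufd, IOMMU_IOAS_IOVA_RANGES, &query) && errno != EMSGSIZE) {
        return -errno;        /* caller falls back to a full 64-bit range */
    }

    ranges = calloc(query.num_iovas, sizeof(*ranges));
    if (!ranges) {
        return -ENOMEM;
    }
    query.allowed_iovas = (uintptr_t)ranges;

    /* Second call fills the array; out_iova_alignment serves as pgsizes. */
    if (ioctl(iommufd, IOMMU_IOAS_IOVA_RANGES, &query)) {
        free(ranges);
        return -errno;
    }

    /* ... hand ranges/query.out_iova_alignment to the vIOMMU setup ... */
    free(ranges);
    return 0;
}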

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
36e84d0c17 vfio/iommufd: Relax assert check for iommufd backend
Currently iommufd doesn't support dirty page sync yet, but that will
not block us from doing live migration if VFIO migration is
force-enabled.

So in this case we allow set_dirty_page_tracking to be NULL. Note that
we don't need the same change for query_dirty_bitmap because, when
dirty page sync isn't supported, query_dirty_bitmap will never be
called.
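
A minimal sketch of the relaxed dispatch, with stand-in types for the
VFIOContainerBase/VFIOIOMMUOps names used in this series (not the exact
QEMU code):

#include <assert.h>
#include <stdbool.h>

typedef struct VFIOContainerBase VFIOContainerBase;

typedef struct VFIOIOMMUOps {
    int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start);
} VFIOIOMMUOps;

struct VFIOContainerBase {
    const VFIOIOMMUOps *ops;
    bool dirty_pages_supported;
};

static int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
                                                  bool start)
{
    if (!bcontainer->dirty_pages_supported) {
        return 0;    /* e.g. iommufd without dirty page sync support */
    }
    /* Backends that do support dirty page sync must provide the callback. */
    assert(bcontainer->ops->set_dirty_page_tracking);
    return bcontainer->ops->set_dirty_page_tracking(bcontainer, start);
}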

Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Yi Liu
5ee3dc7af7 vfio/iommufd: Implement the iommufd backend
The iommufd backend is implemented based on the new /dev/iommu user API.
This backend obviously depends on CONFIG_IOMMUFD.

So far, the iommufd backend doesn't support dirty page sync yet.

Co-authored-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
1eae5b7bd3 vfio/common: return early if space isn't empty
This is a trivial optimization. If there is an active container in the
space, vfio_reset_handler will never be unregistered, so invert the
check of space->containers and return early.
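
The shape of the change, as a hedged sketch assuming QEMU's QLIST
macros and reset API (names follow the surrounding VFIO code):

static void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (!QLIST_EMPTY(&space->containers)) {
        return;   /* an active container keeps vfio_reset_handler registered */
    }

    QLIST_REMOVE(space, list);
    g_free(space);

    if (QLIST_EMPTY(&vfio_address_spaces)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}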

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Yi Liu
d6b5c4c1b5 util/char_dev: Add open_cdev()
/dev/vfio/devices/vfioX may not exist. In that case it is still
possible to open /dev/char/$major:$minor instead. Add a helper
function to abstract the cdev open.
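
A minimal sketch of the fallback idea, not the exact helper added by
this patch (path construction and validation are simplified):

#include <fcntl.h>
#include <stdio.h>
#include <sys/sysmacros.h>
#include <sys/types.h>

static int open_cdev_sketch(const char *devpath, dev_t cdev)
{
    char path[64];
    int fd = open(devpath, O_RDWR);     /* e.g. /dev/vfio/devices/vfio0 */

    if (fd < 0 && cdev != 0) {
        snprintf(path, sizeof(path), "/dev/char/%u:%u",
                 major(cdev), minor(cdev));
        fd = open(path, O_RDWR);        /* fall back to the char-dev node */
    }
    return fd;
}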

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
6e6d8ac62b backends/iommufd: Introduce the iommufd object
Introduce an iommufd object which allows the interaction
with the host /dev/iommu device.

The /dev/iommu device may already have been pre-opened outside of
QEMU, in which case the fd can be passed directly along with the
iommufd object.
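
For instance (the fd value is illustrative):

 -object iommufd,id=iommufd0,fd=22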

This allows the iommufd object to be shared across several
subsystems (VFIO, VDPA, ...). For example, libvirt would open
the /dev/iommu once.

If no fd is passed along with the iommufd object, /dev/iommu
is opened by the QEMU code.

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
dbb9d0c969 vfio/spapr: Move hostwin_list into spapr container
No functional changes intended.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
6ad359ec29 vfio/spapr: Move prereg_listener into spapr container
No functional changes intended.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
233309e8e4 vfio/spapr: switch to spapr IOMMU BE add/del_section_window
No functional change intended.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
9b7d38bf5a vfio/spapr: Introduce spapr backend and target interface
Introduce an empty spapr backend which will hold spapr-specific
content, currently only prereg_listener and hostwin_list.

Also introduce two spapr-specific callbacks, add/del_window, into
VFIOIOMMUOps. Instantiate the spapr ops with a helper, setup_spapr_ops,
and assign it to bcontainer->ops.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
1eb31f13b2 vfio/container: Implement attach/detach_device
No functional change intended.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
f79baf8c95 vfio/container: Move iova_ranges to base container
Meanwhile, remove the helper function vfio_free_container, as it now
only calls g_free().

No functional change intended.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
4d6b95010c vfio/container: Move dirty_pgsizes and max_dirty_bitmap_size to base container
No functional change intended.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
c7b313d300 vfio/container: Move listener to base container
Move the listener to the base container. The error and initialized
fields are moved at the same time.

No functional change intended.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
dc74a4b005 vfio/container: Move vrdl_list to base container
No functional change intended.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
7ab1cb74ff vfio/container: Move pgsizes and dma_max_mappings to base container
No functional change intended.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
e1cac6b203 vfio/container: Convert functions to base container
In the prospect of getting rid of VFIOContainer references in
common.c, let's convert miscellaneous functions to use the base
container object instead:

vfio_devices_all_dirty_tracking
vfio_devices_all_device_dirty_tracking
vfio_devices_all_running_and_mig_active
vfio_devices_query_dirty_bitmap
vfio_get_dirty_bitmap

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
3e6015d111 vfio/container: Move per container device list in base container
The VFIODevice is also changed to point to the base container instead
of the legacy container.

No functional change intended.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
bb424490ed vfio/container: Switch to IOMMU BE set_dirty_page_tracking/query_dirty_bitmap API
The dirty_pages_supported field is also moved to the base container.

No functional change intended.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
e559706338 vfio/container: Move space field to base container
Move the space field to the base object. Also, the VFIOAddressSpace
now contains a list of base containers.

No functional change intended.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
dddf83ab99 vfio/common: Move giommu_list in base container
Move the giommu_list field into the base container and store
the base container in the VFIOGuestIOMMU.

No functional change intended.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
ed2f7f8017 vfio/common: Introduce vfio_container_init/destroy helper
This adds two helper functions, vfio_container_init() and
vfio_container_destroy(), which will be used by both legacy and
iommufd containers to do base-container-specific initialization and
release.

No functional change intended.

Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Eric Auger
b08501a999 vfio/container: Switch to dma_map|unmap API
No functional change intended.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
d246685791 vfio/container: Introduce an empty VFIOIOMMUOps
This empty VFIOIOMMUOps, named vfio_legacy_ops, will hold all the
general IOMMU ops of the legacy container.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Zhenzhong Duan
f61dddd732 vfio: Introduce base object for VFIOContainer and targeted interface
Introduce a dumb VFIOContainerBase object and its targeted interface.
This is deliberately not a QOM object because we don't want it to be
visible from the user interface. The VFIOContainerBase and its
interfaces will be smoothly populated in subsequent patches.

No functional change intended.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-12-19 19:03:38 +01:00
Nikita Ostrenkov
6f9c3aaa34 fsl-imx: add simple RTC emulation for i.MX6 and i.MX7 boards
Signed-off-by: Nikita Ostrenkov <n.ostrenkov@gmail.com>
Message-id: 20231216133408.2884-1-n.ostrenkov@gmail.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 18:03:32 +00:00
Jean-Philippe Brucker
6980c31dec target/arm/helper: Propagate MDCR_EL2.HPMN into PMCR_EL0.N
MDCR_EL2.HPMN allows a hypervisor to limit the number of PMU counters
available to EL1 and EL0 (keeping the others for itself). QEMU already
implements this split correctly, except for PMCR_EL0.N reads: the
number of counters read by EL1 or EL0 should be the one configured in
MDCR_EL2.HPMN.
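
As a hedged illustration of the rule, a standalone helper (not QEMU's
actual implementation):

#include <stdbool.h>

/* Number of PMU counters visible when reading PMCR_EL0.N. */
static unsigned pmcr_n_visible(unsigned current_el, bool el2_present,
                               unsigned pmcr_n, unsigned hpmn)
{
    if (el2_present && current_el < 2) {
        return hpmn;    /* EL1/EL0 see only MDCR_EL2.HPMN counters */
    }
    return pmcr_n;      /* EL2/EL3 see the full implemented set */
}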

Cc: qemu-stable@nongnu.org
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Message-id: 20231215144652.4193815-2-jean-philippe@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 18:03:32 +00:00
Philippe Mathieu-Daudé
47eac5d423 target/arm/tcg: Include missing 'exec/exec-all.h' header
translate_insn() ends up calling probe_access_full(), itself
declared in "exec/exec-all.h":

  TranslatorOps::translate_insn
    -> aarch64_tr_translate_insn()
      -> is_guarded_page()
        -> probe_access_full()

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20231130142519.28417-4-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:49 +00:00
Philippe Mathieu-Daudé
7a3014a9a2 target/arm: Restrict DC CVAP & DC CVADP instructions to TCG accel
Hardware accelerators handle that in *hardware*.

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20231130142519.28417-3-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:49 +00:00
Philippe Mathieu-Daudé
d1d119bbd7 target/arm: Restrict TCG specific helpers
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20231130142519.28417-2-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:48 +00:00
Peter Maydell
c36a0d577b target/arm: Don't implement *32_EL2 registers when EL1 is AArch64 only
The system registers DBGVCR32_EL2, FPEXC32_EL2, DACR32_EL2 and
IFSR32_EL2 are present only to allow an AArch64 EL2 or EL3 to read
and write the contents of an AArch32-only system register.  The
architecture requires that they are present only when EL1 can be
AArch32, but we implement them unconditionally.  This was OK when all
our CPUs supported AArch32 EL1, but we have quite a lot of CPU models
now which only support AArch64 at EL1:
 a64fx
 cortex-a76
 cortex-a710
 neoverse-n1
 neoverse-n2
 neoverse-v1

Only define these registers for CPUs which allow AArch32 EL1.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20231121144605.3980419-1-peter.maydell@linaro.org
2023-12-19 17:57:48 +00:00
Philippe Mathieu-Daudé
366bf10e11 target/arm/kvm: Have kvm_arm_hw_debug_active take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-17-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:48 +00:00
Philippe Mathieu-Daudé
39639275a1 target/arm/kvm: Have kvm_arm_handle_debug take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-16-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:47 +00:00
Philippe Mathieu-Daudé
3187e06a82 target/arm/kvm: Have kvm_arm_handle_dabt_nisv take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-15-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:47 +00:00
Philippe Mathieu-Daudé
ca0d1b7ca4 target/arm/kvm: Have kvm_arm_verify_ext_dabt_pending take an ARMCPU arg
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-14-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:47 +00:00
Philippe Mathieu-Daudé
76acc98799 target/arm/kvm: Have kvm_arm_[get|put]_virtual_time take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-13-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:47 +00:00
Philippe Mathieu-Daudé
0d31a63186 target/arm/kvm: Have kvm_arm_vcpu_finalize take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-12-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:46 +00:00
Philippe Mathieu-Daudé
bbb22d5865 target/arm/kvm: Have kvm_arm_vcpu_init take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-11-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:46 +00:00
Philippe Mathieu-Daudé
5ed84f3bf7 target/arm/kvm: Have kvm_arm_pmu_set_irq take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-10-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:46 +00:00
Philippe Mathieu-Daudé
d344f5ba87 target/arm/kvm: Have kvm_arm_pmu_init take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-9-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:46 +00:00
Philippe Mathieu-Daudé
55503372c3 target/arm/kvm: Have kvm_arm_pvtime_init take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-8-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:45 +00:00
Philippe Mathieu-Daudé
e77034f74b target/arm/kvm: Have kvm_arm_set_device_attr take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-7-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:45 +00:00
Philippe Mathieu-Daudé
d63392820b target/arm/kvm: Have kvm_arm_sve_get_vls take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-6-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:45 +00:00
Philippe Mathieu-Daudé
bc1b09b3db target/arm/kvm: Have kvm_arm_sve_set_vls take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-5-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:45 +00:00
Philippe Mathieu-Daudé
cac675b54e target/arm/kvm: Have kvm_arm_add_vcpu_properties take an ARMCPU argument
Unify the "kvm_arm.h" API: All functions related to ARM vCPUs
take an ARMCPU* argument. Use the CPU() QOM cast macro when
calling the generic vCPU API from "sysemu/kvm.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-4-philmd@linaro.org
[PMM: fix parameter name in doc comment too]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:44 +00:00
Philippe Mathieu-Daudé
600f070e93 target/arm/kvm: Remove unused includes
Both MemoryRegion and Error types are forward declared
in "qemu/typedefs.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-3-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:44 +00:00
Philippe Mathieu-Daudé
5db30bcd7d hw/intc/arm_gicv3: Include missing 'qemu/error-report.h' header
kvm_arm_its_reset_hold() calls warn_report(), itself declared
in "qemu/error-report.h".

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Message-id: 20231123183518.64569-2-philmd@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:44 +00:00
Richard Henderson
396b6c50c3 target/arm/kvm: Unexport kvm_arm_vm_state_change
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:43 +00:00
Richard Henderson
71c34911c1 target/arm/kvm: Unexport and tidy kvm_arm_sync_mpstate_to_{kvm, qemu}
Drop the fprintfs and actually use the return values in the callers.
This is OK to do since commit 7191f24c7f, which added the error check
to the generic accel/kvm functions that eventually call into these
ones.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
[PMM: tweak commit message]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:43 +00:00
Richard Henderson
353e03cd45 target/arm/kvm: Unexport kvm_{get,put}_vcpu_events
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:43 +00:00
Richard Henderson
51641de468 target/arm/kvm: Init cap_has_inject_serror_esr in kvm_arch_init
There is no need to do this in kvm_arch_init_vcpu per vCPU.
Inline kvm_arm_init_serror_injection rather than keeping it separate.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:43 +00:00
Richard Henderson
09ddc01216 target/arm/kvm: Unexport kvm_arm_init_cpreg_list
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:42 +00:00
Richard Henderson
c223c67aaa target/arm/kvm: Unexport kvm_arm_vcpu_finalize
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:42 +00:00
Richard Henderson
5a8a6013ea target/arm/kvm: Unexport kvm_arm_vcpu_init
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:42 +00:00
Richard Henderson
de3c96017f target/arm/kvm: Merge kvm64.c into kvm.c
Since kvm32.c was removed, there is no need to keep them separate.
This will allow more symbols to be unexported.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
[PMM: retain copyright lines from kvm64.c in kvm.c]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:42 +00:00
Richard Henderson
f38ce925eb target/arm/kvm: Move kvm_arm_reg_syncs_via_cpreg_list and unexport
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
[PMM: merged two duplicate comments, as suggested by Gavin]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:41 +00:00
Richard Henderson
676fe6846f target/arm/kvm: Move kvm_arm_cpreg_level and unexport
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:41 +00:00
Richard Henderson
d213f5adf5 target/arm/kvm: Use a switch for kvm_arm_cpreg_level
Use a switch instead of a linear search through the data.
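
Illustratively (the register ID below is hypothetical; KVM_PUT_* are
QEMU's state-sync levels from "sysemu/kvm.h"), the lookup becomes:

#include <stdint.h>
#include "sysemu/kvm.h"

static int kvm_arm_cpreg_level_sketch(uint64_t regidx)
{
    /* Hedged sketch: a switch replaces walking a (regidx, level) table. */
    switch (regidx) {
    case 0x1234ULL:                  /* hypothetical full-state register */
        return KVM_PUT_FULL_STATE;
    default:
        return KVM_PUT_RUNTIME_STATE;
    }
}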

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:41 +00:00
Richard Henderson
dc40d45ebd target/arm/kvm: Move kvm_arm_get_host_cpu_features and unexport
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:41 +00:00
Richard Henderson
21beccd384 target/arm/kvm: Inline kvm_arm_steal_time_supported
This function is only used once, and is quite simple.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:40 +00:00
Richard Henderson
46512471c4 target/arm/kvm: Unexport kvm_arm_{get, put}_virtual_time
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:40 +00:00
Richard Henderson
5cba8f26de target/arm/kvm: Move kvm_arm_handle_debug and unexport
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:40 +00:00
Richard Henderson
ea79c59989 target/arm/kvm: Move kvm_arm_hw_debug_active and unexport
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:40 +00:00
Richard Henderson
ec4145f728 target/arm/kvm: Move kvm_arm_copy_hw_debug_data and unexport
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:39 +00:00
Richard Henderson
20c83dc9ed target/arm/kvm: Move kvm_arm_verify_ext_dabt_pending and unexport
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:39 +00:00
Richard Henderson
dd2157d291 target/arm/kvm: Merge kvm_arm_init_debug into kvm_arch_init
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:39 +00:00
Chao Du
b16620c32c target/arm: kvm64: remove a redundant KVM_CAP_SET_GUEST_DEBUG probe
KVM_CAP_SET_GUEST_DEBUG is probed during kvm_init(); gdbserver will
fail to start if the capability is not supported. So, like other
targets, there is no need to make another probe here.

Signed-off-by: Chao Du <duchao@eswincomputing.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20231025070726.22689-1-duchao@eswincomputing.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:39 +00:00
Richard Henderson
16617c3cba accel/kvm: Make kvm_has_guest_debug static
This variable is not used or declared outside kvm-all.c.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-12-19 17:57:38 +00:00
Stefan Hajnoczi
bd00730ec0 Open 9.0 development tree
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2023-12-19 09:46:22 -05:00
446 changed files with 7564 additions and 8093 deletions

View File

@@ -647,10 +647,7 @@ pages:
- mkdir -p public
# HTML-ised source tree
- make gtags
# We unset variables to work around a bug in some htags versions
# which causes it to fail when the environment is large
- CI_COMMIT_MESSAGE= CI_COMMIT_TAG_MESSAGE= htags
-anT --tree-view=filetree -m qemu_init
- htags -anT --tree-view=filetree -m qemu_init
-t "Welcome to the QEMU sourcecode"
- mv HTML public/src
# Project documentation

View File

@@ -13,7 +13,7 @@
.cirrus_build_job:
extends: .base_job_template
stage: build
image: registry.gitlab.com/libvirt/libvirt-ci/cirrus-run:latest
image: registry.gitlab.com/libvirt/libvirt-ci/cirrus-run:master
needs: []
# 20 mins larger than "timeout_in" in cirrus/build.yml
# as there's often a 5-10 minute delay before Cirrus CI
@@ -52,7 +52,7 @@ x64-freebsd-13-build:
NAME: freebsd-13
CIRRUS_VM_INSTANCE_TYPE: freebsd_instance
CIRRUS_VM_IMAGE_SELECTOR: image_family
CIRRUS_VM_IMAGE_NAME: freebsd-13-3
CIRRUS_VM_IMAGE_NAME: freebsd-13-2
CIRRUS_VM_CPUS: 8
CIRRUS_VM_RAM: 8G
UPDATE_COMMAND: pkg update; pkg upgrade -y

View File

@@ -21,7 +21,7 @@ build_task:
install_script:
- @UPDATE_COMMAND@
- @INSTALL_COMMAND@ @PKGS@
- if test -n "@PYPI_PKGS@" ; then PYLIB=$(@PYTHON@ -c 'import sysconfig; print(sysconfig.get_path("stdlib"))'); rm -f $PYLIB/EXTERNALLY-MANAGED; @PIP3@ install @PYPI_PKGS@ ; fi
- if test -n "@PYPI_PKGS@" ; then @PIP3@ install @PYPI_PKGS@ ; fi
clone_script:
- git clone --depth 100 "$CI_REPOSITORY_URL" .
- git fetch origin "$CI_COMMIT_REF_NAME"

View File

@@ -88,6 +88,7 @@
$MINGW_TARGET-libpng
$MINGW_TARGET-libssh
$MINGW_TARGET-libtasn1
$MINGW_TARGET-libusb
$MINGW_TARGET-lzo2
$MINGW_TARGET-nettle
$MINGW_TARGET-ninja
@@ -97,8 +98,9 @@
$MINGW_TARGET-SDL2
$MINGW_TARGET-SDL2_image
$MINGW_TARGET-snappy
$MINGW_TARGET-zstd
$EXTRA_PACKAGES "
$MINGW_TARGET-spice
$MINGW_TARGET-usbredir
$MINGW_TARGET-zstd "
- Write-Output "Running build at $(Get-Date -Format u)"
- $env:CHERE_INVOKING = 'yes' # Preserve the current working directory
- $env:MSYS = 'winsymlinks:native' # Enable native Windows symlink
@@ -121,8 +123,6 @@ msys2-64bit:
variables:
MINGW_TARGET: mingw-w64-x86_64
MSYSTEM: MINGW64
# msys2 only ship these packages for 64-bit, not 32-bit
EXTRA_PACKAGES: $MINGW_TARGET-libusb $MINGW_TARGET-usbredir $MINGW_TARGET-spice
# do not remove "--without-default-devices"!
# commit 9f8e6cad65a6 ("gitlab-ci: Speed up the msys2-64bit job by using --without-default-devices"
# changed to compile QEMU with the --without-default-devices switch
@@ -131,3 +131,11 @@ msys2-64bit:
# qTests don't run successfully with "--without-default-devices",
# so let's exclude the qtests from CI for now.
TEST_ARGS: --no-suite qtest
msys2-32bit:
extends: .shared_msys2_builder
variables:
MINGW_TARGET: mingw-w64-i686
MSYSTEM: MINGW32
CONFIGURE_ARGS: --target-list=ppc64-softmmu -Ddebug=false -Doptimization=0
TEST_ARGS: --no-suite qtest

View File

@@ -81,6 +81,7 @@ Greg Kurz <groug@kaod.org> <gkurz@linux.vnet.ibm.com>
Huacai Chen <chenhuacai@kernel.org> <chenhc@lemote.com>
Huacai Chen <chenhuacai@kernel.org> <chenhuacai@loongson.cn>
James Hogan <jhogan@kernel.org> <james.hogan@imgtec.com>
Juan Quintela <quintela@trasno.org> <quintela@redhat.com>
Leif Lindholm <quic_llindhol@quicinc.com> <leif.lindholm@linaro.org>
Leif Lindholm <quic_llindhol@quicinc.com> <leif@nuviainc.com>
Luc Michel <luc@lmichel.fr> <luc.michel@git.antfield.fr>

View File

@@ -5,21 +5,16 @@
# Required
version: 2
# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.11"
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
# We recommend specifying your dependencies to enable reproducible builds:
# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: docs/requirements.txt
# We want all the document formats
formats: all
# For consistency, we require that QEMU's Sphinx extensions
# run with at least the same minimum version of Python that
# we require for other Python in our codebase (our conf.py
# enforces this, and some code needs it.)
python:
version: 3.6

View File

@@ -70,7 +70,6 @@ R: Daniel P. Berrangé <berrange@redhat.com>
R: Thomas Huth <thuth@redhat.com>
R: Markus Armbruster <armbru@redhat.com>
R: Philippe Mathieu-Daudé <philmd@linaro.org>
R: Juan Quintela <quintela@redhat.com>
W: https://www.qemu.org/docs/master/devel/index.html
S: Odd Fixes
F: docs/devel/style.rst
@@ -2167,6 +2166,17 @@ F: hw/vfio/ap.c
F: docs/system/s390x/vfio-ap.rst
L: qemu-s390x@nongnu.org
iommufd
M: Yi Liu <yi.l.liu@intel.com>
M: Eric Auger <eric.auger@redhat.com>
M: Zhenzhong Duan <zhenzhong.duan@intel.com>
S: Supported
F: backends/iommufd.c
F: include/sysemu/iommufd.h
F: include/qemu/chardev_open.h
F: util/chardev_open.c
F: docs/devel/vfio-iommufd.rst
vhost
M: Michael S. Tsirkin <mst@redhat.com>
S: Supported
@@ -2388,8 +2398,13 @@ F: hw/net/net_tx_pkt*
Vmware
M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
S: Maintained
F: docs/specs/vmw_pvscsi-spec.txt
F: hw/display/vmware_vga.c
F: hw/net/vmxnet*
F: hw/scsi/vmw_pvscsi*
F: pc-bios/efi-vmxnet3.rom
F: pc-bios/vgabios-vmware.bin
F: roms/config.vga-vmware
F: tests/qtest/vmxnet3-test.c
F: docs/specs/vmw_pvscsi-spec.rst
@@ -3339,10 +3354,8 @@ S: Odd Fixes
F: scripts/checkpatch.pl
Migration
M: Juan Quintela <quintela@redhat.com>
M: Peter Xu <peterx@redhat.com>
M: Fabiano Rosas <farosas@suse.de>
R: Leonardo Bras <leobras@redhat.com>
S: Maintained
F: hw/core/vmstate-if.c
F: include/hw/vmstate-if.h
@@ -3359,10 +3372,8 @@ F: util/userfaultfd.c
X: migration/rdma*
RDMA Migration
M: Juan Quintela <quintela@redhat.com>
R: Li Zhijian <lizhijian@fujitsu.com>
R: Peter Xu <peterx@redhat.com>
R: Leonardo Bras <leobras@redhat.com>
S: Odd Fixes
F: migration/rdma*

View File

@@ -1 +1 @@
8.2.3
8.2.50

View File

@@ -69,16 +69,6 @@
#define KVM_GUESTDBG_BLOCKIRQ 0
#endif
//#define DEBUG_KVM
#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
do { } while (0)
#endif
struct KVMParkedVcpu {
unsigned long vcpu_id;
int kvm_fd;
@@ -98,7 +88,7 @@ bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_msi_use_devid;
bool kvm_has_guest_debug;
static bool kvm_has_guest_debug;
static int kvm_sstep_flags;
static bool kvm_immediate_exit;
static hwaddr kvm_max_slot_size = ~0;
@@ -331,7 +321,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
struct KVMParkedVcpu *vcpu = NULL;
int ret = 0;
DPRINTF("kvm_destroy_vcpu\n");
trace_kvm_destroy_vcpu();
ret = kvm_arch_destroy_vcpu(cpu);
if (ret < 0) {
@@ -341,7 +331,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
ret = mmap_size;
DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
trace_kvm_failed_get_vcpu_mmap_size();
goto err;
}
@@ -443,7 +433,6 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
if (cpu->kvm_dirty_gfns == MAP_FAILED) {
ret = -errno;
DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
goto err;
}
}
@@ -2821,7 +2810,7 @@ int kvm_cpu_exec(CPUState *cpu)
struct kvm_run *run = cpu->kvm_run;
int ret, run_ret;
DPRINTF("kvm_cpu_exec()\n");
trace_kvm_cpu_exec();
if (kvm_arch_process_async_events(cpu)) {
qatomic_set(&cpu->exit_request, 0);
@@ -2848,7 +2837,7 @@ int kvm_cpu_exec(CPUState *cpu)
kvm_arch_pre_run(cpu, run);
if (qatomic_read(&cpu->exit_request)) {
DPRINTF("interrupt exit requested\n");
trace_kvm_interrupt_exit_request();
/*
* KVM requires us to reenter the kernel after IO exits to complete
* instruction emulation. This self-signal will ensure that we
@@ -2878,7 +2867,7 @@ int kvm_cpu_exec(CPUState *cpu)
if (run_ret < 0) {
if (run_ret == -EINTR || run_ret == -EAGAIN) {
DPRINTF("io window exit\n");
trace_kvm_io_window_exit();
kvm_eat_signals(cpu);
ret = EXCP_INTERRUPT;
break;
@@ -2900,7 +2889,6 @@ int kvm_cpu_exec(CPUState *cpu)
trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
switch (run->exit_reason) {
case KVM_EXIT_IO:
DPRINTF("handle_io\n");
/* Called outside BQL */
kvm_handle_io(run->io.port, attrs,
(uint8_t *)run + run->io.data_offset,
@@ -2910,7 +2898,6 @@ int kvm_cpu_exec(CPUState *cpu)
ret = 0;
break;
case KVM_EXIT_MMIO:
DPRINTF("handle_mmio\n");
/* Called outside BQL */
address_space_rw(&address_space_memory,
run->mmio.phys_addr, attrs,
@@ -2920,11 +2907,9 @@ int kvm_cpu_exec(CPUState *cpu)
ret = 0;
break;
case KVM_EXIT_IRQ_WINDOW_OPEN:
DPRINTF("irq_window_open\n");
ret = EXCP_INTERRUPT;
break;
case KVM_EXIT_SHUTDOWN:
DPRINTF("shutdown\n");
qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
ret = EXCP_INTERRUPT;
break;
@@ -2959,6 +2944,7 @@ int kvm_cpu_exec(CPUState *cpu)
ret = 0;
break;
case KVM_EXIT_SYSTEM_EVENT:
trace_kvm_run_exit_system_event(cpu->cpu_index, run->system_event.type);
switch (run->system_event.type) {
case KVM_SYSTEM_EVENT_SHUTDOWN:
qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
@@ -2976,13 +2962,11 @@ int kvm_cpu_exec(CPUState *cpu)
ret = 0;
break;
default:
DPRINTF("kvm_arch_handle_exit\n");
ret = kvm_arch_handle_exit(cpu, run);
break;
}
break;
default:
DPRINTF("kvm_arch_handle_exit\n");
ret = kvm_arch_handle_exit(cpu, run);
break;
}

View File

@@ -25,4 +25,9 @@ kvm_dirty_ring_reaper(const char *s) "%s"
kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)"
kvm_dirty_ring_reaper_kick(const char *reason) "%s"
kvm_dirty_ring_flush(int finished) "%d"
kvm_destroy_vcpu(void) ""
kvm_failed_get_vcpu_mmap_size(void) ""
kvm_cpu_exec(void) ""
kvm_interrupt_exit_request(void) ""
kvm_io_window_exit(void) ""
kvm_run_exit_system_event(int cpu_index, uint32_t event_type) "cpu_index %d, system_event_type %"PRIu32

View File

@@ -183,7 +183,7 @@ static bool tb_lookup_cmp(const void *p, const void *d)
const TranslationBlock *tb = p;
const struct tb_desc *desc = d;
if (tb->pc == desc->pc &&
if ((tb_cflags(tb) & CF_PCREL || tb->pc == desc->pc) &&
tb_page_addr0(tb) == desc->page_addr0 &&
tb->cs_base == desc->cs_base &&
tb->flags == desc->flags &&
@@ -233,7 +233,7 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu, vaddr pc,
return NULL;
}
desc.page_addr0 = phys_pc;
h = tb_hash_func(phys_pc, pc,
h = tb_hash_func(phys_pc, (cflags & CF_PCREL ? 0 : pc),
flags, cs_base, cflags);
return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);
}

View File

@@ -47,7 +47,7 @@ static bool tb_cmp(const void *ap, const void *bp)
const TranslationBlock *a = ap;
const TranslationBlock *b = bp;
return (a->pc == b->pc &&
return ((tb_cflags(a) & CF_PCREL || a->pc == b->pc) &&
a->cs_base == b->cs_base &&
a->flags == b->flags &&
(tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
@@ -916,7 +916,7 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
/* remove the TB from the hash list */
phys_pc = tb_page_addr0(tb);
h = tb_hash_func(phys_pc, tb->pc,
h = tb_hash_func(phys_pc, (orig_cflags & CF_PCREL ? 0 : tb->pc),
tb->flags, tb->cs_base, orig_cflags);
if (!qht_remove(&tb_ctx.htable, tb, h)) {
return;
@@ -983,7 +983,7 @@ TranslationBlock *tb_link_page(TranslationBlock *tb)
tb_record(tb);
/* add in the hash table */
h = tb_hash_func(tb_page_addr0(tb), tb->pc,
h = tb_hash_func(tb_page_addr0(tb), (tb->cflags & CF_PCREL ? 0 : tb->pc),
tb->flags, tb->cs_base, tb->cflags);
qht_insert(&tb_ctx.htable, tb, h, &existing_tb);

View File

@@ -327,7 +327,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
gen_code_buf = tcg_ctx->code_gen_ptr;
tb->tc.ptr = tcg_splitwx_to_rx(gen_code_buf);
tb->pc = pc;
if (!(cflags & CF_PCREL)) {
tb->pc = pc;
}
tb->cs_base = cs_base;
tb->flags = flags;
tb->cflags = cflags;

View File

@@ -1744,7 +1744,7 @@ static AudioState *audio_init(Audiodev *dev, Error **errp)
if (driver) {
done = !audio_driver_init(s, driver, dev, errp);
} else {
error_setg(errp, "Unknown audio driver `%s'", drvname);
error_setg(errp, "Unknown audio driver `%s'\n", drvname);
}
if (!done) {
goto out;

View File

@@ -30,8 +30,7 @@ endforeach
if dbus_display
module_ss = ss.source_set()
module_ss.add(when: [gio, dbus_display1_dep, pixman],
if_true: files('dbusaudio.c'))
module_ss.add(when: [gio, pixman], if_true: files('dbusaudio.c'))
audio_modules += {'dbus': module_ss}
endif

View File

@@ -1 +1,5 @@
source tpm/Kconfig
config IOMMUFD
bool
depends on VFIO

View File

@@ -427,9 +427,7 @@ static int cryptodev_builtin_close_session(
CRYPTODEV_BACKEND_BUILTIN(backend);
CryptoDevBackendBuiltinSession *session;
if (session_id >= MAX_NUM_SESSIONS || !builtin->sessions[session_id]) {
return -VIRTIO_CRYPTO_INVSESS;
}
assert(session_id < MAX_NUM_SESSIONS && builtin->sessions[session_id]);
session = builtin->sessions[session_id];
if (session->cipher) {

View File

@@ -398,7 +398,6 @@ static void cryptodev_backend_set_ops(Object *obj, Visitor *v,
static void
cryptodev_backend_complete(UserCreatable *uc, Error **errp)
{
ERRP_GUARD();
CryptoDevBackend *backend = CRYPTODEV_BACKEND(uc);
CryptoDevBackendClass *bc = CRYPTODEV_BACKEND_GET_CLASS(uc);
uint32_t services;
@@ -407,20 +406,11 @@ cryptodev_backend_complete(UserCreatable *uc, Error **errp)
QTAILQ_INIT(&backend->opinfos);
value = backend->tc.buckets[THROTTLE_OPS_TOTAL].avg;
cryptodev_backend_set_throttle(backend, THROTTLE_OPS_TOTAL, value, errp);
if (*errp) {
return;
}
value = backend->tc.buckets[THROTTLE_BPS_TOTAL].avg;
cryptodev_backend_set_throttle(backend, THROTTLE_BPS_TOTAL, value, errp);
if (*errp) {
return;
}
if (bc->init) {
bc->init(backend, errp);
if (*errp) {
return;
}
}
services = backend->conf.crypto_services;

245
backends/iommufd.c Normal file
View File

@@ -0,0 +1,245 @@
/*
* iommufd container backend
*
* Copyright (C) 2023 Intel Corporation.
* Copyright Red Hat, Inc. 2023
*
* Authors: Yi Liu <yi.l.liu@intel.com>
* Eric Auger <eric.auger@redhat.com>
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
#include "qemu/osdep.h"
#include "sysemu/iommufd.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/module.h"
#include "qom/object_interfaces.h"
#include "qemu/error-report.h"
#include "monitor/monitor.h"
#include "trace.h"
#include <sys/ioctl.h>
#include <linux/iommufd.h>
static void iommufd_backend_init(Object *obj)
{
IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
be->fd = -1;
be->users = 0;
be->owned = true;
qemu_mutex_init(&be->lock);
}
static void iommufd_backend_finalize(Object *obj)
{
IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
if (be->owned) {
close(be->fd);
be->fd = -1;
}
}
static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
{
IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
int fd = -1;
fd = monitor_fd_param(monitor_cur(), str, errp);
if (fd == -1) {
error_prepend(errp, "Could not parse remote object fd %s:", str);
return;
}
qemu_mutex_lock(&be->lock);
be->fd = fd;
be->owned = false;
qemu_mutex_unlock(&be->lock);
trace_iommu_backend_set_fd(be->fd);
}
static bool iommufd_backend_can_be_deleted(UserCreatable *uc)
{
IOMMUFDBackend *be = IOMMUFD_BACKEND(uc);
return !be->users;
}
static void iommufd_backend_class_init(ObjectClass *oc, void *data)
{
UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
ucc->can_be_deleted = iommufd_backend_can_be_deleted;
object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
}
int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
{
int fd, ret = 0;
qemu_mutex_lock(&be->lock);
if (be->users == UINT32_MAX) {
error_setg(errp, "too many connections");
ret = -E2BIG;
goto out;
}
if (be->owned && !be->users) {
fd = qemu_open_old("/dev/iommu", O_RDWR);
if (fd < 0) {
error_setg_errno(errp, errno, "Failed to open /dev/iommu");
ret = fd;
goto out;
}
be->fd = fd;
}
be->users++;
out:
trace_iommufd_backend_connect(be->fd, be->owned,
be->users, ret);
qemu_mutex_unlock(&be->lock);
return ret;
}
void iommufd_backend_disconnect(IOMMUFDBackend *be)
{
qemu_mutex_lock(&be->lock);
if (!be->users) {
goto out;
}
be->users--;
if (!be->users && be->owned) {
close(be->fd);
be->fd = -1;
}
out:
trace_iommufd_backend_disconnect(be->fd, be->users);
qemu_mutex_unlock(&be->lock);
}
int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
Error **errp)
{
int ret, fd = be->fd;
struct iommu_ioas_alloc alloc_data = {
.size = sizeof(alloc_data),
.flags = 0,
};
ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
if (ret) {
error_setg_errno(errp, errno, "Failed to allocate ioas");
return ret;
}
*ioas_id = alloc_data.out_ioas_id;
trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
return ret;
}
void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
{
int ret, fd = be->fd;
struct iommu_destroy des = {
.size = sizeof(des),
.id = id,
};
ret = ioctl(fd, IOMMU_DESTROY, &des);
trace_iommufd_backend_free_id(fd, id, ret);
if (ret) {
error_report("Failed to free id: %u %m", id);
}
}
int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly)
{
int ret, fd = be->fd;
struct iommu_ioas_map map = {
.size = sizeof(map),
.flags = IOMMU_IOAS_MAP_READABLE |
IOMMU_IOAS_MAP_FIXED_IOVA,
.ioas_id = ioas_id,
.__reserved = 0,
.user_va = (uintptr_t)vaddr,
.iova = iova,
.length = size,
};
if (!readonly) {
map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
}
ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
vaddr, readonly, ret);
if (ret) {
ret = -errno;
/* TODO: Mapping hardware PCI BAR regions is not supported for now. */
if (errno == EFAULT) {
warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
} else {
error_report("IOMMU_IOAS_MAP failed: %m");
}
}
return ret;
}
int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
hwaddr iova, ram_addr_t size)
{
int ret, fd = be->fd;
struct iommu_ioas_unmap unmap = {
.size = sizeof(unmap),
.ioas_id = ioas_id,
.iova = iova,
.length = size,
};
ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
/*
 * IOMMUFD treats each mapping as an object, so unmapping a
 * nonexistent mapping is treated as deleting a nonexistent
 * object and returns ENOENT. This differs from the legacy
 * backend, which allows it. A vIOMMU may trigger a lot of
 * redundant unmapping; to avoid flooding the log, treat it
 * as success for IOMMUFD, just like the legacy backend.
 */
if (ret && errno == ENOENT) {
trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
ret = 0;
} else {
trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
}
if (ret) {
ret = -errno;
error_report("IOMMU_IOAS_UNMAP failed: %m");
}
return ret;
}
static const TypeInfo iommufd_backend_info = {
.name = TYPE_IOMMUFD_BACKEND,
.parent = TYPE_OBJECT,
.instance_size = sizeof(IOMMUFDBackend),
.instance_init = iommufd_backend_init,
.instance_finalize = iommufd_backend_finalize,
.class_size = sizeof(IOMMUFDBackendClass),
.class_init = iommufd_backend_class_init,
.interfaces = (InterfaceInfo[]) {
{ TYPE_USER_CREATABLE },
{ }
}
};
static void register_types(void)
{
type_register_static(&iommufd_backend_info);
}
type_init(register_types);
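
Putting the new file together, the intended call sequence is roughly the following; a sketch of the API above with error handling elided, not code from the tree. 'be' and 'host_buf' are assumed to exist, and the object itself would be created with something like `-object iommufd,id=iommufd0`, though the consumer side (VFIO device wiring) lands in other patches of the series:

/* Sketch: one connect/alloc/map/unmap/free/disconnect round trip.
 * Assumes IOMMUFDBackend *be and a host buffer host_buf already exist. */
uint32_t ioas_id;
hwaddr iova = 0x100000;
ram_addr_t size = 0x10000;

iommufd_backend_connect(be, &error_abort);       /* first user opens /dev/iommu */
iommufd_backend_alloc_ioas(be, &ioas_id, &error_abort);
iommufd_backend_map_dma(be, ioas_id, iova, size, host_buf, false /* rw */);
/* ... device DMA happens ... */
iommufd_backend_unmap_dma(be, ioas_id, iova, size);  /* ENOENT is tolerated */
iommufd_backend_free_id(be, ioas_id);
iommufd_backend_disconnect(be);                  /* last user closes the fd */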


@@ -20,6 +20,7 @@ if have_vhost_user
system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c'))
endif
system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c'))
system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
if have_vhost_user_crypto
system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c'))
endif


@@ -904,7 +904,7 @@ static void tpm_emulator_vm_state_change(void *opaque, bool running,
trace_tpm_emulator_vm_state_change(running, state);
if (!running || state != RUN_STATE_RUNNING || !tpm_emu->relock_storage) {
if (!running || !tpm_emu->relock_storage) {
return;
}


@@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void)
dbus_vmstate_post_load(int version_id) "version_id: %d"
dbus_vmstate_loading(const char *id) "id: %s"
dbus_vmstate_saving(const char *id) "id: %s"
# iommufd.c
iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)"
iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"

block.c (363 lines changed)

@@ -1616,16 +1616,10 @@ out:
g_free(gen_node_name);
}
/*
* The caller must always hold @bs AioContext lock, because this function calls
* bdrv_refresh_total_sectors() which polls when called from non-coroutine
* context.
*/
static int no_coroutine_fn GRAPH_UNLOCKED
bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
QDict *options, int open_flags, Error **errp)
{
AioContext *ctx;
Error *local_err = NULL;
int i, ret;
GLOBAL_STATE_CODE();
@@ -1673,21 +1667,15 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;
/* Get the context after .bdrv_open, it can change the context */
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not refresh total sector count");
aio_context_release(ctx);
return ret;
}
bdrv_graph_rdlock_main_loop();
bdrv_refresh_limits(bs, NULL, &local_err);
bdrv_graph_rdunlock_main_loop();
aio_context_release(ctx);
if (local_err) {
error_propagate(errp, local_err);
@@ -1708,12 +1696,12 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
open_failed:
bs->drv = NULL;
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
if (bs->file != NULL) {
bdrv_unref_child(bs, bs->file);
assert(!bs->file);
}
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
g_free(bs->opaque);
bs->opaque = NULL;
@@ -2908,7 +2896,7 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
* Replaces the node that a BdrvChild points to without updating permissions.
*
* If @new_bs is non-NULL, the parent of @child must already be drained through
* @child and the caller must hold the AioContext lock for @new_bs.
* @child.
*/
static void GRAPH_WRLOCK
bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs)
@@ -3048,9 +3036,8 @@ static TransactionActionDrv bdrv_attach_child_common_drv = {
*
* Returns new created child.
*
* The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
* @child_bs can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
* Both @parent_bs and @child_bs can move to a different AioContext in this
* function.
*/
static BdrvChild * GRAPH_WRLOCK
bdrv_attach_child_common(BlockDriverState *child_bs,
@@ -3062,7 +3049,7 @@ bdrv_attach_child_common(BlockDriverState *child_bs,
Transaction *tran, Error **errp)
{
BdrvChild *new_child;
AioContext *parent_ctx, *new_child_ctx;
AioContext *parent_ctx;
AioContext *child_ctx = bdrv_get_aio_context(child_bs);
assert(child_class->get_parent_desc);
@@ -3114,12 +3101,6 @@ bdrv_attach_child_common(BlockDriverState *child_bs,
}
}
new_child_ctx = bdrv_get_aio_context(child_bs);
if (new_child_ctx != child_ctx) {
aio_context_release(child_ctx);
aio_context_acquire(new_child_ctx);
}
bdrv_ref(child_bs);
/*
* Let every new BdrvChild start with a drained parent. Inserting the child
@@ -3149,20 +3130,14 @@ bdrv_attach_child_common(BlockDriverState *child_bs,
};
tran_add(tran, &bdrv_attach_child_common_drv, s);
if (new_child_ctx != child_ctx) {
aio_context_release(new_child_ctx);
aio_context_acquire(child_ctx);
}
return new_child;
}
/*
* Function doesn't update permissions, caller is responsible for this.
*
* The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
* @child_bs can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
* Both @parent_bs and @child_bs can move to a different AioContext in this
* function.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
@@ -3202,9 +3177,6 @@ bdrv_attach_child_noperm(BlockDriverState *parent_bs,
*
* On failure NULL is returned, errp is set and the reference to
* child_bs is also dropped.
*
* The caller must hold the AioContext lock @child_bs, but not that of @ctx
* (unless @child_bs is already in @ctx).
*/
BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
const char *child_name,
@@ -3244,9 +3216,6 @@ out:
*
* On failure NULL is returned, errp is set and the reference to
* child_bs is also dropped.
*
* If @parent_bs and @child_bs are in different AioContexts, the caller must
* hold the AioContext lock for @child_bs, but not for @parent_bs.
*/
BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
BlockDriverState *child_bs,
@@ -3436,9 +3405,8 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
*
* Function doesn't update permissions, caller is responsible for this.
*
* The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
* @child_bs can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
* Both @parent_bs and @child_bs can move to a different AioContext in this
* function.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
@@ -3531,9 +3499,8 @@ out:
}
/*
* The caller must hold the AioContext lock for @backing_hd. Both @bs and
* @backing_hd can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
* Both @bs and @backing_hd can move to a different AioContext in this
* function.
*
* If a backing child is already present (i.e. we're detaching a node), that
* child node must be drained.
@@ -3575,9 +3542,9 @@ int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
bdrv_ref(drain_bs);
bdrv_drained_begin(drain_bs);
bdrv_graph_wrlock(backing_hd);
bdrv_graph_wrlock();
ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp);
bdrv_graph_wrunlock(backing_hd);
bdrv_graph_wrunlock();
bdrv_drained_end(drain_bs);
bdrv_unref(drain_bs);
@@ -3592,8 +3559,6 @@ int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
* itself, all options starting with "${bdref_key}." are considered part of the
* BlockdevRef.
*
* The caller must hold the main AioContext lock.
*
* TODO Can this be unified with bdrv_open_image()?
*/
int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
@@ -3605,7 +3570,6 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
int ret = 0;
bool implicit_backing = false;
BlockDriverState *backing_hd;
AioContext *backing_hd_ctx;
QDict *options;
QDict *tmp_parent_options = NULL;
Error *local_err = NULL;
@@ -3691,11 +3655,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
/* Hook up the backing file link; drop our reference, bs owns the
* backing_hd reference now */
backing_hd_ctx = bdrv_get_aio_context(backing_hd);
aio_context_acquire(backing_hd_ctx);
ret = bdrv_set_backing_hd(bs, backing_hd, errp);
bdrv_unref(backing_hd);
aio_context_release(backing_hd_ctx);
if (ret < 0) {
goto free_exit;
@@ -3767,9 +3728,7 @@ done:
*
* The BlockdevRef will be removed from the options QDict.
*
* The caller must hold the lock of the main AioContext and no other AioContext.
* @parent can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
* @parent can move to a different AioContext in this function.
*/
BdrvChild *bdrv_open_child(const char *filename,
QDict *options, const char *bdref_key,
@@ -3780,7 +3739,6 @@ BdrvChild *bdrv_open_child(const char *filename,
{
BlockDriverState *bs;
BdrvChild *child;
AioContext *ctx;
GLOBAL_STATE_CODE();
@@ -3790,13 +3748,10 @@ BdrvChild *bdrv_open_child(const char *filename,
return NULL;
}
bdrv_graph_wrlock(NULL);
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
bdrv_graph_wrlock();
child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
errp);
aio_context_release(ctx);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
return child;
}
@@ -3804,9 +3759,7 @@ BdrvChild *bdrv_open_child(const char *filename,
/*
* Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
*
* The caller must hold the lock of the main AioContext and no other AioContext.
* @parent can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
* @parent can move to a different AioContext in this function.
*/
int bdrv_open_file_child(const char *filename,
QDict *options, const char *bdref_key,
@@ -3881,7 +3834,6 @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
int64_t total_size;
QemuOpts *opts = NULL;
BlockDriverState *bs_snapshot = NULL;
AioContext *ctx = bdrv_get_aio_context(bs);
int ret;
GLOBAL_STATE_CODE();
@@ -3890,9 +3842,7 @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
instead of opening 'filename' directly */
/* Get the required size from the image */
aio_context_acquire(ctx);
total_size = bdrv_getlength(bs);
aio_context_release(ctx);
if (total_size < 0) {
error_setg_errno(errp, -total_size, "Could not get image size");
@@ -3927,10 +3877,7 @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
goto out;
}
aio_context_acquire(ctx);
ret = bdrv_append(bs_snapshot, bs, errp);
aio_context_release(ctx);
if (ret < 0) {
bs_snapshot = NULL;
goto out;
@@ -3955,8 +3902,6 @@ out:
* The reference parameter may be used to specify an existing block device which
* should be opened. If specified, neither options nor a filename may be given,
* nor can an existing BDS be reused (that is, *pbs has to be NULL).
*
* The caller must always hold the main AioContext lock.
*/
static BlockDriverState * no_coroutine_fn
bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
@@ -3974,7 +3919,6 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
Error *local_err = NULL;
QDict *snapshot_options = NULL;
int snapshot_flags = 0;
AioContext *ctx = qemu_get_aio_context();
assert(!child_class || !flags);
assert(!child_class == !parent);
@@ -4115,12 +4059,10 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
/* Not requesting BLK_PERM_CONSISTENT_READ because we're only
* looking at the header to guess the image format. This works even
* in cases where a guest would not see a consistent state. */
ctx = bdrv_get_aio_context(file_bs);
aio_context_acquire(ctx);
AioContext *ctx = bdrv_get_aio_context(file_bs);
file = blk_new(ctx, 0, BLK_PERM_ALL);
blk_insert_bs(file, file_bs, &local_err);
bdrv_unref(file_bs);
aio_context_release(ctx);
if (local_err) {
goto fail;
@@ -4167,13 +4109,8 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
goto fail;
}
/* The AioContext could have changed during bdrv_open_common() */
ctx = bdrv_get_aio_context(bs);
if (file) {
aio_context_acquire(ctx);
blk_unref(file);
aio_context_release(ctx);
file = NULL;
}
@@ -4231,16 +4168,13 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
* (snapshot_bs); thus, we have to drop the strong reference to bs
* (which we obtained by calling bdrv_new()). bs will not be deleted,
* though, because the overlay still has a reference to it. */
aio_context_acquire(ctx);
bdrv_unref(bs);
aio_context_release(ctx);
bs = snapshot_bs;
}
return bs;
fail:
aio_context_acquire(ctx);
blk_unref(file);
qobject_unref(snapshot_options);
qobject_unref(bs->explicit_options);
@@ -4249,21 +4183,17 @@ fail:
bs->options = NULL;
bs->explicit_options = NULL;
bdrv_unref(bs);
aio_context_release(ctx);
error_propagate(errp, local_err);
return NULL;
close_and_fail:
aio_context_acquire(ctx);
bdrv_unref(bs);
aio_context_release(ctx);
qobject_unref(snapshot_options);
qobject_unref(options);
error_propagate(errp, local_err);
return NULL;
}
/* The caller must always hold the main AioContext lock. */
BlockDriverState *bdrv_open(const char *filename, const char *reference,
QDict *options, int flags, Error **errp)
{
@@ -4540,12 +4470,7 @@ void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
if (bs_queue) {
BlockReopenQueueEntry *bs_entry, *next;
QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
AioContext *ctx = bdrv_get_aio_context(bs_entry->state.bs);
aio_context_acquire(ctx);
bdrv_drained_end(bs_entry->state.bs);
aio_context_release(ctx);
qobject_unref(bs_entry->state.explicit_options);
qobject_unref(bs_entry->state.options);
g_free(bs_entry);
@@ -4577,7 +4502,6 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
int ret = -1;
BlockReopenQueueEntry *bs_entry, *next;
AioContext *ctx;
Transaction *tran = tran_new();
g_autoptr(GSList) refresh_list = NULL;
@@ -4586,10 +4510,7 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
GLOBAL_STATE_CODE();
QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
ctx = bdrv_get_aio_context(bs_entry->state.bs);
aio_context_acquire(ctx);
ret = bdrv_flush(bs_entry->state.bs);
aio_context_release(ctx);
if (ret < 0) {
error_setg_errno(errp, -ret, "Error flushing drive");
goto abort;
@@ -4598,10 +4519,7 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
assert(bs_entry->state.bs->quiesce_counter > 0);
ctx = bdrv_get_aio_context(bs_entry->state.bs);
aio_context_acquire(ctx);
ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp);
aio_context_release(ctx);
if (ret < 0) {
goto abort;
}
@@ -4644,24 +4562,18 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
* to first element.
*/
QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
ctx = bdrv_get_aio_context(bs_entry->state.bs);
aio_context_acquire(ctx);
bdrv_reopen_commit(&bs_entry->state);
aio_context_release(ctx);
}
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
tran_commit(tran);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
BlockDriverState *bs = bs_entry->state.bs;
if (bs->drv->bdrv_reopen_commit_post) {
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
bs->drv->bdrv_reopen_commit_post(&bs_entry->state);
aio_context_release(ctx);
}
}
@@ -4669,16 +4581,13 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
goto cleanup;
abort:
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
tran_abort(tran);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
if (bs_entry->prepared) {
ctx = bdrv_get_aio_context(bs_entry->state.bs);
aio_context_acquire(ctx);
bdrv_reopen_abort(&bs_entry->state);
aio_context_release(ctx);
}
}
@@ -4691,24 +4600,13 @@ cleanup:
int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
Error **errp)
{
AioContext *ctx = bdrv_get_aio_context(bs);
BlockReopenQueue *queue;
int ret;
GLOBAL_STATE_CODE();
queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
if (ctx != qemu_get_aio_context()) {
aio_context_release(ctx);
}
ret = bdrv_reopen_multiple(queue, errp);
if (ctx != qemu_get_aio_context()) {
aio_context_acquire(ctx);
}
return ret;
return bdrv_reopen_multiple(queue, errp);
}
int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
@@ -4743,10 +4641,7 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
*
* Return 0 on success, otherwise return < 0 and set @errp.
*
* The caller must hold the AioContext lock of @reopen_state->bs.
* @reopen_state->bs can move to a different AioContext in this function.
* Callers must make sure that their AioContext locking is still correct after
* this.
*/
static int GRAPH_UNLOCKED
bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
@@ -4760,7 +4655,6 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
const char *child_name = is_backing ? "backing" : "file";
QObject *value;
const char *str;
AioContext *ctx, *old_ctx;
bool has_child;
int ret;
@@ -4844,25 +4738,13 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
bdrv_drained_begin(old_child_bs);
}
old_ctx = bdrv_get_aio_context(bs);
ctx = bdrv_get_aio_context(new_child_bs);
if (old_ctx != ctx) {
aio_context_release(old_ctx);
aio_context_acquire(ctx);
}
bdrv_graph_rdunlock_main_loop();
bdrv_graph_wrlock(new_child_bs);
bdrv_graph_wrlock();
ret = bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
tran, errp);
bdrv_graph_wrunlock_ctx(ctx);
if (old_ctx != ctx) {
aio_context_release(ctx);
aio_context_acquire(old_ctx);
}
bdrv_graph_wrunlock();
if (old_child_bs) {
bdrv_drained_end(old_child_bs);
@@ -4892,8 +4774,6 @@ out_rdlock:
* It is the responsibility of the caller to then call the abort() or
* commit() for any other BDS that have been left in a prepare() state
*
* The caller must hold the AioContext lock of @reopen_state->bs.
*
* After calling this function, the transaction @change_child_tran may only be
* completed while holding a writer lock for the graph.
*/
@@ -5209,14 +5089,14 @@ static void bdrv_close(BlockDriverState *bs)
bs->drv = NULL;
}
bdrv_graph_wrlock(bs);
bdrv_graph_wrlock();
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
bdrv_unref_child(bs, child);
}
assert(!bs->backing);
assert(!bs->file);
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
g_free(bs->opaque);
bs->opaque = NULL;
@@ -5509,9 +5389,9 @@ int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
bdrv_graph_rdunlock_main_loop();
bdrv_drained_begin(child_bs);
bdrv_graph_wrlock(bs);
bdrv_graph_wrlock();
ret = bdrv_replace_node_common(bs, child_bs, true, true, errp);
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
bdrv_drained_end(child_bs);
return ret;
@@ -5528,8 +5408,6 @@ int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
* child.
*
* This function does not create any image files.
*
* The caller must hold the AioContext lock for @bs_top.
*/
int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
Error **errp)
@@ -5537,7 +5415,6 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
int ret;
BdrvChild *child;
Transaction *tran = tran_new();
AioContext *old_context, *new_context = NULL;
GLOBAL_STATE_CODE();
@@ -5545,23 +5422,10 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
assert(!bs_new->backing);
bdrv_graph_rdunlock_main_loop();
old_context = bdrv_get_aio_context(bs_top);
bdrv_drained_begin(bs_top);
/*
* bdrv_drained_begin() requires that only the AioContext of the drained
* node is locked, and at this point it can still differ from the AioContext
* of bs_top.
*/
new_context = bdrv_get_aio_context(bs_new);
aio_context_release(old_context);
aio_context_acquire(new_context);
bdrv_drained_begin(bs_new);
aio_context_release(new_context);
aio_context_acquire(old_context);
new_context = NULL;
bdrv_graph_wrlock(bs_top);
bdrv_graph_wrlock();
child = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
&child_of_bds, bdrv_backing_role(bs_new),
@@ -5571,18 +5435,6 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
goto out;
}
/*
* bdrv_attach_child_noperm could change the AioContext of bs_top and
* bs_new, but at least they are in the same AioContext now. This is the
* AioContext that we need to lock for the rest of the function.
*/
new_context = bdrv_get_aio_context(bs_top);
if (old_context != new_context) {
aio_context_release(old_context);
aio_context_acquire(new_context);
}
ret = bdrv_replace_node_noperm(bs_top, bs_new, true, tran, errp);
if (ret < 0) {
goto out;
@@ -5593,16 +5445,11 @@ out:
tran_finalize(tran, ret);
bdrv_refresh_limits(bs_top, NULL, NULL);
bdrv_graph_wrunlock(bs_top);
bdrv_graph_wrunlock();
bdrv_drained_end(bs_top);
bdrv_drained_end(bs_new);
if (new_context && old_context != new_context) {
aio_context_release(new_context);
aio_context_acquire(old_context);
}
return ret;
}
@@ -5620,7 +5467,7 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
bdrv_ref(old_bs);
bdrv_drained_begin(old_bs);
bdrv_drained_begin(new_bs);
bdrv_graph_wrlock(new_bs);
bdrv_graph_wrlock();
bdrv_replace_child_tran(child, new_bs, tran);
@@ -5631,7 +5478,7 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
tran_finalize(tran, ret);
bdrv_graph_wrunlock(new_bs);
bdrv_graph_wrunlock();
bdrv_drained_end(old_bs);
bdrv_drained_end(new_bs);
bdrv_unref(old_bs);
@@ -5667,9 +5514,8 @@ static void bdrv_delete(BlockDriverState *bs)
* after the call (even on failure), so if the caller intends to reuse the
* dictionary, it needs to use qobject_ref() before calling bdrv_open.
*
* The caller holds the AioContext lock for @bs. It must make sure that @bs
* stays in the same AioContext, i.e. @options must not refer to nodes in a
* different AioContext.
* The caller must make sure that @bs stays in the same AioContext, i.e.
* @options must not refer to nodes in a different AioContext.
*/
BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
int flags, Error **errp)
@@ -5697,12 +5543,8 @@ BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
GLOBAL_STATE_CODE();
aio_context_release(ctx);
aio_context_acquire(qemu_get_aio_context());
new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,
errp);
aio_context_release(qemu_get_aio_context());
aio_context_acquire(ctx);
assert(bdrv_get_aio_context(bs) == ctx);
options = NULL; /* bdrv_new_open_driver() eats options */
@@ -5718,9 +5560,9 @@ BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
bdrv_ref(bs);
bdrv_drained_begin(bs);
bdrv_drained_begin(new_node_bs);
bdrv_graph_wrlock(new_node_bs);
bdrv_graph_wrlock();
ret = bdrv_replace_node(bs, new_node_bs, errp);
bdrv_graph_wrunlock(new_node_bs);
bdrv_graph_wrunlock();
bdrv_drained_end(new_node_bs);
bdrv_drained_end(bs);
bdrv_unref(bs);
@@ -5975,7 +5817,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
bdrv_ref(top);
bdrv_drained_begin(base);
bdrv_graph_wrlock(base);
bdrv_graph_wrlock();
if (!top->drv || !base->drv) {
goto exit_wrlock;
@@ -6015,7 +5857,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
* That's a FIXME.
*/
bdrv_replace_node_common(top, base, false, false, &local_err);
bdrv_graph_wrunlock(base);
bdrv_graph_wrunlock();
if (local_err) {
error_report_err(local_err);
@@ -6052,7 +5894,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
goto exit;
exit_wrlock:
bdrv_graph_wrunlock(base);
bdrv_graph_wrunlock();
exit:
bdrv_drained_end(base);
bdrv_unref(top);
@@ -7037,12 +6879,9 @@ void bdrv_activate_all(Error **errp)
GRAPH_RDLOCK_GUARD_MAINLOOP();
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
AioContext *aio_context = bdrv_get_aio_context(bs);
int ret;
aio_context_acquire(aio_context);
ret = bdrv_activate(bs, errp);
aio_context_release(aio_context);
if (ret < 0) {
bdrv_next_cleanup(&it);
return;
@@ -7137,20 +6976,10 @@ int bdrv_inactivate_all(void)
BlockDriverState *bs = NULL;
BdrvNextIterator it;
int ret = 0;
GSList *aio_ctxs = NULL, *ctx;
GLOBAL_STATE_CODE();
GRAPH_RDLOCK_GUARD_MAINLOOP();
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
AioContext *aio_context = bdrv_get_aio_context(bs);
if (!g_slist_find(aio_ctxs, aio_context)) {
aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
aio_context_acquire(aio_context);
}
}
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
/* Nodes with BDS parents are covered by recursion from the last
* parent that gets inactivated. Don't inactivate them a second
@@ -7161,17 +6990,10 @@ int bdrv_inactivate_all(void)
ret = bdrv_inactivate_recurse(bs);
if (ret < 0) {
bdrv_next_cleanup(&it);
goto out;
break;
}
}
out:
for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
AioContext *aio_context = ctx->data;
aio_context_release(aio_context);
}
g_slist_free(aio_ctxs);
return ret;
}
@@ -7257,11 +7079,8 @@ void bdrv_unref(BlockDriverState *bs)
static void bdrv_schedule_unref_bh(void *opaque)
{
BlockDriverState *bs = opaque;
AioContext *ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
bdrv_unref(bs);
aio_context_release(ctx);
}
/*
@@ -7398,8 +7217,6 @@ void bdrv_img_create(const char *filename, const char *fmt,
return;
}
aio_context_acquire(qemu_get_aio_context());
/* Create parameter list */
create_opts = qemu_opts_append(create_opts, drv->create_opts);
create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
@@ -7549,7 +7366,6 @@ out:
qemu_opts_del(opts);
qemu_opts_free(create_opts);
error_propagate(errp, local_err);
aio_context_release(qemu_get_aio_context());
}
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
@@ -7583,33 +7399,6 @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx)
bdrv_dec_in_flight(bs);
}
void coroutine_fn bdrv_co_lock(BlockDriverState *bs)
{
AioContext *ctx = bdrv_get_aio_context(bs);
/* In the main thread, bs->aio_context won't change concurrently */
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
/*
* We're in coroutine context, so we already hold the lock of the main
* loop AioContext. Don't lock it twice to avoid deadlocks.
*/
assert(qemu_in_coroutine());
if (ctx != qemu_get_aio_context()) {
aio_context_acquire(ctx);
}
}
void coroutine_fn bdrv_co_unlock(BlockDriverState *bs)
{
AioContext *ctx = bdrv_get_aio_context(bs);
assert(qemu_in_coroutine());
if (ctx != qemu_get_aio_context()) {
aio_context_release(ctx);
}
}
static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
{
GLOBAL_STATE_CODE();
@@ -7728,21 +7517,8 @@ static void bdrv_set_aio_context_commit(void *opaque)
BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque;
BlockDriverState *bs = (BlockDriverState *) state->bs;
AioContext *new_context = state->new_ctx;
AioContext *old_context = bdrv_get_aio_context(bs);
/*
* Take the old AioContex when detaching it from bs.
* At this point, new_context lock is already acquired, and we are now
* also taking old_context. This is safe as long as bdrv_detach_aio_context
* does not call AIO_POLL_WHILE().
*/
if (old_context != qemu_get_aio_context()) {
aio_context_acquire(old_context);
}
bdrv_detach_aio_context(bs);
if (old_context != qemu_get_aio_context()) {
aio_context_release(old_context);
}
bdrv_attach_aio_context(bs, new_context);
}
@@ -7757,10 +7533,6 @@ static TransactionActionDrv set_aio_context = {
*
* Must be called from the main AioContext.
*
* The caller must own the AioContext lock for the old AioContext of bs, but it
* must not own the AioContext lock for new_context (unless new_context is the
* same as the current context of bs).
*
* @visited will accumulate all visited BdrvChild objects. The caller is
* responsible for freeing the list afterwards.
*/
@@ -7813,13 +7585,6 @@ static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
*
* If ignore_child is not NULL, that child (and its subgraph) will not
* be touched.
*
* This function still requires the caller to take the bs current
* AioContext lock, otherwise draining will fail since AIO_WAIT_WHILE
* assumes the lock is always held if bs is in another AioContext.
* For the same reason, it temporarily also holds the new AioContext, since
* bdrv_drained_end calls BDRV_POLL_WHILE that assumes the lock is taken too.
* Therefore the new AioContext lock must not be taken by the caller.
*/
int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
BdrvChild *ignore_child, Error **errp)
@@ -7827,7 +7592,6 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
Transaction *tran;
GHashTable *visited;
int ret;
AioContext *old_context = bdrv_get_aio_context(bs);
GLOBAL_STATE_CODE();
/*
@@ -7846,8 +7610,8 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
/*
* Linear phase: go through all callbacks collected in the transaction.
* Run all callbacks collected in the recursion to switch all nodes
* AioContext lock (transaction commit), or undo all changes done in the
* Run all callbacks collected in the recursion to switch every node's
* AioContext (transaction commit), or undo all changes done in the
* recursion (transaction abort).
*/
@@ -7857,34 +7621,7 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
return -EPERM;
}
/*
* Release old AioContext, it won't be needed anymore, as all
* bdrv_drained_begin() have been called already.
*/
if (qemu_get_aio_context() != old_context) {
aio_context_release(old_context);
}
/*
* Acquire new AioContext since bdrv_drained_end() is going to be called
* after we switched all nodes in the new AioContext, and the function
* assumes that the lock of the bs is always taken.
*/
if (qemu_get_aio_context() != ctx) {
aio_context_acquire(ctx);
}
tran_commit(tran);
if (qemu_get_aio_context() != ctx) {
aio_context_release(ctx);
}
/* Re-acquire the old AioContext, since the caller takes and releases it. */
if (qemu_get_aio_context() != old_context) {
aio_context_acquire(old_context);
}
return 0;
}
@@ -8006,7 +7743,6 @@ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
const char *node_name, Error **errp)
{
BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
AioContext *aio_context;
GLOBAL_STATE_CODE();
@@ -8015,12 +7751,8 @@ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
return NULL;
}
aio_context = bdrv_get_aio_context(to_replace_bs);
aio_context_acquire(aio_context);
if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
to_replace_bs = NULL;
goto out;
return NULL;
}
/* We don't want arbitrary node of the BDS chain to be replaced only the top
@@ -8033,12 +7765,9 @@ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
"because it cannot be guaranteed that doing so would not "
"lead to an abrupt change of visible data",
node_name, parent_bs->node_name);
to_replace_bs = NULL;
goto out;
return NULL;
}
out:
aio_context_release(aio_context);
return to_replace_bs;
}
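
Every block.c hunk above applies the same mechanical transformation: drop the aio_context_acquire()/aio_context_release() pairs (and the comments demanding them) and rely on the BQL plus the graph writer lock. Condensed to its essence (our schematic, not a literal hunk):

/* Old shape:
 *     ctx = bdrv_get_aio_context(bs);
 *     aio_context_acquire(ctx);
 *     bdrv_graph_wrlock(bs);
 *     ... modify the graph ...
 *     bdrv_graph_wrunlock(bs);
 *     aio_context_release(ctx);
 *
 * New shape: callers run under the BQL, and the writer lock needs no node.
 */
bdrv_graph_wrlock();
/* ... modify the graph ... */
bdrv_graph_wrunlock();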


@@ -496,10 +496,10 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
block_copy_set_speed(bcs, speed);
/* Required permissions are taken by copy-before-write filter target */
bdrv_graph_wrlock(target);
bdrv_graph_wrlock();
block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
&error_abort);
bdrv_graph_wrunlock(target);
bdrv_graph_wrunlock();
return &job->common;


@@ -68,7 +68,7 @@ typedef struct {
CoQueue bounce_available;
/* The value of the "mem-region-alignment" property */
uint64_t mem_region_alignment;
size_t mem_region_alignment;
/* Can we skip adding/deleting blkio_mem_regions? */
bool needs_mem_regions;


@@ -251,9 +251,9 @@ static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags,
ret = 0;
fail_log:
if (ret < 0) {
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, s->log_file);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
s->log_file = NULL;
}
fail:
@@ -265,10 +265,10 @@ static void blk_log_writes_close(BlockDriverState *bs)
{
BDRVBlkLogWritesState *s = bs->opaque;
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, s->log_file);
s->log_file = NULL;
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
}
static int64_t coroutine_fn GRAPH_RDLOCK
@@ -328,39 +328,22 @@ static void coroutine_fn GRAPH_RDLOCK
blk_log_writes_co_do_log(BlkLogWritesLogReq *lr)
{
BDRVBlkLogWritesState *s = lr->bs->opaque;
/*
* Determine the offsets and sizes of different parts of the entry, and
* update the state of the driver.
*
* This needs to be done in one go, before any actual I/O is done, as the
* log entry may have to be written in two parts, and the state of the
* driver may be modified by other driver operations while waiting for the
* I/O to complete.
*/
const uint64_t entry_start_sector = s->cur_log_sector;
const uint64_t entry_offset = entry_start_sector << s->sectorbits;
const uint64_t qiov_aligned_size = ROUND_UP(lr->qiov->size, s->sectorsize);
const uint64_t entry_aligned_size = qiov_aligned_size +
ROUND_UP(lr->zero_size, s->sectorsize);
const uint64_t entry_nr_sectors = entry_aligned_size >> s->sectorbits;
uint64_t cur_log_offset = s->cur_log_sector << s->sectorbits;
s->nr_entries++;
s->cur_log_sector += entry_nr_sectors;
s->cur_log_sector +=
ROUND_UP(lr->qiov->size, s->sectorsize) >> s->sectorbits;
/*
* Write the log entry. Note that if this is a "write zeroes" operation,
* only the entry header is written here, with the zeroing being done
* separately below.
*/
lr->log_ret = bdrv_co_pwritev(s->log_file, entry_offset, lr->qiov->size,
lr->log_ret = bdrv_co_pwritev(s->log_file, cur_log_offset, lr->qiov->size,
lr->qiov, 0);
/* Logging for the "write zeroes" operation */
if (lr->log_ret == 0 && lr->zero_size) {
const uint64_t zeroes_offset = entry_offset + qiov_aligned_size;
cur_log_offset = s->cur_log_sector << s->sectorbits;
s->cur_log_sector +=
ROUND_UP(lr->zero_size, s->sectorsize) >> s->sectorbits;
lr->log_ret = bdrv_co_pwrite_zeroes(s->log_file, zeroes_offset,
lr->log_ret = bdrv_co_pwrite_zeroes(s->log_file, cur_log_offset,
lr->zero_size, 0);
}
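
A worked example of the bookkeeping above, with assumed numbers: sectorsize = 512 (sectorbits = 9), s->cur_log_sector = 100, a 600-byte log entry payload, and a 4096-byte zeroed region:

/*
 * entry_start_sector = 100
 * entry_offset       = 100 << 9                   = 51200
 * qiov_aligned_size  = ROUND_UP(600, 512)         = 1024
 * entry_aligned_size = 1024 + ROUND_UP(4096, 512) = 5120
 * entry_nr_sectors   = 5120 >> 9                  = 10
 * zeroes_offset      = 51200 + 1024               = 52224
 *
 * The header is written at byte 51200, the zeroes at byte 52224, and
 * s->cur_log_sector advances to 110 before any I/O is submitted, so a
 * concurrent log write cannot claim overlapping sectors.
 */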


@@ -151,10 +151,10 @@ static void blkverify_close(BlockDriverState *bs)
{
BDRVBlkverifyState *s = bs->opaque;
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, s->test_file);
s->test_file = NULL;
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
}
static int64_t coroutine_fn GRAPH_RDLOCK


@@ -390,8 +390,6 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
* Both sets of permissions can be changed later using blk_set_perm().
*
* Return the new BlockBackend on success, null on failure.
*
* Callers must hold the AioContext lock of @bs.
*/
BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
uint64_t shared_perm, Error **errp)
@@ -416,8 +414,6 @@ BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
* Just as with bdrv_open(), after having called this function the reference to
* @options belongs to the block layer (even on failure).
*
* Called without holding an AioContext lock.
*
* TODO: Remove @filename and @flags; it should be possible to specify a whole
* BDS tree just by specifying the @options QDict (or @reference,
* alternatively). At the time of adding this function, this is not possible,
@@ -429,7 +425,6 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
{
BlockBackend *blk;
BlockDriverState *bs;
AioContext *ctx;
uint64_t perm = 0;
uint64_t shared = BLK_PERM_ALL;
@@ -459,23 +454,18 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
}
aio_context_acquire(qemu_get_aio_context());
bs = bdrv_open(filename, reference, options, flags, errp);
aio_context_release(qemu_get_aio_context());
if (!bs) {
return NULL;
}
/* bdrv_open() could have moved bs to a different AioContext */
ctx = bdrv_get_aio_context(bs);
blk = blk_new(bdrv_get_aio_context(bs), perm, shared);
blk->perm = perm;
blk->shared_perm = shared;
aio_context_acquire(ctx);
blk_insert_bs(blk, bs, errp);
bdrv_unref(bs);
aio_context_release(ctx);
if (!blk->root) {
blk_unref(blk);
@@ -577,13 +567,9 @@ void blk_remove_all_bs(void)
GLOBAL_STATE_CODE();
while ((blk = blk_all_next(blk)) != NULL) {
AioContext *ctx = blk_get_aio_context(blk);
aio_context_acquire(ctx);
if (blk->root) {
blk_remove_bs(blk);
}
aio_context_release(ctx);
}
}
@@ -613,14 +599,14 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it)
/* Must be called from the main loop */
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
old_bs = it->bs;
/* First, return all root nodes of BlockBackends. In order to avoid
* returning a BDS twice when multiple BBs refer to it, we only return it
* if the BB is the first one in the parent list of the BDS. */
if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
BlockBackend *old_blk = it->blk;
old_bs = old_blk ? blk_bs(old_blk) : NULL;
do {
it->blk = blk_all_next(it->blk);
bs = it->blk ? blk_bs(it->blk) : NULL;
@@ -634,10 +620,11 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it)
if (bs) {
bdrv_ref(bs);
bdrv_unref(old_bs);
it->bs = bs;
return bs;
}
it->phase = BDRV_NEXT_MONITOR_OWNED;
} else {
old_bs = it->bs;
}
/* Then return the monitor-owned BDSes without a BB attached. Ignore all
@@ -677,10 +664,13 @@ void bdrv_next_cleanup(BdrvNextIterator *it)
/* Must be called from the main loop */
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
bdrv_unref(it->bs);
if (it->phase == BDRV_NEXT_BACKEND_ROOTS && it->blk) {
blk_unref(it->blk);
if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
if (it->blk) {
bdrv_unref(blk_bs(it->blk));
blk_unref(it->blk);
}
} else {
bdrv_unref(it->bs);
}
bdrv_next_reset(it);
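
Note that the iterator now owns a strong reference to whatever it returned, so a caller that leaves the loop early must call bdrv_next_cleanup() to drop it. The canonical loop shape, as used by bdrv_activate_all() further down; do_work() is a placeholder:

/* Sketch: iterating all top-level nodes from the main loop. */
BlockDriverState *bs;
BdrvNextIterator it;

for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
    if (do_work(bs) < 0) {           /* do_work() is hypothetical */
        bdrv_next_cleanup(&it);      /* releases the iterator's bs/blk references */
        return;
    }
}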
@@ -878,14 +868,11 @@ BlockBackend *blk_by_public(BlockBackendPublic *public)
/*
* Disassociates the currently associated BlockDriverState from @blk.
*
* The caller must hold the AioContext lock for the BlockBackend.
*/
void blk_remove_bs(BlockBackend *blk)
{
ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
BdrvChild *root;
AioContext *ctx;
GLOBAL_STATE_CODE();
@@ -915,30 +902,26 @@ void blk_remove_bs(BlockBackend *blk)
root = blk->root;
blk->root = NULL;
ctx = bdrv_get_aio_context(root->bs);
bdrv_graph_wrlock(root->bs);
bdrv_graph_wrlock();
bdrv_root_unref_child(root);
bdrv_graph_wrunlock_ctx(ctx);
bdrv_graph_wrunlock();
}
/*
* Associates a new BlockDriverState with @blk.
*
* Callers must hold the AioContext lock of @bs.
*/
int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
{
ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
AioContext *ctx = bdrv_get_aio_context(bs);
GLOBAL_STATE_CODE();
bdrv_ref(bs);
bdrv_graph_wrlock(bs);
bdrv_graph_wrlock();
blk->root = bdrv_root_attach_child(bs, "root", &child_root,
BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
blk->perm, blk->shared_perm,
blk, errp);
bdrv_graph_wrunlock_ctx(ctx);
bdrv_graph_wrunlock();
if (blk->root == NULL) {
return -EPERM;
}
@@ -2735,20 +2718,16 @@ int blk_commit_all(void)
GRAPH_RDLOCK_GUARD_MAINLOOP();
while ((blk = blk_all_next(blk)) != NULL) {
AioContext *aio_context = blk_get_aio_context(blk);
BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk));
aio_context_acquire(aio_context);
if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) {
int ret;
ret = bdrv_commit(unfiltered_bs);
if (ret < 0) {
aio_context_release(aio_context);
return ret;
}
}
aio_context_release(aio_context);
}
return 0;
}

View File

@@ -100,9 +100,9 @@ static void commit_abort(Job *job)
bdrv_graph_rdunlock_main_loop();
bdrv_drained_begin(commit_top_backing_bs);
bdrv_graph_wrlock(commit_top_backing_bs);
bdrv_graph_wrlock();
bdrv_replace_node(s->commit_top_bs, commit_top_backing_bs, &error_abort);
bdrv_graph_wrunlock(commit_top_backing_bs);
bdrv_graph_wrunlock();
bdrv_drained_end(commit_top_backing_bs);
bdrv_unref(s->commit_top_bs);
@@ -339,7 +339,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
* this is the responsibility of the interface (i.e. whoever calls
* commit_start()).
*/
bdrv_graph_wrlock(top);
bdrv_graph_wrlock();
s->base_overlay = bdrv_find_overlay(top, base);
assert(s->base_overlay);
@@ -370,19 +370,19 @@ void commit_start(const char *job_id, BlockDriverState *bs,
ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
iter_shared_perms, errp);
if (ret < 0) {
bdrv_graph_wrunlock(top);
bdrv_graph_wrunlock();
goto fail;
}
}
if (bdrv_freeze_backing_chain(commit_top_bs, base, errp) < 0) {
bdrv_graph_wrunlock(top);
bdrv_graph_wrunlock();
goto fail;
}
s->chain_frozen = true;
ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
bdrv_graph_wrunlock(top);
bdrv_graph_wrunlock();
if (ret < 0) {
goto fail;
@@ -434,9 +434,9 @@ fail:
* otherwise this would fail because of lack of permissions. */
if (commit_top_bs) {
bdrv_drained_begin(top);
bdrv_graph_wrlock(top);
bdrv_graph_wrlock();
bdrv_replace_node(commit_top_bs, top, &error_abort);
bdrv_graph_wrunlock(top);
bdrv_graph_wrunlock();
bdrv_drained_end(top);
}
}

View File

@@ -412,7 +412,6 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
int64_t cluster_size;
g_autoptr(BlockdevOptions) full_opts = NULL;
BlockdevOptionsCbw *opts;
AioContext *ctx;
int ret;
full_opts = cbw_parse_options(options, errp);
@@ -435,15 +434,11 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
GRAPH_RDLOCK_GUARD_MAINLOOP();
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
if (opts->bitmap) {
bitmap = block_dirty_bitmap_lookup(opts->bitmap->node,
opts->bitmap->name, NULL, errp);
if (!bitmap) {
ret = -EINVAL;
goto out;
return -EINVAL;
}
}
s->on_cbw_error = opts->has_on_cbw_error ? opts->on_cbw_error :
@@ -461,24 +456,21 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
if (!s->bcs) {
error_prepend(errp, "Cannot create block-copy-state: ");
ret = -EINVAL;
goto out;
return -EINVAL;
}
cluster_size = block_copy_cluster_size(s->bcs);
s->done_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
if (!s->done_bitmap) {
ret = -EINVAL;
goto out;
return -EINVAL;
}
bdrv_disable_dirty_bitmap(s->done_bitmap);
/* s->access_bitmap starts equal to bcs bitmap */
s->access_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
if (!s->access_bitmap) {
ret = -EINVAL;
goto out;
return -EINVAL;
}
bdrv_disable_dirty_bitmap(s->access_bitmap);
bdrv_dirty_bitmap_merge_internal(s->access_bitmap,
@@ -487,11 +479,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
qemu_co_mutex_init(&s->lock);
QLIST_INIT(&s->frozen_read_reqs);
ret = 0;
out:
aio_context_release(ctx);
return ret;
return 0;
}
static void cbw_close(BlockDriverState *bs)


@@ -114,7 +114,6 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
}
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
if (export->iothread) {
IOThread *iothread;
@@ -133,8 +132,6 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
set_context_errp = fixed_iothread ? errp : NULL;
ret = bdrv_try_change_aio_context(bs, new_ctx, NULL, set_context_errp);
if (ret == 0) {
aio_context_release(ctx);
aio_context_acquire(new_ctx);
ctx = new_ctx;
} else if (fixed_iothread) {
goto fail;
@@ -191,8 +188,6 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
assert(exp->blk != NULL);
QLIST_INSERT_HEAD(&block_exports, exp, next);
aio_context_release(ctx);
return exp;
fail:
@@ -200,7 +195,6 @@ fail:
blk_set_dev_ops(blk, NULL, NULL);
blk_unref(blk);
}
aio_context_release(ctx);
if (exp) {
g_free(exp->id);
g_free(exp);
@@ -218,9 +212,6 @@ void blk_exp_ref(BlockExport *exp)
static void blk_exp_delete_bh(void *opaque)
{
BlockExport *exp = opaque;
AioContext *aio_context = exp->ctx;
aio_context_acquire(aio_context);
assert(exp->refcount == 0);
QLIST_REMOVE(exp, next);
@@ -230,8 +221,6 @@ static void blk_exp_delete_bh(void *opaque)
qapi_event_send_block_export_deleted(exp->id);
g_free(exp->id);
g_free(exp);
aio_context_release(aio_context);
}
void blk_exp_unref(BlockExport *exp)
@@ -249,22 +238,16 @@ void blk_exp_unref(BlockExport *exp)
* connections and other internally held references start to shut down. When
* the function returns, there may still be active references while the export
* is in the process of shutting down.
*
* Acquires exp->ctx internally. Callers must *not* hold the lock.
*/
void blk_exp_request_shutdown(BlockExport *exp)
{
AioContext *aio_context = exp->ctx;
aio_context_acquire(aio_context);
/*
* If the user doesn't own the export any more, it is already shutting
* down. We must not call .request_shutdown and decrease the refcount a
* second time.
*/
if (!exp->user_owned) {
goto out;
return;
}
exp->drv->request_shutdown(exp);
@@ -272,9 +255,6 @@ void blk_exp_request_shutdown(BlockExport *exp)
assert(exp->user_owned);
exp->user_owned = false;
blk_exp_unref(exp);
out:
aio_context_release(aio_context);
}
/*


@@ -278,7 +278,6 @@ static void vu_blk_exp_resize(void *opaque)
vu_config_change_msg(&vexp->vu_server.vu_dev);
}
/* Called with vexp->export.ctx acquired */
static void vu_blk_drained_begin(void *opaque)
{
VuBlkExport *vexp = opaque;
@@ -287,7 +286,6 @@ static void vu_blk_drained_begin(void *opaque)
vhost_user_server_detach_aio_context(&vexp->vu_server);
}
/* Called with vexp->export.blk AioContext acquired */
static void vu_blk_drained_end(void *opaque)
{
VuBlkExport *vexp = opaque;
@@ -300,8 +298,6 @@ static void vu_blk_drained_end(void *opaque)
* Ensures that bdrv_drained_begin() waits until in-flight requests complete
* and the server->co_trip coroutine has terminated. It will be restarted in
* vhost_user_server_attach_aio_context().
*
* Called with vexp->export.ctx acquired.
*/
static bool vu_blk_drained_poll(void *opaque)
{


@@ -712,17 +712,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
#ifdef CONFIG_LINUX_AIO
/* Currently Linux does AIO only for files opened with O_DIRECT */
if (s->use_linux_aio) {
if (!(s->open_flags & O_DIRECT)) {
error_setg(errp, "aio=native was specified, but it requires "
"cache.direct=on, which was not specified.");
ret = -EINVAL;
goto fail;
}
if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
error_prepend(errp, "Unable to use native AIO: ");
goto fail;
}
if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
error_setg(errp, "aio=native was specified, but it requires "
"cache.direct=on, which was not specified.");
ret = -EINVAL;
goto fail;
}
#else
if (s->use_linux_aio) {
@@ -733,14 +727,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
#endif /* !defined(CONFIG_LINUX_AIO) */
#ifdef CONFIG_LINUX_IO_URING
if (s->use_linux_io_uring) {
if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
error_prepend(errp, "Unable to use io_uring: ");
goto fail;
}
}
#else
#ifndef CONFIG_LINUX_IO_URING
if (s->use_linux_io_uring) {
error_setg(errp, "aio=io_uring was specified, but is not supported "
"in this build.");
@@ -2444,6 +2431,48 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
return true;
}
#ifdef CONFIG_LINUX_IO_URING
static inline bool raw_check_linux_io_uring(BDRVRawState *s)
{
Error *local_err = NULL;
AioContext *ctx;
if (!s->use_linux_io_uring) {
return false;
}
ctx = qemu_get_current_aio_context();
if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
error_reportf_err(local_err, "Unable to use linux io_uring, "
"falling back to thread pool: ");
s->use_linux_io_uring = false;
return false;
}
return true;
}
#endif
#ifdef CONFIG_LINUX_AIO
static inline bool raw_check_linux_aio(BDRVRawState *s)
{
Error *local_err = NULL;
AioContext *ctx;
if (!s->use_linux_aio) {
return false;
}
ctx = qemu_get_current_aio_context();
if (unlikely(!aio_setup_linux_aio(ctx, &local_err))) {
error_reportf_err(local_err, "Unable to use Linux AIO, "
"falling back to thread pool: ");
s->use_linux_aio = false;
return false;
}
return true;
}
#endif
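
These helpers move engine setup from attach time (the raw_aio_attach_aio_context() callback deleted below) to first use, in whichever thread actually issues the I/O. The resulting dispatch inside raw_co_prw(), condensed; handle_aiocb_rw and acb are as in the surrounding function, which is not shown in full here:

#ifdef CONFIG_LINUX_IO_URING
    if (raw_check_linux_io_uring(s)) {
        return luring_co_submit(bs, s->fd, offset, qiov, type);
    }
#endif
#ifdef CONFIG_LINUX_AIO
    if (raw_check_linux_aio(s)) {
        return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch);
    }
#endif
    return raw_thread_pool_submit(handle_aiocb_rw, &acb);  /* thread-pool fallback */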
static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
uint64_t bytes, QEMUIOVector *qiov, int type)
{
@@ -2474,13 +2503,13 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_IO_URING
} else if (s->use_linux_io_uring) {
} else if (raw_check_linux_io_uring(s)) {
assert(qiov->size == bytes);
ret = luring_co_submit(bs, s->fd, offset, qiov, type);
goto out;
#endif
#ifdef CONFIG_LINUX_AIO
} else if (s->use_linux_aio) {
} else if (raw_check_linux_aio(s)) {
assert(qiov->size == bytes);
ret = laio_co_submit(s->fd, offset, qiov, type,
s->aio_max_batch);
@@ -2567,39 +2596,13 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
};
#ifdef CONFIG_LINUX_IO_URING
if (s->use_linux_io_uring) {
if (raw_check_linux_io_uring(s)) {
return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
}
#endif
return raw_thread_pool_submit(handle_aiocb_flush, &acb);
}
static void raw_aio_attach_aio_context(BlockDriverState *bs,
AioContext *new_context)
{
BDRVRawState __attribute__((unused)) *s = bs->opaque;
#ifdef CONFIG_LINUX_AIO
if (s->use_linux_aio) {
Error *local_err = NULL;
if (!aio_setup_linux_aio(new_context, &local_err)) {
error_reportf_err(local_err, "Unable to use native AIO, "
"falling back to thread pool: ");
s->use_linux_aio = false;
}
}
#endif
#ifdef CONFIG_LINUX_IO_URING
if (s->use_linux_io_uring) {
Error *local_err = NULL;
if (!aio_setup_linux_io_uring(new_context, &local_err)) {
error_reportf_err(local_err, "Unable to use linux io_uring, "
"falling back to thread pool: ");
s->use_linux_io_uring = false;
}
}
#endif
}
static void raw_close(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
@@ -3896,7 +3899,6 @@ BlockDriver bdrv_file = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
@@ -4266,7 +4268,6 @@ static BlockDriver bdrv_host_device = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
@@ -4402,7 +4403,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
.bdrv_refresh_limits = cdrom_refresh_limits,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
@@ -4528,7 +4528,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
.bdrv_refresh_limits = cdrom_refresh_limits,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,


@@ -106,27 +106,12 @@ static uint32_t reader_count(void)
return rd;
}
void no_coroutine_fn bdrv_graph_wrlock(BlockDriverState *bs)
void no_coroutine_fn bdrv_graph_wrlock(void)
{
AioContext *ctx = NULL;
GLOBAL_STATE_CODE();
assert(!qatomic_read(&has_writer));
assert(!qemu_in_coroutine());
/*
* Release only non-mainloop AioContext. The mainloop often relies on the
* BQL and doesn't lock the main AioContext before doing things.
*/
if (bs) {
ctx = bdrv_get_aio_context(bs);
if (ctx != qemu_get_aio_context()) {
aio_context_release(ctx);
} else {
ctx = NULL;
}
}
/* Make sure that constantly arriving new I/O doesn't cause starvation */
bdrv_drain_all_begin_nopoll();
@@ -155,27 +140,13 @@ void no_coroutine_fn bdrv_graph_wrlock(BlockDriverState *bs)
} while (reader_count() >= 1);
bdrv_drain_all_end();
if (ctx) {
aio_context_acquire(bdrv_get_aio_context(bs));
}
}
void no_coroutine_fn bdrv_graph_wrunlock_ctx(AioContext *ctx)
void no_coroutine_fn bdrv_graph_wrunlock(void)
{
GLOBAL_STATE_CODE();
assert(qatomic_read(&has_writer));
/*
* Release only non-mainloop AioContext. The mainloop often relies on the
* BQL and doesn't lock the main AioContext before doing things.
*/
if (ctx && ctx != qemu_get_aio_context()) {
aio_context_release(ctx);
} else {
ctx = NULL;
}
WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
/*
* No need for memory barriers, this works in pair with
@@ -197,17 +168,6 @@ void no_coroutine_fn bdrv_graph_wrunlock_ctx(AioContext *ctx)
* progress.
*/
aio_bh_poll(qemu_get_aio_context());
if (ctx) {
aio_context_acquire(ctx);
}
}
void no_coroutine_fn bdrv_graph_wrunlock(BlockDriverState *bs)
{
AioContext *ctx = bs ? bdrv_get_aio_context(bs) : NULL;
bdrv_graph_wrunlock_ctx(ctx);
}
void coroutine_fn bdrv_graph_co_rdlock(void)
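With the BQL now protecting the graph, the writer lock no longer takes a BlockDriverState or AioContext argument. A hedged sketch of the new calling convention for a graph change; old_bs and new_bs are placeholder nodes, not names from this diff:

/* Illustrative only: take the writer lock, mutate the graph, drop it.
 * No AioContext juggling is needed; GLOBAL_STATE_CODE() callers
 * already hold the BQL. */
bdrv_graph_wrlock();
bdrv_replace_node(old_bs, new_bs, &error_abort);
bdrv_graph_wrunlock();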

View File

@@ -294,8 +294,6 @@ static void bdrv_co_drain_bh_cb(void *opaque)
BlockDriverState *bs = data->bs;
if (bs) {
AioContext *ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
bdrv_dec_in_flight(bs);
if (data->begin) {
bdrv_do_drained_begin(bs, data->parent, data->poll);
@@ -303,7 +301,6 @@ static void bdrv_co_drain_bh_cb(void *opaque)
assert(!data->poll);
bdrv_do_drained_end(bs, data->parent);
}
aio_context_release(ctx);
} else {
assert(data->begin);
bdrv_drain_all_begin();
@@ -320,8 +317,6 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
{
BdrvCoDrainData data;
Coroutine *self = qemu_coroutine_self();
AioContext *ctx = bdrv_get_aio_context(bs);
AioContext *co_ctx = qemu_coroutine_get_aio_context(self);
/* Calling bdrv_drain() from a BH ensures the current coroutine yields and
* other coroutines run if they were queued by aio_co_enter(). */
@@ -340,17 +335,6 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
bdrv_inc_in_flight(bs);
}
/*
* Temporarily drop the lock across yield or we would get deadlocks.
* bdrv_co_drain_bh_cb() reacquires the lock as needed.
*
* When we yield below, the lock for the current context will be
* released, so if this is actually the lock that protects bs, don't drop
* it a second time.
*/
if (ctx != co_ctx) {
aio_context_release(ctx);
}
replay_bh_schedule_oneshot_event(qemu_get_aio_context(),
bdrv_co_drain_bh_cb, &data);
@@ -358,11 +342,6 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
/* If we are resumed from some other event (such as an aio completion or a
* timer callback), it is a bug in the caller that should be fixed. */
assert(data.done);
/* Reacquire the AioContext of bs if we dropped it */
if (ctx != co_ctx) {
aio_context_acquire(ctx);
}
}
static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
@@ -478,13 +457,12 @@ static bool bdrv_drain_all_poll(void)
GLOBAL_STATE_CODE();
GRAPH_RDLOCK_GUARD_MAINLOOP();
/* bdrv_drain_poll() can't make changes to the graph and we are holding the
* main AioContext lock, so iterating bdrv_next_all_states() is safe. */
/*
* bdrv_drain_poll() can't make changes to the graph and we hold the BQL,
* so iterating bdrv_next_all_states() is safe.
*/
while ((bs = bdrv_next_all_states(bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
result |= bdrv_drain_poll(bs, NULL, true);
aio_context_release(aio_context);
}
return result;
@@ -525,11 +503,7 @@ void bdrv_drain_all_begin_nopoll(void)
/* Quiesce all nodes, without polling in-flight requests yet. The graph
* cannot change during this loop. */
while ((bs = bdrv_next_all_states(bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
bdrv_do_drained_begin(bs, NULL, false);
aio_context_release(aio_context);
}
}
@@ -588,11 +562,7 @@ void bdrv_drain_all_end(void)
}
while ((bs = bdrv_next_all_states(bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
bdrv_do_drained_end(bs, NULL);
aio_context_release(aio_context);
}
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
@@ -1756,29 +1726,22 @@ static int bdrv_pad_request(BlockDriverState *bs,
return 0;
}
/*
* For prefetching in stream_populate(), no qiov is passed along, because
* only copy-on-read matters.
*/
if (qiov && *qiov) {
sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
&sliced_head, &sliced_tail,
&sliced_niov);
sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
&sliced_head, &sliced_tail,
&sliced_niov);
/* Guaranteed by bdrv_check_request32() */
assert(*bytes <= SIZE_MAX);
ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
sliced_head, *bytes);
if (ret < 0) {
bdrv_padding_finalize(pad);
return ret;
}
*qiov = &pad->local_qiov;
*qiov_offset = 0;
/* Guaranteed by bdrv_check_request32() */
assert(*bytes <= SIZE_MAX);
ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
sliced_head, *bytes);
if (ret < 0) {
bdrv_padding_finalize(pad);
return ret;
}
*bytes += pad->head + pad->tail;
*offset -= pad->head;
*qiov = &pad->local_qiov;
*qiov_offset = 0;
if (padded) {
*padded = true;
}
@@ -2375,15 +2338,10 @@ int bdrv_flush_all(void)
}
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
AioContext *aio_context = bdrv_get_aio_context(bs);
int ret;
aio_context_acquire(aio_context);
ret = bdrv_flush(bs);
int ret = bdrv_flush(bs);
if (ret < 0 && !result) {
result = ret;
}
aio_context_release(aio_context);
}
return result;
@@ -2626,16 +2584,6 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
ret |= (ret2 & BDRV_BLOCK_ZERO);
}
}
/*
* Now that the recursive search was done, clear the flag. Otherwise,
* with more complicated block graphs like snapshot-access ->
* copy-before-write -> qcow2, where the return value will be propagated
* further up to a parent bdrv_co_do_block_status() call, both the
* BDRV_BLOCK_RECURSE and BDRV_BLOCK_ZERO flags would be set, which is
* not allowed.
*/
ret &= ~BDRV_BLOCK_RECURSE;
}
out:

View File

@@ -479,9 +479,9 @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
return bytes_handled;
}
static void coroutine_fn GRAPH_UNLOCKED mirror_iteration(MirrorBlockJob *s)
static void coroutine_fn GRAPH_RDLOCK mirror_iteration(MirrorBlockJob *s)
{
BlockDriverState *source;
BlockDriverState *source = s->mirror_top_bs->backing->bs;
MirrorOp *pseudo_op;
int64_t offset;
/* At least the first dirty chunk is mirrored in one iteration. */
@@ -489,10 +489,6 @@ static void coroutine_fn GRAPH_UNLOCKED mirror_iteration(MirrorBlockJob *s)
bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
int max_io_bytes = MAX(s->buf_size / MAX_IN_FLIGHT, MAX_IO_BYTES);
bdrv_graph_co_rdlock();
source = s->mirror_top_bs->backing->bs;
bdrv_graph_co_rdunlock();
bdrv_dirty_bitmap_lock(s->dirty_bitmap);
offset = bdrv_dirty_iter_next(s->dbi);
if (offset < 0) {
@@ -666,7 +662,6 @@ static int mirror_exit_common(Job *job)
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
BlockJob *bjob = &s->common;
MirrorBDSOpaque *bs_opaque;
AioContext *replace_aio_context = NULL;
BlockDriverState *src;
BlockDriverState *target_bs;
BlockDriverState *mirror_top_bs;
@@ -681,7 +676,6 @@ static int mirror_exit_common(Job *job)
}
s->prepared = true;
aio_context_acquire(qemu_get_aio_context());
bdrv_graph_rdlock_main_loop();
mirror_top_bs = s->mirror_top_bs;
@@ -746,11 +740,6 @@ static int mirror_exit_common(Job *job)
}
bdrv_graph_rdunlock_main_loop();
if (s->to_replace) {
replace_aio_context = bdrv_get_aio_context(s->to_replace);
aio_context_acquire(replace_aio_context);
}
if (s->should_complete && !abort) {
BlockDriverState *to_replace = s->to_replace ?: src;
bool ro = bdrv_is_read_only(to_replace);
@@ -768,7 +757,7 @@ static int mirror_exit_common(Job *job)
* check for an op blocker on @to_replace, and we have our own
* there.
*/
bdrv_graph_wrlock(target_bs);
bdrv_graph_wrlock();
if (bdrv_recurse_can_replace(src, to_replace)) {
bdrv_replace_node(to_replace, target_bs, &local_err);
} else {
@@ -777,7 +766,7 @@ static int mirror_exit_common(Job *job)
"would not lead to an abrupt change of visible data",
to_replace->node_name, target_bs->node_name);
}
bdrv_graph_wrunlock(target_bs);
bdrv_graph_wrunlock();
bdrv_drained_end(to_replace);
if (local_err) {
error_report_err(local_err);
@@ -789,9 +778,6 @@ static int mirror_exit_common(Job *job)
error_free(s->replace_blocker);
bdrv_unref(s->to_replace);
}
if (replace_aio_context) {
aio_context_release(replace_aio_context);
}
g_free(s->replaces);
/*
@@ -800,9 +786,9 @@ static int mirror_exit_common(Job *job)
* valid.
*/
block_job_remove_all_bdrv(bjob);
bdrv_graph_wrlock(mirror_top_bs);
bdrv_graph_wrlock();
bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort);
bdrv_graph_wrunlock(mirror_top_bs);
bdrv_graph_wrunlock();
bdrv_drained_end(target_bs);
bdrv_unref(target_bs);
@@ -815,8 +801,6 @@ static int mirror_exit_common(Job *job)
bdrv_unref(mirror_top_bs);
bdrv_unref(src);
aio_context_release(qemu_get_aio_context());
return ret;
}
@@ -1082,7 +1066,9 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
mirror_wait_for_free_in_flight_slot(s);
continue;
} else if (cnt != 0) {
bdrv_graph_co_rdlock();
mirror_iteration(s);
bdrv_graph_co_rdunlock();
}
}
@@ -1193,24 +1179,17 @@ static void mirror_complete(Job *job, Error **errp)
/* block all operations on to_replace bs */
if (s->replaces) {
AioContext *replace_aio_context;
s->to_replace = bdrv_find_node(s->replaces);
if (!s->to_replace) {
error_setg(errp, "Node name '%s' not found", s->replaces);
return;
}
replace_aio_context = bdrv_get_aio_context(s->to_replace);
aio_context_acquire(replace_aio_context);
/* TODO Translate this into child freeze system. */
error_setg(&s->replace_blocker,
"block device is in use by block-job-complete");
bdrv_op_block_all(s->to_replace, s->replace_blocker);
bdrv_ref(s->to_replace);
aio_context_release(replace_aio_context);
}
s->should_complete = true;
@@ -1916,13 +1895,13 @@ static BlockJob *mirror_start_job(
*/
bdrv_disable_dirty_bitmap(s->dirty_bitmap);
bdrv_graph_wrlock(bs);
bdrv_graph_wrlock();
ret = block_job_add_bdrv(&s->common, "source", bs, 0,
BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE |
BLK_PERM_CONSISTENT_READ,
errp);
if (ret < 0) {
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
goto fail;
}
@@ -1967,17 +1946,17 @@ static BlockJob *mirror_start_job(
ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
iter_shared_perms, errp);
if (ret < 0) {
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
goto fail;
}
}
if (bdrv_freeze_backing_chain(mirror_top_bs, target, errp) < 0) {
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
goto fail;
}
}
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
QTAILQ_INIT(&s->ops_in_flight);
@@ -2003,12 +1982,12 @@ fail:
bs_opaque->stop = true;
bdrv_drained_begin(bs);
bdrv_graph_wrlock(bs);
bdrv_graph_wrlock();
assert(mirror_top_bs->backing->bs == bs);
bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
&error_abort);
bdrv_replace_node(mirror_top_bs, bs, &error_abort);
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
bdrv_drained_end(bs);
bdrv_unref(mirror_top_bs);

View File

@@ -95,7 +95,6 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name,
{
BlockDriverState *bs;
BdrvDirtyBitmap *bitmap;
AioContext *aio_context;
if (!name || name[0] == '\0') {
error_setg(errp, "Bitmap name cannot be empty");
@@ -107,14 +106,11 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name,
return;
}
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
if (has_granularity) {
if (granularity < 512 || !is_power_of_2(granularity)) {
error_setg(errp, "Granularity must be power of 2 "
"and at least 512");
goto out;
return;
}
} else {
/* Default to cluster size, if available: */
@@ -132,12 +128,12 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name,
if (persistent &&
!bdrv_can_store_new_dirty_bitmap(bs, name, granularity, errp))
{
goto out;
return;
}
bitmap = bdrv_create_dirty_bitmap(bs, granularity, name, errp);
if (bitmap == NULL) {
goto out;
return;
}
if (disabled) {
@@ -145,9 +141,6 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name,
}
bdrv_dirty_bitmap_set_persistence(bitmap, persistent);
out:
aio_context_release(aio_context);
}
BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name,
@@ -157,7 +150,6 @@ BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name,
{
BlockDriverState *bs;
BdrvDirtyBitmap *bitmap;
AioContext *aio_context;
GLOBAL_STATE_CODE();
@@ -166,19 +158,14 @@ BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name,
return NULL;
}
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_BUSY | BDRV_BITMAP_RO,
errp)) {
aio_context_release(aio_context);
return NULL;
}
if (bdrv_dirty_bitmap_get_persistence(bitmap) &&
bdrv_remove_persistent_dirty_bitmap(bs, name, errp) < 0)
{
aio_context_release(aio_context);
return NULL;
}
@@ -190,7 +177,6 @@ BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name,
*bitmap_bs = bs;
}
aio_context_release(aio_context);
return release ? NULL : bitmap;
}

View File

@@ -141,7 +141,6 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict)
const char *id = qdict_get_str(qdict, "id");
BlockBackend *blk;
BlockDriverState *bs;
AioContext *aio_context;
Error *local_err = NULL;
GLOBAL_STATE_CODE();
@@ -168,14 +167,10 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict)
return;
}
aio_context = blk_get_aio_context(blk);
aio_context_acquire(aio_context);
bs = blk_bs(blk);
if (bs) {
if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_DRIVE_DEL, &local_err)) {
error_report_err(local_err);
aio_context_release(aio_context);
return;
}
@@ -196,8 +191,6 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict)
} else {
blk_unref(blk);
}
aio_context_release(aio_context);
}
void hmp_commit(Monitor *mon, const QDict *qdict)
@@ -213,7 +206,6 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
ret = blk_commit_all();
} else {
BlockDriverState *bs;
AioContext *aio_context;
blk = blk_by_name(device);
if (!blk) {
@@ -222,18 +214,13 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
}
bs = bdrv_skip_implicit_filters(blk_bs(blk));
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
if (!blk_is_available(blk)) {
error_report("Device '%s' has no medium", device);
aio_context_release(aio_context);
return;
}
ret = bdrv_commit(bs);
aio_context_release(aio_context);
}
if (ret < 0) {
error_report("'commit' error for '%s': %s", device, strerror(-ret));
@@ -560,7 +547,6 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
BlockBackend *blk = NULL;
BlockDriverState *bs = NULL;
BlockBackend *local_blk = NULL;
AioContext *ctx = NULL;
bool qdev = qdict_get_try_bool(qdict, "qdev", false);
const char *device = qdict_get_str(qdict, "device");
const char *command = qdict_get_str(qdict, "command");
@@ -582,9 +568,6 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
}
}
ctx = blk ? blk_get_aio_context(blk) : bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
if (bs) {
blk = local_blk = blk_new(bdrv_get_aio_context(bs), 0, BLK_PERM_ALL);
ret = blk_insert_bs(blk, bs, &err);
@@ -622,11 +605,6 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
fail:
blk_unref(local_blk);
if (ctx) {
aio_context_release(ctx);
}
hmp_handle_error(mon, err);
}
@@ -882,7 +860,6 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
int nb_sns, i;
int total;
int *global_snapshots;
AioContext *aio_context;
typedef struct SnapshotEntry {
QEMUSnapshotInfo sn;
@@ -909,11 +886,8 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
error_report_err(err);
return;
}
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
nb_sns = bdrv_snapshot_list(bs, &sn_tab);
aio_context_release(aio_context);
if (nb_sns < 0) {
monitor_printf(mon, "bdrv_snapshot_list: error %d\n", nb_sns);
@@ -924,9 +898,7 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
int bs1_nb_sns = 0;
ImageEntry *ie;
SnapshotEntry *se;
AioContext *ctx = bdrv_get_aio_context(bs1);
aio_context_acquire(ctx);
if (bdrv_can_snapshot(bs1)) {
sn = NULL;
bs1_nb_sns = bdrv_snapshot_list(bs1, &sn);
@@ -944,7 +916,6 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
}
g_free(sn);
}
aio_context_release(ctx);
}
if (no_snapshot) {

View File

@@ -174,7 +174,6 @@ blockdev_remove_medium(const char *device, const char *id, Error **errp)
{
BlockBackend *blk;
BlockDriverState *bs;
AioContext *aio_context;
bool has_attached_device;
GLOBAL_STATE_CODE();
@@ -204,13 +203,10 @@ blockdev_remove_medium(const char *device, const char *id, Error **errp)
return;
}
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
bdrv_graph_rdlock_main_loop();
if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_EJECT, errp)) {
bdrv_graph_rdunlock_main_loop();
goto out;
return;
}
bdrv_graph_rdunlock_main_loop();
@@ -223,9 +219,6 @@ blockdev_remove_medium(const char *device, const char *id, Error **errp)
* value passed here (i.e. false). */
blk_dev_change_media_cb(blk, false, &error_abort);
}
out:
aio_context_release(aio_context);
}
void qmp_blockdev_remove_medium(const char *id, Error **errp)
@@ -237,7 +230,6 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
BlockDriverState *bs, Error **errp)
{
Error *local_err = NULL;
AioContext *ctx;
bool has_device;
int ret;
@@ -259,11 +251,7 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
return;
}
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
ret = blk_insert_bs(blk, bs, errp);
aio_context_release(ctx);
if (ret < 0) {
return;
}
@@ -374,9 +362,7 @@ void qmp_blockdev_change_medium(const char *device,
qdict_put_str(options, "driver", format);
}
aio_context_acquire(qemu_get_aio_context());
medium_bs = bdrv_open(filename, NULL, options, bdrv_flags, errp);
aio_context_release(qemu_get_aio_context());
if (!medium_bs) {
goto fail;
@@ -437,20 +423,16 @@ void qmp_block_set_io_throttle(BlockIOThrottle *arg, Error **errp)
ThrottleConfig cfg;
BlockDriverState *bs;
BlockBackend *blk;
AioContext *aio_context;
blk = qmp_get_blk(arg->device, arg->id, errp);
if (!blk) {
return;
}
aio_context = blk_get_aio_context(blk);
aio_context_acquire(aio_context);
bs = blk_bs(blk);
if (!bs) {
error_setg(errp, "Device has no medium");
goto out;
return;
}
throttle_config_init(&cfg);
@@ -505,7 +487,7 @@ void qmp_block_set_io_throttle(BlockIOThrottle *arg, Error **errp)
}
if (!throttle_is_valid(&cfg, errp)) {
goto out;
return;
}
if (throttle_enabled(&cfg)) {
@@ -522,9 +504,6 @@ void qmp_block_set_io_throttle(BlockIOThrottle *arg, Error **errp)
/* If all throttling settings are set to 0, disable I/O limits */
blk_io_limits_disable(blk);
}
out:
aio_context_release(aio_context);
}
void qmp_block_latency_histogram_set(

View File

@@ -234,13 +234,11 @@ bdrv_do_query_node_info(BlockDriverState *bs, BlockNodeInfo *info, Error **errp)
int ret;
Error *err = NULL;
aio_context_acquire(bdrv_get_aio_context(bs));
size = bdrv_getlength(bs);
if (size < 0) {
error_setg_errno(errp, -size, "Can't get image size '%s'",
bs->exact_filename);
goto out;
return;
}
bdrv_refresh_filename(bs);
@@ -265,7 +263,7 @@ bdrv_do_query_node_info(BlockDriverState *bs, BlockNodeInfo *info, Error **errp)
info->format_specific = bdrv_get_specific_info(bs, &err);
if (err) {
error_propagate(errp, err);
goto out;
return;
}
backing_filename = bs->backing_file;
if (backing_filename[0] != '\0') {
@@ -300,11 +298,8 @@ bdrv_do_query_node_info(BlockDriverState *bs, BlockNodeInfo *info, Error **errp)
break;
default:
error_propagate(errp, err);
goto out;
return;
}
out:
aio_context_release(bdrv_get_aio_context(bs));
}
/**
@@ -709,15 +704,10 @@ BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
/* Just to be safe if query_nodes is not always initialized */
if (has_query_nodes && query_nodes) {
for (bs = bdrv_next_node(NULL); bs; bs = bdrv_next_node(bs)) {
AioContext *ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
QAPI_LIST_APPEND(tail, bdrv_query_bds_stats(bs, false));
aio_context_release(ctx);
}
} else {
for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
AioContext *ctx = blk_get_aio_context(blk);
BlockStats *s;
char *qdev;
@@ -725,7 +715,6 @@ BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
continue;
}
aio_context_acquire(ctx);
s = bdrv_query_bds_stats(blk_bs(blk), true);
s->device = g_strdup(blk_name(blk));
@@ -737,7 +726,6 @@ BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
}
bdrv_query_blk_stats(s->stats, blk);
aio_context_release(ctx);
QAPI_LIST_APPEND(tail, s);
}

View File

@@ -2807,9 +2807,9 @@ qcow2_do_close(BlockDriverState *bs, bool close_data_file)
if (close_data_file && has_data_file(bs)) {
GLOBAL_STATE_CODE();
bdrv_graph_rdunlock_main_loop();
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, s->data_file);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
s->data_file = NULL;
bdrv_graph_rdlock_main_loop();
}

View File

@@ -1037,14 +1037,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
close_exit:
/* cleanup on error */
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
for (i = 0; i < s->num_children; i++) {
if (!opened[i]) {
continue;
}
bdrv_unref_child(bs, s->children[i]);
}
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
g_free(s->children);
g_free(opened);
exit:
@@ -1057,11 +1057,11 @@ static void quorum_close(BlockDriverState *bs)
BDRVQuorumState *s = bs->opaque;
int i;
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
for (i = 0; i < s->num_children; i++) {
bdrv_unref_child(bs, s->children[i]);
}
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
g_free(s->children);
}

View File

@@ -470,7 +470,6 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
BDRVRawState *s = bs->opaque;
AioContext *ctx;
bool has_size;
uint64_t offset, size;
BdrvChildRole file_role;
@@ -522,11 +521,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
bs->file->bs->filename);
}
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
ret = raw_apply_options(bs, s, offset, has_size, size, errp);
aio_context_release(ctx);
if (ret < 0) {
return ret;
}

View File

@@ -394,14 +394,7 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
}
if (reopen_queue) {
AioContext *ctx = bdrv_get_aio_context(bs);
if (ctx != qemu_get_aio_context()) {
aio_context_release(ctx);
}
bdrv_reopen_multiple(reopen_queue, errp);
if (ctx != qemu_get_aio_context()) {
aio_context_acquire(ctx);
}
}
}
@@ -462,14 +455,11 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
BlockDriverState *top_bs;
BdrvChild *active_disk, *hidden_disk, *secondary_disk;
int64_t active_length, hidden_length, disk_length;
AioContext *aio_context;
Error *local_err = NULL;
BackupPerf perf = { .use_copy_range = true, .max_workers = 1 };
GLOBAL_STATE_CODE();
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
s = bs->opaque;
if (s->stage == BLOCK_REPLICATION_DONE ||
@@ -479,20 +469,17 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
* Ignore the request because the secondary side of replication
* doesn't have to do anything anymore.
*/
aio_context_release(aio_context);
return;
}
if (s->stage != BLOCK_REPLICATION_NONE) {
error_setg(errp, "Block replication is running or done");
aio_context_release(aio_context);
return;
}
if (s->mode != mode) {
error_setg(errp, "The parameter mode's value is invalid, needs %d,"
" but got %d", s->mode, mode);
aio_context_release(aio_context);
return;
}
@@ -505,7 +492,6 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
if (!active_disk || !active_disk->bs || !active_disk->bs->backing) {
error_setg(errp, "Active disk doesn't have backing file");
bdrv_graph_rdunlock_main_loop();
aio_context_release(aio_context);
return;
}
@@ -513,7 +499,6 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
if (!hidden_disk->bs || !hidden_disk->bs->backing) {
error_setg(errp, "Hidden disk doesn't have backing file");
bdrv_graph_rdunlock_main_loop();
aio_context_release(aio_context);
return;
}
@@ -521,7 +506,6 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
error_setg(errp, "The secondary disk doesn't have block backend");
bdrv_graph_rdunlock_main_loop();
aio_context_release(aio_context);
return;
}
bdrv_graph_rdunlock_main_loop();
@@ -534,7 +518,6 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
active_length != hidden_length || hidden_length != disk_length) {
error_setg(errp, "Active disk, hidden disk, secondary disk's length"
" are not the same");
aio_context_release(aio_context);
return;
}
@@ -546,7 +529,6 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
!hidden_disk->bs->drv->bdrv_make_empty) {
error_setg(errp,
"Active disk or hidden disk doesn't support make_empty");
aio_context_release(aio_context);
bdrv_graph_rdunlock_main_loop();
return;
}
@@ -556,11 +538,10 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
reopen_backing_file(bs, true, &local_err);
if (local_err) {
error_propagate(errp, local_err);
aio_context_release(aio_context);
return;
}
bdrv_graph_wrlock(bs);
bdrv_graph_wrlock();
bdrv_ref(hidden_disk->bs);
s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
@@ -568,8 +549,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
&local_err);
if (local_err) {
error_propagate(errp, local_err);
bdrv_graph_wrunlock(bs);
aio_context_release(aio_context);
bdrv_graph_wrunlock();
return;
}
@@ -579,8 +559,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
BDRV_CHILD_DATA, &local_err);
if (local_err) {
error_propagate(errp, local_err);
bdrv_graph_wrunlock(bs);
aio_context_release(aio_context);
bdrv_graph_wrunlock();
return;
}
@@ -592,15 +571,14 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
if (!top_bs || !bdrv_is_root_node(top_bs) ||
!check_top_bs(top_bs, bs)) {
error_setg(errp, "No top_bs or it is invalid");
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
reopen_backing_file(bs, false, NULL);
aio_context_release(aio_context);
return;
}
bdrv_op_block_all(top_bs, s->blocker);
bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
s->backup_job = backup_job_create(
NULL, s->secondary_disk->bs, s->hidden_disk->bs,
@@ -612,13 +590,11 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
if (local_err) {
error_propagate(errp, local_err);
backup_job_cleanup(bs);
aio_context_release(aio_context);
return;
}
job_start(&s->backup_job->job);
break;
default:
aio_context_release(aio_context);
abort();
}
@@ -629,18 +605,12 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
}
s->error = 0;
aio_context_release(aio_context);
}
static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
{
BlockDriverState *bs = rs->opaque;
BDRVReplicationState *s;
AioContext *aio_context;
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
s = bs->opaque;
BDRVReplicationState *s = bs->opaque;
if (s->stage == BLOCK_REPLICATION_DONE ||
s->stage == BLOCK_REPLICATION_FAILOVER) {
@@ -649,38 +619,28 @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
* Ignore the request because the secondary side of replication
* doesn't have to do anything anymore.
*/
aio_context_release(aio_context);
return;
}
if (s->mode == REPLICATION_MODE_SECONDARY) {
secondary_do_checkpoint(bs, errp);
}
aio_context_release(aio_context);
}
static void replication_get_error(ReplicationState *rs, Error **errp)
{
BlockDriverState *bs = rs->opaque;
BDRVReplicationState *s;
AioContext *aio_context;
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
s = bs->opaque;
BDRVReplicationState *s = bs->opaque;
if (s->stage == BLOCK_REPLICATION_NONE) {
error_setg(errp, "Block replication is not running");
aio_context_release(aio_context);
return;
}
if (s->error) {
error_setg(errp, "I/O error occurred");
aio_context_release(aio_context);
return;
}
aio_context_release(aio_context);
}
static void replication_done(void *opaque, int ret)
@@ -691,12 +651,12 @@ static void replication_done(void *opaque, int ret)
if (ret == 0) {
s->stage = BLOCK_REPLICATION_DONE;
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, s->secondary_disk);
s->secondary_disk = NULL;
bdrv_unref_child(bs, s->hidden_disk);
s->hidden_disk = NULL;
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
s->error = 0;
} else {
@@ -708,12 +668,7 @@ static void replication_done(void *opaque, int ret)
static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
{
BlockDriverState *bs = rs->opaque;
BDRVReplicationState *s;
AioContext *aio_context;
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
s = bs->opaque;
BDRVReplicationState *s = bs->opaque;
if (s->stage == BLOCK_REPLICATION_DONE ||
s->stage == BLOCK_REPLICATION_FAILOVER) {
@@ -722,13 +677,11 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
* Ignore the request because the secondary side of replication
* doesn't have to do anything anymore.
*/
aio_context_release(aio_context);
return;
}
if (s->stage != BLOCK_REPLICATION_RUNNING) {
error_setg(errp, "Block replication is not running");
aio_context_release(aio_context);
return;
}
@@ -744,15 +697,12 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
* disk, secondary disk in backup_job_completed().
*/
if (s->backup_job) {
aio_context_release(aio_context);
job_cancel_sync(&s->backup_job->job, true);
aio_context_acquire(aio_context);
}
if (!failover) {
secondary_do_checkpoint(bs, errp);
s->stage = BLOCK_REPLICATION_DONE;
aio_context_release(aio_context);
return;
}
@@ -765,10 +715,8 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
bdrv_graph_rdunlock_main_loop();
break;
default:
aio_context_release(aio_context);
abort();
}
aio_context_release(aio_context);
}
static const char *const replication_strong_runtime_opts[] = {

View File

@@ -292,9 +292,9 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
}
/* .bdrv_open() will re-attach it */
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, fallback);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp);
open_ret = drv->bdrv_open(bs, options, bs->open_flags, &local_err);
@@ -527,9 +527,7 @@ static bool GRAPH_RDLOCK bdrv_all_snapshots_includes_bs(BlockDriverState *bs)
return bdrv_has_blk(bs) || QLIST_EMPTY(&bs->parents);
}
/* Group operations. All block drivers are involved.
* These functions will properly handle dataplane (take aio_context_acquire
* when appropriate for appropriate block drivers) */
/* Group operations. All block drivers are involved. */
bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
Error **errp)
@@ -547,14 +545,11 @@ bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
bool ok = true;
aio_context_acquire(ctx);
if (devices || bdrv_all_snapshots_includes_bs(bs)) {
ok = bdrv_can_snapshot(bs);
}
aio_context_release(ctx);
if (!ok) {
error_setg(errp, "Device '%s' is writable but does not support "
"snapshots", bdrv_get_device_or_node_name(bs));
@@ -584,18 +579,15 @@ int bdrv_all_delete_snapshot(const char *name,
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
QEMUSnapshotInfo sn1, *snapshot = &sn1;
int ret = 0;
aio_context_acquire(ctx);
if ((devices || bdrv_all_snapshots_includes_bs(bs)) &&
bdrv_snapshot_find(bs, snapshot, name) >= 0)
{
ret = bdrv_snapshot_delete(bs, snapshot->id_str,
snapshot->name, errp);
}
aio_context_release(ctx);
if (ret < 0) {
error_prepend(errp, "Could not delete snapshot '%s' on '%s': ",
name, bdrv_get_device_or_node_name(bs));
@@ -630,17 +622,14 @@ int bdrv_all_goto_snapshot(const char *name,
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
bool all_snapshots_includes_bs;
aio_context_acquire(ctx);
bdrv_graph_rdlock_main_loop();
all_snapshots_includes_bs = bdrv_all_snapshots_includes_bs(bs);
bdrv_graph_rdunlock_main_loop();
ret = (devices || all_snapshots_includes_bs) ?
bdrv_snapshot_goto(bs, name, errp) : 0;
aio_context_release(ctx);
if (ret < 0) {
bdrv_graph_rdlock_main_loop();
error_prepend(errp, "Could not load snapshot '%s' on '%s': ",
@@ -672,15 +661,12 @@ int bdrv_all_has_snapshot(const char *name,
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
QEMUSnapshotInfo sn;
int ret = 0;
aio_context_acquire(ctx);
if (devices || bdrv_all_snapshots_includes_bs(bs)) {
ret = bdrv_snapshot_find(bs, &sn, name);
}
aio_context_release(ctx);
if (ret < 0) {
if (ret == -ENOENT) {
return 0;
@@ -717,10 +703,8 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
int ret = 0;
aio_context_acquire(ctx);
if (bs == vm_state_bs) {
sn->vm_state_size = vm_state_size;
ret = bdrv_snapshot_create(bs, sn);
@@ -728,7 +712,6 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
sn->vm_state_size = 0;
ret = bdrv_snapshot_create(bs, sn);
}
aio_context_release(ctx);
if (ret < 0) {
error_setg(errp, "Could not create snapshot '%s' on '%s'",
sn->name, bdrv_get_device_or_node_name(bs));
@@ -759,13 +742,10 @@ BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs,
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
bool found = false;
aio_context_acquire(ctx);
found = (devices || bdrv_all_snapshots_includes_bs(bs)) &&
bdrv_can_snapshot(bs);
aio_context_release(ctx);
if (vmstate_bs) {
if (g_str_equal(vmstate_bs,

View File

@@ -99,9 +99,9 @@ static int stream_prepare(Job *job)
}
}
bdrv_graph_wrlock(s->target_bs);
bdrv_graph_wrlock();
bdrv_set_backing_hd_drained(unfiltered_bs, base, &local_err);
bdrv_graph_wrunlock(s->target_bs);
bdrv_graph_wrunlock();
/*
* This call will do I/O, so the graph can change again from here on.
@@ -366,10 +366,10 @@ void stream_start(const char *job_id, BlockDriverState *bs,
* already have our own plans. Also don't allow resize as the image size is
* queried only at the job start and then cached.
*/
bdrv_graph_wrlock(bs);
bdrv_graph_wrlock();
if (block_job_add_bdrv(&s->common, "active node", bs, 0,
basic_flags | BLK_PERM_WRITE, errp)) {
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
goto fail;
}
@@ -389,11 +389,11 @@ void stream_start(const char *job_id, BlockDriverState *bs,
ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
basic_flags, errp);
if (ret < 0) {
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
goto fail;
}
}
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
s->base_overlay = base_overlay;
s->above_base = above_base;

View File

@@ -272,7 +272,7 @@ static void vmdk_free_extents(BlockDriverState *bs)
BDRVVmdkState *s = bs->opaque;
VmdkExtent *e;
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
for (i = 0; i < s->num_extents; i++) {
e = &s->extents[i];
g_free(e->l1_table);
@@ -283,7 +283,7 @@ static void vmdk_free_extents(BlockDriverState *bs)
bdrv_unref_child(bs, e->file);
}
}
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
g_free(s->extents);
}
@@ -1247,9 +1247,9 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options,
0, 0, 0, 0, 0, &extent, errp);
if (ret < 0) {
bdrv_graph_rdunlock_main_loop();
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, extent_file);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
bdrv_graph_rdlock_main_loop();
goto out;
}
@@ -1266,9 +1266,9 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options,
g_free(buf);
if (ret) {
bdrv_graph_rdunlock_main_loop();
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, extent_file);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
bdrv_graph_rdlock_main_loop();
goto out;
}
@@ -1277,9 +1277,9 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options,
ret = vmdk_open_se_sparse(bs, extent_file, bs->open_flags, errp);
if (ret) {
bdrv_graph_rdunlock_main_loop();
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, extent_file);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
bdrv_graph_rdlock_main_loop();
goto out;
}
@@ -1287,9 +1287,9 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options,
} else {
error_setg(errp, "Unsupported extent type '%s'", type);
bdrv_graph_rdunlock_main_loop();
bdrv_graph_wrlock(NULL);
bdrv_graph_wrlock();
bdrv_unref_child(bs, extent_file);
bdrv_graph_wrunlock(NULL);
bdrv_graph_wrunlock();
bdrv_graph_rdlock_main_loop();
ret = -ENOTSUP;
goto out;

View File

@@ -33,7 +33,6 @@ void qmp_block_set_write_threshold(const char *node_name,
Error **errp)
{
BlockDriverState *bs;
AioContext *aio_context;
bs = bdrv_find_node(node_name);
if (!bs) {
@@ -41,12 +40,7 @@ void qmp_block_set_write_threshold(const char *node_name,
return;
}
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
bdrv_write_threshold_set(bs, threshold_bytes);
aio_context_release(aio_context);
}
void bdrv_write_threshold_check_write(BlockDriverState *bs, int64_t offset,

File diff suppressed because it is too large

View File

@@ -198,9 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job)
* one to make sure that such a concurrent access does not attempt
* to process an already freed BdrvChild.
*/
aio_context_release(job->job.aio_context);
bdrv_graph_wrlock(NULL);
aio_context_acquire(job->job.aio_context);
bdrv_graph_wrlock();
while (job->nodes) {
GSList *l = job->nodes;
BdrvChild *c = l->data;
@@ -212,7 +210,7 @@ void block_job_remove_all_bdrv(BlockJob *job)
g_slist_free_1(l);
}
bdrv_graph_wrunlock_ctx(job->job.aio_context);
bdrv_graph_wrunlock();
}
bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs)
@@ -234,28 +232,12 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
uint64_t perm, uint64_t shared_perm, Error **errp)
{
BdrvChild *c;
AioContext *ctx = bdrv_get_aio_context(bs);
bool need_context_ops;
GLOBAL_STATE_CODE();
bdrv_ref(bs);
need_context_ops = ctx != job->job.aio_context;
if (need_context_ops) {
if (job->job.aio_context != qemu_get_aio_context()) {
aio_context_release(job->job.aio_context);
}
aio_context_acquire(ctx);
}
c = bdrv_root_attach_child(bs, name, &child_job, 0, perm, shared_perm, job,
errp);
if (need_context_ops) {
aio_context_release(ctx);
if (job->job.aio_context != qemu_get_aio_context()) {
aio_context_acquire(job->job.aio_context);
}
}
if (c == NULL) {
return -EPERM;
}
@@ -514,7 +496,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
int ret;
GLOBAL_STATE_CODE();
bdrv_graph_wrlock(bs);
bdrv_graph_wrlock();
if (job_id == NULL && !(flags & JOB_INTERNAL)) {
job_id = bdrv_get_device_name(bs);
@@ -523,7 +505,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
job = job_create(job_id, &driver->job_driver, txn, bdrv_get_aio_context(bs),
flags, cb, opaque, errp);
if (job == NULL) {
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
return NULL;
}
@@ -563,11 +545,11 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
goto fail;
}
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
return job;
fail:
bdrv_graph_wrunlock(bs);
bdrv_graph_wrunlock();
job_early_fail(&job->job);
return NULL;
}

View File

@@ -518,7 +518,7 @@ static const ChardevClass *char_get_class(const char *driver, Error **errp)
if (object_class_is_abstract(oc)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "driver",
"a non-abstract device type");
"an abstract device type");
return NULL;
}

configure
View File

@@ -1387,8 +1387,8 @@ probe_target_compiler() {
done
try=cross
# For softmmu/roms also look for a bi-endian or multilib-enabled host compiler
if [ "${1%softmmu}" != "$1" ] || test "$target_arch" = "$cpu"; then
# For softmmu/roms we might be able to use the host compiler
if [ "${1%softmmu}" != "$1" ]; then
case "$target_arch:$cpu" in
aarch64_be:aarch64 | \
armeb:arm | \
@@ -1675,9 +1675,6 @@ fi
mkdir -p tests/tcg
echo "# Automatically generated by configure - do not modify" > $config_host_mak
echo "SRC_PATH=$source_path" >> $config_host_mak
if test "$plugins" = "yes" ; then
echo "CONFIG_PLUGIN=y" >> $config_host_mak
fi
tcg_tests_targets=
for target in $target_list; do

View File

@@ -327,7 +327,7 @@ virgl_get_resource_info_modifiers(uint32_t resource_id,
#ifdef VIRGL_RENDERER_RESOURCE_INFO_EXT_VERSION
struct virgl_renderer_resource_info_ext info_ext;
ret = virgl_renderer_resource_get_info_ext(resource_id, &info_ext);
if (ret) {
if (ret < 0) {
return ret;
}
@@ -335,7 +335,7 @@ virgl_get_resource_info_modifiers(uint32_t resource_id,
*modifiers = info_ext.modifiers;
#else
ret = virgl_renderer_resource_get_info(resource_id, info);
if (ret) {
if (ret < 0) {
return ret;
}
@@ -372,7 +372,7 @@ virgl_cmd_set_scanout(VuGpu *g,
uint64_t modifiers = 0;
ret = virgl_get_resource_info_modifiers(ss.resource_id, &info,
&modifiers);
if (ret) {
if (ret == -1) {
g_critical("%s: illegal resource specified %d\n",
__func__, ss.resource_id);
cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID;

View File

@@ -29,6 +29,7 @@
import os
import sys
import sphinx
from distutils.version import LooseVersion
from sphinx.errors import ConfigError
# The per-manual conf.py will set qemu_docdir for a single-manual build;
@@ -164,10 +165,11 @@ html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
html_theme_options = {
"style_nav_header_background": "#802400",
"navigation_with_keys": True,
}
if LooseVersion(sphinx_rtd_theme.__version__) >= LooseVersion("0.4.3"):
html_theme_options = {
"style_nav_header_background": "#802400",
"navigation_with_keys": True,
}
html_logo = os.path.join(qemu_docdir, "../ui/icons/qemu_128x128.png")

View File

@@ -18,5 +18,6 @@ Details about QEMU's various subsystems including how to add features to them.
s390-dasd-ipl
tracing
vfio-migration
vfio-iommufd
writing-monitor-commands
virtio-backends

View File

@@ -88,27 +88,18 @@ loop, depending on which AioContext instance the caller passes in.
How to synchronize with an IOThread
-----------------------------------
AioContext is not thread-safe so some rules must be followed when using file
descriptors, event notifiers, timers, or BHs across threads:
Variables that can be accessed by multiple threads require some form of
synchronization such as qemu_mutex_lock(), rcu_read_lock(), etc.
1. AioContext functions can always be called safely. They handle their
own locking internally.
2. Other threads wishing to access the AioContext must use
aio_context_acquire()/aio_context_release() for mutual exclusion. Once the
context is acquired no other thread can access it or run event loop iterations
in this AioContext.
Legacy code sometimes nests aio_context_acquire()/aio_context_release() calls.
Do not use nesting anymore, it is incompatible with the BDRV_POLL_WHILE() macro
used in the block layer and can lead to hangs.
There is currently no lock ordering rule if a thread needs to acquire multiple
AioContexts simultaneously. Therefore, it is only safe for code holding the
QEMU global mutex to acquire other AioContexts.
AioContext functions like aio_set_fd_handler(), aio_set_event_notifier(),
aio_bh_new(), and aio_timer_new() are thread-safe. They can be used to trigger
activity in an IOThread.
Side note: the best way to schedule a function call across threads is to call
aio_bh_schedule_oneshot(). No acquire/release or locking is needed.
aio_bh_schedule_oneshot().
The main loop thread can wait synchronously for a condition using
AIO_WAIT_WHILE().
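As a concrete illustration of the remaining thread-safe primitives, a hedged sketch of triggering work in an IOThread with aio_bh_schedule_oneshot(); iothread_ctx, my_fn and opaque are assumed names, not part of this patch:

/* my_fn() will run once inside the IOThread's event loop; the call is
 * thread-safe and needs no locking on the caller's side. */
static void my_fn(void *opaque)
{
    /* executes in the IOThread's AioContext */
}

/* From any thread, schedule my_fn(opaque) to run once in that context: */
aio_bh_schedule_oneshot(iothread_ctx, my_fn, opaque);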
AioContext and the block layer
------------------------------
@@ -124,22 +115,16 @@ Block layer code must therefore expect to run in an IOThread and avoid using
old APIs that implicitly use the main loop. See the "How to program for
IOThreads" above for information on how to do that.
If main loop code such as a QMP function wishes to access a BlockDriverState
it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
that callbacks in the IOThread do not run in parallel.
Code running in the monitor typically needs to ensure that past
requests from the guest are completed. When a block device is running
in an IOThread, the IOThread can also process requests from the guest
(via ioeventfd). To achieve both objectives, wrap the code between
bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained
section". The functions must be called between aio_context_acquire()
and aio_context_release(). You can freely release and re-acquire the
AioContext within a drained section.
section".
Long-running jobs (usually in the form of coroutines) are best scheduled in
the BlockDriverState's AioContext to avoid the need to acquire/release around
each bdrv_*() call. The functions bdrv_add/remove_aio_context_notifier,
or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends,
can be used to get a notification whenever bdrv_try_change_aio_context() moves a
Long-running jobs (usually in the form of coroutines) are often scheduled in
the BlockDriverState's AioContext. The functions
bdrv_add/remove_aio_context_notifier, or alternatively
blk_add/remove_aio_context_notifier if you use BlockBackends, can be used to
get a notification whenever bdrv_try_change_aio_context() moves a
BlockDriverState to a different AioContext.

docs/devel/vfio-iommufd.rst
View File

@@ -0,0 +1,166 @@
===============================
IOMMUFD BACKEND usage with VFIO
===============================
(The terms backend, container, and BE are used interchangeably below.)
With the introduction of iommufd, the Linux kernel provides a generic
interface for user space drivers to propagate their DMA mappings to the
kernel for assigned devices. While the legacy kernel interface is
group-centric, the new iommufd interface is device-centric, relying on a
device fd and an iommufd.
To support both interfaces in the QEMU VFIO device, introduce a base
container to abstract the common parts of the VFIO legacy and iommufd
containers, so that the generic VFIO code can use either container.
The base container implements generic functions such as memory_listener and
address space management, whereas the derived container implements callbacks
specific to either legacy or iommufd. Each container has its own way to set
up the secure context and the DMA management interface. The diagram below
shows how it looks with both containers.
::
VFIO AddressSpace/Memory
+-------+ +----------+ +-----+ +-----+
| pci | | platform | | ap | | ccw |
+---+---+ +----+-----+ +--+--+ +--+--+ +----------------------+
| | | | | AddressSpace |
| | | | +------------+---------+
+---V-----------V-----------V--------V----+ /
| VFIOAddressSpace | <------------+
| | | MemoryListener
| VFIOContainerBase list |
+-------+----------------------------+----+
| |
| |
+-------V------+ +--------V----------+
| iommufd | | vfio legacy |
| container | | container |
+-------+------+ +--------+----------+
| |
| /dev/iommu | /dev/vfio/vfio
| /dev/vfio/devices/vfioX | /dev/vfio/$group_id
Userspace | |
============+============================+===========================
Kernel | device fd |
+---------------+ | group/container fd
| (BIND_IOMMUFD | | (SET_CONTAINER/SET_IOMMU)
| ATTACH_IOAS) | | device fd
| | |
| +-------V------------V-----------------+
iommufd | | vfio |
(map/unmap | +---------+--------------------+-------+
ioas_copy) | | | map/unmap
| | |
+------V------+ +-----V------+ +------V--------+
| iommfd core | | device | | vfio iommu |
+-------------+ +------------+ +---------------+
* Secure Context setup
- iommufd BE: uses device fd and iommufd to setup secure context
(bind_iommufd, attach_ioas)
- vfio legacy BE: uses group fd and container fd to setup secure context
(set_container, set_iommu)
* Device access
- iommufd BE: device fd is opened through ``/dev/vfio/devices/vfioX``
- vfio legacy BE: device fd is retrieved from group fd ioctl
* DMA Mapping flow
1. VFIOAddressSpace receives MemoryRegion add/del via MemoryListener
2. VFIO populates DMA map/unmap via the container BEs
* iommufd BE: uses iommufd
* vfio legacy BE: uses container fd
Example configuration
=====================
Step 1: configure the host device
---------------------------------
It's exactly the same as for a VFIO device with the legacy VFIO container.
Step 2: configure QEMU
----------------------
Interactions with the ``/dev/iommu`` are abstracted by a new iommufd
object (compiled in with the ``CONFIG_IOMMUFD`` option).
Any QEMU device (e.g. a VFIO device) wishing to use ``/dev/iommu`` must
be linked with an iommufd object. It gets a new optional property
named ``iommufd`` which allows passing an iommufd object. Take the
``vfio-pci`` device for example:
.. code-block:: bash
-object iommufd,id=iommufd0
-device vfio-pci,host=0000:02:00.0,iommufd=iommufd0
Note that ``/dev/iommu`` and the VFIO cdev can be opened externally by a
management layer. In such a case the fd is passed in; the ``fd`` property
accepts either a string naming the fd or a number, for example:
.. code-block:: bash
-object iommufd,id=iommufd0,fd=22
-device vfio-pci,iommufd=iommufd0,fd=23
If the ``fd`` property is not passed, the fd is opened by QEMU.
If no ``iommufd`` object is passed to the ``vfio-pci`` device, iommufd
is not used and the user gets the behavior of the legacy VFIO
container:
.. code-block:: bash
-device vfio-pci,host=0000:02:00.0
Supported platform
==================
x86, ARM and s390x are currently supported.
Caveats
=======
Dirty page sync
---------------
Dirty page sync is not yet supported with the iommufd backend, so live
migration is disabled by default. It can be force-enabled as below, albeit
with low efficiency.
.. code-block:: bash
-object iommufd,id=iommufd0
-device vfio-pci,host=0000:02:00.0,iommufd=iommufd0,enable-migration=on
P2P DMA
-------
PCI p2p DMA is unsupported because IOMMUFD does not yet support mapping
hardware PCI BAR regions. The warning below appears for an assigned PCI
device; it is not a bug.
.. code-block:: none
qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR?
qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)
FD passing with mdev
--------------------
The ``vfio-pci`` device checks the sysfsdev property to decide whether the
backend is an mdev. If FD passing is used, there is no way to know that,
and the mdev is treated like a real PCI device. The error below is
reported if the user tries to enable RAM discarding for an mdev.
.. code-block:: none
qemu-system-x86_64: -device vfio-pci,iommufd=iommufd0,x-balloon-allowed=on,fd=9: vfio VFIO_FD9: x-balloon-allowed only potentially compatible with mdev devices
``vfio-ap`` and ``vfio-ccw`` devices don't have the same issue, as their
backend devices are always mdevs and RAM discarding is force-enabled.

View File

@@ -148,9 +148,9 @@ Vring descriptor indices for packed virtqueues
A vring address description
^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-------+-------+------------+------+-----------+-----+
| index | flags | descriptor | used | available | log |
+-------+-------+------------+------+-----------+-----+
+-------+-------+------+------------+------+-----------+-----+
| index | flags | size | descriptor | used | available | log |
+-------+-------+------+------------+------+-----------+-----+
:index: a 32-bit vring index
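Read as a C layout, the corrected row could look like the following sketch; every field width other than the documented 32-bit index is an assumption for illustration, not part of the spec text shown here:

/* Hypothetical wire view of the vring address description above.
 * Only 'index' has a width stated in the excerpt; the rest are guesses. */
struct vring_addr_payload {
    uint32_t index;       /* 32-bit vring index */
    uint32_t flags;
    uint64_t size;        /* field the doc fix adds to the table */
    uint64_t descriptor;  /* descriptor table address */
    uint64_t used;        /* used ring address */
    uint64_t available;   /* available ring address */
    uint64_t log;         /* logging address */
};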

View File

@@ -1,2 +0,0 @@
sphinx==5.3.0
sphinx_rtd_theme==1.1.1

View File

@@ -194,7 +194,7 @@ To use a different index value, N, from default of 0, add:
.. code-block:: bash
-global xlnx,bbram-ctrl.drive-index=N
-global driver=xlnx.bbram-ctrl,property=drive-index,value=N
eFUSE File Backend
""""""""""""""""""
@@ -212,7 +212,7 @@ To use a different index value, N, from default of 1, add:
.. code-block:: bash
-global xlnx,efuse.drive-index=N
-global xlnx-efuse.drive-index=N
.. warning::
In actual physical Versal, BBRAM and eFUSE contain sensitive data.

View File

@@ -10,6 +10,11 @@ Invocation
disk_image is a raw hard disk image for IDE hard disk 0. Some targets do
not need a disk image.
When dealing with option parameters given as arbitrary strings containing
commas, such as in "file=my,file" and "string=a,b", it is necessary to
double the commas. For instance, "-fw_cfg name=z,string=a,,b" will be
parsed as "-fw_cfg name=z,string=a,b".
.. hxtool-doc:: qemu-options.hx
Device URL Syntax

View File

@@ -1,9 +1,8 @@
During the graphical emulation, you can use special key combinations from
the following table to change modes. By default the modifier is Ctrl-Alt
(used in the table below) which can be changed with ``-display`` suboption
``mod=`` where appropriate. For example, ``-display sdl,
grab-mod=lshift-lctrl-lalt`` changes the modifier key to Ctrl-Alt-Shift,
while ``-display sdl,grab-mod=rctrl`` changes it to the right Ctrl key.
During the graphical emulation, you can use special key combinations to
change modes. The default key mappings are shown below, but if you use
``-alt-grab`` then the modifier is Ctrl-Alt-Shift (instead of Ctrl-Alt)
and if you use ``-ctrl-grab`` then the modifier is the right Ctrl key
(instead of Ctrl-Alt):
Ctrl-Alt-f
Toggle full screen
@@ -29,7 +28,7 @@ Ctrl-Alt-n
*3*
Serial port
Ctrl-Alt-g
Ctrl-Alt
Toggle mouse and keyboard grab.
In the virtual consoles, you can use Ctrl-Up, Ctrl-Down, Ctrl-PageUp and

View File

@@ -31,6 +31,11 @@ Options
disk_image is a raw hard disk image for IDE hard disk 0. Some targets do
not need a disk image.
When dealing with option parameters given as arbitrary strings containing
commas, such as in "file=my,file" and "string=a,b", it is necessary to
double the commas. For instance, "-fw_cfg name=z,string=a,,b" will be
parsed as "-fw_cfg name=z,string=a,b".
.. hxtool-doc:: qemu-options.hx
.. include:: keys.rst.inc

View File

@@ -406,7 +406,7 @@ Command description:
Compare exits with ``0`` in case the images are equal and with ``1``
in case the images differ. Other exit codes mean an error occurred during
execution and standard error output should contain an error message.
The following table sumarizes all exit codes of the compare subcommand:
The following table summarizes all exit codes of the compare subcommand:
0
Images are identical (or requested help was printed)

View File

@@ -59,7 +59,8 @@ static const MemoryRegionOps AcpiCpuHotplug_ops = {
},
};
static void acpi_set_cpu_present_bit(AcpiCpuHotplug *g, CPUState *cpu)
static void acpi_set_cpu_present_bit(AcpiCpuHotplug *g, CPUState *cpu,
bool *swtchd_to_modern)
{
CPUClass *k = CPU_GET_CLASS(cpu);
int64_t cpu_id;
@@ -68,23 +69,34 @@ static void acpi_set_cpu_present_bit(AcpiCpuHotplug *g, CPUState *cpu)
if ((cpu_id / 8) >= ACPI_GPE_PROC_LEN) {
object_property_set_bool(g->device, "cpu-hotplug-legacy", false,
&error_abort);
*swtchd_to_modern = true;
return;
}
*swtchd_to_modern = false;
g->sts[cpu_id / 8] |= (1 << (cpu_id % 8));
}
void legacy_acpi_cpu_plug_cb(HotplugHandler *hotplug_dev,
AcpiCpuHotplug *g, DeviceState *dev, Error **errp)
{
acpi_set_cpu_present_bit(g, CPU(dev));
acpi_send_event(DEVICE(hotplug_dev), ACPI_CPU_HOTPLUG_STATUS);
bool swtchd_to_modern;
Error *local_err = NULL;
acpi_set_cpu_present_bit(g, CPU(dev), &swtchd_to_modern);
if (swtchd_to_modern) {
/* propagate the hotplug to the modern interface */
hotplug_handler_plug(hotplug_dev, dev, &local_err);
} else {
acpi_send_event(DEVICE(hotplug_dev), ACPI_CPU_HOTPLUG_STATUS);
}
}
void legacy_acpi_cpu_hotplug_init(MemoryRegion *parent, Object *owner,
AcpiCpuHotplug *gpe_cpu, uint16_t base)
{
CPUState *cpu;
bool swtchd_to_modern;
memory_region_init_io(&gpe_cpu->io, owner, &AcpiCpuHotplug_ops,
gpe_cpu, "acpi-cpu-hotplug", ACPI_GPE_PROC_LEN);
@@ -92,7 +104,7 @@ void legacy_acpi_cpu_hotplug_init(MemoryRegion *parent, Object *owner,
gpe_cpu->device = owner;
CPU_FOREACH(cpu) {
acpi_set_cpu_present_bit(gpe_cpu, cpu);
acpi_set_cpu_present_bit(gpe_cpu, cpu, &swtchd_to_modern);
}
}

View File

@@ -78,7 +78,6 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
uint32_t *initiator_list)
{
int i, index;
uint32_t initiator_to_index[MAX_NODES] = {};
HMAT_LB_Data *lb_data;
uint16_t *entry_list;
uint32_t base;
@@ -122,8 +121,6 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
/* Initiator Proximity Domain List */
for (i = 0; i < num_initiator; i++) {
build_append_int_noprefix(table_data, initiator_list[i], 4);
/* Reverse mapping for array positions */
initiator_to_index[initiator_list[i]] = i;
}
/* Target Proximity Domain List */
@@ -135,8 +132,7 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
entry_list = g_new0(uint16_t, num_initiator * num_target);
for (i = 0; i < hmat_lb->list->len; i++) {
lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
index = initiator_to_index[lb_data->initiator] * num_target +
lb_data->target;
index = lb_data->initiator * num_target + lb_data->target;
entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base);
}
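Both sides of this hunk flatten the (initiator, target) matrix into the
one-dimensional entry_list with row-major indexing; the dropped lines
remapped raw proximity-domain numbers to list positions first, while the
kept line indexes on the raw domain number directly. The indexing itself,
as a tiny standalone C example:

    #include <stdio.h>

    /* Row-major flattening: the entry for initiator row i and target
     * column t lives at i * num_target + t. */
    static int lb_index(int i, int t, int num_target)
    {
        return i * num_target + t;
    }

    int main(void)
    {
        printf("entry(2, 3) with 4 targets -> %d\n", lb_index(2, 3, 4));
        return 0;  /* prints 11 */
    }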

View File

@@ -8,6 +8,7 @@ config ARM_VIRT
imply TPM_TIS_SYSBUS
imply TPM_TIS_I2C
imply NVDIMM
imply IOMMUFD
select ARM_GIC
select ACPI
select ARM_SMMUV3

View File

@@ -675,8 +675,6 @@ static void smmu_base_reset_hold(Object *obj)
{
SMMUState *s = ARM_SMMU(obj);
memset(s->smmu_pcibus_by_bus_num, 0, sizeof(s->smmu_pcibus_by_bus_num));
g_hash_table_remove_all(s->configs);
g_hash_table_remove_all(s->iotlb);
}

View File

@@ -1998,13 +1998,14 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem)
if (pmu) {
assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU));
if (kvm_irqchip_in_kernel()) {
kvm_arm_pmu_set_irq(cpu, VIRTUAL_PMU_IRQ);
kvm_arm_pmu_set_irq(ARM_CPU(cpu), VIRTUAL_PMU_IRQ);
}
kvm_arm_pmu_init(cpu);
kvm_arm_pmu_init(ARM_CPU(cpu));
}
if (steal_time) {
kvm_arm_pvtime_init(cpu, pvtime_reg_base +
cpu->cpu_index * PVTIME_SIZE_PER_CPU);
kvm_arm_pvtime_init(ARM_CPU(cpu), pvtime_reg_base
+ cpu->cpu_index
* PVTIME_SIZE_PER_CPU);
}
}
} else {
@@ -3180,10 +3181,17 @@ static void machvirt_machine_init(void)
}
type_init(machvirt_machine_init);
static void virt_machine_8_2_options(MachineClass *mc)
static void virt_machine_9_0_options(MachineClass *mc)
{
}
DEFINE_VIRT_MACHINE_AS_LATEST(8, 2)
DEFINE_VIRT_MACHINE_AS_LATEST(9, 0)
static void virt_machine_8_2_options(MachineClass *mc)
{
virt_machine_9_0_options(mc);
compat_props_add(mc->compat_props, hw_compat_8_2, hw_compat_8_2_len);
}
DEFINE_VIRT_MACHINE(8, 2)
static void virt_machine_8_1_options(MachineClass *mc)
{

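This hunk follows QEMU's versioned machine-type pattern: the newest
options function is empty and becomes the alias, while each older one
calls its successor and then layers its own compat properties on top, so
tweaks accumulate from newest to oldest. A toy C sketch of the chaining
(names and fields invented for illustration):

    #include <stdio.h>

    struct machine_class { int is_latest; int compat_entries; };

    static void machine_9_0_options(struct machine_class *mc)
    {
        mc->is_latest = 1;              /* newest version, no compat props */
    }

    static void machine_8_2_options(struct machine_class *mc)
    {
        machine_9_0_options(mc);        /* inherit newer defaults first */
        mc->is_latest = 0;              /* only the newest is the alias */
        mc->compat_entries += 1;        /* plus this version's compat set */
    }

    int main(void)
    {
        struct machine_class mc = { 0, 0 };
        machine_8_2_options(&mc);
        printf("latest=%d compat=%d\n", mc.is_latest, mc.compat_entries);
        return 0;
    }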
View File

@@ -243,13 +243,12 @@ static void virtio_snd_handle_pcm_info(VirtIOSound *s,
memset(&pcm_info[i].padding, 0, 5);
}
cmd->payload_size = sizeof(virtio_snd_pcm_info) * count;
cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK);
iov_from_buf(cmd->elem->in_sg,
cmd->elem->in_num,
sizeof(virtio_snd_hdr),
pcm_info,
cmd->payload_size);
sizeof(virtio_snd_pcm_info) * count);
}
/*
@@ -456,6 +455,7 @@ static uint32_t virtio_snd_pcm_prepare(VirtIOSound *s, uint32_t stream_id)
stream->s = s;
qemu_mutex_init(&stream->queue_mutex);
QSIMPLEQ_INIT(&stream->queue);
QSIMPLEQ_INIT(&stream->invalid);
/*
* stream_id >= s->snd_conf.streams was checked before so this is
@@ -610,6 +610,9 @@ static size_t virtio_snd_pcm_get_io_msgs_count(VirtIOSoundPCMStream *stream)
QSIMPLEQ_FOREACH_SAFE(buffer, &stream->queue, entry, next) {
count += 1;
}
QSIMPLEQ_FOREACH_SAFE(buffer, &stream->invalid, entry, next) {
count += 1;
}
}
return count;
}
@@ -746,8 +749,7 @@ process_cmd(VirtIOSound *s, virtio_snd_ctrl_command *cmd)
0,
&cmd->resp,
sizeof(virtio_snd_hdr));
virtqueue_push(cmd->vq, cmd->elem,
sizeof(virtio_snd_hdr) + cmd->payload_size);
virtqueue_push(cmd->vq, cmd->elem, sizeof(virtio_snd_hdr));
virtio_notify(VIRTIO_DEVICE(s), cmd->vq);
}
@@ -806,7 +808,6 @@ static void virtio_snd_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
cmd->elem = elem;
cmd->vq = vq;
cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK);
/* implicit cmd->payload_size = 0; */
QTAILQ_INSERT_TAIL(&s->cmdq, cmd, next);
elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
}
@@ -827,36 +828,47 @@ static void virtio_snd_handle_event(VirtIODevice *vdev, VirtQueue *vq)
trace_virtio_snd_handle_event();
}
/*
* Must only be called if vsnd->invalid is not empty.
*/
static inline void empty_invalid_queue(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOSoundPCMBuffer *buffer = NULL;
VirtIOSoundPCMStream *stream = NULL;
virtio_snd_pcm_status resp = { 0 };
VirtIOSound *vsnd = VIRTIO_SND(vdev);
bool any = false;
g_assert(!QSIMPLEQ_EMPTY(&vsnd->invalid));
while (!QSIMPLEQ_EMPTY(&vsnd->invalid)) {
buffer = QSIMPLEQ_FIRST(&vsnd->invalid);
/* If buffer->vq != vq, our logic is fundamentally wrong, so bail out */
g_assert(buffer->vq == vq);
resp.status = cpu_to_le32(VIRTIO_SND_S_BAD_MSG);
iov_from_buf(buffer->elem->in_sg,
buffer->elem->in_num,
0,
&resp,
sizeof(virtio_snd_pcm_status));
virtqueue_push(vq,
buffer->elem,
sizeof(virtio_snd_pcm_status));
QSIMPLEQ_REMOVE_HEAD(&vsnd->invalid, entry);
virtio_snd_pcm_buffer_free(buffer);
for (uint32_t i = 0; i < vsnd->snd_conf.streams; i++) {
stream = vsnd->pcm->streams[i];
if (stream) {
any = false;
WITH_QEMU_LOCK_GUARD(&stream->queue_mutex) {
while (!QSIMPLEQ_EMPTY(&stream->invalid)) {
buffer = QSIMPLEQ_FIRST(&stream->invalid);
if (buffer->vq != vq) {
break;
}
any = true;
resp.status = cpu_to_le32(VIRTIO_SND_S_BAD_MSG);
iov_from_buf(buffer->elem->in_sg,
buffer->elem->in_num,
0,
&resp,
sizeof(virtio_snd_pcm_status));
virtqueue_push(vq,
buffer->elem,
sizeof(virtio_snd_pcm_status));
QSIMPLEQ_REMOVE_HEAD(&stream->invalid, entry);
virtio_snd_pcm_buffer_free(buffer);
}
if (any) {
/*
* Notify vq about virtio_snd_pcm_status responses.
* Buffer responses must be notified separately later.
*/
virtio_notify(vdev, vq);
}
}
}
}
/* Notify vq about virtio_snd_pcm_status responses. */
virtio_notify(vdev, vq);
}
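The rework above replaces one device-wide invalid list with per-stream
lists drained under each stream's queue_mutex, notifying the queue only
when something was actually completed. Stripped of locking and virtio
plumbing, the drain is a pop-and-complete loop over a singly linked list;
a hand-rolled stand-in for the QSIMPLEQ macros (illustrative, not QEMU's
implementation):

    #include <stdio.h>
    #include <stdlib.h>

    struct buffer {
        int id;
        struct buffer *next;
    };

    /* Complete every queued invalid buffer with an error status, as
     * empty_invalid_queue() does with VIRTIO_SND_S_BAD_MSG. */
    static void drain_invalid(struct buffer **head)
    {
        while (*head) {
            struct buffer *b = *head;
            *head = b->next;            /* QSIMPLEQ_REMOVE_HEAD */
            printf("buffer %d -> BAD_MSG\n", b->id);
            free(b);                    /* virtio_snd_pcm_buffer_free */
        }
    }

    int main(void)
    {
        struct buffer *head = NULL;
        for (int i = 0; i < 3; i++) {
            struct buffer *b = malloc(sizeof(*b));
            b->id = i;
            b->next = head;
            head = b;
        }
        drain_invalid(&head);
        return 0;
    }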
/*
@@ -868,14 +880,15 @@ static inline void empty_invalid_queue(VirtIODevice *vdev, VirtQueue *vq)
*/
static void virtio_snd_handle_tx_xfer(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOSound *vsnd = VIRTIO_SND(vdev);
VirtIOSound *s = VIRTIO_SND(vdev);
VirtIOSoundPCMStream *stream = NULL;
VirtIOSoundPCMBuffer *buffer;
VirtQueueElement *elem;
size_t msg_sz, size;
virtio_snd_pcm_xfer hdr;
uint32_t stream_id;
/*
* If any of the I/O messages are invalid, put them in vsnd->invalid and
* If any of the I/O messages are invalid, put them in stream->invalid and
* return them after the for loop.
*/
bool must_empty_invalid_queue = false;
@@ -886,8 +899,6 @@ static void virtio_snd_handle_tx_xfer(VirtIODevice *vdev, VirtQueue *vq)
trace_virtio_snd_handle_tx_xfer();
for (;;) {
VirtIOSoundPCMStream *stream;
elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
if (!elem) {
break;
@@ -903,12 +914,12 @@ static void virtio_snd_handle_tx_xfer(VirtIODevice *vdev, VirtQueue *vq)
}
stream_id = le32_to_cpu(hdr.stream_id);
if (stream_id >= vsnd->snd_conf.streams
|| vsnd->pcm->streams[stream_id] == NULL) {
if (stream_id >= s->snd_conf.streams
|| s->pcm->streams[stream_id] == NULL) {
goto tx_err;
}
stream = vsnd->pcm->streams[stream_id];
stream = s->pcm->streams[stream_id];
if (stream->info.direction != VIRTIO_SND_D_OUTPUT) {
goto tx_err;
}
@@ -928,11 +939,13 @@ static void virtio_snd_handle_tx_xfer(VirtIODevice *vdev, VirtQueue *vq)
continue;
tx_err:
must_empty_invalid_queue = true;
buffer = g_malloc0(sizeof(VirtIOSoundPCMBuffer));
buffer->elem = elem;
buffer->vq = vq;
QSIMPLEQ_INSERT_TAIL(&vsnd->invalid, buffer, entry);
WITH_QEMU_LOCK_GUARD(&stream->queue_mutex) {
must_empty_invalid_queue = true;
buffer = g_malloc0(sizeof(VirtIOSoundPCMBuffer));
buffer->elem = elem;
buffer->vq = vq;
QSIMPLEQ_INSERT_TAIL(&stream->invalid, buffer, entry);
}
}
if (must_empty_invalid_queue) {
@@ -949,14 +962,15 @@ tx_err:
*/
static void virtio_snd_handle_rx_xfer(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOSound *vsnd = VIRTIO_SND(vdev);
VirtIOSound *s = VIRTIO_SND(vdev);
VirtIOSoundPCMStream *stream = NULL;
VirtIOSoundPCMBuffer *buffer;
VirtQueueElement *elem;
size_t msg_sz, size;
virtio_snd_pcm_xfer hdr;
uint32_t stream_id;
/*
* if any of the I/O messages are invalid, put them in vsnd->invalid and
* if any of the I/O messages are invalid, put them in stream->invalid and
* return them after the for loop.
*/
bool must_empty_invalid_queue = false;
@@ -967,8 +981,6 @@ static void virtio_snd_handle_rx_xfer(VirtIODevice *vdev, VirtQueue *vq)
trace_virtio_snd_handle_rx_xfer();
for (;;) {
VirtIOSoundPCMStream *stream;
elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
if (!elem) {
break;
@@ -984,12 +996,12 @@ static void virtio_snd_handle_rx_xfer(VirtIODevice *vdev, VirtQueue *vq)
}
stream_id = le32_to_cpu(hdr.stream_id);
if (stream_id >= vsnd->snd_conf.streams
|| !vsnd->pcm->streams[stream_id]) {
if (stream_id >= s->snd_conf.streams
|| !s->pcm->streams[stream_id]) {
goto rx_err;
}
stream = vsnd->pcm->streams[stream_id];
stream = s->pcm->streams[stream_id];
if (stream == NULL || stream->info.direction != VIRTIO_SND_D_INPUT) {
goto rx_err;
}
@@ -1006,11 +1018,13 @@ static void virtio_snd_handle_rx_xfer(VirtIODevice *vdev, VirtQueue *vq)
continue;
rx_err:
must_empty_invalid_queue = true;
buffer = g_malloc0(sizeof(VirtIOSoundPCMBuffer));
buffer->elem = elem;
buffer->vq = vq;
QSIMPLEQ_INSERT_TAIL(&vsnd->invalid, buffer, entry);
WITH_QEMU_LOCK_GUARD(&stream->queue_mutex) {
must_empty_invalid_queue = true;
buffer = g_malloc0(sizeof(VirtIOSoundPCMBuffer));
buffer->elem = elem;
buffer->vq = vq;
QSIMPLEQ_INSERT_TAIL(&stream->invalid, buffer, entry);
}
}
if (must_empty_invalid_queue) {
@@ -1110,7 +1124,6 @@ static void virtio_snd_realize(DeviceState *dev, Error **errp)
virtio_add_queue(vdev, 64, virtio_snd_handle_rx_xfer);
qemu_mutex_init(&vsnd->cmdq_mutex);
QTAILQ_INIT(&vsnd->cmdq);
QSIMPLEQ_INIT(&vsnd->invalid);
for (uint32_t i = 0; i < vsnd->snd_conf.streams; i++) {
status = virtio_snd_set_pcm_params(vsnd, i, &default_params);
@@ -1360,20 +1373,13 @@ static void virtio_snd_unrealize(DeviceState *dev)
static void virtio_snd_reset(VirtIODevice *vdev)
{
VirtIOSound *vsnd = VIRTIO_SND(vdev);
VirtIOSound *s = VIRTIO_SND(vdev);
virtio_snd_ctrl_command *cmd;
/*
* Sanity check that the invalid buffer message queue is emptied at the end
* of every virtio_snd_handle_tx_xfer/virtio_snd_handle_rx_xfer call, and
* must be empty otherwise.
*/
g_assert(QSIMPLEQ_EMPTY(&vsnd->invalid));
WITH_QEMU_LOCK_GUARD(&vsnd->cmdq_mutex) {
while (!QTAILQ_EMPTY(&vsnd->cmdq)) {
cmd = QTAILQ_FIRST(&vsnd->cmdq);
QTAILQ_REMOVE(&vsnd->cmdq, cmd, next);
WITH_QEMU_LOCK_GUARD(&s->cmdq_mutex) {
while (!QTAILQ_EMPTY(&s->cmdq)) {
cmd = QTAILQ_FIRST(&s->cmdq);
QTAILQ_REMOVE(&s->cmdq, cmd, next);
virtio_snd_ctrl_cmd_free(cmd);
}
}

View File

@@ -32,13 +32,11 @@ struct VirtIOBlockDataPlane {
VirtIOBlkConf *conf;
VirtIODevice *vdev;
/* Note that these EventNotifiers are assigned by value. This is
* fine as long as you do not call event_notifier_cleanup on them
* (because you don't own the file descriptor or handle; you just
* use it).
/*
* The AioContext for each virtqueue. The BlockDriverState will use the
* first element as its AioContext.
*/
IOThread *iothread;
AioContext *ctx;
AioContext **vq_aio_context;
};
/* Raise an interrupt to signal guest, if necessary */
@@ -47,6 +45,45 @@ void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq)
virtio_notify_irqfd(s->vdev, vq);
}
/* Generate vq:AioContext mappings from a validated iothread-vq-mapping list */
static void
apply_vq_mapping(IOThreadVirtQueueMappingList *iothread_vq_mapping_list,
AioContext **vq_aio_context, uint16_t num_queues)
{
IOThreadVirtQueueMappingList *node;
size_t num_iothreads = 0;
size_t cur_iothread = 0;
for (node = iothread_vq_mapping_list; node; node = node->next) {
num_iothreads++;
}
for (node = iothread_vq_mapping_list; node; node = node->next) {
IOThread *iothread = iothread_by_id(node->value->iothread);
AioContext *ctx = iothread_get_aio_context(iothread);
/* Released in virtio_blk_data_plane_destroy() */
object_ref(OBJECT(iothread));
if (node->value->vqs) {
uint16List *vq;
/* Explicit vq:IOThread assignment */
for (vq = node->value->vqs; vq; vq = vq->next) {
vq_aio_context[vq->value] = ctx;
}
} else {
/* Round-robin vq:IOThread assignment */
for (unsigned i = cur_iothread; i < num_queues;
i += num_iothreads) {
vq_aio_context[i] = ctx;
}
}
cur_iothread++;
}
}
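apply_vq_mapping() supports two modes: an explicit vqs list pins named
queues to an IOThread, and without one, thread k takes queues k,
k + num_iothreads, k + 2 * num_iothreads, and so on. The round-robin arm
as a standalone C example (illustrative types, not QEMU's):

    #include <stdio.h>

    static void round_robin(int num_queues, int num_threads,
                            int vq_to_thread[])
    {
        for (int t = 0; t < num_threads; t++) {
            /* Same stride as the loop in apply_vq_mapping(). */
            for (int q = t; q < num_queues; q += num_threads) {
                vq_to_thread[q] = t;
            }
        }
    }

    int main(void)
    {
        int map[8];
        round_robin(8, 3, map);
        for (int q = 0; q < 8; q++) {
            printf("vq %d -> iothread %d\n", q, map[q]);
        }
        return 0;
    }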
/* Context: QEMU global mutex held */
bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
VirtIOBlockDataPlane **dataplane,
@@ -58,7 +95,7 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
*dataplane = NULL;
if (conf->iothread) {
if (conf->iothread || conf->iothread_vq_mapping_list) {
if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
error_setg(errp,
"device is incompatible with iothread "
@@ -86,13 +123,24 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
s = g_new0(VirtIOBlockDataPlane, 1);
s->vdev = vdev;
s->conf = conf;
s->vq_aio_context = g_new(AioContext *, conf->num_queues);
if (conf->iothread) {
s->iothread = conf->iothread;
object_ref(OBJECT(s->iothread));
s->ctx = iothread_get_aio_context(s->iothread);
if (conf->iothread_vq_mapping_list) {
apply_vq_mapping(conf->iothread_vq_mapping_list, s->vq_aio_context,
conf->num_queues);
} else if (conf->iothread) {
AioContext *ctx = iothread_get_aio_context(conf->iothread);
for (unsigned i = 0; i < conf->num_queues; i++) {
s->vq_aio_context[i] = ctx;
}
/* Released in virtio_blk_data_plane_destroy() */
object_ref(OBJECT(conf->iothread));
} else {
s->ctx = qemu_get_aio_context();
AioContext *ctx = qemu_get_aio_context();
for (unsigned i = 0; i < conf->num_queues; i++) {
s->vq_aio_context[i] = ctx;
}
}
*dataplane = s;
@@ -104,6 +152,7 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
{
VirtIOBlock *vblk;
VirtIOBlkConf *conf;
if (!s) {
return;
@@ -111,9 +160,22 @@ void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
vblk = VIRTIO_BLK(s->vdev);
assert(!vblk->dataplane_started);
if (s->iothread) {
object_unref(OBJECT(s->iothread));
conf = s->conf;
if (conf->iothread_vq_mapping_list) {
IOThreadVirtQueueMappingList *node;
for (node = conf->iothread_vq_mapping_list; node; node = node->next) {
IOThread *iothread = iothread_by_id(node->value->iothread);
object_unref(OBJECT(iothread));
}
}
if (conf->iothread) {
object_unref(OBJECT(conf->iothread));
}
g_free(s->vq_aio_context);
g_free(s);
}
@@ -124,7 +186,6 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
VirtIOBlockDataPlane *s = vblk->dataplane;
BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vblk)));
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
AioContext *old_context;
unsigned i;
unsigned nvqs = s->conf->num_queues;
Error *local_err = NULL;
@@ -178,22 +239,13 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
trace_virtio_blk_data_plane_start(s);
old_context = blk_get_aio_context(s->conf->conf.blk);
aio_context_acquire(old_context);
r = blk_set_aio_context(s->conf->conf.blk, s->ctx, &local_err);
aio_context_release(old_context);
r = blk_set_aio_context(s->conf->conf.blk, s->vq_aio_context[0],
&local_err);
if (r < 0) {
error_report_err(local_err);
goto fail_aio_context;
}
/* Kick right away to begin processing requests already in vring */
for (i = 0; i < nvqs; i++) {
VirtQueue *vq = virtio_get_queue(s->vdev, i);
event_notifier_set(virtio_queue_get_host_notifier(vq));
}
/*
* These fields must be visible to the IOThread when it processes the
* virtqueue, otherwise it will think dataplane has not started yet.
@@ -208,13 +260,15 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
/* Get this show started by hooking up our callbacks */
if (!blk_in_drain(s->conf->conf.blk)) {
aio_context_acquire(s->ctx);
for (i = 0; i < nvqs; i++) {
VirtQueue *vq = virtio_get_queue(s->vdev, i);
AioContext *ctx = s->vq_aio_context[i];
virtio_queue_aio_attach_host_notifier(vq, s->ctx);
/* Kick right away to begin processing requests already in vring */
event_notifier_set(virtio_queue_get_host_notifier(vq));
virtio_queue_aio_attach_host_notifier(vq, ctx);
}
aio_context_release(s->ctx);
}
return 0;
@@ -242,23 +296,18 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
*
* Context: BH in IOThread
*/
static void virtio_blk_data_plane_stop_bh(void *opaque)
static void virtio_blk_data_plane_stop_vq_bh(void *opaque)
{
VirtIOBlockDataPlane *s = opaque;
unsigned i;
VirtQueue *vq = opaque;
EventNotifier *host_notifier = virtio_queue_get_host_notifier(vq);
for (i = 0; i < s->conf->num_queues; i++) {
VirtQueue *vq = virtio_get_queue(s->vdev, i);
EventNotifier *host_notifier = virtio_queue_get_host_notifier(vq);
virtio_queue_aio_detach_host_notifier(vq, qemu_get_current_aio_context());
virtio_queue_aio_detach_host_notifier(vq, s->ctx);
/*
* Test and clear notifier after disabling event, in case poll callback
* didn't have time to run.
*/
virtio_queue_host_notifier_read(host_notifier);
}
/*
* Test and clear notifier after disabling event, in case poll callback
* didn't have time to run.
*/
virtio_queue_host_notifier_read(host_notifier);
}
/* Context: QEMU global mutex held */
@@ -285,7 +334,12 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
trace_virtio_blk_data_plane_stop(s);
if (!blk_in_drain(s->conf->conf.blk)) {
aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
for (i = 0; i < nvqs; i++) {
VirtQueue *vq = virtio_get_queue(s->vdev, i);
AioContext *ctx = s->vq_aio_context[i];
aio_wait_bh_oneshot(ctx, virtio_blk_data_plane_stop_vq_bh, vq);
}
}
/*
@@ -314,8 +368,6 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
*/
vblk->dataplane_started = false;
aio_context_acquire(s->ctx);
/* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */
blk_drain(s->conf->conf.blk);
@@ -325,10 +377,28 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
*/
blk_set_aio_context(s->conf->conf.blk, qemu_get_aio_context(), NULL);
aio_context_release(s->ctx);
/* Clean up guest notifier (irq) */
k->set_guest_notifiers(qbus->parent, nvqs, false);
s->stopping = false;
}
void virtio_blk_data_plane_detach(VirtIOBlockDataPlane *s)
{
VirtIODevice *vdev = VIRTIO_DEVICE(s->vdev);
for (uint16_t i = 0; i < s->conf->num_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
virtio_queue_aio_detach_host_notifier(vq, s->vq_aio_context[i]);
}
}
void virtio_blk_data_plane_attach(VirtIOBlockDataPlane *s)
{
VirtIODevice *vdev = VIRTIO_DEVICE(s->vdev);
for (uint16_t i = 0; i < s->conf->num_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
virtio_queue_aio_attach_host_notifier(vq, s->vq_aio_context[i]);
}
}

View File

@@ -28,4 +28,7 @@ void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq);
int virtio_blk_data_plane_start(VirtIODevice *vdev);
void virtio_blk_data_plane_stop(VirtIODevice *vdev);
void virtio_blk_data_plane_detach(VirtIOBlockDataPlane *s);
void virtio_blk_data_plane_attach(VirtIOBlockDataPlane *s);
#endif /* HW_DATAPLANE_VIRTIO_BLK_H */

View File

@@ -260,8 +260,6 @@ static void xen_block_complete_aio(void *opaque, int ret)
XenBlockRequest *request = opaque;
XenBlockDataPlane *dataplane = request->dataplane;
aio_context_acquire(dataplane->ctx);
if (ret != 0) {
error_report("%s I/O error",
request->req.operation == BLKIF_OP_READ ?
@@ -273,10 +271,10 @@ static void xen_block_complete_aio(void *opaque, int ret)
if (request->presync) {
request->presync = 0;
xen_block_do_aio(request);
goto done;
return;
}
if (request->aio_inflight > 0) {
goto done;
return;
}
switch (request->req.operation) {
@@ -318,9 +316,6 @@ static void xen_block_complete_aio(void *opaque, int ret)
if (dataplane->more_work) {
qemu_bh_schedule(dataplane->bh);
}
done:
aio_context_release(dataplane->ctx);
}
static bool xen_block_split_discard(XenBlockRequest *request,
@@ -601,9 +596,7 @@ static void xen_block_dataplane_bh(void *opaque)
{
XenBlockDataPlane *dataplane = opaque;
aio_context_acquire(dataplane->ctx);
xen_block_handle_requests(dataplane);
aio_context_release(dataplane->ctx);
}
static bool xen_block_dataplane_event(void *opaque)
@@ -703,10 +696,8 @@ void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
xen_block_dataplane_detach(dataplane);
}
aio_context_acquire(dataplane->ctx);
/* Xen doesn't have multiple users for nodes, so this can't fail */
blk_set_aio_context(dataplane->blk, qemu_get_aio_context(), &error_abort);
aio_context_release(dataplane->ctx);
/*
* Now that the context has been moved onto the main thread, cancel
@@ -752,7 +743,6 @@ void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
{
ERRP_GUARD();
XenDevice *xendev = dataplane->xendev;
AioContext *old_context;
unsigned int ring_size;
unsigned int i;
@@ -836,11 +826,8 @@ void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
goto stop;
}
old_context = blk_get_aio_context(dataplane->blk);
aio_context_acquire(old_context);
/* If other users keep the BlockBackend in the iothread, that's ok */
blk_set_aio_context(dataplane->blk, dataplane->ctx, NULL);
aio_context_release(old_context);
if (!blk_in_drain(dataplane->blk)) {
xen_block_dataplane_attach(dataplane);

View File

@@ -84,11 +84,7 @@ struct NANDFlashState {
void (*blk_write)(NANDFlashState *s);
void (*blk_erase)(NANDFlashState *s);
/*
* Returns %true when block containing (@addr + @offset) is
* successfully loaded, otherwise %false.
*/
bool (*blk_load)(NANDFlashState *s, uint64_t addr, unsigned offset);
void (*blk_load)(NANDFlashState *s, uint64_t addr, int offset);
uint32_t ioaddr_vmstate;
};
@@ -247,30 +243,9 @@ static inline void nand_pushio_byte(NANDFlashState *s, uint8_t value)
}
}
/*
* nand_load_block: Load block containing (s->addr + @offset).
* Returns length of data available at @offset in this block.
*/
static unsigned nand_load_block(NANDFlashState *s, unsigned offset)
{
unsigned iolen;
if (!s->blk_load(s, s->addr, offset)) {
return 0;
}
iolen = (1 << s->page_shift);
if (s->gnd) {
iolen += 1 << s->oob_shift;
}
assert(offset <= iolen);
iolen -= offset;
return iolen;
}
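The computation restored in this file derives the readable length from
the page size, the optional out-of-band area, and the start offset: with
s->gnd wired the transfer stops at the page boundary, otherwise the OOB
bytes are included. A standalone sketch of that arithmetic (shift values
assumed for a 2 KiB page with a 64-byte OOB area):

    #include <stdio.h>

    static unsigned avail_after(unsigned page_shift, unsigned oob_shift,
                                int gnd, unsigned offset)
    {
        if (gnd) {
            return (1u << page_shift) - offset;
        }
        return (1u << page_shift) + (1u << oob_shift) - offset;
    }

    int main(void)
    {
        /* 2048 + 64 - 16 = 2096 bytes available from offset 16 */
        printf("%u\n", avail_after(11, 6, 0, 16));
        return 0;
    }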
static void nand_command(NANDFlashState *s)
{
unsigned int offset;
switch (s->cmd) {
case NAND_CMD_READ0:
s->iolen = 0;
@@ -296,7 +271,12 @@ static void nand_command(NANDFlashState *s)
case NAND_CMD_NOSERIALREAD2:
if (!(nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP))
break;
s->iolen = nand_load_block(s, s->addr & ((1 << s->addr_shift) - 1));
offset = s->addr & ((1 << s->addr_shift) - 1);
s->blk_load(s, s->addr, offset);
if (s->gnd)
s->iolen = (1 << s->page_shift) - offset;
else
s->iolen = (1 << s->page_shift) + (1 << s->oob_shift) - offset;
break;
case NAND_CMD_RESET:
@@ -617,7 +597,12 @@ uint32_t nand_getio(DeviceState *dev)
if (!s->iolen && s->cmd == NAND_CMD_READ0) {
offset = (int) (s->addr & ((1 << s->addr_shift) - 1)) + s->offset;
s->offset = 0;
s->iolen = nand_load_block(s, offset);
s->blk_load(s, s->addr, offset);
if (s->gnd)
s->iolen = (1 << s->page_shift) - offset;
else
s->iolen = (1 << s->page_shift) + (1 << s->oob_shift) - offset;
}
if (s->ce || s->iolen <= 0) {
@@ -778,15 +763,11 @@ static void glue(nand_blk_erase_, NAND_PAGE_SIZE)(NANDFlashState *s)
}
}
static bool glue(nand_blk_load_, NAND_PAGE_SIZE)(NANDFlashState *s,
uint64_t addr, unsigned offset)
static void glue(nand_blk_load_, NAND_PAGE_SIZE)(NANDFlashState *s,
uint64_t addr, int offset)
{
if (PAGE(addr) >= s->pages) {
return false;
}
if (offset > NAND_PAGE_SIZE + OOB_SIZE) {
return false;
return;
}
if (s->blk) {
@@ -814,8 +795,6 @@ static bool glue(nand_blk_load_, NAND_PAGE_SIZE)(NANDFlashState *s,
offset, NAND_PAGE_SIZE + OOB_SIZE - offset);
s->ioaddr = s->io;
}
return true;
}
static void glue(nand_init_, NAND_PAGE_SIZE)(NANDFlashState *s)

View File

@@ -80,39 +80,16 @@ struct PFlashCFI01 {
uint16_t ident3;
uint8_t cfi_table[0x52];
uint64_t counter;
uint32_t writeblock_size;
unsigned int writeblock_size;
MemoryRegion mem;
char *name;
void *storage;
VMChangeStateEntry *vmstate;
bool old_multiple_chip_handling;
/* block update buffer */
unsigned char *blk_bytes;
uint32_t blk_offset;
};
static int pflash_post_load(void *opaque, int version_id);
static bool pflash_blk_write_state_needed(void *opaque)
{
PFlashCFI01 *pfl = opaque;
return (pfl->blk_offset != -1);
}
static const VMStateDescription vmstate_pflash_blk_write = {
.name = "pflash_cfi01_blk_write",
.version_id = 1,
.minimum_version_id = 1,
.needed = pflash_blk_write_state_needed,
.fields = (const VMStateField[]) {
VMSTATE_VBUFFER_UINT32(blk_bytes, PFlashCFI01, 0, NULL, writeblock_size),
VMSTATE_UINT32(blk_offset, PFlashCFI01),
VMSTATE_END_OF_LIST()
}
};
static const VMStateDescription vmstate_pflash = {
.name = "pflash_cfi01",
.version_id = 1,
@@ -124,10 +101,6 @@ static const VMStateDescription vmstate_pflash = {
VMSTATE_UINT8(status, PFlashCFI01),
VMSTATE_UINT64(counter, PFlashCFI01),
VMSTATE_END_OF_LIST()
},
.subsections = (const VMStateDescription * []) {
&vmstate_pflash_blk_write,
NULL
}
};
@@ -252,10 +225,34 @@ static uint32_t pflash_data_read(PFlashCFI01 *pfl, hwaddr offset,
uint32_t ret;
p = pfl->storage;
if (be) {
ret = ldn_be_p(p + offset, width);
} else {
ret = ldn_le_p(p + offset, width);
switch (width) {
case 1:
ret = p[offset];
break;
case 2:
if (be) {
ret = p[offset] << 8;
ret |= p[offset + 1];
} else {
ret = p[offset];
ret |= p[offset + 1] << 8;
}
break;
case 4:
if (be) {
ret = p[offset] << 24;
ret |= p[offset + 1] << 16;
ret |= p[offset + 2] << 8;
ret |= p[offset + 3];
} else {
ret = p[offset];
ret |= p[offset + 1] << 8;
ret |= p[offset + 2] << 16;
ret |= p[offset + 3] << 24;
}
break;
default:
abort();
}
trace_pflash_data_read(pfl->name, offset, width, ret);
return ret;
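The switch statement here open-codes what the ldn_be_p()/ldn_le_p()
helpers did in one call: assemble a 1-, 2- or 4-byte value from
individual bytes in the requested endianness. The big-endian case
generalizes to a simple loop; a standalone sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Byte-by-byte big-endian load of `width` bytes (width 1, 2 or 4),
     * equivalent to the unrolled cases in pflash_data_read(). */
    static uint32_t load_be(const uint8_t *p, int width)
    {
        uint32_t ret = 0;
        for (int i = 0; i < width; i++) {
            ret = (ret << 8) | p[i];
        }
        return ret;
    }

    int main(void)
    {
        uint8_t buf[4] = { 0xde, 0xad, 0xbe, 0xef };
        printf("0x%08x\n", load_be(buf, 4)); /* 0xdeadbeef */
        return 0;
    }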
@@ -403,61 +400,40 @@ static void pflash_update(PFlashCFI01 *pfl, int offset,
}
}
/* copy current flash content to block update buffer */
static void pflash_blk_write_start(PFlashCFI01 *pfl, hwaddr offset)
{
hwaddr mask = ~(pfl->writeblock_size - 1);
trace_pflash_write_block_start(pfl->name, pfl->counter);
pfl->blk_offset = offset & mask;
memcpy(pfl->blk_bytes, pfl->storage + pfl->blk_offset,
pfl->writeblock_size);
}
/* commit block update buffer changes */
static void pflash_blk_write_flush(PFlashCFI01 *pfl)
{
g_assert(pfl->blk_offset != -1);
trace_pflash_write_block_flush(pfl->name);
memcpy(pfl->storage + pfl->blk_offset, pfl->blk_bytes,
pfl->writeblock_size);
pflash_update(pfl, pfl->blk_offset, pfl->writeblock_size);
pfl->blk_offset = -1;
}
/* discard block update buffer changes */
static void pflash_blk_write_abort(PFlashCFI01 *pfl)
{
trace_pflash_write_block_abort(pfl->name);
pfl->blk_offset = -1;
}
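pflash_blk_write_start() aligns the faulting offset down to the start of
its write block with a power-of-two mask before copying the block into
the update buffer. The alignment step in isolation (assuming
writeblock_size is a power of two, as the mask arithmetic requires):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t block_base(uint64_t offset, uint64_t writeblock_size)
    {
        return offset & ~(writeblock_size - 1);
    }

    int main(void)
    {
        printf("0x%llx\n", (unsigned long long)block_base(0x12345, 0x1000));
        return 0;  /* prints 0x12000 */
    }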
static inline void pflash_data_write(PFlashCFI01 *pfl, hwaddr offset,
uint32_t value, int width, int be)
{
uint8_t *p;
uint8_t *p = pfl->storage;
if (pfl->blk_offset != -1) {
/* block write: redirect writes to block update buffer */
if ((offset < pfl->blk_offset) ||
(offset + width > pfl->blk_offset + pfl->writeblock_size)) {
pfl->status |= 0x10; /* Programming error */
return;
trace_pflash_data_write(pfl->name, offset, width, value, pfl->counter);
switch (width) {
case 1:
p[offset] = value;
break;
case 2:
if (be) {
p[offset] = value >> 8;
p[offset + 1] = value;
} else {
p[offset] = value;
p[offset + 1] = value >> 8;
}
trace_pflash_data_write_block(pfl->name, offset, width, value,
pfl->counter);
p = pfl->blk_bytes + (offset - pfl->blk_offset);
} else {
/* write directly to storage */
trace_pflash_data_write(pfl->name, offset, width, value);
p = pfl->storage + offset;
break;
case 4:
if (be) {
p[offset] = value >> 24;
p[offset + 1] = value >> 16;
p[offset + 2] = value >> 8;
p[offset + 3] = value;
} else {
p[offset] = value;
p[offset + 1] = value >> 8;
p[offset + 2] = value >> 16;
p[offset + 3] = value >> 24;
}
break;
}
if (be) {
stn_be_p(p, width, value);
} else {
stn_le_p(p, width, value);
}
}
static void pflash_write(PFlashCFI01 *pfl, hwaddr offset,
@@ -572,9 +548,9 @@ static void pflash_write(PFlashCFI01 *pfl, hwaddr offset,
} else {
value = extract32(value, 0, pfl->bank_width * 8);
}
trace_pflash_write_block(pfl->name, value);
pfl->counter = value;
pfl->wcycle++;
pflash_blk_write_start(pfl, offset);
break;
case 0x60:
if (cmd == 0xd0) {
@@ -605,7 +581,12 @@ static void pflash_write(PFlashCFI01 *pfl, hwaddr offset,
switch (pfl->cmd) {
case 0xe8: /* Block write */
/* FIXME check @offset, @width */
if (!pfl->ro && (pfl->blk_offset != -1)) {
if (!pfl->ro) {
/*
* FIXME writing straight to memory is *wrong*. We
* should write to a buffer, and flush it to memory
* only on confirm command (see below).
*/
pflash_data_write(pfl, offset, value, width, be);
} else {
pfl->status |= 0x10; /* Programming error */
@@ -614,8 +595,18 @@ static void pflash_write(PFlashCFI01 *pfl, hwaddr offset,
pfl->status |= 0x80;
if (!pfl->counter) {
hwaddr mask = pfl->writeblock_size - 1;
mask = ~mask;
trace_pflash_write(pfl->name, "block write finished");
pfl->wcycle++;
if (!pfl->ro) {
/* Flush the entire write buffer onto backing storage. */
/* FIXME premature! */
pflash_update(pfl, offset & mask, pfl->writeblock_size);
} else {
pfl->status |= 0x10; /* Programming error */
}
}
pfl->counter--;
@@ -627,17 +618,20 @@ static void pflash_write(PFlashCFI01 *pfl, hwaddr offset,
case 3: /* Confirm mode */
switch (pfl->cmd) {
case 0xe8: /* Block write */
if ((cmd == 0xd0) && !(pfl->status & 0x10)) {
pflash_blk_write_flush(pfl);
if (cmd == 0xd0) {
/* FIXME this is where we should write out the buffer */
pfl->wcycle = 0;
pfl->status |= 0x80;
} else {
pflash_blk_write_abort(pfl);
qemu_log_mask(LOG_UNIMP,
"%s: Aborting write to buffer not implemented,"
" the data is already written to storage!\n"
"Flash device reset into READ mode.\n",
__func__);
goto mode_read_array;
}
break;
default:
pflash_blk_write_abort(pfl);
goto error_flash;
}
break;
@@ -871,9 +865,6 @@ static void pflash_cfi01_realize(DeviceState *dev, Error **errp)
pfl->cmd = 0x00;
pfl->status = 0x80; /* WSM ready */
pflash_cfi01_fill_cfi_table(pfl);
pfl->blk_bytes = g_malloc(pfl->writeblock_size);
pfl->blk_offset = -1;
}
static void pflash_cfi01_system_reset(DeviceState *dev)
@@ -893,8 +884,6 @@ static void pflash_cfi01_system_reset(DeviceState *dev)
* This model deliberately ignores this delay.
*/
pfl->status = 0x80;
pfl->blk_offset = -1;
}
static Property pflash_cfi01_properties[] = {

View File

@@ -546,7 +546,7 @@ static void pflash_write(void *opaque, hwaddr offset, uint64_t value,
}
goto reset_flash;
}
trace_pflash_data_write(pfl->name, offset, width, value);
trace_pflash_data_write(pfl->name, offset, width, value, 0);
if (!pfl->ro) {
p = (uint8_t *)pfl->storage + offset;
if (pfl->be) {

View File

@@ -12,8 +12,7 @@ fdctrl_tc_pulse(int level) "TC pulse: %u"
pflash_chip_erase_invalid(const char *name, uint64_t offset) "%s: chip erase: invalid address 0x%" PRIx64
pflash_chip_erase_start(const char *name) "%s: start chip erase"
pflash_data_read(const char *name, uint64_t offset, unsigned size, uint32_t value) "%s: data offset:0x%04"PRIx64" size:%u value:0x%04x"
pflash_data_write(const char *name, uint64_t offset, unsigned size, uint32_t value) "%s: data offset:0x%04"PRIx64" size:%u value:0x%04x"
pflash_data_write_block(const char *name, uint64_t offset, unsigned size, uint32_t value, uint64_t counter) "%s: data offset:0x%04"PRIx64" size:%u value:0x%04x counter:0x%016"PRIx64
pflash_data_write(const char *name, uint64_t offset, unsigned size, uint32_t value, uint64_t counter) "%s: data offset:0x%04"PRIx64" size:%u value:0x%04x counter:0x%016"PRIx64
pflash_device_id(const char *name, uint16_t id) "%s: read device ID: 0x%04x"
pflash_device_info(const char *name, uint64_t offset) "%s: read device information offset:0x%04" PRIx64
pflash_erase_complete(const char *name) "%s: sector erase complete"
@@ -33,9 +32,7 @@ pflash_unlock0_failed(const char *name, uint64_t offset, uint8_t cmd, uint16_t a
pflash_unlock1_failed(const char *name, uint64_t offset, uint8_t cmd) "%s: unlock0 failed 0x%" PRIx64 " 0x%02x"
pflash_unsupported_device_configuration(const char *name, uint8_t width, uint8_t max) "%s: unsupported device configuration: device_width:%d max_device_width:%d"
pflash_write(const char *name, const char *str) "%s: %s"
pflash_write_block_start(const char *name, uint32_t value) "%s: block write start: bytes:0x%x"
pflash_write_block_flush(const char *name) "%s: block write flush"
pflash_write_block_abort(const char *name) "%s: block write abort"
pflash_write_block(const char *name, uint32_t value) "%s: block write: bytes:0x%x"
pflash_write_block_erase(const char *name, uint64_t offset, uint64_t len) "%s: block erase offset:0x%" PRIx64 " bytes:0x%" PRIx64
pflash_write_failed(const char *name, uint64_t offset, uint8_t cmd) "%s: command failed 0x%" PRIx64 " 0x%02x"
pflash_write_invalid(const char *name, uint8_t cmd) "%s: invalid write for command 0x%02x"

View File

@@ -65,7 +65,7 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
iov_discard_undo(&req->inhdr_undo);
iov_discard_undo(&req->outhdr_undo);
virtqueue_push(req->vq, &req->elem, req->in_len);
if (qemu_in_iothread()) {
if (s->dataplane_started && !s->dataplane_disabled) {
virtio_blk_data_plane_notify(s->dataplane, req->vq);
} else {
virtio_notify(vdev, req->vq);
@@ -82,8 +82,11 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
/* Break the link as the next request is going to be parsed from the
* ring again. Otherwise we may end up doing a double completion! */
req->mr_next = NULL;
req->next = s->rq;
s->rq = req;
WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
req->next = s->rq;
s->rq = req;
}
} else if (action == BLOCK_ERROR_ACTION_REPORT) {
virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
if (acct_failed) {
@@ -102,7 +105,6 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
VirtIOBlock *s = next->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(s);
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
while (next) {
VirtIOBlockReq *req = next;
next = req->mr_next;
@@ -135,7 +137,6 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
block_acct_done(blk_get_stats(s->blk), &req->acct);
virtio_blk_free_request(req);
}
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
static void virtio_blk_flush_complete(void *opaque, int ret)
@@ -143,19 +144,13 @@ static void virtio_blk_flush_complete(void *opaque, int ret)
VirtIOBlockReq *req = opaque;
VirtIOBlock *s = req->dev;
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
if (ret) {
if (virtio_blk_handle_rw_error(req, -ret, 0, true)) {
goto out;
}
if (ret && virtio_blk_handle_rw_error(req, -ret, 0, true)) {
return;
}
virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
block_acct_done(blk_get_stats(s->blk), &req->acct);
virtio_blk_free_request(req);
out:
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret)
@@ -165,11 +160,8 @@ static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret)
bool is_write_zeroes = (virtio_ldl_p(VIRTIO_DEVICE(s), &req->out.type) &
~VIRTIO_BLK_T_BARRIER) == VIRTIO_BLK_T_WRITE_ZEROES;
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
if (ret) {
if (virtio_blk_handle_rw_error(req, -ret, false, is_write_zeroes)) {
goto out;
}
if (ret && virtio_blk_handle_rw_error(req, -ret, false, is_write_zeroes)) {
return;
}
virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
@@ -177,9 +169,6 @@ static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret)
block_acct_done(blk_get_stats(s->blk), &req->acct);
}
virtio_blk_free_request(req);
out:
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
#ifdef __linux__
@@ -226,10 +215,8 @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);
out:
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, status);
virtio_blk_free_request(req);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
g_free(ioctl_req);
}
@@ -669,7 +656,6 @@ static void virtio_blk_zone_report_complete(void *opaque, int ret)
{
ZoneCmdData *data = opaque;
VirtIOBlockReq *req = data->req;
VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
struct iovec *in_iov = data->in_iov;
unsigned in_num = data->in_num;
@@ -760,10 +746,8 @@ static void virtio_blk_zone_report_complete(void *opaque, int ret)
}
out:
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
g_free(data->zone_report_data.zones);
g_free(data);
}
@@ -783,8 +767,7 @@ static void virtio_blk_handle_zone_report(VirtIOBlockReq *req,
sizeof(struct virtio_blk_zone_report) +
sizeof(struct virtio_blk_zone_descriptor)) {
virtio_error(vdev, "in buffer too small for zone report");
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
goto out;
return;
}
/* start byte offset of the zone report */
@@ -827,10 +810,8 @@ static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
}
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
@@ -880,7 +861,6 @@ static void virtio_blk_zone_append_complete(void *opaque, int ret)
{
ZoneCmdData *data = opaque;
VirtIOBlockReq *req = data->req;
VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
int64_t append_sector, n;
uint8_t err_status = VIRTIO_BLK_S_OK;
@@ -903,10 +883,8 @@ static void virtio_blk_zone_append_complete(void *opaque, int ret)
trace_virtio_blk_zone_append_complete(vdev, req, append_sector, ret);
out:
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
g_free(data);
}
@@ -942,10 +920,8 @@ static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
return 0;
out:
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
return err_status;
}
@@ -1135,7 +1111,6 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
MultiReqBuffer mrb = {};
bool suppress_notifications = virtio_queue_get_notification(vq);
aio_context_acquire(blk_get_aio_context(s->blk));
defer_call_begin();
do {
@@ -1161,7 +1136,6 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
}
defer_call_end();
aio_context_release(blk_get_aio_context(s->blk));
}
static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
@@ -1177,6 +1151,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
return;
}
}
virtio_blk_handle_vq(s, vq);
}
@@ -1184,12 +1159,14 @@ static void virtio_blk_dma_restart_bh(void *opaque)
{
VirtIOBlock *s = opaque;
VirtIOBlockReq *req = s->rq;
VirtIOBlockReq *req;
MultiReqBuffer mrb = {};
s->rq = NULL;
WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
req = s->rq;
s->rq = NULL;
}
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
while (req) {
VirtIOBlockReq *next = req->next;
if (virtio_blk_handle_request(req, &mrb)) {
@@ -1213,8 +1190,6 @@ static void virtio_blk_dma_restart_bh(void *opaque)
/* Paired with inc in virtio_blk_dma_restart_cb() */
blk_dec_in_flight(s->conf.conf.blk);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
static void virtio_blk_dma_restart_cb(void *opaque, bool running,
@@ -1236,25 +1211,28 @@ static void virtio_blk_dma_restart_cb(void *opaque, bool running,
static void virtio_blk_reset(VirtIODevice *vdev)
{
VirtIOBlock *s = VIRTIO_BLK(vdev);
AioContext *ctx;
VirtIOBlockReq *req;
ctx = blk_get_aio_context(s->blk);
aio_context_acquire(ctx);
/* Dataplane has stopped... */
assert(!s->dataplane_started);
/* ...but requests may still be in flight. */
blk_drain(s->blk);
/* We drop queued requests after blk_drain() because blk_drain() itself can
* produce them. */
while (s->rq) {
req = s->rq;
s->rq = req->next;
virtqueue_detach_element(req->vq, &req->elem, 0);
virtio_blk_free_request(req);
WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
while (s->rq) {
req = s->rq;
s->rq = req->next;
/* No other threads can access req->vq here */
virtqueue_detach_element(req->vq, &req->elem, 0);
virtio_blk_free_request(req);
}
}
aio_context_release(ctx);
assert(!s->dataplane_started);
blk_set_enable_write_cache(s->blk, s->original_wce);
}
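This series moves the s->rq request list under its own s->rq_lock so it
can be walked without holding the AioContext lock; WITH_QEMU_LOCK_GUARD
releases the mutex at the end of the scope. A pthread sketch of the same
drain pattern with explicit lock/unlock (illustrative, not QEMU's lock
guard macro):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct req { struct req *next; };

    static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct req *rq;

    static void drain_rq(void)
    {
        pthread_mutex_lock(&rq_lock);
        while (rq) {
            struct req *r = rq;
            rq = r->next;
            free(r);            /* virtio_blk_free_request() here */
        }
        pthread_mutex_unlock(&rq_lock);
    }

    int main(void)
    {
        for (int i = 0; i < 3; i++) {
            struct req *r = malloc(sizeof(*r));
            r->next = rq;
            rq = r;
        }
        drain_rq();
        printf("rq drained\n");
        return 0;
    }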
@@ -1269,10 +1247,6 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
uint64_t capacity;
int64_t length;
int blk_size = conf->logical_block_size;
AioContext *ctx;
ctx = blk_get_aio_context(s->blk);
aio_context_acquire(ctx);
blk_get_geometry(s->blk, &capacity);
memset(&blkcfg, 0, sizeof(blkcfg));
@@ -1296,7 +1270,6 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
* per track (cylinder).
*/
length = blk_getlength(s->blk);
aio_context_release(ctx);
if (length > 0 && length / conf->heads / conf->secs % blk_size) {
blkcfg.geometry.sectors = conf->secs & ~s->sector_mask;
} else {
@@ -1363,9 +1336,7 @@ static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
memcpy(&blkcfg, config, s->config_size);
aio_context_acquire(blk_get_aio_context(s->blk));
blk_set_enable_write_cache(s->blk, blkcfg.wce != 0);
aio_context_release(blk_get_aio_context(s->blk));
}
static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
@@ -1433,29 +1404,31 @@ static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
* s->blk would erroneously be placed in writethrough mode.
*/
if (!virtio_vdev_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) {
aio_context_acquire(blk_get_aio_context(s->blk));
blk_set_enable_write_cache(s->blk,
virtio_vdev_has_feature(vdev,
VIRTIO_BLK_F_WCE));
aio_context_release(blk_get_aio_context(s->blk));
}
}
static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
{
VirtIOBlock *s = VIRTIO_BLK(vdev);
VirtIOBlockReq *req = s->rq;
while (req) {
qemu_put_sbyte(f, 1);
WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
VirtIOBlockReq *req = s->rq;
if (s->conf.num_queues > 1) {
qemu_put_be32(f, virtio_get_queue_index(req->vq));
while (req) {
qemu_put_sbyte(f, 1);
if (s->conf.num_queues > 1) {
qemu_put_be32(f, virtio_get_queue_index(req->vq));
}
qemu_put_virtqueue_element(vdev, f, &req->elem);
req = req->next;
}
qemu_put_virtqueue_element(vdev, f, &req->elem);
req = req->next;
}
qemu_put_sbyte(f, 0);
}
@@ -1481,13 +1454,78 @@ static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
req = qemu_get_virtqueue_element(vdev, f, sizeof(VirtIOBlockReq));
virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req);
req->next = s->rq;
s->rq = req;
WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
req->next = s->rq;
s->rq = req;
}
}
return 0;
}
static bool
validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list,
uint16_t num_queues, Error **errp)
{
g_autofree unsigned long *vqs = bitmap_new(num_queues);
g_autoptr(GHashTable) iothreads =
g_hash_table_new(g_str_hash, g_str_equal);
for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) {
const char *name = node->value->iothread;
uint16List *vq;
if (!iothread_by_id(name)) {
error_setg(errp, "IOThread \"%s\" object does not exist", name);
return false;
}
if (!g_hash_table_add(iothreads, (gpointer)name)) {
error_setg(errp,
"duplicate IOThread name \"%s\" in iothread-vq-mapping",
name);
return false;
}
if (node != list) {
if (!!node->value->vqs != !!list->value->vqs) {
error_setg(errp, "either all items in iothread-vq-mapping "
"must have vqs or none of them must have it");
return false;
}
}
for (vq = node->value->vqs; vq; vq = vq->next) {
if (vq->value >= num_queues) {
error_setg(errp, "vq index %u for IOThread \"%s\" must be "
"less than num_queues %u in iothread-vq-mapping",
vq->value, name, num_queues);
return false;
}
if (test_and_set_bit(vq->value, vqs)) {
error_setg(errp, "cannot assign vq %u to IOThread \"%s\" "
"because it is already assigned", vq->value, name);
return false;
}
}
}
if (list->value->vqs) {
for (uint16_t i = 0; i < num_queues; i++) {
if (!test_bit(i, vqs)) {
error_setg(errp,
"missing vq %u IOThread assignment in iothread-vq-mapping",
i);
return false;
}
}
}
return true;
}
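validate_iothread_vq_mapping_list() enforces three invariants: every
explicitly listed vq index is in range, no index is assigned twice (the
test_and_set_bit() call), and, when explicit lists are used, every queue
gets exactly one assignment. A condensed standalone version using a plain
bool array instead of a bitmap (sketch only, small fixed bound assumed):

    #include <stdbool.h>
    #include <stdio.h>

    static bool validate(const int *vqs, int n_vqs, int num_queues)
    {
        bool seen[64] = { false };      /* assumes num_queues <= 64 */

        for (int i = 0; i < n_vqs; i++) {
            if (vqs[i] >= num_queues) {
                return false;           /* index out of range */
            }
            if (seen[vqs[i]]) {
                return false;           /* assigned twice */
            }
            seen[vqs[i]] = true;
        }
        for (int q = 0; q < num_queues; q++) {
            if (!seen[q]) {
                return false;           /* queue left unassigned */
            }
        }
        return true;
    }

    int main(void)
    {
        int ok[] = { 0, 1, 2 }, dup[] = { 0, 1, 1 };
        printf("%d %d\n", validate(ok, 3, 3), validate(dup, 3, 3));
        return 0;  /* prints 1 0 */
    }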
static void virtio_resize_cb(void *opaque)
{
VirtIODevice *vdev = opaque;
@@ -1512,34 +1550,24 @@ static void virtio_blk_resize(void *opaque)
static void virtio_blk_drained_begin(void *opaque)
{
VirtIOBlock *s = opaque;
VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
AioContext *ctx = blk_get_aio_context(s->conf.conf.blk);
if (!s->dataplane || !s->dataplane_started) {
return;
}
for (uint16_t i = 0; i < s->conf.num_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
virtio_queue_aio_detach_host_notifier(vq, ctx);
}
virtio_blk_data_plane_detach(s->dataplane);
}
/* Resume virtqueue ioeventfd processing after drain */
static void virtio_blk_drained_end(void *opaque)
{
VirtIOBlock *s = opaque;
VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
AioContext *ctx = blk_get_aio_context(s->conf.conf.blk);
if (!s->dataplane || !s->dataplane_started) {
return;
}
for (uint16_t i = 0; i < s->conf.num_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
virtio_queue_aio_attach_host_notifier(vq, ctx);
}
virtio_blk_data_plane_attach(s->dataplane);
}
static const BlockDevOps virtio_block_ops = {
@@ -1625,10 +1653,25 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
return;
}
if (conf->iothread_vq_mapping_list) {
if (conf->iothread) {
error_setg(errp, "iothread and iothread-vq-mapping properties "
"cannot be set at the same time");
return;
}
if (!validate_iothread_vq_mapping_list(conf->iothread_vq_mapping_list,
conf->num_queues, errp)) {
return;
}
}
s->config_size = virtio_get_config_size(&virtio_blk_cfg_size_params,
s->host_features);
virtio_init(vdev, VIRTIO_ID_BLOCK, s->config_size);
qemu_mutex_init(&s->rq_lock);
s->blk = conf->conf.blk;
s->rq = NULL;
s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
@@ -1680,6 +1723,7 @@ static void virtio_blk_device_unrealize(DeviceState *dev)
virtio_del_queue(vdev, i);
}
qemu_coroutine_dec_pool_size(conf->num_queues * conf->queue_size / 2);
qemu_mutex_destroy(&s->rq_lock);
blk_ram_registrar_destroy(&s->blk_ram_registrar);
qemu_del_vm_change_state_handler(s->change);
blockdev_mark_auto_del(s->blk);
@@ -1724,6 +1768,8 @@ static Property virtio_blk_properties[] = {
DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true),
DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
IOThread *),
DEFINE_PROP_IOTHREAD_VQ_MAPPING_LIST("iothread-vq-mapping", VirtIOBlock,
conf.iothread_vq_mapping_list),
DEFINE_PROP_BIT64("discard", VirtIOBlock, host_features,
VIRTIO_BLK_F_DISCARD, true),
DEFINE_PROP_BOOL("report-discard-granularity", VirtIOBlock,

View File

@@ -985,7 +985,8 @@ static void virtser_port_device_realize(DeviceState *dev, Error **errp)
return;
}
port->bh = virtio_bh_new_guarded(dev, flush_queued_data_bh, port);
port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port,
&dev->mem_reentrancy_guard);
port->elem = NULL;
}

View File

@@ -32,6 +32,9 @@
#include "hw/virtio/virtio-net.h"
#include "audio/audio.h"
GlobalProperty hw_compat_8_2[] = {};
const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2);
GlobalProperty hw_compat_8_1[] = {
{ TYPE_PCI_BRIDGE, "x-pci-express-writeable-slt-bug", "true" },
{ "ramfb", "x-migrate", "off" },
@@ -97,7 +100,6 @@ GlobalProperty hw_compat_5_2[] = {
{ "PIIX4_PM", "smm-compat", "on"},
{ "virtio-blk-device", "report-discard-granularity", "off" },
{ "virtio-net-pci-base", "vectors", "3"},
{ "nvme", "msix-exclusive-bar", "on"},
};
const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2);

View File

@@ -18,6 +18,7 @@
#include "qapi/qapi-types-block.h"
#include "qapi/qapi-types-machine.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-visit-virtio.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ctype.h"
#include "qemu/cutils.h"
@@ -120,9 +121,7 @@ static void set_drive_helper(Object *obj, Visitor *v, const char *name,
"node");
}
aio_context_acquire(ctx);
blk_replace_bs(blk, bs, errp);
aio_context_release(ctx);
return;
}
@@ -148,10 +147,7 @@ static void set_drive_helper(Object *obj, Visitor *v, const char *name,
0, BLK_PERM_ALL);
blk_created = true;
aio_context_acquire(ctx);
ret = blk_insert_bs(blk, bs, errp);
aio_context_release(ctx);
if (ret < 0) {
goto fail;
}
@@ -207,12 +203,8 @@ static void release_drive(Object *obj, const char *name, void *opaque)
BlockBackend **ptr = object_field_prop_ptr(obj, prop);
if (*ptr) {
AioContext *ctx = blk_get_aio_context(*ptr);
aio_context_acquire(ctx);
blockdev_auto_del(*ptr);
blk_detach_dev(*ptr, dev);
aio_context_release(ctx);
}
}
@@ -1169,3 +1161,48 @@ const PropertyInfo qdev_prop_cpus390entitlement = {
.set = qdev_propinfo_set_enum,
.set_default_value = qdev_propinfo_set_default_value_enum,
};
/* --- IOThreadVirtQueueMappingList --- */
static void get_iothread_vq_mapping_list(Object *obj, Visitor *v,
const char *name, void *opaque, Error **errp)
{
IOThreadVirtQueueMappingList **prop_ptr =
object_field_prop_ptr(obj, opaque);
visit_type_IOThreadVirtQueueMappingList(v, name, prop_ptr, errp);
}
static void set_iothread_vq_mapping_list(Object *obj, Visitor *v,
const char *name, void *opaque, Error **errp)
{
IOThreadVirtQueueMappingList **prop_ptr =
object_field_prop_ptr(obj, opaque);
IOThreadVirtQueueMappingList *list;
if (!visit_type_IOThreadVirtQueueMappingList(v, name, &list, errp)) {
return;
}
qapi_free_IOThreadVirtQueueMappingList(*prop_ptr);
*prop_ptr = list;
}
static void release_iothread_vq_mapping_list(Object *obj,
const char *name, void *opaque)
{
IOThreadVirtQueueMappingList **prop_ptr =
object_field_prop_ptr(obj, opaque);
qapi_free_IOThreadVirtQueueMappingList(*prop_ptr);
*prop_ptr = NULL;
}
const PropertyInfo qdev_prop_iothread_vq_mapping_list = {
.name = "IOThreadVirtQueueMappingList",
.description = "IOThread virtqueue mapping list [{\"iothread\":\"<id>\", "
"\"vqs\":[1,2,3,...]},...]",
.get = get_iothread_vq_mapping_list,
.set = set_iothread_vq_mapping_list,
.release = release_iothread_vq_mapping_list,
};

View File

@@ -1076,16 +1076,18 @@ void device_class_set_props(DeviceClass *dc, Property *props)
void qdev_alias_all_properties(DeviceState *target, Object *source)
{
ObjectClass *class;
Property *prop;
ObjectPropertyIterator iter;
ObjectProperty *prop;
class = object_get_class(OBJECT(target));
do {
DeviceClass *dc = DEVICE_CLASS(class);
for (prop = dc->props_; prop && prop->name; prop++) {
object_property_add_alias(source, prop->name,
OBJECT(target), prop->name);
object_class_property_iter_init(&iter, class);
while ((prop = object_property_iter_next(&iter))) {
if (object_property_find(source, prop->name)) {
continue; /* skip duplicate properties */
}
class = object_class_get_parent(class);
} while (class != object_class_by_name(TYPE_DEVICE));
object_property_add_alias(source, prop->name,
OBJECT(target), prop->name);
}
}

View File

@@ -49,7 +49,6 @@ static void ct3_build_cdat(CDATObject *cdat, Error **errp)
g_autofree CDATTableHeader *cdat_header = NULL;
g_autofree CDATEntry *cdat_st = NULL;
uint8_t sum = 0;
uint8_t *hdr_buf;
int ent, i;
/* Use default table if fopen == NULL */
@@ -64,7 +63,7 @@ static void ct3_build_cdat(CDATObject *cdat, Error **errp)
cdat->built_buf_len = cdat->build_cdat_table(&cdat->built_buf,
cdat->private);
if (cdat->built_buf_len <= 0) {
if (!cdat->built_buf_len) {
/* Build later as not all data available yet */
cdat->to_update = true;
return;
@@ -96,12 +95,8 @@ static void ct3_build_cdat(CDATObject *cdat, Error **errp)
/* For now, no runtime updates */
cdat_header->sequence = 0;
cdat_header->length += sizeof(CDATTableHeader);
hdr_buf = (uint8_t *)cdat_header;
for (i = 0; i < sizeof(*cdat_header); i++) {
sum += hdr_buf[i];
}
sum += cdat_header->revision + cdat_header->sequence +
cdat_header->length;
/* Sum of all bytes including checksum must be 0 */
cdat_header->checksum = ~sum + 1;
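The CDAT header checksum follows the usual ACPI-style rule spelled out in
the comment: choose the checksum byte so that all header bytes, checksum
included, sum to zero mod 256. A standalone sketch (the checksum byte
position in this example is arbitrary):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint8_t twos_complement_checksum(const uint8_t *buf, size_t len)
    {
        uint8_t sum = 0;
        for (size_t i = 0; i < len; i++) {
            sum += buf[i];
        }
        return (uint8_t)(~sum + 1);     /* equivalently, -sum */
    }

    int main(void)
    {
        uint8_t hdr[8] = { 0x20, 0, 0, 0, 0, 0, 0, 0 };
        hdr[4] = twos_complement_checksum(hdr, sizeof(hdr));

        uint8_t total = 0;
        for (size_t i = 0; i < sizeof(hdr); i++) {
            total += hdr[i];
        }
        printf("total = %u (must be 0)\n", total);
        return 0;
    }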

View File

@@ -199,7 +199,7 @@ void cxl_component_register_block_init(Object *obj,
/* io registers controls link which we don't care about in QEMU */
memory_region_init_io(&cregs->io, obj, NULL, cregs, ".io",
CXL2_COMPONENT_IO_REGION_SIZE);
memory_region_init_io(&cregs->cache_mem, obj, &cache_mem_ops, cxl_cstate,
memory_region_init_io(&cregs->cache_mem, obj, &cache_mem_ops, cregs,
".cache_mem", CXL2_COMPONENT_CM_REGION_SIZE);
memory_region_add_subregion(&cregs->component_registers, 0, &cregs->io);

View File

@@ -229,9 +229,12 @@ static void mailbox_reg_write(void *opaque, hwaddr offset, uint64_t value,
static uint64_t mdev_reg_read(void *opaque, hwaddr offset, unsigned size)
{
CXLDeviceState *cxl_dstate = opaque;
uint64_t retval = 0;
return cxl_dstate->memdev_status;
retval = FIELD_DP64(retval, CXL_MEM_DEV_STS, MEDIA_STATUS, 1);
retval = FIELD_DP64(retval, CXL_MEM_DEV_STS, MBOX_READY, 1);
return retval;
}
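mdev_reg_read() now builds the status value on the fly with FIELD_DP64,
which deposits a value into a named bitfield of a 64-bit register image.
A sketch of the deposit operation itself (the MEDIA_STATUS and MBOX_READY
bit positions below are invented for the example, not the real CXL
layout):

    #include <stdint.h>
    #include <stdio.h>

    /* Deposit `val` into the bitfield [shift, shift + len) of reg. */
    static uint64_t dp64(uint64_t reg, unsigned shift, unsigned len,
                         uint64_t val)
    {
        uint64_t mask = ((1ULL << len) - 1) << shift;
        return (reg & ~mask) | ((val << shift) & mask);
    }

    int main(void)
    {
        uint64_t status = 0;
        status = dp64(status, 2, 2, 1);   /* "MEDIA_STATUS" = 1 */
        status = dp64(status, 4, 1, 1);   /* "MBOX_READY" = 1 */
        printf("0x%llx\n", (unsigned long long)status); /* 0x14 */
        return 0;
    }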
static void ro_reg_write(void *opaque, hwaddr offset, uint64_t value,
@@ -368,15 +371,7 @@ static void mailbox_reg_init_common(CXLDeviceState *cxl_dstate)
cxl_dstate->mbox_msi_n = msi_n;
}
static void memdev_reg_init_common(CXLDeviceState *cxl_dstate)
{
uint64_t memdev_status_reg;
memdev_status_reg = FIELD_DP64(0, CXL_MEM_DEV_STS, MEDIA_STATUS, 1);
memdev_status_reg = FIELD_DP64(memdev_status_reg, CXL_MEM_DEV_STS,
MBOX_READY, 1);
cxl_dstate->memdev_status = memdev_status_reg;
}
static void memdev_reg_init_common(CXLDeviceState *cxl_dstate) { }
void cxl_device_register_init_t3(CXLType3Dev *ct3d)
{

View File

@@ -251,7 +251,11 @@ void
virtio_gpu_base_device_unrealize(DeviceState *qdev)
{
VirtIOGPUBase *g = VIRTIO_GPU_BASE(qdev);
VirtIODevice *vdev = VIRTIO_DEVICE(qdev);
virtio_del_queue(vdev, 0);
virtio_del_queue(vdev, 1);
virtio_cleanup(vdev);
migrate_del_blocker(&g->migration_blocker);
}

View File

@@ -181,7 +181,7 @@ static void virgl_cmd_set_scanout(VirtIOGPU *g,
memset(&info, 0, sizeof(info));
ret = virgl_renderer_resource_get_info(ss.resource_id, &info);
#endif
if (ret) {
if (ret == -1) {
qemu_log_mask(LOG_GUEST_ERROR,
"%s: illegal resource specified %d\n",
__func__, ss.resource_id);

View File

@@ -1463,8 +1463,10 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp)
g->ctrl_vq = virtio_get_queue(vdev, 0);
g->cursor_vq = virtio_get_queue(vdev, 1);
g->ctrl_bh = virtio_bh_new_guarded(qdev, virtio_gpu_ctrl_bh, g);
g->cursor_bh = virtio_bh_new_guarded(qdev, virtio_gpu_cursor_bh, g);
g->ctrl_bh = qemu_bh_new_guarded(virtio_gpu_ctrl_bh, g,
&qdev->mem_reentrancy_guard);
g->cursor_bh = qemu_bh_new_guarded(virtio_gpu_cursor_bh, g,
&qdev->mem_reentrancy_guard);
g->reset_bh = qemu_bh_new(virtio_gpu_reset_bh, g);
qemu_cond_init(&g->reset_cond);
QTAILQ_INIT(&g->reslist);

View File

@@ -7,7 +7,6 @@ config HPPA_B160L
select DINO
select LASI
select SERIAL
select SERIAL_PCI
select ISA_BUS
select I8259
select IDE_CMD646
@@ -17,4 +16,3 @@ config HPPA_B160L
select LASIPS2
select PARALLEL
select ARTIST
select USB_OHCI_PCI

View File

@@ -36,8 +36,8 @@
#define MIN_SEABIOS_HPPA_VERSION 12 /* require at least this fw version */
#define HPA_POWER_BUTTON (FIRMWARE_END - 0x10)
static hwaddr soft_power_reg;
/* Power button address at &PAGE0->pad[4] */
#define HPA_POWER_BUTTON (0x40 + 4 * sizeof(uint32_t))
#define enable_lasi_lan() 0
@@ -45,6 +45,7 @@ static DeviceState *lasi_dev;
static void hppa_powerdown_req(Notifier *n, void *opaque)
{
hwaddr soft_power_reg = HPA_POWER_BUTTON;
uint32_t val;
val = ldl_be_phys(&address_space_memory, soft_power_reg);
@@ -220,7 +221,7 @@ static FWCfgState *create_fw_cfg(MachineState *ms, PCIBus *pci_bus,
fw_cfg_add_file(fw_cfg, "/etc/hppa/machine",
g_memdup(mc->name, len), len);
val = cpu_to_le64(soft_power_reg);
val = cpu_to_le64(HPA_POWER_BUTTON);
fw_cfg_add_file(fw_cfg, "/etc/hppa/power-button-addr",
g_memdup(&val, sizeof(val)), sizeof(val));
@@ -275,7 +276,6 @@ static TranslateFn *machine_HP_common_init_cpus(MachineState *machine)
unsigned int smp_cpus = machine->smp.cpus;
TranslateFn *translate;
MemoryRegion *cpu_region;
uint64_t ram_max;
/* Create CPUs. */
for (unsigned int i = 0; i < smp_cpus; i++) {
@@ -288,14 +288,10 @@ static TranslateFn *machine_HP_common_init_cpus(MachineState *machine)
*/
if (hppa_is_pa20(&cpu[0]->env)) {
translate = translate_pa20;
ram_max = 0xf0000000; /* 3.75 GB (limited by 32-bit firmware) */
} else {
translate = translate_pa10;
ram_max = 0xf0000000; /* 3.75 GB (32-bit CPU) */
}
soft_power_reg = translate(NULL, HPA_POWER_BUTTON);
for (unsigned int i = 0; i < smp_cpus; i++) {
g_autofree char *name = g_strdup_printf("cpu%u-io-eir", i);
@@ -315,9 +311,9 @@ static TranslateFn *machine_HP_common_init_cpus(MachineState *machine)
cpu_region);
/* Main memory region. */
if (machine->ram_size > ram_max) {
info_report("Max RAM size limited to %" PRIu64 " MB", ram_max / MiB);
machine->ram_size = ram_max;
if (machine->ram_size > 3 * GiB) {
error_report("RAM size is currently restricted to 3GB");
exit(EXIT_FAILURE);
}
memory_region_add_subregion_overlap(addr_space, 0, machine->ram, -1);
@@ -347,10 +343,8 @@ static void machine_HP_common_init_tail(MachineState *machine, PCIBus *pci_bus,
SysBusDevice *s;
/* SCSI disk setup. */
-    if (drive_get_max_bus(IF_SCSI) >= 0) {
-        dev = DEVICE(pci_create_simple(pci_bus, -1, "lsi53c895a"));
-        lsi53c8xx_handle_legacy_cmdline(dev);
-    }
+    dev = DEVICE(pci_create_simple(pci_bus, -1, "lsi53c895a"));
+    lsi53c8xx_handle_legacy_cmdline(dev);
/* Graphics setup. */
if (machine->enable_graphics && vga_interface_type != VGA_NONE) {
@@ -363,7 +357,7 @@ static void machine_HP_common_init_tail(MachineState *machine, PCIBus *pci_bus,
}
/* Network setup. */
-    if (nd_table[0].used && enable_lasi_lan()) {
+    if (enable_lasi_lan()) {
lasi_82596_init(addr_space, translate(NULL, LASI_LAN_HPA),
qdev_get_gpio_in(lasi_dev, LASI_IRQ_LAN_HPA));
}
@@ -388,7 +382,7 @@ static void machine_HP_common_init_tail(MachineState *machine, PCIBus *pci_bus,
pci_set_word(&pci_dev->config[PCI_SUBSYSTEM_ID], 0x1227); /* Powerbar */
/* create a second serial PCI card when running Astro */
-    if (serial_hd(1) && !lasi_dev) {
+    if (!lasi_dev) {
pci_dev = pci_new(-1, "pci-serial-4x");
qdev_prop_set_chr(DEVICE(pci_dev), "chardev1", serial_hd(1));
qdev_prop_set_chr(DEVICE(pci_dev), "chardev2", serial_hd(2));

View File

@@ -95,6 +95,7 @@ config Q35
imply E1000E_PCI_EXPRESS
imply VMPORT
imply VMMOUSE
+imply IOMMUFD
select PC_PCI
select PC_ACPI
select PCI_EXPRESS_Q35

View File

@@ -1415,7 +1415,7 @@ static void build_acpi0017(Aml *table)
aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0017")));
method = aml_method("_STA", 0, AML_NOTSERIALIZED);
-    aml_append(method, aml_return(aml_int(0x0B)));
+    aml_append(method, aml_return(aml_int(0x01)));
aml_append(dev, method);
build_cxl_dsm_method(dev);
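
The _STA hunk is a one-nibble change with a concrete meaning: _STA returns a bitmask defined by the ACPI spec. Decoding the two constants, with enumerator names that are illustrative rather than QEMU's:

/* ACPI _STA result bits:
 *   bit 0: device present
 *   bit 1: enabled and decoding its resources
 *   bit 2: should be shown in the UI
 *   bit 3: functioning properly
 * 0x0B = present | enabled | functioning (UI bit clear);
 * 0x01 = present only. */
enum {
    STA_PRESENT     = 1 << 0,
    STA_ENABLED     = 1 << 1,
    STA_SHOW_IN_UI  = 1 << 2,
    STA_FUNCTIONING = 1 << 3,
};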

View File

@@ -78,6 +78,9 @@
{ "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\
{ "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },
+GlobalProperty pc_compat_8_2[] = {};
+const size_t pc_compat_8_2_len = G_N_ELEMENTS(pc_compat_8_2);
GlobalProperty pc_compat_8_1[] = {};
const size_t pc_compat_8_1_len = G_N_ELEMENTS(pc_compat_8_1);
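
pc_compat_8_2 starts out empty and accumulates entries whenever a later release changes a device-property default. For illustration only, a populated compat array has this shape; the entry shown is hypothetical, not a real 8.2 difference:

/* Hypothetical example: pin a device property back to its old default
 * so guests on the older machine type see unchanged behavior. */
GlobalProperty pc_compat_example[] = {
    { "some-device", "some-property", "off" },
};
const size_t pc_compat_example_len = G_N_ELEMENTS(pc_compat_example);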

View File

@@ -545,13 +545,26 @@ static void pc_i440fx_machine_options(MachineClass *m)
"Use a different south bridge than PIIX3");
}
-static void pc_i440fx_8_2_machine_options(MachineClass *m)
+static void pc_i440fx_9_0_machine_options(MachineClass *m)
{
pc_i440fx_machine_options(m);
m->alias = "pc";
m->is_default = true;
}
+DEFINE_I440FX_MACHINE(v9_0, "pc-i440fx-9.0", NULL,
+                      pc_i440fx_9_0_machine_options);
+
+static void pc_i440fx_8_2_machine_options(MachineClass *m)
+{
+    pc_i440fx_9_0_machine_options(m);
+    m->alias = NULL;
+    m->is_default = false;
+    compat_props_add(m->compat_props, hw_compat_8_2, hw_compat_8_2_len);
+    compat_props_add(m->compat_props, pc_compat_8_2, pc_compat_8_2_len);
+}
DEFINE_I440FX_MACHINE(v8_2, "pc-i440fx-8.2", NULL,
pc_i440fx_8_2_machine_options);
@@ -560,8 +573,6 @@ static void pc_i440fx_8_1_machine_options(MachineClass *m)
PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
pc_i440fx_8_2_machine_options(m);
-    m->alias = NULL;
-    m->is_default = false;
pcmc->broken_32bit_mem_addr_check = true;
compat_props_add(m->compat_props, hw_compat_8_1, hw_compat_8_1_len);
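
The q35 hunk below repeats the same pattern, so it is worth stating once: the newest version's options function owns the alias (and, for i440fx, the is_default flag), and every older version chains to its successor, strips those markers, and appends its own compat arrays. Reduced to a skeleton with hypothetical function names:

static void pc_example_9_0_machine_options(MachineClass *m)
{
    pc_example_machine_options(m);   /* options shared by all versions */
    m->alias = "pc";                 /* only the newest holds the alias */
    m->is_default = true;
}

static void pc_example_8_2_machine_options(MachineClass *m)
{
    pc_example_9_0_machine_options(m);
    m->alias = NULL;                 /* older versions drop the alias */
    m->is_default = false;
    compat_props_add(m->compat_props, hw_compat_8_2, hw_compat_8_2_len);
    compat_props_add(m->compat_props, pc_compat_8_2, pc_compat_8_2_len);
}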

View File

@@ -383,12 +383,23 @@ static void pc_q35_machine_options(MachineClass *m)
machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
}
-static void pc_q35_8_2_machine_options(MachineClass *m)
+static void pc_q35_9_0_machine_options(MachineClass *m)
{
pc_q35_machine_options(m);
m->alias = "q35";
}
+DEFINE_Q35_MACHINE(v9_0, "pc-q35-9.0", NULL,
+                   pc_q35_9_0_machine_options);
+
+static void pc_q35_8_2_machine_options(MachineClass *m)
+{
+    pc_q35_9_0_machine_options(m);
+    m->alias = NULL;
+    compat_props_add(m->compat_props, hw_compat_8_2, hw_compat_8_2_len);
+    compat_props_add(m->compat_props, pc_compat_8_2, pc_compat_8_2_len);
+}
DEFINE_Q35_MACHINE(v8_2, "pc-q35-8.2", NULL,
pc_q35_8_2_machine_options);

View File

@@ -34,5 +34,5 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms)
bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size)
{
-    return true;
+    g_assert_not_reached();
}
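
The stub change matters because `return true` reports success without ever writing the out-parameters. A hypothetical caller shows the hazard the assertion removes:

/* Hypothetical caller: with the `return true` stub it would compute an
 * end address from values the stub never filled in; with
 * g_assert_not_reached() the stub cannot "succeed" silently. */
static uint64_t first_epc_section_end(void)
{
    uint64_t addr = 0, size = 0;

    if (sgx_epc_get_section(0, &addr, &size)) {
        return addr + size;
    }
    return 0;
}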

View File

@@ -12,6 +12,10 @@ config IOAPIC
bool
select I8259
+config ARM_GIC
+    bool
+    select MSI_NONBROKEN
config OPENPIC
bool
select MSI_NONBROKEN
@@ -21,18 +25,14 @@ config APIC
select MSI_NONBROKEN
select I8259
-config ARM_GIC
-    bool
-    select ARM_GICV3_TCG if TCG
-    select ARM_GIC_KVM if KVM
-    select MSI_NONBROKEN
config ARM_GICV3_TCG
bool
+    default y
+    depends on ARM_GIC && TCG
config ARM_GIC_KVM
bool
+    default y
+    depends on ARM_GIC && KVM
config XICS

View File

@@ -1067,7 +1067,7 @@ static uint64_t icc_hppir0_value(GICv3CPUState *cs, CPUARMState *env)
*/
bool irq_is_secure;
-    if (icc_no_enabled_hppi(cs)) {
+    if (cs->hppi.prio == 0xff) {
return INTID_SPURIOUS;
}
@@ -1104,7 +1104,7 @@ static uint64_t icc_hppir1_value(GICv3CPUState *cs, CPUARMState *env)
*/
bool irq_is_secure;
-    if (icc_no_enabled_hppi(cs)) {
+    if (cs->hppi.prio == 0xff) {
return INTID_SPURIOUS;
}
@@ -1434,25 +1434,16 @@ static void icv_eoir_write(CPUARMState *env, const ARMCPRegInfo *ri,
idx = icv_find_active(cs, irq);
if (idx < 0) {
-        /*
-         * No valid list register corresponding to EOI ID; if this is a vLPI
-         * not in the list regs then do nothing; otherwise increment EOI count
-         */
-        if (irq < GICV3_LPI_INTID_START) {
-            icv_increment_eoicount(cs);
-        }
+        /* No valid list register corresponding to EOI ID */
+        icv_increment_eoicount(cs);
} else {
uint64_t lr = cs->ich_lr_el2[idx];
int thisgrp = (lr & ICH_LR_EL2_GROUP) ? GICV3_G1NS : GICV3_G0;
int lr_gprio = ich_lr_prio(lr) & icv_gprio_mask(cs, grp);
if (thisgrp == grp && lr_gprio == dropprio) {
-            if (!icv_eoi_split(env, cs) || irq >= GICV3_LPI_INTID_START) {
-                /*
-                 * Priority drop and deactivate not split: deactivate irq now.
-                 * LPIs always get their active state cleared immediately
-                 * because no separate deactivate is expected.
-                 */
+            if (!icv_eoi_split(env, cs)) {
+                /* Priority drop and deactivate not split: deactivate irq now */
icv_deactivate_irq(cs, idx);
}
}
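
The longer variant of this EOI path encodes one extra rule: (v)LPIs have no active state, so an EOI must complete immediately even when EOImode splits priority drop from deactivation. The predicate, pulled out as a sketch:

/* Sketch of the deactivate-on-EOI decision from the hunk above:
 * deactivate now when priority drop and deactivate are not split,
 * or unconditionally for (v)LPIs, which have no active state. */
static bool eoi_deactivates_now(bool eoi_split, int irq)
{
    return !eoi_split || irq >= GICV3_LPI_INTID_START;
}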

View File

@@ -21,6 +21,7 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/module.h"
#include "qemu/error-report.h"
#include "hw/intc/arm_gicv3_its_common.h"
#include "hw/qdev-properties.h"
#include "sysemu/runstate.h"

View File

@@ -463,7 +463,6 @@ static uint32_t riscv_aplic_idc_claimi(RISCVAPLICState *aplic, uint32_t idc)
if (!topi) {
aplic->iforce[idc] = 0;
riscv_aplic_idc_update(aplic, idc);
-        return 0;
}

Some files were not shown because too many files have changed in this diff.