qtest/migration: Document live=true cases

Document every single live=true use cases on why it should be done in the live manner. Also document on the parameter so new precopy cases should always use live=off unless with explicit reasonings. Cc: Thomas Huth <thuth@redhat.com> Cc: Juan Quintela <quintela@redhat.com> Cc: Daniel P. Berrangé <berrange@redhat.com> Reviewed-by: Juan Quintela <quintela@redhat.com> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> Signed-off-by: Peter Xu <peterx@redhat.com> Message-Id: <20230601172935.175726-1-peterx@redhat.com> Signed-off-by: Juan Quintela <quintela@redhat.com>
tests/qtest: make more migration pre-copy scenarios run non-live
2023-06-02 11:46:20 +02:00 · 2023-06-02 11:46:19 +02:00 · 2023-06-02 11:46:19 +02:00 · 2023-06-02 11:46:19 +02:00 · 2023-06-02 11:46:19 +02:00 · 2023-06-02 11:46:19 +02:00
877 changed files with 34603 additions and 22438 deletions
--- a/.gitlab-ci.d/buildtest-template.yml
+++ b/.gitlab-ci.d/buildtest-template.yml
@@ -5,19 +5,15 @@
  before_script:
    - JOBS=$(expr $(nproc) + 1)
  script:
-    - if test -n "$LD_JOBS";
-      then
-        scripts/git-submodule.sh update meson ;
-      fi
    - mkdir build
    - cd build
    - ../configure --enable-werror --disable-docs --enable-fdt=system
-          ${LD_JOBS:+--meson=git} ${TARGETS:+--target-list="$TARGETS"}
+          ${TARGETS:+--target-list="$TARGETS"}
          $CONFIGURE_ARGS ||
      { cat config.log meson-logs/meson-log.txt && exit 1; }
    - if test -n "$LD_JOBS";
      then
-        ../meson/meson.py configure . -Dbackend_max_links="$LD_JOBS" ;
+        pyvenv/bin/meson configure . -Dbackend_max_links="$LD_JOBS" ;
      fi || exit 1;
    - make -j"$JOBS"
    - if test -n "$MAKE_CHECK_ARGS";
--- a/.gitlab-ci.d/buildtest.yml
+++ b/.gitlab-ci.d/buildtest.yml
@@ -103,7 +103,7 @@ crash-test-debian:
  script:
    - cd build
    - make NINJA=":" check-venv
-    - tests/venv/bin/python3 scripts/device-crash-test -q --tcg-only ./qemu-system-i386
+    - pyvenv/bin/python3 scripts/device-crash-test -q --tcg-only ./qemu-system-i386

 build-system-fedora:
  extends:
@@ -146,8 +146,8 @@ crash-test-fedora:
  script:
    - cd build
    - make NINJA=":" check-venv
-    - tests/venv/bin/python3 scripts/device-crash-test -q ./qemu-system-ppc
-    - tests/venv/bin/python3 scripts/device-crash-test -q ./qemu-system-riscv32
+    - pyvenv/bin/python3 scripts/device-crash-test -q ./qemu-system-ppc
+    - pyvenv/bin/python3 scripts/device-crash-test -q ./qemu-system-riscv32

 build-system-centos:
  extends:
@@ -527,7 +527,7 @@ build-without-defaults:
      --disable-strip
    TARGETS: avr-softmmu mips64-softmmu s390x-softmmu sh4-softmmu
      sparc64-softmmu hexagon-linux-user i386-linux-user s390x-linux-user
-    MAKE_CHECK_ARGS: check-unit check-qtest-avr check-qtest-mips64
+    MAKE_CHECK_ARGS: check

 build-libvhost-user:
  extends: .base_job_template
--- a/.gitlab-ci.d/cirrus/freebsd-13.vars
+++ b/.gitlab-ci.d/cirrus/freebsd-13.vars
@@ -11,6 +11,6 @@ MAKE='/usr/local/bin/gmake'
 NINJA='/usr/local/bin/ninja'
 PACKAGING_COMMAND='pkg'
 PIP3='/usr/local/bin/pip-3.8'
-PKGS='alsa-lib bash bison bzip2 ca_root_nss capstone4 ccache cdrkit-genisoimage cmocka ctags curl cyrus-sasl dbus diffutils dtc flex fusefs-libs3 gettext git glib gmake gnutls gsed gtk3 json-c libepoxy libffi libgcrypt libjpeg-turbo libnfs libslirp libspice-server libssh libtasn1 llvm lzo2 meson ncurses nettle ninja opencv pixman pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx py39-sphinx_rtd_theme py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy sndio socat spice-protocol tesseract usbredir virglrenderer vte3 zstd'
+PKGS='alsa-lib bash bison bzip2 ca_root_nss capstone4 ccache cmocka ctags curl cyrus-sasl dbus diffutils dtc flex fusefs-libs3 gettext git glib gmake gnutls gsed gtk3 json-c libepoxy libffi libgcrypt libjpeg-turbo libnfs libslirp libspice-server libssh libtasn1 llvm lzo2 meson mtools ncurses nettle ninja opencv pixman pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx py39-sphinx_rtd_theme py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy sndio socat spice-protocol tesseract usbredir virglrenderer vte3 xorriso zstd'
 PYPI_PKGS=''
 PYTHON='/usr/local/bin/python3'
--- a/.gitlab-ci.d/cirrus/macos-12.vars
+++ b/.gitlab-ci.d/cirrus/macos-12.vars
@@ -11,6 +11,6 @@ MAKE='/opt/homebrew/bin/gmake'
 NINJA='/opt/homebrew/bin/ninja'
 PACKAGING_COMMAND='brew'
 PIP3='/opt/homebrew/bin/pip3'
-PKGS='bash bc bison bzip2 capstone ccache cmocka ctags curl dbus diffutils dtc flex gcovr gettext git glib gnu-sed gnutls gtk+3 jemalloc jpeg-turbo json-c libepoxy libffi libgcrypt libiscsi libnfs libpng libslirp libssh libtasn1 libusb llvm lzo make meson ncurses nettle ninja pixman pkg-config python3 rpm2cpio sdl2 sdl2_image snappy socat sparse spice-protocol tesseract usbredir vde vte3 zlib zstd'
+PKGS='bash bc bison bzip2 capstone ccache cmocka ctags curl dbus diffutils dtc flex gcovr gettext git glib gnu-sed gnutls gtk+3 jemalloc jpeg-turbo json-c libepoxy libffi libgcrypt libiscsi libnfs libpng libslirp libssh libtasn1 libusb llvm lzo make meson mtools ncurses nettle ninja pixman pkg-config python3 rpm2cpio sdl2 sdl2_image snappy socat sparse spice-protocol tesseract usbredir vde vte3 xorriso zlib zstd'
 PYPI_PKGS='PyYAML numpy pillow sphinx sphinx-rtd-theme'
 PYTHON='/opt/homebrew/bin/python3'
--- a/.gitlab-ci.d/container-template.yml
+++ b/.gitlab-ci.d/container-template.yml
@@ -1,6 +1,6 @@
 .container_job_template:
  extends: .base_job_template
-  image: docker:stable
+  image: docker:latest
  stage: containers
  services:
    - docker:dind
@@ -8,8 +8,8 @@
    - export TAG="$CI_REGISTRY_IMAGE/qemu/$NAME:latest"
    - export COMMON_TAG="$CI_REGISTRY/qemu-project/qemu/qemu/$NAME:latest"
    - apk add python3
-    - docker info
    - docker login $CI_REGISTRY -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD"
+    - until docker info; do sleep 1; done
  script:
    - echo "TAG:$TAG"
    - echo "COMMON_TAG:$COMMON_TAG"
--- a/.gitlab-ci.d/custom-runners.yml
+++ b/.gitlab-ci.d/custom-runners.yml
@@ -20,8 +20,10 @@ variables:
  artifacts:
    name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
    expire_in: 7 days
+    when: always
    paths:
-      - build/meson-logs/testlog.txt
+      - build/build.ninja
+      - build/meson-logs
    reports:
      junit: build/meson-logs/testlog.junit.xml

--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch32.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch32.yml
@@ -1,6 +1,6 @@
 # All ubuntu-22.04 jobs should run successfully in an environment
 # setup by the scripts/ci/setup/qemu/build-environment.yml task
-# "Install basic packages to build QEMU on Ubuntu 20.04"
+# "Install basic packages to build QEMU on Ubuntu 22.04"

 ubuntu-22.04-aarch32-all:
 extends: .custom_runner_template
--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch64.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch64.yml
@@ -1,6 +1,6 @@
-# All ubuntu-20.04 jobs should run successfully in an environment
+# All ubuntu-22.04 jobs should run successfully in an environment
 # setup by the scripts/ci/setup/qemu/build-environment.yml task
-# "Install basic packages to build QEMU on Ubuntu 20.04"
+# "Install basic packages to build QEMU on Ubuntu 22.04"

 ubuntu-22.04-aarch64-all-linux-static:
 extends: .custom_runner_template
@@ -45,6 +45,28 @@ ubuntu-22.04-aarch64-all:
 - make --output-sync -j`nproc --ignore=40`
 - make --output-sync -j`nproc --ignore=40` check

+ubuntu-22.04-aarch64-without-defaults:
+ extends: .custom_runner_template
+ needs: []
+ stage: build
+ tags:
+ - ubuntu_22.04
+ - aarch64
+ rules:
+ - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/'
+   when: manual
+   allow_failure: true
+ - if: "$AARCH64_RUNNER_AVAILABLE"
+   when: manual
+   allow_failure: true
+ script:
+ - mkdir build
+ - cd build
+ - ../configure --disable-user --without-default-devices --without-default-features
+   || { cat config.log meson-logs/meson-log.txt; exit 1; }
+ - make --output-sync -j`nproc --ignore=40`
+ - make --output-sync -j`nproc --ignore=40` check
+
 ubuntu-22.04-aarch64-alldbg:
 extends: .custom_runner_template
 needs: []
@@ -123,7 +145,7 @@ ubuntu-22.04-aarch64-notcg:
 script:
 - mkdir build
 - cd build
- - ../configure --disable-tcg
+ - ../configure --disable-tcg --with-devices-aarch64=minimal
   || { cat config.log meson-logs/meson-log.txt; exit 1; }
 - make --output-sync -j`nproc --ignore=40`
 - make --output-sync -j`nproc --ignore=40` check
--- a/.gitlab-ci.d/opensbi.yml
+++ b/.gitlab-ci.d/opensbi.yml
@@ -42,17 +42,15 @@
 docker-opensbi:
  extends: .opensbi_job_rules
  stage: containers
-  image: docker:stable
+  image: docker:latest
  services:
-    - docker:stable-dind
+    - docker:dind
  variables:
    GIT_DEPTH: 3
    IMAGE_TAG: $CI_REGISTRY_IMAGE:opensbi-cross-build
-    # We don't use TLS
-    DOCKER_HOST: tcp://docker:2375
-    DOCKER_TLS_CERTDIR: ""
  before_script:
    - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
+    - until docker info; do sleep 1; done
  script:
    - docker pull $IMAGE_TAG || true
    - docker build --cache-from $IMAGE_TAG --tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
--- a/.gitlab-ci.d/qemu-project.yml
+++ b/.gitlab-ci.d/qemu-project.yml
@@ -1,6 +1,13 @@
 # This file contains the set of jobs run by the QEMU project:
 # https://gitlab.com/qemu-project/qemu/-/pipelines

+variables:
+  RUNNER_TAG: ""
+
+default:
+  tags:
+    - $RUNNER_TAG
+
 include:
  - local: '/.gitlab-ci.d/base.yml'
  - local: '/.gitlab-ci.d/stages.yml'
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,8 +13,8 @@
 [submodule "roms/qemu-palcode"]
 	path = roms/qemu-palcode
 	url = https://gitlab.com/qemu-project/qemu-palcode.git
-[submodule "dtc"]
-	path = dtc
+[submodule "subprojects/dtc"]
+	path = subprojects/dtc
 	url = https://gitlab.com/qemu-project/dtc.git
 [submodule "roms/u-boot"]
 	path = roms/u-boot
@@ -25,8 +25,8 @@
 [submodule "roms/QemuMacDrivers"]
 	path = roms/QemuMacDrivers
 	url = https://gitlab.com/qemu-project/QemuMacDrivers.git
-[submodule "ui/keycodemapdb"]
-	path = ui/keycodemapdb
+[submodule "subprojects/keycodemapdb"]
+	path = subprojects/keycodemapdb
 	url = https://gitlab.com/qemu-project/keycodemapdb.git
 [submodule "roms/seabios-hppa"]
 	path = roms/seabios-hppa
@@ -49,9 +49,6 @@
 [submodule "roms/qboot"]
 	path = roms/qboot
 	url = https://gitlab.com/qemu-project/qboot.git
-[submodule "meson"]
-	path = meson
-	url = https://gitlab.com/qemu-project/meson.git
 [submodule "roms/vbootrom"]
 	path = roms/vbootrom
 	url = https://gitlab.com/qemu-project/vbootrom.git
--- a/.mailmap
+++ b/.mailmap
@@ -78,6 +78,7 @@ Philippe Mathieu-Daudé <philmd@linaro.org> <philmd@redhat.com>
 Philippe Mathieu-Daudé <philmd@linaro.org> <philmd@fungible.com>
 Stefan Brankovic <stefan.brankovic@syrmia.com> <stefan.brankovic@rt-rk.com.com>
 Yongbok Kim <yongbok.kim@mips.com> <yongbok.kim@imgtec.com>
+Taylor Simpson <ltaylorsimpson@gmail.com> <tsimpson@quicinc.com>

 # Also list preferred name forms where people have changed their
 # git author config, or had utf8/latin1 encoding issues.
--- a/26
+++ b/26
@@ -157,6 +157,9 @@ F: include/exec/helper*.h
 F: include/sysemu/cpus.h
 F: include/sysemu/tcg.h
 F: include/hw/core/tcg-cpu-ops.h
+F: host/include/*/host/cpuinfo.h
+F: util/cpuinfo-*.c
+F: include/tcg/

 FPU emulation
 M: Aurelien Jarno <aurelien@aurel32.net>
@@ -215,7 +218,7 @@ F: tests/tcg/cris/
 F: disas/cris.c

 Hexagon TCG CPUs
-M: Taylor Simpson <tsimpson@quicinc.com>
+M: Brian Cain <bcain@quicinc.com>
 S: Supported
 F: target/hexagon/
 X: target/hexagon/idef-parser/
@@ -225,6 +228,7 @@ F: tests/tcg/hexagon/
 F: disas/hexagon.c
 F: configs/targets/hexagon-linux-user/default.mak
 F: docker/dockerfiles/debian-hexagon-cross.docker
+F: gdb-xml/hexagon*.xml

 Hexagon idef-parser
 M: Alessandro Di Federico <ale@rev.ng>
@@ -245,6 +249,7 @@ M: Xiaojuan Yang <yangxiaojuan@loongson.cn>
 S: Maintained
 F: target/loongarch/
 F: tests/tcg/loongarch64/
+F: tests/avocado/machine_loongarch.py

 M68K TCG CPUs
 M: Laurent Vivier <laurent@vivier.eu>
@@ -939,6 +944,7 @@ SBSA-REF
 M: Radoslaw Biernacki <rad@semihalf.com>
 M: Peter Maydell <peter.maydell@linaro.org>
 R: Leif Lindholm <quic_llindhol@quicinc.com>
+R: Marcin Juszkiewicz <marcin.juszkiewicz@linaro.org>
 L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/sbsa-ref.c
@@ -2234,6 +2240,7 @@ F: tests/qtest/fuzz-megasas-test.c

 Network packet abstractions
 M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
+R: Akihiko Odaki <akihiko.odaki@daynix.com>
 S: Maintained
 F: include/net/eth.h
 F: net/eth.c
@@ -2276,7 +2283,7 @@ R: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
 S: Maintained
 F: docs/system/devices/igb.rst
 F: hw/net/igb*
-F: tests/avocado/igb.py
+F: tests/avocado/netdev-ethtool.py
 F: tests/qtest/igb-test.c
 F: tests/qtest/libqos/igb.c

@@ -2566,7 +2573,7 @@ Core Audio framework backend
 M: Gerd Hoffmann <kraxel@redhat.com>
 M: Philippe Mathieu-Daudé <philmd@linaro.org>
 R: Christian Schoenebeck <qemu_oss@crudebyte.com>
-R: Akihiko Odaki <akihiko.odaki@gmail.com>
+R: Akihiko Odaki <akihiko.odaki@daynix.com>
 S: Odd Fixes
 F: audio/coreaudio.c

@@ -2643,6 +2650,7 @@ F: util/aio-*.c
 F: util/aio-*.h
 F: util/fdmon-*.c
 F: block/io.c
+F: block/plug.c
 F: migration/block*
 F: include/block/aio.h
 F: include/block/aio-wait.h
@@ -2850,7 +2858,7 @@ F: docs/devel/ui.rst
 Cocoa graphics
 M: Peter Maydell <peter.maydell@linaro.org>
 M: Philippe Mathieu-Daudé <philmd@linaro.org>
-R: Akihiko Odaki <akihiko.odaki@gmail.com>
+R: Akihiko Odaki <akihiko.odaki@daynix.com>
 S: Odd Fixes
 F: ui/cocoa.m

@@ -3910,6 +3918,16 @@ F: configure
 F: scripts/mtest2make.py
 F: tests/Makefile.include

+Kconfig
+M: Paolo Bonzini <pbonzini@redhat.com>
+S: Maintained
+F: scripts/minikconf.py
+F: docs/devel/kconfig.rst
+F: Kconfig*
+F: */Kconfig*
+F: hw/*/Kconfig*
+F: target/*/Kconfig*
+
 GIT submodules
 M: Daniel P. Berrange <berrange@redhat.com>
 S: Odd Fixes
--- a/12
+++ b/12
@@ -26,7 +26,7 @@ quiet-command-run = $(if $(V),,$(if $2,printf "  %-7s %s\n" $2 $3 && ))$1
 quiet-@ = $(if $(V),,@)
 quiet-command = $(quiet-@)$(call quiet-command-run,$1,$2,$3)

-UNCHECKED_GOALS := %clean TAGS cscope ctags dist \
+UNCHECKED_GOALS := TAGS gtags cscope ctags dist \
    help check-help print-% \
    docker docker-% vm-help vm-test vm-build-%

@@ -115,15 +115,15 @@ Makefile.ninja: build.ninja
 	  $(NINJA) -t query build.ninja | sed -n '1,/^  input:/d; /^  outputs:/q; s/$$/ \\/p'; \
 	} > $@.tmp && mv $@.tmp $@
 -include Makefile.ninja
+endif

+ifneq ($(MESON),)
 # A separate rule is needed for Makefile dependencies to avoid -n
 build.ninja: build.ninja.stamp
 $(build-files):
 build.ninja.stamp: meson.stamp $(build-files)
-	$(NINJA) $(if $V,-v,) build.ninja && touch $@
-endif
+	$(MESON) setup --reconfigure $(SRC_PATH) && touch $@

-ifneq ($(MESON),)
 Makefile.mtest: build.ninja scripts/mtest2make.py
 	$(MESON) introspect --targets --tests --benchmarks | $(PYTHON) scripts/mtest2make.py > $@
 -include Makefile.mtest
@@ -176,10 +176,8 @@ plugins:
 endif # $(CONFIG_PLUGIN)

 else # config-host.mak does not exist
-config-host.mak:
 ifneq ($(filter-out $(UNCHECKED_GOALS),$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fail))
-	@echo "Please call configure before running make!"
-	@exit 1
+$(error Please call configure before running make)
 endif
 endif # config-host.mak does not exist

--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1361,6 +1361,10 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
                 */
                if (kvm_state->kvm_dirty_ring_size) {
                    kvm_dirty_ring_reap_locked(kvm_state, NULL);
+                    if (kvm_state->kvm_dirty_ring_with_bitmap) {
+                        kvm_slot_sync_dirty_pages(mem);
+                        kvm_slot_get_dirty_log(kvm_state, mem);
+                    }
                } else {
                    kvm_slot_get_dirty_log(kvm_state, mem);
                }
@@ -1458,6 +1462,69 @@ static int kvm_dirty_ring_reaper_init(KVMState *s)
    return 0;
 }

+static int kvm_dirty_ring_init(KVMState *s)
+{
+    uint32_t ring_size = s->kvm_dirty_ring_size;
+    uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
+    unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
+    int ret;
+
+    s->kvm_dirty_ring_size = 0;
+    s->kvm_dirty_ring_bytes = 0;
+
+    /* Bail if the dirty ring size isn't specified */
+    if (!ring_size) {
+        return 0;
+    }
+
+    /*
+     * Read the max supported pages. Fall back to dirty logging mode
+     * if the dirty ring isn't supported.
+     */
+    ret = kvm_vm_check_extension(s, capability);
+    if (ret <= 0) {
+        capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
+        ret = kvm_vm_check_extension(s, capability);
+    }
+
+    if (ret <= 0) {
+        warn_report("KVM dirty ring not available, using bitmap method");
+        return 0;
+    }
+
+    if (ring_bytes > ret) {
+        error_report("KVM dirty ring size %" PRIu32 " too big "
+                     "(maximum is %ld).  Please use a smaller value.",
+                     ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
+        return -EINVAL;
+    }
+
+    ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
+    if (ret) {
+        error_report("Enabling of KVM dirty ring failed: %s. "
+                     "Suggested minimum value is 1024.", strerror(-ret));
+        return -EIO;
+    }
+
+    /* Enable the backup bitmap if it is supported */
+    ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
+    if (ret > 0) {
+        ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
+        if (ret) {
+            error_report("Enabling of KVM dirty ring's backup bitmap failed: "
+                         "%s. ", strerror(-ret));
+            return -EIO;
+        }
+
+        s->kvm_dirty_ring_with_bitmap = true;
+    }
+
+    s->kvm_dirty_ring_size = ring_size;
+    s->kvm_dirty_ring_bytes = ring_bytes;
+
+    return 0;
+}
+
 static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
 {
@@ -1563,7 +1630,7 @@ static void kvm_log_sync(MemoryListener *listener,
    kvm_slots_unlock();
 }

-static void kvm_log_sync_global(MemoryListener *l)
+static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
 {
    KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
    KVMState *s = kvm_state;
@@ -1582,6 +1649,12 @@ static void kvm_log_sync_global(MemoryListener *l)
        mem = &kml->slots[i];
        if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
            kvm_slot_sync_dirty_pages(mem);
+
+            if (s->kvm_dirty_ring_with_bitmap && last_stage &&
+                kvm_slot_get_dirty_log(s, mem)) {
+                kvm_slot_sync_dirty_pages(mem);
+            }
+
            /*
             * This is not needed by KVM_GET_DIRTY_LOG because the
             * ioctl will unconditionally overwrite the whole region.
@@ -2521,35 +2594,9 @@ static int kvm_init(MachineState *ms)
     * Enable KVM dirty ring if supported, otherwise fall back to
     * dirty logging mode
     */
-    if (s->kvm_dirty_ring_size > 0) {
-        uint64_t ring_bytes;
-
-        ring_bytes = s->kvm_dirty_ring_size * sizeof(struct kvm_dirty_gfn);
-
-        /* Read the max supported pages */
-        ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING);
-        if (ret > 0) {
-            if (ring_bytes > ret) {
-                error_report("KVM dirty ring size %" PRIu32 " too big "
-                             "(maximum is %ld).  Please use a smaller value.",
-                             s->kvm_dirty_ring_size,
-                             (long)ret / sizeof(struct kvm_dirty_gfn));
-                ret = -EINVAL;
-                goto err;
-            }
-
-            ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, ring_bytes);
-            if (ret) {
-                error_report("Enabling of KVM dirty ring failed: %s. "
-                             "Suggested minimum value is 1024.", strerror(-ret));
-                goto err;
-            }
-
-            s->kvm_dirty_ring_bytes = ring_bytes;
-         } else {
-             warn_report("KVM dirty ring not available, using bitmap method");
-             s->kvm_dirty_ring_size = 0;
-        }
+    ret = kvm_dirty_ring_init(s);
+    if (ret < 0) {
+        goto err;
    }

    /*
@@ -3710,6 +3757,7 @@ static void kvm_accel_instance_init(Object *obj)
    s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
    /* KVM dirty ring is by default off */
    s->kvm_dirty_ring_size = 0;
+    s->kvm_dirty_ring_with_bitmap = false;
    s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
    s->notify_window = 0;
    s->xen_version = 0;
--- a/accel/tcg/atomic_common.c.inc
+++ b/accel/tcg/atomic_common.c.inc
@@ -13,26 +13,12 @@
 * See the COPYING file in the top-level directory.
 */

-static void atomic_trace_rmw_post(CPUArchState *env, target_ulong addr,
+static void atomic_trace_rmw_post(CPUArchState *env, uint64_t addr,
                                  MemOpIdx oi)
 {
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_RW);
 }

-#if HAVE_ATOMIC128
-static void atomic_trace_ld_post(CPUArchState *env, target_ulong addr,
-                                 MemOpIdx oi)
-{
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-}
-
-static void atomic_trace_st_post(CPUArchState *env, target_ulong addr,
-                                 MemOpIdx oi)
-{
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
-}
-#endif
-
 /*
 * Atomic helpers callable from TCG.
 * These have a common interface and all defer to cpu_atomic_*
@@ -40,7 +26,7 @@ static void atomic_trace_st_post(CPUArchState *env, target_ulong addr,
 */

 #define CMPXCHG_HELPER(OP, TYPE) \
-    TYPE HELPER(atomic_##OP)(CPUArchState *env, target_ulong addr,  \
+    TYPE HELPER(atomic_##OP)(CPUArchState *env, uint64_t addr,      \
                             TYPE oldv, TYPE newv, uint32_t oi)     \
    { return cpu_atomic_##OP##_mmu(env, addr, oldv, newv, oi, GETPC()); }

@@ -62,36 +48,16 @@ CMPXCHG_HELPER(cmpxchgo_le, Int128)

 #undef CMPXCHG_HELPER

-Int128 HELPER(nonatomic_cmpxchgo_be)(CPUArchState *env, target_ulong addr,
-                                     Int128 cmpv, Int128 newv, uint32_t oi)
+Int128 HELPER(nonatomic_cmpxchgo)(CPUArchState *env, uint64_t addr,
+                                  Int128 cmpv, Int128 newv, uint32_t oi)
 {
 #if TCG_TARGET_REG_BITS == 32
    uintptr_t ra = GETPC();
    Int128 oldv;

-    oldv = cpu_ld16_be_mmu(env, addr, oi, ra);
+    oldv = cpu_ld16_mmu(env, addr, oi, ra);
    if (int128_eq(oldv, cmpv)) {
-        cpu_st16_be_mmu(env, addr, newv, oi, ra);
-    } else {
-        /* Even with comparison failure, still need a write cycle. */
-        probe_write(env, addr, 16, get_mmuidx(oi), ra);
-    }
-    return oldv;
-#else
-    g_assert_not_reached();
-#endif
-}
-
-Int128 HELPER(nonatomic_cmpxchgo_le)(CPUArchState *env, target_ulong addr,
-                                     Int128 cmpv, Int128 newv, uint32_t oi)
-{
-#if TCG_TARGET_REG_BITS == 32
-    uintptr_t ra = GETPC();
-    Int128 oldv;
-
-    oldv = cpu_ld16_le_mmu(env, addr, oi, ra);
-    if (int128_eq(oldv, cmpv)) {
-        cpu_st16_le_mmu(env, addr, newv, oi, ra);
+        cpu_st16_mmu(env, addr, newv, oi, ra);
    } else {
        /* Even with comparison failure, still need a write cycle. */
        probe_write(env, addr, 16, get_mmuidx(oi), ra);
@@ -103,7 +69,7 @@ Int128 HELPER(nonatomic_cmpxchgo_le)(CPUArchState *env, target_ulong addr,
 }

 #define ATOMIC_HELPER(OP, TYPE) \
-    TYPE HELPER(glue(atomic_,OP))(CPUArchState *env, target_ulong addr,  \
+    TYPE HELPER(glue(atomic_,OP))(CPUArchState *env, uint64_t addr,  \
                                  TYPE val, uint32_t oi)                 \
    { return glue(glue(cpu_atomic_,OP),_mmu)(env, addr, val, oi, GETPC()); }

--- a/accel/tcg/atomic_template.h
+++ b/accel/tcg/atomic_template.h
@@ -73,8 +73,7 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
                              ABI_TYPE cmpv, ABI_TYPE newv,
                              MemOpIdx oi, uintptr_t retaddr)
 {
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
-                                         PAGE_READ | PAGE_WRITE, retaddr);
+    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
    DATA_TYPE ret;

 #if DATA_SIZE == 16
@@ -87,38 +86,11 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
    return ret;
 }

-#if DATA_SIZE >= 16
-#if HAVE_ATOMIC128
-ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr,
-                         MemOpIdx oi, uintptr_t retaddr)
-{
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
-                                         PAGE_READ, retaddr);
-    DATA_TYPE val;
-
-    val = atomic16_read(haddr);
-    ATOMIC_MMU_CLEANUP;
-    atomic_trace_ld_post(env, addr, oi);
-    return val;
-}
-
-void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
-                     MemOpIdx oi, uintptr_t retaddr)
-{
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
-                                         PAGE_WRITE, retaddr);
-
-    atomic16_set(haddr, val);
-    ATOMIC_MMU_CLEANUP;
-    atomic_trace_st_post(env, addr, oi);
-}
-#endif
-#else
+#if DATA_SIZE < 16
 ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
                           MemOpIdx oi, uintptr_t retaddr)
 {
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
-                                         PAGE_READ | PAGE_WRITE, retaddr);
+    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
    DATA_TYPE ret;

    ret = qatomic_xchg__nocheck(haddr, val);
@@ -131,9 +103,8 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
 ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
                        ABI_TYPE val, MemOpIdx oi, uintptr_t retaddr) \
 {                                                                   \
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,  \
-                                         PAGE_READ | PAGE_WRITE, retaddr); \
-    DATA_TYPE ret;                                                  \
+    DATA_TYPE *haddr, ret;                                          \
+    haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);   \
    ret = qatomic_##X(haddr, val);                                  \
    ATOMIC_MMU_CLEANUP;                                             \
    atomic_trace_rmw_post(env, addr, oi);                           \
@@ -163,9 +134,8 @@ GEN_ATOMIC_HELPER(xor_fetch)
 ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
                        ABI_TYPE xval, MemOpIdx oi, uintptr_t retaddr) \
 {                                                                   \
-    XDATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
-                                          PAGE_READ | PAGE_WRITE, retaddr); \
-    XDATA_TYPE cmp, old, new, val = xval;                           \
+    XDATA_TYPE *haddr, cmp, old, new, val = xval;                   \
+    haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);   \
    smp_mb();                                                       \
    cmp = qatomic_read__nocheck(haddr);                             \
    do {                                                            \
@@ -188,7 +158,7 @@ GEN_ATOMIC_HELPER_FN(smax_fetch, MAX, SDATA_TYPE, new)
 GEN_ATOMIC_HELPER_FN(umax_fetch, MAX,  DATA_TYPE, new)

 #undef GEN_ATOMIC_HELPER_FN
-#endif /* DATA SIZE >= 16 */
+#endif /* DATA SIZE < 16 */

 #undef END

@@ -206,8 +176,7 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
                              ABI_TYPE cmpv, ABI_TYPE newv,
                              MemOpIdx oi, uintptr_t retaddr)
 {
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
-                                         PAGE_READ | PAGE_WRITE, retaddr);
+    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
    DATA_TYPE ret;

 #if DATA_SIZE == 16
@@ -220,39 +189,11 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
    return BSWAP(ret);
 }

-#if DATA_SIZE >= 16
-#if HAVE_ATOMIC128
-ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr,
-                         MemOpIdx oi, uintptr_t retaddr)
-{
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
-                                         PAGE_READ, retaddr);
-    DATA_TYPE val;
-
-    val = atomic16_read(haddr);
-    ATOMIC_MMU_CLEANUP;
-    atomic_trace_ld_post(env, addr, oi);
-    return BSWAP(val);
-}
-
-void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
-                     MemOpIdx oi, uintptr_t retaddr)
-{
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
-                                         PAGE_WRITE, retaddr);
-
-    val = BSWAP(val);
-    atomic16_set(haddr, val);
-    ATOMIC_MMU_CLEANUP;
-    atomic_trace_st_post(env, addr, oi);
-}
-#endif
-#else
+#if DATA_SIZE < 16
 ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
                           MemOpIdx oi, uintptr_t retaddr)
 {
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
-                                         PAGE_READ | PAGE_WRITE, retaddr);
+    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
    ABI_TYPE ret;

    ret = qatomic_xchg__nocheck(haddr, BSWAP(val));
@@ -265,9 +206,8 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
 ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
                        ABI_TYPE val, MemOpIdx oi, uintptr_t retaddr) \
 {                                                                   \
-    DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,  \
-                                         PAGE_READ | PAGE_WRITE, retaddr); \
-    DATA_TYPE ret;                                                  \
+    DATA_TYPE *haddr, ret;                                          \
+    haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);   \
    ret = qatomic_##X(haddr, BSWAP(val));                           \
    ATOMIC_MMU_CLEANUP;                                             \
    atomic_trace_rmw_post(env, addr, oi);                           \
@@ -294,9 +234,8 @@ GEN_ATOMIC_HELPER(xor_fetch)
 ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
                        ABI_TYPE xval, MemOpIdx oi, uintptr_t retaddr) \
 {                                                                   \
-    XDATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
-                                          PAGE_READ | PAGE_WRITE, retaddr); \
-    XDATA_TYPE ldo, ldn, old, new, val = xval;                      \
+    XDATA_TYPE *haddr, ldo, ldn, old, new, val = xval;              \
+    haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);   \
    smp_mb();                                                       \
    ldn = qatomic_read__nocheck(haddr);                             \
    do {                                                            \
@@ -326,7 +265,7 @@ GEN_ATOMIC_HELPER_FN(add_fetch, ADD, DATA_TYPE, new)
 #undef ADD

 #undef GEN_ATOMIC_HELPER_FN
-#endif /* DATA_SIZE >= 16 */
+#endif /* DATA_SIZE < 16 */

 #undef END
 #endif /* DATA_SIZE > 1 */
--- a/accel/tcg/cpu-exec-common.c
+++ b/accel/tcg/cpu-exec-common.c
@@ -22,6 +22,7 @@
 #include "sysemu/tcg.h"
 #include "exec/exec-all.h"
 #include "qemu/plugin.h"
+#include "internal.h"

 bool tcg_allowed;

@@ -81,6 +82,8 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)

 void cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc)
 {
+    /* Prevent looping if already executing in a serial context. */
+    g_assert(!cpu_in_serial_context(cpu));
    cpu->exception_index = EXCP_ATOMIC;
    cpu_loop_exit_restore(cpu, pc);
 }
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -175,7 +175,6 @@ struct tb_desc {
    tb_page_addr_t page_addr0;
    uint32_t flags;
    uint32_t cflags;
-    uint32_t trace_vcpu_dstate;
 };

 static bool tb_lookup_cmp(const void *p, const void *d)
@@ -187,7 +186,6 @@ static bool tb_lookup_cmp(const void *p, const void *d)
        tb_page_addr0(tb) == desc->page_addr0 &&
        tb->cs_base == desc->cs_base &&
        tb->flags == desc->flags &&
-        tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
        tb_cflags(tb) == desc->cflags) {
        /* check next page if needed */
        tb_page_addr_t tb_phys_page1 = tb_page_addr1(tb);
@@ -228,7 +226,6 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
    desc.cs_base = cs_base;
    desc.flags = flags;
    desc.cflags = cflags;
-    desc.trace_vcpu_dstate = *cpu->trace_dstate;
    desc.pc = pc;
    phys_pc = get_page_addr_code(desc.env, pc);
    if (phys_pc == -1) {
@@ -236,7 +233,7 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
    }
    desc.page_addr0 = phys_pc;
    h = tb_hash_func(phys_pc, (cflags & CF_PCREL ? 0 : pc),
-                     flags, cflags, *cpu->trace_dstate);
+                     flags, cs_base, cflags);
    return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);
 }

@@ -263,7 +260,6 @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
                   jc->array[hash].pc == pc &&
                   tb->cs_base == cs_base &&
                   tb->flags == flags &&
-                   tb->trace_vcpu_dstate == *cpu->trace_dstate &&
                   tb_cflags(tb) == cflags)) {
            return tb;
        }
@@ -282,7 +278,6 @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
                   tb->pc == pc &&
                   tb->cs_base == cs_base &&
                   tb->flags == flags &&
-                   tb->trace_vcpu_dstate == *cpu->trace_dstate &&
                   tb_cflags(tb) == cflags)) {
            return tb;
        }
@@ -307,7 +302,6 @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu,
                      cpu->cpu_index, tb->tc.ptr, tb->cs_base, pc,
                      tb->flags, tb->cflags, lookup_symbol(pc));

-#if defined(DEBUG_DISAS)
        if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) {
            FILE *logfile = qemu_log_trylock();
            if (logfile) {
@@ -323,7 +317,6 @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu,
                qemu_log_unlock(logfile);
            }
        }
-#endif /* DEBUG_DISAS */
    }
 }

--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
--- a/accel/tcg/internal.h
+++ b/accel/tcg/internal.h
@@ -64,6 +64,15 @@ static inline target_ulong log_pc(CPUState *cpu, const TranslationBlock *tb)
    }
 }

+/*
+ * Return true if CS is not running in parallel with other cpus, either
+ * because there are no other cpus or we are within an exclusive context.
+ */
+static inline bool cpu_in_serial_context(CPUState *cs)
+{
+    return !(cs->tcg_cflags & CF_PARALLEL) || cpu_in_exclusive_context(cs);
+}
+
 extern int64_t max_delay;
 extern int64_t max_advance;

--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
--- a/accel/tcg/ldst_common.c.inc
+++ b/accel/tcg/ldst_common.c.inc
@@ -26,7 +26,7 @@ uint32_t cpu_lduw_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
                               int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_BEUW | MO_UNALN, mmu_idx);
-    return cpu_ldw_be_mmu(env, addr, oi, ra);
+    return cpu_ldw_mmu(env, addr, oi, ra);
 }

 int cpu_ldsw_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
@@ -39,21 +39,21 @@ uint32_t cpu_ldl_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
                              int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_BEUL | MO_UNALN, mmu_idx);
-    return cpu_ldl_be_mmu(env, addr, oi, ra);
+    return cpu_ldl_mmu(env, addr, oi, ra);
 }

 uint64_t cpu_ldq_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
                              int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_BEUQ | MO_UNALN, mmu_idx);
-    return cpu_ldq_be_mmu(env, addr, oi, ra);
+    return cpu_ldq_mmu(env, addr, oi, ra);
 }

 uint32_t cpu_lduw_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
                               int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_LEUW | MO_UNALN, mmu_idx);
-    return cpu_ldw_le_mmu(env, addr, oi, ra);
+    return cpu_ldw_mmu(env, addr, oi, ra);
 }

 int cpu_ldsw_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
@@ -66,14 +66,14 @@ uint32_t cpu_ldl_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
                              int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_LEUL | MO_UNALN, mmu_idx);
-    return cpu_ldl_le_mmu(env, addr, oi, ra);
+    return cpu_ldl_mmu(env, addr, oi, ra);
 }

 uint64_t cpu_ldq_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
                              int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_LEUQ | MO_UNALN, mmu_idx);
-    return cpu_ldq_le_mmu(env, addr, oi, ra);
+    return cpu_ldq_mmu(env, addr, oi, ra);
 }

 void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
@@ -87,42 +87,42 @@ void cpu_stw_be_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
                          int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_BEUW | MO_UNALN, mmu_idx);
-    cpu_stw_be_mmu(env, addr, val, oi, ra);
+    cpu_stw_mmu(env, addr, val, oi, ra);
 }

 void cpu_stl_be_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
                          int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_BEUL | MO_UNALN, mmu_idx);
-    cpu_stl_be_mmu(env, addr, val, oi, ra);
+    cpu_stl_mmu(env, addr, val, oi, ra);
 }

 void cpu_stq_be_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
                          int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_BEUQ | MO_UNALN, mmu_idx);
-    cpu_stq_be_mmu(env, addr, val, oi, ra);
+    cpu_stq_mmu(env, addr, val, oi, ra);
 }

 void cpu_stw_le_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
                          int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_LEUW | MO_UNALN, mmu_idx);
-    cpu_stw_le_mmu(env, addr, val, oi, ra);
+    cpu_stw_mmu(env, addr, val, oi, ra);
 }

 void cpu_stl_le_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
                          int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_LEUL | MO_UNALN, mmu_idx);
-    cpu_stl_le_mmu(env, addr, val, oi, ra);
+    cpu_stl_mmu(env, addr, val, oi, ra);
 }

 void cpu_stq_le_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
                          int mmu_idx, uintptr_t ra)
 {
    MemOpIdx oi = make_memop_idx(MO_LEUQ | MO_UNALN, mmu_idx);
-    cpu_stq_le_mmu(env, addr, val, oi, ra);
+    cpu_stq_mmu(env, addr, val, oi, ra);
 }

 /*--------------------------*/
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -92,27 +92,6 @@ void HELPER(plugin_vcpu_mem_cb)(unsigned int vcpu_index,
                                void *userdata)
 { }

-static void do_gen_mem_cb(TCGv vaddr, uint32_t info)
-{
-    TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
-    TCGv_i32 meminfo = tcg_temp_ebb_new_i32();
-    TCGv_i64 vaddr64 = tcg_temp_ebb_new_i64();
-    TCGv_ptr udata = tcg_temp_ebb_new_ptr();
-
-    tcg_gen_movi_i32(meminfo, info);
-    tcg_gen_movi_ptr(udata, 0);
-    tcg_gen_ld_i32(cpu_index, cpu_env,
-                   -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
-    tcg_gen_extu_tl_i64(vaddr64, vaddr);
-
-    gen_helper_plugin_vcpu_mem_cb(cpu_index, meminfo, vaddr64, udata);
-
-    tcg_temp_free_ptr(udata);
-    tcg_temp_free_i64(vaddr64);
-    tcg_temp_free_i32(meminfo);
-    tcg_temp_free_i32(cpu_index);
-}
-
 static void gen_empty_udata_cb(void)
 {
    TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
@@ -145,9 +124,22 @@ static void gen_empty_inline_cb(void)
    tcg_temp_free_i64(val);
 }

-static void gen_empty_mem_cb(TCGv addr, uint32_t info)
+static void gen_empty_mem_cb(TCGv_i64 addr, uint32_t info)
 {
-    do_gen_mem_cb(addr, info);
+    TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
+    TCGv_i32 meminfo = tcg_temp_ebb_new_i32();
+    TCGv_ptr udata = tcg_temp_ebb_new_ptr();
+
+    tcg_gen_movi_i32(meminfo, info);
+    tcg_gen_movi_ptr(udata, 0);
+    tcg_gen_ld_i32(cpu_index, cpu_env,
+                   -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
+
+    gen_helper_plugin_vcpu_mem_cb(cpu_index, meminfo, addr, udata);
+
+    tcg_temp_free_ptr(udata);
+    tcg_temp_free_i32(meminfo);
+    tcg_temp_free_i32(cpu_index);
 }

 /*
@@ -202,35 +194,17 @@ static void plugin_gen_empty_callback(enum plugin_gen_from from)
    }
 }

-union mem_gen_fn {
-    void (*mem_fn)(TCGv, uint32_t);
-    void (*inline_fn)(void);
-};
-
-static void gen_mem_wrapped(enum plugin_gen_cb type,
-                            const union mem_gen_fn *f, TCGv addr,
-                            uint32_t info, bool is_mem)
+void plugin_gen_empty_mem_callback(TCGv_i64 addr, uint32_t info)
 {
    enum qemu_plugin_mem_rw rw = get_plugin_meminfo_rw(info);

-    gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, type, rw);
-    if (is_mem) {
-        f->mem_fn(addr, info);
-    } else {
-        f->inline_fn();
-    }
+    gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, PLUGIN_GEN_CB_MEM, rw);
+    gen_empty_mem_cb(addr, info);
    tcg_gen_plugin_cb_end();
-}

-void plugin_gen_empty_mem_callback(TCGv addr, uint32_t info)
-{
-    union mem_gen_fn fn;
-
-    fn.mem_fn = gen_empty_mem_cb;
-    gen_mem_wrapped(PLUGIN_GEN_CB_MEM, &fn, addr, info, true);
-
-    fn.inline_fn = gen_empty_inline_cb;
-    gen_mem_wrapped(PLUGIN_GEN_CB_INLINE, &fn, 0, info, false);
+    gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, PLUGIN_GEN_CB_INLINE, rw);
+    gen_empty_inline_cb();
+    tcg_gen_plugin_cb_end();
 }

 static TCGOp *find_op(TCGOp *op, TCGOpcode opc)
@@ -280,33 +254,6 @@ static TCGOp *copy_op(TCGOp **begin_op, TCGOp *op, TCGOpcode opc)
    return op;
 }

-static TCGOp *copy_extu_i32_i64(TCGOp **begin_op, TCGOp *op)
-{
-    if (TCG_TARGET_REG_BITS == 32) {
-        /* mov_i32 */
-        op = copy_op(begin_op, op, INDEX_op_mov_i32);
-        /* mov_i32 w/ $0 */
-        op = copy_op(begin_op, op, INDEX_op_mov_i32);
-    } else {
-        /* extu_i32_i64 */
-        op = copy_op(begin_op, op, INDEX_op_extu_i32_i64);
-    }
-    return op;
-}
-
-static TCGOp *copy_mov_i64(TCGOp **begin_op, TCGOp *op)
-{
-    if (TCG_TARGET_REG_BITS == 32) {
-        /* 2x mov_i32 */
-        op = copy_op(begin_op, op, INDEX_op_mov_i32);
-        op = copy_op(begin_op, op, INDEX_op_mov_i32);
-    } else {
-        /* mov_i64 */
-        op = copy_op(begin_op, op, INDEX_op_mov_i64);
-    }
-    return op;
-}
-
 static TCGOp *copy_const_ptr(TCGOp **begin_op, TCGOp *op, void *ptr)
 {
    if (UINTPTR_MAX == UINT32_MAX) {
@@ -321,18 +268,6 @@ static TCGOp *copy_const_ptr(TCGOp **begin_op, TCGOp *op, void *ptr)
    return op;
 }

-static TCGOp *copy_extu_tl_i64(TCGOp **begin_op, TCGOp *op)
-{
-    if (TARGET_LONG_BITS == 32) {
-        /* extu_i32_i64 */
-        op = copy_extu_i32_i64(begin_op, op);
-    } else {
-        /* mov_i64 */
-        op = copy_mov_i64(begin_op, op);
-    }
-    return op;
-}
-
 static TCGOp *copy_ld_i64(TCGOp **begin_op, TCGOp *op)
 {
    if (TCG_TARGET_REG_BITS == 32) {
@@ -477,9 +412,6 @@ static TCGOp *append_mem_cb(const struct qemu_plugin_dyn_cb *cb,
        tcg_debug_assert(begin_op && begin_op->opc == INDEX_op_ld_i32);
    }

-    /* extu_tl_i64 */
-    op = copy_extu_tl_i64(&begin_op, op);
-
    if (type == PLUGIN_GEN_CB_MEM) {
        /* call */
        op = copy_call(&begin_op, op, HELPER(plugin_vcpu_mem_cb),
--- a/accel/tcg/tb-hash.h
+++ b/accel/tcg/tb-hash.h
@@ -61,10 +61,10 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
 #endif /* CONFIG_SOFTMMU */

 static inline
-uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags,
-                      uint32_t cf_mask, uint32_t trace_vcpu_dstate)
+uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc,
+                      uint32_t flags, uint64_t flags2, uint32_t cf_mask)
 {
-    return qemu_xxhash7(phys_pc, pc, flags, cf_mask, trace_vcpu_dstate);
+    return qemu_xxhash8(phys_pc, pc, flags2, flags, cf_mask);
 }

 #endif
--- a/accel/tcg/tb-maint.c
+++ b/accel/tcg/tb-maint.c
@@ -50,7 +50,6 @@ static bool tb_cmp(const void *ap, const void *bp)
            a->cs_base == b->cs_base &&
            a->flags == b->flags &&
            (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
-            a->trace_vcpu_dstate == b->trace_vcpu_dstate &&
            tb_page_addr0(a) == tb_page_addr0(b) &&
            tb_page_addr1(a) == tb_page_addr1(b));
 }
@@ -746,7 +745,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)

    tcg_region_reset_all();
    /* XXX: flush processor icache at this point if cache flush is expensive */
-    qatomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1);
+    qatomic_inc(&tb_ctx.tb_flush_count);

 done:
    mmap_unlock();
@@ -758,9 +757,9 @@ done:
 void tb_flush(CPUState *cpu)
 {
    if (tcg_enabled()) {
-        unsigned tb_flush_count = qatomic_mb_read(&tb_ctx.tb_flush_count);
+        unsigned tb_flush_count = qatomic_read(&tb_ctx.tb_flush_count);

-        if (cpu_in_exclusive_context(cpu)) {
+        if (cpu_in_serial_context(cpu)) {
            do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count));
        } else {
            async_safe_run_on_cpu(cpu, do_tb_flush,
@@ -888,7 +887,7 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
    /* remove the TB from the hash list */
    phys_pc = tb_page_addr0(tb);
    h = tb_hash_func(phys_pc, (orig_cflags & CF_PCREL ? 0 : tb->pc),
-                     tb->flags, orig_cflags, tb->trace_vcpu_dstate);
+                     tb->flags, tb->cs_base, orig_cflags);
    if (!qht_remove(&tb_ctx.htable, tb, h)) {
        return;
    }
@@ -969,7 +968,7 @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,

    /* add in the hash table */
    h = tb_hash_func(phys_pc, (tb->cflags & CF_PCREL ? 0 : tb->pc),
-                     tb->flags, tb->cflags, tb->trace_vcpu_dstate);
+                     tb->flags, tb->cs_base, tb->cflags);
    qht_insert(&tb_ctx.htable, tb, h, &existing_tb);

    /* remove TB from the page(s) if we couldn't insert it */
--- a/accel/tcg/tcg-accel-ops-icount.c
+++ b/accel/tcg/tcg-accel-ops-icount.c
@@ -89,7 +89,20 @@ void icount_handle_deadline(void)
    }
 }

-void icount_prepare_for_run(CPUState *cpu)
+/* Distribute the budget evenly across all CPUs */
+int64_t icount_percpu_budget(int cpu_count)
+{
+    int64_t limit = icount_get_limit();
+    int64_t timeslice = limit / cpu_count;
+
+    if (timeslice == 0) {
+        timeslice = limit;
+    }
+
+    return timeslice;
+}
+
+void icount_prepare_for_run(CPUState *cpu, int64_t cpu_budget)
 {
    int insns_left;

@@ -101,13 +114,13 @@ void icount_prepare_for_run(CPUState *cpu)
    g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
    g_assert(cpu->icount_extra == 0);

-    cpu->icount_budget = icount_get_limit();
+    replay_mutex_lock();
+
+    cpu->icount_budget = MIN(icount_get_limit(), cpu_budget);
    insns_left = MIN(0xffff, cpu->icount_budget);
    cpu_neg(cpu)->icount_decr.u16.low = insns_left;
    cpu->icount_extra = cpu->icount_budget - insns_left;

-    replay_mutex_lock();
-
    if (cpu->icount_budget == 0) {
        /*
         * We're called without the iothread lock, so must take it while
--- a/accel/tcg/tcg-accel-ops-icount.h
+++ b/accel/tcg/tcg-accel-ops-icount.h
@@ -11,7 +11,8 @@
 #define TCG_ACCEL_OPS_ICOUNT_H

 void icount_handle_deadline(void);
-void icount_prepare_for_run(CPUState *cpu);
+void icount_prepare_for_run(CPUState *cpu, int64_t cpu_budget);
+int64_t icount_percpu_budget(int cpu_count);
 void icount_process_data(CPUState *cpu);

 void icount_handle_interrupt(CPUState *cpu, int mask);
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -24,6 +24,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/lockable.h"
 #include "sysemu/tcg.h"
 #include "sysemu/replay.h"
 #include "sysemu/cpu-timers.h"
@@ -71,11 +72,13 @@ static void rr_kick_next_cpu(void)
 {
    CPUState *cpu;
    do {
-        cpu = qatomic_mb_read(&rr_current_cpu);
+        cpu = qatomic_read(&rr_current_cpu);
        if (cpu) {
            cpu_exit(cpu);
        }
-    } while (cpu != qatomic_mb_read(&rr_current_cpu));
+        /* Finish kicking this cpu before reading again.  */
+        smp_mb();
+    } while (cpu != qatomic_read(&rr_current_cpu));
 }

 static void rr_kick_thread(void *opaque)
@@ -139,6 +142,33 @@ static void rr_force_rcu(Notifier *notify, void *data)
    rr_kick_next_cpu();
 }

+/*
+ * Calculate the number of CPUs that we will process in a single iteration of
+ * the main CPU thread loop so that we can fairly distribute the instruction
+ * count across CPUs.
+ *
+ * The CPU count is cached based on the CPU list generation ID to avoid
+ * iterating the list every time.
+ */
+static int rr_cpu_count(void)
+{
+    static unsigned int last_gen_id = ~0;
+    static int cpu_count;
+    CPUState *cpu;
+
+    QEMU_LOCK_GUARD(&qemu_cpu_list_lock);
+
+    if (cpu_list_generation_id_get() != last_gen_id) {
+        cpu_count = 0;
+        CPU_FOREACH(cpu) {
+            ++cpu_count;
+        }
+        last_gen_id = cpu_list_generation_id_get();
+    }
+
+    return cpu_count;
+}
+
 /*
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
@@ -185,11 +215,16 @@ static void *rr_cpu_thread_fn(void *arg)
    cpu->exit_request = 1;

    while (1) {
+        /* Only used for icount_enabled() */
+        int64_t cpu_budget = 0;
+
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();

        if (icount_enabled()) {
+            int cpu_count = rr_cpu_count();
+
            /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
            icount_account_warp_timer();
            /*
@@ -197,6 +232,8 @@ static void *rr_cpu_thread_fn(void *arg)
             * waking up the I/O thread and waiting for completion.
             */
            icount_handle_deadline();
+
+            cpu_budget = icount_percpu_budget(cpu_count);
        }

        replay_mutex_unlock();
@@ -206,8 +243,9 @@ static void *rr_cpu_thread_fn(void *arg)
        }

        while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
-
+            /* Store rr_current_cpu before evaluating cpu_can_run().  */
            qatomic_mb_set(&rr_current_cpu, cpu);
+
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
@@ -218,7 +256,7 @@ static void *rr_cpu_thread_fn(void *arg)

                qemu_mutex_unlock_iothread();
                if (icount_enabled()) {
-                    icount_prepare_for_run(cpu);
+                    icount_prepare_for_run(cpu, cpu_budget);
                }
                r = tcg_cpus_exec(cpu);
                if (icount_enabled()) {
@@ -245,7 +283,7 @@ static void *rr_cpu_thread_fn(void *arg)
            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

-        /* Does not need qatomic_mb_set because a spurious wakeup is okay.  */
+        /* Does not need a memory barrier because a spurious wakeup is okay.  */
        qatomic_set(&rr_current_cpu, NULL);

        if (cpu && cpu->exit_request) {
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -39,62 +39,63 @@ DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
 DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr)
 #endif /* IN_HELPER_PROTO */

+DEF_HELPER_FLAGS_3(ld_i128, TCG_CALL_NO_WG, i128, env, i64, i32)
+DEF_HELPER_FLAGS_4(st_i128, TCG_CALL_NO_WG, void, env, i64, i128, i32)
+
 DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgw_le, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgl_be, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgl_le, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 #ifdef CONFIG_ATOMIC64
 DEF_HELPER_FLAGS_5(atomic_cmpxchgq_be, TCG_CALL_NO_WG,
-                   i64, env, tl, i64, i64, i32)
+                   i64, env, i64, i64, i64, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgq_le, TCG_CALL_NO_WG,
-                   i64, env, tl, i64, i64, i32)
+                   i64, env, i64, i64, i64, i32)
 #endif
 #ifdef CONFIG_CMPXCHG128
 DEF_HELPER_FLAGS_5(atomic_cmpxchgo_be, TCG_CALL_NO_WG,
-                   i128, env, tl, i128, i128, i32)
+                   i128, env, i64, i128, i128, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgo_le, TCG_CALL_NO_WG,
-                   i128, env, tl, i128, i128, i32)
+                   i128, env, i64, i128, i128, i32)
 #endif

-DEF_HELPER_FLAGS_5(nonatomic_cmpxchgo_be, TCG_CALL_NO_WG,
-                   i128, env, tl, i128, i128, i32)
-DEF_HELPER_FLAGS_5(nonatomic_cmpxchgo_le, TCG_CALL_NO_WG,
-                   i128, env, tl, i128, i128, i32)
+DEF_HELPER_FLAGS_5(nonatomic_cmpxchgo, TCG_CALL_NO_WG,
+                   i128, env, i64, i128, i128, i32)

 #ifdef CONFIG_ATOMIC64
 #define GEN_ATOMIC_HELPERS(NAME)                                  \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b),              \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_le),           \
-                       TCG_CALL_NO_WG, i64, env, tl, i64, i32)    \
+                       TCG_CALL_NO_WG, i64, env, i64, i64, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_be),           \
-                       TCG_CALL_NO_WG, i64, env, tl, i64, i32)
+                       TCG_CALL_NO_WG, i64, env, i64, i64, i32)
 #else
 #define GEN_ATOMIC_HELPERS(NAME)                                  \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b),              \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)
 #endif /* CONFIG_ATOMIC64 */

 GEN_ATOMIC_HELPERS(fetch_add)
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -65,16 +65,13 @@
 #include "internal.h"
 #include "perf.h"

-/* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */
-QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >
-                  sizeof_field(TranslationBlock, trace_vcpu_dstate)
-                  * BITS_PER_BYTE);
-
 TBContext tb_ctx;

-/* Encode VAL as a signed leb128 sequence at P.
-   Return P incremented past the encoded value.  */
-static uint8_t *encode_sleb128(uint8_t *p, target_long val)
+/*
+ * Encode VAL as a signed leb128 sequence at P.
+ * Return P incremented past the encoded value.
+ */
+static uint8_t *encode_sleb128(uint8_t *p, int64_t val)
 {
    int more, byte;

@@ -92,21 +89,23 @@ static uint8_t *encode_sleb128(uint8_t *p, target_long val)
    return p;
 }

-/* Decode a signed leb128 sequence at *PP; increment *PP past the
-   decoded value.  Return the decoded value.  */
-static target_long decode_sleb128(const uint8_t **pp)
+/*
+ * Decode a signed leb128 sequence at *PP; increment *PP past the
+ * decoded value.  Return the decoded value.
+ */
+static int64_t decode_sleb128(const uint8_t **pp)
 {
    const uint8_t *p = *pp;
-    target_long val = 0;
+    int64_t val = 0;
    int byte, shift = 0;

    do {
        byte = *p++;
-        val |= (target_ulong)(byte & 0x7f) << shift;
+        val |= (int64_t)(byte & 0x7f) << shift;
        shift += 7;
    } while (byte & 0x80);
    if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
-        val |= -(target_ulong)1 << shift;
+        val |= -(int64_t)1 << shift;
    }

    *pp = p;
@@ -132,7 +131,7 @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
    int i, j, n;

    for (i = 0, n = tb->icount; i < n; ++i) {
-        target_ulong prev;
+        uint64_t prev;

        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
            if (i == 0) {
@@ -348,10 +347,16 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
    tb->cs_base = cs_base;
    tb->flags = flags;
    tb->cflags = cflags;
-    tb->trace_vcpu_dstate = *cpu->trace_dstate;
    tb_set_page_addr0(tb, phys_pc);
    tb_set_page_addr1(tb, -1);
    tcg_ctx->gen_tb = tb;
+    tcg_ctx->addr_type = TCG_TYPE_TL;
+#ifdef CONFIG_SOFTMMU
+    tcg_ctx->page_bits = TARGET_PAGE_BITS;
+    tcg_ctx->page_mask = TARGET_PAGE_MASK;
+    tcg_ctx->tlb_dyn_max_bits = CPU_TLB_DYN_MAX_BITS;
+#endif
+
 tb_overflow:

 #ifdef CONFIG_PROFILER
@@ -421,7 +426,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
    qatomic_set(&prof->search_out_len, prof->search_out_len + search_size);
 #endif

-#ifdef DEBUG_DISAS
    if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
        qemu_log_in_addr_range(pc)) {
        FILE *logfile = qemu_log_trylock();
@@ -444,7 +448,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
            /* Dump header and the first instruction */
            fprintf(logfile, "OUT: [size=%d]\n", gen_code_size);
            fprintf(logfile,
-                    "  -- guest addr 0x" TARGET_FMT_lx " + tb prologue\n",
+                    "  -- guest addr 0x%016" PRIx64 " + tb prologue\n",
                    tcg_ctx->gen_insn_data[insn][0]);
            chunk_start = tcg_ctx->gen_insn_end_off[insn];
            disas(logfile, tb->tc.ptr, chunk_start);
@@ -457,7 +461,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
            while (insn < tb->icount) {
                size_t chunk_end = tcg_ctx->gen_insn_end_off[insn];
                if (chunk_end > chunk_start) {
-                    fprintf(logfile, "  -- guest addr 0x" TARGET_FMT_lx "\n",
+                    fprintf(logfile, "  -- guest addr 0x%016" PRIx64 "\n",
                            tcg_ctx->gen_insn_data[insn][0]);
                    disas(logfile, tb->tc.ptr + chunk_start,
                          chunk_end - chunk_start);
@@ -494,7 +498,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
            qemu_log_unlock(logfile);
        }
    }
-#endif

    qatomic_set(&tcg_ctx->code_gen_ptr, (void *)
        ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -122,7 +122,6 @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
    tb->size = db->pc_next - db->pc_first;
    tb->icount = db->num_insns;

-#ifdef DEBUG_DISAS
    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
        && qemu_log_in_addr_range(db->pc_first)) {
        FILE *logfile = qemu_log_trylock();
@@ -133,7 +132,6 @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
            qemu_log_unlock(logfile);
        }
    }
-#endif
 }

 static void *translator_access(CPUArchState *env, DisasContextBase *db,
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -889,35 +889,9 @@ void page_reset_target_data(target_ulong start, target_ulong last) { }

 /* The softmmu versions of these helpers are in cputlb.c.  */

-/*
- * Verify that we have passed the correct MemOp to the correct function.
- *
- * We could present one function to target code, and dispatch based on
- * the MemOp, but so far we have worked hard to avoid an indirect function
- * call along the memory path.
- */
-static void validate_memop(MemOpIdx oi, MemOp expected)
+static void *cpu_mmu_lookup(CPUArchState *env, abi_ptr addr,
+                            MemOp mop, uintptr_t ra, MMUAccessType type)
 {
-#ifdef CONFIG_DEBUG_TCG
-    MemOp have = get_memop(oi) & (MO_SIZE | MO_BSWAP);
-    assert(have == expected);
-#endif
-}
-
-void helper_unaligned_ld(CPUArchState *env, target_ulong addr)
-{
-    cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_LOAD, GETPC());
-}
-
-void helper_unaligned_st(CPUArchState *env, target_ulong addr)
-{
-    cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_STORE, GETPC());
-}
-
-static void *cpu_mmu_lookup(CPUArchState *env, target_ulong addr,
-                            MemOpIdx oi, uintptr_t ra, MMUAccessType type)
-{
-    MemOp mop = get_memop(oi);
    int a_bits = get_alignment_bits(mop);
    void *ret;

@@ -931,251 +905,320 @@ static void *cpu_mmu_lookup(CPUArchState *env, target_ulong addr,
    return ret;
 }

-uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr,
-                    MemOpIdx oi, uintptr_t ra)
+#include "ldst_atomicity.c.inc"
+
+static uint8_t do_ld1_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOp mop, uintptr_t ra)
 {
    void *haddr;
    uint8_t ret;

-    validate_memop(oi, MO_UB);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
+    tcg_debug_assert((mop & MO_SIZE) == MO_8);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
    ret = ldub_p(haddr);
    clear_helper_retaddr();
+    return ret;
+}
+
+tcg_target_ulong helper_ldub_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    return do_ld1_mmu(env, addr, get_memop(oi), ra);
+}
+
+tcg_target_ulong helper_ldsb_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    return (int8_t)do_ld1_mmu(env, addr, get_memop(oi), ra);
+}
+
+uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    uint8_t ret = do_ld1_mmu(env, addr, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
    return ret;
 }

-uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr,
-                        MemOpIdx oi, uintptr_t ra)
+static uint16_t do_ld2_mmu(CPUArchState *env, abi_ptr addr,
+                           MemOp mop, uintptr_t ra)
 {
    void *haddr;
    uint16_t ret;

-    validate_memop(oi, MO_BEUW);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = lduw_be_p(haddr);
+    tcg_debug_assert((mop & MO_SIZE) == MO_16);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
+    ret = load_atom_2(env, ra, haddr, mop);
    clear_helper_retaddr();
+
+    if (mop & MO_BSWAP) {
+        ret = bswap16(ret);
+    }
+    return ret;
+}
+
+tcg_target_ulong helper_lduw_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    return do_ld2_mmu(env, addr, get_memop(oi), ra);
+}
+
+tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    return (int16_t)do_ld2_mmu(env, addr, get_memop(oi), ra);
+}
+
+uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr addr,
+                     MemOpIdx oi, uintptr_t ra)
+{
+    uint16_t ret = do_ld2_mmu(env, addr, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
    return ret;
 }

-uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr,
-                        MemOpIdx oi, uintptr_t ra)
+static uint32_t do_ld4_mmu(CPUArchState *env, abi_ptr addr,
+                           MemOp mop, uintptr_t ra)
 {
    void *haddr;
    uint32_t ret;

-    validate_memop(oi, MO_BEUL);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldl_be_p(haddr);
+    tcg_debug_assert((mop & MO_SIZE) == MO_32);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
+    ret = load_atom_4(env, ra, haddr, mop);
    clear_helper_retaddr();
+
+    if (mop & MO_BSWAP) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
+
+tcg_target_ulong helper_ldul_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    return do_ld4_mmu(env, addr, get_memop(oi), ra);
+}
+
+tcg_target_ulong helper_ldsl_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    return (int32_t)do_ld4_mmu(env, addr, get_memop(oi), ra);
+}
+
+uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr addr,
+                     MemOpIdx oi, uintptr_t ra)
+{
+    uint32_t ret = do_ld4_mmu(env, addr, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
    return ret;
 }

-uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr,
-                        MemOpIdx oi, uintptr_t ra)
+static uint64_t do_ld8_mmu(CPUArchState *env, abi_ptr addr,
+                           MemOp mop, uintptr_t ra)
 {
    void *haddr;
    uint64_t ret;

-    validate_memop(oi, MO_BEUQ);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldq_be_p(haddr);
+    tcg_debug_assert((mop & MO_SIZE) == MO_64);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
+    ret = load_atom_8(env, ra, haddr, mop);
    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+
+    if (mop & MO_BSWAP) {
+        ret = bswap64(ret);
+    }
    return ret;
 }

-uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr,
+uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr,
                        MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
-    uint16_t ret;
-
-    validate_memop(oi, MO_LEUW);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = lduw_le_p(haddr);
-    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
+    return do_ld8_mmu(env, addr, get_memop(oi), ra);
 }

-uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr,
-                        MemOpIdx oi, uintptr_t ra)
+uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr addr,
+                     MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
-    uint32_t ret;
-
-    validate_memop(oi, MO_LEUL);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldl_le_p(haddr);
-    clear_helper_retaddr();
+    uint64_t ret = do_ld8_mmu(env, addr, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
    return ret;
 }

-uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
-                        MemOpIdx oi, uintptr_t ra)
-{
-    void *haddr;
-    uint64_t ret;
-
-    validate_memop(oi, MO_LEUQ);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldq_le_p(haddr);
-    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
-}
-
-Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
-                       MemOpIdx oi, uintptr_t ra)
+static Int128 do_ld16_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOp mop, uintptr_t ra)
 {
    void *haddr;
    Int128 ret;

-    validate_memop(oi, MO_128 | MO_BE);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    memcpy(&ret, haddr, 16);
+    tcg_debug_assert((mop & MO_SIZE) == MO_128);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
+    ret = load_atom_16(env, ra, haddr, mop);
    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);

-    if (!HOST_BIG_ENDIAN) {
+    if (mop & MO_BSWAP) {
        ret = bswap128(ret);
    }
    return ret;
 }

-Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
+Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
                       MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
-    Int128 ret;
+    return do_ld16_mmu(env, addr, get_memop(oi), ra);
+}

-    validate_memop(oi, MO_128 | MO_LE);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    memcpy(&ret, haddr, 16);
-    clear_helper_retaddr();
+Int128 helper_ld_i128(CPUArchState *env, uint64_t addr, MemOpIdx oi)
+{
+    return helper_ld16_mmu(env, addr, oi, GETPC());
+}
+
+Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    Int128 ret = do_ld16_mmu(env, addr, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-
-    if (HOST_BIG_ENDIAN) {
-        ret = bswap128(ret);
-    }
    return ret;
 }

+static void do_st1_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
+                       MemOp mop, uintptr_t ra)
+{
+    void *haddr;
+
+    tcg_debug_assert((mop & MO_SIZE) == MO_8);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
+    stb_p(haddr, val);
+    clear_helper_retaddr();
+}
+
+void helper_stb_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    do_st1_mmu(env, addr, val, get_memop(oi), ra);
+}
+
 void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
                 MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
-
-    validate_memop(oi, MO_UB);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stb_p(haddr, val);
-    clear_helper_retaddr();
+    do_st1_mmu(env, addr, val, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

-void cpu_stw_be_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
+static void do_st2_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
+                       MemOp mop, uintptr_t ra)
+{
+    void *haddr;
+
+    tcg_debug_assert((mop & MO_SIZE) == MO_16);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
+
+    if (mop & MO_BSWAP) {
+        val = bswap16(val);
+    }
+    store_atom_2(env, ra, haddr, mop, val);
+    clear_helper_retaddr();
+}
+
+void helper_stw_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
                    MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
-
-    validate_memop(oi, MO_BEUW);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stw_be_p(haddr, val);
-    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+    do_st2_mmu(env, addr, val, get_memop(oi), ra);
 }

-void cpu_stl_be_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
+void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
                    MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
-
-    validate_memop(oi, MO_BEUL);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stl_be_p(haddr, val);
-    clear_helper_retaddr();
+    do_st2_mmu(env, addr, val, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

-void cpu_stq_be_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
+static void do_st4_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
+                       MemOp mop, uintptr_t ra)
+{
+    void *haddr;
+
+    tcg_debug_assert((mop & MO_SIZE) == MO_32);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
+
+    if (mop & MO_BSWAP) {
+        val = bswap32(val);
+    }
+    store_atom_4(env, ra, haddr, mop, val);
+    clear_helper_retaddr();
+}
+
+void helper_stl_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
                    MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    do_st4_mmu(env, addr, val, get_memop(oi), ra);
+}

-    validate_memop(oi, MO_BEUQ);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stq_be_p(haddr, val);
-    clear_helper_retaddr();
+void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
+                 MemOpIdx oi, uintptr_t ra)
+{
+    do_st4_mmu(env, addr, val, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

-void cpu_stw_le_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
+static void do_st8_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
+                       MemOp mop, uintptr_t ra)
+{
+    void *haddr;
+
+    tcg_debug_assert((mop & MO_SIZE) == MO_64);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
+
+    if (mop & MO_BSWAP) {
+        val = bswap64(val);
+    }
+    store_atom_8(env, ra, haddr, mop, val);
+    clear_helper_retaddr();
+}
+
+void helper_stq_mmu(CPUArchState *env, uint64_t addr, uint64_t val,
                    MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
-
-    validate_memop(oi, MO_LEUW);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stw_le_p(haddr, val);
-    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+    do_st8_mmu(env, addr, val, get_memop(oi), ra);
 }

-void cpu_stl_le_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
+void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
                    MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
-
-    validate_memop(oi, MO_LEUL);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stl_le_p(haddr, val);
-    clear_helper_retaddr();
+    do_st8_mmu(env, addr, val, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

-void cpu_stq_le_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
-                    MemOpIdx oi, uintptr_t ra)
+static void do_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
+                        MemOp mop, uintptr_t ra)
 {
    void *haddr;

-    validate_memop(oi, MO_LEUQ);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stq_le_p(haddr, val);
-    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
-}
+    tcg_debug_assert((mop & MO_SIZE) == MO_128);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);

-void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr,
-                     Int128 val, MemOpIdx oi, uintptr_t ra)
-{
-    void *haddr;
-
-    validate_memop(oi, MO_128 | MO_BE);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    if (!HOST_BIG_ENDIAN) {
+    if (mop & MO_BSWAP) {
        val = bswap128(val);
    }
-    memcpy(haddr, &val, 16);
+    store_atom_16(env, ra, haddr, mop, val);
    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

-void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr,
-                     Int128 val, MemOpIdx oi, uintptr_t ra)
+void helper_st16_mmu(CPUArchState *env, uint64_t addr, Int128 val,
+                     MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    do_st16_mmu(env, addr, val, get_memop(oi), ra);
+}

-    validate_memop(oi, MO_128 | MO_LE);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    if (HOST_BIG_ENDIAN) {
-        val = bswap128(val);
-    }
-    memcpy(haddr, &val, 16);
-    clear_helper_retaddr();
+void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi)
+{
+    helper_st16_mmu(env, addr, val, oi, GETPC());
+}
+
+void cpu_st16_mmu(CPUArchState *env, abi_ptr addr,
+                  Int128 val, MemOpIdx oi, uintptr_t ra)
+{
+    do_st16_mmu(env, addr, val, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

@@ -1267,7 +1310,6 @@ uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
    void *haddr;
    uint64_t ret;

-    validate_memop(oi, MO_BEUQ);
    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
    ret = ldq_p(haddr);
    clear_helper_retaddr();
@@ -1281,12 +1323,9 @@ uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,

 /*
 * Do not allow unaligned operations to proceed.  Return the host address.
- *
- * @prot may be PAGE_READ, PAGE_WRITE, or PAGE_READ|PAGE_WRITE.
 */
 static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
-                               MemOpIdx oi, int size, int prot,
-                               uintptr_t retaddr)
+                               MemOpIdx oi, int size, uintptr_t retaddr)
 {
    MemOp mop = get_memop(oi);
    int a_bits = get_alignment_bits(mop);
@@ -1294,8 +1333,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,

    /* Enforce guest required alignment.  */
    if (unlikely(addr & ((1 << a_bits) - 1))) {
-        MMUAccessType t = prot == PAGE_READ ? MMU_DATA_LOAD : MMU_DATA_STORE;
-        cpu_loop_exit_sigbus(env_cpu(env), addr, t, retaddr);
+        cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_STORE, retaddr);
    }

    /* Enforce qemu required alignment.  */
@@ -1333,7 +1371,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
 #include "atomic_template.h"
 #endif

-#if HAVE_ATOMIC128 || HAVE_CMPXCHG128
+#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
 #define DATA_SIZE 16
 #include "atomic_template.h"
 #endif
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -27,6 +27,7 @@ struct HostMemoryBackendFile {

    char *mem_path;
    uint64_t align;
+    uint64_t offset;
    bool discard_data;
    bool is_pmem;
    bool readonly;
@@ -58,7 +59,8 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
    ram_flags |= fb->is_pmem ? RAM_PMEM : 0;
    memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), name,
                                     backend->size, fb->align, ram_flags,
-                                     fb->mem_path, fb->readonly, errp);
+                                     fb->mem_path, fb->offset, fb->readonly,
+                                     errp);
    g_free(name);
 #endif
 }
@@ -125,6 +127,36 @@ static void file_memory_backend_set_align(Object *o, Visitor *v,
    fb->align = val;
 }

+static void file_memory_backend_get_offset(Object *o, Visitor *v,
+                                          const char *name, void *opaque,
+                                          Error **errp)
+{
+    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
+    uint64_t val = fb->offset;
+
+    visit_type_size(v, name, &val, errp);
+}
+
+static void file_memory_backend_set_offset(Object *o, Visitor *v,
+                                          const char *name, void *opaque,
+                                          Error **errp)
+{
+    HostMemoryBackend *backend = MEMORY_BACKEND(o);
+    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
+    uint64_t val;
+
+    if (host_memory_backend_mr_inited(backend)) {
+        error_setg(errp, "cannot change property '%s' of %s", name,
+                   object_get_typename(o));
+        return;
+    }
+
+    if (!visit_type_size(v, name, &val, errp)) {
+        return;
+    }
+    fb->offset = val;
+}
+
 #ifdef CONFIG_LIBPMEM
 static bool file_memory_backend_get_pmem(Object *o, Error **errp)
 {
@@ -197,6 +229,12 @@ file_backend_class_init(ObjectClass *oc, void *data)
        file_memory_backend_get_align,
        file_memory_backend_set_align,
        NULL, NULL);
+    object_class_property_add(oc, "offset", "int",
+        file_memory_backend_get_offset,
+        file_memory_backend_set_offset,
+        NULL, NULL);
+    object_class_property_set_description(oc, "offset",
+        "Offset into the target file (ex: 1G)");
 #ifdef CONFIG_LIBPMEM
    object_class_property_add_bool(oc, "pmem",
        file_memory_backend_get_pmem, file_memory_backend_set_pmem);
--- a/block.c
+++ b/block.c
@@ -533,7 +533,6 @@ int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename,
    int ret;
    GLOBAL_STATE_CODE();
    ERRP_GUARD();
-    assert_bdrv_graph_readable();

    if (!drv->bdrv_co_create_opts) {
        error_setg(errp, "Driver '%s' does not support image creation",
@@ -680,7 +679,7 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,

    ret = 0;
 out:
-    blk_unref(blk);
+    blk_co_unref(blk);
    return ret;
 }

@@ -1610,10 +1609,11 @@ out:
 * bdrv_refresh_total_sectors() which polls when called from non-coroutine
 * context.
 */
-static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
-                            const char *node_name, QDict *options,
-                            int open_flags, Error **errp)
+static int no_coroutine_fn GRAPH_UNLOCKED
+bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
+                 QDict *options, int open_flags, Error **errp)
 {
+    AioContext *ctx;
    Error *local_err = NULL;
    int i, ret;
    GLOBAL_STATE_CODE();
@@ -1661,13 +1661,22 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
    bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
    bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;

+    /* Get the context after .bdrv_open, it can change the context */
+    ctx = bdrv_get_aio_context(bs);
+    aio_context_acquire(ctx);
+
    ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
+        aio_context_release(ctx);
        return ret;
    }

+    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs, NULL, &local_err);
+    bdrv_graph_rdunlock_main_loop();
+    aio_context_release(ctx);
+
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
@@ -3419,7 +3428,9 @@ static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
    }

 out:
+    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(parent_bs, tran, NULL);
+    bdrv_graph_rdunlock_main_loop();

    return 0;
 }
@@ -3474,6 +3485,8 @@ int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
+ * The caller must hold the main AioContext lock.
+ *
 * TODO Can this be unified with bdrv_open_image()?
 */
 int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
@@ -3640,6 +3653,9 @@ done:
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
+ *
+ * @parent can move to a different AioContext in this function. Callers must
+ * make sure that their AioContext locking is still correct after this.
 */
 BdrvChild *bdrv_open_child(const char *filename,
                           QDict *options, const char *bdref_key,
@@ -3664,6 +3680,9 @@ BdrvChild *bdrv_open_child(const char *filename,

 /*
 * Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
+ *
+ * @parent can move to a different AioContext in this function. Callers must
+ * make sure that their AioContext locking is still correct after this.
 */
 int bdrv_open_file_child(const char *filename,
                         QDict *options, const char *bdref_key,
@@ -3806,9 +3825,7 @@ out:
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 *
- * The caller must always hold @filename AioContext lock, because this
- * function eventually calls bdrv_refresh_total_sectors() which polls
- * when called from non-coroutine context.
+ * The caller must always hold the main AioContext lock.
 */
 static BlockDriverState * no_coroutine_fn
 bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
@@ -4096,11 +4113,7 @@ close_and_fail:
    return NULL;
 }

-/*
- * The caller must always hold @filename AioContext lock, because this
- * function eventually calls bdrv_refresh_total_sectors() which polls
- * when called from non-coroutine context.
- */
+/* The caller must always hold the main AioContext lock. */
 BlockDriverState *bdrv_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp)
 {
@@ -4917,7 +4930,9 @@ static void bdrv_reopen_commit(BDRVReopenState *reopen_state)
    qdict_del(bs->explicit_options, "backing");
    qdict_del(bs->options, "backing");

+    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs, NULL, NULL);
+    bdrv_graph_rdunlock_main_loop();
    bdrv_refresh_total_sectors(bs, bs->total_sectors);
 }

@@ -5316,7 +5331,9 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
 out:
    tran_finalize(tran, ret);

+    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs_top, NULL, NULL);
+    bdrv_graph_rdunlock_main_loop();

    if (new_context && old_context != new_context) {
        aio_context_release(new_context);
@@ -5382,12 +5399,17 @@ static void bdrv_delete(BlockDriverState *bs)
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
+ *
+ * The caller holds the AioContext lock for @bs. It must make sure that @bs
+ * stays in the same AioContext, i.e. @options must not refer to nodes in a
+ * different AioContext.
 */
 BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
                                   int flags, Error **errp)
 {
    ERRP_GUARD();
    int ret;
+    AioContext *ctx = bdrv_get_aio_context(bs);
    BlockDriverState *new_node_bs = NULL;
    const char *drvname, *node_name;
    BlockDriver *drv;
@@ -5408,8 +5430,14 @@ BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,

    GLOBAL_STATE_CODE();

+    aio_context_release(ctx);
+    aio_context_acquire(qemu_get_aio_context());
    new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,
                                            errp);
+    aio_context_release(qemu_get_aio_context());
+    aio_context_acquire(ctx);
+    assert(bdrv_get_aio_context(bs) == ctx);
+
    options = NULL; /* bdrv_new_open_driver() eats options */
    if (!new_node_bs) {
        error_prepend(errp, "Could not create node: ");
@@ -5750,7 +5778,8 @@ exit:
 * sums the size of all data-bearing children.  (This excludes backing
 * children.)
 */
-static int64_t coroutine_fn bdrv_sum_allocated_file_size(BlockDriverState *bs)
+static int64_t coroutine_fn GRAPH_RDLOCK
+bdrv_sum_allocated_file_size(BlockDriverState *bs)
 {
    BdrvChild *child;
    int64_t child_size, sum = 0;
@@ -5778,6 +5807,7 @@ int64_t coroutine_fn bdrv_co_get_allocated_file_size(BlockDriverState *bs)
 {
    BlockDriver *drv = bs->drv;
    IO_CODE();
+    assert_bdrv_graph_readable();

    if (!drv) {
        return -ENOMEDIUM;
@@ -6347,6 +6377,8 @@ int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    int ret;
    BlockDriver *drv = bs->drv;
    IO_CODE();
+    assert_bdrv_graph_readable();
+
    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
    if (!drv) {
        return -ENOMEDIUM;
@@ -6395,6 +6427,8 @@ BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
 void coroutine_fn bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent event)
 {
    IO_CODE();
+    assert_bdrv_graph_readable();
+
    if (!bs || !bs->drv || !bs->drv->bdrv_co_debug_event) {
        return;
    }
@@ -7029,6 +7063,8 @@ void bdrv_img_create(const char *filename, const char *fmt,
        return;
    }

+    aio_context_acquire(qemu_get_aio_context());
+
    /* Create parameter list */
    create_opts = qemu_opts_append(create_opts, drv->create_opts);
    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
@@ -7178,6 +7214,7 @@ out:
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    error_propagate(errp, local_err);
+    aio_context_release(qemu_get_aio_context());
 }

 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
@@ -7268,9 +7305,6 @@ static void bdrv_detach_aio_context(BlockDriverState *bs)
        bs->drv->bdrv_detach_aio_context(bs);
    }

-    if (bs->quiesce_counter) {
-        aio_enable_external(bs->aio_context);
-    }
    bs->aio_context = NULL;
 }

@@ -7280,10 +7314,6 @@ static void bdrv_attach_aio_context(BlockDriverState *bs,
    BdrvAioNotifier *ban, *ban_tmp;
    GLOBAL_STATE_CODE();

-    if (bs->quiesce_counter) {
-        aio_disable_external(new_context);
-    }
-
    bs->aio_context = new_context;

    if (bs->drv && bs->drv->bdrv_attach_aio_context) {
@@ -7967,6 +7997,25 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
        return;
    }

+    /*
+     * Non-zoned block drivers do not follow zoned storage constraints
+     * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
+     * drivers in a graph.
+     */
+    if (!parent_bs->drv->supports_zoned_children &&
+        child_bs->bl.zoned == BLK_Z_HM) {
+        /*
+         * The host-aware model allows zoned storage constraints and random
+         * write. Allow mixing host-aware and non-zoned drivers. Using
+         * host-aware device as a regular device.
+         */
+        error_setg(errp, "Cannot add a %s child to a %s parent",
+                   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
+                   parent_bs->drv->supports_zoned_children ?
+                   "support zoned children" : "not support zoned children");
+        return;
+    }
+
    if (!QLIST_EMPTY(&child_bs->parents)) {
        error_setg(errp, "The node %s already has a parent",
                   child_bs->node_name);
--- a/block/amend.c
+++ b/block/amend.c
@@ -46,6 +46,7 @@ static int coroutine_fn blockdev_amend_run(Job *job, Error **errp)
 {
    BlockdevAmendJob *s = container_of(job, BlockdevAmendJob, common);
    int ret;
+    GRAPH_RDLOCK_GUARD();

    job_progress_set_remaining(&s->common, 1);
    ret = s->bs->drv->bdrv_co_amend(s->bs, s->opts, s->force, errp);
@@ -54,7 +55,8 @@ static int coroutine_fn blockdev_amend_run(Job *job, Error **errp)
    return ret;
 }

-static int blockdev_amend_pre_run(BlockdevAmendJob *s, Error **errp)
+static int GRAPH_RDLOCK
+blockdev_amend_pre_run(BlockdevAmendJob *s, Error **errp)
 {
    if (s->bs->drv->bdrv_amend_pre_run) {
        return s->bs->drv->bdrv_amend_pre_run(s->bs, errp);
@@ -67,9 +69,11 @@ static void blockdev_amend_free(Job *job)
 {
    BlockdevAmendJob *s = container_of(job, BlockdevAmendJob, common);

+    bdrv_graph_rdlock_main_loop();
    if (s->bs->drv->bdrv_amend_clean) {
        s->bs->drv->bdrv_amend_clean(s->bs);
    }
+    bdrv_graph_rdunlock_main_loop();

    bdrv_unref(s->bs);
 }
@@ -93,6 +97,8 @@ void qmp_x_blockdev_amend(const char *job_id,
    BlockDriver *drv = bdrv_find_format(fmt);
    BlockDriverState *bs;

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    bs = bdrv_lookup_bs(NULL, node_name, errp);
    if (!bs) {
        return;
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -17,6 +17,7 @@
 #include "qemu/error-report.h"
 #include "qapi/qmp/qdict.h"
 #include "qemu/module.h"
+#include "sysemu/block-backend.h"
 #include "exec/memory.h" /* for ram_block_discard_disable() */

 #include "block/block-io.h"
@@ -306,35 +307,44 @@ static void blkio_attach_aio_context(BlockDriverState *bs,
 {
    BDRVBlkioState *s = bs->opaque;

-    aio_set_fd_handler(new_context,
-                       s->completion_fd,
-                       false,
-                       blkio_completion_fd_read,
-                       NULL,
+    aio_set_fd_handler(new_context, s->completion_fd,
+                       blkio_completion_fd_read, NULL,
                       blkio_completion_fd_poll,
-                       blkio_completion_fd_poll_ready,
-                       bs);
+                       blkio_completion_fd_poll_ready, bs);
 }

 static void blkio_detach_aio_context(BlockDriverState *bs)
 {
    BDRVBlkioState *s = bs->opaque;

-    aio_set_fd_handler(bdrv_get_aio_context(bs),
-                       s->completion_fd,
-                       false, NULL, NULL, NULL, NULL, NULL);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
+                       NULL, NULL, NULL);
 }

-/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
-static void blkio_submit_io(BlockDriverState *bs)
+/*
+ * Called by blk_io_unplug() or immediately if not plugged. Called without
+ * blkio_lock.
+ */
+static void blkio_unplug_fn(void *opaque)
 {
-    if (qatomic_read(&bs->io_plugged) == 0) {
-        BDRVBlkioState *s = bs->opaque;
+    BDRVBlkioState *s = opaque;

+    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
 }

+/*
+ * Schedule I/O submission after enqueuing a new request. Called without
+ * blkio_lock.
+ */
+static void blkio_submit_io(BlockDriverState *bs)
+{
+    BDRVBlkioState *s = bs->opaque;
+
+    blk_io_plug_call(blkio_unplug_fn, s);
+}
+
 static int coroutine_fn
 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 {
@@ -345,9 +355,9 @@ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
-        blkio_submit_io(bs);
    }

+    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
 }
@@ -378,9 +388,9 @@ blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
-        blkio_submit_io(bs);
    }

+    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
@@ -423,9 +433,9 @@ static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
-        blkio_submit_io(bs);
    }

+    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
@@ -444,9 +454,9 @@ static int coroutine_fn blkio_co_flush(BlockDriverState *bs)

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
-        blkio_submit_io(bs);
    }

+    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
 }
@@ -472,22 +482,13 @@ static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
-        blkio_submit_io(bs);
    }

+    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
 }

-static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVBlkioState *s = bs->opaque;
-
-    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
-        blkio_submit_io(bs);
-    }
-}
-
 typedef enum {
    BMRR_OK,
    BMRR_SKIP,
@@ -672,25 +673,60 @@ static int blkio_virtio_blk_common_open(BlockDriverState *bs,
 {
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
-    int ret;
+    bool fd_supported = false;
+    int fd, ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

-    ret = blkio_set_str(s->blkio, "path", path);
-    qdict_del(options, "path");
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "failed to set path: %s",
-                         blkio_get_error_msg());
-        return ret;
-    }
-
    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }
+
+    if (blkio_get_int(s->blkio, "fd", &fd) == 0) {
+        fd_supported = true;
+    }
+
+    /*
+     * If the libblkio driver supports fd passing, let's always use qemu_open()
+     * to open the `path`, so we can handle fd passing from the management
+     * layer through the "/dev/fdset/N" special path.
+     */
+    if (fd_supported) {
+        int open_flags;
+
+        if (flags & BDRV_O_RDWR) {
+            open_flags = O_RDWR;
+        } else {
+            open_flags = O_RDONLY;
+        }
+
+        fd = qemu_open(path, open_flags, errp);
+        if (fd < 0) {
+            return -EINVAL;
+        }
+
+        ret = blkio_set_int(s->blkio, "fd", fd);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "failed to set fd: %s",
+                             blkio_get_error_msg());
+            qemu_close(fd);
+            return ret;
+        }
+    } else {
+        ret = blkio_set_str(s->blkio, "path", path);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "failed to set path: %s",
+                             blkio_get_error_msg());
+            return ret;
+        }
+    }
+
+    qdict_del(options, "path");
+
    return 0;
 }

@@ -1009,7 +1045,6 @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
        .bdrv_co_pwritev         = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk   = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
-        .bdrv_co_io_unplug       = blkio_co_io_unplug, \
        .bdrv_refresh_limits     = blkio_refresh_limits, \
        .bdrv_register_buf       = blkio_register_buf, \
        .bdrv_unregister_buf     = blkio_unregister_buf, \
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -265,8 +265,9 @@ static int coroutine_fn GRAPH_RDLOCK blkverify_co_flush(BlockDriverState *bs)
    return bdrv_co_flush(s->test_file->bs);
 }

-static bool blkverify_recurse_can_replace(BlockDriverState *bs,
-                                          BlockDriverState *to_replace)
+static bool GRAPH_RDLOCK
+blkverify_recurse_can_replace(BlockDriverState *bs,
+                              BlockDriverState *to_replace)
 {
    BDRVBlkverifyState *s = bs->opaque;

--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -389,6 +389,8 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
 * Both sets of permissions can be changed later using blk_set_perm().
 *
 * Return the new BlockBackend on success, null on failure.
+ *
+ * Callers must hold the AioContext lock of @bs.
 */
 BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
                              uint64_t shared_perm, Error **errp)
@@ -406,11 +408,15 @@ BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,

 /*
 * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
- * The new BlockBackend is in the main AioContext.
+ * By default, the new BlockBackend is in the main AioContext, but if the
+ * parameters connect it with any existing node in a different AioContext, it
+ * may end up there instead.
 *
 * Just as with bdrv_open(), after having called this function the reference to
 * @options belongs to the block layer (even on failure).
 *
+ * Called without holding an AioContext lock.
+ *
 * TODO: Remove @filename and @flags; it should be possible to specify a whole
 * BDS tree just by specifying the @options QDict (or @reference,
 * alternatively). At the time of adding this function, this is not possible,
@@ -422,6 +428,7 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
 {
    BlockBackend *blk;
    BlockDriverState *bs;
+    AioContext *ctx;
    uint64_t perm = 0;
    uint64_t shared = BLK_PERM_ALL;

@@ -451,16 +458,24 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
        shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
    }

-    blk = blk_new(qemu_get_aio_context(), perm, shared);
+    aio_context_acquire(qemu_get_aio_context());
    bs = bdrv_open(filename, reference, options, flags, errp);
+    aio_context_release(qemu_get_aio_context());
    if (!bs) {
-        blk_unref(blk);
        return NULL;
    }

-    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
-                                       BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
-                                       perm, shared, blk, errp);
+    /* bdrv_open() could have moved bs to a different AioContext */
+    ctx = bdrv_get_aio_context(bs);
+    blk = blk_new(bdrv_get_aio_context(bs), perm, shared);
+    blk->perm = perm;
+    blk->shared_perm = shared;
+
+    aio_context_acquire(ctx);
+    blk_insert_bs(blk, bs, errp);
+    bdrv_unref(bs);
+    aio_context_release(ctx);
+
    if (!blk->root) {
        blk_unref(blk);
        return NULL;
@@ -901,6 +916,8 @@ void blk_remove_bs(BlockBackend *blk)

 /*
 * Associates a new BlockDriverState with @blk.
+ *
+ * Callers must hold the AioContext lock of @bs.
 */
 int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
 {
@@ -1270,6 +1287,13 @@ blk_check_byte_request(BlockBackend *blk, int64_t offset, int64_t bytes)
    return 0;
 }

+/* Are we currently in a drained section? */
+bool blk_in_drain(BlockBackend *blk)
+{
+    GLOBAL_STATE_CODE(); /* change to IO_OR_GS_CODE(), if necessary */
+    return qatomic_read(&blk->quiesce_counter);
+}
+
 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
 static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
 {
@@ -1845,6 +1869,204 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
    return ret;
 }

+static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
+                                   (unsigned int*)(uintptr_t)acb->bytes,
+                                   rwco->iobuf);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
+                                unsigned int *nr_zones,
+                                BlockZoneDescriptor  *zones,
+                                BlockCompletionFunc *cb, void *opaque)
+{
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk    = blk,
+        .offset = offset,
+        .iobuf  = zones,
+        .ret    = NOT_DONE,
+    };
+    acb->bytes = (int64_t)(uintptr_t)nr_zones,
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
+    aio_co_enter(blk_get_aio_context(blk), co);
+
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_mgmt(rwco->blk,
+                                 (BlockZoneOp)(uintptr_t)rwco->iobuf,
+                                 rwco->offset, acb->bytes);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                              int64_t offset, int64_t len,
+                              BlockCompletionFunc *cb, void *opaque) {
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk    = blk,
+        .offset = offset,
+        .iobuf  = (void *)(uintptr_t)op,
+        .ret    = NOT_DONE,
+    };
+    acb->bytes = len;
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
+    aio_co_enter(blk_get_aio_context(blk), co);
+
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes,
+                                   rwco->iobuf, rwco->flags);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque) {
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk    = blk,
+        .ret    = NOT_DONE,
+        .flags  = flags,
+        .iobuf  = qiov,
+    };
+    acb->bytes = (int64_t)(uintptr_t)offset;
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+    aio_co_enter(blk_get_aio_context(blk), co);
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones represents IN maximum and OUT actual.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    unsigned int *nr_zones,
+                                    BlockZoneDescriptor *zones)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk); /* increase before waiting */
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Send a zone_management command.
+ * op is the zone operation;
+ * offset is the byte offset from the start of the zoned device;
+ * len is the maximum number of bytes the command should operate on. It
+ * should be aligned with the device zone size.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+        int64_t offset, int64_t len)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        blk_dec_in_flight(blk);
+        return ret;
+    }
+
+    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+        QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+
+    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
    BlockDriverState *bs = blk_bs(blk);
@@ -2024,7 +2246,15 @@ void blk_activate(BlockBackend *blk, Error **errp)
        return;
    }

-    bdrv_activate(bs, errp);
+    /*
+     * Migration code can call this function in coroutine context, so leave
+     * coroutine context if necessary.
+     */
+    if (qemu_in_coroutine()) {
+        bdrv_co_activate(bs, errp);
+    } else {
+        bdrv_activate(bs, errp);
+    }
 }

 bool coroutine_fn blk_co_is_inserted(BlockBackend *blk)
@@ -2188,9 +2418,14 @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason)

 AioContext *blk_get_aio_context(BlockBackend *blk)
 {
-    BlockDriverState *bs = blk_bs(blk);
+    BlockDriverState *bs;
    IO_CODE();

+    if (!blk) {
+        return qemu_get_aio_context();
+    }
+
+    bs = blk_bs(blk);
    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(blk_bs(blk));
        assert(ctx == blk->ctx);
@@ -2205,52 +2440,31 @@ static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
    return blk_get_aio_context(blk_acb->blk);
 }

-static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
-                                  bool update_root_node, Error **errp)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
-    int ret;
-
-    if (bs) {
-        bdrv_ref(bs);
-
-        if (update_root_node) {
-            /*
-             * update_root_node MUST be false for blk_root_set_aio_ctx_commit(),
-             * as we are already in the commit function of a transaction.
-             */
-            ret = bdrv_try_change_aio_context(bs, new_context, blk->root, errp);
-            if (ret < 0) {
-                bdrv_unref(bs);
-                return ret;
-            }
-        }
-        /*
-         * Make blk->ctx consistent with the root node before we invoke any
-         * other operations like drain that might inquire blk->ctx
-         */
-        blk->ctx = new_context;
-        if (tgm->throttle_state) {
-            bdrv_drained_begin(bs);
-            throttle_group_detach_aio_context(tgm);
-            throttle_group_attach_aio_context(tgm, new_context);
-            bdrv_drained_end(bs);
-        }
-
-        bdrv_unref(bs);
-    } else {
-        blk->ctx = new_context;
-    }
-
-    return 0;
-}
-
 int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
                        Error **errp)
 {
+    bool old_allow_change;
+    BlockDriverState *bs = blk_bs(blk);
+    int ret;
+
    GLOBAL_STATE_CODE();
-    return blk_do_set_aio_context(blk, new_context, true, errp);
+
+    if (!bs) {
+        blk->ctx = new_context;
+        return 0;
+    }
+
+    bdrv_ref(bs);
+
+    old_allow_change = blk->allow_aio_context_change;
+    blk->allow_aio_context_change = true;
+
+    ret = bdrv_try_change_aio_context(bs, new_context, NULL, errp);
+
+    blk->allow_aio_context_change = old_allow_change;
+
+    bdrv_unref(bs);
+    return ret;
 }

 typedef struct BdrvStateBlkRootContext {
@@ -2262,8 +2476,14 @@ static void blk_root_set_aio_ctx_commit(void *opaque)
 {
    BdrvStateBlkRootContext *s = opaque;
    BlockBackend *blk = s->blk;
+    AioContext *new_context = s->new_ctx;
+    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;

-    blk_do_set_aio_context(blk, s->new_ctx, false, &error_abort);
+    blk->ctx = new_context;
+    if (tgm->throttle_state) {
+        throttle_group_detach_aio_context(tgm);
+        throttle_group_attach_aio_context(tgm, new_context);
+    }
 }

 static TransactionActionDrv set_blk_root_context = {
@@ -2362,28 +2582,6 @@ void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
    notifier_list_add(&blk->insert_bs_notifiers, notify);
 }

-void coroutine_fn blk_co_io_plug(BlockBackend *blk)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    IO_CODE();
-    GRAPH_RDLOCK_GUARD();
-
-    if (bs) {
-        bdrv_co_io_plug(bs);
-    }
-}
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    IO_CODE();
-    GRAPH_RDLOCK_GUARD();
-
-    if (bs) {
-        bdrv_co_io_unplug(bs);
-    }
-}
-
 BlockAcctStats *blk_get_stats(BlockBackend *blk)
 {
    IO_CODE();
--- a/block/commit.c
+++ b/block/commit.c
@@ -116,7 +116,6 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
 {
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    int64_t offset;
-    uint64_t delay_ns = 0;
    int ret = 0;
    int64_t n = 0; /* bytes */
    QEMU_AUTO_VFREE void *buf = NULL;
@@ -149,7 +148,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
-        job_sleep_ns(&s->common.job, delay_ns);
+        block_job_ratelimit_sleep(&s->common);
        if (job_is_cancelled(&s->common.job)) {
            break;
        }
@@ -184,9 +183,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
        job_progress_update(&s->common.job, n);

        if (copy) {
-            delay_ns = block_job_ratelimit_get_delay(&s->common, n);
-        } else {
-            delay_ns = 0;
+            block_job_ratelimit_processed_bytes(&s->common, n);
        }
    }

--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -412,6 +412,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
    int64_t cluster_size;
    g_autoptr(BlockdevOptions) full_opts = NULL;
    BlockdevOptionsCbw *opts;
+    AioContext *ctx;
    int ret;

    full_opts = cbw_parse_options(options, errp);
@@ -432,11 +433,15 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
        return -EINVAL;
    }

+    ctx = bdrv_get_aio_context(bs);
+    aio_context_acquire(ctx);
+
    if (opts->bitmap) {
        bitmap = block_dirty_bitmap_lookup(opts->bitmap->node,
                                           opts->bitmap->name, NULL, errp);
        if (!bitmap) {
-            return -EINVAL;
+            ret = -EINVAL;
+            goto out;
        }
    }
    s->on_cbw_error = opts->has_on_cbw_error ? opts->on_cbw_error :
@@ -454,21 +459,24 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
    s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
    if (!s->bcs) {
        error_prepend(errp, "Cannot create block-copy-state: ");
-        return -EINVAL;
+        ret = -EINVAL;
+        goto out;
    }

    cluster_size = block_copy_cluster_size(s->bcs);

    s->done_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
    if (!s->done_bitmap) {
-        return -EINVAL;
+        ret = -EINVAL;
+        goto out;
    }
    bdrv_disable_dirty_bitmap(s->done_bitmap);

    /* s->access_bitmap starts equal to bcs bitmap */
    s->access_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
    if (!s->access_bitmap) {
-        return -EINVAL;
+        ret = -EINVAL;
+        goto out;
    }
    bdrv_disable_dirty_bitmap(s->access_bitmap);
    bdrv_dirty_bitmap_merge_internal(s->access_bitmap,
@@ -478,7 +486,10 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
    qemu_co_mutex_init(&s->lock);
    QLIST_INIT(&s->frozen_read_reqs);

-    return 0;
+    ret = 0;
+out:
+    aio_context_release(ctx);
+    return ret;
 }

 static void cbw_close(BlockDriverState *bs)
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -61,7 +61,7 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 int coroutine_fn GRAPH_RDLOCK
 bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);

-int coroutine_fn
+int coroutine_fn GRAPH_RDLOCK
 nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking,
                               Error **errp);

@@ -85,7 +85,8 @@ bdrv_common_block_status_above(BlockDriverState *bs,
                               int64_t *map,
                               BlockDriverState **file,
                               int *depth);
-int co_wrapper_mixed
+
+int co_wrapper_mixed_bdrv_rdlock
 nbd_do_establish_connection(BlockDriverState *bs, bool blocking, Error **errp);

 #endif /* BLOCK_COROUTINES_H */
--- a/block/create.c
+++ b/block/create.c
@@ -43,7 +43,6 @@ static int coroutine_fn blockdev_create_run(Job *job, Error **errp)
    int ret;

    GLOBAL_STATE_CODE();
-    GRAPH_RDLOCK_GUARD();

    job_progress_set_remaining(&s->common, 1);
    ret = s->drv->bdrv_co_create(s->opts, errp);
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -99,12 +99,10 @@ struct BlockCryptoCreateData {
 };


-static int block_crypto_create_write_func(QCryptoBlock *block,
-                                          size_t offset,
-                                          const uint8_t *buf,
-                                          size_t buflen,
-                                          void *opaque,
-                                          Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+block_crypto_create_write_func(QCryptoBlock *block, size_t offset,
+                               const uint8_t *buf, size_t buflen, void *opaque,
+                               Error **errp)
 {
    struct BlockCryptoCreateData *data = opaque;
    ssize_t ret;
@@ -117,10 +115,9 @@ static int block_crypto_create_write_func(QCryptoBlock *block,
    return 0;
 }

-static int block_crypto_create_init_func(QCryptoBlock *block,
-                                         size_t headerlen,
-                                         void *opaque,
-                                         Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+block_crypto_create_init_func(QCryptoBlock *block, size_t headerlen,
+                              void *opaque, Error **errp)
 {
    struct BlockCryptoCreateData *data = opaque;
    Error *local_error = NULL;
@@ -314,7 +311,7 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
 }


-static int coroutine_fn
+static int coroutine_fn GRAPH_UNLOCKED
 block_crypto_co_create_generic(BlockDriverState *bs, int64_t size,
                               QCryptoBlockCreateOptions *opts,
                               PreallocMode prealloc, Error **errp)
@@ -355,7 +352,7 @@ block_crypto_co_create_generic(BlockDriverState *bs, int64_t size,
    ret = 0;
 cleanup:
    qcrypto_block_free(crypto);
-    blk_unref(blk);
+    blk_co_unref(blk);
    return ret;
 }

@@ -627,7 +624,7 @@ static int block_crypto_open_luks(BlockDriverState *bs,
                                     bs, options, flags, errp);
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_UNLOCKED
 block_crypto_co_create_luks(BlockdevCreateOptions *create_options, Error **errp)
 {
    BlockdevCreateOptionsLUKS *luks_opts;
@@ -661,11 +658,11 @@ block_crypto_co_create_luks(BlockdevCreateOptions *create_options, Error **errp)

    ret = 0;
 fail:
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    return ret;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 block_crypto_co_create_opts_luks(BlockDriver *drv, const char *filename,
                                 QemuOpts *opts, Error **errp)
 {
@@ -727,16 +724,18 @@ fail:
     * beforehand, it has been truncated and corrupted in the process.
     */
    if (ret) {
+        bdrv_graph_co_rdlock();
        bdrv_co_delete_file_noerr(bs);
+        bdrv_graph_co_rdunlock();
    }

-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_QCryptoBlockCreateOptions(create_opts);
    qobject_unref(cryptoopts);
    return ret;
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 block_crypto_co_get_info_luks(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
    BlockDriverInfo subbdi;
--- a/block/curl.c
+++ b/block/curl.c
@@ -132,7 +132,7 @@ static gboolean curl_drop_socket(void *key, void *value, void *opaque)
    CURLSocket *socket = value;
    BDRVCURLState *s = socket->s;

-    aio_set_fd_handler(s->aio_context, socket->fd, false,
+    aio_set_fd_handler(s->aio_context, socket->fd,
                       NULL, NULL, NULL, NULL, NULL);
    return true;
 }
@@ -180,20 +180,20 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
    trace_curl_sock_cb(action, (int)fd);
    switch (action) {
        case CURL_POLL_IN:
-            aio_set_fd_handler(s->aio_context, fd, false,
+            aio_set_fd_handler(s->aio_context, fd,
                               curl_multi_do, NULL, NULL, NULL, socket);
            break;
        case CURL_POLL_OUT:
-            aio_set_fd_handler(s->aio_context, fd, false,
+            aio_set_fd_handler(s->aio_context, fd,
                               NULL, curl_multi_do, NULL, NULL, socket);
            break;
        case CURL_POLL_INOUT:
-            aio_set_fd_handler(s->aio_context, fd, false,
+            aio_set_fd_handler(s->aio_context, fd,
                               curl_multi_do, curl_multi_do,
                               NULL, NULL, socket);
            break;
        case CURL_POLL_REMOVE:
-            aio_set_fd_handler(s->aio_context, fd, false,
+            aio_set_fd_handler(s->aio_context, fd,
                               NULL, NULL, NULL, NULL, NULL);
            break;
    }
--- a/block/export/export.c
+++ b/block/export/export.c
@@ -192,8 +192,10 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
    return exp;

 fail:
-    blk_set_dev_ops(exp->blk, NULL, NULL);
-    blk_unref(blk);
+    if (blk) {
+        blk_set_dev_ops(blk, NULL, NULL);
+        blk_unref(blk);
+    }
    aio_context_release(ctx);
    if (exp) {
        g_free(exp->id);
@@ -202,11 +204,10 @@ fail:
    return NULL;
 }

-/* Callers must hold exp->ctx lock */
 void blk_exp_ref(BlockExport *exp)
 {
-    assert(exp->refcount > 0);
-    exp->refcount++;
+    assert(qatomic_read(&exp->refcount) > 0);
+    qatomic_inc(&exp->refcount);
 }

 /* Runs in the main thread */
@@ -229,11 +230,10 @@ static void blk_exp_delete_bh(void *opaque)
    aio_context_release(aio_context);
 }

-/* Callers must hold exp->ctx lock */
 void blk_exp_unref(BlockExport *exp)
 {
-    assert(exp->refcount > 0);
-    if (--exp->refcount == 0) {
+    assert(qatomic_read(&exp->refcount) > 0);
+    if (qatomic_fetch_dec(&exp->refcount) == 1) {
        /* Touch the block_exports list only in the main thread */
        aio_bh_schedule_oneshot(qemu_get_aio_context(), blk_exp_delete_bh,
                                exp);
@@ -341,7 +341,8 @@ void qmp_block_export_del(const char *id,
    if (!has_mode) {
        mode = BLOCK_EXPORT_REMOVE_MODE_SAFE;
    }
-    if (mode == BLOCK_EXPORT_REMOVE_MODE_SAFE && exp->refcount > 1) {
+    if (mode == BLOCK_EXPORT_REMOVE_MODE_SAFE &&
+        qatomic_read(&exp->refcount) > 1) {
        error_setg(errp, "export '%s' still in use", exp->id);
        error_append_hint(errp, "Use mode='hard' to force client "
                          "disconnect\n");
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -50,6 +50,7 @@ typedef struct FuseExport {

    struct fuse_session *fuse_session;
    struct fuse_buf fuse_buf;
+    unsigned int in_flight; /* atomic */
    bool mounted, fd_handler_set_up;

    char *mountpoint;
@@ -78,6 +79,42 @@ static void read_from_fuse_export(void *opaque);
 static bool is_regular_file(const char *path, Error **errp);


+static void fuse_export_drained_begin(void *opaque)
+{
+    FuseExport *exp = opaque;
+
+    aio_set_fd_handler(exp->common.ctx,
+                       fuse_session_fd(exp->fuse_session),
+                       NULL, NULL, NULL, NULL, NULL);
+    exp->fd_handler_set_up = false;
+}
+
+static void fuse_export_drained_end(void *opaque)
+{
+    FuseExport *exp = opaque;
+
+    /* Refresh AioContext in case it changed */
+    exp->common.ctx = blk_get_aio_context(exp->common.blk);
+
+    aio_set_fd_handler(exp->common.ctx,
+                       fuse_session_fd(exp->fuse_session),
+                       read_from_fuse_export, NULL, NULL, NULL, exp);
+    exp->fd_handler_set_up = true;
+}
+
+static bool fuse_export_drained_poll(void *opaque)
+{
+    FuseExport *exp = opaque;
+
+    return qatomic_read(&exp->in_flight) > 0;
+}
+
+static const BlockDevOps fuse_export_blk_dev_ops = {
+    .drained_begin = fuse_export_drained_begin,
+    .drained_end   = fuse_export_drained_end,
+    .drained_poll  = fuse_export_drained_poll,
+};
+
 static int fuse_export_create(BlockExport *blk_exp,
                              BlockExportOptions *blk_exp_args,
                              Error **errp)
@@ -101,6 +138,15 @@ static int fuse_export_create(BlockExport *blk_exp,
        }
    }

+    blk_set_dev_ops(exp->common.blk, &fuse_export_blk_dev_ops, exp);
+
+    /*
+     * We handle draining ourselves using an in-flight counter and by disabling
+     * the FUSE fd handler. Do not queue BlockBackend requests, they need to
+     * complete so the in-flight counter reaches zero.
+     */
+    blk_set_disable_request_queuing(exp->common.blk, true);
+
    init_exports_table();

    /*
@@ -224,7 +270,7 @@ static int setup_fuse_export(FuseExport *exp, const char *mountpoint,
    g_hash_table_insert(exports, g_strdup(mountpoint), NULL);

    aio_set_fd_handler(exp->common.ctx,
-                       fuse_session_fd(exp->fuse_session), true,
+                       fuse_session_fd(exp->fuse_session),
                       read_from_fuse_export, NULL, NULL, NULL, exp);
    exp->fd_handler_set_up = true;

@@ -246,6 +292,8 @@ static void read_from_fuse_export(void *opaque)

    blk_exp_ref(&exp->common);

+    qatomic_inc(&exp->in_flight);
+
    do {
        ret = fuse_session_receive_buf(exp->fuse_session, &exp->fuse_buf);
    } while (ret == -EINTR);
@@ -256,6 +304,10 @@ static void read_from_fuse_export(void *opaque)
    fuse_session_process_buf(exp->fuse_session, &exp->fuse_buf);

 out:
+    if (qatomic_fetch_dec(&exp->in_flight) == 1) {
+        aio_wait_kick(); /* wake AIO_WAIT_WHILE() */
+    }
+
    blk_exp_unref(&exp->common);
 }

@@ -268,7 +320,7 @@ static void fuse_export_shutdown(BlockExport *blk_exp)

        if (exp->fd_handler_set_up) {
            aio_set_fd_handler(exp->common.ctx,
-                               fuse_session_fd(exp->fuse_session), true,
+                               fuse_session_fd(exp->fuse_session),
                               NULL, NULL, NULL, NULL, NULL);
            exp->fd_handler_set_up = false;
        }
--- a/block/export/vduse-blk.c
+++ b/block/export/vduse-blk.c
@@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
    VduseDev *dev;
    uint16_t num_queues;
    char *recon_file;
-    unsigned int inflight;
+    unsigned int inflight; /* atomic */
+    bool vqs_started;
 } VduseBlkExport;

 typedef struct VduseBlkReq {
@@ -41,13 +42,20 @@ typedef struct VduseBlkReq {

 static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
 {
-    vblk_exp->inflight++;
+    if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {
+        /* Prevent export from being deleted */
+        blk_exp_ref(&vblk_exp->export);
+    }
 }

 static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
 {
-    if (--vblk_exp->inflight == 0) {
+    if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) {
+        /* Wake AIO_WAIT_WHILE() */
        aio_wait_kick();
+
+        /* Now the export can be deleted */
+        blk_exp_unref(&vblk_exp->export);
    }
 }

@@ -124,8 +132,12 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
 {
    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);

+    if (!vblk_exp->vqs_started) {
+        return; /* vduse_blk_drained_end() will start vqs later */
+    }
+
    aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
-                       true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
+                       on_vduse_vq_kick, NULL, NULL, NULL, vq);
    /* Make sure we don't miss any kick afer reconnecting */
    eventfd_write(vduse_queue_get_fd(vq), 1);
 }
@@ -133,9 +145,14 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
 static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
 {
    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
+    int fd = vduse_queue_get_fd(vq);

-    aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
-                       true, NULL, NULL, NULL, NULL, NULL);
+    if (fd < 0) {
+        return;
+    }
+
+    aio_set_fd_handler(vblk_exp->export.ctx, fd,
+                       NULL, NULL, NULL, NULL, NULL);
 }

 static const VduseOps vduse_blk_ops = {
@@ -152,42 +169,19 @@ static void on_vduse_dev_kick(void *opaque)

 static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
 {
-    int i;
-
    aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
-                       true, on_vduse_dev_kick, NULL, NULL, NULL,
+                       on_vduse_dev_kick, NULL, NULL, NULL,
                       vblk_exp->dev);

-    for (i = 0; i < vblk_exp->num_queues; i++) {
-        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
-        int fd = vduse_queue_get_fd(vq);
-
-        if (fd < 0) {
-            continue;
-        }
-        aio_set_fd_handler(vblk_exp->export.ctx, fd, true,
-                           on_vduse_vq_kick, NULL, NULL, NULL, vq);
-    }
+    /* Virtqueues are handled by vduse_blk_drained_end() */
 }

 static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
 {
-    int i;
-
-    for (i = 0; i < vblk_exp->num_queues; i++) {
-        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
-        int fd = vduse_queue_get_fd(vq);
-
-        if (fd < 0) {
-            continue;
-        }
-        aio_set_fd_handler(vblk_exp->export.ctx, fd,
-                           true, NULL, NULL, NULL, NULL, NULL);
-    }
    aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
-                       true, NULL, NULL, NULL, NULL, NULL);
+                       NULL, NULL, NULL, NULL, NULL);

-    AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0);
+    /* Virtqueues are handled by vduse_blk_drained_begin() */
 }


@@ -220,8 +214,55 @@ static void vduse_blk_resize(void *opaque)
                            (char *)&config.capacity);
 }

+static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp)
+{
+    for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
+        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
+        vduse_blk_disable_queue(vblk_exp->dev, vq);
+    }
+
+    vblk_exp->vqs_started = false;
+}
+
+static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp)
+{
+    vblk_exp->vqs_started = true;
+
+    for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
+        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
+        vduse_blk_enable_queue(vblk_exp->dev, vq);
+    }
+}
+
+static void vduse_blk_drained_begin(void *opaque)
+{
+    BlockExport *exp = opaque;
+    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+    vduse_blk_stop_virtqueues(vblk_exp);
+}
+
+static void vduse_blk_drained_end(void *opaque)
+{
+    BlockExport *exp = opaque;
+    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+    vduse_blk_start_virtqueues(vblk_exp);
+}
+
+static bool vduse_blk_drained_poll(void *opaque)
+{
+    BlockExport *exp = opaque;
+    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+    return qatomic_read(&vblk_exp->inflight) > 0;
+}
+
 static const BlockDevOps vduse_block_ops = {
-    .resize_cb = vduse_blk_resize,
+    .resize_cb     = vduse_blk_resize,
+    .drained_begin = vduse_blk_drained_begin,
+    .drained_end   = vduse_blk_drained_end,
+    .drained_poll  = vduse_blk_drained_poll,
 };

 static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
@@ -268,6 +309,7 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
    vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: "");
    vblk_exp->handler.logical_block_size = logical_block_size;
    vblk_exp->handler.writable = opts->writable;
+    vblk_exp->vqs_started = true;

    config.capacity =
            cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
@@ -322,14 +364,20 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
        vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
    }

-    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true,
+    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev),
                       on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);

    blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                 vblk_exp);
-
    blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);

+    /*
+     * We handle draining ourselves using an in-flight counter and by disabling
+     * virtqueue fd handlers. Do not queue BlockBackend requests, they need to
+     * complete so the in-flight counter reaches zero.
+     */
+    blk_set_disable_request_queuing(exp->blk, true);
+
    return 0;
 err:
    vduse_dev_destroy(vblk_exp->dev);
@@ -344,6 +392,9 @@ static void vduse_blk_exp_delete(BlockExport *exp)
    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
    int ret;

+    assert(qatomic_read(&vblk_exp->inflight) == 0);
+
+    vduse_blk_detach_ctx(vblk_exp);
    blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                    vblk_exp);
    ret = vduse_dev_destroy(vblk_exp->dev);
@@ -354,13 +405,12 @@ static void vduse_blk_exp_delete(BlockExport *exp)
    g_free(vblk_exp->handler.serial);
 }

+/* Called with exp->ctx acquired */
 static void vduse_blk_exp_request_shutdown(BlockExport *exp)
 {
    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);

-    aio_context_acquire(vblk_exp->export.ctx);
-    vduse_blk_detach_ctx(vblk_exp);
-    aio_context_acquire(vblk_exp->export.ctx);
+    vduse_blk_stop_virtqueues(vblk_exp);
 }

 const BlockExportDriver blk_exp_vduse_blk = {
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -50,7 +50,10 @@ static void vu_blk_req_complete(VuBlkReq *req, size_t in_len)
    free(req);
 }

-/* Called with server refcount increased, must decrease before returning */
+/*
+ * Called with server in_flight counter increased, must decrease before
+ * returning.
+ */
 static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
 {
    VuBlkReq *req = opaque;
@@ -68,12 +71,12 @@ static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
                                    in_num, out_num);
    if (in_len < 0) {
        free(req);
-        vhost_user_server_unref(server);
+        vhost_user_server_dec_in_flight(server);
        return;
    }

    vu_blk_req_complete(req, in_len);
-    vhost_user_server_unref(server);
+    vhost_user_server_dec_in_flight(server);
 }

 static void vu_blk_process_vq(VuDev *vu_dev, int idx)
@@ -95,7 +98,7 @@ static void vu_blk_process_vq(VuDev *vu_dev, int idx)
        Coroutine *co =
            qemu_coroutine_create(vu_blk_virtio_process_req, req);

-        vhost_user_server_ref(server);
+        vhost_user_server_inc_in_flight(server);
        qemu_coroutine_enter(co);
    }
 }
@@ -209,15 +212,21 @@ static void blk_aio_attached(AioContext *ctx, void *opaque)
 {
    VuBlkExport *vexp = opaque;

+    /*
+     * The actual attach will happen in vu_blk_drained_end() and we just
+     * restore ctx here.
+     */
    vexp->export.ctx = ctx;
-    vhost_user_server_attach_aio_context(&vexp->vu_server, ctx);
 }

 static void blk_aio_detach(void *opaque)
 {
    VuBlkExport *vexp = opaque;

-    vhost_user_server_detach_aio_context(&vexp->vu_server);
+    /*
+     * The actual detach already happened in vu_blk_drained_begin() but from
+     * this point on we must not access ctx anymore.
+     */
    vexp->export.ctx = NULL;
 }

@@ -269,7 +278,38 @@ static void vu_blk_exp_resize(void *opaque)
    vu_config_change_msg(&vexp->vu_server.vu_dev);
 }

+/* Called with vexp->export.ctx acquired */
+static void vu_blk_drained_begin(void *opaque)
+{
+    VuBlkExport *vexp = opaque;
+
+    vhost_user_server_detach_aio_context(&vexp->vu_server);
+}
+
+/* Called with vexp->export.blk AioContext acquired */
+static void vu_blk_drained_end(void *opaque)
+{
+    VuBlkExport *vexp = opaque;
+
+    vhost_user_server_attach_aio_context(&vexp->vu_server, vexp->export.ctx);
+}
+
+/*
+ * Ensures that bdrv_drained_begin() waits until in-flight requests complete.
+ *
+ * Called with vexp->export.ctx acquired.
+ */
+static bool vu_blk_drained_poll(void *opaque)
+{
+    VuBlkExport *vexp = opaque;
+
+    return vhost_user_server_has_in_flight(&vexp->vu_server);
+}
+
 static const BlockDevOps vu_blk_dev_ops = {
+    .drained_begin = vu_blk_drained_begin,
+    .drained_end   = vu_blk_drained_end,
+    .drained_poll  = vu_blk_drained_poll,
    .resize_cb = vu_blk_exp_resize,
 };

--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -68,6 +68,9 @@
 #include <sys/param.h>
 #include <sys/syscall.h>
 #include <sys/vfs.h>
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
@@ -157,6 +160,7 @@ typedef struct BDRVRawState {
    bool has_write_zeroes:1;
    bool use_linux_aio:1;
    bool use_linux_io_uring:1;
+    int64_t *offset; /* offset of zone append operation */
    int page_cache_inconsistent; /* errno from fdatasync failure */
    bool has_fallocate;
    bool needs_alignment;
@@ -216,6 +220,13 @@ typedef struct RawPosixAIOData {
            PreallocMode prealloc;
            Error **errp;
        } truncate;
+        struct {
+            unsigned int *nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report;
+        struct {
+            unsigned long op;
+        } zone_mgmt;
    };
 } RawPosixAIOData;

@@ -766,6 +777,18 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
            goto fail;
        }
    }
+#ifdef CONFIG_BLKZONED
+    /*
+     * The kernel page cache does not reliably work for writes to SWR zones
+     * of zoned block device because it can not guarantee the order of writes.
+     */
+    if ((bs->bl.zoned != BLK_Z_NONE) &&
+        (!(s->open_flags & O_DIRECT))) {
+        error_setg(errp, "The driver supports zoned devices, and it requires "
+                         "cache.direct=on, which was not specified.");
+        return -EINVAL; /* No host kernel page cache */
+    }
+#endif

    if (S_ISBLK(st.st_mode)) {
 #ifdef __linux__
@@ -1202,15 +1225,91 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
 #endif
 }

+/*
+ * Get a sysfs attribute value as character string.
+ */
+#ifdef CONFIG_LINUX
+static int get_sysfs_str_val(struct stat *st, const char *attribute,
+                             char **val) {
+    g_autofree char *sysfspath = NULL;
+    int ret;
+    size_t len;
+
+    if (!S_ISBLK(st->st_mode)) {
+        return -ENOTSUP;
+    }
+
+    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+                                major(st->st_rdev), minor(st->st_rdev),
+                                attribute);
+    ret = g_file_get_contents(sysfspath, val, &len, NULL);
+    if (ret == -1) {
+        return -ENOENT;
+    }
+
+    /* The file is ended with '\n' */
+    char *p;
+    p = *val;
+    if (*(p + len - 1) == '\n') {
+        *(p + len - 1) = '\0';
+    }
+    return ret;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
+{
+    g_autofree char *val = NULL;
+    int ret;
+
+    ret = get_sysfs_str_val(st, "zoned", &val);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (strcmp(val, "host-managed") == 0) {
+        *zoned = BLK_Z_HM;
+    } else if (strcmp(val, "host-aware") == 0) {
+        *zoned = BLK_Z_HA;
+    } else if (strcmp(val, "none") == 0) {
+        *zoned = BLK_Z_NONE;
+    } else {
+        return -ENOTSUP;
+    }
+    return 0;
+}
+#endif /* defined(CONFIG_BLKZONED) */
+
+/*
+ * Get a sysfs attribute value as a long integer.
+ */
+#ifdef CONFIG_LINUX
+static long get_sysfs_long_val(struct stat *st, const char *attribute)
+{
+    g_autofree char *str = NULL;
+    const char *end;
+    long val;
+    int ret;
+
+    ret = get_sysfs_str_val(st, attribute, &str);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* The file is ended with '\n', pass 'end' to accept that. */
+    ret = qemu_strtol(str, &end, 10, &val);
+    if (ret == 0 && end && *end == '\0') {
+        ret = val;
+    }
+    return ret;
+}
+#endif
+
 static int hdev_get_max_segments(int fd, struct stat *st)
 {
 #ifdef CONFIG_LINUX
-    char buf[32];
-    const char *end;
-    char *sysfspath = NULL;
    int ret;
-    int sysfd = -1;
-    long max_segments;

    if (S_ISCHR(st->st_mode)) {
        if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
@@ -1218,44 +1317,176 @@ static int hdev_get_max_segments(int fd, struct stat *st)
        }
        return -ENOTSUP;
    }
-
-    if (!S_ISBLK(st->st_mode)) {
-        return -ENOTSUP;
-    }
-
-    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-                                major(st->st_rdev), minor(st->st_rdev));
-    sysfd = open(sysfspath, O_RDONLY);
-    if (sysfd == -1) {
-        ret = -errno;
-        goto out;
-    }
-    ret = RETRY_ON_EINTR(read(sysfd, buf, sizeof(buf) - 1));
-    if (ret < 0) {
-        ret = -errno;
-        goto out;
-    } else if (ret == 0) {
-        ret = -EIO;
-        goto out;
-    }
-    buf[ret] = 0;
-    /* The file is ended with '\n', pass 'end' to accept that. */
-    ret = qemu_strtol(buf, &end, 10, &max_segments);
-    if (ret == 0 && end && *end == '\n') {
-        ret = max_segments;
-    }
-
-out:
-    if (sysfd != -1) {
-        close(sysfd);
-    }
-    g_free(sysfspath);
-    return ret;
+    return get_sysfs_long_val(st, "max_segments");
 #else
    return -ENOTSUP;
 #endif
 }

+#if defined(CONFIG_BLKZONED)
+/*
+ * If the reset_all flag is true, then the wps of zone whose state is
+ * not readonly or offline should be all reset to the start sector.
+ * Else, take the real wp of the device.
+ */
+static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+                        unsigned int nrz, bool reset_all)
+{
+    struct blk_zone *blkz;
+    size_t rep_size;
+    uint64_t sector = offset >> BDRV_SECTOR_BITS;
+    BlockZoneWps *wps = bs->wps;
+    unsigned int j = offset / bs->bl.zone_size;
+    unsigned int n = 0, i = 0;
+    int ret;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+
+    rep = g_malloc(rep_size);
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                    fd, offset, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
+            /*
+             * The wp tracking cares only about sequential writes required and
+             * sequential write preferred zones so that the wp can advance to
+             * the right location.
+             * Use the most significant bit of the wp location to indicate the
+             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
+             */
+            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+                wps->wp[j] |= 1ULL << 63;
+            } else {
+                switch(blkz[i].cond) {
+                case BLK_ZONE_COND_FULL:
+                case BLK_ZONE_COND_READONLY:
+                    /* Zone not writable */
+                    wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
+                    break;
+                case BLK_ZONE_COND_OFFLINE:
+                    /* Zone not writable nor readable */
+                    wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
+                    break;
+                default:
+                    if (reset_all) {
+                        wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
+                    } else {
+                        wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
+                    }
+                    break;
+                }
+            }
+        }
+        sector = blkz[i - 1].start + blkz[i - 1].len;
+    }
+
+    return 0;
+}
+
+static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+                            unsigned int nrz)
+{
+    if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) {
+        error_report("update zone wp failed");
+    }
+}
+
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+                                     Error **errp)
+{
+    BDRVRawState *s = bs->opaque;
+    BlockZoneModel zoned;
+    int ret;
+
+    bs->bl.zoned = BLK_Z_NONE;
+
+    ret = get_sysfs_zoned_model(st, &zoned);
+    if (ret < 0 || zoned == BLK_Z_NONE) {
+        return;
+    }
+    bs->bl.zoned = zoned;
+
+    ret = get_sysfs_long_val(st, "max_open_zones");
+    if (ret >= 0) {
+        bs->bl.max_open_zones = ret;
+    }
+
+    ret = get_sysfs_long_val(st, "max_active_zones");
+    if (ret >= 0) {
+        bs->bl.max_active_zones = ret;
+    }
+
+    /*
+     * The zoned device must at least have zone size and nr_zones fields.
+     */
+    ret = get_sysfs_long_val(st, "chunk_sectors");
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
+                                     "sysfs attribute");
+        return;
+    } else if (!ret) {
+        error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
+        return;
+    }
+    bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
+
+    ret = get_sysfs_long_val(st, "nr_zones");
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Unable to read nr_zones "
+                                     "sysfs attribute");
+        return;
+    } else if (!ret) {
+        error_setg(errp, "Read 0 from nr_zones sysfs attribute");
+        return;
+    }
+    bs->bl.nr_zones = ret;
+
+    ret = get_sysfs_long_val(st, "zone_append_max_bytes");
+    if (ret > 0) {
+        bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
+    }
+
+    ret = get_sysfs_long_val(st, "physical_block_size");
+    if (ret >= 0) {
+        bs->bl.write_granularity = ret;
+    }
+
+    /* The refresh_limits() function can be called multiple times. */
+    g_free(bs->wps);
+    bs->wps = g_malloc(sizeof(BlockZoneWps) +
+            sizeof(int64_t) * bs->bl.nr_zones);
+    ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "report wps failed");
+        bs->wps = NULL;
+        return;
+    }
+    qemu_co_mutex_init(&bs->wps->colock);
+}
+#else /* !defined(CONFIG_BLKZONED) */
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+                                     Error **errp)
+{
+    bs->bl.zoned = BLK_Z_NONE;
+}
+#endif /* !defined(CONFIG_BLKZONED) */
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVRawState *s = bs->opaque;
@@ -1297,6 +1528,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
            bs->bl.max_hw_iov = ret;
        }
    }
+
+    raw_refresh_zoned_limits(bs, &st, errp);
 }

 static int check_for_dasd(int fd)
@@ -1320,9 +1553,12 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
    BDRVRawState *s = bs->opaque;
    int ret;

-    /* If DASD, get blocksizes */
+    /* If DASD or zoned devices, get blocksizes */
    if (check_for_dasd(s->fd) < 0) {
-        return -ENOTSUP;
+        /* zoned devices are not DASD */
+        if (bs->bl.zoned == BLK_Z_NONE) {
+            return -ENOTSUP;
+        }
    }
    ret = probe_logical_blocksize(s->fd, &bsz->log);
    if (ret < 0) {
@@ -1463,7 +1699,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
    ssize_t len;

    len = RETRY_ON_EINTR(
-        (aiocb->aio_type & QEMU_AIO_WRITE) ?
+        (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
            qemu_pwritev(aiocb->aio_fildes,
                           aiocb->io.iov,
                           aiocb->io.niov,
@@ -1492,7 +1728,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
    ssize_t len;

    while (offset < aiocb->aio_nbytes) {
-        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
            len = pwrite(aiocb->aio_fildes,
                         (const char *)buf + offset,
                         aiocb->aio_nbytes - offset,
@@ -1585,7 +1821,7 @@ static int handle_aiocb_rw(void *opaque)
    }

    nbytes = handle_aiocb_rw_linear(aiocb, buf);
-    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
        char *p = buf;
        size_t count = aiocb->aio_nbytes, copy;
        int i;
@@ -1790,6 +2026,147 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
 }
 #endif

+/*
+ * parse_zone - Fill a zone descriptor
+ */
+#if defined(CONFIG_BLKZONED)
+static inline int parse_zone(struct BlockZoneDescriptor *zone,
+                              const struct blk_zone *blkz) {
+    zone->start = blkz->start << BDRV_SECTOR_BITS;
+    zone->length = blkz->len << BDRV_SECTOR_BITS;
+    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
+
+#ifdef HAVE_BLK_ZONE_REP_CAPACITY
+    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
+#else
+    zone->cap = blkz->len << BDRV_SECTOR_BITS;
+#endif
+
+    switch (blkz->type) {
+    case BLK_ZONE_TYPE_SEQWRITE_REQ:
+        zone->type = BLK_ZT_SWR;
+        break;
+    case BLK_ZONE_TYPE_SEQWRITE_PREF:
+        zone->type = BLK_ZT_SWP;
+        break;
+    case BLK_ZONE_TYPE_CONVENTIONAL:
+        zone->type = BLK_ZT_CONV;
+        break;
+    default:
+        error_report("Unsupported zone type: 0x%x", blkz->type);
+        return -ENOTSUP;
+    }
+
+    switch (blkz->cond) {
+    case BLK_ZONE_COND_NOT_WP:
+        zone->state = BLK_ZS_NOT_WP;
+        break;
+    case BLK_ZONE_COND_EMPTY:
+        zone->state = BLK_ZS_EMPTY;
+        break;
+    case BLK_ZONE_COND_IMP_OPEN:
+        zone->state = BLK_ZS_IOPEN;
+        break;
+    case BLK_ZONE_COND_EXP_OPEN:
+        zone->state = BLK_ZS_EOPEN;
+        break;
+    case BLK_ZONE_COND_CLOSED:
+        zone->state = BLK_ZS_CLOSED;
+        break;
+    case BLK_ZONE_COND_READONLY:
+        zone->state = BLK_ZS_RDONLY;
+        break;
+    case BLK_ZONE_COND_FULL:
+        zone->state = BLK_ZS_FULL;
+        break;
+    case BLK_ZONE_COND_OFFLINE:
+        zone->state = BLK_ZS_OFFLINE;
+        break;
+    default:
+        error_report("Unsupported zone state: 0x%x", blkz->cond);
+        return -ENOTSUP;
+    }
+    return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int handle_aiocb_zone_report(void *opaque)
+{
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    unsigned int *nr_zones = aiocb->zone_report.nr_zones;
+    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+    /* zoned block devices use 512-byte sectors */
+    uint64_t sector = aiocb->aio_offset / 512;
+
+    struct blk_zone *blkz;
+    size_t rep_size;
+    unsigned int nrz;
+    int ret;
+    unsigned int n = 0, i = 0;
+
+    nrz = *nr_zones;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+    rep = g_malloc(rep_size);
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                         fd, sector, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            ret = parse_zone(&zones[n], &blkz[i]);
+            if (ret != 0) {
+                return ret;
+            }
+
+            /* The next report should start after the last zone reported */
+            sector = blkz[i].start + blkz[i].len;
+        }
+    }
+
+    *nr_zones = n;
+    return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int handle_aiocb_zone_mgmt(void *opaque)
+{
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    uint64_t sector = aiocb->aio_offset / 512;
+    int64_t nr_sectors = aiocb->aio_nbytes / 512;
+    struct blk_zone_range range;
+    int ret;
+
+    /* Execute the operation */
+    range.sector = sector;
+    range.nr_sectors = nr_sectors;
+    do {
+        ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
+    } while (ret != 0 && errno == EINTR);
+
+    return ret < 0 ? -errno : ret;
+}
+#endif
+
 static int handle_aiocb_copy_range(void *opaque)
 {
    RawPosixAIOData *aiocb = opaque;
@@ -2072,9 +2449,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
 {
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData acb;
+    int ret;

    if (fd_open(bs) < 0)
        return -EIO;
+#if defined(CONFIG_BLKZONED)
+    if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) {
+        qemu_co_mutex_lock(&bs->wps->colock);
+        if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
+            int index = offset / bs->bl.zone_size;
+            offset = bs->wps->wp[index];
+        }
+    }
+#endif

    /*
     * When using O_DIRECT, the request must be aligned to be able to use
@@ -2087,12 +2474,15 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
 #ifdef CONFIG_LINUX_IO_URING
    } else if (s->use_linux_io_uring) {
        assert(qiov->size == bytes);
-        return luring_co_submit(bs, s->fd, offset, qiov, type);
+        ret = luring_co_submit(bs, s->fd, offset, qiov, type);
+        goto out;
 #endif
 #ifdef CONFIG_LINUX_AIO
    } else if (s->use_linux_aio) {
        assert(qiov->size == bytes);
-        return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch);
+        ret = laio_co_submit(s->fd, offset, qiov, type,
+                              s->aio_max_batch);
+        goto out;
 #endif
    }

@@ -2109,7 +2499,41 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
    };

    assert(qiov->size == bytes);
-    return raw_thread_pool_submit(handle_aiocb_rw, &acb);
+    ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+    goto out; /* Avoid the compiler err of unused label */
+
+out:
+#if defined(CONFIG_BLKZONED)
+{
+    BlockZoneWps *wps = bs->wps;
+    if (ret == 0) {
+        if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
+            && wps && bs->bl.zone_size) {
+            uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
+            if (!BDRV_ZT_IS_CONV(*wp)) {
+                if (type & QEMU_AIO_ZONE_APPEND) {
+                    *s->offset = *wp;
+                    trace_zbd_zone_append_complete(bs, *s->offset
+                        >> BDRV_SECTOR_BITS);
+                }
+                /* Advance the wp if needed */
+                if (offset + bytes > *wp) {
+                    *wp = offset + bytes;
+                }
+            }
+        }
+    } else {
+        if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
+            update_zones_wp(bs, s->fd, 0, 1);
+        }
+    }
+
+    if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) {
+        qemu_co_mutex_unlock(&wps->colock);
+    }
+}
+#endif
+    return ret;
 }

 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
@@ -2126,36 +2550,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
 }

-static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
-{
-    BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-    if (s->use_linux_aio) {
-        laio_io_plug();
-    }
-#endif
-#ifdef CONFIG_LINUX_IO_URING
-    if (s->use_linux_io_uring) {
-        luring_io_plug();
-    }
-#endif
-}
-
-static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-    if (s->use_linux_aio) {
-        laio_io_unplug(s->aio_max_batch);
-    }
-#endif
-#ifdef CONFIG_LINUX_IO_URING
-    if (s->use_linux_io_uring) {
-        luring_io_unplug();
-    }
-#endif
-}
-
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
@@ -2212,6 +2606,9 @@ static void raw_close(BlockDriverState *bs)
    BDRVRawState *s = bs->opaque;

    if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+        g_free(bs->wps);
+#endif
        qemu_close(s->fd);
        s->fd = -1;
    }
@@ -2969,6 +3366,171 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
    }
 }

+/*
+ * zone report - Get a zone block device's information in the form
+ * of an array of zone descriptors.
+ * zones is an array of zone descriptors to hold zone information on reply;
+ * offset can be any byte within the entire size of the device;
+ * nr_zones is the maxium number of sectors the command should operate on.
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                           unsigned int *nr_zones,
+                                           BlockZoneDescriptor *zones) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb = (RawPosixAIOData) {
+        .bs         = bs,
+        .aio_fildes = s->fd,
+        .aio_type   = QEMU_AIO_ZONE_REPORT,
+        .aio_offset = offset,
+        .zone_report    = {
+            .nr_zones       = nr_zones,
+            .zones          = zones,
+        },
+    };
+
+    trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
+    return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
+}
+#endif
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+        int64_t offset, int64_t len) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+    int64_t zone_size, zone_size_mask;
+    const char *op_name;
+    unsigned long zo;
+    int ret;
+    BlockZoneWps *wps = bs->wps;
+    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+
+    zone_size = bs->bl.zone_size;
+    zone_size_mask = zone_size - 1;
+    if (offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId64 "", offset / 512, zone_size / 512);
+        return -EINVAL;
+    }
+
+    if (((offset + len) < capacity && len & zone_size_mask) ||
+        offset + len > capacity) {
+        error_report("number of sectors %" PRId64 " is not aligned to zone size"
+                      " %" PRId64 "", len / 512, zone_size / 512);
+        return -EINVAL;
+    }
+
+    uint32_t i = offset / bs->bl.zone_size;
+    uint32_t nrz = len / bs->bl.zone_size;
+    uint64_t *wp = &wps->wp[i];
+    if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
+        error_report("zone mgmt operations are not allowed for conventional zones");
+        return -EIO;
+    }
+
+    switch (op) {
+    case BLK_ZO_OPEN:
+        op_name = "BLKOPENZONE";
+        zo = BLKOPENZONE;
+        break;
+    case BLK_ZO_CLOSE:
+        op_name = "BLKCLOSEZONE";
+        zo = BLKCLOSEZONE;
+        break;
+    case BLK_ZO_FINISH:
+        op_name = "BLKFINISHZONE";
+        zo = BLKFINISHZONE;
+        break;
+    case BLK_ZO_RESET:
+        op_name = "BLKRESETZONE";
+        zo = BLKRESETZONE;
+        break;
+    default:
+        error_report("Unsupported zone op: 0x%x", op);
+        return -ENOTSUP;
+    }
+
+    acb = (RawPosixAIOData) {
+        .bs             = bs,
+        .aio_fildes     = s->fd,
+        .aio_type       = QEMU_AIO_ZONE_MGMT,
+        .aio_offset     = offset,
+        .aio_nbytes     = len,
+        .zone_mgmt  = {
+            .op = zo,
+        },
+    };
+
+    trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
+                        len >> BDRV_SECTOR_BITS);
+    ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
+    if (ret != 0) {
+        update_zones_wp(bs, s->fd, offset, i);
+        error_report("ioctl %s failed %d", op_name, ret);
+        return ret;
+    }
+
+    if (zo == BLKRESETZONE && len == capacity) {
+        ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
+        if (ret < 0) {
+            error_report("reporting single wp failed");
+            return ret;
+        }
+    } else if (zo == BLKRESETZONE) {
+        for (unsigned int j = 0; j < nrz; ++j) {
+            wp[j] = offset + j * zone_size;
+        }
+    } else if (zo == BLKFINISHZONE) {
+        for (unsigned int j = 0; j < nrz; ++j) {
+            /* The zoned device allows the last zone smaller that the
+             * zone size. */
+            wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
+        }
+    }
+
+    return ret;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+                                           int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags) {
+    assert(flags == 0);
+    int64_t zone_size_mask = bs->bl.zone_size - 1;
+    int64_t iov_len = 0;
+    int64_t len = 0;
+    BDRVRawState *s = bs->opaque;
+    s->offset = offset;
+
+    if (*offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
+        return -EINVAL;
+    }
+
+    int64_t wg = bs->bl.write_granularity;
+    int64_t wg_mask = wg - 1;
+    for (int i = 0; i < qiov->niov; i++) {
+        iov_len = qiov->iov[i].iov_len;
+        if (iov_len & wg_mask) {
+            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
+                         "block size %" PRId64 "", i, iov_len, wg);
+            return -EINVAL;
+        }
+        len += iov_len;
+    }
+
+    trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
+    return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+}
+#endif
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                bool blkdev)
@@ -3332,8 +3894,6 @@ BlockDriver bdrv_file = {
    .bdrv_co_copy_range_from = raw_co_copy_range_from,
    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
    .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,

    .bdrv_co_truncate                   = raw_co_truncate,
@@ -3704,8 +4264,6 @@ static BlockDriver bdrv_host_device = {
    .bdrv_co_copy_range_from = raw_co_copy_range_from,
    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
    .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,

    .bdrv_co_truncate                   = raw_co_truncate,
@@ -3724,6 +4282,14 @@ static BlockDriver bdrv_host_device = {
 #ifdef __linux__
    .bdrv_co_ioctl          = hdev_co_ioctl,
 #endif
+
+    /* zoned device */
+#if defined(CONFIG_BLKZONED)
+    /* zone management operations */
+    .bdrv_co_zone_report = raw_co_zone_report,
+    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+    .bdrv_co_zone_append = raw_co_zone_append,
+#endif
 };

 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -3834,8 +4400,6 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
    .bdrv_refresh_limits    = cdrom_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,

    .bdrv_co_truncate                   = raw_co_truncate,
@@ -3962,8 +4526,6 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
    .bdrv_refresh_limits    = cdrom_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,

    .bdrv_co_truncate                   = raw_co_truncate,
--- a/block/graph-lock.c
+++ b/block/graph-lock.c
@@ -30,8 +30,10 @@ BdrvGraphLock graph_lock;
 /* Protects the list of aiocontext and orphaned_reader_count */
 static QemuMutex aio_context_list_lock;

+#if 0
 /* Written and read with atomic operations. */
 static int has_writer;
+#endif

 /*
 * A reader coroutine could move from an AioContext to another.
@@ -88,6 +90,7 @@ void unregister_aiocontext(AioContext *ctx)
    g_free(ctx->bdrv_graph);
 }

+#if 0
 static uint32_t reader_count(void)
 {
    BdrvGraphRWlock *brdv_graph;
@@ -105,10 +108,17 @@ static uint32_t reader_count(void)
    assert((int32_t)rd >= 0);
    return rd;
 }
+#endif

 void bdrv_graph_wrlock(void)
 {
    GLOBAL_STATE_CODE();
+    /*
+     * TODO Some callers hold an AioContext lock when this is called, which
+     * causes deadlocks. Reenable once the AioContext locking is cleaned up (or
+     * AioContext locks are gone).
+     */
+#if 0
    assert(!qatomic_read(&has_writer));

    /* Make sure that constantly arriving new I/O doesn't cause starvation */
@@ -139,11 +149,13 @@ void bdrv_graph_wrlock(void)
    } while (reader_count() >= 1);

    bdrv_drain_all_end();
+#endif
 }

 void bdrv_graph_wrunlock(void)
 {
    GLOBAL_STATE_CODE();
+#if 0
    QEMU_LOCK_GUARD(&aio_context_list_lock);
    assert(qatomic_read(&has_writer));

@@ -155,18 +167,16 @@ void bdrv_graph_wrunlock(void)

    /* Wake up all coroutine that are waiting to read the graph */
    qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
+#endif
 }

 void coroutine_fn bdrv_graph_co_rdlock(void)
 {
+    /* TODO Reenable when wrlock is reenabled */
+#if 0
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

-    /* Do not lock if in main thread */
-    if (qemu_in_main_thread()) {
-        return;
-    }
-
    for (;;) {
        qatomic_set(&bdrv_graph->reader_count,
                    bdrv_graph->reader_count + 1);
@@ -223,18 +233,15 @@ void coroutine_fn bdrv_graph_co_rdlock(void)
            qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
        }
    }
+#endif
 }

 void coroutine_fn bdrv_graph_co_rdunlock(void)
 {
+#if 0
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

-    /* Do not lock if in main thread */
-    if (qemu_in_main_thread()) {
-        return;
-    }
-
    qatomic_store_release(&bdrv_graph->reader_count,
                          bdrv_graph->reader_count - 1);
    /* make sure writer sees reader_count before we check has_writer */
@@ -249,6 +256,7 @@ void coroutine_fn bdrv_graph_co_rdunlock(void)
    if (qatomic_read(&has_writer)) {
        aio_wait_kick();
    }
+#endif
 }

 void bdrv_graph_rdlock_main_loop(void)
@@ -265,11 +273,20 @@ void bdrv_graph_rdunlock_main_loop(void)

 void assert_bdrv_graph_readable(void)
 {
+    /* reader_count() is slow due to aio_context_list_lock lock contention */
+    /* TODO Reenable when wrlock is reenabled */
+#if 0
+#ifdef CONFIG_DEBUG_GRAPH_LOCK
    assert(qemu_in_main_thread() || reader_count());
+#endif
+#endif
 }

 void assert_bdrv_graph_writable(void)
 {
    assert(qemu_in_main_thread());
+    /* TODO Reenable when wrlock is reenabled */
+#if 0
    assert(qatomic_read(&has_writer));
+#endif
 }
--- a/block/io.c
+++ b/block/io.c
@@ -60,7 +60,7 @@ static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)

 void bdrv_parent_drained_end_single(BdrvChild *c)
 {
-    IO_OR_GS_CODE();
+    GLOBAL_STATE_CODE();

    assert(c->quiesced_parent);
    c->quiesced_parent = false;
@@ -108,7 +108,7 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,

 void bdrv_parent_drained_begin_single(BdrvChild *c)
 {
-    IO_OR_GS_CODE();
+    GLOBAL_STATE_CODE();

    assert(!c->quiesced_parent);
    c->quiesced_parent = true;
@@ -160,7 +160,6 @@ void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
    bool have_limits;

    GLOBAL_STATE_CODE();
-    assume_graph_lock(); /* FIXME */

    if (tran) {
        BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
@@ -248,7 +247,7 @@ typedef struct {
 bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent,
                     bool ignore_bds_parents)
 {
-    IO_OR_GS_CODE();
+    GLOBAL_STATE_CODE();

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
@@ -335,7 +334,8 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
    if (ctx != co_ctx) {
        aio_context_release(ctx);
    }
-    replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data);
+    replay_bh_schedule_oneshot_event(qemu_get_aio_context(),
+                                     bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
@@ -358,9 +358,10 @@ static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
        return;
    }

+    GLOBAL_STATE_CODE();
+
    /* Stop things in parent-to-child order */
    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
-        aio_disable_external(bdrv_get_aio_context(bs));
        bdrv_parent_drained_begin(bs, parent);
        if (bs->drv && bs->drv->bdrv_drain_begin) {
            bs->drv->bdrv_drain_begin(bs);
@@ -400,11 +401,14 @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
 {
    int old_quiesce_counter;

+    IO_OR_GS_CODE();
+
    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, parent, false);
        return;
    }
    assert(bs->quiesce_counter > 0);
+    GLOBAL_STATE_CODE();

    /* Re-enable things in child-to-parent order */
    old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
@@ -413,7 +417,6 @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
            bs->drv->bdrv_drain_end(bs);
        }
        bdrv_parent_drained_end(bs, parent);
-        aio_enable_external(bdrv_get_aio_context(bs));
    }
 }

@@ -727,10 +730,9 @@ BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
 /**
 * Round a region to cluster boundaries
 */
-void coroutine_fn bdrv_round_to_clusters(BlockDriverState *bs,
-                            int64_t offset, int64_t bytes,
-                            int64_t *cluster_offset,
-                            int64_t *cluster_bytes)
+void coroutine_fn GRAPH_RDLOCK
+bdrv_round_to_clusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                       int64_t *cluster_offset, int64_t *cluster_bytes)
 {
    BlockDriverInfo bdi;
    IO_CODE();
@@ -744,7 +746,7 @@ void coroutine_fn bdrv_round_to_clusters(BlockDriverState *bs,
    }
 }

-static coroutine_fn int bdrv_get_cluster_size(BlockDriverState *bs)
+static int coroutine_fn GRAPH_RDLOCK bdrv_get_cluster_size(BlockDriverState *bs)
 {
    BlockDriverInfo bdi;
    int ret;
@@ -1800,7 +1802,7 @@ fail:
    return ret;
 }

-static inline int coroutine_fn
+static inline int coroutine_fn GRAPH_RDLOCK
 bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
                          BdrvTrackedRequest *req, int flags)
 {
@@ -3115,6 +3117,74 @@ out:
    return co.ret;
 }

+int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                        unsigned int *nr_zones,
+                        BlockZoneDescriptor *zones)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_report || bs->bl.zoned == BLK_Z_NONE) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+        int64_t offset, int64_t len)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_mgmt || bs->bl.zoned == BLK_Z_NONE) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                        QEMUIOVector *qiov,
+                        BdrvRequestFlags flags)
+{
+    int ret;
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
    IO_CODE();
@@ -3153,43 +3223,6 @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
    return mem;
 }

-void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs)
-{
-    BdrvChild *child;
-    IO_CODE();
-    assert_bdrv_graph_readable();
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_co_io_plug(child->bs);
-    }
-
-    if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
-        BlockDriver *drv = bs->drv;
-        if (drv && drv->bdrv_co_io_plug) {
-            drv->bdrv_co_io_plug(bs);
-        }
-    }
-}
-
-void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs)
-{
-    BdrvChild *child;
-    IO_CODE();
-    assert_bdrv_graph_readable();
-
-    assert(bs->io_plugged);
-    if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
-        BlockDriver *drv = bs->drv;
-        if (drv && drv->bdrv_co_io_unplug) {
-            drv->bdrv_co_io_unplug(bs);
-        }
-    }
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_co_io_unplug(child->bs);
-    }
-}
-
 /* Helper that undoes bdrv_register_buf() when it fails partway through */
 static void GRAPH_RDLOCK
 bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size,
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -16,6 +16,7 @@
 #include "block/raw-aio.h"
 #include "qemu/coroutine.h"
 #include "qapi/error.h"
+#include "sysemu/block-backend.h"
 #include "trace.h"

 /* Only used for assertions.  */
@@ -41,7 +42,6 @@ typedef struct LuringAIOCB {
 } LuringAIOCB;

 typedef struct LuringQueue {
-    int plugged;
    unsigned int in_queue;
    unsigned int in_flight;
    bool blocked;
@@ -267,7 +267,7 @@ static void luring_process_completions_and_submit(LuringState *s)
 {
    luring_process_completions(s);

-    if (!s->io_q.plugged && s->io_q.in_queue > 0) {
+    if (s->io_q.in_queue > 0) {
        ioq_submit(s);
    }
 }
@@ -301,29 +301,17 @@ static void qemu_luring_poll_ready(void *opaque)
 static void ioq_init(LuringQueue *io_q)
 {
    QSIMPLEQ_INIT(&io_q->submit_queue);
-    io_q->plugged = 0;
    io_q->in_queue = 0;
    io_q->in_flight = 0;
    io_q->blocked = false;
 }

-void luring_io_plug(void)
+static void luring_unplug_fn(void *opaque)
 {
-    AioContext *ctx = qemu_get_current_aio_context();
-    LuringState *s = aio_get_linux_io_uring(ctx);
-    trace_luring_io_plug(s);
-    s->io_q.plugged++;
-}
-
-void luring_io_unplug(void)
-{
-    AioContext *ctx = qemu_get_current_aio_context();
-    LuringState *s = aio_get_linux_io_uring(ctx);
-    assert(s->io_q.plugged);
-    trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged,
-                           s->io_q.in_queue, s->io_q.in_flight);
-    if (--s->io_q.plugged == 0 &&
-        !s->io_q.blocked && s->io_q.in_queue > 0) {
+    LuringState *s = opaque;
+    trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
+                           s->io_q.in_flight);
+    if (!s->io_q.blocked && s->io_q.in_queue > 0) {
        ioq_submit(s);
    }
 }
@@ -350,6 +338,10 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
                             luringcb->qiov->niov, offset);
        break;
+    case QEMU_AIO_ZONE_APPEND:
+        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
+                             luringcb->qiov->niov, offset);
+        break;
    case QEMU_AIO_READ:
        io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
                            luringcb->qiov->niov, offset);
@@ -366,14 +358,16 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,

    QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
    s->io_q.in_queue++;
-    trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged,
-                           s->io_q.in_queue, s->io_q.in_flight);
-    if (!s->io_q.blocked &&
-        (!s->io_q.plugged ||
-         s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES)) {
-        ret = ioq_submit(s);
-        trace_luring_do_submit_done(s, ret);
-        return ret;
+    trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
+                           s->io_q.in_flight);
+    if (!s->io_q.blocked) {
+        if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
+            ret = ioq_submit(s);
+            trace_luring_do_submit_done(s, ret);
+            return ret;
+        }
+
+        blk_io_plug_call(luring_unplug_fn, s);
    }
    return 0;
 }
@@ -406,7 +400,7 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,

 void luring_detach_aio_context(LuringState *s, AioContext *old_context)
 {
-    aio_set_fd_handler(old_context, s->ring.ring_fd, false,
+    aio_set_fd_handler(old_context, s->ring.ring_fd,
                       NULL, NULL, NULL, NULL, s);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
@@ -416,7 +410,7 @@ void luring_attach_aio_context(LuringState *s, AioContext *new_context)
 {
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s);
-    aio_set_fd_handler(s->aio_context, s->ring.ring_fd, false,
+    aio_set_fd_handler(s->aio_context, s->ring.ring_fd,
                       qemu_luring_completion_cb, NULL,
                       qemu_luring_poll_cb, qemu_luring_poll_ready, s);
 }
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -363,7 +363,6 @@ iscsi_set_events(IscsiLun *iscsilun)

    if (ev != iscsilun->events) {
        aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsi),
-                           false,
                           (ev & POLLIN) ? iscsi_process_read : NULL,
                           (ev & POLLOUT) ? iscsi_process_write : NULL,
                           NULL, NULL,
@@ -1540,7 +1539,7 @@ static void iscsi_detach_aio_context(BlockDriverState *bs)
    IscsiLun *iscsilun = bs->opaque;

    aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsilun->iscsi),
-                       false, NULL, NULL, NULL, NULL, NULL);
+                       NULL, NULL, NULL, NULL, NULL);
    iscsilun->events = 0;

    if (iscsilun->nop_timer) {
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -15,6 +15,7 @@
 #include "qemu/event_notifier.h"
 #include "qemu/coroutine.h"
 #include "qapi/error.h"
+#include "sysemu/block-backend.h"

 /* Only used for assertions.  */
 #include "qemu/coroutine_int.h"
@@ -46,7 +47,6 @@ struct qemu_laiocb {
 };

 typedef struct {
-    int plugged;
    unsigned int in_queue;
    unsigned int in_flight;
    bool blocked;
@@ -236,7 +236,7 @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
 {
    qemu_laio_process_completions(s);

-    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
 }
@@ -277,7 +277,6 @@ static void qemu_laio_poll_ready(EventNotifier *opaque)
 static void ioq_init(LaioQueue *io_q)
 {
    QSIMPLEQ_INIT(&io_q->pending);
-    io_q->plugged = 0;
    io_q->in_queue = 0;
    io_q->in_flight = 0;
    io_q->blocked = false;
@@ -354,31 +353,11 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
    return max_batch;
 }

-void laio_io_plug(void)
+static void laio_unplug_fn(void *opaque)
 {
-    AioContext *ctx = qemu_get_current_aio_context();
-    LinuxAioState *s = aio_get_linux_aio(ctx);
+    LinuxAioState *s = opaque;

-    s->io_q.plugged++;
-}
-
-void laio_io_unplug(uint64_t dev_max_batch)
-{
-    AioContext *ctx = qemu_get_current_aio_context();
-    LinuxAioState *s = aio_get_linux_aio(ctx);
-
-    assert(s->io_q.plugged);
-    s->io_q.plugged--;
-
-    /*
-     * Why max batch checking is performed here:
-     * Another BDS may have queued requests with a higher dev_max_batch and
-     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
-     * batch so we can honor our device's dev_max_batch.
-     */
-    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
-        (!s->io_q.plugged &&
-         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
+    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
 }
@@ -394,6 +373,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
+    case QEMU_AIO_ZONE_APPEND:
+        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
@@ -407,10 +389,12 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;
-    if (!s->io_q.blocked &&
-        (!s->io_q.plugged ||
-         s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
-        ioq_submit(s);
+    if (!s->io_q.blocked) {
+        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
+            ioq_submit(s);
+        } else {
+            blk_io_plug_call(laio_unplug_fn, s);
+        }
    }

    return 0;
@@ -443,7 +427,7 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,

 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 {
-    aio_set_event_notifier(old_context, &s->e, false, NULL, NULL, NULL);
+    aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
 }
@@ -452,7 +436,7 @@ void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
 {
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
-    aio_set_event_notifier(new_context, &s->e, false,
+    aio_set_event_notifier(new_context, &s->e,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
--- a/block/meson.build
+++ b/block/meson.build
@@ -23,6 +23,7 @@ block_ss.add(files(
  'mirror.c',
  'nbd.c',
  'null.c',
+  'plug.c',
  'qapi.c',
  'qcow2-bitmap.c',
  'qcow2-cache.c',
@@ -38,11 +39,6 @@ block_ss.add(files(
  'snapshot-access.c',
  'throttle-groups.c',
  'throttle.c',
-  'vhdx-endian.c',
-  'vhdx-log.c',
-  'vhdx.c',
-  'vmdk.c',
-  'vpc.c',
  'write-threshold.c',
 ), zstd, zlib, gnutls)

@@ -55,6 +51,19 @@ endif
 if get_option('vdi').allowed()
  block_ss.add(files('vdi.c'))
 endif
+if get_option('vhdx').allowed()
+  block_ss.add(files(
+    'vhdx-endian.c',
+    'vhdx-log.c',
+    'vhdx.c'
+  ))
+endif
+if get_option('vmdk').allowed()
+  block_ss.add(files('vmdk.c'))
+endif
+if get_option('vpc').allowed()
+  block_ss.add(files('vpc.c'))
+endif
 if get_option('cloop').allowed()
  block_ss.add(files('cloop.c'))
 endif
@@ -84,7 +93,7 @@ block_ss.add(when: 'CONFIG_WIN32', if_true: files('file-win32.c', 'win32-aio.c')
 block_ss.add(when: 'CONFIG_POSIX', if_true: [files('file-posix.c'), coref, iokit])
 block_ss.add(when: libiscsi, if_true: files('iscsi-opts.c'))
 block_ss.add(when: 'CONFIG_LINUX', if_true: files('nvme.c'))
-if not get_option('replication').disabled()
+if get_option('replication').allowed()
  block_ss.add(files('replication.c'))
 endif
 block_ss.add(when: libaio, if_true: files('linux-aio.c'))
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -270,8 +270,8 @@ static inline int64_t mirror_clip_bytes(MirrorBlockJob *s,

 /* Round offset and/or bytes to target cluster if COW is needed, and
 * return the offset of the adjusted tail against original. */
-static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
-                            uint64_t *bytes)
+static int coroutine_fn mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
+                                         uint64_t *bytes)
 {
    bool need_cow;
    int ret = 0;
@@ -471,12 +471,11 @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
    return bytes_handled;
 }

-static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
+static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
    BlockDriverState *source = s->mirror_top_bs->backing->bs;
    MirrorOp *pseudo_op;
    int64_t offset;
-    uint64_t delay_ns = 0, ret = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
@@ -576,8 +575,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_offset;
            int64_t target_bytes;
-            bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes,
-                                   &target_offset, &target_bytes);
+            WITH_GRAPH_RDLOCK_GUARD() {
+                bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes,
+                                       &target_offset, &target_bytes);
+            }
            if (target_offset == offset &&
                target_bytes == io_bytes) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
@@ -606,16 +607,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        assert(io_bytes);
        offset += io_bytes;
        nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
-        delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct);
+        block_job_ratelimit_processed_bytes(&s->common, io_bytes_acct);
    }

-    ret = delay_ns;
 fail:
    QTAILQ_REMOVE(&s->ops_in_flight, pseudo_op, next);
    qemu_co_queue_restart_all(&pseudo_op->waiting_requests);
    g_free(pseudo_op);
-
-    return ret;
 }

 static void mirror_free_init(MirrorBlockJob *s)
@@ -664,11 +662,15 @@ static int mirror_exit_common(Job *job)
    bool abort = job->ret < 0;
    int ret = 0;

+    GLOBAL_STATE_CODE();
+
    if (s->prepared) {
        return 0;
    }
    s->prepared = true;

+    aio_context_acquire(qemu_get_aio_context());
+
    mirror_top_bs = s->mirror_top_bs;
    bs_opaque = mirror_top_bs->opaque;
    src = mirror_top_bs->backing->bs;
@@ -745,7 +747,10 @@ static int mirror_exit_common(Job *job)
         * Cannot use check_to_replace_node() here, because that would
         * check for an op blocker on @to_replace, and we have our own
         * there.
+         *
+         * TODO Pull out the writer lock from bdrv_replace_node() to here
         */
+        bdrv_graph_rdlock_main_loop();
        if (bdrv_recurse_can_replace(src, to_replace)) {
            bdrv_replace_node(to_replace, target_bs, &local_err);
        } else {
@@ -754,6 +759,7 @@ static int mirror_exit_common(Job *job)
                       "would not lead to an abrupt change of visible data",
                       to_replace->node_name, target_bs->node_name);
        }
+        bdrv_graph_rdunlock_main_loop();
        bdrv_drained_end(target_bs);
        if (local_err) {
            error_report_err(local_err);
@@ -787,6 +793,8 @@ static int mirror_exit_common(Job *job)
    bdrv_unref(mirror_top_bs);
    bdrv_unref(src);

+    aio_context_release(qemu_get_aio_context());
+
    return ret;
 }

@@ -966,11 +974,13 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
+    bdrv_graph_co_rdlock();
    if (!bdrv_co_get_info(target_bs, &bdi) && bdi.cluster_size) {
        s->target_cluster_size = bdi.cluster_size;
    } else {
        s->target_cluster_size = BDRV_SECTOR_SIZE;
    }
+    bdrv_graph_co_rdunlock();
    if (backing_filename[0] && !bdrv_backing_chain_next(target_bs) &&
        s->granularity < s->target_cluster_size) {
        s->buf_size = MAX(s->buf_size, s->target_cluster_size);
@@ -1003,7 +1013,6 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
    assert(!s->dbi);
    s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap);
    for (;;) {
-        uint64_t delay_ns = 0;
        int64_t cnt, delta;
        bool should_complete;

@@ -1043,7 +1052,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
                mirror_wait_for_free_in_flight_slot(s);
                continue;
            } else if (cnt != 0) {
-                delay_ns = mirror_iteration(s);
+                mirror_iteration(s);
            }
        }

@@ -1106,12 +1115,14 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
        }

        if (job_is_ready(&s->common.job) && !should_complete) {
-            delay_ns = (s->in_flight == 0 &&
-                        cnt == 0 ? BLOCK_JOB_SLICE_TIME : 0);
+            if (s->in_flight == 0 && cnt == 0) {
+                trace_mirror_before_sleep(s, cnt, job_is_ready(&s->common.job),
+                                          BLOCK_JOB_SLICE_TIME);
+                job_sleep_ns(&s->common.job, BLOCK_JOB_SLICE_TIME);
+            }
+        } else {
+            block_job_ratelimit_sleep(&s->common);
        }
-        trace_mirror_before_sleep(s, cnt, job_is_ready(&s->common.job),
-                                  delay_ns);
-        job_sleep_ns(&s->common.job, delay_ns);
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

@@ -1416,7 +1427,7 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s,
    return op;
 }

-static void coroutine_fn active_write_settle(MirrorOp *op)
+static void coroutine_fn GRAPH_RDLOCK active_write_settle(MirrorOp *op)
 {
    uint64_t start_chunk = op->offset / op->s->granularity;
    uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes,
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -322,6 +322,7 @@ int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs,
    int ret;
    IO_CODE();

+    assert_bdrv_graph_readable();
    assert(!s->ioc);

    s->ioc = nbd_co_establish_connection(s->conn, &s->info, blocking, errp);
@@ -369,7 +370,7 @@ static bool nbd_client_connecting(BDRVNBDState *s)
 }

 /* Called with s->requests_lock taken.  */
-static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
+static void coroutine_fn GRAPH_RDLOCK nbd_reconnect_attempt(BDRVNBDState *s)
 {
    int ret;
    bool blocking = s->state == NBD_CLIENT_CONNECTING_WAIT;
@@ -480,9 +481,9 @@ static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle)
    }
 }

-static int coroutine_fn nbd_co_send_request(BlockDriverState *bs,
-                                            NBDRequest *request,
-                                            QEMUIOVector *qiov)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_co_send_request(BlockDriverState *bs, NBDRequest *request,
+                    QEMUIOVector *qiov)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    int rc, i = -1;
@@ -1171,8 +1172,9 @@ static int coroutine_fn nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
    return iter.ret;
 }

-static int coroutine_fn nbd_co_request(BlockDriverState *bs, NBDRequest *request,
-                                       QEMUIOVector *write_qiov)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_co_request(BlockDriverState *bs, NBDRequest *request,
+               QEMUIOVector *write_qiov)
 {
    int ret, request_ret;
    Error *local_err = NULL;
@@ -1208,9 +1210,9 @@ static int coroutine_fn nbd_co_request(BlockDriverState *bs, NBDRequest *request
    return ret ? ret : request_ret;
 }

-static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offset,
-                                             int64_t bytes, QEMUIOVector *qiov,
-                                             BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                     QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
    int ret, request_ret;
    Error *local_err = NULL;
@@ -1266,9 +1268,9 @@ static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offse
    return ret ? ret : request_ret;
 }

-static int coroutine_fn nbd_client_co_pwritev(BlockDriverState *bs, int64_t offset,
-                                              int64_t bytes, QEMUIOVector *qiov,
-                                              BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                      QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
@@ -1291,8 +1293,9 @@ static int coroutine_fn nbd_client_co_pwritev(BlockDriverState *bs, int64_t offs
    return nbd_co_request(bs, &request, qiov);
 }

-static int coroutine_fn nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
-                                                    int64_t bytes, BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                            BdrvRequestFlags flags)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
@@ -1326,7 +1329,7 @@ static int coroutine_fn nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_
    return nbd_co_request(bs, &request, NULL);
 }

-static int coroutine_fn nbd_client_co_flush(BlockDriverState *bs)
+static int coroutine_fn GRAPH_RDLOCK nbd_client_co_flush(BlockDriverState *bs)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = { .type = NBD_CMD_FLUSH };
@@ -1341,8 +1344,8 @@ static int coroutine_fn nbd_client_co_flush(BlockDriverState *bs)
    return nbd_co_request(bs, &request, NULL);
 }

-static int coroutine_fn nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset,
-                                               int64_t bytes)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
@@ -1361,7 +1364,7 @@ static int coroutine_fn nbd_client_co_pdiscard(BlockDriverState *bs, int64_t off
    return nbd_co_request(bs, &request, NULL);
 }

-static int coroutine_fn nbd_client_co_block_status(
+static int coroutine_fn GRAPH_RDLOCK nbd_client_co_block_status(
        BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
        int64_t *pnum, int64_t *map, BlockDriverState **file)
 {
@@ -1920,11 +1923,6 @@ fail:
    return ret;
 }

-static int coroutine_fn nbd_co_flush(BlockDriverState *bs)
-{
-    return nbd_client_co_flush(bs);
-}
-
 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
@@ -2120,7 +2118,7 @@ static BlockDriver bdrv_nbd = {
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
-    .bdrv_co_flush_to_os        = nbd_co_flush,
+    .bdrv_co_flush_to_os        = nbd_client_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_co_truncate           = nbd_co_truncate,
@@ -2148,7 +2146,7 @@ static BlockDriver bdrv_nbd_tcp = {
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
-    .bdrv_co_flush_to_os        = nbd_co_flush,
+    .bdrv_co_flush_to_os        = nbd_client_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_co_truncate           = nbd_co_truncate,
@@ -2176,7 +2174,7 @@ static BlockDriver bdrv_nbd_unix = {
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
-    .bdrv_co_flush_to_os        = nbd_co_flush,
+    .bdrv_co_flush_to_os        = nbd_client_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_co_truncate           = nbd_co_truncate,
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -195,7 +195,6 @@ static void nfs_set_events(NFSClient *client)
    int ev = nfs_which_events(client->context);
    if (ev != client->events) {
        aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context),
-                           false,
                           (ev & POLLIN) ? nfs_process_read : NULL,
                           (ev & POLLOUT) ? nfs_process_write : NULL,
                           NULL, NULL, client);
@@ -373,7 +372,7 @@ static void nfs_detach_aio_context(BlockDriverState *bs)
    NFSClient *client = bs->opaque;

    aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context),
-                       false, NULL, NULL, NULL, NULL, NULL);
+                       NULL, NULL, NULL, NULL, NULL);
    client->events = 0;
 }

@@ -391,7 +390,7 @@ static void nfs_client_close(NFSClient *client)
    if (client->context) {
        qemu_mutex_lock(&client->mutex);
        aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context),
-                           false, NULL, NULL, NULL, NULL, NULL);
+                           NULL, NULL, NULL, NULL, NULL);
        qemu_mutex_unlock(&client->mutex);
        if (client->fh) {
            nfs_close(client->context, client->fh);
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -25,6 +25,7 @@
 #include "qemu/vfio-helpers.h"
 #include "block/block-io.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "sysemu/replay.h"
 #include "trace.h"

@@ -119,7 +120,6 @@ struct BDRVNVMeState {
    int blkshift;

    uint64_t max_transfer;
-    bool plugged;

    bool supports_write_zeroes;
    bool supports_discard;
@@ -282,7 +282,7 @@ static void nvme_kick(NVMeQueuePair *q)
 {
    BDRVNVMeState *s = q->s;

-    if (s->plugged || !q->need_kick) {
+    if (!q->need_kick) {
        return;
    }
    trace_nvme_kick(s, q->index);
@@ -387,10 +387,6 @@ static bool nvme_process_completion(NVMeQueuePair *q)
    NvmeCqe *c;

    trace_nvme_process_completion(s, q->index, q->inflight);
-    if (s->plugged) {
-        trace_nvme_process_completion_queue_plugged(s, q->index);
-        return false;
-    }

    /*
     * Support re-entrancy when a request cb() function invokes aio_poll().
@@ -480,6 +476,15 @@ static void nvme_trace_command(const NvmeCmd *cmd)
    }
 }

+static void nvme_unplug_fn(void *opaque)
+{
+    NVMeQueuePair *q = opaque;
+
+    QEMU_LOCK_GUARD(&q->lock);
+    nvme_kick(q);
+    nvme_process_completion(q);
+}
+
 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                NvmeCmd *cmd, BlockCompletionFunc cb,
                                void *opaque)
@@ -496,8 +501,7 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
           q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
    q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
    q->need_kick++;
-    nvme_kick(q);
-    nvme_process_completion(q);
+    blk_io_plug_call(nvme_unplug_fn, q);
    qemu_mutex_unlock(&q->lock);
 }

@@ -862,7 +866,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
    }
    aio_set_event_notifier(bdrv_get_aio_context(bs),
                           &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
-                           false, nvme_handle_event, nvme_poll_cb,
+                           nvme_handle_event, nvme_poll_cb,
                           nvme_poll_ready);

    if (!nvme_identify(bs, namespace, errp)) {
@@ -948,7 +952,7 @@ static void nvme_close(BlockDriverState *bs)
    g_free(s->queues);
    aio_set_event_notifier(bdrv_get_aio_context(bs),
                           &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
-                           false, NULL, NULL, NULL);
+                           NULL, NULL, NULL);
    event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
    qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
                            0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
@@ -1546,7 +1550,7 @@ static void nvme_detach_aio_context(BlockDriverState *bs)

    aio_set_event_notifier(bdrv_get_aio_context(bs),
                           &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
-                           false, NULL, NULL, NULL);
+                           NULL, NULL, NULL);
 }

 static void nvme_attach_aio_context(BlockDriverState *bs,
@@ -1556,7 +1560,7 @@ static void nvme_attach_aio_context(BlockDriverState *bs,

    s->aio_context = new_context;
    aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
-                           false, nvme_handle_event, nvme_poll_cb,
+                           nvme_handle_event, nvme_poll_cb,
                           nvme_poll_ready);

    for (unsigned i = 0; i < s->queue_count; i++) {
@@ -1567,27 +1571,6 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
    }
 }

-static void coroutine_fn nvme_co_io_plug(BlockDriverState *bs)
-{
-    BDRVNVMeState *s = bs->opaque;
-    assert(!s->plugged);
-    s->plugged = true;
-}
-
-static void coroutine_fn nvme_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVNVMeState *s = bs->opaque;
-    assert(s->plugged);
-    s->plugged = false;
-    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
-        NVMeQueuePair *q = s->queues[i];
-        qemu_mutex_lock(&q->lock);
-        nvme_kick(q);
-        nvme_process_completion(q);
-        qemu_mutex_unlock(&q->lock);
-    }
-}
-
 static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
                              Error **errp)
 {
@@ -1664,9 +1647,6 @@ static BlockDriver bdrv_nvme = {
    .bdrv_detach_aio_context  = nvme_detach_aio_context,
    .bdrv_attach_aio_context  = nvme_attach_aio_context,

-    .bdrv_co_io_plug          = nvme_co_io_plug,
-    .bdrv_co_io_unplug        = nvme_co_io_unplug,
-
    .bdrv_register_buf        = nvme_register_buf,
    .bdrv_unregister_buf      = nvme_unregister_buf,
 };
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -522,8 +522,8 @@ out:
 }


-static int coroutine_fn parallels_co_create(BlockdevCreateOptions* opts,
-                                            Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+parallels_co_create(BlockdevCreateOptions* opts, Error **errp)
 {
    BlockdevCreateOptionsParallels *parallels_opts;
    BlockDriverState *bs;
@@ -613,8 +613,8 @@ static int coroutine_fn parallels_co_create(BlockdevCreateOptions* opts,

    ret = 0;
 out:
-    blk_unref(blk);
-    bdrv_unref(bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
    return ret;

 exit:
@@ -622,7 +622,7 @@ exit:
    goto out;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 parallels_co_create_opts(BlockDriver *drv, const char *filename,
                         QemuOpts *opts, Error **errp)
 {
@@ -691,7 +691,7 @@ parallels_co_create_opts(BlockDriver *drv, const char *filename,

 done:
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/block/plug.c
+++ b/block/plug.c
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Block I/O plugging
+ *
+ * Copyright Red Hat.
+ *
+ * This API defers a function call within a blk_io_plug()/blk_io_unplug()
+ * section, allowing multiple calls to batch up. This is a performance
+ * optimization that is used in the block layer to submit several I/O requests
+ * at once instead of individually:
+ *
+ *   blk_io_plug(); <-- start of plugged region
+ *   ...
+ *   blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   ...
+ *   blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once
+ *
+ * This code is actually generic and not tied to the block layer. If another
+ * subsystem needs this functionality, it could be renamed.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine-tls.h"
+#include "qemu/notify.h"
+#include "qemu/thread.h"
+#include "sysemu/block-backend.h"
+
+/* A function call that has been deferred until unplug() */
+typedef struct {
+    void (*fn)(void *);
+    void *opaque;
+} UnplugFn;
+
+/* Per-thread state */
+typedef struct {
+    unsigned count;       /* how many times has plug() been called? */
+    GArray *unplug_fns;   /* functions to call at unplug time */
+} Plug;
+
+/* Use get_ptr_plug() to fetch this thread-local value */
+QEMU_DEFINE_STATIC_CO_TLS(Plug, plug);
+
+/* Called at thread cleanup time */
+static void blk_io_plug_atexit(Notifier *n, void *value)
+{
+    Plug *plug = get_ptr_plug();
+    g_array_free(plug->unplug_fns, TRUE);
+}
+
+/* This won't involve coroutines, so use __thread */
+static __thread Notifier blk_io_plug_atexit_notifier;
+
+/**
+ * blk_io_plug_call:
+ * @fn: a function pointer to be invoked
+ * @opaque: a user-defined argument to @fn()
+ *
+ * Call @fn(@opaque) immediately if not within a blk_io_plug()/blk_io_unplug()
+ * section.
+ *
+ * Otherwise defer the call until the end of the outermost
+ * blk_io_plug()/blk_io_unplug() section in this thread. If the same
+ * @fn/@opaque pair has already been deferred, it will only be called once upon
+ * blk_io_unplug() so that accumulated calls are batched into a single call.
+ *
+ * The caller must ensure that @opaque is not freed before @fn() is invoked.
+ */
+void blk_io_plug_call(void (*fn)(void *), void *opaque)
+{
+    Plug *plug = get_ptr_plug();
+
+    /* Call immediately if we're not plugged */
+    if (plug->count == 0) {
+        fn(opaque);
+        return;
+    }
+
+    GArray *array = plug->unplug_fns;
+    if (!array) {
+        array = g_array_new(FALSE, FALSE, sizeof(UnplugFn));
+        plug->unplug_fns = array;
+        blk_io_plug_atexit_notifier.notify = blk_io_plug_atexit;
+        qemu_thread_atexit_add(&blk_io_plug_atexit_notifier);
+    }
+
+    UnplugFn *fns = (UnplugFn *)array->data;
+    UnplugFn new_fn = {
+        .fn = fn,
+        .opaque = opaque,
+    };
+
+    /*
+     * There won't be many, so do a linear search. If this becomes a bottleneck
+     * then a binary search (glib 2.62+) or different data structure could be
+     * used.
+     */
+    for (guint i = 0; i < array->len; i++) {
+        if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) {
+            return; /* already exists */
+        }
+    }
+
+    g_array_append_val(array, new_fn);
+}
+
+/**
+ * blk_io_plug: Defer blk_io_plug_call() functions until blk_io_unplug()
+ *
+ * blk_io_plug/unplug are thread-local operations. This means that multiple
+ * threads can simultaneously call plug/unplug, but the caller must ensure that
+ * each unplug() is called in the same thread of the matching plug().
+ *
+ * Nesting is supported. blk_io_plug_call() functions are only called at the
+ * outermost blk_io_unplug().
+ */
+void blk_io_plug(void)
+{
+    Plug *plug = get_ptr_plug();
+
+    assert(plug->count < UINT32_MAX);
+
+    plug->count++;
+}
+
+/**
+ * blk_io_unplug: Run any pending blk_io_plug_call() functions
+ *
+ * There must have been a matching blk_io_plug() call in the same thread prior
+ * to this blk_io_unplug() call.
+ */
+void blk_io_unplug(void)
+{
+    Plug *plug = get_ptr_plug();
+
+    assert(plug->count > 0);
+
+    if (--plug->count > 0) {
+        return;
+    }
+
+    GArray *array = plug->unplug_fns;
+    if (!array) {
+        return;
+    }
+
+    UnplugFn *fns = (UnplugFn *)array->data;
+
+    for (guint i = 0; i < array->len; i++) {
+        fns[i].fn(fns[i].opaque);
+    }
+
+    /*
+     * This resets the array without freeing memory so that appending is cheap
+     * in the future.
+     */
+    g_array_set_size(array, 0);
+}
--- a/block/qapi-sysemu.c
+++ b/block/qapi-sysemu.c
@@ -362,7 +362,10 @@ void qmp_blockdev_change_medium(const char *device,
        qdict_put_str(options, "driver", format);
    }

+    aio_context_acquire(qemu_get_aio_context());
    medium_bs = bdrv_open(filename, NULL, options, bdrv_flags, errp);
+    aio_context_release(qemu_get_aio_context());
+
    if (!medium_bs) {
        goto fail;
    }
@@ -517,6 +520,7 @@ void qmp_block_latency_histogram_set(
    bool has_boundaries, uint64List *boundaries,
    bool has_boundaries_read, uint64List *boundaries_read,
    bool has_boundaries_write, uint64List *boundaries_write,
+    bool has_boundaries_append, uint64List *boundaries_append,
    bool has_boundaries_flush, uint64List *boundaries_flush,
    Error **errp)
 {
@@ -557,6 +561,16 @@ void qmp_block_latency_histogram_set(
        }
    }

+    if (has_boundaries || has_boundaries_append) {
+        ret = block_latency_histogram_set(
+                stats, BLOCK_ACCT_ZONE_APPEND,
+                has_boundaries_append ? boundaries_append : boundaries);
+        if (ret) {
+            error_setg(errp, "Device '%s' set append write boundaries fail", id);
+            return;
+        }
+    }
+
    if (has_boundaries || has_boundaries_flush) {
        ret = block_latency_histogram_set(
            stats, BLOCK_ACCT_FLUSH,
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -533,27 +533,36 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)

    ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
    ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
+    ds->zone_append_bytes = stats->nr_bytes[BLOCK_ACCT_ZONE_APPEND];
    ds->unmap_bytes = stats->nr_bytes[BLOCK_ACCT_UNMAP];
    ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
    ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
+    ds->zone_append_operations = stats->nr_ops[BLOCK_ACCT_ZONE_APPEND];
    ds->unmap_operations = stats->nr_ops[BLOCK_ACCT_UNMAP];

    ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
    ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
+    ds->failed_zone_append_operations =
+        stats->failed_ops[BLOCK_ACCT_ZONE_APPEND];
    ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
    ds->failed_unmap_operations = stats->failed_ops[BLOCK_ACCT_UNMAP];

    ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
    ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
+    ds->invalid_zone_append_operations =
+        stats->invalid_ops[BLOCK_ACCT_ZONE_APPEND];
    ds->invalid_flush_operations =
        stats->invalid_ops[BLOCK_ACCT_FLUSH];
    ds->invalid_unmap_operations = stats->invalid_ops[BLOCK_ACCT_UNMAP];

    ds->rd_merged = stats->merged[BLOCK_ACCT_READ];
    ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
+    ds->zone_append_merged = stats->merged[BLOCK_ACCT_ZONE_APPEND];
    ds->unmap_merged = stats->merged[BLOCK_ACCT_UNMAP];
    ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
    ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
+    ds->zone_append_total_time_ns =
+        stats->total_time_ns[BLOCK_ACCT_ZONE_APPEND];
    ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
    ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
    ds->unmap_total_time_ns = stats->total_time_ns[BLOCK_ACCT_UNMAP];
@@ -571,6 +580,7 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)

        TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
        TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
+        TimedAverage *zap = &ts->latency[BLOCK_ACCT_ZONE_APPEND];
        TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];

        dev_stats->interval_length = ts->interval_length;
@@ -583,6 +593,10 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
        dev_stats->max_wr_latency_ns = timed_average_max(wr);
        dev_stats->avg_wr_latency_ns = timed_average_avg(wr);

+        dev_stats->min_zone_append_latency_ns = timed_average_min(zap);
+        dev_stats->max_zone_append_latency_ns = timed_average_max(zap);
+        dev_stats->avg_zone_append_latency_ns = timed_average_avg(zap);
+
        dev_stats->min_flush_latency_ns = timed_average_min(fl);
        dev_stats->max_flush_latency_ns = timed_average_max(fl);
        dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
@@ -591,6 +605,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
            block_acct_queue_depth(ts, BLOCK_ACCT_READ);
        dev_stats->avg_wr_queue_depth =
            block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
+        dev_stats->avg_zone_append_queue_depth =
+            block_acct_queue_depth(ts, BLOCK_ACCT_ZONE_APPEND);

        QAPI_LIST_PREPEND(ds->timed_stats, dev_stats);
    }
@@ -600,12 +616,14 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
        = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_READ]);
    ds->wr_latency_histogram
        = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_WRITE]);
+    ds->zone_append_latency_histogram
+        = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_ZONE_APPEND]);
    ds->flush_latency_histogram
        = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_FLUSH]);
 }

-static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs,
-                                        bool blk_level)
+static BlockStats * GRAPH_RDLOCK
+bdrv_query_bds_stats(BlockDriverState *bs, bool blk_level)
 {
    BdrvChild *parent_child;
    BlockDriverState *filter_or_cow_bs;
@@ -713,6 +731,8 @@ BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
    BlockBackend *blk;
    BlockDriverState *bs;

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    /* Just to be safe if query_nodes is not always initialized */
    if (has_query_nodes && query_nodes) {
        for (bs = bdrv_next_node(NULL); bs; bs = bdrv_next_node(bs)) {
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -800,8 +800,8 @@ static void qcow_close(BlockDriverState *bs)
    error_free(s->migration_blocker);
 }

-static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts,
-                                       Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+qcow_co_create(BlockdevCreateOptions *opts, Error **errp)
 {
    BlockdevCreateOptionsQcow *qcow_opts;
    int header_size, backing_filename_len, l1_size, shift, i;
@@ -915,13 +915,13 @@ static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts,
    g_free(tmp);
    ret = 0;
 exit:
-    blk_unref(qcow_blk);
-    bdrv_unref(bs);
+    blk_co_unref(qcow_blk);
+    bdrv_co_unref(bs);
    qcrypto_block_free(crypto);
    return ret;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 qcow_co_create_opts(BlockDriver *drv, const char *filename,
                    QemuOpts *opts, Error **errp)
 {
@@ -1015,7 +1015,7 @@ qcow_co_create_opts(BlockDriver *drv, const char *filename,
 fail:
    g_free(backing_fmt);
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -3715,7 +3715,7 @@ int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs)

    qemu_co_mutex_assert_locked(&s->lock);

-    file_length = bdrv_getlength(bs->file->bs);
+    file_length = bdrv_co_getlength(bs->file->bs);
    if (file_length < 0) {
        return file_length;
    }
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -118,8 +118,9 @@ static int qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
 }


-static int qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
-                                      void *opaque, Error **errp)
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen, void *opaque,
+                           Error **errp)
 {
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
@@ -144,9 +145,7 @@ static int qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
     */
    clusterlen = size_to_clusters(s, headerlen) * s->cluster_size;
    assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen, false) == 0);
-    ret = bdrv_pwrite_zeroes(bs->file,
-                             ret,
-                             clusterlen, 0);
+    ret = bdrv_co_pwrite_zeroes(bs->file, ret, clusterlen, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not zero fill encryption header");
        return -1;
@@ -156,9 +155,11 @@ static int qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
 }


-static int qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
-                                       const uint8_t *buf, size_t buflen,
-                                       void *opaque, Error **errp)
+/* The graph lock must be held when called in coroutine context */
+static int coroutine_mixed_fn
+qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
+                            const uint8_t *buf, size_t buflen,
+                            void *opaque, Error **errp)
 {
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
@@ -1618,9 +1619,11 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,

    if (open_data_file) {
        /* Open external data file */
+        bdrv_graph_co_rdunlock();
        s->data_file = bdrv_co_open_child(NULL, options, "data-file", bs,
                                          &child_of_bds, BDRV_CHILD_DATA,
                                          true, errp);
+        bdrv_graph_co_rdlock();
        if (*errp) {
            ret = -EINVAL;
            goto fail;
@@ -1628,10 +1631,12 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,

        if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
            if (!s->data_file && s->image_data_file) {
+                bdrv_graph_co_rdunlock();
                s->data_file = bdrv_co_open_child(s->image_data_file, options,
                                                  "data-file", bs,
                                                  &child_of_bds,
                                                  BDRV_CHILD_DATA, false, errp);
+                bdrv_graph_co_rdlock();
                if (!s->data_file) {
                    ret = -EINVAL;
                    goto fail;
@@ -1856,7 +1861,9 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
 fail:
    g_free(s->image_data_file);
    if (open_data_file && has_data_file(bs)) {
+        bdrv_graph_co_rdunlock();
        bdrv_unref_child(bs, s->data_file);
+        bdrv_graph_co_rdlock();
        s->data_file = NULL;
    }
    g_free(s->unknown_header_fields);
@@ -1891,12 +1898,14 @@ static void coroutine_fn qcow2_open_entry(void *opaque)
    QCow2OpenCo *qoc = opaque;
    BDRVQcow2State *s = qoc->bs->opaque;

-    assume_graph_lock(); /* FIXME */
+    GRAPH_RDLOCK_GUARD();

    qemu_co_mutex_lock(&s->lock);
    qoc->ret = qcow2_do_open(qoc->bs, qoc->options, qoc->flags, true,
                             qoc->errp);
    qemu_co_mutex_unlock(&s->lock);
+
+    aio_wait_kick();
 }

 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
@@ -1920,14 +1929,13 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    /* Initialise locks */
    qemu_co_mutex_init(&s->lock);

-    if (qemu_in_coroutine()) {
-        /* From bdrv_co_create.  */
-        qcow2_open_entry(&qoc);
-    } else {
-        assert(qemu_get_current_aio_context() == qemu_get_aio_context());
-        qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
-        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
-    }
+    assert(!qemu_in_coroutine());
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+
+    aio_co_enter(bdrv_get_aio_context(bs),
+                 qemu_coroutine_create(qcow2_open_entry, &qoc));
+    AIO_WAIT_WHILE_UNLOCKED(NULL, qoc.ret == -EINPROGRESS);
+
    return qoc.ret;
 }

@@ -2089,11 +2097,10 @@ static void qcow2_join_options(QDict *options, QDict *old_options)
    }
 }

-static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
-                                              bool want_zero,
-                                              int64_t offset, int64_t count,
-                                              int64_t *pnum, int64_t *map,
-                                              BlockDriverState **file)
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
+                      int64_t count, int64_t *pnum, int64_t *map,
+                      BlockDriverState **file)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t host_offset;
@@ -3141,9 +3148,10 @@ static int qcow2_change_backing_file(BlockDriverState *bs,
    return qcow2_update_header(bs);
 }

-static int qcow2_set_up_encryption(BlockDriverState *bs,
-                                   QCryptoBlockCreateOptions *cryptoopts,
-                                   Error **errp)
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_set_up_encryption(BlockDriverState *bs,
+                        QCryptoBlockCreateOptions *cryptoopts,
+                        Error **errp)
 {
    BDRVQcow2State *s = bs->opaque;
    QCryptoBlock *crypto = NULL;
@@ -3235,7 +3243,7 @@ preallocate_co(BlockDriverState *bs, uint64_t offset, uint64_t new_length,
     * all of the allocated clusters (otherwise we get failing reads after
     * EOF). Extend the image to the last allocated sector.
     */
-    file_length = bdrv_getlength(s->data_file->bs);
+    file_length = bdrv_co_getlength(s->data_file->bs);
    if (file_length < 0) {
        error_setg_errno(errp, -file_length, "Could not get file size");
        ret = file_length;
@@ -3430,7 +3438,7 @@ static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
    return refcount_bits;
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_UNLOCKED
 qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
 {
    BlockdevCreateOptionsQcow2 *qcow2_opts;
@@ -3706,7 +3714,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
        goto out;
    }

-    blk_unref(blk);
+    blk_co_unref(blk);
    blk = NULL;

    /*
@@ -3728,8 +3736,10 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
        goto out;
    }

+    bdrv_graph_co_rdlock();
    ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
    if (ret < 0) {
+        bdrv_graph_co_rdunlock();
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
                         "header and refcount table");
        goto out;
@@ -3747,6 +3757,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)

    /* Create a full header (including things like feature table) */
    ret = qcow2_update_header(blk_bs(blk));
+    bdrv_graph_co_rdunlock();
+
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not update qcow2 header");
        goto out;
@@ -3780,13 +3792,16 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)

    /* Want encryption? There you go. */
    if (qcow2_opts->encrypt) {
+        bdrv_graph_co_rdlock();
        ret = qcow2_set_up_encryption(blk_bs(blk), qcow2_opts->encrypt, errp);
+        bdrv_graph_co_rdunlock();
+
        if (ret < 0) {
            goto out;
        }
    }

-    blk_unref(blk);
+    blk_co_unref(blk);
    blk = NULL;

    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
@@ -3811,13 +3826,13 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)

    ret = 0;
 out:
-    blk_unref(blk);
-    bdrv_unref(bs);
-    bdrv_unref(data_bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
+    bdrv_co_unref(data_bs);
    return ret;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 qcow2_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts,
                     Error **errp)
 {
@@ -3937,15 +3952,17 @@ qcow2_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts,
    ret = qcow2_co_create(create_options, errp);
 finish:
    if (ret < 0) {
+        bdrv_graph_co_rdlock();
        bdrv_co_delete_file_noerr(bs);
        bdrv_co_delete_file_noerr(data_bs);
+        bdrv_graph_co_rdunlock();
    } else {
        ret = 0;
    }

    qobject_unref(qdict);
-    bdrv_unref(bs);
-    bdrv_unref(data_bs);
+    bdrv_co_unref(bs);
+    bdrv_co_unref(data_bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
@@ -4098,7 +4115,7 @@ qcow2_co_copy_range_from(BlockDriverState *bs,
        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
            if (bs->backing && bs->backing->bs) {
-                int64_t backing_length = bdrv_getlength(bs->backing->bs);
+                int64_t backing_length = bdrv_co_getlength(bs->backing->bs);
                if (src_offset >= backing_length) {
                    cur_write_flags |= BDRV_REQ_ZERO_WRITE;
                } else {
@@ -4293,7 +4310,7 @@ qcow2_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
            goto fail;
        }

-        old_file_size = bdrv_getlength(bs->file->bs);
+        old_file_size = bdrv_co_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
@@ -4386,7 +4403,7 @@ qcow2_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
            break;
        }

-        old_file_size = bdrv_getlength(bs->file->bs);
+        old_file_size = bdrv_co_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
@@ -4694,7 +4711,7 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
         * align end of file to a sector boundary to ease reading with
         * sector based I/Os
         */
-        int64_t len = bdrv_getlength(bs->file->bs);
+        int64_t len = bdrv_co_getlength(bs->file->bs);
        if (len < 0) {
            return len;
        }
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -895,7 +895,9 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                void *cb_opaque, Error **errp);
 int coroutine_fn GRAPH_RDLOCK qcow2_shrink_reftable(BlockDriverState *bs);
 int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
-int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs);
+
+int coroutine_fn GRAPH_RDLOCK
+qcow2_detect_metadata_preallocation(BlockDriverState *bs);

 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
--- a/block/qed.c
+++ b/block/qed.c
@@ -557,11 +557,13 @@ typedef struct QEDOpenCo {
    int ret;
 } QEDOpenCo;

-static void coroutine_fn GRAPH_RDLOCK bdrv_qed_open_entry(void *opaque)
+static void coroutine_fn bdrv_qed_open_entry(void *opaque)
 {
    QEDOpenCo *qoc = opaque;
    BDRVQEDState *s = qoc->bs->opaque;

+    GRAPH_RDLOCK_GUARD();
+
    qemu_co_mutex_lock(&s->table_lock);
    qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
    qemu_co_mutex_unlock(&s->table_lock);
@@ -579,21 +581,17 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
    };
    int ret;

-    assume_graph_lock(); /* FIXME */
-
    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    if (ret < 0) {
        return ret;
    }

    bdrv_qed_init_state(bs);
-    if (qemu_in_coroutine()) {
-        bdrv_qed_open_entry(&qoc);
-    } else {
-        assert(qemu_get_current_aio_context() == qemu_get_aio_context());
-        qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
-        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
-    }
+    assert(!qemu_in_coroutine());
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+    qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
+    BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
+
    return qoc.ret;
 }

@@ -632,8 +630,8 @@ static void bdrv_qed_close(BlockDriverState *bs)
    qemu_vfree(s->l1_table);
 }

-static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
-                                           Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+bdrv_qed_co_create(BlockdevCreateOptions *opts, Error **errp)
 {
    BlockdevCreateOptionsQed *qed_opts;
    BlockBackend *blk = NULL;
@@ -748,12 +746,12 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
    ret = 0; /* success */
 out:
    g_free(l1_table);
-    blk_unref(blk);
-    bdrv_unref(bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
    return ret;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename,
                        QemuOpts *opts, Error **errp)
 {
@@ -819,7 +817,7 @@ bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename,

 fail:
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -825,8 +825,8 @@ static coroutine_fn GRAPH_RDLOCK int quorum_co_flush(BlockDriverState *bs)
    return result;
 }

-static bool quorum_recurse_can_replace(BlockDriverState *bs,
-                                       BlockDriverState *to_replace)
+static bool GRAPH_RDLOCK
+quorum_recurse_can_replace(BlockDriverState *bs, BlockDriverState *to_replace)
 {
    BDRVQuorumState *s = bs->opaque;
    int i;
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -317,6 +317,28 @@ raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
    return bdrv_co_pdiscard(bs->file, offset, bytes);
 }

+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+                   unsigned int *nr_zones,
+                   BlockZoneDescriptor *zones)
+{
+    return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+                 int64_t offset, int64_t len)
+{
+    return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_append(BlockDriverState *bs,int64_t *offset, QEMUIOVector *qiov,
+                   BdrvRequestFlags flags)
+{
+    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
+}
+
 static int64_t coroutine_fn GRAPH_RDLOCK
 raw_co_getlength(BlockDriverState *bs)
 {
@@ -369,7 +391,7 @@ static BlockMeasureInfo *raw_measure(QemuOpts *opts, BlockDriverState *in_bs,
    return info;
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 raw_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
    return bdrv_co_get_info(bs->file->bs, bdi);
@@ -435,7 +457,7 @@ static int raw_has_zero_init(BlockDriverState *bs)
    return bdrv_has_zero_init(bs->file->bs);
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 raw_co_create_opts(BlockDriver *drv, const char *filename,
                   QemuOpts *opts, Error **errp)
 {
@@ -446,6 +468,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    BDRVRawState *s = bs->opaque;
+    AioContext *ctx;
    bool has_size;
    uint64_t offset, size;
    BdrvChildRole file_role;
@@ -493,7 +516,11 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                bs->file->bs->filename);
    }

+    ctx = bdrv_get_aio_context(bs);
+    aio_context_acquire(ctx);
    ret = raw_apply_options(bs, s, offset, has_size, size, errp);
+    aio_context_release(ctx);
+
    if (ret < 0) {
        return ret;
    }
@@ -608,6 +635,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c,
 BlockDriver bdrv_raw = {
    .format_name          = "raw",
    .instance_size        = sizeof(BDRVRawState),
+    .supports_zoned_children = true,
    .bdrv_probe           = &raw_probe,
    .bdrv_reopen_prepare  = &raw_reopen_prepare,
    .bdrv_reopen_commit   = &raw_reopen_commit,
@@ -619,6 +647,9 @@ BlockDriver bdrv_raw = {
    .bdrv_co_pwritev      = &raw_co_pwritev,
    .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
    .bdrv_co_pdiscard     = &raw_co_pdiscard,
+    .bdrv_co_zone_report  = &raw_co_zone_report,
+    .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
+    .bdrv_co_zone_append = &raw_co_zone_append,
    .bdrv_co_block_status = &raw_co_block_status,
    .bdrv_co_copy_range_from = &raw_co_copy_range_from,
    .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -1019,7 +1019,7 @@ static void restart_coroutine(void *opaque)
    AioContext *ctx = bdrv_get_aio_context(bs);

    trace_ssh_restart_coroutine(restart->co);
-    aio_set_fd_handler(ctx, s->sock, false, NULL, NULL, NULL, NULL, NULL);
+    aio_set_fd_handler(ctx, s->sock, NULL, NULL, NULL, NULL, NULL);

    aio_co_wake(restart->co);
 }
@@ -1049,7 +1049,7 @@ static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
    trace_ssh_co_yield(s->sock, rd_handler, wr_handler);

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
-                       false, rd_handler, wr_handler, NULL, NULL, &restart);
+                       rd_handler, wr_handler, NULL, NULL, &restart);
    qemu_coroutine_yield();
    trace_ssh_co_yield_back(s->sock);
 }
--- a/block/stream.c
+++ b/block/stream.c
@@ -131,7 +131,6 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
    BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs);
    int64_t len;
    int64_t offset = 0;
-    uint64_t delay_ns = 0;
    int error = 0;
    int64_t n = 0; /* bytes */

@@ -155,7 +154,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
-        job_sleep_ns(&s->common.job, delay_ns);
+        block_job_ratelimit_sleep(&s->common);
        if (job_is_cancelled(&s->common.job)) {
            break;
        }
@@ -204,9 +203,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
        /* Publish progress */
        job_progress_update(&s->common.job, n);
        if (copy) {
-            delay_ns = block_job_ratelimit_get_delay(&s->common, n);
-        } else {
-            delay_ns = 0;
+            block_job_ratelimit_processed_bytes(&s->common, n);
        }
    }

--- a/block/trace-events
+++ b/block/trace-events
@@ -64,9 +64,8 @@ file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "
 # io_uring.c
 luring_init_state(void *s, size_t size) "s %p size %zu"
 luring_cleanup_state(void *s) "%p freed"
-luring_io_plug(void *s) "LuringState %p plug"
-luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
-luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
+luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
+luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
 luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
 luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
 luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
@@ -141,7 +140,6 @@ nvme_kick(void *s, unsigned q_index) "s %p q #%u"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
 nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
-nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
 nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
@@ -209,6 +207,10 @@ file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
 file_setup_cdrom(const char *partition) "Using %s as optical disc"
 file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
 file_flush_fdatasync_failed(int err) "errno %d"
+zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %d zones starting at sector offset 0x%" PRIx64 ""
+zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors"
+zbd_zone_append(void *bs, int64_t sector) "bs %p append at sector offset 0x%" PRIx64 ""
+zbd_zone_append_complete(void *bs, int64_t sector) "bs %p returns append sector 0x%" PRIx64 ""

 # ssh.c
 sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -734,8 +734,9 @@ nonallocating_write:
    return ret;
 }

-static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,
-                                         size_t block_size, Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+vdi_co_do_create(BlockdevCreateOptions *create_options, size_t block_size,
+                 Error **errp)
 {
    BlockdevCreateOptionsVdi *vdi_opts;
    int ret = 0;
@@ -886,19 +887,19 @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,

    ret = 0;
 exit:
-    blk_unref(blk);
-    bdrv_unref(bs_file);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs_file);
    g_free(bmap);
    return ret;
 }

-static int coroutine_fn vdi_co_create(BlockdevCreateOptions *create_options,
-                                      Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+vdi_co_create(BlockdevCreateOptions *create_options, Error **errp)
 {
    return vdi_co_do_create(create_options, DEFAULT_CLUSTER_SIZE, errp);
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 vdi_co_create_opts(BlockDriver *drv, const char *filename,
                   QemuOpts *opts, Error **errp)
 {
@@ -975,7 +976,7 @@ vdi_co_create_opts(BlockDriver *drv, const char *filename,
 done:
    qobject_unref(qdict);
    qapi_free_BlockdevCreateOptions(create_options);
-    bdrv_unref(bs_file);
+    bdrv_co_unref(bs_file);
    return ret;
 }

--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1506,14 +1506,17 @@ exit:
 * There are 2 headers, and the highest sequence number will represent
 * the active header
 */
-static int vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size,
-                                   uint32_t log_size)
+static int coroutine_fn GRAPH_UNLOCKED
+vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size,
+                        uint32_t log_size)
 {
    BlockDriverState *bs = blk_bs(blk);
    BdrvChild *child;
    int ret = 0;
    VHDXHeader *hdr = NULL;

+    GRAPH_RDLOCK_GUARD();
+
    hdr = g_new0(VHDXHeader, 1);

    hdr->signature       = VHDX_HEADER_SIGNATURE;
@@ -1897,8 +1900,8 @@ exit:
 *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
 *   1MB
 */
-static int coroutine_fn vhdx_co_create(BlockdevCreateOptions *opts,
-                                       Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+vhdx_co_create(BlockdevCreateOptions *opts, Error **errp)
 {
    BlockdevCreateOptionsVhdx *vhdx_opts;
    BlockBackend *blk = NULL;
@@ -2053,13 +2056,13 @@ static int coroutine_fn vhdx_co_create(BlockdevCreateOptions *opts,

    ret = 0;
 delete_and_exit:
-    blk_unref(blk);
-    bdrv_unref(bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
    g_free(creator);
    return ret;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 vhdx_co_create_opts(BlockDriver *drv, const char *filename,
                    QemuOpts *opts, Error **errp)
 {
@@ -2144,7 +2147,7 @@ vhdx_co_create_opts(BlockDriver *drv, const char *filename,

 fail:
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2165,10 +2165,9 @@ vmdk_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
    return ret;
 }

-static int vmdk_init_extent(BlockBackend *blk,
-                            int64_t filesize, bool flat,
-                            bool compress, bool zeroed_grain,
-                            Error **errp)
+static int GRAPH_UNLOCKED
+vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress,
+                 bool zeroed_grain, Error **errp)
 {
    int ret, i;
    VMDK4Header header;
@@ -2277,7 +2276,7 @@ exit:
    return ret;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 vmdk_create_extent(const char *filename, int64_t filesize, bool flat,
                   bool compress, bool zeroed_grain, BlockBackend **pbb,
                   QemuOpts *opts, Error **errp)
@@ -2306,7 +2305,7 @@ exit:
        if (pbb) {
            *pbb = blk;
        } else {
-            blk_unref(blk);
+            blk_co_unref(blk);
            blk = NULL;
        }
    }
@@ -2358,7 +2357,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
 *           non-split format.
 * idx >= 1: get the n-th extent if in a split subformat
 */
-typedef BlockBackend * coroutine_fn /* GRAPH_RDLOCK */
+typedef BlockBackend * coroutine_fn GRAPH_UNLOCKED_PTR
    (*vmdk_create_extent_fn)(int64_t size, int idx, bool flat, bool split,
                             bool compress, bool zeroed_grain, void *opaque,
                             Error **errp);
@@ -2374,7 +2373,7 @@ static void vmdk_desc_add_extent(GString *desc,
    g_free(basename);
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 vmdk_co_do_create(int64_t size,
                  BlockdevVmdkSubformat subformat,
                  BlockdevVmdkAdapterType adapter_type,
@@ -2516,12 +2515,12 @@ vmdk_co_do_create(int64_t size,
        if (strcmp(blk_bs(backing)->drv->format_name, "vmdk")) {
            error_setg(errp, "Invalid backing file format: %s. Must be vmdk",
                       blk_bs(backing)->drv->format_name);
-            blk_unref(backing);
+            blk_co_unref(backing);
            ret = -EINVAL;
            goto exit;
        }
        ret = vmdk_read_cid(blk_bs(backing), 0, &parent_cid);
-        blk_unref(backing);
+        blk_co_unref(backing);
        if (ret) {
            error_setg(errp, "Failed to read parent CID");
            goto exit;
@@ -2542,14 +2541,14 @@ vmdk_co_do_create(int64_t size,
                             blk_bs(extent_blk)->filename);
        created_size += cur_size;
        extent_idx++;
-        blk_unref(extent_blk);
+        blk_co_unref(extent_blk);
    }

    /* Check whether we got excess extents */
    extent_blk = extent_fn(-1, extent_idx, flat, split, compress, zeroed_grain,
                           opaque, NULL);
    if (extent_blk) {
-        blk_unref(extent_blk);
+        blk_co_unref(extent_blk);
        error_setg(errp, "List of extents contains unused extents");
        ret = -EINVAL;
        goto exit;
@@ -2590,7 +2589,7 @@ vmdk_co_do_create(int64_t size,
    ret = 0;
 exit:
    if (blk) {
-        blk_unref(blk);
+        blk_co_unref(blk);
    }
    g_free(desc);
    g_free(parent_desc_line);
@@ -2605,7 +2604,7 @@ typedef struct {
    QemuOpts *opts;
 } VMDKCreateOptsData;

-static BlockBackend * coroutine_fn GRAPH_RDLOCK
+static BlockBackend * coroutine_fn GRAPH_UNLOCKED
 vmdk_co_create_opts_cb(int64_t size, int idx, bool flat, bool split,
                       bool compress, bool zeroed_grain, void *opaque,
                       Error **errp)
@@ -2641,13 +2640,13 @@ vmdk_co_create_opts_cb(int64_t size, int idx, bool flat, bool split,
                           errp)) {
        goto exit;
    }
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
 exit:
    g_free(ext_filename);
    return blk;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 vmdk_co_create_opts(BlockDriver *drv, const char *filename,
                    QemuOpts *opts, Error **errp)
 {
@@ -2756,11 +2755,9 @@ exit:
    return ret;
 }

-static BlockBackend * coroutine_fn vmdk_co_create_cb(int64_t size, int idx,
-                                                     bool flat, bool split,
-                                                     bool compress,
-                                                     bool zeroed_grain,
-                                                     void *opaque, Error **errp)
+static BlockBackend * coroutine_fn GRAPH_UNLOCKED
+vmdk_co_create_cb(int64_t size, int idx, bool flat, bool split, bool compress,
+                  bool zeroed_grain, void *opaque, Error **errp)
 {
    int ret;
    BlockDriverState *bs;
@@ -2797,19 +2794,19 @@ static BlockBackend * coroutine_fn vmdk_co_create_cb(int64_t size, int idx,
        return NULL;
    }
    blk_set_allow_write_beyond_eof(blk, true);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);

    if (size != -1) {
        ret = vmdk_init_extent(blk, size, flat, compress, zeroed_grain, errp);
        if (ret) {
-            blk_unref(blk);
+            blk_co_unref(blk);
            blk = NULL;
        }
    }
    return blk;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 vmdk_co_create(BlockdevCreateOptions *create_options, Error **errp)
 {
    BlockdevCreateOptionsVmdk *opts;
@@ -2845,7 +2842,7 @@ static void vmdk_close(BlockDriverState *bs)
    error_free(s->migration_blocker);
 }

-static int64_t coroutine_fn
+static int64_t coroutine_fn GRAPH_RDLOCK
 vmdk_co_get_allocated_file_size(BlockDriverState *bs)
 {
    int i;
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -967,8 +967,8 @@ static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
    return 0;
 }

-static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
-                                      Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+vpc_co_create(BlockdevCreateOptions *opts, Error **errp)
 {
    BlockdevCreateOptionsVpc *vpc_opts;
    BlockBackend *blk = NULL;
@@ -1082,12 +1082,12 @@ static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
    }

 out:
-    blk_unref(blk);
-    bdrv_unref(bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
    return ret;
 }

-static int coroutine_fn GRAPH_RDLOCK
+static int coroutine_fn GRAPH_UNLOCKED
 vpc_co_create_opts(BlockDriver *drv, const char *filename,
                   QemuOpts *opts, Error **errp)
 {
@@ -1162,7 +1162,7 @@ vpc_co_create_opts(BlockDriver *drv, const char *filename,

 fail:
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -174,7 +174,7 @@ int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile)
 void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *old_context)
 {
-    aio_set_event_notifier(old_context, &aio->e, false, NULL, NULL, NULL);
+    aio_set_event_notifier(old_context, &aio->e, NULL, NULL, NULL);
    aio->aio_ctx = NULL;
 }

@@ -182,8 +182,8 @@ void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *new_context)
 {
    aio->aio_ctx = new_context;
-    aio_set_event_notifier(new_context, &aio->e, false,
-                           win32_aio_completion_cb, NULL, NULL);
+    aio_set_event_notifier(new_context, &aio->e, win32_aio_completion_cb,
+                           NULL, NULL);
 }

 QEMUWin32AIOState *win32_aio_init(void)
--- a/blockdev.c
+++ b/blockdev.c
--- a/blockjob.c
+++ b/blockjob.c
@@ -319,10 +319,28 @@ static bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
    return block_job_set_speed_locked(job, speed, errp);
 }

-int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n)
+void block_job_ratelimit_processed_bytes(BlockJob *job, uint64_t n)
 {
    IO_CODE();
-    return ratelimit_calculate_delay(&job->limit, n);
+    ratelimit_calculate_delay(&job->limit, n);
+}
+
+void block_job_ratelimit_sleep(BlockJob *job)
+{
+    uint64_t delay_ns;
+
+    /*
+     * Sleep at least once. If the job is reentered early, keep waiting until
+     * we've waited for the full time that is necessary to keep the job at the
+     * right speed.
+     *
+     * Make sure to recalculate the delay after each (possibly interrupted)
+     * sleep because the speed can change while the job has yielded.
+     */
+    do {
+        delay_ns = ratelimit_calculate_delay(&job->limit, 0);
+        job_sleep_ns(&job->job, delay_ns);
+    } while (delay_ns && !job_is_cancelled(&job->job));
 }

 BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp)
--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@@ -352,9 +352,10 @@ static abi_ulong load_elf_interp(struct elfhdr *interp_elf_ex,

 static int symfind(const void *s0, const void *s1)
 {
-    target_ulong addr = *(target_ulong *)s0;
    struct elf_sym *sym = (struct elf_sym *)s1;
+    __typeof(sym->st_value) addr = *(uint64_t *)s0;
    int result = 0;
+
    if (addr < sym->st_value) {
        result = -1;
    } else if (addr >= sym->st_value + sym->st_size) {
@@ -363,7 +364,7 @@ static int symfind(const void *s0, const void *s1)
    return result;
 }

-static const char *lookup_symbolxx(struct syminfo *s, target_ulong orig_addr)
+static const char *lookup_symbolxx(struct syminfo *s, uint64_t orig_addr)
 {
 #if ELF_CLASS == ELFCLASS32
    struct elf_sym *syms = s->disas_symtab.elf32;
--- a/bsd-user/freebsd/os-syscall.c
+++ b/bsd-user/freebsd/os-syscall.c
@@ -528,10 +528,8 @@ abi_long do_freebsd_syscall(void *cpu_env, int num, abi_long arg1,
                            abi_long arg5, abi_long arg6, abi_long arg7,
                            abi_long arg8)
 {
-    CPUState *cpu = env_cpu(cpu_env);
    abi_long ret;

-    trace_guest_user_syscall(cpu, num, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8);
    if (do_strace) {
        print_freebsd_syscall(num, arg1, arg2, arg3, arg4, arg5, arg6);
    }
@@ -541,7 +539,6 @@ abi_long do_freebsd_syscall(void *cpu_env, int num, abi_long arg1,
    if (do_strace) {
        print_freebsd_syscall_ret(num, ret);
    }
-    trace_guest_user_syscall_ret(cpu, num, ret);

    return ret;
 }
--- a/configs/devices/aarch64-softmmu/default.mak
+++ b/configs/devices/aarch64-softmmu/default.mak
@@ -2,3 +2,9 @@

 # We support all the 32 bit boards so need all their config
 include ../arm-softmmu/default.mak
+
+# These are selected by default when TCG is enabled, uncomment them to
+# keep out of the build.
+# CONFIG_XLNX_ZYNQMP_ARM=n
+# CONFIG_XLNX_VERSAL=n
+# CONFIG_SBSA_REF=n
--- a/configs/devices/arm-softmmu/default.mak
+++ b/configs/devices/arm-softmmu/default.mak
@@ -4,3 +4,43 @@
 # CONFIG_TEST_DEVICES=n

 CONFIG_ARM_VIRT=y
+
+# These are selected by default when TCG is enabled, uncomment them to
+# keep out of the build.
+# CONFIG_CUBIEBOARD=n
+# CONFIG_EXYNOS4=n
+# CONFIG_HIGHBANK=n
+# CONFIG_INTEGRATOR=n
+# CONFIG_FSL_IMX31=n
+# CONFIG_MUSICPAL=n
+# CONFIG_MUSCA=n
+# CONFIG_CHEETAH=n
+# CONFIG_SX1=n
+# CONFIG_NSERIES=n
+# CONFIG_STELLARIS=n
+# CONFIG_STM32VLDISCOVERY=n
+# CONFIG_REALVIEW=n
+# CONFIG_VERSATILE=n
+# CONFIG_VEXPRESS=n
+# CONFIG_ZYNQ=n
+# CONFIG_MAINSTONE=n
+# CONFIG_GUMSTIX=n
+# CONFIG_SPITZ=n
+# CONFIG_TOSA=n
+# CONFIG_Z2=n
+# CONFIG_NPCM7XX=n
+# CONFIG_COLLIE=n
+# CONFIG_ASPEED_SOC=n
+# CONFIG_NETDUINO2=n
+# CONFIG_NETDUINOPLUS2=n
+# CONFIG_OLIMEX_STM32_H405=n
+# CONFIG_MPS2=n
+# CONFIG_RASPI=n
+# CONFIG_DIGIC=n
+# CONFIG_SABRELITE=n
+# CONFIG_EMCRAFT_SF2=n
+# CONFIG_MICROBIT=n
+# CONFIG_FSL_IMX25=n
+# CONFIG_FSL_IMX7=n
+# CONFIG_FSL_IMX6UL=n
+# CONFIG_ALLWINNER_H3=n
--- a/configs/targets/hexagon-linux-user.mak
+++ b/configs/targets/hexagon-linux-user.mak
@@ -1 +1,2 @@
 TARGET_ARCH=hexagon
+TARGET_XML_FILES=gdb-xml/hexagon-core.xml gdb-xml/hexagon-hvx.xml
--- a/configs/targets/mips-linux-user.mak
+++ b/configs/targets/mips-linux-user.mak
@@ -2,5 +2,4 @@ TARGET_ARCH=mips
 TARGET_ABI_MIPSO32=y
 TARGET_SYSTBL_ABI=o32
 TARGET_SYSTBL=syscall_o32.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
--- a/configs/targets/mips-softmmu.mak
+++ b/configs/targets/mips-softmmu.mak
@@ -1,4 +1,3 @@
 TARGET_ARCH=mips
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
 TARGET_SUPPORTS_MTTCG=y
--- a/configs/targets/mips64-linux-user.mak
+++ b/configs/targets/mips64-linux-user.mak
@@ -3,5 +3,4 @@ TARGET_ABI_MIPSN64=y
 TARGET_BASE_ARCH=mips
 TARGET_SYSTBL_ABI=n64
 TARGET_SYSTBL=syscall_n64.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
--- a/configs/targets/mips64-softmmu.mak
+++ b/configs/targets/mips64-softmmu.mak
@@ -1,4 +1,3 @@
 TARGET_ARCH=mips64
 TARGET_BASE_ARCH=mips
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
--- a/configs/targets/mips64el-linux-user.mak
+++ b/configs/targets/mips64el-linux-user.mak
@@ -3,4 +3,3 @@ TARGET_ABI_MIPSN64=y
 TARGET_BASE_ARCH=mips
 TARGET_SYSTBL_ABI=n64
 TARGET_SYSTBL=syscall_n64.tbl
-TARGET_ALIGNED_ONLY=y
--- a/configs/targets/mips64el-softmmu.mak
+++ b/configs/targets/mips64el-softmmu.mak
@@ -1,4 +1,3 @@
 TARGET_ARCH=mips64
 TARGET_BASE_ARCH=mips
-TARGET_ALIGNED_ONLY=y
 TARGET_NEED_FDT=y
--- a/configs/targets/mipsel-linux-user.mak
+++ b/configs/targets/mipsel-linux-user.mak
@@ -2,4 +2,3 @@ TARGET_ARCH=mips
 TARGET_ABI_MIPSO32=y
 TARGET_SYSTBL_ABI=o32
 TARGET_SYSTBL=syscall_o32.tbl
-TARGET_ALIGNED_ONLY=y
--- a/configs/targets/mipsel-softmmu.mak
+++ b/configs/targets/mipsel-softmmu.mak
@@ -1,3 +1,2 @@
 TARGET_ARCH=mips
-TARGET_ALIGNED_ONLY=y
 TARGET_SUPPORTS_MTTCG=y
--- a/configs/targets/mipsn32-linux-user.mak
+++ b/configs/targets/mipsn32-linux-user.mak
@@ -4,5 +4,4 @@ TARGET_ABI32=y
 TARGET_BASE_ARCH=mips
 TARGET_SYSTBL_ABI=n32
 TARGET_SYSTBL=syscall_n32.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
--- a/configs/targets/mipsn32el-linux-user.mak
+++ b/configs/targets/mipsn32el-linux-user.mak
@@ -4,4 +4,3 @@ TARGET_ABI32=y
 TARGET_BASE_ARCH=mips
 TARGET_SYSTBL_ABI=n32
 TARGET_SYSTBL=syscall_n32.tbl
-TARGET_ALIGNED_ONLY=y
--- a/configs/targets/nios2-softmmu.mak
+++ b/configs/targets/nios2-softmmu.mak
@@ -1,3 +1,2 @@
 TARGET_ARCH=nios2
-TARGET_ALIGNED_ONLY=y
 TARGET_NEED_FDT=y
--- a/configs/targets/sh4-linux-user.mak
+++ b/configs/targets/sh4-linux-user.mak
@@ -1,5 +1,4 @@
 TARGET_ARCH=sh4
 TARGET_SYSTBL_ABI=common
 TARGET_SYSTBL=syscall.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_HAS_BFLT=y
--- a/configs/targets/sh4-softmmu.mak
+++ b/configs/targets/sh4-softmmu.mak
@@ -1,2 +1 @@
 TARGET_ARCH=sh4
-TARGET_ALIGNED_ONLY=y
--- a/configs/targets/sh4eb-linux-user.mak
+++ b/configs/targets/sh4eb-linux-user.mak
@@ -1,6 +1,5 @@
 TARGET_ARCH=sh4
 TARGET_SYSTBL_ABI=common
 TARGET_SYSTBL=syscall.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
 TARGET_HAS_BFLT=y
--- a/Show More
+++ b/Show More