migration: Fix duplicated inclusion in meson.build

This is the commit with the merge error (not in the submitted patch). commit 52623f23b0 Author: Lukas Straub <lukasstraub2@web.de> Date: Thu Apr 20 11:48:35 2023 +0200 ram-compress.c: Make target independent Make ram-compress.c target independent. Fixes: 52623f23b0 Signed-off-by: Juan Quintela <quintela@redhat.com> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> Reviewed-by: Peter Xu <peterx@redhat.com> Message-Id: <20230509170217.83246-1-quintela@redhat.com>
migration/multifd: Compute transferred bytes correctly
2023-05-18 18:41:53 +02:00 · 2023-05-18 18:41:46 +02:00 · 2023-05-18 18:40:51 +02:00 · 2023-05-18 18:40:51 +02:00 · 2023-05-18 18:40:51 +02:00 · 2023-05-18 18:40:51 +02:00
395 changed files with 18998 additions and 12765 deletions
--- a/.gitlab-ci.d/cirrus/freebsd-13.vars
+++ b/.gitlab-ci.d/cirrus/freebsd-13.vars
@@ -11,6 +11,6 @@ MAKE='/usr/local/bin/gmake'
 NINJA='/usr/local/bin/ninja'
 PACKAGING_COMMAND='pkg'
 PIP3='/usr/local/bin/pip-3.8'
-PKGS='alsa-lib bash bison bzip2 ca_root_nss capstone4 ccache cdrkit-genisoimage cmocka ctags curl cyrus-sasl dbus diffutils dtc flex fusefs-libs3 gettext git glib gmake gnutls gsed gtk3 json-c libepoxy libffi libgcrypt libjpeg-turbo libnfs libslirp libspice-server libssh libtasn1 llvm lzo2 meson ncurses nettle ninja opencv pixman pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx py39-sphinx_rtd_theme py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy sndio socat spice-protocol tesseract usbredir virglrenderer vte3 zstd'
+PKGS='alsa-lib bash bison bzip2 ca_root_nss capstone4 ccache cmocka ctags curl cyrus-sasl dbus diffutils dtc flex fusefs-libs3 gettext git glib gmake gnutls gsed gtk3 json-c libepoxy libffi libgcrypt libjpeg-turbo libnfs libslirp libspice-server libssh libtasn1 llvm lzo2 meson mtools ncurses nettle ninja opencv pixman pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx py39-sphinx_rtd_theme py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy sndio socat spice-protocol tesseract usbredir virglrenderer vte3 xorriso zstd'
 PYPI_PKGS=''
 PYTHON='/usr/local/bin/python3'
--- a/.gitlab-ci.d/cirrus/macos-12.vars
+++ b/.gitlab-ci.d/cirrus/macos-12.vars
@@ -11,6 +11,6 @@ MAKE='/opt/homebrew/bin/gmake'
 NINJA='/opt/homebrew/bin/ninja'
 PACKAGING_COMMAND='brew'
 PIP3='/opt/homebrew/bin/pip3'
-PKGS='bash bc bison bzip2 capstone ccache cmocka ctags curl dbus diffutils dtc flex gcovr gettext git glib gnu-sed gnutls gtk+3 jemalloc jpeg-turbo json-c libepoxy libffi libgcrypt libiscsi libnfs libpng libslirp libssh libtasn1 libusb llvm lzo make meson ncurses nettle ninja pixman pkg-config python3 rpm2cpio sdl2 sdl2_image snappy socat sparse spice-protocol tesseract usbredir vde vte3 zlib zstd'
+PKGS='bash bc bison bzip2 capstone ccache cmocka ctags curl dbus diffutils dtc flex gcovr gettext git glib gnu-sed gnutls gtk+3 jemalloc jpeg-turbo json-c libepoxy libffi libgcrypt libiscsi libnfs libpng libslirp libssh libtasn1 libusb llvm lzo make meson mtools ncurses nettle ninja pixman pkg-config python3 rpm2cpio sdl2 sdl2_image snappy socat sparse spice-protocol tesseract usbredir vde vte3 xorriso zlib zstd'
 PYPI_PKGS='PyYAML numpy pillow sphinx sphinx-rtd-theme'
 PYTHON='/opt/homebrew/bin/python3'
--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch32.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch32.yml
@@ -1,6 +1,6 @@
 # All ubuntu-22.04 jobs should run successfully in an environment
 # setup by the scripts/ci/setup/qemu/build-environment.yml task
-# "Install basic packages to build QEMU on Ubuntu 20.04"
+# "Install basic packages to build QEMU on Ubuntu 22.04"

 ubuntu-22.04-aarch32-all:
 extends: .custom_runner_template
--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch64.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch64.yml
@@ -1,6 +1,6 @@
-# All ubuntu-20.04 jobs should run successfully in an environment
+# All ubuntu-22.04 jobs should run successfully in an environment
 # setup by the scripts/ci/setup/qemu/build-environment.yml task
-# "Install basic packages to build QEMU on Ubuntu 20.04"
+# "Install basic packages to build QEMU on Ubuntu 22.04"

 ubuntu-22.04-aarch64-all-linux-static:
 extends: .custom_runner_template
@@ -45,6 +45,28 @@ ubuntu-22.04-aarch64-all:
 - make --output-sync -j`nproc --ignore=40`
 - make --output-sync -j`nproc --ignore=40` check

+ubuntu-22.04-aarch64-without-defaults:
+ extends: .custom_runner_template
+ needs: []
+ stage: build
+ tags:
+ - ubuntu_22.04
+ - aarch64
+ rules:
+ - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/'
+   when: manual
+   allow_failure: true
+ - if: "$AARCH64_RUNNER_AVAILABLE"
+   when: manual
+   allow_failure: true
+ script:
+ - mkdir build
+ - cd build
+ - ../configure --disable-user --without-default-devices --without-default-features
+   || { cat config.log meson-logs/meson-log.txt; exit 1; }
+ - make --output-sync -j`nproc --ignore=40`
+ - make --output-sync -j`nproc --ignore=40` check
+
 ubuntu-22.04-aarch64-alldbg:
 extends: .custom_runner_template
 needs: []
@@ -123,7 +145,7 @@ ubuntu-22.04-aarch64-notcg:
 script:
 - mkdir build
 - cd build
- - ../configure --disable-tcg
+ - ../configure --disable-tcg --with-devices-aarch64=minimal
   || { cat config.log meson-logs/meson-log.txt; exit 1; }
 - make --output-sync -j`nproc --ignore=40`
 - make --output-sync -j`nproc --ignore=40` check
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -245,6 +245,7 @@ M: Xiaojuan Yang <yangxiaojuan@loongson.cn>
 S: Maintained
 F: target/loongarch/
 F: tests/tcg/loongarch64/
+F: tests/avocado/machine_loongarch.py

 M68K TCG CPUs
 M: Laurent Vivier <laurent@vivier.eu>
@@ -939,6 +940,7 @@ SBSA-REF
 M: Radoslaw Biernacki <rad@semihalf.com>
 M: Peter Maydell <peter.maydell@linaro.org>
 R: Leif Lindholm <quic_llindhol@quicinc.com>
+R: Marcin Juszkiewicz <marcin.juszkiewicz@linaro.org>
 L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/sbsa-ref.c
@@ -2566,7 +2568,7 @@ Core Audio framework backend
 M: Gerd Hoffmann <kraxel@redhat.com>
 M: Philippe Mathieu-Daudé <philmd@linaro.org>
 R: Christian Schoenebeck <qemu_oss@crudebyte.com>
-R: Akihiko Odaki <akihiko.odaki@gmail.com>
+R: Akihiko Odaki <akihiko.odaki@daynix.com>
 S: Odd Fixes
 F: audio/coreaudio.c

@@ -2850,7 +2852,7 @@ F: docs/devel/ui.rst
 Cocoa graphics
 M: Peter Maydell <peter.maydell@linaro.org>
 M: Philippe Mathieu-Daudé <philmd@linaro.org>
-R: Akihiko Odaki <akihiko.odaki@gmail.com>
+R: Akihiko Odaki <akihiko.odaki@daynix.com>
 S: Odd Fixes
 F: ui/cocoa.m

@@ -3910,6 +3912,16 @@ F: configure
 F: scripts/mtest2make.py
 F: tests/Makefile.include

+Kconfig
+M: Paolo Bonzini <pbonzini@redhat.com>
+S: Maintained
+F: scripts/minikconf.py
+F: docs/devel/kconfig.rst
+F: Kconfig*
+F: */Kconfig*
+F: hw/*/Kconfig*
+F: target/*/Kconfig*
+
 GIT submodules
 M: Daniel P. Berrange <berrange@redhat.com>
 S: Odd Fixes
--- a/accel/tcg/atomic_common.c.inc
+++ b/accel/tcg/atomic_common.c.inc
@@ -13,20 +13,20 @@
 * See the COPYING file in the top-level directory.
 */

-static void atomic_trace_rmw_post(CPUArchState *env, target_ulong addr,
+static void atomic_trace_rmw_post(CPUArchState *env, uint64_t addr,
                                  MemOpIdx oi)
 {
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_RW);
 }

 #if HAVE_ATOMIC128
-static void atomic_trace_ld_post(CPUArchState *env, target_ulong addr,
+static void atomic_trace_ld_post(CPUArchState *env, uint64_t addr,
                                 MemOpIdx oi)
 {
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
 }

-static void atomic_trace_st_post(CPUArchState *env, target_ulong addr,
+static void atomic_trace_st_post(CPUArchState *env, uint64_t addr,
                                 MemOpIdx oi)
 {
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
@@ -40,7 +40,7 @@ static void atomic_trace_st_post(CPUArchState *env, target_ulong addr,
 */

 #define CMPXCHG_HELPER(OP, TYPE) \
-    TYPE HELPER(atomic_##OP)(CPUArchState *env, target_ulong addr,  \
+    TYPE HELPER(atomic_##OP)(CPUArchState *env, uint64_t addr,      \
                             TYPE oldv, TYPE newv, uint32_t oi)     \
    { return cpu_atomic_##OP##_mmu(env, addr, oldv, newv, oi, GETPC()); }

@@ -62,7 +62,7 @@ CMPXCHG_HELPER(cmpxchgo_le, Int128)

 #undef CMPXCHG_HELPER

-Int128 HELPER(nonatomic_cmpxchgo_be)(CPUArchState *env, target_ulong addr,
+Int128 HELPER(nonatomic_cmpxchgo_be)(CPUArchState *env, uint64_t addr,
                                     Int128 cmpv, Int128 newv, uint32_t oi)
 {
 #if TCG_TARGET_REG_BITS == 32
@@ -82,7 +82,7 @@ Int128 HELPER(nonatomic_cmpxchgo_be)(CPUArchState *env, target_ulong addr,
 #endif
 }

-Int128 HELPER(nonatomic_cmpxchgo_le)(CPUArchState *env, target_ulong addr,
+Int128 HELPER(nonatomic_cmpxchgo_le)(CPUArchState *env, uint64_t addr,
                                     Int128 cmpv, Int128 newv, uint32_t oi)
 {
 #if TCG_TARGET_REG_BITS == 32
@@ -103,7 +103,7 @@ Int128 HELPER(nonatomic_cmpxchgo_le)(CPUArchState *env, target_ulong addr,
 }

 #define ATOMIC_HELPER(OP, TYPE) \
-    TYPE HELPER(glue(atomic_,OP))(CPUArchState *env, target_ulong addr,  \
+    TYPE HELPER(glue(atomic_,OP))(CPUArchState *env, uint64_t addr,  \
                                  TYPE val, uint32_t oi)                 \
    { return glue(glue(cpu_atomic_,OP),_mmu)(env, addr, val, oi, GETPC()); }

--- a/accel/tcg/cpu-exec-common.c
+++ b/accel/tcg/cpu-exec-common.c
@@ -22,6 +22,7 @@
 #include "sysemu/tcg.h"
 #include "exec/exec-all.h"
 #include "qemu/plugin.h"
+#include "internal.h"

 bool tcg_allowed;

@@ -81,6 +82,8 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)

 void cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc)
 {
+    /* Prevent looping if already executing in a serial context. */
+    g_assert(!cpu_in_serial_context(cpu));
    cpu->exception_index = EXCP_ATOMIC;
    cpu_loop_exit_restore(cpu, pc);
 }
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
--- a/accel/tcg/internal.h
+++ b/accel/tcg/internal.h
@@ -64,6 +64,15 @@ static inline target_ulong log_pc(CPUState *cpu, const TranslationBlock *tb)
    }
 }

+/*
+ * Return true if CS is not running in parallel with other cpus, either
+ * because there are no other cpus or we are within an exclusive context.
+ */
+static inline bool cpu_in_serial_context(CPUState *cs)
+{
+    return !(cs->tcg_cflags & CF_PARALLEL) || cpu_in_exclusive_context(cs);
+}
+
 extern int64_t max_delay;
 extern int64_t max_advance;

--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -92,27 +92,6 @@ void HELPER(plugin_vcpu_mem_cb)(unsigned int vcpu_index,
                                void *userdata)
 { }

-static void do_gen_mem_cb(TCGv vaddr, uint32_t info)
-{
-    TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
-    TCGv_i32 meminfo = tcg_temp_ebb_new_i32();
-    TCGv_i64 vaddr64 = tcg_temp_ebb_new_i64();
-    TCGv_ptr udata = tcg_temp_ebb_new_ptr();
-
-    tcg_gen_movi_i32(meminfo, info);
-    tcg_gen_movi_ptr(udata, 0);
-    tcg_gen_ld_i32(cpu_index, cpu_env,
-                   -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
-    tcg_gen_extu_tl_i64(vaddr64, vaddr);
-
-    gen_helper_plugin_vcpu_mem_cb(cpu_index, meminfo, vaddr64, udata);
-
-    tcg_temp_free_ptr(udata);
-    tcg_temp_free_i64(vaddr64);
-    tcg_temp_free_i32(meminfo);
-    tcg_temp_free_i32(cpu_index);
-}
-
 static void gen_empty_udata_cb(void)
 {
    TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
@@ -145,9 +124,22 @@ static void gen_empty_inline_cb(void)
    tcg_temp_free_i64(val);
 }

-static void gen_empty_mem_cb(TCGv addr, uint32_t info)
+static void gen_empty_mem_cb(TCGv_i64 addr, uint32_t info)
 {
-    do_gen_mem_cb(addr, info);
+    TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
+    TCGv_i32 meminfo = tcg_temp_ebb_new_i32();
+    TCGv_ptr udata = tcg_temp_ebb_new_ptr();
+
+    tcg_gen_movi_i32(meminfo, info);
+    tcg_gen_movi_ptr(udata, 0);
+    tcg_gen_ld_i32(cpu_index, cpu_env,
+                   -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
+
+    gen_helper_plugin_vcpu_mem_cb(cpu_index, meminfo, addr, udata);
+
+    tcg_temp_free_ptr(udata);
+    tcg_temp_free_i32(meminfo);
+    tcg_temp_free_i32(cpu_index);
 }

 /*
@@ -202,35 +194,17 @@ static void plugin_gen_empty_callback(enum plugin_gen_from from)
    }
 }

-union mem_gen_fn {
-    void (*mem_fn)(TCGv, uint32_t);
-    void (*inline_fn)(void);
-};
-
-static void gen_mem_wrapped(enum plugin_gen_cb type,
-                            const union mem_gen_fn *f, TCGv addr,
-                            uint32_t info, bool is_mem)
+void plugin_gen_empty_mem_callback(TCGv_i64 addr, uint32_t info)
 {
    enum qemu_plugin_mem_rw rw = get_plugin_meminfo_rw(info);

-    gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, type, rw);
-    if (is_mem) {
-        f->mem_fn(addr, info);
-    } else {
-        f->inline_fn();
-    }
+    gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, PLUGIN_GEN_CB_MEM, rw);
+    gen_empty_mem_cb(addr, info);
    tcg_gen_plugin_cb_end();
-}

-void plugin_gen_empty_mem_callback(TCGv addr, uint32_t info)
-{
-    union mem_gen_fn fn;
-
-    fn.mem_fn = gen_empty_mem_cb;
-    gen_mem_wrapped(PLUGIN_GEN_CB_MEM, &fn, addr, info, true);
-
-    fn.inline_fn = gen_empty_inline_cb;
-    gen_mem_wrapped(PLUGIN_GEN_CB_INLINE, &fn, 0, info, false);
+    gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, PLUGIN_GEN_CB_INLINE, rw);
+    gen_empty_inline_cb();
+    tcg_gen_plugin_cb_end();
 }

 static TCGOp *find_op(TCGOp *op, TCGOpcode opc)
--- a/accel/tcg/tb-maint.c
+++ b/accel/tcg/tb-maint.c
@@ -746,7 +746,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)

    tcg_region_reset_all();
    /* XXX: flush processor icache at this point if cache flush is expensive */
-    qatomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1);
+    qatomic_inc(&tb_ctx.tb_flush_count);

 done:
    mmap_unlock();
@@ -758,9 +758,9 @@ done:
 void tb_flush(CPUState *cpu)
 {
    if (tcg_enabled()) {
-        unsigned tb_flush_count = qatomic_mb_read(&tb_ctx.tb_flush_count);
+        unsigned tb_flush_count = qatomic_read(&tb_ctx.tb_flush_count);

-        if (cpu_in_exclusive_context(cpu)) {
+        if (cpu_in_serial_context(cpu)) {
            do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count));
        } else {
            async_safe_run_on_cpu(cpu, do_tb_flush,
--- a/accel/tcg/tcg-accel-ops-icount.c
+++ b/accel/tcg/tcg-accel-ops-icount.c
@@ -89,7 +89,20 @@ void icount_handle_deadline(void)
    }
 }

-void icount_prepare_for_run(CPUState *cpu)
+/* Distribute the budget evenly across all CPUs */
+int64_t icount_percpu_budget(int cpu_count)
+{
+    int64_t limit = icount_get_limit();
+    int64_t timeslice = limit / cpu_count;
+
+    if (timeslice == 0) {
+        timeslice = limit;
+    }
+
+    return timeslice;
+}
+
+void icount_prepare_for_run(CPUState *cpu, int64_t cpu_budget)
 {
    int insns_left;

@@ -101,13 +114,13 @@ void icount_prepare_for_run(CPUState *cpu)
    g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
    g_assert(cpu->icount_extra == 0);

-    cpu->icount_budget = icount_get_limit();
+    replay_mutex_lock();
+
+    cpu->icount_budget = MIN(icount_get_limit(), cpu_budget);
    insns_left = MIN(0xffff, cpu->icount_budget);
    cpu_neg(cpu)->icount_decr.u16.low = insns_left;
    cpu->icount_extra = cpu->icount_budget - insns_left;

-    replay_mutex_lock();
-
    if (cpu->icount_budget == 0) {
        /*
         * We're called without the iothread lock, so must take it while
--- a/accel/tcg/tcg-accel-ops-icount.h
+++ b/accel/tcg/tcg-accel-ops-icount.h
@@ -11,7 +11,8 @@
 #define TCG_ACCEL_OPS_ICOUNT_H

 void icount_handle_deadline(void);
-void icount_prepare_for_run(CPUState *cpu);
+void icount_prepare_for_run(CPUState *cpu, int64_t cpu_budget);
+int64_t icount_percpu_budget(int cpu_count);
 void icount_process_data(CPUState *cpu);

 void icount_handle_interrupt(CPUState *cpu, int mask);
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -24,6 +24,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/lockable.h"
 #include "sysemu/tcg.h"
 #include "sysemu/replay.h"
 #include "sysemu/cpu-timers.h"
@@ -139,6 +140,33 @@ static void rr_force_rcu(Notifier *notify, void *data)
    rr_kick_next_cpu();
 }

+/*
+ * Calculate the number of CPUs that we will process in a single iteration of
+ * the main CPU thread loop so that we can fairly distribute the instruction
+ * count across CPUs.
+ *
+ * The CPU count is cached based on the CPU list generation ID to avoid
+ * iterating the list every time.
+ */
+static int rr_cpu_count(void)
+{
+    static unsigned int last_gen_id = ~0;
+    static int cpu_count;
+    CPUState *cpu;
+
+    QEMU_LOCK_GUARD(&qemu_cpu_list_lock);
+
+    if (cpu_list_generation_id_get() != last_gen_id) {
+        cpu_count = 0;
+        CPU_FOREACH(cpu) {
+            ++cpu_count;
+        }
+        last_gen_id = cpu_list_generation_id_get();
+    }
+
+    return cpu_count;
+}
+
 /*
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
@@ -185,11 +213,16 @@ static void *rr_cpu_thread_fn(void *arg)
    cpu->exit_request = 1;

    while (1) {
+        /* Only used for icount_enabled() */
+        int64_t cpu_budget = 0;
+
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();

        if (icount_enabled()) {
+            int cpu_count = rr_cpu_count();
+
            /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
            icount_account_warp_timer();
            /*
@@ -197,6 +230,8 @@ static void *rr_cpu_thread_fn(void *arg)
             * waking up the I/O thread and waiting for completion.
             */
            icount_handle_deadline();
+
+            cpu_budget = icount_percpu_budget(cpu_count);
        }

        replay_mutex_unlock();
@@ -218,7 +253,7 @@ static void *rr_cpu_thread_fn(void *arg)

                qemu_mutex_unlock_iothread();
                if (icount_enabled()) {
-                    icount_prepare_for_run(cpu);
+                    icount_prepare_for_run(cpu, cpu_budget);
                }
                r = tcg_cpus_exec(cpu);
                if (icount_enabled()) {
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -39,62 +39,65 @@ DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
 DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr)
 #endif /* IN_HELPER_PROTO */

+DEF_HELPER_FLAGS_3(ld_i128, TCG_CALL_NO_WG, i128, env, i64, i32)
+DEF_HELPER_FLAGS_4(st_i128, TCG_CALL_NO_WG, void, env, i64, i128, i32)
+
 DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgw_le, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgl_be, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgl_le, TCG_CALL_NO_WG,
-                   i32, env, tl, i32, i32, i32)
+                   i32, env, i64, i32, i32, i32)
 #ifdef CONFIG_ATOMIC64
 DEF_HELPER_FLAGS_5(atomic_cmpxchgq_be, TCG_CALL_NO_WG,
-                   i64, env, tl, i64, i64, i32)
+                   i64, env, i64, i64, i64, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgq_le, TCG_CALL_NO_WG,
-                   i64, env, tl, i64, i64, i32)
+                   i64, env, i64, i64, i64, i32)
 #endif
 #ifdef CONFIG_CMPXCHG128
 DEF_HELPER_FLAGS_5(atomic_cmpxchgo_be, TCG_CALL_NO_WG,
-                   i128, env, tl, i128, i128, i32)
+                   i128, env, i64, i128, i128, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgo_le, TCG_CALL_NO_WG,
-                   i128, env, tl, i128, i128, i32)
+                   i128, env, i64, i128, i128, i32)
 #endif

 DEF_HELPER_FLAGS_5(nonatomic_cmpxchgo_be, TCG_CALL_NO_WG,
-                   i128, env, tl, i128, i128, i32)
+                   i128, env, i64, i128, i128, i32)
 DEF_HELPER_FLAGS_5(nonatomic_cmpxchgo_le, TCG_CALL_NO_WG,
-                   i128, env, tl, i128, i128, i32)
+                   i128, env, i64, i128, i128, i32)

 #ifdef CONFIG_ATOMIC64
 #define GEN_ATOMIC_HELPERS(NAME)                                  \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b),              \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_le),           \
-                       TCG_CALL_NO_WG, i64, env, tl, i64, i32)    \
+                       TCG_CALL_NO_WG, i64, env, i64, i64, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_be),           \
-                       TCG_CALL_NO_WG, i64, env, tl, i64, i32)
+                       TCG_CALL_NO_WG, i64, env, i64, i64, i32)
 #else
 #define GEN_ATOMIC_HELPERS(NAME)                                  \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b),              \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)    \
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)   \
    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be),           \
-                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)
+                       TCG_CALL_NO_WG, i32, env, i64, i32, i32)
 #endif /* CONFIG_ATOMIC64 */

 GEN_ATOMIC_HELPERS(fetch_add)
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -72,9 +72,11 @@ QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >

 TBContext tb_ctx;

-/* Encode VAL as a signed leb128 sequence at P.
-   Return P incremented past the encoded value.  */
-static uint8_t *encode_sleb128(uint8_t *p, target_long val)
+/*
+ * Encode VAL as a signed leb128 sequence at P.
+ * Return P incremented past the encoded value.
+ */
+static uint8_t *encode_sleb128(uint8_t *p, int64_t val)
 {
    int more, byte;

@@ -92,21 +94,23 @@ static uint8_t *encode_sleb128(uint8_t *p, target_long val)
    return p;
 }

-/* Decode a signed leb128 sequence at *PP; increment *PP past the
-   decoded value.  Return the decoded value.  */
-static target_long decode_sleb128(const uint8_t **pp)
+/*
+ * Decode a signed leb128 sequence at *PP; increment *PP past the
+ * decoded value.  Return the decoded value.
+ */
+static int64_t decode_sleb128(const uint8_t **pp)
 {
    const uint8_t *p = *pp;
-    target_long val = 0;
+    int64_t val = 0;
    int byte, shift = 0;

    do {
        byte = *p++;
-        val |= (target_ulong)(byte & 0x7f) << shift;
+        val |= (int64_t)(byte & 0x7f) << shift;
        shift += 7;
    } while (byte & 0x80);
    if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
-        val |= -(target_ulong)1 << shift;
+        val |= -(int64_t)1 << shift;
    }

    *pp = p;
@@ -132,7 +136,7 @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
    int i, j, n;

    for (i = 0, n = tb->icount; i < n; ++i) {
-        target_ulong prev;
+        uint64_t prev;

        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
            if (i == 0) {
@@ -352,6 +356,13 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
    tb_set_page_addr0(tb, phys_pc);
    tb_set_page_addr1(tb, -1);
    tcg_ctx->gen_tb = tb;
+    tcg_ctx->addr_type = TCG_TYPE_TL;
+#ifdef CONFIG_SOFTMMU
+    tcg_ctx->page_bits = TARGET_PAGE_BITS;
+    tcg_ctx->page_mask = TARGET_PAGE_MASK;
+    tcg_ctx->tlb_dyn_max_bits = CPU_TLB_DYN_MAX_BITS;
+#endif
+
 tb_overflow:

 #ifdef CONFIG_PROFILER
@@ -444,7 +455,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
            /* Dump header and the first instruction */
            fprintf(logfile, "OUT: [size=%d]\n", gen_code_size);
            fprintf(logfile,
-                    "  -- guest addr 0x" TARGET_FMT_lx " + tb prologue\n",
+                    "  -- guest addr 0x%016" PRIx64 " + tb prologue\n",
                    tcg_ctx->gen_insn_data[insn][0]);
            chunk_start = tcg_ctx->gen_insn_end_off[insn];
            disas(logfile, tb->tc.ptr, chunk_start);
@@ -457,7 +468,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
            while (insn < tb->icount) {
                size_t chunk_end = tcg_ctx->gen_insn_end_off[insn];
                if (chunk_end > chunk_start) {
-                    fprintf(logfile, "  -- guest addr 0x" TARGET_FMT_lx "\n",
+                    fprintf(logfile, "  -- guest addr 0x%016" PRIx64 "\n",
                            tcg_ctx->gen_insn_data[insn][0]);
                    disas(logfile, tb->tc.ptr + chunk_start,
                          chunk_end - chunk_start);
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -889,35 +889,9 @@ void page_reset_target_data(target_ulong start, target_ulong last) { }

 /* The softmmu versions of these helpers are in cputlb.c.  */

-/*
- * Verify that we have passed the correct MemOp to the correct function.
- *
- * We could present one function to target code, and dispatch based on
- * the MemOp, but so far we have worked hard to avoid an indirect function
- * call along the memory path.
- */
-static void validate_memop(MemOpIdx oi, MemOp expected)
+static void *cpu_mmu_lookup(CPUArchState *env, abi_ptr addr,
+                            MemOp mop, uintptr_t ra, MMUAccessType type)
 {
-#ifdef CONFIG_DEBUG_TCG
-    MemOp have = get_memop(oi) & (MO_SIZE | MO_BSWAP);
-    assert(have == expected);
-#endif
-}
-
-void helper_unaligned_ld(CPUArchState *env, target_ulong addr)
-{
-    cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_LOAD, GETPC());
-}
-
-void helper_unaligned_st(CPUArchState *env, target_ulong addr)
-{
-    cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_STORE, GETPC());
-}
-
-static void *cpu_mmu_lookup(CPUArchState *env, target_ulong addr,
-                            MemOpIdx oi, uintptr_t ra, MMUAccessType type)
-{
-    MemOp mop = get_memop(oi);
    int a_bits = get_alignment_bits(mop);
    void *ret;

@@ -931,116 +905,251 @@ static void *cpu_mmu_lookup(CPUArchState *env, target_ulong addr,
    return ret;
 }

-uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr,
-                    MemOpIdx oi, uintptr_t ra)
+#include "ldst_atomicity.c.inc"
+
+static uint8_t do_ld1_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOp mop, uintptr_t ra)
 {
    void *haddr;
    uint8_t ret;

-    validate_memop(oi, MO_UB);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
+    tcg_debug_assert((mop & MO_SIZE) == MO_8);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
    ret = ldub_p(haddr);
    clear_helper_retaddr();
+    return ret;
+}
+
+tcg_target_ulong helper_ldub_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    return do_ld1_mmu(env, addr, get_memop(oi), ra);
+}
+
+tcg_target_ulong helper_ldsb_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    return (int8_t)do_ld1_mmu(env, addr, get_memop(oi), ra);
+}
+
+uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    uint8_t ret = do_ld1_mmu(env, addr, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
    return ret;
 }

+static uint16_t do_ld2_he_mmu(CPUArchState *env, abi_ptr addr,
+                              MemOp mop, uintptr_t ra)
+{
+    void *haddr;
+    uint16_t ret;
+
+    tcg_debug_assert((mop & MO_SIZE) == MO_16);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
+    ret = load_atom_2(env, ra, haddr, mop);
+    clear_helper_retaddr();
+    return ret;
+}
+
+tcg_target_ulong helper_lduw_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    uint16_t ret = do_ld2_he_mmu(env, addr, mop, ra);
+
+    if (mop & MO_BSWAP) {
+        ret = bswap16(ret);
+    }
+    return ret;
+}
+
+tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    int16_t ret = do_ld2_he_mmu(env, addr, mop, ra);
+
+    if (mop & MO_BSWAP) {
+        ret = bswap16(ret);
+    }
+    return ret;
+}
+
 uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr,
                        MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    MemOp mop = get_memop(oi);
    uint16_t ret;

-    validate_memop(oi, MO_BEUW);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = lduw_be_p(haddr);
-    clear_helper_retaddr();
+    tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
+    ret = do_ld2_he_mmu(env, addr, mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+    return cpu_to_be16(ret);
+}
+
+uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr,
+                        MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    uint16_t ret;
+
+    tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
+    ret = do_ld2_he_mmu(env, addr, mop, ra);
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+    return cpu_to_le16(ret);
+}
+
+static uint32_t do_ld4_he_mmu(CPUArchState *env, abi_ptr addr,
+                              MemOp mop, uintptr_t ra)
+{
+    void *haddr;
+    uint32_t ret;
+
+    tcg_debug_assert((mop & MO_SIZE) == MO_32);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
+    ret = load_atom_4(env, ra, haddr, mop);
+    clear_helper_retaddr();
+    return ret;
+}
+
+tcg_target_ulong helper_ldul_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    uint32_t ret = do_ld4_he_mmu(env, addr, mop, ra);
+
+    if (mop & MO_BSWAP) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
+
+tcg_target_ulong helper_ldsl_mmu(CPUArchState *env, uint64_t addr,
+                                 MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    int32_t ret = do_ld4_he_mmu(env, addr, mop, ra);
+
+    if (mop & MO_BSWAP) {
+        ret = bswap32(ret);
+    }
    return ret;
 }

 uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr,
                        MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    MemOp mop = get_memop(oi);
    uint32_t ret;

-    validate_memop(oi, MO_BEUL);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldl_be_p(haddr);
-    clear_helper_retaddr();
+    tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
+    ret = do_ld4_he_mmu(env, addr, mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+    return cpu_to_be32(ret);
+}
+
+uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr,
+                        MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    uint32_t ret;
+
+    tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
+    ret = do_ld4_he_mmu(env, addr, mop, ra);
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+    return cpu_to_le32(ret);
+}
+
+static uint64_t do_ld8_he_mmu(CPUArchState *env, abi_ptr addr,
+                              MemOp mop, uintptr_t ra)
+{
+    void *haddr;
+    uint64_t ret;
+
+    tcg_debug_assert((mop & MO_SIZE) == MO_64);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
+    ret = load_atom_8(env, ra, haddr, mop);
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr,
+                        MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    uint64_t ret = do_ld8_he_mmu(env, addr, mop, ra);
+
+    if (mop & MO_BSWAP) {
+        ret = bswap64(ret);
+    }
    return ret;
 }

 uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr,
                        MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    MemOp mop = get_memop(oi);
    uint64_t ret;

-    validate_memop(oi, MO_BEUQ);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldq_be_p(haddr);
-    clear_helper_retaddr();
+    tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
+    ret = do_ld8_he_mmu(env, addr, mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
-}
-
-uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr,
-                        MemOpIdx oi, uintptr_t ra)
-{
-    void *haddr;
-    uint16_t ret;
-
-    validate_memop(oi, MO_LEUW);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = lduw_le_p(haddr);
-    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
-}
-
-uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr,
-                        MemOpIdx oi, uintptr_t ra)
-{
-    void *haddr;
-    uint32_t ret;
-
-    validate_memop(oi, MO_LEUL);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldl_le_p(haddr);
-    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
+    return cpu_to_be64(ret);
 }

 uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
                        MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    MemOp mop = get_memop(oi);
    uint64_t ret;

-    validate_memop(oi, MO_LEUQ);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldq_le_p(haddr);
-    clear_helper_retaddr();
+    tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
+    ret = do_ld8_he_mmu(env, addr, mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+    return cpu_to_le64(ret);
+}
+
+static Int128 do_ld16_he_mmu(CPUArchState *env, abi_ptr addr,
+                             MemOp mop, uintptr_t ra)
+{
+    void *haddr;
+    Int128 ret;
+
+    tcg_debug_assert((mop & MO_SIZE) == MO_128);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
+    ret = load_atom_16(env, ra, haddr, mop);
+    clear_helper_retaddr();
    return ret;
 }

+Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
+                       MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    Int128 ret = do_ld16_he_mmu(env, addr, mop, ra);
+
+    if (mop & MO_BSWAP) {
+        ret = bswap128(ret);
+    }
+    return ret;
+}
+
+Int128 helper_ld_i128(CPUArchState *env, uint64_t addr, MemOpIdx oi)
+{
+    return helper_ld16_mmu(env, addr, oi, GETPC());
+}
+
 Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
                       MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    MemOp mop = get_memop(oi);
    Int128 ret;

-    validate_memop(oi, MO_128 | MO_BE);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    memcpy(&ret, haddr, 16);
-    clear_helper_retaddr();
+    tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
+    ret = do_ld16_he_mmu(env, addr, mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-
    if (!HOST_BIG_ENDIAN) {
        ret = bswap128(ret);
    }
@@ -1050,132 +1159,218 @@ Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
 Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
                       MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    MemOp mop = get_memop(oi);
    Int128 ret;

-    validate_memop(oi, MO_128 | MO_LE);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    memcpy(&ret, haddr, 16);
-    clear_helper_retaddr();
+    tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
+    ret = do_ld16_he_mmu(env, addr, mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-
    if (HOST_BIG_ENDIAN) {
        ret = bswap128(ret);
    }
    return ret;
 }

-void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
-                 MemOpIdx oi, uintptr_t ra)
+static void do_st1_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
+                       MemOp mop, uintptr_t ra)
 {
    void *haddr;

-    validate_memop(oi, MO_UB);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
+    tcg_debug_assert((mop & MO_SIZE) == MO_8);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
    stb_p(haddr, val);
    clear_helper_retaddr();
+}
+
+void helper_stb_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    do_st1_mmu(env, addr, val, get_memop(oi), ra);
+}
+
+void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
+                 MemOpIdx oi, uintptr_t ra)
+{
+    do_st1_mmu(env, addr, val, get_memop(oi), ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

+static void do_st2_he_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
+                          MemOp mop, uintptr_t ra)
+{
+    void *haddr;
+
+    tcg_debug_assert((mop & MO_SIZE) == MO_16);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
+    store_atom_2(env, ra, haddr, mop, val);
+    clear_helper_retaddr();
+}
+
+void helper_stw_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+
+    if (mop & MO_BSWAP) {
+        val = bswap16(val);
+    }
+    do_st2_he_mmu(env, addr, val, mop, ra);
+}
+
 void cpu_stw_be_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
                    MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    MemOp mop = get_memop(oi);

-    validate_memop(oi, MO_BEUW);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stw_be_p(haddr, val);
-    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
-}
-
-void cpu_stl_be_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
-                    MemOpIdx oi, uintptr_t ra)
-{
-    void *haddr;
-
-    validate_memop(oi, MO_BEUL);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stl_be_p(haddr, val);
-    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
-}
-
-void cpu_stq_be_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
-                    MemOpIdx oi, uintptr_t ra)
-{
-    void *haddr;
-
-    validate_memop(oi, MO_BEUQ);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stq_be_p(haddr, val);
-    clear_helper_retaddr();
+    tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
+    do_st2_he_mmu(env, addr, be16_to_cpu(val), mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

 void cpu_stw_le_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
                    MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+
+    tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
+    do_st2_he_mmu(env, addr, le16_to_cpu(val), mop, ra);
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+}
+
+static void do_st4_he_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
+                          MemOp mop, uintptr_t ra)
 {
    void *haddr;

-    validate_memop(oi, MO_LEUW);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stw_le_p(haddr, val);
+    tcg_debug_assert((mop & MO_SIZE) == MO_32);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
+    store_atom_4(env, ra, haddr, mop, val);
    clear_helper_retaddr();
+}
+
+void helper_stl_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+
+    if (mop & MO_BSWAP) {
+        val = bswap32(val);
+    }
+    do_st4_he_mmu(env, addr, val, mop, ra);
+}
+
+void cpu_stl_be_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+
+    tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
+    do_st4_he_mmu(env, addr, be32_to_cpu(val), mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

 void cpu_stl_le_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
                    MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+
+    tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
+    do_st4_he_mmu(env, addr, le32_to_cpu(val), mop, ra);
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+}
+
+static void do_st8_he_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
+                          MemOp mop, uintptr_t ra)
 {
    void *haddr;

-    validate_memop(oi, MO_LEUL);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stl_le_p(haddr, val);
+    tcg_debug_assert((mop & MO_SIZE) == MO_64);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
+    store_atom_8(env, ra, haddr, mop, val);
    clear_helper_retaddr();
+}
+
+void helper_stq_mmu(CPUArchState *env, uint64_t addr, uint64_t val,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+
+    if (mop & MO_BSWAP) {
+        val = bswap64(val);
+    }
+    do_st8_he_mmu(env, addr, val, mop, ra);
+}
+
+void cpu_stq_be_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
+                    MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+
+    tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
+    do_st8_he_mmu(env, addr, cpu_to_be64(val), mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

 void cpu_stq_le_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
                    MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+
+    tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
+    do_st8_he_mmu(env, addr, cpu_to_le64(val), mop, ra);
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+}
+
+static void do_st16_he_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
+                           MemOp mop, uintptr_t ra)
 {
    void *haddr;

-    validate_memop(oi, MO_LEUQ);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
-    stq_le_p(haddr, val);
+    tcg_debug_assert((mop & MO_SIZE) == MO_128);
+    haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
+    store_atom_16(env, ra, haddr, mop, val);
    clear_helper_retaddr();
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+}
+
+void helper_st16_mmu(CPUArchState *env, uint64_t addr, Int128 val,
+                     MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+
+    if (mop & MO_BSWAP) {
+        val = bswap128(val);
+    }
+    do_st16_he_mmu(env, addr, val, mop, ra);
+}
+
+void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi)
+{
+    helper_st16_mmu(env, addr, val, oi, GETPC());
 }

 void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr,
                     Int128 val, MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    MemOp mop = get_memop(oi);

-    validate_memop(oi, MO_128 | MO_BE);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
+    tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
    if (!HOST_BIG_ENDIAN) {
        val = bswap128(val);
    }
-    memcpy(haddr, &val, 16);
-    clear_helper_retaddr();
+    do_st16_he_mmu(env, addr, val, mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

 void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr,
                     Int128 val, MemOpIdx oi, uintptr_t ra)
 {
-    void *haddr;
+    MemOp mop = get_memop(oi);

-    validate_memop(oi, MO_128 | MO_LE);
-    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
+    tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
    if (HOST_BIG_ENDIAN) {
        val = bswap128(val);
    }
-    memcpy(haddr, &val, 16);
-    clear_helper_retaddr();
+    do_st16_he_mmu(env, addr, val, mop, ra);
    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }

@@ -1267,7 +1462,6 @@ uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
    void *haddr;
    uint64_t ret;

-    validate_memop(oi, MO_BEUQ);
    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
    ret = ldq_p(haddr);
    clear_helper_retaddr();
--- a/block.c
+++ b/block.c
@@ -680,7 +680,7 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,

    ret = 0;
 out:
-    blk_unref(blk);
+    blk_co_unref(blk);
    return ret;
 }

@@ -1610,9 +1610,9 @@ out:
 * bdrv_refresh_total_sectors() which polls when called from non-coroutine
 * context.
 */
-static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
-                            const char *node_name, QDict *options,
-                            int open_flags, Error **errp)
+static int no_coroutine_fn GRAPH_UNLOCKED
+bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
+                 QDict *options, int open_flags, Error **errp)
 {
    Error *local_err = NULL;
    int i, ret;
@@ -1667,7 +1667,10 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
        return ret;
    }

+    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs, NULL, &local_err);
+    bdrv_graph_rdunlock_main_loop();
+
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
@@ -3419,7 +3422,9 @@ static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
    }

 out:
+    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(parent_bs, tran, NULL);
+    bdrv_graph_rdunlock_main_loop();

    return 0;
 }
@@ -4917,7 +4922,9 @@ static void bdrv_reopen_commit(BDRVReopenState *reopen_state)
    qdict_del(bs->explicit_options, "backing");
    qdict_del(bs->options, "backing");

+    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs, NULL, NULL);
+    bdrv_graph_rdunlock_main_loop();
    bdrv_refresh_total_sectors(bs, bs->total_sectors);
 }

@@ -5316,7 +5323,9 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
 out:
    tran_finalize(tran, ret);

+    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs_top, NULL, NULL);
+    bdrv_graph_rdunlock_main_loop();

    if (new_context && old_context != new_context) {
        aio_context_release(new_context);
@@ -5750,7 +5759,8 @@ exit:
 * sums the size of all data-bearing children.  (This excludes backing
 * children.)
 */
-static int64_t coroutine_fn bdrv_sum_allocated_file_size(BlockDriverState *bs)
+static int64_t coroutine_fn GRAPH_RDLOCK
+bdrv_sum_allocated_file_size(BlockDriverState *bs)
 {
    BdrvChild *child;
    int64_t child_size, sum = 0;
@@ -5778,6 +5788,7 @@ int64_t coroutine_fn bdrv_co_get_allocated_file_size(BlockDriverState *bs)
 {
    BlockDriver *drv = bs->drv;
    IO_CODE();
+    assert_bdrv_graph_readable();

    if (!drv) {
        return -ENOMEDIUM;
@@ -6347,6 +6358,8 @@ int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    int ret;
    BlockDriver *drv = bs->drv;
    IO_CODE();
+    assert_bdrv_graph_readable();
+
    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
    if (!drv) {
        return -ENOMEDIUM;
@@ -6395,6 +6408,8 @@ BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
 void coroutine_fn bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent event)
 {
    IO_CODE();
+    assert_bdrv_graph_readable();
+
    if (!bs || !bs->drv || !bs->drv->bdrv_co_debug_event) {
        return;
    }
@@ -7967,6 +7982,25 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
        return;
    }

+    /*
+     * Non-zoned block drivers do not follow zoned storage constraints
+     * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
+     * drivers in a graph.
+     */
+    if (!parent_bs->drv->supports_zoned_children &&
+        child_bs->bl.zoned == BLK_Z_HM) {
+        /*
+         * The host-aware model allows both zoned storage constraints and
+         * random writes, so host-aware and non-zoned drivers may be mixed:
+         * a host-aware device is then used as a regular device.
+         */
+        error_setg(errp, "Cannot add a %s child to a %s parent",
+                   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
+                   parent_bs->drv->supports_zoned_children ?
+                   "support zoned children" : "not support zoned children");
+        return;
+    }
+
    if (!QLIST_EMPTY(&child_bs->parents)) {
        error_setg(errp, "The node %s already has a parent",
                   child_bs->node_name);
--- a/block/amend.c
+++ b/block/amend.c
@@ -46,6 +46,7 @@ static int coroutine_fn blockdev_amend_run(Job *job, Error **errp)
 {
    BlockdevAmendJob *s = container_of(job, BlockdevAmendJob, common);
    int ret;
+    GRAPH_RDLOCK_GUARD();

    job_progress_set_remaining(&s->common, 1);
    ret = s->bs->drv->bdrv_co_amend(s->bs, s->opts, s->force, errp);
@@ -54,7 +55,8 @@ static int coroutine_fn blockdev_amend_run(Job *job, Error **errp)
    return ret;
 }

-static int blockdev_amend_pre_run(BlockdevAmendJob *s, Error **errp)
+static int GRAPH_RDLOCK
+blockdev_amend_pre_run(BlockdevAmendJob *s, Error **errp)
 {
    if (s->bs->drv->bdrv_amend_pre_run) {
        return s->bs->drv->bdrv_amend_pre_run(s->bs, errp);
@@ -67,9 +69,11 @@ static void blockdev_amend_free(Job *job)
 {
    BlockdevAmendJob *s = container_of(job, BlockdevAmendJob, common);

+    bdrv_graph_rdlock_main_loop();
    if (s->bs->drv->bdrv_amend_clean) {
        s->bs->drv->bdrv_amend_clean(s->bs);
    }
+    bdrv_graph_rdunlock_main_loop();

    bdrv_unref(s->bs);
 }
@@ -93,6 +97,8 @@ void qmp_x_blockdev_amend(const char *job_id,
    BlockDriver *drv = bdrv_find_format(fmt);
    BlockDriverState *bs;

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    bs = bdrv_lookup_bs(NULL, node_name, errp);
    if (!bs) {
        return;
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -265,8 +265,9 @@ static int coroutine_fn GRAPH_RDLOCK blkverify_co_flush(BlockDriverState *bs)
    return bdrv_co_flush(s->test_file->bs);
 }

-static bool blkverify_recurse_can_replace(BlockDriverState *bs,
-                                          BlockDriverState *to_replace)
+static bool GRAPH_RDLOCK
+blkverify_recurse_can_replace(BlockDriverState *bs,
+                              BlockDriverState *to_replace)
 {
    BDRVBlkverifyState *s = bs->opaque;

--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1845,6 +1845,204 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
    return ret;
 }

+static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
+                                   (unsigned int*)(uintptr_t)acb->bytes,
+                                   rwco->iobuf);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
+                                unsigned int *nr_zones,
+                                BlockZoneDescriptor *zones,
+                                BlockCompletionFunc *cb, void *opaque)
+{
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+    /* Run blk_co_zone_report() in a coroutine on blk's AioContext. */
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk    = blk,
+        .offset = offset,
+        .iobuf  = zones,            /* passed on as the zones argument */
+        .ret    = NOT_DONE,
+    };
+    acb->bytes = (int64_t)(uintptr_t)nr_zones; /* entry casts it back */
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
+    aio_co_enter(blk_get_aio_context(blk), co);
+    /* Schedule completion from a BH if the coroutine already set ret. */
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_mgmt(rwco->blk,
+                                 (BlockZoneOp)(uintptr_t)rwco->iobuf,
+                                 rwco->offset, acb->bytes);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                              int64_t offset, int64_t len,
+                              BlockCompletionFunc *cb, void *opaque) {
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk    = blk,
+        .offset = offset,
+        .iobuf  = (void *)(uintptr_t)op,
+        .ret    = NOT_DONE,
+    };
+    acb->bytes = len;
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
+    aio_co_enter(blk_get_aio_context(blk), co);
+
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes,
+                                   rwco->iobuf, rwco->flags);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque) {
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk    = blk,
+        .ret    = NOT_DONE,
+        .flags  = flags,
+        .iobuf  = qiov,
+    };
+    acb->bytes = (int64_t)(uintptr_t)offset;
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+    aio_co_enter(blk_get_aio_context(blk), co);
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones is in/out: maximum number of zones in, actual number out.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    unsigned int *nr_zones,
+                                    BlockZoneDescriptor *zones)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk); /* increase before waiting */
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Send a zone_management command.
+ * op is the zone operation;
+ * offset is the byte offset from the start of the zoned device;
+ * len is the maximum number of bytes the command should operate on. It
+ * should be aligned with the device zone size.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+        int64_t offset, int64_t len)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        blk_dec_in_flight(blk);
+        return ret;
+    }
+
+    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+        QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+
+    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
    BlockDriverState *bs = blk_bs(blk);
@@ -2024,7 +2222,15 @@ void blk_activate(BlockBackend *blk, Error **errp)
        return;
    }

-    bdrv_activate(bs, errp);
+    /*
+     * Migration code can call this function in coroutine context, so leave
+     * coroutine context if necessary.
+     */
+    if (qemu_in_coroutine()) {
+        bdrv_co_activate(bs, errp);
+    } else {
+        bdrv_activate(bs, errp);
+    }
 }

 bool coroutine_fn blk_co_is_inserted(BlockBackend *blk)
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -61,7 +61,7 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 int coroutine_fn GRAPH_RDLOCK
 bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);

-int coroutine_fn
+int coroutine_fn GRAPH_RDLOCK
 nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking,
                               Error **errp);

@@ -85,7 +85,8 @@ bdrv_common_block_status_above(BlockDriverState *bs,
                               int64_t *map,
                               BlockDriverState **file,
                               int *depth);
-int co_wrapper_mixed
+
+int co_wrapper_mixed_bdrv_rdlock
 nbd_do_establish_connection(BlockDriverState *bs, bool blocking, Error **errp);

 #endif /* BLOCK_COROUTINES_H */
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -355,7 +355,7 @@ block_crypto_co_create_generic(BlockDriverState *bs, int64_t size,
    ret = 0;
 cleanup:
    qcrypto_block_free(crypto);
-    blk_unref(blk);
+    blk_co_unref(blk);
    return ret;
 }

@@ -661,7 +661,7 @@ block_crypto_co_create_luks(BlockdevCreateOptions *create_options, Error **errp)

    ret = 0;
 fail:
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    return ret;
 }

@@ -730,13 +730,13 @@ fail:
        bdrv_co_delete_file_noerr(bs);
    }

-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_QCryptoBlockCreateOptions(create_opts);
    qobject_unref(cryptoopts);
    return ret;
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 block_crypto_co_get_info_luks(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
    BlockDriverInfo subbdi;
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -68,6 +68,9 @@
 #include <sys/param.h>
 #include <sys/syscall.h>
 #include <sys/vfs.h>
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
@@ -157,6 +160,7 @@ typedef struct BDRVRawState {
    bool has_write_zeroes:1;
    bool use_linux_aio:1;
    bool use_linux_io_uring:1;
+    int64_t *offset; /* offset of zone append operation */
    int page_cache_inconsistent; /* errno from fdatasync failure */
    bool has_fallocate;
    bool needs_alignment;
@@ -216,6 +220,13 @@ typedef struct RawPosixAIOData {
            PreallocMode prealloc;
            Error **errp;
        } truncate;
+        struct {
+            unsigned int *nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report;
+        struct {
+            unsigned long op;
+        } zone_mgmt;
    };
 } RawPosixAIOData;

@@ -766,6 +777,18 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
            goto fail;
        }
    }
+#ifdef CONFIG_BLKZONED
+    /*
+     * The kernel page cache does not reliably work for writes to SWR zones
+     * of zoned block device because it can not guarantee the order of writes.
+     */
+    if ((bs->bl.zoned != BLK_Z_NONE) &&
+        (!(s->open_flags & O_DIRECT))) {
+        error_setg(errp, "The driver supports zoned devices, and it requires "
+                         "cache.direct=on, which was not specified.");
+        return -EINVAL; /* No host kernel page cache */
+    }
+#endif

    if (S_ISBLK(st.st_mode)) {
 #ifdef __linux__
@@ -1202,15 +1225,91 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
 #endif
 }

+/*
+ * Read queue attribute @attribute of the block device described by @st from
+ * sysfs into a newly allocated string *@val, without its trailing newline.
+ * Returns 0 on success, -ENOTSUP if @st is not a block device, -ENOENT if
+ * the attribute cannot be read. The caller frees *@val with g_free().
+ */
+#ifdef CONFIG_LINUX
+static int get_sysfs_str_val(struct stat *st, const char *attribute,
+                             char **val) {
+    g_autofree char *sysfspath = NULL;
+    size_t len;
+
+    if (!S_ISBLK(st->st_mode)) {
+        return -ENOTSUP;
+    }
+
+    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+                                major(st->st_rdev), minor(st->st_rdev),
+                                attribute);
+    /* Returns a gboolean, never -1; on failure *val is set to NULL. */
+    if (!g_file_get_contents(sysfspath, val, &len, NULL)) {
+        return -ENOENT;
+    }
+
+    /* Strip the trailing '\n'; an empty file has no last byte to test. */
+    if (len > 0 && (*val)[len - 1] == '\n') {
+        (*val)[len - 1] = '\0';
+    }
+    return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
+{
+    g_autofree char *model = NULL;
+    int rc = get_sysfs_str_val(st, "zoned", &model);
+
+    if (rc < 0) {
+        return rc;
+    }
+
+    /* Map the sysfs "zoned" string to *zoned; unknown models: -ENOTSUP. */
+    if (!strcmp(model, "host-managed")) {
+        *zoned = BLK_Z_HM;
+    } else if (!strcmp(model, "host-aware")) {
+        *zoned = BLK_Z_HA;
+    } else if (!strcmp(model, "none")) {
+        *zoned = BLK_Z_NONE;
+    } else {
+        return -ENOTSUP;
+    }
+    return 0;
+}
+#endif /* defined(CONFIG_BLKZONED) */
+
+/*
+ * Get a sysfs attribute value as a long integer.
+ */
+#ifdef CONFIG_LINUX
+static long get_sysfs_long_val(struct stat *st, const char *attribute)
+{
+    g_autofree char *text = NULL;
+    const char *tail;
+    long parsed;
+    int rc = get_sysfs_str_val(st, attribute, &text);
+
+    if (rc < 0) {
+        return rc;
+    }
+
+    /* get_sysfs_str_val() stripped the '\n': demand a complete parse. */
+    rc = qemu_strtol(text, &tail, 10, &parsed);
+    if (rc != 0 || !tail || *tail != '\0') {
+        return rc;
+    }
+    /* Narrowed through int, exactly like the other return paths. */
+    return (int)parsed;
+}
+#endif
+
 static int hdev_get_max_segments(int fd, struct stat *st)
 {
 #ifdef CONFIG_LINUX
-    char buf[32];
-    const char *end;
-    char *sysfspath = NULL;
    int ret;
-    int sysfd = -1;
-    long max_segments;

    if (S_ISCHR(st->st_mode)) {
        if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
@@ -1218,44 +1317,176 @@ static int hdev_get_max_segments(int fd, struct stat *st)
        }
        return -ENOTSUP;
    }
-
-    if (!S_ISBLK(st->st_mode)) {
-        return -ENOTSUP;
-    }
-
-    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-                                major(st->st_rdev), minor(st->st_rdev));
-    sysfd = open(sysfspath, O_RDONLY);
-    if (sysfd == -1) {
-        ret = -errno;
-        goto out;
-    }
-    ret = RETRY_ON_EINTR(read(sysfd, buf, sizeof(buf) - 1));
-    if (ret < 0) {
-        ret = -errno;
-        goto out;
-    } else if (ret == 0) {
-        ret = -EIO;
-        goto out;
-    }
-    buf[ret] = 0;
-    /* The file is ended with '\n', pass 'end' to accept that. */
-    ret = qemu_strtol(buf, &end, 10, &max_segments);
-    if (ret == 0 && end && *end == '\n') {
-        ret = max_segments;
-    }
-
-out:
-    if (sysfd != -1) {
-        close(sysfd);
-    }
-    g_free(sysfspath);
-    return ret;
+    return get_sysfs_long_val(st, "max_segments");
 #else
    return -ENOTSUP;
 #endif
 }

+#if defined(CONFIG_BLKZONED)
+/*
+ * Refresh the cached write pointers (bs->wps) of @nrz zones, starting with
+ * the zone at byte @offset. If @reset_all is true, zones that are not full,
+ * read-only or offline get their wp set to the zone start; otherwise the wp
+ * reported by the device is used. Returns 0 or a negative errno.
+ */
+static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+                        unsigned int nrz, bool reset_all)
+{
+    struct blk_zone *blkz;
+    size_t rep_size;
+    uint64_t sector = offset >> BDRV_SECTOR_BITS;
+    BlockZoneWps *wps = bs->wps;
+    unsigned int j = offset / bs->bl.zone_size;
+    unsigned int n = 0, i = 0;
+    int ret;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+
+    rep = g_malloc(rep_size);
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            ret = -errno; /* saved first: error_report() may clobber errno */
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                    fd, offset, -ret);
+            return ret;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
+            /*
+             * The top bit of an entry is the zone type: 1 for conventional
+             * zones (no wp tracked), 0 for SWR/SWP zones (wp below).
+             */
+            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+                /* Plain store: the entry may still be uninitialized. */
+                wps->wp[j] = 1ULL << 63;
+            } else {
+                switch (blkz[i].cond) {
+                case BLK_ZONE_COND_FULL:
+                case BLK_ZONE_COND_READONLY:
+                    /* Zone not writable */
+                    wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
+                    break;
+                case BLK_ZONE_COND_OFFLINE:
+                    /* Zone not writable nor readable */
+                    wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
+                    break;
+                default:
+                    if (reset_all) {
+                        wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
+                    } else {
+                        wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
+                    }
+                    break;
+                }
+            }
+        }
+        sector = blkz[i - 1].start + blkz[i - 1].len;
+    }
+
+    return 0;
+}
+
+static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+                            unsigned int nrz)
+{
+    if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) { /* 0: keep device wps */
+        error_report("update zone wp failed"); /* best effort: only logged */
+    }
+}
+
+/* Probe sysfs for the zone model and limits of @bs and (re)build bs->wps.
+ * bs->bl.zoned stays BLK_Z_NONE for non-zoned devices; errors go to @errp. */
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+                                     Error **errp)
+{
+    BDRVRawState *s = bs->opaque;
+    BlockZoneModel zoned;
+    int ret;
+
+    bs->bl.zoned = BLK_Z_NONE;
+    ret = get_sysfs_zoned_model(st, &zoned);
+    if (ret < 0 || zoned == BLK_Z_NONE) {
+        return;
+    }
+    bs->bl.zoned = zoned;
+
+    ret = get_sysfs_long_val(st, "max_open_zones");
+    if (ret >= 0) {
+        bs->bl.max_open_zones = ret;
+    }
+
+    ret = get_sysfs_long_val(st, "max_active_zones");
+    if (ret >= 0) {
+        bs->bl.max_active_zones = ret;
+    }
+
+    /* A zoned device must at least expose its zone size and nr_zones. */
+    ret = get_sysfs_long_val(st, "chunk_sectors");
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
+                                     "sysfs attribute");
+        return;
+    } else if (!ret) {
+        error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
+        return;
+    }
+    bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
+
+    ret = get_sysfs_long_val(st, "nr_zones");
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Unable to read nr_zones "
+                                     "sysfs attribute");
+        return;
+    } else if (!ret) {
+        error_setg(errp, "Read 0 from nr_zones sysfs attribute");
+        return;
+    }
+    bs->bl.nr_zones = ret;
+
+    ret = get_sysfs_long_val(st, "zone_append_max_bytes");
+    if (ret > 0) {
+        bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
+    }
+
+    ret = get_sysfs_long_val(st, "physical_block_size");
+    if (ret >= 0) {
+        bs->bl.write_granularity = ret;
+    }
+
+    /* The refresh_limits() function can be called multiple times. */
+    g_free(bs->wps);
+    bs->wps = g_malloc(sizeof(BlockZoneWps) +
+            sizeof(int64_t) * bs->bl.nr_zones);
+    ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "report wps failed");
+        g_free(bs->wps); /* release the array instead of leaking it */
+        bs->wps = NULL;
+        return;
+    }
+    qemu_co_mutex_init(&bs->wps->colock);
+}
+#else /* !defined(CONFIG_BLKZONED) */
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+                                     Error **errp)
+{
+    bs->bl.zoned = BLK_Z_NONE; /* no CONFIG_BLKZONED: never zoned */
+}
+#endif /* !defined(CONFIG_BLKZONED) */
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVRawState *s = bs->opaque;
@@ -1297,6 +1528,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
            bs->bl.max_hw_iov = ret;
        }
    }
+
+    raw_refresh_zoned_limits(bs, &st, errp);
 }

 static int check_for_dasd(int fd)
@@ -1320,9 +1553,12 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
    BDRVRawState *s = bs->opaque;
    int ret;

-    /* If DASD, get blocksizes */
+    /* If DASD or zoned devices, get blocksizes */
    if (check_for_dasd(s->fd) < 0) {
-        return -ENOTSUP;
+        /* zoned devices are not DASD */
+        if (bs->bl.zoned == BLK_Z_NONE) {
+            return -ENOTSUP;
+        }
    }
    ret = probe_logical_blocksize(s->fd, &bsz->log);
    if (ret < 0) {
@@ -1463,7 +1699,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
    ssize_t len;

    len = RETRY_ON_EINTR(
-        (aiocb->aio_type & QEMU_AIO_WRITE) ?
+        (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
            qemu_pwritev(aiocb->aio_fildes,
                           aiocb->io.iov,
                           aiocb->io.niov,
@@ -1492,7 +1728,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
    ssize_t len;

    while (offset < aiocb->aio_nbytes) {
-        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
            len = pwrite(aiocb->aio_fildes,
                         (const char *)buf + offset,
                         aiocb->aio_nbytes - offset,
@@ -1585,7 +1821,7 @@ static int handle_aiocb_rw(void *opaque)
    }

    nbytes = handle_aiocb_rw_linear(aiocb, buf);
-    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
        char *p = buf;
        size_t count = aiocb->aio_nbytes, copy;
        int i;
@@ -1790,6 +2026,147 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
 }
 #endif

+/*
+ * parse_zone - Fill @zone from kernel zone @blkz; -ENOTSUP if unrecognized
+ */
+#if defined(CONFIG_BLKZONED)
+static inline int parse_zone(struct BlockZoneDescriptor *zone,
+                              const struct blk_zone *blkz) {
+    zone->start = blkz->start << BDRV_SECTOR_BITS; /* sectors -> bytes */
+    zone->length = blkz->len << BDRV_SECTOR_BITS;
+    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
+
+#ifdef HAVE_BLK_ZONE_REP_CAPACITY
+    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
+#else
+    zone->cap = blkz->len << BDRV_SECTOR_BITS; /* fallback: zone length */
+#endif
+
+    switch (blkz->type) { /* kernel zone type -> BLK_ZT_* */
+    case BLK_ZONE_TYPE_SEQWRITE_REQ:
+        zone->type = BLK_ZT_SWR;
+        break;
+    case BLK_ZONE_TYPE_SEQWRITE_PREF:
+        zone->type = BLK_ZT_SWP;
+        break;
+    case BLK_ZONE_TYPE_CONVENTIONAL:
+        zone->type = BLK_ZT_CONV;
+        break;
+    default:
+        error_report("Unsupported zone type: 0x%x", blkz->type);
+        return -ENOTSUP;
+    }
+
+    switch (blkz->cond) { /* kernel zone condition -> BLK_ZS_* */
+    case BLK_ZONE_COND_NOT_WP:
+        zone->state = BLK_ZS_NOT_WP;
+        break;
+    case BLK_ZONE_COND_EMPTY:
+        zone->state = BLK_ZS_EMPTY;
+        break;
+    case BLK_ZONE_COND_IMP_OPEN:
+        zone->state = BLK_ZS_IOPEN;
+        break;
+    case BLK_ZONE_COND_EXP_OPEN:
+        zone->state = BLK_ZS_EOPEN;
+        break;
+    case BLK_ZONE_COND_CLOSED:
+        zone->state = BLK_ZS_CLOSED;
+        break;
+    case BLK_ZONE_COND_READONLY:
+        zone->state = BLK_ZS_RDONLY;
+        break;
+    case BLK_ZONE_COND_FULL:
+        zone->state = BLK_ZS_FULL;
+        break;
+    case BLK_ZONE_COND_OFFLINE:
+        zone->state = BLK_ZS_OFFLINE;
+        break;
+    default:
+        error_report("Unsupported zone state: 0x%x", blkz->cond);
+        return -ENOTSUP;
+    }
+    return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+/* Thread-pool worker: report up to *nr_zones zones starting at aio_offset
+ * into zones[]; on success *nr_zones is set to the number reported. */
+static int handle_aiocb_zone_report(void *opaque)
+{
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    unsigned int *nr_zones = aiocb->zone_report.nr_zones;
+    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+    /* zoned block devices use 512-byte sectors */
+    uint64_t sector = aiocb->aio_offset / 512;
+    struct blk_zone *blkz;
+    size_t rep_size;
+    unsigned int nrz = *nr_zones;
+    int ret;
+    unsigned int n = 0, i = 0;
+
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = g_malloc(rep_size);
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            ret = -errno; /* saved first: error_report() may clobber errno */
+            error_report("%d: ioctl BLKREPORTZONE at %" PRIu64 " failed %d",
+                         fd, sector, -ret);
+            return ret;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            ret = parse_zone(&zones[n], &blkz[i]);
+            if (ret != 0) {
+                return ret;
+            }
+
+            /* The next report should start after the last zone reported */
+            sector = blkz[i].start + blkz[i].len;
+        }
+    }
+
+    *nr_zones = n;
+    return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int handle_aiocb_zone_mgmt(void *opaque)
+{
+    RawPosixAIOData *req = opaque;
+    /* The kernel takes the zone range in 512-byte sectors. */
+    struct blk_zone_range range = {
+        .sector = (uint64_t)(req->aio_offset / 512),
+        .nr_sectors = (int64_t)(req->aio_nbytes / 512),
+    };
+    int rc;
+
+    /* Run the requested zone ioctl (zone_mgmt.op), retrying on EINTR. */
+    do {
+        rc = ioctl(req->aio_fildes, req->zone_mgmt.op, &range);
+    } while (rc != 0 && errno == EINTR);
+
+    /* Negative errno on failure, otherwise the ioctl's return value. */
+    return rc < 0 ? -errno : rc;
+}
+#endif
+
 static int handle_aiocb_copy_range(void *opaque)
 {
    RawPosixAIOData *aiocb = opaque;
@@ -2072,9 +2449,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
 {
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData acb;
+    int ret;

    if (fd_open(bs) < 0)
        return -EIO;
+#if defined(CONFIG_BLKZONED)
+    if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) {
+        qemu_co_mutex_lock(&bs->wps->colock);
+        if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
+            int index = offset / bs->bl.zone_size;
+            offset = bs->wps->wp[index];
+        }
+    }
+#endif

    /*
     * When using O_DIRECT, the request must be aligned to be able to use
@@ -2087,12 +2474,15 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
 #ifdef CONFIG_LINUX_IO_URING
    } else if (s->use_linux_io_uring) {
        assert(qiov->size == bytes);
-        return luring_co_submit(bs, s->fd, offset, qiov, type);
+        ret = luring_co_submit(bs, s->fd, offset, qiov, type);
+        goto out;
 #endif
 #ifdef CONFIG_LINUX_AIO
    } else if (s->use_linux_aio) {
        assert(qiov->size == bytes);
-        return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch);
+        ret = laio_co_submit(s->fd, offset, qiov, type,
+                              s->aio_max_batch);
+        goto out;
 #endif
    }

@@ -2109,7 +2499,41 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
    };

    assert(qiov->size == bytes);
-    return raw_thread_pool_submit(handle_aiocb_rw, &acb);
+    ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+    goto out; /* Avoid the compiler err of unused label */
+
+out:
+#if defined(CONFIG_BLKZONED)
+{
+    BlockZoneWps *wps = bs->wps;
+    if (ret == 0) {
+        if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
+            && wps && bs->bl.zone_size) {
+            uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
+            if (!BDRV_ZT_IS_CONV(*wp)) {
+                if (type & QEMU_AIO_ZONE_APPEND) {
+                    *s->offset = *wp;
+                    trace_zbd_zone_append_complete(bs, *s->offset
+                        >> BDRV_SECTOR_BITS);
+                }
+                /* Advance the wp if needed */
+                if (offset + bytes > *wp) {
+                    *wp = offset + bytes;
+                }
+            }
+        }
+    } else {
+        if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
+            update_zones_wp(bs, s->fd, 0, 1);
+        }
+    }
+
+    if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) {
+        qemu_co_mutex_unlock(&wps->colock);
+    }
+}
+#endif
+    return ret;
 }

 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
@@ -2212,6 +2636,9 @@ static void raw_close(BlockDriverState *bs)
    BDRVRawState *s = bs->opaque;

    if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+        g_free(bs->wps);
+#endif
        qemu_close(s->fd);
        s->fd = -1;
    }
@@ -2969,6 +3396,171 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
    }
 }

+/*
+ * zone report - Get a zoned block device's information in the form
+ * of an array of zone descriptors.
+ * zones is an array of zone descriptors to hold zone information on reply;
+ * offset can be any byte within the entire size of the device;
+ * nr_zones is the maximum number of zones to report, updated on return.
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                           unsigned int *nr_zones,
+                                           BlockZoneDescriptor *zones) {
+    BDRVRawState *state = bs->opaque;
+    RawPosixAIOData acb = { 0 };
+
+    /* Describe the request for the worker function. */
+    acb.bs = bs;
+    acb.aio_fildes = state->fd;
+    acb.aio_type = QEMU_AIO_ZONE_REPORT;
+    acb.aio_offset = offset;
+    acb.zone_report.nr_zones = nr_zones;
+    acb.zone_report.zones = zones;
+
+    trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
+    /* handle_aiocb_zone_report() runs via raw_thread_pool_submit(). */
+    return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
+}
+#endif
+
+/*
+ * zone management - Run @op on the zone-aligned range [@offset, @offset+@len)
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+        int64_t offset, int64_t len) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+    int64_t zone_size, zone_size_mask;
+    const char *op_name;
+    unsigned long zo;
+    int ret;
+    BlockZoneWps *wps = bs->wps;
+    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+    /* @offset must be the start of a zone. */
+    zone_size = bs->bl.zone_size;
+    zone_size_mask = zone_size - 1;
+    if (offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId64 "", offset / 512, zone_size / 512);
+        return -EINVAL;
+    }
+    /* @len must be zone-aligned unless the range ends at the capacity. */
+    if (((offset + len) < capacity && len & zone_size_mask) ||
+        offset + len > capacity) {
+        error_report("number of sectors %" PRId64 " is not aligned to zone size"
+                      " %" PRId64 "", len / 512, zone_size / 512);
+        return -EINVAL;
+    }
+    /* i: index of the first zone; nrz: number of whole zones in @len. */
+    uint32_t i = offset / bs->bl.zone_size;
+    uint32_t nrz = len / bs->bl.zone_size;
+    uint64_t *wp = &wps->wp[i];
+    if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
+        error_report("zone mgmt operations are not allowed for conventional zones");
+        return -EIO;
+    }
+
+    switch (op) {
+    case BLK_ZO_OPEN:
+        op_name = "BLKOPENZONE";
+        zo = BLKOPENZONE;
+        break;
+    case BLK_ZO_CLOSE:
+        op_name = "BLKCLOSEZONE";
+        zo = BLKCLOSEZONE;
+        break;
+    case BLK_ZO_FINISH:
+        op_name = "BLKFINISHZONE";
+        zo = BLKFINISHZONE;
+        break;
+    case BLK_ZO_RESET:
+        op_name = "BLKRESETZONE";
+        zo = BLKRESETZONE;
+        break;
+    default:
+        error_report("Unsupported zone op: 0x%x", op);
+        return -ENOTSUP;
+    }
+
+    acb = (RawPosixAIOData) {
+        .bs             = bs,
+        .aio_fildes     = s->fd,
+        .aio_type       = QEMU_AIO_ZONE_MGMT,
+        .aio_offset     = offset,
+        .aio_nbytes     = len,
+        .zone_mgmt  = {
+            .op = zo,
+        },
+    };
+
+    trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
+                        len >> BDRV_SECTOR_BITS);
+    ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
+    if (ret != 0) {
+        update_zones_wp(bs, s->fd, offset, nrz); /* zone count, not index */
+        error_report("ioctl %s failed %d", op_name, ret);
+        return ret;
+    }
+    /* Mirror the effect of the operation in the cached write pointers. */
+    if (zo == BLKRESETZONE && len == capacity) {
+        ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
+        if (ret < 0) {
+            error_report("reporting single wp failed");
+            return ret;
+        }
+    } else if (zo == BLKRESETZONE) {
+        for (unsigned int j = 0; j < nrz; ++j) {
+            wp[j] = offset + j * zone_size;
+        }
+    } else if (zo == BLKFINISHZONE) {
+        for (unsigned int j = 0; j < nrz; ++j) {
+            /* The zoned device allows the last zone smaller than the
+             * zone size. */
+            wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
+        }
+    }
+
+    return ret;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+                                           int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags) {
+    BDRVRawState *state = bs->opaque;
+    int64_t zone_mask = bs->bl.zone_size - 1;
+    int64_t wg = bs->bl.write_granularity;
+    int64_t total = 0;
+
+    assert(flags == 0);
+    /* raw_co_prw() writes the append position back through this pointer. */
+    state->offset = offset;
+    if (*offset & zone_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
+        return -EINVAL;
+    }
+
+    /* Every segment must be a multiple of the write granularity. */
+    for (int idx = 0; idx < qiov->niov; idx++) {
+        int64_t seg_len = qiov->iov[idx].iov_len;
+        if (seg_len & (wg - 1)) {
+            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
+                         "block size %" PRId64 "", idx, seg_len, wg);
+            return -EINVAL;
+        }
+        total += seg_len;
+    }
+
+    trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
+    return raw_co_prw(bs, *offset, total, qiov, QEMU_AIO_ZONE_APPEND);
+}
+#endif
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                bool blkdev)
@@ -3724,6 +4316,14 @@ static BlockDriver bdrv_host_device = {
 #ifdef __linux__
    .bdrv_co_ioctl          = hdev_co_ioctl,
 #endif
+
+    /* zoned device */
+#if defined(CONFIG_BLKZONED)
+    /* zone management operations */
+    .bdrv_co_zone_report = raw_co_zone_report,
+    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+    .bdrv_co_zone_append = raw_co_zone_append,
+#endif
 };

 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
--- a/block/graph-lock.c
+++ b/block/graph-lock.c
@@ -265,7 +265,10 @@ void bdrv_graph_rdunlock_main_loop(void)

 void assert_bdrv_graph_readable(void)
 {
+    /* reader_count() is slow (aio_context_list_lock contention): debug only */
+#ifdef CONFIG_DEBUG_GRAPH_LOCK
    assert(qemu_in_main_thread() || reader_count());
+#endif
 }

 void assert_bdrv_graph_writable(void)
--- a/block/io.c
+++ b/block/io.c
@@ -160,7 +160,6 @@ void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
    bool have_limits;

    GLOBAL_STATE_CODE();
-    assume_graph_lock(); /* FIXME */

    if (tran) {
        BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
@@ -727,10 +726,9 @@ BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
 /**
 * Round a region to cluster boundaries
 */
-void coroutine_fn bdrv_round_to_clusters(BlockDriverState *bs,
-                            int64_t offset, int64_t bytes,
-                            int64_t *cluster_offset,
-                            int64_t *cluster_bytes)
+void coroutine_fn GRAPH_RDLOCK
+bdrv_round_to_clusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                       int64_t *cluster_offset, int64_t *cluster_bytes)
 {
    BlockDriverInfo bdi;
    IO_CODE();
@@ -744,7 +742,7 @@ void coroutine_fn bdrv_round_to_clusters(BlockDriverState *bs,
    }
 }

-static coroutine_fn int bdrv_get_cluster_size(BlockDriverState *bs)
+static int coroutine_fn GRAPH_RDLOCK bdrv_get_cluster_size(BlockDriverState *bs)
 {
    BlockDriverInfo bdi;
    int ret;
@@ -1800,7 +1798,7 @@ fail:
    return ret;
 }

-static inline int coroutine_fn
+static inline int coroutine_fn GRAPH_RDLOCK
 bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
                          BdrvTrackedRequest *req, int flags)
 {
@@ -3115,6 +3113,74 @@ out:
    return co.ret;
 }

+int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                        unsigned int *nr_zones,
+                        BlockZoneDescriptor *zones)
+{
+    BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+    IO_CODE();
+
+    /*
+     * Forward the zone report to the driver while the request is counted
+     * as in flight. Without a driver, without the callback, or on a
+     * non-zoned node the result is -ENOTSUP.
+     */
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_zone_report && bs->bl.zoned != BLK_Z_NONE) {
+        ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
+    }
+    bdrv_dec_in_flight(bs);
+    return ret;
+}
+
+int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+        int64_t offset, int64_t len)
+{
+    BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+    IO_CODE();
+
+    /*
+     * Forward the zone operation to the driver while the request is
+     * counted as in flight. Without a driver, without the callback, or on
+     * a non-zoned node the result is -ENOTSUP.
+     */
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_zone_mgmt && bs->bl.zoned != BLK_Z_NONE) {
+        ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
+    }
+    bdrv_dec_in_flight(bs);
+    return ret;
+}
+
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                        QEMUIOVector *qiov,
+                        BdrvRequestFlags flags)
+{
+    BlockDriver *drv = bs->drv;
+    int ret;
+    IO_CODE();
+
+    /* Reject malformed requests before counting them as in flight. */
+    ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /*
+     * Forward the append to the driver. Without a driver, without the
+     * callback, or on a non-zoned node the result is -ENOTSUP.
+     */
+    bdrv_inc_in_flight(bs);
+    ret = -ENOTSUP;
+    if (drv && drv->bdrv_co_zone_append && bs->bl.zoned != BLK_Z_NONE) {
+        ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
+    }
+    bdrv_dec_in_flight(bs);
+    return ret;
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
    IO_CODE();
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -350,6 +350,10 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
                             luringcb->qiov->niov, offset);
        break;
+    case QEMU_AIO_ZONE_APPEND:
+        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
+                             luringcb->qiov->niov, offset);
+        break;
    case QEMU_AIO_READ:
        io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
                            luringcb->qiov->niov, offset);
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -394,6 +394,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
+    case QEMU_AIO_ZONE_APPEND:
+        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
--- a/block/meson.build
+++ b/block/meson.build
@@ -38,11 +38,6 @@ block_ss.add(files(
  'snapshot-access.c',
  'throttle-groups.c',
  'throttle.c',
-  'vhdx-endian.c',
-  'vhdx-log.c',
-  'vhdx.c',
-  'vmdk.c',
-  'vpc.c',
  'write-threshold.c',
 ), zstd, zlib, gnutls)

@@ -55,6 +50,19 @@ endif
 if get_option('vdi').allowed()
  block_ss.add(files('vdi.c'))
 endif
+if get_option('vhdx').allowed()
+  block_ss.add(files(
+    'vhdx-endian.c',
+    'vhdx-log.c',
+    'vhdx.c'
+  ))
+endif
+if get_option('vmdk').allowed()
+  block_ss.add(files('vmdk.c'))
+endif
+if get_option('vpc').allowed()
+  block_ss.add(files('vpc.c'))
+endif
 if get_option('cloop').allowed()
  block_ss.add(files('cloop.c'))
 endif
@@ -84,7 +92,7 @@ block_ss.add(when: 'CONFIG_WIN32', if_true: files('file-win32.c', 'win32-aio.c')
 block_ss.add(when: 'CONFIG_POSIX', if_true: [files('file-posix.c'), coref, iokit])
 block_ss.add(when: libiscsi, if_true: files('iscsi-opts.c'))
 block_ss.add(when: 'CONFIG_LINUX', if_true: files('nvme.c'))
-if not get_option('replication').disabled()
+if get_option('replication').allowed()
  block_ss.add(files('replication.c'))
 endif
 block_ss.add(when: libaio, if_true: files('linux-aio.c'))
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -270,8 +270,8 @@ static inline int64_t mirror_clip_bytes(MirrorBlockJob *s,

 /* Round offset and/or bytes to target cluster if COW is needed, and
 * return the offset of the adjusted tail against original. */
-static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
-                            uint64_t *bytes)
+static int coroutine_fn mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
+                                         uint64_t *bytes)
 {
    bool need_cow;
    int ret = 0;
@@ -576,8 +576,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_offset;
            int64_t target_bytes;
-            bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes,
-                                   &target_offset, &target_bytes);
+            WITH_GRAPH_RDLOCK_GUARD() {
+                bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes,
+                                       &target_offset, &target_bytes);
+            }
            if (target_offset == offset &&
                target_bytes == io_bytes) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
@@ -745,7 +747,10 @@ static int mirror_exit_common(Job *job)
         * Cannot use check_to_replace_node() here, because that would
         * check for an op blocker on @to_replace, and we have our own
         * there.
+         *
+         * TODO Pull out the writer lock from bdrv_replace_node() to here
         */
+        bdrv_graph_rdlock_main_loop();
        if (bdrv_recurse_can_replace(src, to_replace)) {
            bdrv_replace_node(to_replace, target_bs, &local_err);
        } else {
@@ -754,6 +759,7 @@ static int mirror_exit_common(Job *job)
                       "would not lead to an abrupt change of visible data",
                       to_replace->node_name, target_bs->node_name);
        }
+        bdrv_graph_rdunlock_main_loop();
        bdrv_drained_end(target_bs);
        if (local_err) {
            error_report_err(local_err);
@@ -966,11 +972,13 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
+    bdrv_graph_co_rdlock();
    if (!bdrv_co_get_info(target_bs, &bdi) && bdi.cluster_size) {
        s->target_cluster_size = bdi.cluster_size;
    } else {
        s->target_cluster_size = BDRV_SECTOR_SIZE;
    }
+    bdrv_graph_co_rdunlock();
    if (backing_filename[0] && !bdrv_backing_chain_next(target_bs) &&
        s->granularity < s->target_cluster_size) {
        s->buf_size = MAX(s->buf_size, s->target_cluster_size);
@@ -1416,7 +1424,7 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s,
    return op;
 }

-static void coroutine_fn active_write_settle(MirrorOp *op)
+static void coroutine_fn GRAPH_RDLOCK active_write_settle(MirrorOp *op)
 {
    uint64_t start_chunk = op->offset / op->s->granularity;
    uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes,
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -322,6 +322,7 @@ int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs,
    int ret;
    IO_CODE();

+    assert_bdrv_graph_readable();
    assert(!s->ioc);

    s->ioc = nbd_co_establish_connection(s->conn, &s->info, blocking, errp);
@@ -369,7 +370,7 @@ static bool nbd_client_connecting(BDRVNBDState *s)
 }

 /* Called with s->requests_lock taken.  */
-static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
+static void coroutine_fn GRAPH_RDLOCK nbd_reconnect_attempt(BDRVNBDState *s)
 {
    int ret;
    bool blocking = s->state == NBD_CLIENT_CONNECTING_WAIT;
@@ -480,9 +481,9 @@ static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle)
    }
 }

-static int coroutine_fn nbd_co_send_request(BlockDriverState *bs,
-                                            NBDRequest *request,
-                                            QEMUIOVector *qiov)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_co_send_request(BlockDriverState *bs, NBDRequest *request,
+                    QEMUIOVector *qiov)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    int rc, i = -1;
@@ -1171,8 +1172,9 @@ static int coroutine_fn nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
    return iter.ret;
 }

-static int coroutine_fn nbd_co_request(BlockDriverState *bs, NBDRequest *request,
-                                       QEMUIOVector *write_qiov)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_co_request(BlockDriverState *bs, NBDRequest *request,
+               QEMUIOVector *write_qiov)
 {
    int ret, request_ret;
    Error *local_err = NULL;
@@ -1208,9 +1210,9 @@ static int coroutine_fn nbd_co_request(BlockDriverState *bs, NBDRequest *request
    return ret ? ret : request_ret;
 }

-static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offset,
-                                             int64_t bytes, QEMUIOVector *qiov,
-                                             BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                     QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
    int ret, request_ret;
    Error *local_err = NULL;
@@ -1266,9 +1268,9 @@ static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offse
    return ret ? ret : request_ret;
 }

-static int coroutine_fn nbd_client_co_pwritev(BlockDriverState *bs, int64_t offset,
-                                              int64_t bytes, QEMUIOVector *qiov,
-                                              BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                      QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
@@ -1291,8 +1293,9 @@ static int coroutine_fn nbd_client_co_pwritev(BlockDriverState *bs, int64_t offs
    return nbd_co_request(bs, &request, qiov);
 }

-static int coroutine_fn nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
-                                                    int64_t bytes, BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                            BdrvRequestFlags flags)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
@@ -1326,7 +1329,7 @@ static int coroutine_fn nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_
    return nbd_co_request(bs, &request, NULL);
 }

-static int coroutine_fn nbd_client_co_flush(BlockDriverState *bs)
+static int coroutine_fn GRAPH_RDLOCK nbd_client_co_flush(BlockDriverState *bs)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = { .type = NBD_CMD_FLUSH };
@@ -1341,8 +1344,8 @@ static int coroutine_fn nbd_client_co_flush(BlockDriverState *bs)
    return nbd_co_request(bs, &request, NULL);
 }

-static int coroutine_fn nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset,
-                                               int64_t bytes)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
@@ -1361,7 +1364,7 @@ static int coroutine_fn nbd_client_co_pdiscard(BlockDriverState *bs, int64_t off
    return nbd_co_request(bs, &request, NULL);
 }

-static int coroutine_fn nbd_client_co_block_status(
+static int coroutine_fn GRAPH_RDLOCK nbd_client_co_block_status(
        BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
        int64_t *pnum, int64_t *map, BlockDriverState **file)
 {
@@ -1920,11 +1923,6 @@ fail:
    return ret;
 }

-static int coroutine_fn nbd_co_flush(BlockDriverState *bs)
-{
-    return nbd_client_co_flush(bs);
-}
-
 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
@@ -2120,7 +2118,7 @@ static BlockDriver bdrv_nbd = {
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
-    .bdrv_co_flush_to_os        = nbd_co_flush,
+    .bdrv_co_flush_to_os        = nbd_client_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_co_truncate           = nbd_co_truncate,
@@ -2148,7 +2146,7 @@ static BlockDriver bdrv_nbd_tcp = {
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
-    .bdrv_co_flush_to_os        = nbd_co_flush,
+    .bdrv_co_flush_to_os        = nbd_client_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_co_truncate           = nbd_co_truncate,
@@ -2176,7 +2174,7 @@ static BlockDriver bdrv_nbd_unix = {
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
-    .bdrv_co_flush_to_os        = nbd_co_flush,
+    .bdrv_co_flush_to_os        = nbd_client_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_co_truncate           = nbd_co_truncate,
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -613,8 +613,8 @@ static int coroutine_fn parallels_co_create(BlockdevCreateOptions* opts,

    ret = 0;
 out:
-    blk_unref(blk);
-    bdrv_unref(bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
    return ret;

 exit:
@@ -691,7 +691,7 @@ parallels_co_create_opts(BlockDriver *drv, const char *filename,

 done:
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/block/qapi-sysemu.c
+++ b/block/qapi-sysemu.c
@@ -517,6 +517,7 @@ void qmp_block_latency_histogram_set(
    bool has_boundaries, uint64List *boundaries,
    bool has_boundaries_read, uint64List *boundaries_read,
    bool has_boundaries_write, uint64List *boundaries_write,
+    bool has_boundaries_append, uint64List *boundaries_append,
    bool has_boundaries_flush, uint64List *boundaries_flush,
    Error **errp)
 {
@@ -557,6 +558,16 @@ void qmp_block_latency_histogram_set(
        }
    }

+    if (has_boundaries || has_boundaries_append) {
+        ret = block_latency_histogram_set(
+                stats, BLOCK_ACCT_ZONE_APPEND,
+                has_boundaries_append ? boundaries_append : boundaries);
+        if (ret) {
+            error_setg(errp, "Device '%s' set append write boundaries fail", id);
+            return;
+        }
+    }
+
    if (has_boundaries || has_boundaries_flush) {
        ret = block_latency_histogram_set(
            stats, BLOCK_ACCT_FLUSH,
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -533,27 +533,36 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)

    ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
    ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
+    ds->zone_append_bytes = stats->nr_bytes[BLOCK_ACCT_ZONE_APPEND];
    ds->unmap_bytes = stats->nr_bytes[BLOCK_ACCT_UNMAP];
    ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
    ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
+    ds->zone_append_operations = stats->nr_ops[BLOCK_ACCT_ZONE_APPEND];
    ds->unmap_operations = stats->nr_ops[BLOCK_ACCT_UNMAP];

    ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
    ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
+    ds->failed_zone_append_operations =
+        stats->failed_ops[BLOCK_ACCT_ZONE_APPEND];
    ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
    ds->failed_unmap_operations = stats->failed_ops[BLOCK_ACCT_UNMAP];

    ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
    ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
+    ds->invalid_zone_append_operations =
+        stats->invalid_ops[BLOCK_ACCT_ZONE_APPEND];
    ds->invalid_flush_operations =
        stats->invalid_ops[BLOCK_ACCT_FLUSH];
    ds->invalid_unmap_operations = stats->invalid_ops[BLOCK_ACCT_UNMAP];

    ds->rd_merged = stats->merged[BLOCK_ACCT_READ];
    ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
+    ds->zone_append_merged = stats->merged[BLOCK_ACCT_ZONE_APPEND];
    ds->unmap_merged = stats->merged[BLOCK_ACCT_UNMAP];
    ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
    ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
+    ds->zone_append_total_time_ns =
+        stats->total_time_ns[BLOCK_ACCT_ZONE_APPEND];
    ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
    ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
    ds->unmap_total_time_ns = stats->total_time_ns[BLOCK_ACCT_UNMAP];
@@ -571,6 +580,7 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)

        TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
        TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
+        TimedAverage *zap = &ts->latency[BLOCK_ACCT_ZONE_APPEND];
        TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];

        dev_stats->interval_length = ts->interval_length;
@@ -583,6 +593,10 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
        dev_stats->max_wr_latency_ns = timed_average_max(wr);
        dev_stats->avg_wr_latency_ns = timed_average_avg(wr);

+        dev_stats->min_zone_append_latency_ns = timed_average_min(zap);
+        dev_stats->max_zone_append_latency_ns = timed_average_max(zap);
+        dev_stats->avg_zone_append_latency_ns = timed_average_avg(zap);
+
        dev_stats->min_flush_latency_ns = timed_average_min(fl);
        dev_stats->max_flush_latency_ns = timed_average_max(fl);
        dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
@@ -591,6 +605,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
            block_acct_queue_depth(ts, BLOCK_ACCT_READ);
        dev_stats->avg_wr_queue_depth =
            block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
+        dev_stats->avg_zone_append_queue_depth =
+            block_acct_queue_depth(ts, BLOCK_ACCT_ZONE_APPEND);

        QAPI_LIST_PREPEND(ds->timed_stats, dev_stats);
    }
@@ -600,12 +616,14 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
        = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_READ]);
    ds->wr_latency_histogram
        = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_WRITE]);
+    ds->zone_append_latency_histogram
+        = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_ZONE_APPEND]);
    ds->flush_latency_histogram
        = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_FLUSH]);
 }

-static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs,
-                                        bool blk_level)
+static BlockStats * GRAPH_RDLOCK
+bdrv_query_bds_stats(BlockDriverState *bs, bool blk_level)
 {
    BdrvChild *parent_child;
    BlockDriverState *filter_or_cow_bs;
@@ -713,6 +731,8 @@ BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
    BlockBackend *blk;
    BlockDriverState *bs;

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    /* Just to be safe if query_nodes is not always initialized */
    if (has_query_nodes && query_nodes) {
        for (bs = bdrv_next_node(NULL); bs; bs = bdrv_next_node(bs)) {
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -915,8 +915,8 @@ static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts,
    g_free(tmp);
    ret = 0;
 exit:
-    blk_unref(qcow_blk);
-    bdrv_unref(bs);
+    blk_co_unref(qcow_blk);
+    bdrv_co_unref(bs);
    qcrypto_block_free(crypto);
    return ret;
 }
@@ -1015,7 +1015,7 @@ qcow_co_create_opts(BlockDriver *drv, const char *filename,
 fail:
    g_free(backing_fmt);
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -3715,7 +3715,7 @@ int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs)

    qemu_co_mutex_assert_locked(&s->lock);

-    file_length = bdrv_getlength(bs->file->bs);
+    file_length = bdrv_co_getlength(bs->file->bs);
    if (file_length < 0) {
        return file_length;
    }
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1891,7 +1891,7 @@ static void coroutine_fn qcow2_open_entry(void *opaque)
    QCow2OpenCo *qoc = opaque;
    BDRVQcow2State *s = qoc->bs->opaque;

-    assume_graph_lock(); /* FIXME */
+    GRAPH_RDLOCK_GUARD();

    qemu_co_mutex_lock(&s->lock);
    qoc->ret = qcow2_do_open(qoc->bs, qoc->options, qoc->flags, true,
@@ -1920,14 +1920,11 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    /* Initialise locks */
    qemu_co_mutex_init(&s->lock);

-    if (qemu_in_coroutine()) {
-        /* From bdrv_co_create.  */
-        qcow2_open_entry(&qoc);
-    } else {
-        assert(qemu_get_current_aio_context() == qemu_get_aio_context());
-        qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
-        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
-    }
+    assert(!qemu_in_coroutine());
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+    qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
+    BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
+
    return qoc.ret;
 }

@@ -2089,11 +2086,10 @@ static void qcow2_join_options(QDict *options, QDict *old_options)
    }
 }

-static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
-                                              bool want_zero,
-                                              int64_t offset, int64_t count,
-                                              int64_t *pnum, int64_t *map,
-                                              BlockDriverState **file)
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
+                      int64_t count, int64_t *pnum, int64_t *map,
+                      BlockDriverState **file)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t host_offset;
@@ -3235,7 +3231,7 @@ preallocate_co(BlockDriverState *bs, uint64_t offset, uint64_t new_length,
     * all of the allocated clusters (otherwise we get failing reads after
     * EOF). Extend the image to the last allocated sector.
     */
-    file_length = bdrv_getlength(s->data_file->bs);
+    file_length = bdrv_co_getlength(s->data_file->bs);
    if (file_length < 0) {
        error_setg_errno(errp, -file_length, "Could not get file size");
        ret = file_length;
@@ -3706,7 +3702,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
        goto out;
    }

-    blk_unref(blk);
+    blk_co_unref(blk);
    blk = NULL;

    /*
@@ -3786,7 +3782,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
        }
    }

-    blk_unref(blk);
+    blk_co_unref(blk);
    blk = NULL;

    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
@@ -3811,9 +3807,9 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)

    ret = 0;
 out:
-    blk_unref(blk);
-    bdrv_unref(bs);
-    bdrv_unref(data_bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
+    bdrv_co_unref(data_bs);
    return ret;
 }

@@ -3944,8 +3940,8 @@ finish:
    }

    qobject_unref(qdict);
-    bdrv_unref(bs);
-    bdrv_unref(data_bs);
+    bdrv_co_unref(bs);
+    bdrv_co_unref(data_bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
@@ -4098,7 +4094,7 @@ qcow2_co_copy_range_from(BlockDriverState *bs,
        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
            if (bs->backing && bs->backing->bs) {
-                int64_t backing_length = bdrv_getlength(bs->backing->bs);
+                int64_t backing_length = bdrv_co_getlength(bs->backing->bs);
                if (src_offset >= backing_length) {
                    cur_write_flags |= BDRV_REQ_ZERO_WRITE;
                } else {
@@ -4293,7 +4289,7 @@ qcow2_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
            goto fail;
        }

-        old_file_size = bdrv_getlength(bs->file->bs);
+        old_file_size = bdrv_co_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
@@ -4386,7 +4382,7 @@ qcow2_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
            break;
        }

-        old_file_size = bdrv_getlength(bs->file->bs);
+        old_file_size = bdrv_co_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
@@ -4694,7 +4690,7 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
         * align end of file to a sector boundary to ease reading with
         * sector based I/Os
         */
-        int64_t len = bdrv_getlength(bs->file->bs);
+        int64_t len = bdrv_co_getlength(bs->file->bs);
        if (len < 0) {
            return len;
        }
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -895,7 +895,9 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                void *cb_opaque, Error **errp);
 int coroutine_fn GRAPH_RDLOCK qcow2_shrink_reftable(BlockDriverState *bs);
 int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
-int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs);
+
+int coroutine_fn GRAPH_RDLOCK
+qcow2_detect_metadata_preallocation(BlockDriverState *bs);

 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
--- a/block/qed.c
+++ b/block/qed.c
@@ -557,11 +557,13 @@ typedef struct QEDOpenCo {
    int ret;
 } QEDOpenCo;

-static void coroutine_fn GRAPH_RDLOCK bdrv_qed_open_entry(void *opaque)
+static void coroutine_fn bdrv_qed_open_entry(void *opaque)
 {
    QEDOpenCo *qoc = opaque;
    BDRVQEDState *s = qoc->bs->opaque;

+    GRAPH_RDLOCK_GUARD();
+
    qemu_co_mutex_lock(&s->table_lock);
    qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
    qemu_co_mutex_unlock(&s->table_lock);
@@ -579,21 +581,17 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
    };
    int ret;

-    assume_graph_lock(); /* FIXME */
-
    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    if (ret < 0) {
        return ret;
    }

    bdrv_qed_init_state(bs);
-    if (qemu_in_coroutine()) {
-        bdrv_qed_open_entry(&qoc);
-    } else {
-        assert(qemu_get_current_aio_context() == qemu_get_aio_context());
-        qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
-        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
-    }
+    assert(!qemu_in_coroutine());
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+    qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
+    BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
+
    return qoc.ret;
 }

@@ -748,8 +746,8 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
    ret = 0; /* success */
 out:
    g_free(l1_table);
-    blk_unref(blk);
-    bdrv_unref(bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
    return ret;
 }

@@ -819,7 +817,7 @@ bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename,

 fail:
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -825,8 +825,8 @@ static coroutine_fn GRAPH_RDLOCK int quorum_co_flush(BlockDriverState *bs)
    return result;
 }

-static bool quorum_recurse_can_replace(BlockDriverState *bs,
-                                       BlockDriverState *to_replace)
+static bool GRAPH_RDLOCK
+quorum_recurse_can_replace(BlockDriverState *bs, BlockDriverState *to_replace)
 {
    BDRVQuorumState *s = bs->opaque;
    int i;
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -317,6 +317,31 @@ raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
    return bdrv_co_pdiscard(bs->file, offset, bytes);
 }

+/* Report zones by forwarding the request unchanged to the bs->file node. */
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+                   unsigned int *nr_zones,
+                   BlockZoneDescriptor *zones)
+{
+    return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
+}
+
+/* Forward a zone management operation unchanged to the bs->file node. */
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+                 int64_t offset, int64_t len)
+{
+    return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
+}
+
+/* Forward a zone append request unchanged to the bs->file node. */
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov,
+                   BdrvRequestFlags flags)
+{
+    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
+}
+
 static int64_t coroutine_fn GRAPH_RDLOCK
 raw_co_getlength(BlockDriverState *bs)
 {
@@ -369,7 +394,7 @@ static BlockMeasureInfo *raw_measure(QemuOpts *opts, BlockDriverState *in_bs,
    return info;
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 raw_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
    return bdrv_co_get_info(bs->file->bs, bdi);
@@ -608,6 +633,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c,
 BlockDriver bdrv_raw = {
    .format_name          = "raw",
    .instance_size        = sizeof(BDRVRawState),
+    .supports_zoned_children = true,
    .bdrv_probe           = &raw_probe,
    .bdrv_reopen_prepare  = &raw_reopen_prepare,
    .bdrv_reopen_commit   = &raw_reopen_commit,
@@ -619,6 +645,9 @@ BlockDriver bdrv_raw = {
    .bdrv_co_pwritev      = &raw_co_pwritev,
    .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
    .bdrv_co_pdiscard     = &raw_co_pdiscard,
+    .bdrv_co_zone_report  = &raw_co_zone_report,
+    .bdrv_co_zone_mgmt    = &raw_co_zone_mgmt,
+    .bdrv_co_zone_append  = &raw_co_zone_append,
    .bdrv_co_block_status = &raw_co_block_status,
    .bdrv_co_copy_range_from = &raw_co_copy_range_from,
    .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
--- a/block/trace-events
+++ b/block/trace-events
@@ -209,6 +209,10 @@ file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
 file_setup_cdrom(const char *partition) "Using %s as optical disc"
 file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
 file_flush_fdatasync_failed(int err) "errno %d"
+zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %u zones starting at sector offset 0x%" PRIx64 ""
+zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors"
+zbd_zone_append(void *bs, int64_t sector) "bs %p append at sector offset 0x%" PRIx64 ""
+zbd_zone_append_complete(void *bs, int64_t sector) "bs %p returns append sector 0x%" PRIx64 ""

 # ssh.c
 sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -886,8 +886,8 @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,

    ret = 0;
 exit:
-    blk_unref(blk);
-    bdrv_unref(bs_file);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs_file);
    g_free(bmap);
    return ret;
 }
@@ -975,7 +975,7 @@ vdi_co_create_opts(BlockDriver *drv, const char *filename,
 done:
    qobject_unref(qdict);
    qapi_free_BlockdevCreateOptions(create_options);
-    bdrv_unref(bs_file);
+    bdrv_co_unref(bs_file);
    return ret;
 }

--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1506,8 +1506,9 @@ exit:
 * There are 2 headers, and the highest sequence number will represent
 * the active header
 */
-static int vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size,
-                                   uint32_t log_size)
+static int coroutine_fn GRAPH_RDLOCK
+vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size,
+                        uint32_t log_size)
 {
    BlockDriverState *bs = blk_bs(blk);
    BdrvChild *child;
@@ -1897,8 +1898,8 @@ exit:
 *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
 *   1MB
 */
-static int coroutine_fn vhdx_co_create(BlockdevCreateOptions *opts,
-                                       Error **errp)
+static int coroutine_fn GRAPH_RDLOCK
+vhdx_co_create(BlockdevCreateOptions *opts, Error **errp)
 {
    BlockdevCreateOptionsVhdx *vhdx_opts;
    BlockBackend *blk = NULL;
@@ -2053,8 +2054,8 @@ static int coroutine_fn vhdx_co_create(BlockdevCreateOptions *opts,

    ret = 0;
 delete_and_exit:
-    blk_unref(blk);
-    bdrv_unref(bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
    g_free(creator);
    return ret;
 }
@@ -2144,7 +2145,7 @@ vhdx_co_create_opts(BlockDriver *drv, const char *filename,

 fail:
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2306,7 +2306,7 @@ exit:
        if (pbb) {
            *pbb = blk;
        } else {
-            blk_unref(blk);
+            blk_co_unref(blk);
            blk = NULL;
        }
    }
@@ -2516,12 +2516,12 @@ vmdk_co_do_create(int64_t size,
        if (strcmp(blk_bs(backing)->drv->format_name, "vmdk")) {
            error_setg(errp, "Invalid backing file format: %s. Must be vmdk",
                       blk_bs(backing)->drv->format_name);
-            blk_unref(backing);
+            blk_co_unref(backing);
            ret = -EINVAL;
            goto exit;
        }
        ret = vmdk_read_cid(blk_bs(backing), 0, &parent_cid);
-        blk_unref(backing);
+        blk_co_unref(backing);
        if (ret) {
            error_setg(errp, "Failed to read parent CID");
            goto exit;
@@ -2542,14 +2542,14 @@ vmdk_co_do_create(int64_t size,
                             blk_bs(extent_blk)->filename);
        created_size += cur_size;
        extent_idx++;
-        blk_unref(extent_blk);
+        blk_co_unref(extent_blk);
    }

    /* Check whether we got excess extents */
    extent_blk = extent_fn(-1, extent_idx, flat, split, compress, zeroed_grain,
                           opaque, NULL);
    if (extent_blk) {
-        blk_unref(extent_blk);
+        blk_co_unref(extent_blk);
        error_setg(errp, "List of extents contains unused extents");
        ret = -EINVAL;
        goto exit;
@@ -2590,7 +2590,7 @@ vmdk_co_do_create(int64_t size,
    ret = 0;
 exit:
    if (blk) {
-        blk_unref(blk);
+        blk_co_unref(blk);
    }
    g_free(desc);
    g_free(parent_desc_line);
@@ -2641,7 +2641,7 @@ vmdk_co_create_opts_cb(int64_t size, int idx, bool flat, bool split,
                           errp)) {
        goto exit;
    }
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
 exit:
    g_free(ext_filename);
    return blk;
@@ -2797,12 +2797,12 @@ static BlockBackend * coroutine_fn vmdk_co_create_cb(int64_t size, int idx,
        return NULL;
    }
    blk_set_allow_write_beyond_eof(blk, true);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);

    if (size != -1) {
        ret = vmdk_init_extent(blk, size, flat, compress, zeroed_grain, errp);
        if (ret) {
-            blk_unref(blk);
+            blk_co_unref(blk);
            blk = NULL;
        }
    }
@@ -2845,7 +2845,7 @@ static void vmdk_close(BlockDriverState *bs)
    error_free(s->migration_blocker);
 }

-static int64_t coroutine_fn
+static int64_t coroutine_fn GRAPH_RDLOCK
 vmdk_co_get_allocated_file_size(BlockDriverState *bs)
 {
    int i;
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -1082,8 +1082,8 @@ static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
    }

 out:
-    blk_unref(blk);
-    bdrv_unref(bs);
+    blk_co_unref(blk);
+    bdrv_co_unref(bs);
    return ret;
 }

@@ -1162,7 +1162,7 @@ vpc_co_create_opts(BlockDriver *drv, const char *filename,

 fail:
    qobject_unref(qdict);
-    bdrv_unref(bs);
+    bdrv_co_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }
--- a/blockdev.c
+++ b/blockdev.c
@@ -153,12 +153,22 @@ void blockdev_mark_auto_del(BlockBackend *blk)

    JOB_LOCK_GUARD();

-    for (job = block_job_next_locked(NULL); job;
-         job = block_job_next_locked(job)) {
-        if (block_job_has_bdrv(job, blk_bs(blk))) {
+    do {
+        job = block_job_next_locked(NULL);
+        while (job && (job->job.cancelled ||
+                       job->job.deferred_to_main_loop ||
+                       !block_job_has_bdrv(job, blk_bs(blk))))
+        {
+            job = block_job_next_locked(job);
+        }
+        if (job) {
+            /*
+             * This drops the job lock temporarily and polls, so we need to
+             * restart processing the list from the start after this.
+             */
            job_cancel_locked(&job->job, false);
        }
-    }
+    } while (job);

    dinfo->auto_del = 1;
 }
@@ -2430,7 +2440,7 @@ void coroutine_fn qmp_block_resize(const char *device, const char *node_name,
        return;
    }

-    blk = blk_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL, errp);
+    blk = blk_co_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL, errp);
    if (!blk) {
        return;
    }
@@ -2445,7 +2455,7 @@ void coroutine_fn qmp_block_resize(const char *device, const char *node_name,

    bdrv_co_lock(bs);
    bdrv_drained_end(bs);
-    blk_unref(blk);
+    blk_co_unref(blk);
    bdrv_co_unlock(bs);
 }

@@ -2951,6 +2961,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
    BlockDriverState *unfiltered_bs;
    int job_flags = JOB_DEFAULT;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if (!has_speed) {
        speed = 0;
    }
--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@@ -352,9 +352,10 @@ static abi_ulong load_elf_interp(struct elfhdr *interp_elf_ex,

 static int symfind(const void *s0, const void *s1)
 {
-    target_ulong addr = *(target_ulong *)s0;
    struct elf_sym *sym = (struct elf_sym *)s1;
+    __typeof(sym->st_value) addr = *(uint64_t *)s0;
    int result = 0;
+
    if (addr < sym->st_value) {
        result = -1;
    } else if (addr >= sym->st_value + sym->st_size) {
@@ -363,7 +364,7 @@ static int symfind(const void *s0, const void *s1)
    return result;
 }

-static const char *lookup_symbolxx(struct syminfo *s, target_ulong orig_addr)
+static const char *lookup_symbolxx(struct syminfo *s, uint64_t orig_addr)
 {
 #if ELF_CLASS == ELFCLASS32
    struct elf_sym *syms = s->disas_symtab.elf32;
--- a/configs/targets/mips-linux-user.mak
+++ b/configs/targets/mips-linux-user.mak
@@ -2,5 +2,4 @@ TARGET_ARCH=mips
 TARGET_ABI_MIPSO32=y
 TARGET_SYSTBL_ABI=o32
 TARGET_SYSTBL=syscall_o32.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
--- a/configs/targets/mips-softmmu.mak
+++ b/configs/targets/mips-softmmu.mak
@@ -1,4 +1,3 @@
 TARGET_ARCH=mips
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
 TARGET_SUPPORTS_MTTCG=y
--- a/configs/targets/mips64-linux-user.mak
+++ b/configs/targets/mips64-linux-user.mak
@@ -3,5 +3,4 @@ TARGET_ABI_MIPSN64=y
 TARGET_BASE_ARCH=mips
 TARGET_SYSTBL_ABI=n64
 TARGET_SYSTBL=syscall_n64.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
--- a/configs/targets/mips64-softmmu.mak
+++ b/configs/targets/mips64-softmmu.mak
@@ -1,4 +1,3 @@
 TARGET_ARCH=mips64
 TARGET_BASE_ARCH=mips
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
--- a/configs/targets/mips64el-linux-user.mak
+++ b/configs/targets/mips64el-linux-user.mak
@@ -3,4 +3,3 @@ TARGET_ABI_MIPSN64=y
 TARGET_BASE_ARCH=mips
 TARGET_SYSTBL_ABI=n64
 TARGET_SYSTBL=syscall_n64.tbl
-TARGET_ALIGNED_ONLY=y
--- a/configs/targets/mips64el-softmmu.mak
+++ b/configs/targets/mips64el-softmmu.mak
@@ -1,4 +1,3 @@
 TARGET_ARCH=mips64
 TARGET_BASE_ARCH=mips
-TARGET_ALIGNED_ONLY=y
 TARGET_NEED_FDT=y
--- a/configs/targets/mipsel-linux-user.mak
+++ b/configs/targets/mipsel-linux-user.mak
@@ -2,4 +2,3 @@ TARGET_ARCH=mips
 TARGET_ABI_MIPSO32=y
 TARGET_SYSTBL_ABI=o32
 TARGET_SYSTBL=syscall_o32.tbl
-TARGET_ALIGNED_ONLY=y
--- a/configs/targets/mipsel-softmmu.mak
+++ b/configs/targets/mipsel-softmmu.mak
@@ -1,3 +1,2 @@
 TARGET_ARCH=mips
-TARGET_ALIGNED_ONLY=y
 TARGET_SUPPORTS_MTTCG=y
--- a/configs/targets/mipsn32-linux-user.mak
+++ b/configs/targets/mipsn32-linux-user.mak
@@ -4,5 +4,4 @@ TARGET_ABI32=y
 TARGET_BASE_ARCH=mips
 TARGET_SYSTBL_ABI=n32
 TARGET_SYSTBL=syscall_n32.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
--- a/configs/targets/mipsn32el-linux-user.mak
+++ b/configs/targets/mipsn32el-linux-user.mak
@@ -4,4 +4,3 @@ TARGET_ABI32=y
 TARGET_BASE_ARCH=mips
 TARGET_SYSTBL_ABI=n32
 TARGET_SYSTBL=syscall_n32.tbl
-TARGET_ALIGNED_ONLY=y
--- a/configs/targets/nios2-softmmu.mak
+++ b/configs/targets/nios2-softmmu.mak
@@ -1,3 +1,2 @@
 TARGET_ARCH=nios2
-TARGET_ALIGNED_ONLY=y
 TARGET_NEED_FDT=y
--- a/configs/targets/sh4-linux-user.mak
+++ b/configs/targets/sh4-linux-user.mak
@@ -1,5 +1,4 @@
 TARGET_ARCH=sh4
 TARGET_SYSTBL_ABI=common
 TARGET_SYSTBL=syscall.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_HAS_BFLT=y
--- a/configs/targets/sh4-softmmu.mak
+++ b/configs/targets/sh4-softmmu.mak
@@ -1,2 +1 @@
 TARGET_ARCH=sh4
-TARGET_ALIGNED_ONLY=y
--- a/configs/targets/sh4eb-linux-user.mak
+++ b/configs/targets/sh4eb-linux-user.mak
@@ -1,6 +1,5 @@
 TARGET_ARCH=sh4
 TARGET_SYSTBL_ABI=common
 TARGET_SYSTBL=syscall.tbl
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
 TARGET_HAS_BFLT=y
--- a/configs/targets/sh4eb-softmmu.mak
+++ b/configs/targets/sh4eb-softmmu.mak
@@ -1,3 +1,2 @@
 TARGET_ARCH=sh4
-TARGET_ALIGNED_ONLY=y
 TARGET_BIG_ENDIAN=y
--- a/1
+++ b/1
@@ -816,6 +816,7 @@ for opt do
  --enable-debug)
      # Enable debugging options that aren't excessively noisy
      debug_tcg="yes"
+      meson_option_parse --enable-debug-graph-lock ""
      meson_option_parse --enable-debug-mutex ""
      meson_option_add -Doptimization=0
      fortify_source="no"
--- a/cpu.c
+++ b/cpu.c
@@ -427,6 +427,11 @@ bool target_words_bigendian(void)
 #endif
 }

+const char *target_name(void)
+{
+    return TARGET_NAME;  /* lets callers avoid using the macro directly */
+}
+
 void page_size_init(void)
 {
    /* NOTE: we can always suppose that qemu_host_page_size >=
--- a/cpus-common.c
+++ b/cpus-common.c
@@ -25,7 +25,7 @@
 #include "qemu/lockable.h"
 #include "trace/trace-root.h"

-static QemuMutex qemu_cpu_list_lock;
+QemuMutex qemu_cpu_list_lock;
 static QemuCond exclusive_cond;
 static QemuCond exclusive_resume;
 static QemuCond qemu_work_cond;
--- a/disas/disas-internal.h
+++ b/disas/disas-internal.h
@@ -0,0 +1,21 @@
+/*
+ * Definitions used internally in the disassembly code
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef DISAS_INTERNAL_H
+#define DISAS_INTERNAL_H
+
+#include "disas/dis-asm.h"
+
+typedef struct CPUDebug {
+    struct disassemble_info info;  /* container_of() anchor for callbacks */
+    CPUState *cpu;                 /* CPU whose memory gets disassembled */
+} CPUDebug;
+
+void disas_initialize_debug_target(CPUDebug *s, CPUState *cpu);
+int disas_gstring_printf(FILE *stream, const char *fmt, ...)
+    G_GNUC_PRINTF(2, 3);
+
+#endif
--- a/disas/disas-mon.c
+++ b/disas/disas-mon.c
@@ -0,0 +1,65 @@
+/*
+ * Functions related to disassembly from the monitor
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "disas-internal.h"
+#include "disas/disas.h"
+#include "exec/memory.h"
+#include "hw/core/cpu.h"
+#include "monitor/monitor.h"
+
+static int
+physical_read_memory(bfd_vma memaddr, bfd_byte *myaddr, int length,
+                     struct disassemble_info *info)
+{
+    CPUDebug *s = container_of(info, CPUDebug, info);
+    MemTxResult res;
+    /* Read from the CPU's address space; any failure is reported as EIO. */
+    res = address_space_read(s->cpu->as, memaddr, MEMTXATTRS_UNSPECIFIED,
+                             myaddr, length);
+    return res == MEMTX_OK ? 0 : EIO;
+}
+
+/* Disassemble nb_insn instructions starting at pc and print them to mon. */
+void monitor_disas(Monitor *mon, CPUState *cpu, uint64_t pc,
+                   int nb_insn, bool is_physical)
+{
+    int count, i;
+    CPUDebug s;
+    g_autoptr(GString) ds = g_string_new("");
+    /* Collect all disassembler output in ds instead of a real stream. */
+    disas_initialize_debug_target(&s, cpu);
+    s.info.fprintf_func = disas_gstring_printf;
+    s.info.stream = (FILE *)ds;  /* abuse this slot */
+    /* With is_physical set, fetch code bytes via physical_read_memory. */
+    if (is_physical) {
+        s.info.read_memory_func = physical_read_memory;
+    }
+    s.info.buffer_vma = pc;
+    /* Try cap_disas_monitor first; on success just print ds and return. */
+    if (s.info.cap_arch >= 0 && cap_disas_monitor(&s.info, pc, nb_insn)) {
+        monitor_puts(mon, ds->str);
+        return;
+    }
+    /* No print_insn callback: report that instead of disassembling. */
+    if (!s.info.print_insn) {
+        monitor_printf(mon, "0x%08" PRIx64
+                       ": Asm output not supported on this arch\n", pc);
+        return;
+    }
+    /* One line per instruction; a negative count ends the loop early. */
+    for (i = 0; i < nb_insn; i++) {
+        g_string_append_printf(ds, "0x%08" PRIx64 ":  ", pc);
+        count = s.info.print_insn(pc, &s.info);
+        g_string_append_c(ds, '\n');
+        if (count < 0) {
+            break;
+        }
+        pc += count;
+    }
+
+    monitor_puts(mon, ds->str);
+}
--- a/disas/disas.c
+++ b/disas/disas.c
@@ -1,16 +1,12 @@
 /* General "disassemble this chunk" code.  Used for debugging. */
 #include "qemu/osdep.h"
-#include "disas/dis-asm.h"
+#include "disas/disas-internal.h"
 #include "elf.h"
 #include "qemu/qemu-print.h"
-
 #include "disas/disas.h"
 #include "disas/capstone.h"
-
-typedef struct CPUDebug {
-    struct disassemble_info info;
-    CPUState *cpu;
-} CPUDebug;
+#include "hw/core/cpu.h"
+#include "exec/memory.h"

 /* Filled in by elfload.c.  Simplistic, but will do for now. */
 struct syminfo *syminfos = NULL;
@@ -119,18 +115,18 @@ static void initialize_debug(CPUDebug *s)
    s->info.symbol_at_address_func = symbol_at_address;
 }

-static void initialize_debug_target(CPUDebug *s, CPUState *cpu)
+void disas_initialize_debug_target(CPUDebug *s, CPUState *cpu)
 {
    initialize_debug(s);

    s->cpu = cpu;
    s->info.read_memory_func = target_read_memory;
    s->info.print_address_func = print_address;
-#if TARGET_BIG_ENDIAN
-    s->info.endian = BFD_ENDIAN_BIG;
-#else
-    s->info.endian = BFD_ENDIAN_LITTLE;
-#endif
+    if (target_words_bigendian()) {
+        s->info.endian = BFD_ENDIAN_BIG;
+    } else {
+        s->info.endian =  BFD_ENDIAN_LITTLE;
+    }

    CPUClass *cc = CPU_GET_CLASS(cpu);
    if (cc->disas_set_info) {
@@ -168,7 +164,7 @@ static void initialize_debug_host(CPUDebug *s)
 # ifdef _ARCH_PPC64
    s->info.cap_mode = CS_MODE_64;
 # endif
-#elif defined(__riscv) && defined(CONFIG_RISCV_DIS)
+#elif defined(__riscv)
 #if defined(_ILP32) || (__riscv_xlen == 32)
    s->info.print_insn = print_insn_riscv32;
 #elif defined(_LP64)
@@ -204,14 +200,13 @@ static void initialize_debug_host(CPUDebug *s)
 }

 /* Disassemble this for me please... (debugging).  */
-void target_disas(FILE *out, CPUState *cpu, target_ulong code,
-                  target_ulong size)
+void target_disas(FILE *out, CPUState *cpu, uint64_t code, size_t size)
 {
-    target_ulong pc;
+    uint64_t pc;
    int count;
    CPUDebug s;

-    initialize_debug_target(&s, cpu);
+    disas_initialize_debug_target(&s, cpu);
    s.info.fprintf_func = fprintf;
    s.info.stream = out;
    s.info.buffer_vma = code;
@@ -226,11 +221,12 @@ void target_disas(FILE *out, CPUState *cpu, target_ulong code,
    }

    for (pc = code; size > 0; pc += count, size -= count) {
-	fprintf(out, "0x" TARGET_FMT_lx ":  ", pc);
-	count = s.info.print_insn(pc, &s.info);
-	fprintf(out, "\n");
-	if (count < 0)
-	    break;
+        fprintf(out, "0x%08" PRIx64 ":  ", pc);
+        count = s.info.print_insn(pc, &s.info);
+        fprintf(out, "\n");
+        if (count < 0) {
+            break;
+        }
        if (size < count) {
            fprintf(out,
                    "Disassembler disagrees with translator over instruction "
@@ -241,8 +237,7 @@ void target_disas(FILE *out, CPUState *cpu, target_ulong code,
    }
 }

-static int G_GNUC_PRINTF(2, 3)
-gstring_printf(FILE *stream, const char *fmt, ...)
+int disas_gstring_printf(FILE *stream, const char *fmt, ...)
 {
    /* We abuse the FILE parameter to pass a GString. */
    GString *s = (GString *)stream;
@@ -272,8 +267,8 @@ char *plugin_disas(CPUState *cpu, uint64_t addr, size_t size)
    CPUDebug s;
    GString *ds = g_string_new(NULL);

-    initialize_debug_target(&s, cpu);
-    s.info.fprintf_func = gstring_printf;
+    disas_initialize_debug_target(&s, cpu);
+    s.info.fprintf_func = disas_gstring_printf;
    s.info.stream = (FILE *)ds;  /* abuse this slot */
    s.info.buffer_vma = addr;
    s.info.buffer_length = size;
@@ -292,7 +287,7 @@ char *plugin_disas(CPUState *cpu, uint64_t addr, size_t size)
 }

 /* Disassemble this for me please... (debugging). */
-void disas(FILE *out, const void *code, unsigned long size)
+void disas(FILE *out, const void *code, size_t size)
 {
    uintptr_t pc;
    int count;
@@ -324,7 +319,7 @@ void disas(FILE *out, const void *code, unsigned long size)
 }

 /* Look up symbol for debugging purpose.  Returns "" if unknown. */
-const char *lookup_symbol(target_ulong orig_addr)
+const char *lookup_symbol(uint64_t orig_addr)
 {
    const char *symbol = "";
    struct syminfo *s;
@@ -338,61 +333,3 @@ const char *lookup_symbol(target_ulong orig_addr)

    return symbol;
 }
-
-#if !defined(CONFIG_USER_ONLY)
-
-#include "monitor/monitor.h"
-
-static int
-physical_read_memory(bfd_vma memaddr, bfd_byte *myaddr, int length,
-                     struct disassemble_info *info)
-{
-    CPUDebug *s = container_of(info, CPUDebug, info);
-    MemTxResult res;
-
-    res = address_space_read(s->cpu->as, memaddr, MEMTXATTRS_UNSPECIFIED,
-                             myaddr, length);
-    return res == MEMTX_OK ? 0 : EIO;
-}
-
-/* Disassembler for the monitor.  */
-void monitor_disas(Monitor *mon, CPUState *cpu,
-                   target_ulong pc, int nb_insn, int is_physical)
-{
-    int count, i;
-    CPUDebug s;
-    g_autoptr(GString) ds = g_string_new("");
-
-    initialize_debug_target(&s, cpu);
-    s.info.fprintf_func = gstring_printf;
-    s.info.stream = (FILE *)ds;  /* abuse this slot */
-
-    if (is_physical) {
-        s.info.read_memory_func = physical_read_memory;
-    }
-    s.info.buffer_vma = pc;
-
-    if (s.info.cap_arch >= 0 && cap_disas_monitor(&s.info, pc, nb_insn)) {
-        monitor_puts(mon, ds->str);
-        return;
-    }
-
-    if (!s.info.print_insn) {
-        monitor_printf(mon, "0x" TARGET_FMT_lx
-                       ": Asm output not supported on this arch\n", pc);
-        return;
-    }
-
-    for (i = 0; i < nb_insn; i++) {
-        g_string_append_printf(ds, "0x" TARGET_FMT_lx ":  ", pc);
-        count = s.info.print_insn(pc, &s.info);
-        g_string_append_c(ds, '\n');
-        if (count < 0) {
-            break;
-        }
-        pc += count;
-    }
-
-    monitor_puts(mon, ds->str);
-}
-#endif
--- a/disas/meson.build
+++ b/disas/meson.build
@@ -10,4 +10,8 @@ common_ss.add(when: 'CONFIG_RISCV_DIS', if_true: files('riscv.c'))
 common_ss.add(when: 'CONFIG_SH4_DIS', if_true: files('sh4.c'))
 common_ss.add(when: 'CONFIG_SPARC_DIS', if_true: files('sparc.c'))
 common_ss.add(when: 'CONFIG_XTENSA_DIS', if_true: files('xtensa.c'))
-common_ss.add(when: capstone, if_true: files('capstone.c'))
+common_ss.add(when: capstone, if_true: [files('capstone.c'), capstone])
+common_ss.add(files('disas.c'))
+
+softmmu_ss.add(files('disas-mon.c'))
+specific_ss.add(capstone)
--- a/docs/COLO-FT.txt
+++ b/docs/COLO-FT.txt
@@ -210,6 +210,7 @@ children.0=childs0 \

 3. On Secondary VM's QEMU monitor, issue command
 {"execute":"qmp_capabilities"}
+{"execute": "migrate-set-capabilities", "arguments": {"capabilities": [ {"capability": "x-colo", "state": true } ] } }
 {"execute": "nbd-server-start", "arguments": {"addr": {"type": "inet", "data": {"host": "0.0.0.0", "port": "9999"} } } }
 {"execute": "nbd-server-add", "arguments": {"device": "parent0", "writable": true } }

--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -111,6 +111,11 @@ Use ``-machine acpi=off`` instead.
 The HAXM project has been retired (see https://github.com/intel/haxm#status).
 Use "whpx" (on Windows) or "hvf" (on macOS) instead.

+``-async-teardown`` (since 8.1)
+'''''''''''''''''''''''''''''''
+
+Use ``-run-with async-teardown=on`` instead.
+
 ``-singlestep`` (since 8.1)
 '''''''''''''''''''''''''''

--- a/docs/about/emulation.rst
+++ b/docs/about/emulation.rst
@@ -99,7 +99,7 @@ depending on the guest architecture.
    - Yes
    - A configurable 32 bit soft core now owned by Cadence

-A number of features are are only available when running under
+A number of features are only available when running under
 emulation including :ref:`Record/Replay<replay>` and :ref:`TCG Plugins`.

 .. _Semihosting:
--- a/docs/devel/index-api.rst
+++ b/docs/devel/index-api.rst
@@ -12,3 +12,4 @@ generated from in-code annotations to function prototypes.
   memory
   modules
   ui
+   zoned-storage
--- a/docs/devel/kconfig.rst
+++ b/docs/devel/kconfig.rst
@@ -282,9 +282,19 @@ want to change some lines in the first group, for example like this::
   CONFIG_PCI_DEVICES=y
   #CONFIG_TEST_DEVICES=n

-and/or pick a subset of the devices in those device groups.  Right now
-there is no single place that lists all the optional devices for
-``CONFIG_PCI_DEVICES`` and ``CONFIG_TEST_DEVICES``.  In the future,
+and/or pick a subset of the devices in those device groups.  Without
+further modifications to ``configs/devices/``, a system emulator built
+without default devices might not do much more than start an empty
+machine, and even then only if ``--nodefaults`` is specified on the
+command line.  Starting a VM *without* ``--nodefaults`` is allowed to
+fail, but should never abort.  Failures in ``make check`` with
+``--without-default-devices`` are considered bugs in the test code:
+the tests should use ``--nodefaults``, and should be skipped
+if a necessary device is not present in the build.  Such failures
+should not be worked around with ``select`` directives.
+
+Right now there is no single place that lists all the optional devices
+for ``CONFIG_PCI_DEVICES`` and ``CONFIG_TEST_DEVICES``.  In the future,
 we expect that ``.mak`` files will be automatically generated, so that
 they will include all these symbols and some help text on what they do.

--- a/docs/devel/loads-stores.rst
+++ b/docs/devel/loads-stores.rst
@@ -297,31 +297,20 @@ swap: ``translator_ld{sign}{size}_swap(env, ptr, swap)``
 Regexes for git grep
 - ``\<translator_ld[us]\?[bwlq]\(_swap\)\?\>``

-``helper_*_{ld,st}*_mmu``
+``helper_{ld,st}*_mmu``
 ~~~~~~~~~~~~~~~~~~~~~~~~~

 These functions are intended primarily to be called by the code
-generated by the TCG backend. They may also be called by target
-CPU helper function code. Like the ``cpu_{ld,st}_mmuidx_ra`` functions
-they perform accesses by guest virtual address, with a given ``mmuidx``.
+generated by the TCG backend.  Like the ``cpu_{ld,st}_mmu`` functions
+they perform accesses by guest virtual address, with a given ``MemOpIdx``.

-These functions specify an ``opindex`` parameter which encodes
-(among other things) the mmu index to use for the access.  This parameter
-should be created by calling ``make_memop_idx()``.
+They differ from ``cpu_{ld,st}_mmu`` in that they take the endianness
+of the operation only from the MemOpIdx, and loads extend the return
+value to the size of a host general register (``tcg_target_ulong``).

-The ``retaddr`` parameter should be the result of GETPC() called directly
-from the top level HELPER(foo) function (or 0 if no guest CPU state
-unwinding is required).
+load: ``helper_ld{sign}{size}_mmu(env, addr, opindex, retaddr)``

-**TODO** The names of these functions are a bit odd for historical
-reasons because they were originally expected to be called only from
-within generated code. We should rename them to bring them more in
-line with the other memory access functions. The explicit endianness
-is the only feature they have beyond ``*_mmuidx_ra``.
-
-load: ``helper_{endian}_ld{sign}{size}_mmu(env, addr, opindex, retaddr)``
-
-store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
+store: ``helper_st{size}_mmu(env, addr, val, opindex, retaddr)``

 ``sign``
 - (empty) : for 32 or 64 bit sizes
@@ -334,14 +323,9 @@ store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
 - ``l`` : 32 bits
 - ``q`` : 64 bits

-``endian``
- - ``le`` : little endian
- - ``be`` : big endian
- - ``ret`` : target endianness
-
 Regexes for git grep
- - ``\<helper_\(le\|be\|ret\)_ld[us]\?[bwlq]_mmu\>``
- - ``\<helper_\(le\|be\|ret\)_st[bwlq]_mmu\>``
+ - ``\<helper_ld[us]\?[bwlq]_mmu\>``
+ - ``\<helper_st[bwlq]_mmu\>``

 ``address_space_*``
 ~~~~~~~~~~~~~~~~~~~
--- a/docs/devel/qapi-code-gen.rst
+++ b/docs/devel/qapi-code-gen.rst
@@ -947,6 +947,11 @@ Example::
 #   <- get that
 ##

+For legibility, wrap text paragraphs so every line is at most 70
+characters long.
+
+Separate sentences with two spaces.
+

 Definition documentation
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -963,57 +968,46 @@ commands and events), member (for structs and unions), branch (for
 alternates), or value (for enums), a description of each feature (if
 any), and finally optional tagged sections.

-The description of an argument or feature 'name' starts with
-'\@name:'.  The description text can start on the line following the
-'\@name:', in which case it must not be indented at all.  It can also
-start on the same line as the '\@name:'.  In this case if it spans
-multiple lines then second and subsequent lines must be indented to
-line up with the first character of the first line of the
-description::
+Descriptions start with '\@name:'.  The description text should be
+indented like this::

- # @argone:
- # This is a two line description
- # in the first style.
- #
- # @argtwo: This is a two line description
- #          in the second style.
+ # @name: Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed
+ #     do eiusmod tempor incididunt ut labore et dolore magna aliqua.

-The number of spaces between the ':' and the text is not significant.
+.. FIXME The parser accepts these things in almost any order.

-.. admonition:: FIXME
-
-   The parser accepts these things in almost any order.
-
-.. admonition:: FIXME
-
-   union branches should be described, too.
+.. FIXME union branches should be described, too.

 Extensions added after the definition was first released carry a
-'(since x.y.z)' comment.
+"(since x.y.z)" comment.

 The feature descriptions must be preceded by a line "Features:", like
 this::

  # Features:
+  #
  # @feature: Description text

 A tagged section starts with one of the following words:
 "Note:"/"Notes:", "Since:", "Example"/"Examples", "Returns:", "TODO:".
 The section ends with the start of a new section.

-The text of a section can start on a new line, in
-which case it must not be indented at all.  It can also start
-on the same line as the 'Note:', 'Returns:', etc tag.  In this
-case if it spans multiple lines then second and subsequent
-lines must be indented to match the first, in the same way as
-multiline argument descriptions.
+The second and subsequent lines of sections other than
+"Example"/"Examples" should be indented like this::

-A 'Since: x.y.z' tagged section lists the release that introduced the
+ # Note: Ut enim ad minim veniam, quis nostrud exercitation ullamco
+ #     laboris nisi ut aliquip ex ea commodo consequat.
+ #
+ #     Duis aute irure dolor in reprehenderit in voluptate velit esse
+ #     cillum dolore eu fugiat nulla pariatur.
+
+A "Since: x.y.z" tagged section lists the release that introduced the
 definition.

-An 'Example' or 'Examples' section is automatically rendered
-entirely as literal fixed-width text.  In other sections,
-the text is formatted, and rST markup can be used.
+An "Example" or "Examples" section is rendered entirely
+as literal fixed-width text.  "TODO" sections are not rendered at all
+(they are for developers, not users of QMP).  In other sections, the
+text is formatted, and rST markup can be used.

 For example::

@@ -1023,7 +1017,7 @@ For example::
 # Statistics of a virtual block device or a block backing device.
 #
 # @device: If the stats are for a virtual block device, the name
- #          corresponding to the virtual block device.
+ #     corresponding to the virtual block device.
 #
 # @node-name: The node name of the device. (since 2.3)
 #
@@ -1040,8 +1034,8 @@ For example::
 #
 # Query the @BlockStats for all virtual block devices.
 #
- # @query-nodes: If true, the command will query all the
- #               block nodes ... explain, explain ...  (since 2.3)
+ # @query-nodes: If true, the command will query all the block nodes
+ #     ... explain, explain ...  (since 2.3)
 #
 # Returns: A list of @BlockStats for each virtual block devices.
 #
@@ -1078,10 +1072,14 @@ Indentation matters.  Bad example::

 # @none: None (no memory side cache in this proximity domain,
 #              or cache associativity unknown)
+ #     (since 5.0)

-The description is parsed as a definition list with term "None (no
-memory side cache in this proximity domain," and definition "or cache
-associativity unknown)".
+The last line's de-indent is wrong.  The second and subsequent lines
+need to line up with each other, like this::
+
+ # @none: None (no memory side cache in this proximity domain,
+ #     or cache associativity unknown)
+ #     (since 5.0)

 Section tags are case-sensitive and end with a colon.  Good example::

--- a/docs/devel/tcg-ops.rst
+++ b/docs/devel/tcg-ops.rst
@@ -672,19 +672,20 @@ QEMU specific operations
       | This operation is optional. If the TCG backend does not implement the
         goto_ptr opcode, emitting this op is equivalent to emitting exit_tb(0).

-   * - qemu_ld_i32/i64 *t0*, *t1*, *flags*, *memidx*
+   * - qemu_ld_i32/i64/i128 *t0*, *t1*, *flags*, *memidx*

-       qemu_st_i32/i64 *t0*, *t1*, *flags*, *memidx*
+       qemu_st_i32/i64/i128 *t0*, *t1*, *flags*, *memidx*

       qemu_st8_i32 *t0*, *t1*, *flags*, *memidx*

     - | Load data at the guest address *t1* into *t0*, or store data in *t0* at guest
-         address *t1*.  The _i32/_i64 size applies to the size of the input/output
+         address *t1*.  The _i32/_i64/_i128 size applies to the size of the input/output
         register *t0* only.  The address *t1* is always sized according to the guest,
         and the width of the memory operation is controlled by *flags*.
       |
       | Both *t0* and *t1* may be split into little-endian ordered pairs of registers
-         if dealing with 64-bit quantities on a 32-bit host.
+         if dealing with 64-bit quantities on a 32-bit host, or 128-bit quantities on
+         a 64-bit host.
       |
       | The *memidx* selects the qemu tlb index to use (e.g. user or kernel access).
         The flags are the MemOp bits, selecting the sign, width, and endianness
@@ -693,6 +694,8 @@ QEMU specific operations
       | For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
         64-bit memory access specified in *flags*.
       |
+       | For qemu_ld/st_i128, these are only supported for a 64-bit host.
+       |
       | For i386, qemu_st8_i32 is exactly like qemu_st_i32, except the size of
         the memory operation is known to be 8-bit.  This allows the backend to
         provide a different set of register constraints.
--- a/docs/devel/testing.rst
+++ b/docs/devel/testing.rst
@@ -479,6 +479,12 @@ first to contribute the mapping to the ``libvirt-ci`` project:
   contains the ``mappings.yml`` update.  Then add the prerequisite and
   run ``make lcitool-refresh``.

+ * Please also trigger gitlab container generation pipelines on your change
+   for as many OS distros as practical to make sure that there are no
+   obvious breakages when adding the new pre-requisite. Please see
+   `CI <https://www.qemu.org/docs/master/devel/ci.html>`__ documentation
+   page on how to trigger gitlab CI pipelines on your change.
+
 For enterprise distros that default to old, end-of-life versions of the
 Python runtime, QEMU uses a separate set of mappings that work with more
 recent versions.  These can be found in ``tests/lcitool/mappings.yml``.
--- a/docs/devel/zoned-storage.rst
+++ b/docs/devel/zoned-storage.rst
@@ -0,0 +1,62 @@
+=============
+zoned-storage
+=============
+
+Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
+that are larger than the LBA size. They can only allow sequential writes, which
+can reduce write amplification in SSDs, and potentially lead to higher
+throughput and increased capacity. More details about ZBDs can be found at:
+
+https://zonedstorage.io/docs/introduction/zoned-storage
+
+1. Block layer APIs for zoned storage
+-------------------------------------
+QEMU block layer supports three zoned storage models:
+- BLK_Z_HM: The host-managed zoned model only allows sequential write access
+to zones. It supports ZBD-specific I/O commands that can be used by a host to
+manage the zones of a device.
+- BLK_Z_HA: The host-aware zoned model allows random write operations in
+zones, making it backward compatible with regular block devices.
+- BLK_Z_NONE: The non-zoned model has no zones support. It includes both
+regular and drive-managed ZBD devices. ZBD-specific I/O commands are not
+supported.
+
+The block device information resides inside BlockDriverState. QEMU uses
+BlockLimits struct(BlockDriverState::bl) that is continuously accessed by the
+block layer while processing I/O requests. A BlockBackend has a root pointer to
+a BlockDriverState graph(for example, raw format on top of file-posix). The
+zoned storage information can be propagated from the leaf BlockDriverState all
+the way up to the BlockBackend. If the zoned storage model in file-posix is
+set to BLK_Z_HM, then block drivers will declare support for zoned host device.
+
+The block layer APIs support commands needed for zoned storage devices,
+including report zones, four zone operations, and zone append.
+
+2. Emulating zoned storage controllers
+--------------------------------------
+When the BlockBackend's BlockLimits model reports a zoned storage device, users
+like the virtio-blk emulation or the qemu-io-cmds.c utility can use block layer
+APIs for zoned storage emulation or testing.
+
+For example, to test zone_report on a null_blk device using qemu-io is::
+
+  $ path/to/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 -c "zrp offset nr_zones"
+
+To expose the host's zoned block device through virtio-blk, the command line
+can be (includes the -device parameter)::
+
+  -blockdev node-name=drive0,driver=host_device,filename=/dev/nullb0,cache.direct=on \
+  -device virtio-blk-pci,drive=drive0
+
+Or only use the -drive parameter::
+
+  -drive driver=host_device,file=/dev/nullb0,if=virtio,cache.direct=on
+
+Additionally, QEMU has several ways of supporting zoned storage, including:
+(1) Using virtio-scsi: --device scsi-block allows for the passing through of
+SCSI ZBC devices, enabling the attachment of ZBC or ZAC HDDs to QEMU.
+(2) PCI device pass-through: While NVMe ZNS emulation is available for testing
+purposes, it cannot yet pass through a zoned device from the host. To pass on
+the NVMe ZNS device to the guest, use VFIO PCI to pass the entire NVMe PCI
+adapter through to the guest. Likewise, an HDD HBA can be passed on to QEMU
+together with all HDDs attached to the HBA.
--- a/docs/sphinx/qapidoc.py
+++ b/docs/sphinx/qapidoc.py
@@ -268,6 +268,9 @@ class QAPISchemaGenRSTVisitor(QAPISchemaVisitor):
        """Return list of doctree nodes for additional sections"""
        nodelist = []
        for section in doc.sections:
+            if section.name and section.name == 'TODO':
+                # Hide TODO: sections
+                continue
            snode = self._make_section(section.name)
            if section.name and section.name.startswith('Example'):
                snode += self._nodes_for_example(section.text)
--- a/docs/system/device-emulation.rst
+++ b/docs/system/device-emulation.rst
@@ -93,4 +93,5 @@ Emulated Devices
   devices/virtio-pmem.rst
   devices/vhost-user-rng.rst
   devices/canokey.rst
+   devices/usb-u2f.rst
   devices/igb.rst
--- a/docs/system/devices/igb.rst
+++ b/docs/system/devices/igb.rst
@@ -29,7 +29,7 @@ Using igb
 =========

 Using igb should be nothing different from using another network device. See
-:ref:`pcsys_005fnetwork` in general.
+:ref:`Network_Emulation` in general.

 However, you may also need to perform additional steps to activate SR-IOV
 feature on your guest. For Linux, refer to [4]_.
--- a/docs/system/devices/ivshmem.rst
+++ b/docs/system/devices/ivshmem.rst
@@ -1,5 +1,3 @@
-.. _pcsys_005fivshmem:
-
 Inter-VM Shared Memory device
 -----------------------------

--- a/docs/system/devices/net.rst
+++ b/docs/system/devices/net.rst
@@ -1,4 +1,4 @@
-.. _pcsys_005fnetwork:
+.. _Network_Emulation:

 Network emulation
 -----------------
--- a/docs/system/devices/usb-u2f.rst
+++ b/docs/system/devices/usb-u2f.rst
@@ -0,0 +1,93 @@
+Universal Second Factor (U2F) USB Key Device
+============================================
+
+U2F is an open authentication standard that enables relying parties
+exposed to the internet to offer a strong second factor option for end
+user authentication.
+
+The second factor is provided by a device implementing the U2F
+protocol. In case of a USB U2F security key, it is a USB HID device
+that implements the U2F protocol.
+
+QEMU supports both pass-through of a host U2F key device to a VM,
+and software emulation of a U2F key.
+
+``u2f-passthru``
+----------------
+
+The ``u2f-passthru`` device allows you to connect a real hardware
+U2F key on your host to a guest VM. All requests made from the guest
+are passed through to the physical security key connected to the
+host machine and vice versa.
+
+In addition, the dedicated pass-through allows you to share a single
+U2F security key with several guest VMs, which is not possible with a
+simple host device assignment pass-through.
+
+You can specify the host U2F key to use with the ``hidraw``
+option, which takes the host path to a Linux ``/dev/hidrawN`` device:
+
+.. parsed-literal::
+   |qemu_system| -usb -device u2f-passthru,hidraw=/dev/hidraw0
+
+If you don't specify the device, the ``u2f-passthru`` device will
+autoscan to take the first U2F device it finds on the host (this
+requires a working libudev):
+
+.. parsed-literal::
+   |qemu_system| -usb -device u2f-passthru
+
+``u2f-emulated``
+----------------
+
+``u2f-emulated`` is a completely software emulated U2F device.
+It uses `libu2f-emu <https://github.com/MattGorko/libu2f-emu>`__
+for the U2F key emulation. libu2f-emu
+provides a complete implementation of the U2F protocol device part for
+all specified transports given by the FIDO Alliance.
+
+To work, an emulated U2F device must have four elements:
+
+ * ec x509 certificate
+ * ec private key
+ * counter (four bytes value)
+ * 48 bytes of entropy (random bits)
+
+To use this type of device, it has to be configured, and these
+four elements must be passed one way or another.
+
+Assuming that you have a working libu2f-emu installed on the host,
+there are three possible ways to configure the ``u2f-emulated`` device:
+
+ * ephemeral
+ * setup directory
+ * manual
+
+Ephemeral is the simplest way to configure; it lets the device generate
+all the elements it needs for a single use during the lifetime of the device.
+It is the default if you do not pass any other options to the device.
+
+.. parsed-literal::
+   |qemu_system| -usb -device u2f-emulated
+
+You can pass the device the path of a setup directory on the host
+using the ``dir`` option; the directory must contain these four files:
+
+ * ``certificate.pem``: ec x509 certificate
+ * ``private-key.pem``: ec private key
+ * ``counter``: counter value
+ * ``entropy``: 48 bytes of entropy
+
+.. parsed-literal::
+   |qemu_system| -usb -device u2f-emulated,dir=$dir
+
+You can also manually pass the device the paths to each of these files,
+if you don't want them all to be in the same directory, using the options
+
+ * ``cert``
+ * ``priv``
+ * ``counter``
+ * ``entropy``
+
+.. parsed-literal::
+   |qemu_system| -usb -device u2f-emulated,cert=$DIR1/$FILE1,priv=$DIR2/$FILE2,counter=$DIR3/$FILE3,entropy=$DIR4/$FILE4
--- a/docs/system/devices/usb.rst
+++ b/docs/system/devices/usb.rst
@@ -1,5 +1,3 @@
-.. _pcsys_005fusb:
-
 USB emulation
 -------------

@@ -209,7 +207,7 @@ option or the ``device_add`` monitor command. Available devices are:
   USB audio device

 ``u2f-{emulated,passthru}``
-   Universal Second Factor device
+   :doc:`usb-u2f`

 ``canokey``
   An Open-source Secure Key implementing FIDO2, OpenPGP, PIV and more.
--- a/docs/system/gdb.rst
+++ b/docs/system/gdb.rst
@@ -46,6 +46,28 @@ Here are some useful tips in order to use gdb on system code:
 3. Use ``set architecture i8086`` to dump 16 bit code. Then use
   ``x/10i $cs*16+$eip`` to dump the code at the PC position.

+Breakpoint and Watchpoint support
+=================================
+
+While GDB can always fall back to inserting breakpoints into memory
+(if writable), other features are very much dependent on support of the
+accelerator. For TCG system emulation we advertise an infinite number
+of hardware-assisted breakpoints and watchpoints. For other
+accelerators it will depend on whether support has been added (see
+supports_guest_debug and related hooks in AccelOpsClass).
+
+As TCG cannot track all memory accesses in user-mode, there is no
+support for watchpoints in that mode.
+
+Relocating code
+---------------
+
+On modern kernels, confusion can be caused by code being relocated by
+features such as address space layout randomisation. To avoid
+confusion when debugging such things, you either need to update gdb's
+view of where things are in memory or, perhaps more trivially, disable
+ASLR when booting the system.
+
 Debugging multicore machines
 ============================

--- a/docs/system/keys.rst
+++ b/docs/system/keys.rst
@@ -1,4 +1,4 @@
-.. _pcsys_005fkeys:
+.. _GUI_keys:

 Keys in the graphical frontends
 -------------------------------
--- a/docs/system/linuxboot.rst
+++ b/docs/system/linuxboot.rst
@@ -27,4 +27,4 @@ virtual serial port and the QEMU monitor to the console with the
                    -append "root=/dev/hda console=ttyS0" -nographic

 Use Ctrl-a c to switch between the serial console and the monitor (see
-:ref:`pcsys_005fkeys`).
+:ref:`GUI_keys`).
--- a/docs/system/qemu-block-drivers.rst.inc
+++ b/docs/system/qemu-block-drivers.rst.inc
@@ -430,6 +430,12 @@ Hard disks
  you may corrupt your host data (use the ``-snapshot`` command
  line option or modify the device permissions accordingly).

+Zoned block devices
+  Zoned block devices can be passed through to the guest if the emulated storage
+  controller supports zoned storage. Use
+  ``--blockdev host_device,node-name=drive0,filename=/dev/nullb0,cache.direct=on``
+  to pass through ``/dev/nullb0`` as ``drive0``.
+
 Windows
 ^^^^^^^

--- a/docs/system/target-i386.rst
+++ b/docs/system/target-i386.rst
@@ -3,8 +3,6 @@
 x86 System emulator
 -------------------

-.. _pcsys_005fdevices:
-
 Board-specific documentation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -32,8 +30,6 @@ Architectural features
   i386/sgx
   i386/amd-memory-encryption

-.. _pcsys_005freq:
-
 OS requirements
 ~~~~~~~~~~~~~~~

--- a/docs/u2f.txt
+++ b/docs/u2f.txt
@@ -1,110 +0,0 @@
-QEMU U2F Key Device Documentation.
-
-Contents
-1. USB U2F key device
-2. Building
-3. Using u2f-emulated
-4. Using u2f-passthru
-5. Libu2f-emu
-
-1. USB U2F key device
-
-U2F is an open authentication standard that enables relying parties
-exposed to the internet to offer a strong second factor option for end
-user authentication.
-
-The standard brings many advantages to both parties, client and server,
-allowing to reduce over-reliance on passwords, it increases authentication
-security and simplifies passwords.
-
-The second factor is materialized by a device implementing the U2F
-protocol. In case of a USB U2F security key, it is a USB HID device
-that implements the U2F protocol.
-
-In QEMU, the USB U2F key device offers a dedicated support of U2F, allowing
-guest USB FIDO/U2F security keys operating in two possible modes:
-pass-through and emulated.
-
-The pass-through mode consists of passing all requests made from the guest
-to the physical security key connected to the host machine and vice versa.
-In addition, the dedicated pass-through allows to have a U2F security key
-shared on several guests which is not possible with a simple host device
-assignment pass-through.
-
-The emulated mode consists of completely emulating the behavior of an
-U2F device through software part. Libu2f-emu is used for that.
-
-
-2. Building
-
-To ensure the build of the u2f-emulated device variant which depends
-on libu2f-emu: configuring and building:
-
-    ./configure --enable-u2f && make
-
-The pass-through mode is built by default on Linux. To take advantage
-of the autoscan option it provides, make sure you have a working libudev
-installed on the host.
-
-
-3. Using u2f-emulated
-
-To work, an emulated U2F device must have four elements:
- * ec x509 certificate
- * ec private key
- * counter (four bytes value)
- * 48 bytes of entropy (random bits)
-
-To use this type of device, this one has to be configured, and these
-four elements must be passed one way or another.
-
-Assuming that you have a working libu2f-emu installed on the host.
-There are three possible ways of configurations:
- * ephemeral
- * setup directory
- * manual
-
-Ephemeral is the simplest way to configure, it lets the device generate
-all the elements it needs for a single use of the lifetime of the device.
-
-    qemu -usb -device u2f-emulated
-
-Setup directory allows to configure the device from a directory containing
-four files:
- * certificate.pem: ec x509 certificate
- * private-key.pem: ec private key
- * counter: counter value
- * entropy: 48 bytes of entropy
-
-    qemu -usb -device u2f-emulated,dir=$dir
-
-Manual allows to configure the device more finely by specifying each
-of the elements necessary for the device:
- * cert
- * priv
- * counter
- * entropy
-
-    qemu -usb -device u2f-emulated,cert=$DIR1/$FILE1,priv=$DIR2/$FILE2,counter=$DIR3/$FILE3,entropy=$DIR4/$FILE4
-
-
-4. Using u2f-passthru
-
-On the host specify the u2f-passthru device with a suitable hidraw:
-
-    qemu -usb -device u2f-passthru,hidraw=/dev/hidraw0
-
-Alternately, the u2f-passthru device can autoscan to take the first
-U2F device it finds on the host (this requires a working libudev):
-
-    qemu -usb -device u2f-passthru
-
-
-5. Libu2f-emu
-
-The u2f-emulated device uses libu2f-emu for the U2F key emulation. Libu2f-emu
-implements completely the U2F protocol device part for all specified
-transport given by the FIDO Alliance.
-
-For more information about libu2f-emu see this page:
-https://github.com/MattGorko/libu2f-emu.
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1052,6 +1052,7 @@ SRST
  migration (or once already in postcopy).
 ERST

+#ifdef CONFIG_REPLICATION
    {
        .name       = "x_colo_lost_heartbeat",
        .args_type  = "",
@@ -1060,6 +1061,7 @@ ERST
                      "a failover or takeover is needed.",
        .cmd = hmp_x_colo_lost_heartbeat,
    },
+#endif

 SRST
 ``x_colo_lost_heartbeat``
--- a/hw/9pfs/trace-events
+++ b/hw/9pfs/trace-events
@@ -48,3 +48,9 @@ v9fs_readlink(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_readlink_return(uint16_t tag, uint8_t id, char* target) "tag %d id %d name %s"
 v9fs_setattr(uint16_t tag, uint8_t id, int32_t fid, int32_t valid, int32_t mode, int32_t uid, int32_t gid, int64_t size, int64_t atime_sec, int64_t mtime_sec) "tag %u id %u fid %d iattr={valid %d mode %d uid %d gid %d size %"PRId64" atime=%"PRId64" mtime=%"PRId64" }"
 v9fs_setattr_return(uint16_t tag, uint8_t id) "tag %u id %u"
+
+# xen-9p-backend.c
+xen_9pfs_alloc(char *name) "name %s"
+xen_9pfs_connect(char *name) "name %s"
+xen_9pfs_disconnect(char *name) "name %s"
+xen_9pfs_free(char *name) "name %s"
--- a/hw/9pfs/xen-9p-backend.c
+++ b/hw/9pfs/xen-9p-backend.c
@@ -25,6 +25,8 @@
 #include "qemu/iov.h"
 #include "fsdev/qemu-fsdev.h"

+#include "trace.h"
+
 #define VERSIONS "1"
 #define MAX_RINGS 8
 #define MAX_RING_ORDER 9
@@ -337,6 +339,8 @@ static void xen_9pfs_disconnect(struct XenLegacyDevice *xendev)
    Xen9pfsDev *xen_9pdev = container_of(xendev, Xen9pfsDev, xendev);
    int i;

+    trace_xen_9pfs_disconnect(xendev->name);
+
    for (i = 0; i < xen_9pdev->num_rings; i++) {
        if (xen_9pdev->rings[i].evtchndev != NULL) {
            qemu_set_fd_handler(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev),
@@ -345,40 +349,41 @@ static void xen_9pfs_disconnect(struct XenLegacyDevice *xendev)
                                   xen_9pdev->rings[i].local_port);
            xen_9pdev->rings[i].evtchndev = NULL;
        }
-    }
-}
-
-static int xen_9pfs_free(struct XenLegacyDevice *xendev)
-{
-    Xen9pfsDev *xen_9pdev = container_of(xendev, Xen9pfsDev, xendev);
-    int i;
-
-    if (xen_9pdev->rings[0].evtchndev != NULL) {
-        xen_9pfs_disconnect(xendev);
-    }
-
-    for (i = 0; i < xen_9pdev->num_rings; i++) {
        if (xen_9pdev->rings[i].data != NULL) {
            xen_be_unmap_grant_refs(&xen_9pdev->xendev,
                                    xen_9pdev->rings[i].data,
                                    xen_9pdev->rings[i].intf->ref,
                                    (1 << xen_9pdev->rings[i].ring_order));
+            xen_9pdev->rings[i].data = NULL;
        }
        if (xen_9pdev->rings[i].intf != NULL) {
            xen_be_unmap_grant_ref(&xen_9pdev->xendev,
                                   xen_9pdev->rings[i].intf,
                                   xen_9pdev->rings[i].ref);
+            xen_9pdev->rings[i].intf = NULL;
        }
        if (xen_9pdev->rings[i].bh != NULL) {
            qemu_bh_delete(xen_9pdev->rings[i].bh);
+            xen_9pdev->rings[i].bh = NULL;
        }
    }

    g_free(xen_9pdev->id);
+    xen_9pdev->id = NULL;
    g_free(xen_9pdev->tag);
+    xen_9pdev->tag = NULL;
    g_free(xen_9pdev->path);
+    xen_9pdev->path = NULL;
    g_free(xen_9pdev->security_model);
+    xen_9pdev->security_model = NULL;
    g_free(xen_9pdev->rings);
+    xen_9pdev->rings = NULL;
+}
+
+static int xen_9pfs_free(struct XenLegacyDevice *xendev)
+{
+    trace_xen_9pfs_free(xendev->name);
+
    return 0;
 }

@@ -390,6 +395,8 @@ static int xen_9pfs_connect(struct XenLegacyDevice *xendev)
    V9fsState *s = &xen_9pdev->state;
    QemuOpts *fsdev;

+    trace_xen_9pfs_connect(xendev->name);
+
    if (xenstore_read_fe_int(&xen_9pdev->xendev, "num-rings",
                             &xen_9pdev->num_rings) == -1 ||
        xen_9pdev->num_rings > MAX_RINGS || xen_9pdev->num_rings < 1) {
@@ -499,6 +506,8 @@ out:

 static void xen_9pfs_alloc(struct XenLegacyDevice *xendev)
 {
+    trace_xen_9pfs_alloc(xendev->name);
+
    xenstore_write_be_str(xendev, "versions", VERSIONS);
    xenstore_write_be_int(xendev, "max-rings", MAX_RINGS);
    xenstore_write_be_int(xendev, "max-ring-page-order", MAX_RING_ORDER);
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -436,6 +436,7 @@ config XLNX_VERSAL
    select OR_IRQ
    select XLNX_BBRAM
    select XLNX_EFUSE_VERSAL
+    select XLNX_USB_SUBSYS

 config NPCM7XX
    bool
--- a/hw/arm/sbsa-ref.c
+++ b/hw/arm/sbsa-ref.c
@@ -648,7 +648,7 @@ static void create_pcie(SBSAMachineState *sms)
        }
    }

-    pci_create_simple(pci->bus, -1, "VGA");
+    pci_create_simple(pci->bus, -1, "bochs-display");

    create_smmu(sms, pci->bus);
 }
@@ -852,7 +852,7 @@ static void sbsa_ref_class_init(ObjectClass *oc, void *data)

    mc->init = sbsa_ref_init;
    mc->desc = "QEMU 'SBSA Reference' ARM Virtual Machine";
-    mc->default_cpu_type = ARM_CPU_TYPE_NAME("cortex-a57");
+    mc->default_cpu_type = ARM_CPU_TYPE_NAME("neoverse-n1");
    mc->max_cpus = 512;
    mc->pci_allow_0_address = true;
    mc->minimum_page_bits = 12;
--- a/hw/arm/vexpress.c
+++ b/hw/arm/vexpress.c
@@ -173,6 +173,11 @@ struct VexpressMachineClass {

 struct VexpressMachineState {
    MachineState parent;
+    MemoryRegion vram;
+    MemoryRegion sram;
+    MemoryRegion flashalias;
+    MemoryRegion lowram;
+    MemoryRegion a15sram;
    bool secure;
    bool virt;
 };
@@ -182,7 +187,7 @@ struct VexpressMachineState {
 #define TYPE_VEXPRESS_A15_MACHINE   MACHINE_TYPE_NAME("vexpress-a15")
 OBJECT_DECLARE_TYPE(VexpressMachineState, VexpressMachineClass, VEXPRESS_MACHINE)

-typedef void DBoardInitFn(const VexpressMachineState *machine,
+typedef void DBoardInitFn(VexpressMachineState *machine,
                          ram_addr_t ram_size,
                          const char *cpu_type,
                          qemu_irq *pic);
@@ -263,14 +268,13 @@ static void init_cpus(MachineState *ms, const char *cpu_type,
    }
 }

-static void a9_daughterboard_init(const VexpressMachineState *vms,
+static void a9_daughterboard_init(VexpressMachineState *vms,
                                  ram_addr_t ram_size,
                                  const char *cpu_type,
                                  qemu_irq *pic)
 {
    MachineState *machine = MACHINE(vms);
    MemoryRegion *sysmem = get_system_memory();
-    MemoryRegion *lowram = g_new(MemoryRegion, 1);
    ram_addr_t low_ram_size;

    if (ram_size > 0x40000000) {
@@ -287,9 +291,9 @@ static void a9_daughterboard_init(const VexpressMachineState *vms,
     * address space should in theory be remappable to various
     * things including ROM or RAM; we always map the RAM there.
     */
-    memory_region_init_alias(lowram, NULL, "vexpress.lowmem", machine->ram,
-                             0, low_ram_size);
-    memory_region_add_subregion(sysmem, 0x0, lowram);
+    memory_region_init_alias(&vms->lowram, NULL, "vexpress.lowmem",
+                             machine->ram, 0, low_ram_size);
+    memory_region_add_subregion(sysmem, 0x0, &vms->lowram);
    memory_region_add_subregion(sysmem, 0x60000000, machine->ram);

    /* 0x1e000000 A9MPCore (SCU) private memory region */
@@ -348,14 +352,13 @@ static VEDBoardInfo a9_daughterboard = {
    .init = a9_daughterboard_init,
 };

-static void a15_daughterboard_init(const VexpressMachineState *vms,
+static void a15_daughterboard_init(VexpressMachineState *vms,
                                   ram_addr_t ram_size,
                                   const char *cpu_type,
                                   qemu_irq *pic)
 {
    MachineState *machine = MACHINE(vms);
    MemoryRegion *sysmem = get_system_memory();
-    MemoryRegion *sram = g_new(MemoryRegion, 1);

    {
        /* We have to use a separate 64 bit variable here to avoid the gcc
@@ -386,9 +389,9 @@ static void a15_daughterboard_init(const VexpressMachineState *vms,
    /* 0x2b060000: SP805 watchdog: not modelled */
    /* 0x2b0a0000: PL341 dynamic memory controller: not modelled */
    /* 0x2e000000: system SRAM */
-    memory_region_init_ram(sram, NULL, "vexpress.a15sram", 0x10000,
+    memory_region_init_ram(&vms->a15sram, NULL, "vexpress.a15sram", 0x10000,
                           &error_fatal);
-    memory_region_add_subregion(sysmem, 0x2e000000, sram);
+    memory_region_add_subregion(sysmem, 0x2e000000, &vms->a15sram);

    /* 0x7ffb0000: DMA330 DMA controller: not modelled */
    /* 0x7ffd0000: PL354 static memory controller: not modelled */
@@ -547,10 +550,6 @@ static void vexpress_common_init(MachineState *machine)
    I2CBus *i2c;
    ram_addr_t vram_size, sram_size;
    MemoryRegion *sysmem = get_system_memory();
-    MemoryRegion *vram = g_new(MemoryRegion, 1);
-    MemoryRegion *sram = g_new(MemoryRegion, 1);
-    MemoryRegion *flashalias = g_new(MemoryRegion, 1);
-    MemoryRegion *flash0mem;
    const hwaddr *map = daughterboard->motherboard_map;
    int i;

@@ -662,24 +661,25 @@ static void vexpress_common_init(MachineState *machine)

    if (map[VE_NORFLASHALIAS] != -1) {
        /* Map flash 0 as an alias into low memory */
+        MemoryRegion *flash0mem;
        flash0mem = sysbus_mmio_get_region(SYS_BUS_DEVICE(pflash0), 0);
-        memory_region_init_alias(flashalias, NULL, "vexpress.flashalias",
+        memory_region_init_alias(&vms->flashalias, NULL, "vexpress.flashalias",
                                 flash0mem, 0, VEXPRESS_FLASH_SIZE);
-        memory_region_add_subregion(sysmem, map[VE_NORFLASHALIAS], flashalias);
+        memory_region_add_subregion(sysmem, map[VE_NORFLASHALIAS], &vms->flashalias);
    }

    dinfo = drive_get(IF_PFLASH, 0, 1);
    ve_pflash_cfi01_register(map[VE_NORFLASH1], "vexpress.flash1", dinfo);

    sram_size = 0x2000000;
-    memory_region_init_ram(sram, NULL, "vexpress.sram", sram_size,
+    memory_region_init_ram(&vms->sram, NULL, "vexpress.sram", sram_size,
                           &error_fatal);
-    memory_region_add_subregion(sysmem, map[VE_SRAM], sram);
+    memory_region_add_subregion(sysmem, map[VE_SRAM], &vms->sram);

    vram_size = 0x800000;
-    memory_region_init_ram(vram, NULL, "vexpress.vram", vram_size,
+    memory_region_init_ram(&vms->vram, NULL, "vexpress.vram", vram_size,
                           &error_fatal);
-    memory_region_add_subregion(sysmem, map[VE_VIDEORAM], vram);
+    memory_region_add_subregion(sysmem, map[VE_VIDEORAM], &vms->vram);

    /* 0x4e000000 LAN9118 Ethernet */
    if (nd_table[0].used) {
--- a/Show More
+++ b/Show More