qdev: Rework array properties based on list visitor

Until now, array properties are actually implemented with a hack that uses multiple properties on the QOM level: a static "foo-len" property and after it is set, dynamically created "foo[i]" properties. In external interfaces (-device on the command line and device_add in QMP), this interface was broken by commit f3558b1b ('qdev: Base object creation on QDict rather than QemuOpts') because QDicts are unordered and therefore it could happen that QEMU tried to set the indexed properties before setting the length, which fails and effectively makes array properties inaccessible. In particular, this affects the 'ports' property of the 'rocker' device, which used to be configured like this: -device rocker,len-ports=2,ports[0]=dev0,ports[1]=dev1 This patch reworks the external interface so that instead of using a separate top-level property for the length and for each element, we use a single true array property that accepts a list value. In the external interfaces, this is naturally expressed as a JSON list and makes array properties accessible again. The new syntax looks like this: -device '{"driver":"rocker","ports":["dev0","dev1"]}' Creating an array property on the command line without using JSON format is currently not possible. This could be fixed by switching from QemuOpts to a keyval parser, which however requires consideration of the compatibility implications. All internal users of devices with array properties go through qdev_prop_set_array() at this point, so updating it takes care of all of them. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1090 Fixes: f3558b1b76 Signed-off-by: Kevin Wolf <kwolf@redhat.com> Message-ID: <20231109174240.72376-12-kwolf@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
qdev: Make netdev properties work as list elements
2023-11-10 18:19:19 +01:00 · 2023-11-10 18:19:15 +01:00 · 2023-11-10 18:19:14 +01:00 · 2023-11-10 18:19:13 +01:00 · 2023-11-10 18:19:13 +01:00 · 2023-11-10 18:19:13 +01:00
884 changed files with 34679 additions and 13636 deletions
--- a/.gitlab-ci.d/buildtest.yml
+++ b/.gitlab-ci.d/buildtest.yml
@@ -256,6 +256,7 @@ build-user:
  variables:
    IMAGE: debian-all-test-cross
    CONFIGURE_ARGS: --disable-tools --disable-system
+      --target-list-exclude=alpha-linux-user,sh4-linux-user
    MAKE_CHECK_ARGS: check-tcg

 build-user-static:
@@ -265,6 +266,18 @@ build-user-static:
  variables:
    IMAGE: debian-all-test-cross
    CONFIGURE_ARGS: --disable-tools --disable-system --static
+      --target-list-exclude=alpha-linux-user,sh4-linux-user
+    MAKE_CHECK_ARGS: check-tcg
+
+# targets stuck on older compilers
+build-legacy:
+  extends: .native_build_job_template
+  needs:
+    job: amd64-debian-legacy-cross-container
+  variables:
+    IMAGE: debian-legacy-test-cross
+    TARGETS: alpha-linux-user alpha-softmmu sh4-linux-user
+    CONFIGURE_ARGS: --disable-tools
    MAKE_CHECK_ARGS: check-tcg

 build-user-hexagon:
@@ -277,7 +290,9 @@ build-user-hexagon:
    CONFIGURE_ARGS: --disable-tools --disable-docs --enable-debug-tcg
    MAKE_CHECK_ARGS: check-tcg

-# Only build the softmmu targets we have check-tcg tests for
+# Build the softmmu targets for which we have both check-tcg tests and
+# compilers in our omnibus all-test-cross container. Targets without
+# Debian cross compiler support need to use special containers.
 build-some-softmmu:
  extends: .native_build_job_template
  needs:
@@ -285,7 +300,18 @@ build-some-softmmu:
  variables:
    IMAGE: debian-all-test-cross
    CONFIGURE_ARGS: --disable-tools --enable-debug
-    TARGETS: xtensa-softmmu arm-softmmu aarch64-softmmu alpha-softmmu
+    TARGETS: arm-softmmu aarch64-softmmu i386-softmmu riscv64-softmmu
+      s390x-softmmu x86_64-softmmu
+    MAKE_CHECK_ARGS: check-tcg
+
+build-loongarch64:
+  extends: .native_build_job_template
+  needs:
+    job: loongarch-debian-cross-container
+  variables:
+    IMAGE: debian-loongarch-cross
+    CONFIGURE_ARGS: --disable-tools --enable-debug
+    TARGETS: loongarch64-linux-user loongarch64-softmmu
    MAKE_CHECK_ARGS: check-tcg

 # We build tricore in a very minimal tricore only container
@@ -318,7 +344,7 @@ clang-user:
  variables:
    IMAGE: debian-all-test-cross
    CONFIGURE_ARGS: --cc=clang --cxx=clang++ --disable-system
-      --target-list-exclude=microblazeel-linux-user,aarch64_be-linux-user,i386-linux-user,m68k-linux-user,mipsn32el-linux-user,xtensaeb-linux-user
+      --target-list-exclude=alpha-linux-user,microblazeel-linux-user,aarch64_be-linux-user,i386-linux-user,m68k-linux-user,mipsn32el-linux-user,xtensaeb-linux-user
      --extra-cflags=-fsanitize=undefined --extra-cflags=-fno-sanitize-recover=undefined
    MAKE_CHECK_ARGS: check-unit check-tcg

@@ -505,7 +531,7 @@ build-tci:
  variables:
    IMAGE: debian-all-test-cross
  script:
-    - TARGETS="aarch64 alpha arm hppa m68k microblaze ppc64 s390x x86_64"
+    - TARGETS="aarch64 arm hppa m68k microblaze ppc64 s390x x86_64"
    - mkdir build
    - cd build
    - ../configure --enable-tcg-interpreter --disable-docs --disable-gtk --disable-vnc
--- a/.gitlab-ci.d/container-cross.yml
+++ b/.gitlab-ci.d/container-cross.yml
@@ -1,9 +1,3 @@
-alpha-debian-cross-container:
-  extends: .container_job_template
-  stage: containers
-  variables:
-    NAME: debian-alpha-cross
-
 amd64-debian-cross-container:
  extends: .container_job_template
  stage: containers
@@ -16,6 +10,12 @@ amd64-debian-user-cross-container:
  variables:
    NAME: debian-all-test-cross

+amd64-debian-legacy-cross-container:
+  extends: .container_job_template
+  stage: containers
+  variables:
+    NAME: debian-legacy-test-cross
+
 arm64-debian-cross-container:
  extends: .container_job_template
  stage: containers
@@ -40,23 +40,11 @@ hexagon-cross-container:
  variables:
    NAME: debian-hexagon-cross

-hppa-debian-cross-container:
+loongarch-debian-cross-container:
  extends: .container_job_template
  stage: containers
  variables:
-    NAME: debian-hppa-cross
-
-m68k-debian-cross-container:
-  extends: .container_job_template
-  stage: containers
-  variables:
-    NAME: debian-m68k-cross
-
-mips64-debian-cross-container:
-  extends: .container_job_template
-  stage: containers
-  variables:
-    NAME: debian-mips64-cross
+    NAME: debian-loongarch-cross

 mips64el-debian-cross-container:
  extends: .container_job_template
@@ -64,24 +52,12 @@ mips64el-debian-cross-container:
  variables:
    NAME: debian-mips64el-cross

-mips-debian-cross-container:
-  extends: .container_job_template
-  stage: containers
-  variables:
-    NAME: debian-mips-cross
-
 mipsel-debian-cross-container:
  extends: .container_job_template
  stage: containers
  variables:
    NAME: debian-mipsel-cross

-powerpc-test-cross-container:
-  extends: .container_job_template
-  stage: containers
-  variables:
-    NAME: debian-powerpc-test-cross
-
 ppc64el-debian-cross-container:
  extends: .container_job_template
  stage: containers
@@ -97,31 +73,12 @@ riscv64-debian-cross-container:
    NAME: debian-riscv64-cross
    QEMU_JOB_OPTIONAL: 1

-# we can however build TCG tests using a non-sid base
-riscv64-debian-test-cross-container:
-  extends: .container_job_template
-  stage: containers
-  variables:
-    NAME: debian-riscv64-test-cross
-
 s390x-debian-cross-container:
  extends: .container_job_template
  stage: containers
  variables:
    NAME: debian-s390x-cross

-sh4-debian-cross-container:
-  extends: .container_job_template
-  stage: containers
-  variables:
-    NAME: debian-sh4-cross
-
-sparc64-debian-cross-container:
-  extends: .container_job_template
-  stage: containers
-  variables:
-    NAME: debian-sparc64-cross
-
 tricore-debian-cross-container:
  extends: .container_job_template
  stage: containers
--- a/.gitlab-ci.d/crossbuilds.yml
+++ b/.gitlab-ci.d/crossbuilds.yml
@@ -165,7 +165,7 @@ cross-win32-system:
    job: win32-fedora-cross-container
  variables:
    IMAGE: fedora-win32-cross
-    EXTRA_CONFIGURE_OPTS: --enable-fdt=internal
+    EXTRA_CONFIGURE_OPTS: --enable-fdt=internal --disable-plugins
    CROSS_SKIP_TARGETS: alpha-softmmu avr-softmmu hppa-softmmu m68k-softmmu
                        microblazeel-softmmu mips64el-softmmu nios2-softmmu
  artifacts:
@@ -179,7 +179,7 @@ cross-win64-system:
    job: win64-fedora-cross-container
  variables:
    IMAGE: fedora-win64-cross
-    EXTRA_CONFIGURE_OPTS: --enable-fdt=internal
+    EXTRA_CONFIGURE_OPTS: --enable-fdt=internal --disable-plugins
    CROSS_SKIP_TARGETS: alpha-softmmu avr-softmmu hppa-softmmu
                        m68k-softmmu microblazeel-softmmu nios2-softmmu
                        or1k-softmmu rx-softmmu sh4eb-softmmu sparc64-softmmu
--- a/.gitlab-ci.d/windows.yml
+++ b/.gitlab-ci.d/windows.yml
@@ -72,6 +72,7 @@
  - .\msys64\usr\bin\bash -lc "pacman -Sy --noconfirm --needed
      bison diffutils flex
      git grep make sed
+      $MINGW_TARGET-binutils
      $MINGW_TARGET-capstone
      $MINGW_TARGET-ccache
      $MINGW_TARGET-curl
--- a/.mailmap
+++ b/.mailmap
@@ -30,10 +30,12 @@ malc <av1474@comtv.ru> malc <malc@c046a42c-6fe2-441c-8c8c-71466251a162>
 # Corrupted Author fields
 Aaron Larson <alarson@ddci.com> alarson@ddci.com
 Andreas Färber <andreas.faerber@web.de> Andreas Färber <andreas.faerber>
+fanwenjie <fanwj@mail.ustc.edu.cn> fanwj@mail.ustc.edu.cn <fanwj@mail.ustc.edu.cn>
 Jason Wang <jasowang@redhat.com> Jason Wang <jasowang>
 Marek Dolata <mkdolata@us.ibm.com> mkdolata@us.ibm.com <mkdolata@us.ibm.com>
 Michael Ellerman <mpe@ellerman.id.au> michael@ozlabs.org <michael@ozlabs.org>
 Nick Hudson <hnick@vmware.com> hnick@vmware.com <hnick@vmware.com>
+Timothée Cocault <timothee.cocault@gmail.com> timothee.cocault@gmail.com <timothee.cocault@gmail.com>

 # There is also a:
 #    (no author) <(no author)@c046a42c-6fe2-441c-8c8c-71466251a162>
--- a/Kconfig.host
+++ b/Kconfig.host
@@ -11,6 +11,9 @@ config OPENGL
 config X11
    bool

+config PIXMAN
+    bool
+
 config SPICE
    bool

@@ -46,3 +49,6 @@ config FUZZ
 config VFIO_USER_SERVER_ALLOWED
    bool
    imply VFIO_USER_SERVER
+
+config HV_BALLOON_POSSIBLE
+    bool
--- a/57
+++ b/57
@@ -323,7 +323,7 @@ RISC-V TCG CPUs
 M: Palmer Dabbelt <palmer@dabbelt.com>
 M: Alistair Francis <alistair.francis@wdc.com>
 M: Bin Meng <bin.meng@windriver.com>
-R: Weiwei Li <liweiwei@iscas.ac.cn>
+R: Weiwei Li <liwei1518@gmail.com>
 R: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
 R: Liu Zhiwei <zhiwei_liu@linux.alibaba.com>
 L: qemu-riscv@nongnu.org
@@ -490,7 +490,7 @@ S: Supported
 F: include/sysemu/kvm_xen.h
 F: target/i386/kvm/xen*
 F: hw/i386/kvm/xen*
-F: tests/avocado/xen_guest.py
+F: tests/avocado/kvm_xen_guest.py

 Guest CPU Cores (other accelerators)
 ------------------------------------
@@ -687,7 +687,7 @@ M: Peter Maydell <peter.maydell@linaro.org>
 L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/intc/arm*
-F: hw/intc/gic_internal.h
+F: hw/intc/gic*_internal.h
 F: hw/misc/a9scu.c
 F: hw/misc/arm11scu.c
 F: hw/misc/arm_l2x0.c
@@ -859,8 +859,10 @@ M: Hao Wu <wuhaotsh@google.com>
 L: qemu-arm@nongnu.org
 S: Supported
 F: hw/*/npcm*
+F: hw/sensor/adm1266.c
 F: include/hw/*/npcm*
 F: tests/qtest/npcm*
+F: tests/qtest/adm1266-test.c
 F: pc-bios/npcm7xx_bootrom.bin
 F: roms/vbootrom
 F: docs/system/arm/nuvoton.rst
@@ -1192,6 +1194,7 @@ M: Richard Henderson <richard.henderson@linaro.org>
 R: Helge Deller <deller@gmx.de>
 S: Odd Fixes
 F: configs/devices/hppa-softmmu/default.mak
+F: hw/display/artist.c
 F: hw/hppa/
 F: hw/input/lasips2.c
 F: hw/net/*i82596*
@@ -1283,6 +1286,7 @@ F: include/hw/char/goldfish_tty.h
 F: include/hw/intc/goldfish_pic.h
 F: include/hw/intc/m68k_irqc.h
 F: include/hw/misc/virt_ctrl.h
+F: docs/specs/virt-ctlr.rst

 MicroBlaze Machines
 -------------------
@@ -1535,6 +1539,14 @@ F: hw/pci-host/mv64361.c
 F: hw/pci-host/mv643xx.h
 F: include/hw/pci-host/mv64361.h

+amigaone
+M: BALATON Zoltan <balaton@eik.bme.hu>
+L: qemu-ppc@nongnu.org
+S: Maintained
+F: hw/ppc/amigaone.c
+F: hw/pci-host/articia.c
+F: include/hw/pci-host/articia.h
+
 Virtual Open Firmware (VOF)
 M: Alexey Kardashevskiy <aik@ozlabs.ru>
 R: David Gibson <david@gibson.dropbear.id.au>
@@ -1614,6 +1626,7 @@ F: hw/intc/sh_intc.c
 F: hw/pci-host/sh_pci.c
 F: hw/timer/sh_timer.c
 F: include/hw/sh4/sh_intc.h
+F: include/hw/timer/tmu012.h

 Shix
 R: Yoshinori Sato <ysato@users.sourceforge.jp>
@@ -1771,7 +1784,7 @@ F: include/hw/southbridge/ich9.h
 F: include/hw/southbridge/piix.h
 F: hw/isa/apm.c
 F: include/hw/isa/apm.h
-F: tests/unit/test-x86-cpuid.c
+F: tests/unit/test-x86-topo.c
 F: tests/qtest/test-x86-cpuid-compat.c

 PC Chipset
@@ -1857,6 +1870,7 @@ M: Max Filippov <jcmvbkbc@gmail.com>
 S: Maintained
 F: hw/xtensa/xtfpga.c
 F: hw/net/opencores_eth.c
+F: include/hw/xtensa/mx_pic.h

 Devices
 -------
@@ -1882,6 +1896,7 @@ EDU
 M: Jiri Slaby <jslaby@suse.cz>
 S: Maintained
 F: hw/misc/edu.c
+F: docs/specs/edu.rst

 IDE
 M: John Snow <jsnow@redhat.com>
@@ -2308,6 +2323,15 @@ F: hw/virtio/virtio-mem-pci.h
 F: hw/virtio/virtio-mem-pci.c
 F: include/hw/virtio/virtio-mem.h

+virtio-snd
+M: Gerd Hoffmann <kraxel@redhat.com>
+R: Manos Pitsidianakis <manos.pitsidianakis@linaro.org>
+S: Supported
+F: hw/audio/virtio-snd.c
+F: hw/audio/virtio-snd-pci.c
+F: include/hw/audio/virtio-snd.h
+F: docs/system/devices/virtio-snd.rst
+
 nvme
 M: Keith Busch <kbusch@kernel.org>
 M: Klaus Jensen <its@irrelevant.dk>
@@ -2350,6 +2374,7 @@ S: Maintained
 F: hw/net/vmxnet*
 F: hw/scsi/vmw_pvscsi*
 F: tests/qtest/vmxnet3-test.c
+F: docs/specs/vmw_pvscsi-spec.rst

 Rocker
 M: Jiri Pirko <jiri@resnulli.us>
@@ -2434,7 +2459,7 @@ S: Orphan
 R: Ani Sinha <ani@anisinha.ca>
 F: hw/acpi/vmgenid.c
 F: include/hw/acpi/vmgenid.h
-F: docs/specs/vmgenid.txt
+F: docs/specs/vmgenid.rst
 F: tests/qtest/vmgenid-test.c

 LED
@@ -2466,6 +2491,7 @@ F: hw/display/vga*
 F: hw/display/bochs-display.c
 F: include/hw/display/vga.h
 F: include/hw/display/bochs-vbe.h
+F: docs/specs/standard-vga.rst

 ramfb
 M: Gerd Hoffmann <kraxel@redhat.com>
@@ -2479,6 +2505,7 @@ S: Odd Fixes
 F: hw/display/virtio-gpu*
 F: hw/display/virtio-vga.*
 F: include/hw/virtio/virtio-gpu.h
+F: docs/system/devices/virtio-gpu.rst

 vhost-user-blk
 M: Raphael Norwitz <raphael.norwitz@nutanix.com>
@@ -2581,6 +2608,7 @@ W: https://canbus.pages.fel.cvut.cz/
 F: net/can/*
 F: hw/net/can/*
 F: include/net/can_*.h
+F: docs/system/devices/can.rst

 OpenPIC interrupt controller
 M: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
@@ -2652,6 +2680,14 @@ F: hw/usb/canokey.c
 F: hw/usb/canokey.h
 F: docs/system/devices/canokey.rst

+Hyper-V Dynamic Memory Protocol
+M: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
+S: Supported
+F: hw/hyperv/hv-balloon*.c
+F: hw/hyperv/hv-balloon*.h
+F: include/hw/hyperv/dynmem-proto.h
+F: include/hw/hyperv/hv-balloon.h
+
 Subsystems
 ----------
 Overall Audio backends
@@ -2755,12 +2791,13 @@ S: Supported
 F: util/async.c
 F: util/aio-*.c
 F: util/aio-*.h
+F: util/defer-call.c
 F: util/fdmon-*.c
 F: block/io.c
-F: block/plug.c
 F: migration/block*
 F: include/block/aio.h
 F: include/block/aio-wait.h
+F: include/qemu/defer-call.h
 F: scripts/qemugdb/aio.py
 F: tests/unit/test-fdmon-epoll.c
 T: git https://github.com/stefanha/qemu.git block
@@ -2879,6 +2916,7 @@ F: include/sysemu/dump.h
 F: qapi/dump.json
 F: scripts/dump-guest-memory.py
 F: stubs/dump.c
+F: docs/specs/vmcoreinfo.rst

 Error reporting
 M: Markus Armbruster <armbru@redhat.com>
@@ -2904,7 +2942,7 @@ F: gdbstub/*
 F: include/exec/gdbstub.h
 F: include/gdbstub/*
 F: gdb-xml/
-F: tests/tcg/multiarch/gdbstub/
+F: tests/tcg/multiarch/gdbstub/*
 F: scripts/feature_to_c.py
 F: scripts/probe-gdb-support.py

@@ -3126,10 +3164,11 @@ M: Michael Roth <michael.roth@amd.com>
 M: Konstantin Kostiuk <kkostiuk@redhat.com>
 S: Maintained
 F: qga/
+F: contrib/systemd/qemu-guest-agent.service
 F: docs/interop/qemu-ga.rst
 F: docs/interop/qemu-ga-ref.rst
 F: scripts/qemu-guest-agent/
-F: tests/unit/test-qga.c
+F: tests/*/test-qga*
 T: git https://github.com/mdroth/qemu.git qga

 QEMU Guest Agent Win32
@@ -4039,7 +4078,7 @@ F: gitdm.config
 F: contrib/gitdm/*

 Incompatible changes
-R: libvir-list@redhat.com
+R: devel@lists.libvirt.org
 F: docs/about/deprecated.rst

 Build System
--- a/10
+++ b/10
@@ -283,6 +283,13 @@ include $(SRC_PATH)/tests/vm/Makefile.include
 print-help-run = printf "  %-30s - %s\\n" "$1" "$2"
 print-help = @$(call print-help-run,$1,$2)

+.PHONY: update-linux-vdso
+update-linux-vdso:
+	@for m in $(SRC_PATH)/linux-user/*/Makefile.vdso; do \
+	  $(MAKE) $(SUBDIR_MAKEFLAGS) -C $$(dirname $$m) -f Makefile.vdso \
+		SRC_PATH=$(SRC_PATH) BUILD_DIR=$(BUILD_DIR); \
+	done
+
 .PHONY: help
 help:
 	@echo  'Generic targets:'
@@ -303,6 +310,9 @@ endif
 	$(call print-help,distclean,Remove all generated files)
 	$(call print-help,dist,Build a distributable tarball)
 	@echo  ''
+	@echo  'Linux-user targets:'
+	$(call print-help,update-linux-vdso,Build linux-user vdso images)
+	@echo  ''
 	@echo  'Test targets:'
 	$(call print-help,check,Run all tests (check-help for details))
 	$(call print-help,bench,Run all benchmarks)
--- a/accel/stubs/tcg-stub.c
+++ b/accel/stubs/tcg-stub.c
@@ -22,10 +22,6 @@ void tlb_set_dirty(CPUState *cpu, vaddr vaddr)
 {
 }

-void tcg_flush_jmp_cache(CPUState *cpu)
-{
-}
-
 int probe_access_flags(CPUArchState *env, vaddr addr, int size,
                       MMUAccessType access_type, int mmu_idx,
                       bool nonfault, void **phost, uintptr_t retaddr)
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -24,6 +24,7 @@
 #include "exec/memory.h"
 #include "exec/cpu_ldst.h"
 #include "exec/cputlb.h"
+#include "exec/tb-flush.h"
 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"
 #include "tcg/tcg.h"
@@ -321,21 +322,6 @@ static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
    }
 }

-void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)
-{
-    CPUState *cpu;
-    size_t full = 0, part = 0, elide = 0;
-
-    CPU_FOREACH(cpu) {
-        full += qatomic_read(&cpu->neg.tlb.c.full_flush_count);
-        part += qatomic_read(&cpu->neg.tlb.c.part_flush_count);
-        elide += qatomic_read(&cpu->neg.tlb.c.elide_flush_count);
-    }
-    *pfull = full;
-    *ppart = part;
-    *pelide = elide;
-}
-
 static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 {
    uint16_t asked = data.host_int;
@@ -2706,7 +2692,7 @@ static uint64_t do_st16_leN(CPUState *cpu, MMULookupPageData *p,

    case MO_ATOM_WITHIN16_PAIR:
        /* Since size > 8, this is the half that must be atomic. */
-        if (!HAVE_ATOMIC128_RW) {
+        if (!HAVE_CMPXCHG128) {
            cpu_loop_exit_atomic(cpu, ra);
        }
        return store_whole_le16(p->haddr, p->size, val_le);
--- a/accel/tcg/internal-common.h
+++ b/accel/tcg/internal-common.h
@@ -14,8 +14,6 @@
 extern int64_t max_delay;
 extern int64_t max_advance;

-void dump_exec_info(GString *buf);
-
 /*
 * Return true if CS is not running in parallel with other cpus, either
 * because there are no other cpus or we are within an exclusive context.
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -825,7 +825,7 @@ static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
    int sh = o * 8;
    Int128 m, v;

-    qemu_build_assert(HAVE_ATOMIC128_RW);
+    qemu_build_assert(HAVE_CMPXCHG128);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
@@ -887,7 +887,7 @@ static void store_atom_2(CPUState *cpu, uintptr_t ra,
            return;
        }
    } else if ((pi & 15) == 7) {
-        if (HAVE_ATOMIC128_RW) {
+        if (HAVE_CMPXCHG128) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
@@ -956,7 +956,7 @@ static void store_atom_4(CPUState *cpu, uintptr_t ra,
                return;
            }
        } else {
-            if (HAVE_ATOMIC128_RW) {
+            if (HAVE_CMPXCHG128) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
@@ -1021,7 +1021,7 @@ static void store_atom_8(CPUState *cpu, uintptr_t ra,
        }
        break;
    case MO_64:
-        if (HAVE_ATOMIC128_RW) {
+        if (HAVE_CMPXCHG128) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
@@ -1076,7 +1076,7 @@ static void store_atom_16(CPUState *cpu, uintptr_t ra,
        }
        break;
    case -MO_64:
-        if (HAVE_ATOMIC128_RW) {
+        if (HAVE_CMPXCHG128) {
            uint64_t val_le;
            int s2 = pi & 15;
            int s1 = 16 - s2;
@@ -1103,10 +1103,6 @@ static void store_atom_16(CPUState *cpu, uintptr_t ra,
        }
        break;
    case MO_128:
-        if (HAVE_ATOMIC128_RW) {
-            atomic16_set(pv, val);
-            return;
-        }
        break;
    default:
        g_assert_not_reached();
--- a/accel/tcg/monitor.c
+++ b/accel/tcg/monitor.c
@@ -8,6 +8,7 @@

 #include "qemu/osdep.h"
 #include "qemu/accel.h"
+#include "qemu/qht.h"
 #include "qapi/error.h"
 #include "qapi/type-helpers.h"
 #include "qapi/qapi-commands-machine.h"
@@ -17,6 +18,7 @@
 #include "sysemu/tcg.h"
 #include "tcg/tcg.h"
 #include "internal-common.h"
+#include "tb-context.h"


 static void dump_drift_info(GString *buf)
@@ -50,6 +52,153 @@ static void dump_accel_info(GString *buf)
                           one_insn_per_tb ? "on" : "off");
 }

+static void print_qht_statistics(struct qht_stats hst, GString *buf)
+{
+    uint32_t hgram_opts;
+    size_t hgram_bins;
+    char *hgram;
+
+    if (!hst.head_buckets) {
+        return;
+    }
+    g_string_append_printf(buf, "TB hash buckets     %zu/%zu "
+                           "(%0.2f%% head buckets used)\n",
+                           hst.used_head_buckets, hst.head_buckets,
+                           (double)hst.used_head_buckets /
+                           hst.head_buckets * 100);
+
+    hgram_opts =  QDIST_PR_BORDER | QDIST_PR_LABELS;
+    hgram_opts |= QDIST_PR_100X   | QDIST_PR_PERCENT;
+    if (qdist_xmax(&hst.occupancy) - qdist_xmin(&hst.occupancy) == 1) {
+        hgram_opts |= QDIST_PR_NODECIMAL;
+    }
+    hgram = qdist_pr(&hst.occupancy, 10, hgram_opts);
+    g_string_append_printf(buf, "TB hash occupancy   %0.2f%% avg chain occ. "
+                           "Histogram: %s\n",
+                           qdist_avg(&hst.occupancy) * 100, hgram);
+    g_free(hgram);
+
+    hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS;
+    hgram_bins = qdist_xmax(&hst.chain) - qdist_xmin(&hst.chain);
+    if (hgram_bins > 10) {
+        hgram_bins = 10;
+    } else {
+        hgram_bins = 0;
+        hgram_opts |= QDIST_PR_NODECIMAL | QDIST_PR_NOBINRANGE;
+    }
+    hgram = qdist_pr(&hst.chain, hgram_bins, hgram_opts);
+    g_string_append_printf(buf, "TB hash avg chain   %0.3f buckets. "
+                           "Histogram: %s\n",
+                           qdist_avg(&hst.chain), hgram);
+    g_free(hgram);
+}
+
+struct tb_tree_stats {
+    size_t nb_tbs;
+    size_t host_size;
+    size_t target_size;
+    size_t max_target_size;
+    size_t direct_jmp_count;
+    size_t direct_jmp2_count;
+    size_t cross_page;
+};
+
+static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
+{
+    const TranslationBlock *tb = value;
+    struct tb_tree_stats *tst = data;
+
+    tst->nb_tbs++;
+    tst->host_size += tb->tc.size;
+    tst->target_size += tb->size;
+    if (tb->size > tst->max_target_size) {
+        tst->max_target_size = tb->size;
+    }
+    if (tb->page_addr[1] != -1) {
+        tst->cross_page++;
+    }
+    if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
+        tst->direct_jmp_count++;
+        if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
+            tst->direct_jmp2_count++;
+        }
+    }
+    return false;
+}
+
+static void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)
+{
+    CPUState *cpu;
+    size_t full = 0, part = 0, elide = 0;
+
+    CPU_FOREACH(cpu) {
+        full += qatomic_read(&cpu->neg.tlb.c.full_flush_count);
+        part += qatomic_read(&cpu->neg.tlb.c.part_flush_count);
+        elide += qatomic_read(&cpu->neg.tlb.c.elide_flush_count);
+    }
+    *pfull = full;
+    *ppart = part;
+    *pelide = elide;
+}
+
+static void tcg_dump_info(GString *buf)
+{
+    g_string_append_printf(buf, "[TCG profiler not compiled]\n");
+}
+
+static void dump_exec_info(GString *buf)
+{
+    struct tb_tree_stats tst = {};
+    struct qht_stats hst;
+    size_t nb_tbs, flush_full, flush_part, flush_elide;
+
+    tcg_tb_foreach(tb_tree_stats_iter, &tst);
+    nb_tbs = tst.nb_tbs;
+    /* XXX: avoid using doubles ? */
+    g_string_append_printf(buf, "Translation buffer state:\n");
+    /*
+     * Report total code size including the padding and TB structs;
+     * otherwise users might think "-accel tcg,tb-size" is not honoured.
+     * For avg host size we use the precise numbers from tb_tree_stats though.
+     */
+    g_string_append_printf(buf, "gen code size       %zu/%zu\n",
+                           tcg_code_size(), tcg_code_capacity());
+    g_string_append_printf(buf, "TB count            %zu\n", nb_tbs);
+    g_string_append_printf(buf, "TB avg target size  %zu max=%zu bytes\n",
+                           nb_tbs ? tst.target_size / nb_tbs : 0,
+                           tst.max_target_size);
+    g_string_append_printf(buf, "TB avg host size    %zu bytes "
+                           "(expansion ratio: %0.1f)\n",
+                           nb_tbs ? tst.host_size / nb_tbs : 0,
+                           tst.target_size ?
+                           (double)tst.host_size / tst.target_size : 0);
+    g_string_append_printf(buf, "cross page TB count %zu (%zu%%)\n",
+                           tst.cross_page,
+                           nb_tbs ? (tst.cross_page * 100) / nb_tbs : 0);
+    g_string_append_printf(buf, "direct jump count   %zu (%zu%%) "
+                           "(2 jumps=%zu %zu%%)\n",
+                           tst.direct_jmp_count,
+                           nb_tbs ? (tst.direct_jmp_count * 100) / nb_tbs : 0,
+                           tst.direct_jmp2_count,
+                           nb_tbs ? (tst.direct_jmp2_count * 100) / nb_tbs : 0);
+
+    qht_statistics_init(&tb_ctx.htable, &hst);
+    print_qht_statistics(hst, buf);
+    qht_statistics_destroy(&hst);
+
+    g_string_append_printf(buf, "\nStatistics:\n");
+    g_string_append_printf(buf, "TB flush count      %u\n",
+                           qatomic_read(&tb_ctx.tb_flush_count));
+    g_string_append_printf(buf, "TB invalidate count %u\n",
+                           qatomic_read(&tb_ctx.tb_phys_invalidate_count));
+
+    tlb_flush_counts(&flush_full, &flush_part, &flush_elide);
+    g_string_append_printf(buf, "TLB full flushes    %zu\n", flush_full);
+    g_string_append_printf(buf, "TLB partial flushes %zu\n", flush_part);
+    g_string_append_printf(buf, "TLB elided flushes  %zu\n", flush_elide);
+    tcg_dump_info(buf);
+}
+
 HumanReadableText *qmp_x_query_jit(Error **errp)
 {
    g_autoptr(GString) buf = g_string_new("");
@@ -66,6 +215,11 @@ HumanReadableText *qmp_x_query_jit(Error **errp)
    return human_readable_text_from_str(buf);
 }

+static void tcg_dump_op_count(GString *buf)
+{
+    g_string_append_printf(buf, "[TCG profiler not compiled]\n");
+}
+
 HumanReadableText *qmp_x_query_opcount(Error **errp)
 {
    g_autoptr(GString) buf = g_string_new("");
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -327,8 +327,7 @@ static TCGOp *copy_st_ptr(TCGOp **begin_op, TCGOp *op)
    return op;
 }

-static TCGOp *copy_call(TCGOp **begin_op, TCGOp *op, void *empty_func,
-                        void *func, int *cb_idx)
+static TCGOp *copy_call(TCGOp **begin_op, TCGOp *op, void *func, int *cb_idx)
 {
    TCGOp *old_op;
    int func_idx;
@@ -372,8 +371,7 @@ static TCGOp *append_udata_cb(const struct qemu_plugin_dyn_cb *cb,
    }

    /* call */
-    op = copy_call(&begin_op, op, HELPER(plugin_vcpu_udata_cb),
-                   cb->f.vcpu_udata, cb_idx);
+    op = copy_call(&begin_op, op, cb->f.vcpu_udata, cb_idx);

    return op;
 }
@@ -420,8 +418,7 @@ static TCGOp *append_mem_cb(const struct qemu_plugin_dyn_cb *cb,

    if (type == PLUGIN_GEN_CB_MEM) {
        /* call */
-        op = copy_call(&begin_op, op, HELPER(plugin_vcpu_mem_cb),
-                       cb->f.vcpu_udata, cb_idx);
+        op = copy_call(&begin_op, op, cb->f.vcpu_udata, cb_idx);
    }

    return op;
--- a/accel/tcg/tcg-accel-ops.c
+++ b/accel/tcg/tcg-accel-ops.c
@@ -34,6 +34,7 @@
 #include "qemu/timer.h"
 #include "exec/exec-all.h"
 #include "exec/hwaddr.h"
+#include "exec/tb-flush.h"
 #include "exec/gdbstub.h"

 #include "tcg-accel-ops.h"
@@ -77,6 +78,13 @@ int tcg_cpus_exec(CPUState *cpu)
    return ret;
 }

+static void tcg_cpu_reset_hold(CPUState *cpu)
+{
+    tcg_flush_jmp_cache(cpu);
+
+    tlb_flush(cpu);
+}
+
 /* mask must never be zero, except for A20 change call */
 void tcg_handle_interrupt(CPUState *cpu, int mask)
 {
@@ -205,6 +213,7 @@ static void tcg_accel_ops_init(AccelOpsClass *ops)
        }
    }

+    ops->cpu_reset_hold = tcg_cpu_reset_hold;
    ops->supports_guest_debug = tcg_supports_guest_debug;
    ops->insert_breakpoint = tcg_insert_breakpoint;
    ops->remove_breakpoint = tcg_remove_breakpoint;
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -645,133 +645,6 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
    cpu_loop_exit_noexc(cpu);
 }

-static void print_qht_statistics(struct qht_stats hst, GString *buf)
-{
-    uint32_t hgram_opts;
-    size_t hgram_bins;
-    char *hgram;
-
-    if (!hst.head_buckets) {
-        return;
-    }
-    g_string_append_printf(buf, "TB hash buckets     %zu/%zu "
-                           "(%0.2f%% head buckets used)\n",
-                           hst.used_head_buckets, hst.head_buckets,
-                           (double)hst.used_head_buckets /
-                           hst.head_buckets * 100);
-
-    hgram_opts =  QDIST_PR_BORDER | QDIST_PR_LABELS;
-    hgram_opts |= QDIST_PR_100X   | QDIST_PR_PERCENT;
-    if (qdist_xmax(&hst.occupancy) - qdist_xmin(&hst.occupancy) == 1) {
-        hgram_opts |= QDIST_PR_NODECIMAL;
-    }
-    hgram = qdist_pr(&hst.occupancy, 10, hgram_opts);
-    g_string_append_printf(buf, "TB hash occupancy   %0.2f%% avg chain occ. "
-                           "Histogram: %s\n",
-                           qdist_avg(&hst.occupancy) * 100, hgram);
-    g_free(hgram);
-
-    hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS;
-    hgram_bins = qdist_xmax(&hst.chain) - qdist_xmin(&hst.chain);
-    if (hgram_bins > 10) {
-        hgram_bins = 10;
-    } else {
-        hgram_bins = 0;
-        hgram_opts |= QDIST_PR_NODECIMAL | QDIST_PR_NOBINRANGE;
-    }
-    hgram = qdist_pr(&hst.chain, hgram_bins, hgram_opts);
-    g_string_append_printf(buf, "TB hash avg chain   %0.3f buckets. "
-                           "Histogram: %s\n",
-                           qdist_avg(&hst.chain), hgram);
-    g_free(hgram);
-}
-
-struct tb_tree_stats {
-    size_t nb_tbs;
-    size_t host_size;
-    size_t target_size;
-    size_t max_target_size;
-    size_t direct_jmp_count;
-    size_t direct_jmp2_count;
-    size_t cross_page;
-};
-
-static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
-{
-    const TranslationBlock *tb = value;
-    struct tb_tree_stats *tst = data;
-
-    tst->nb_tbs++;
-    tst->host_size += tb->tc.size;
-    tst->target_size += tb->size;
-    if (tb->size > tst->max_target_size) {
-        tst->max_target_size = tb->size;
-    }
-    if (tb_page_addr1(tb) != -1) {
-        tst->cross_page++;
-    }
-    if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
-        tst->direct_jmp_count++;
-        if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
-            tst->direct_jmp2_count++;
-        }
-    }
-    return false;
-}
-
-void dump_exec_info(GString *buf)
-{
-    struct tb_tree_stats tst = {};
-    struct qht_stats hst;
-    size_t nb_tbs, flush_full, flush_part, flush_elide;
-
-    tcg_tb_foreach(tb_tree_stats_iter, &tst);
-    nb_tbs = tst.nb_tbs;
-    /* XXX: avoid using doubles ? */
-    g_string_append_printf(buf, "Translation buffer state:\n");
-    /*
-     * Report total code size including the padding and TB structs;
-     * otherwise users might think "-accel tcg,tb-size" is not honoured.
-     * For avg host size we use the precise numbers from tb_tree_stats though.
-     */
-    g_string_append_printf(buf, "gen code size       %zu/%zu\n",
-                           tcg_code_size(), tcg_code_capacity());
-    g_string_append_printf(buf, "TB count            %zu\n", nb_tbs);
-    g_string_append_printf(buf, "TB avg target size  %zu max=%zu bytes\n",
-                           nb_tbs ? tst.target_size / nb_tbs : 0,
-                           tst.max_target_size);
-    g_string_append_printf(buf, "TB avg host size    %zu bytes "
-                           "(expansion ratio: %0.1f)\n",
-                           nb_tbs ? tst.host_size / nb_tbs : 0,
-                           tst.target_size ?
-                           (double)tst.host_size / tst.target_size : 0);
-    g_string_append_printf(buf, "cross page TB count %zu (%zu%%)\n",
-                           tst.cross_page,
-                           nb_tbs ? (tst.cross_page * 100) / nb_tbs : 0);
-    g_string_append_printf(buf, "direct jump count   %zu (%zu%%) "
-                           "(2 jumps=%zu %zu%%)\n",
-                           tst.direct_jmp_count,
-                           nb_tbs ? (tst.direct_jmp_count * 100) / nb_tbs : 0,
-                           tst.direct_jmp2_count,
-                           nb_tbs ? (tst.direct_jmp2_count * 100) / nb_tbs : 0);
-
-    qht_statistics_init(&tb_ctx.htable, &hst);
-    print_qht_statistics(hst, buf);
-    qht_statistics_destroy(&hst);
-
-    g_string_append_printf(buf, "\nStatistics:\n");
-    g_string_append_printf(buf, "TB flush count      %u\n",
-                           qatomic_read(&tb_ctx.tb_flush_count));
-    g_string_append_printf(buf, "TB invalidate count %u\n",
-                           qatomic_read(&tb_ctx.tb_phys_invalidate_count));
-
-    tlb_flush_counts(&flush_full, &flush_part, &flush_elide);
-    g_string_append_printf(buf, "TLB full flushes    %zu\n", flush_full);
-    g_string_append_printf(buf, "TLB partial flushes %zu\n", flush_part);
-    g_string_append_printf(buf, "TLB elided flushes  %zu\n", flush_elide);
-    tcg_dump_info(buf);
-}
-
 #else /* CONFIG_USER_ONLY */

 void cpu_interrupt(CPUState *cpu, int mask)
@@ -800,11 +673,3 @@ void tcg_flush_jmp_cache(CPUState *cpu)
        qatomic_set(&jc->array[i].tb, NULL);
    }
 }
-
-/* This is a wrapper for common code that can not use CONFIG_SOFTMMU */
-void tcg_flush_softmmu_tlb(CPUState *cs)
-{
-#ifdef CONFIG_SOFTMMU
-    tlb_flush(cs);
-#endif
-}
--- a/accel/tcg/user-exec-stub.c
+++ b/accel/tcg/user-exec-stub.c
@@ -14,6 +14,10 @@ void qemu_init_vcpu(CPUState *cpu)
 {
 }

+void cpu_exec_reset_hold(CPUState *cpu)
+{
+}
+
 /* User mode emulation does not support record/replay yet.  */

 bool replay_exception(void)
--- a/audio/audio.c
+++ b/audio/audio.c
@@ -1781,7 +1781,7 @@ static AudioState *audio_init(Audiodev *dev, Error **errp)

    QTAILQ_INSERT_TAIL(&audio_states, s, list);
    QLIST_INIT (&s->card_head);
-    vmstate_register (NULL, 0, &vmstate_audio, s);
+    vmstate_register_any(NULL, &vmstate_audio, s);
    return s;

 out:
--- a/audio/wavaudio.c
+++ b/audio/wavaudio.c
@@ -97,6 +97,10 @@ static int wav_init_out(HWVoiceOut *hw, struct audsettings *as,
        dolog ("WAVE files can not handle 32bit formats\n");
        return -1;

+    case AUDIO_FORMAT_F32:
+        dolog("WAVE files can not handle float formats\n");
+        return -1;
+
    default:
        abort();
    }
--- a/backends/dbus-vmstate.c
+++ b/backends/dbus-vmstate.c
@@ -426,8 +426,7 @@ dbus_vmstate_complete(UserCreatable *uc, Error **errp)
        return;
    }

-    if (vmstate_register(VMSTATE_IF(self), VMSTATE_INSTANCE_ID_ANY,
-                         &dbus_vmstate, self) < 0) {
+    if (vmstate_register_any(VMSTATE_IF(self), &dbus_vmstate, self) < 0) {
        error_setg(errp, "Failed to register vmstate");
    }
 }
--- a/backends/tpm/tpm_emulator.c
+++ b/backends/tpm/tpm_emulator.c
@@ -975,8 +975,7 @@ static void tpm_emulator_inst_init(Object *obj)
        qemu_add_vm_change_state_handler(tpm_emulator_vm_state_change,
                                         tpm_emu);

-    vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY,
-                     &vmstate_tpm_emulator, obj);
+    vmstate_register_any(NULL, &vmstate_tpm_emulator, obj);
 }

 /*
--- a/block.c
+++ b/block.c
@@ -820,12 +820,17 @@ int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 {
    BlockDriver *drv = bs->drv;
-    BlockDriverState *filtered = bdrv_filter_bs(bs);
+    BlockDriverState *filtered;
+
    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();

    if (drv && drv->bdrv_probe_geometry) {
        return drv->bdrv_probe_geometry(bs, geo);
-    } else if (filtered) {
+    }
+
+    filtered = bdrv_filter_bs(bs);
+    if (filtered) {
        return bdrv_probe_geometry(filtered, geo);
    }

@@ -1702,12 +1707,14 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
    return 0;
 open_failed:
    bs->drv = NULL;
+
+    bdrv_graph_wrlock(NULL);
    if (bs->file != NULL) {
-        bdrv_graph_wrlock(NULL);
        bdrv_unref_child(bs, bs->file);
-        bdrv_graph_wrunlock();
        assert(!bs->file);
    }
+    bdrv_graph_wrunlock();
+
    g_free(bs->opaque);
    bs->opaque = NULL;
    return ret;
@@ -1849,9 +1856,12 @@ static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
    Error *local_err = NULL;
    bool ro;

+    GLOBAL_STATE_CODE();
+
+    bdrv_graph_rdlock_main_loop();
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);
-    GLOBAL_STATE_CODE();
+    bdrv_graph_rdunlock_main_loop();

    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
@@ -3209,8 +3219,6 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,

    GLOBAL_STATE_CODE();

-    bdrv_graph_wrlock(child_bs);
-
    child = bdrv_attach_child_common(child_bs, child_name, child_class,
                                   child_role, perm, shared_perm, opaque,
                                   tran, errp);
@@ -3223,9 +3231,8 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,

 out:
    tran_finalize(tran, ret);
-    bdrv_graph_wrunlock();

-    bdrv_unref(child_bs);
+    bdrv_schedule_unref(child_bs);

    return ret < 0 ? NULL : child;
 }
@@ -3530,19 +3537,7 @@ out:
 *
 * If a backing child is already present (i.e. we're detaching a node), that
 * child node must be drained.
- *
- * After calling this function, the transaction @tran may only be completed
- * while holding a writer lock for the graph.
 */
-static int GRAPH_WRLOCK
-bdrv_set_backing_noperm(BlockDriverState *bs,
-                        BlockDriverState *backing_hd,
-                        Transaction *tran, Error **errp)
-{
-    GLOBAL_STATE_CODE();
-    return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
-}
-
 int bdrv_set_backing_hd_drained(BlockDriverState *bs,
                                BlockDriverState *backing_hd,
                                Error **errp)
@@ -3555,9 +3550,8 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs,
    if (bs->backing) {
        assert(bs->backing->bs->quiesce_counter > 0);
    }
-    bdrv_graph_wrlock(backing_hd);

-    ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
+    ret = bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
    if (ret < 0) {
        goto out;
    }
@@ -3565,20 +3559,25 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs,
    ret = bdrv_refresh_perms(bs, tran, errp);
 out:
    tran_finalize(tran, ret);
-    bdrv_graph_wrunlock();
    return ret;
 }

 int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
                        Error **errp)
 {
-    BlockDriverState *drain_bs = bs->backing ? bs->backing->bs : bs;
+    BlockDriverState *drain_bs;
    int ret;
    GLOBAL_STATE_CODE();

+    bdrv_graph_rdlock_main_loop();
+    drain_bs = bs->backing ? bs->backing->bs : bs;
+    bdrv_graph_rdunlock_main_loop();
+
    bdrv_ref(drain_bs);
    bdrv_drained_begin(drain_bs);
+    bdrv_graph_wrlock(backing_hd);
    ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp);
+    bdrv_graph_wrunlock();
    bdrv_drained_end(drain_bs);
    bdrv_unref(drain_bs);

@@ -3612,6 +3611,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
    Error *local_err = NULL;

    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();

    if (bs->backing != NULL) {
        goto free_exit;
@@ -3653,10 +3653,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
            implicit_backing = !strcmp(bs->auto_backing_file, bs->backing_file);
        }

-        bdrv_graph_rdlock_main_loop();
        backing_filename = bdrv_get_full_backing_filename(bs, &local_err);
-        bdrv_graph_rdunlock_main_loop();
-
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
@@ -3687,9 +3684,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
    }

    if (implicit_backing) {
-        bdrv_graph_rdlock_main_loop();
        bdrv_refresh_filename(backing_hd);
-        bdrv_graph_rdunlock_main_loop();
        pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
                backing_hd->filename);
    }
@@ -4760,8 +4755,8 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
 {
    BlockDriverState *bs = reopen_state->bs;
    BlockDriverState *new_child_bs;
-    BlockDriverState *old_child_bs = is_backing ? child_bs(bs->backing) :
-                                                  child_bs(bs->file);
+    BlockDriverState *old_child_bs;
+
    const char *child_name = is_backing ? "backing" : "file";
    QObject *value;
    const char *str;
@@ -4776,6 +4771,8 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
        return 0;
    }

+    bdrv_graph_rdlock_main_loop();
+
    switch (qobject_type(value)) {
    case QTYPE_QNULL:
        assert(is_backing); /* The 'file' option does not allow a null value */
@@ -4785,17 +4782,16 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
        str = qstring_get_str(qobject_to(QString, value));
        new_child_bs = bdrv_lookup_bs(NULL, str, errp);
        if (new_child_bs == NULL) {
-            return -EINVAL;
+            ret = -EINVAL;
+            goto out_rdlock;
        }

-        bdrv_graph_rdlock_main_loop();
        has_child = bdrv_recurse_has_child(new_child_bs, bs);
-        bdrv_graph_rdunlock_main_loop();
-
        if (has_child) {
            error_setg(errp, "Making '%s' a %s child of '%s' would create a "
                       "cycle", str, child_name, bs->node_name);
-            return -EINVAL;
+            ret = -EINVAL;
+            goto out_rdlock;
        }
        break;
    default:
@@ -4806,19 +4802,23 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
        g_assert_not_reached();
    }

+    old_child_bs = is_backing ? child_bs(bs->backing) : child_bs(bs->file);
    if (old_child_bs == new_child_bs) {
-        return 0;
+        ret = 0;
+        goto out_rdlock;
    }

    if (old_child_bs) {
        if (bdrv_skip_implicit_filters(old_child_bs) == new_child_bs) {
-            return 0;
+            ret = 0;
+            goto out_rdlock;
        }

        if (old_child_bs->implicit) {
            error_setg(errp, "Cannot replace implicit %s child of %s",
                       child_name, bs->node_name);
-            return -EPERM;
+            ret = -EPERM;
+            goto out_rdlock;
        }
    }

@@ -4829,7 +4829,8 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
         */
        error_setg(errp, "'%s' is a %s filter node that does not support a "
                   "%s child", bs->node_name, bs->drv->format_name, child_name);
-        return -EINVAL;
+        ret = -EINVAL;
+        goto out_rdlock;
    }

    if (is_backing) {
@@ -4850,6 +4851,7 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
        aio_context_acquire(ctx);
    }

+    bdrv_graph_rdunlock_main_loop();
    bdrv_graph_wrlock(new_child_bs);

    ret = bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
@@ -4868,6 +4870,10 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
    }

    return ret;
+
+out_rdlock:
+    bdrv_graph_rdunlock_main_loop();
+    return ret;
 }

 /*
@@ -5008,13 +5014,16 @@ bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
     * file or if the image file has a backing file name as part of
     * its metadata. Otherwise the 'backing' option can be omitted.
     */
+    bdrv_graph_rdlock_main_loop();
    if (drv->supports_backing && reopen_state->backing_missing &&
        (reopen_state->bs->backing || reopen_state->bs->backing_file[0])) {
        error_setg(errp, "backing is missing for '%s'",
                   reopen_state->bs->node_name);
+        bdrv_graph_rdunlock_main_loop();
        ret = -EINVAL;
        goto error;
    }
+    bdrv_graph_rdunlock_main_loop();

    /*
     * Allow changing the 'backing' option. The new value can be
@@ -5200,14 +5209,15 @@ static void bdrv_close(BlockDriverState *bs)
        bs->drv = NULL;
    }

-    bdrv_graph_wrlock(NULL);
+    bdrv_graph_wrlock(bs);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_unref_child(bs, child);
    }
-    bdrv_graph_wrunlock();

    assert(!bs->backing);
    assert(!bs->file);
+    bdrv_graph_wrunlock();
+
    g_free(bs->opaque);
    bs->opaque = NULL;
    qatomic_set(&bs->copy_on_read, 0);
@@ -5412,6 +5422,9 @@ bdrv_replace_node_noperm(BlockDriverState *from,
 }

 /*
+ * Switch all parents of @from to point to @to instead. @from and @to must be in
+ * the same AioContext and both must be drained.
+ *
 * With auto_skip=true bdrv_replace_node_common skips updating from parents
 * if it creates a parent-child relation loop or if parent is block-job.
 *
@@ -5421,10 +5434,9 @@ bdrv_replace_node_noperm(BlockDriverState *from,
 * With @detach_subchain=true @to must be in a backing chain of @from. In this
 * case backing link of the cow-parent of @to is removed.
 */
-static int bdrv_replace_node_common(BlockDriverState *from,
-                                    BlockDriverState *to,
-                                    bool auto_skip, bool detach_subchain,
-                                    Error **errp)
+static int GRAPH_WRLOCK
+bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to,
+                         bool auto_skip, bool detach_subchain, Error **errp)
 {
    Transaction *tran = tran_new();
    g_autoptr(GSList) refresh_list = NULL;
@@ -5433,6 +5445,10 @@ static int bdrv_replace_node_common(BlockDriverState *from,

    GLOBAL_STATE_CODE();

+    assert(from->quiesce_counter);
+    assert(to->quiesce_counter);
+    assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
+
    if (detach_subchain) {
        assert(bdrv_chain_contains(from, to));
        assert(from != to);
@@ -5444,17 +5460,6 @@ static int bdrv_replace_node_common(BlockDriverState *from,
        }
    }

-    /* Make sure that @from doesn't go away until we have successfully attached
-     * all of its parents to @to. */
-    bdrv_ref(from);
-
-    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
-    assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
-    bdrv_drained_begin(from);
-    bdrv_drained_begin(to);
-
-    bdrv_graph_wrlock(to);
-
    /*
     * Do the replacement without permission update.
     * Replacement may influence the permissions, we should calculate new
@@ -5483,29 +5488,33 @@ static int bdrv_replace_node_common(BlockDriverState *from,

 out:
    tran_finalize(tran, ret);
-    bdrv_graph_wrunlock();
-
-    bdrv_drained_end(to);
-    bdrv_drained_end(from);
-    bdrv_unref(from);
-
    return ret;
 }

 int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
                      Error **errp)
 {
-    GLOBAL_STATE_CODE();
-
    return bdrv_replace_node_common(from, to, true, false, errp);
 }

 int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
 {
+    BlockDriverState *child_bs;
+    int ret;
+
    GLOBAL_STATE_CODE();

-    return bdrv_replace_node_common(bs, bdrv_filter_or_cow_bs(bs), true, true,
-                                    errp);
+    bdrv_graph_rdlock_main_loop();
+    child_bs = bdrv_filter_or_cow_bs(bs);
+    bdrv_graph_rdunlock_main_loop();
+
+    bdrv_drained_begin(child_bs);
+    bdrv_graph_wrlock(bs);
+    ret = bdrv_replace_node_common(bs, child_bs, true, true, errp);
+    bdrv_graph_wrunlock();
+    bdrv_drained_end(child_bs);
+
+    return ret;
 }

 /*
@@ -5532,7 +5541,9 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,

    GLOBAL_STATE_CODE();

+    bdrv_graph_rdlock_main_loop();
    assert(!bs_new->backing);
+    bdrv_graph_rdunlock_main_loop();

    old_context = bdrv_get_aio_context(bs_top);
    bdrv_drained_begin(bs_top);
@@ -5700,9 +5711,19 @@ BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
        goto fail;
    }

+    /*
+     * Make sure that @bs doesn't go away until we have successfully attached
+     * all of its parents to @new_node_bs and undrained it again.
+     */
+    bdrv_ref(bs);
    bdrv_drained_begin(bs);
+    bdrv_drained_begin(new_node_bs);
+    bdrv_graph_wrlock(new_node_bs);
    ret = bdrv_replace_node(bs, new_node_bs, errp);
+    bdrv_graph_wrunlock();
+    bdrv_drained_end(new_node_bs);
    bdrv_drained_end(bs);
+    bdrv_unref(bs);

    if (ret < 0) {
        error_prepend(errp, "Could not replace node: ");
@@ -5748,13 +5769,14 @@ int coroutine_fn bdrv_co_check(BlockDriverState *bs,
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
-int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
-                             const char *backing_fmt, bool require)
+int coroutine_fn
+bdrv_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
+                            const char *backing_fmt, bool require)
 {
    BlockDriver *drv = bs->drv;
    int ret;

-    GLOBAL_STATE_CODE();
+    IO_CODE();

    if (!drv) {
        return -ENOMEDIUM;
@@ -5769,8 +5791,8 @@ int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
        return -EINVAL;
    }

-    if (drv->bdrv_change_backing_file != NULL) {
-        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
+    if (drv->bdrv_co_change_backing_file != NULL) {
+        ret = drv->bdrv_co_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }
@@ -5827,8 +5849,9 @@ BlockDriverState *bdrv_find_base(BlockDriverState *bs)
 * between @bs and @base is frozen. @errp is set if that's the case.
 * @base must be reachable from @bs, or NULL.
 */
-bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
-                                  Error **errp)
+static bool GRAPH_RDLOCK
+bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
+                             Error **errp)
 {
    BlockDriverState *i;
    BdrvChild *child;
@@ -5952,15 +5975,15 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,

    bdrv_ref(top);
    bdrv_drained_begin(base);
-    bdrv_graph_rdlock_main_loop();
+    bdrv_graph_wrlock(base);

    if (!top->drv || !base->drv) {
-        goto exit;
+        goto exit_wrlock;
    }

    /* Make sure that base is in the backing chain of top */
    if (!bdrv_chain_contains(top, base)) {
-        goto exit;
+        goto exit_wrlock;
    }

    /* If 'base' recursively inherits from 'top' then we should set
@@ -5992,6 +6015,8 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
     * That's a FIXME.
     */
    bdrv_replace_node_common(top, base, false, false, &local_err);
+    bdrv_graph_wrunlock();
+
    if (local_err) {
        error_report_err(local_err);
        goto exit;
@@ -6024,8 +6049,11 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
    }

    ret = 0;
+    goto exit;
+
+exit_wrlock:
+    bdrv_graph_wrunlock();
 exit:
-    bdrv_graph_rdunlock_main_loop();
    bdrv_drained_end(base);
    bdrv_unref(top);
    return ret;
@@ -6587,7 +6615,7 @@ int bdrv_has_zero_init_1(BlockDriverState *bs)
    return 1;
 }

-int bdrv_has_zero_init(BlockDriverState *bs)
+int coroutine_mixed_fn bdrv_has_zero_init(BlockDriverState *bs)
 {
    BlockDriverState *filtered;
    GLOBAL_STATE_CODE();
@@ -8100,7 +8128,7 @@ static bool append_strong_runtime_options(QDict *d, BlockDriverState *bs)
 /* Note: This function may return false positives; it may return true
 * even if opening the backing file specified by bs's image header
 * would result in exactly bs->backing. */
-static bool bdrv_backing_overridden(BlockDriverState *bs)
+static bool GRAPH_RDLOCK bdrv_backing_overridden(BlockDriverState *bs)
 {
    GLOBAL_STATE_CODE();
    if (bs->backing) {
@@ -8474,8 +8502,8 @@ BdrvChild *bdrv_primary_child(BlockDriverState *bs)
    return found;
 }

-static BlockDriverState *bdrv_do_skip_filters(BlockDriverState *bs,
-                                              bool stop_on_explicit_filter)
+static BlockDriverState * GRAPH_RDLOCK
+bdrv_do_skip_filters(BlockDriverState *bs, bool stop_on_explicit_filter)
 {
    BdrvChild *c;

--- a/block/backup.c
+++ b/block/backup.c
@@ -374,7 +374,6 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
    assert(bs);
    assert(target);
    GLOBAL_STATE_CODE();
-    GRAPH_RDLOCK_GUARD_MAINLOOP();

    /* QMP interface protects us from these cases */
    assert(sync_mode != MIRROR_SYNC_MODE_INCREMENTAL);
@@ -385,31 +384,33 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
        return NULL;
    }

+    bdrv_graph_rdlock_main_loop();
    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
-        return NULL;
+        goto error_rdlock;
    }

    if (!bdrv_is_inserted(target)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(target));
-        return NULL;
+        goto error_rdlock;
    }

    if (compress && !bdrv_supports_compressed_writes(target)) {
        error_setg(errp, "Compression is not supported for this drive %s",
                   bdrv_get_device_name(target));
-        return NULL;
+        goto error_rdlock;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
-        return NULL;
+        goto error_rdlock;
    }

    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
-        return NULL;
+        goto error_rdlock;
    }
+    bdrv_graph_rdunlock_main_loop();

    if (perf->max_workers < 1 || perf->max_workers > INT_MAX) {
        error_setg(errp, "max-workers must be between 1 and %d", INT_MAX);
@@ -437,6 +438,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,

    len = bdrv_getlength(bs);
    if (len < 0) {
+        GRAPH_RDLOCK_GUARD_MAINLOOP();
        error_setg_errno(errp, -len, "Unable to get length for '%s'",
                         bdrv_get_device_or_node_name(bs));
        goto error;
@@ -444,6 +446,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,

    target_len = bdrv_getlength(target);
    if (target_len < 0) {
+        GRAPH_RDLOCK_GUARD_MAINLOOP();
        error_setg_errno(errp, -target_len, "Unable to get length for '%s'",
                         bdrv_get_device_or_node_name(bs));
        goto error;
@@ -493,8 +496,10 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
    block_copy_set_speed(bcs, speed);

    /* Required permissions are taken by copy-before-write filter target */
+    bdrv_graph_wrlock(target);
    block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
                       &error_abort);
+    bdrv_graph_wrunlock();

    return &job->common;

@@ -507,4 +512,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
    }

    return NULL;
+
+error_rdlock:
+    bdrv_graph_rdunlock_main_loop();
+    return NULL;
 }
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -508,6 +508,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
        goto out;
    }

+    bdrv_graph_rdlock_main_loop();
+
    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
@@ -520,7 +522,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
    if (s->align && (s->align >= INT_MAX || !is_power_of_2(s->align))) {
        error_setg(errp, "Cannot meet constraints with align %" PRIu64,
                   s->align);
-        goto out;
+        goto out_rdlock;
    }
    align = MAX(s->align, bs->file->bs->bl.request_alignment);

@@ -530,7 +532,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         !QEMU_IS_ALIGNED(s->max_transfer, align))) {
        error_setg(errp, "Cannot meet constraints with max-transfer %" PRIu64,
                   s->max_transfer);
-        goto out;
+        goto out_rdlock;
    }

    s->opt_write_zero = qemu_opt_get_size(opts, "opt-write-zero", 0);
@@ -539,7 +541,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         !QEMU_IS_ALIGNED(s->opt_write_zero, align))) {
        error_setg(errp, "Cannot meet constraints with opt-write-zero %" PRIu64,
                   s->opt_write_zero);
-        goto out;
+        goto out_rdlock;
    }

    s->max_write_zero = qemu_opt_get_size(opts, "max-write-zero", 0);
@@ -549,7 +551,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
                          MAX(s->opt_write_zero, align)))) {
        error_setg(errp, "Cannot meet constraints with max-write-zero %" PRIu64,
                   s->max_write_zero);
-        goto out;
+        goto out_rdlock;
    }

    s->opt_discard = qemu_opt_get_size(opts, "opt-discard", 0);
@@ -558,7 +560,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         !QEMU_IS_ALIGNED(s->opt_discard, align))) {
        error_setg(errp, "Cannot meet constraints with opt-discard %" PRIu64,
                   s->opt_discard);
-        goto out;
+        goto out_rdlock;
    }

    s->max_discard = qemu_opt_get_size(opts, "max-discard", 0);
@@ -568,12 +570,14 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
                          MAX(s->opt_discard, align)))) {
        error_setg(errp, "Cannot meet constraints with max-discard %" PRIu64,
                   s->max_discard);
-        goto out;
+        goto out_rdlock;
    }

    bdrv_debug_event(bs, BLKDBG_NONE);

    ret = 0;
+out_rdlock:
+    bdrv_graph_rdunlock_main_loop();
 out:
    if (ret < 0) {
        qemu_mutex_destroy(&s->lock);
@@ -746,13 +750,10 @@ blkdebug_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
    return bdrv_co_pdiscard(bs->file, offset, bytes);
 }

-static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs,
-                                                 bool want_zero,
-                                                 int64_t offset,
-                                                 int64_t bytes,
-                                                 int64_t *pnum,
-                                                 int64_t *map,
-                                                 BlockDriverState **file)
+static int coroutine_fn GRAPH_RDLOCK
+blkdebug_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
+                         int64_t bytes, int64_t *pnum, int64_t *map,
+                         BlockDriverState **file)
 {
    int err;

@@ -973,7 +974,7 @@ blkdebug_co_getlength(BlockDriverState *bs)
    return bdrv_co_getlength(bs->file->bs);
 }

-static void blkdebug_refresh_filename(BlockDriverState *bs)
+static void GRAPH_RDLOCK blkdebug_refresh_filename(BlockDriverState *bs)
 {
    BDRVBlkdebugState *s = bs->opaque;
    const QDictEntry *e;
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -13,6 +13,7 @@
 #include "block/block_int.h"
 #include "exec/memory.h"
 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
+#include "qemu/defer-call.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qapi/qmp/qdict.h"
@@ -312,10 +313,10 @@ static void blkio_detach_aio_context(BlockDriverState *bs)
 }

 /*
- * Called by blk_io_unplug() or immediately if not plugged. Called without
- * blkio_lock.
+ * Called by defer_call_end() or immediately if not in a deferred section.
+ * Called without blkio_lock.
 */
-static void blkio_unplug_fn(void *opaque)
+static void blkio_deferred_fn(void *opaque)
 {
    BDRVBlkioState *s = opaque;

@@ -332,7 +333,7 @@ static void blkio_submit_io(BlockDriverState *bs)
 {
    BDRVBlkioState *s = bs->opaque;

-    blk_io_plug_call(blkio_unplug_fn, s);
+    defer_call(blkio_deferred_fn, s);
 }

 static int coroutine_fn
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -130,7 +130,13 @@ static int coroutine_fn GRAPH_RDLOCK blkreplay_co_flush(BlockDriverState *bs)
 static int blkreplay_snapshot_goto(BlockDriverState *bs,
                                   const char *snapshot_id)
 {
-    return bdrv_snapshot_goto(bs->file->bs, snapshot_id, NULL);
+    BlockDriverState *file_bs;
+
+    bdrv_graph_rdlock_main_loop();
+    file_bs = bs->file->bs;
+    bdrv_graph_rdunlock_main_loop();
+
+    return bdrv_snapshot_goto(file_bs, snapshot_id, NULL);
 }

 static BlockDriver bdrv_blkreplay = {
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -33,8 +33,8 @@ typedef struct BlkverifyRequest {
    uint64_t bytes;
    int flags;

-    int (*request_fn)(BdrvChild *, int64_t, int64_t, QEMUIOVector *,
-                      BdrvRequestFlags);
+    int GRAPH_RDLOCK_PTR (*request_fn)(
+        BdrvChild *, int64_t, int64_t, QEMUIOVector *, BdrvRequestFlags);

    int ret;                    /* test image result */
    int raw_ret;                /* raw image result */
@@ -170,8 +170,11 @@ static void coroutine_fn blkverify_do_test_req(void *opaque)
    BlkverifyRequest *r = opaque;
    BDRVBlkverifyState *s = r->bs->opaque;

+    bdrv_graph_co_rdlock();
    r->ret = r->request_fn(s->test_file, r->offset, r->bytes, r->qiov,
                           r->flags);
+    bdrv_graph_co_rdunlock();
+
    r->done++;
    qemu_coroutine_enter_if_inactive(r->co);
 }
@@ -180,13 +183,16 @@ static void coroutine_fn blkverify_do_raw_req(void *opaque)
 {
    BlkverifyRequest *r = opaque;

+    bdrv_graph_co_rdlock();
    r->raw_ret = r->request_fn(r->bs->file, r->offset, r->bytes, r->raw_qiov,
                               r->flags);
+    bdrv_graph_co_rdunlock();
+
    r->done++;
    qemu_coroutine_enter_if_inactive(r->co);
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 blkverify_co_prwv(BlockDriverState *bs, BlkverifyRequest *r, uint64_t offset,
                  uint64_t bytes, QEMUIOVector *qiov, QEMUIOVector *raw_qiov,
                  int flags, bool is_write)
@@ -222,7 +228,7 @@ blkverify_co_prwv(BlockDriverState *bs, BlkverifyRequest *r, uint64_t offset,
    return r->ret;
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 blkverify_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                    QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
@@ -251,7 +257,7 @@ blkverify_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
    return ret;
 }

-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 blkverify_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
                     QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
@@ -282,7 +288,7 @@ blkverify_recurse_can_replace(BlockDriverState *bs,
           bdrv_recurse_can_replace(s->test_file->bs, to_replace);
 }

-static void blkverify_refresh_filename(BlockDriverState *bs)
+static void GRAPH_RDLOCK blkverify_refresh_filename(BlockDriverState *bs)
 {
    BDRVBlkverifyState *s = bs->opaque;

--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -931,10 +931,12 @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
    GLOBAL_STATE_CODE();
    bdrv_ref(bs);
+    bdrv_graph_wrlock(bs);
    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
                                       BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
                                       blk->perm, blk->shared_perm,
                                       blk, errp);
+    bdrv_graph_wrunlock();
    if (blk->root == NULL) {
        return -EPERM;
    }
@@ -2666,6 +2668,8 @@ int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
 {
    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if (!blk_is_available(blk)) {
        return -ENOMEDIUM;
    }
@@ -2726,6 +2730,7 @@ int blk_commit_all(void)
 {
    BlockBackend *blk = NULL;
    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();

    while ((blk = blk_all_next(blk)) != NULL) {
        AioContext *aio_context = blk_get_aio_context(blk);
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -313,7 +313,12 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
 {
    int ret;
    BlockDriverInfo bdi;
-    bool target_does_cow = bdrv_backing_chain_next(target);
+    bool target_does_cow;
+
+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
+    target_does_cow = bdrv_backing_chain_next(target);

    /*
     * If there is no backing file on the target, we cannot rely on COW if our
@@ -355,6 +360,8 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
    BdrvDirtyBitmap *copy_bitmap;
    bool is_fleecing;

+    GLOBAL_STATE_CODE();
+
    cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
    if (cluster_size < 0) {
        return NULL;
@@ -392,7 +399,9 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
     * For more information see commit f8d59dfb40bb and test
     * tests/qemu-iotests/222
     */
+    bdrv_graph_rdlock_main_loop();
    is_fleecing = bdrv_chain_contains(target->bs, source->bs);
+    bdrv_graph_rdunlock_main_loop();

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -105,6 +105,8 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
    struct bochs_header bochs;
    int ret;

+    GLOBAL_STATE_CODE();
+
    /* No write support yet */
    bdrv_graph_rdlock_main_loop();
    ret = bdrv_apply_auto_read_only(bs, NULL, errp);
@@ -118,6 +120,8 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
        return ret;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    ret = bdrv_pread(bs->file, 0, sizeof(bochs), &bochs, 0);
    if (ret < 0) {
        return ret;
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -67,6 +67,8 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
    uint32_t offsets_size, max_compressed_block_size = 1, i;
    int ret;

+    GLOBAL_STATE_CODE();
+
    bdrv_graph_rdlock_main_loop();
    ret = bdrv_apply_auto_read_only(bs, NULL, errp);
    bdrv_graph_rdunlock_main_loop();
@@ -79,6 +81,8 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
        return ret;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    /* read header */
    ret = bdrv_pread(bs->file, 128, 4, &s->block_size, 0);
    if (ret < 0) {
--- a/block/commit.c
+++ b/block/commit.c
@@ -48,8 +48,10 @@ static int commit_prepare(Job *job)
 {
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);

+    bdrv_graph_rdlock_main_loop();
    bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
    s->chain_frozen = false;
+    bdrv_graph_rdunlock_main_loop();

    /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
     * the normal backing chain can be restored. */
@@ -66,9 +68,12 @@ static void commit_abort(Job *job)
 {
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    BlockDriverState *top_bs = blk_bs(s->top);
+    BlockDriverState *commit_top_backing_bs;

    if (s->chain_frozen) {
+        bdrv_graph_rdlock_main_loop();
        bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
+        bdrv_graph_rdunlock_main_loop();
    }

    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
@@ -90,8 +95,15 @@ static void commit_abort(Job *job)
     * XXX Can (or should) we somehow keep 'consistent read' blocked even
     * after the failed/cancelled commit job is gone? If we already wrote
     * something to base, the intermediate images aren't valid any more. */
-    bdrv_replace_node(s->commit_top_bs, s->commit_top_bs->backing->bs,
-                      &error_abort);
+    bdrv_graph_rdlock_main_loop();
+    commit_top_backing_bs = s->commit_top_bs->backing->bs;
+    bdrv_graph_rdunlock_main_loop();
+
+    bdrv_drained_begin(commit_top_backing_bs);
+    bdrv_graph_wrlock(commit_top_backing_bs);
+    bdrv_replace_node(s->commit_top_bs, commit_top_backing_bs, &error_abort);
+    bdrv_graph_wrunlock();
+    bdrv_drained_end(commit_top_backing_bs);

    bdrv_unref(s->commit_top_bs);
    bdrv_unref(top_bs);
@@ -210,7 +222,7 @@ bdrv_commit_top_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
 }

-static void bdrv_commit_top_refresh_filename(BlockDriverState *bs)
+static GRAPH_RDLOCK void bdrv_commit_top_refresh_filename(BlockDriverState *bs)
 {
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
            bs->backing->bs->filename);
@@ -255,10 +267,13 @@ void commit_start(const char *job_id, BlockDriverState *bs,
    GLOBAL_STATE_CODE();

    assert(top != bs);
+    bdrv_graph_rdlock_main_loop();
    if (bdrv_skip_filters(top) == bdrv_skip_filters(base)) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
+        bdrv_graph_rdunlock_main_loop();
        return;
    }
+    bdrv_graph_rdunlock_main_loop();

    base_size = bdrv_getlength(base);
    if (base_size < 0) {
@@ -324,6 +339,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
     * this is the responsibility of the interface (i.e. whoever calls
     * commit_start()).
     */
+    bdrv_graph_wrlock(top);
    s->base_overlay = bdrv_find_overlay(top, base);
    assert(s->base_overlay);

@@ -354,16 +370,20 @@ void commit_start(const char *job_id, BlockDriverState *bs,
        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 iter_shared_perms, errp);
        if (ret < 0) {
+            bdrv_graph_wrunlock();
            goto fail;
        }
    }

    if (bdrv_freeze_backing_chain(commit_top_bs, base, errp) < 0) {
+        bdrv_graph_wrunlock();
        goto fail;
    }
    s->chain_frozen = true;

    ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
+    bdrv_graph_wrunlock();
+
    if (ret < 0) {
        goto fail;
    }
@@ -396,7 +416,9 @@ void commit_start(const char *job_id, BlockDriverState *bs,

 fail:
    if (s->chain_frozen) {
+        bdrv_graph_rdlock_main_loop();
        bdrv_unfreeze_backing_chain(commit_top_bs, base);
+        bdrv_graph_rdunlock_main_loop();
    }
    if (s->base) {
        blk_unref(s->base);
@@ -411,7 +433,11 @@ fail:
    /* commit_top_bs has to be replaced after deleting the block job,
     * otherwise this would fail because of lack of permissions. */
    if (commit_top_bs) {
+        bdrv_drained_begin(top);
+        bdrv_graph_wrlock(top);
        bdrv_replace_node(commit_top_bs, top, &error_abort);
+        bdrv_graph_wrunlock();
+        bdrv_drained_end(top);
    }
 }

--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -203,7 +203,7 @@ static int coroutine_fn GRAPH_RDLOCK cbw_co_flush(BlockDriverState *bs)
 * It's guaranteed that guest writes will not interact in the region until
 * cbw_snapshot_read_unlock() called.
 */
-static coroutine_fn BlockReq *
+static BlockReq * coroutine_fn GRAPH_RDLOCK
 cbw_snapshot_read_lock(BlockDriverState *bs, int64_t offset, int64_t bytes,
                       int64_t *pnum, BdrvChild **file)
 {
@@ -335,7 +335,7 @@ cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
    return bdrv_co_pdiscard(s->target, offset, bytes);
 }

-static void cbw_refresh_filename(BlockDriverState *bs)
+static void GRAPH_RDLOCK cbw_refresh_filename(BlockDriverState *bs)
 {
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
            bs->file->bs->filename);
@@ -433,6 +433,8 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
        return -EINVAL;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    ctx = bdrv_get_aio_context(bs);
    aio_context_acquire(ctx);

--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -35,8 +35,8 @@ typedef struct BDRVStateCOR {
 } BDRVStateCOR;


-static int cor_open(BlockDriverState *bs, QDict *options, int flags,
-                    Error **errp)
+static int GRAPH_UNLOCKED
+cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
 {
    BlockDriverState *bottom_bs = NULL;
    BDRVStateCOR *state = bs->opaque;
@@ -44,11 +44,15 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags,
    const char *bottom_node = qdict_get_try_str(options, "bottom");
    int ret;

+    GLOBAL_STATE_CODE();
+
    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    if (ret < 0) {
        return ret;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    bs->supported_read_flags = BDRV_REQ_PREFETCH;

    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
@@ -227,13 +231,17 @@ cor_co_lock_medium(BlockDriverState *bs, bool locked)
 }


-static void cor_close(BlockDriverState *bs)
+static void GRAPH_UNLOCKED cor_close(BlockDriverState *bs)
 {
    BDRVStateCOR *s = bs->opaque;

+    GLOBAL_STATE_CODE();
+
    if (s->chain_frozen) {
+        bdrv_graph_rdlock_main_loop();
        s->chain_frozen = false;
        bdrv_unfreeze_backing_chain(bs, s->bottom_bs);
+        bdrv_graph_rdunlock_main_loop();
    }

    bdrv_unref(s->bottom_bs);
@@ -263,12 +271,15 @@ static BlockDriver bdrv_copy_on_read = {
 };


-void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs)
+void no_coroutine_fn bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs)
 {
    BDRVStateCOR *s = cor_filter_bs->opaque;

+    GLOBAL_STATE_CODE();
+
    /* unfreeze, as otherwise bdrv_replace_node() will fail */
    if (s->chain_frozen) {
+        GRAPH_RDLOCK_GUARD_MAINLOOP();
        s->chain_frozen = false;
        bdrv_unfreeze_backing_chain(cor_filter_bs, s->bottom_bs);
    }
--- a/block/copy-on-read.h
+++ b/block/copy-on-read.h
@@ -27,6 +27,7 @@

 #include "block/block_int.h"

-void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs);
+void no_coroutine_fn GRAPH_UNLOCKED
+bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs);

 #endif /* BLOCK_COPY_ON_READ_H */
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -65,6 +65,9 @@ static int block_crypto_read_func(QCryptoBlock *block,
    BlockDriverState *bs = opaque;
    ssize_t ret;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    ret = bdrv_pread(bs->file, offset, buflen, buf, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read encryption header");
@@ -83,6 +86,9 @@ static int block_crypto_write_func(QCryptoBlock *block,
    BlockDriverState *bs = opaque;
    ssize_t ret;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    ret = bdrv_pwrite(bs->file, offset, buflen, buf, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write encryption header");
@@ -263,11 +269,15 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
    unsigned int cflags = 0;
    QDict *cryptoopts = NULL;

+    GLOBAL_STATE_CODE();
+
    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    if (ret < 0) {
        return ret;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    bs->supported_write_flags = BDRV_REQ_FUA &
        bs->file->bs->supported_write_flags;

--- a/block/dmg.c
+++ b/block/dmg.c
@@ -70,7 +70,8 @@ static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
    return 0;
 }

-static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result)
+static int GRAPH_RDLOCK
+read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result)
 {
    uint64_t buffer;
    int ret;
@@ -84,7 +85,8 @@ static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result)
    return 0;
 }

-static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result)
+static int GRAPH_RDLOCK
+read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result)
 {
    uint32_t buffer;
    int ret;
@@ -321,8 +323,9 @@ fail:
    return ret;
 }

-static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds,
-                                  uint64_t info_begin, uint64_t info_length)
+static int GRAPH_RDLOCK
+dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds,
+                       uint64_t info_begin, uint64_t info_length)
 {
    BDRVDMGState *s = bs->opaque;
    int ret;
@@ -388,8 +391,9 @@ fail:
    return ret;
 }

-static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds,
-                              uint64_t info_begin, uint64_t info_length)
+static int GRAPH_RDLOCK
+dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds,
+                   uint64_t info_begin, uint64_t info_length)
 {
    BDRVDMGState *s = bs->opaque;
    int ret;
@@ -452,6 +456,8 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    int64_t offset;
    int ret;

+    GLOBAL_STATE_CODE();
+
    bdrv_graph_rdlock_main_loop();
    ret = bdrv_apply_auto_read_only(bs, NULL, errp);
    bdrv_graph_rdunlock_main_loop();
@@ -463,6 +469,9 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    if (ret < 0) {
        return ret;
    }
+
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    /*
     * NB: if uncompress submodules are absent,
     * ie block_module_load return value == 0, the function pointers
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -160,7 +160,6 @@ typedef struct BDRVRawState {
    bool has_write_zeroes:1;
    bool use_linux_aio:1;
    bool use_linux_io_uring:1;
-    int64_t *offset; /* offset of zone append operation */
    int page_cache_inconsistent; /* errno from fdatasync failure */
    bool has_fallocate;
    bool needs_alignment;
@@ -2445,12 +2444,13 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
    return true;
 }

-static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
+static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
                                   uint64_t bytes, QEMUIOVector *qiov, int type)
 {
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData acb;
    int ret;
+    uint64_t offset = *offset_ptr;

    if (fd_open(bs) < 0)
        return -EIO;
@@ -2513,8 +2513,8 @@ out:
            uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
            if (!BDRV_ZT_IS_CONV(*wp)) {
                if (type & QEMU_AIO_ZONE_APPEND) {
-                    *s->offset = *wp;
-                    trace_zbd_zone_append_complete(bs, *s->offset
+                    *offset_ptr = *wp;
+                    trace_zbd_zone_append_complete(bs, *offset_ptr
                        >> BDRV_SECTOR_BITS);
                }
                /* Advance the wp if needed */
@@ -2523,7 +2523,10 @@ out:
                }
            }
        } else {
-            update_zones_wp(bs, s->fd, 0, 1);
+            /*
+             * write and append write are not allowed to cross zone boundaries
+             */
+            update_zones_wp(bs, s->fd, offset, 1);
        }

        qemu_co_mutex_unlock(&wps->colock);
@@ -2536,14 +2539,14 @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
                                      int64_t bytes, QEMUIOVector *qiov,
                                      BdrvRequestFlags flags)
 {
-    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
+    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ);
 }

 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
                                       int64_t bytes, QEMUIOVector *qiov,
                                       BdrvRequestFlags flags)
 {
-    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
+    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE);
 }

 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
@@ -3470,7 +3473,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                        len >> BDRV_SECTOR_BITS);
    ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
    if (ret != 0) {
-        update_zones_wp(bs, s->fd, offset, i);
+        update_zones_wp(bs, s->fd, offset, nrz);
        error_report("ioctl %s failed %d", op_name, ret);
        return ret;
    }
@@ -3506,8 +3509,6 @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
    int64_t zone_size_mask = bs->bl.zone_size - 1;
    int64_t iov_len = 0;
    int64_t len = 0;
-    BDRVRawState *s = bs->opaque;
-    s->offset = offset;

    if (*offset & zone_size_mask) {
        error_report("sector offset %" PRId64 " is not aligned to zone size "
@@ -3528,7 +3529,7 @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
    }

    trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
-    return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+    return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND);
 }
 #endif

--- a/block/filter-compress.c
+++ b/block/filter-compress.c
@@ -36,6 +36,8 @@ static int compress_open(BlockDriverState *bs, QDict *options, int flags,
        return ret;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if (!bs->file->bs->drv || !block_driver_can_compress(bs->file->bs->drv)) {
        error_setg(errp,
                   "Compression is not supported for underlying format: %s",
@@ -97,7 +99,8 @@ compress_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 }


-static void compress_refresh_limits(BlockDriverState *bs, Error **errp)
+static void GRAPH_RDLOCK
+compress_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BlockDriverInfo bdi;
    int ret;
--- a/block/io.c
+++ b/block/io.c
@@ -3685,6 +3685,8 @@ out:
 void bdrv_cancel_in_flight(BlockDriverState *bs)
 {
    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if (!bs || !bs->drv) {
        return;
    }
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -15,6 +15,7 @@
 #include "block/block.h"
 #include "block/raw-aio.h"
 #include "qemu/coroutine.h"
+#include "qemu/defer-call.h"
 #include "qapi/error.h"
 #include "sysemu/block-backend.h"
 #include "trace.h"
@@ -124,6 +125,9 @@ static void luring_process_completions(LuringState *s)
 {
    struct io_uring_cqe *cqes;
    int total_bytes;
+
+    defer_call_begin();
+
    /*
     * Request completion callbacks can run the nested event loop.
     * Schedule ourselves so the nested event loop will "see" remaining
@@ -216,7 +220,10 @@ end:
            aio_co_wake(luringcb->co);
        }
    }
+
    qemu_bh_cancel(s->completion_bh);
+
+    defer_call_end();
 }

 static int ioq_submit(LuringState *s)
@@ -306,7 +313,7 @@ static void ioq_init(LuringQueue *io_q)
    io_q->blocked = false;
 }

-static void luring_unplug_fn(void *opaque)
+static void luring_deferred_fn(void *opaque)
 {
    LuringState *s = opaque;
    trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
@@ -367,7 +374,7 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
            return ret;
        }

-        blk_io_plug_call(luring_unplug_fn, s);
+        defer_call(luring_deferred_fn, s);
    }
    return 0;
 }
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -14,6 +14,7 @@
 #include "block/raw-aio.h"
 #include "qemu/event_notifier.h"
 #include "qemu/coroutine.h"
+#include "qemu/defer-call.h"
 #include "qapi/error.h"
 #include "sysemu/block-backend.h"

@@ -204,6 +205,8 @@ static void qemu_laio_process_completions(LinuxAioState *s)
 {
    struct io_event *events;

+    defer_call_begin();
+
    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

@@ -230,6 +233,8 @@ static void qemu_laio_process_completions(LinuxAioState *s)
     * own `for` loop.  If we are the last all counters dropped to zero. */
    s->event_max = 0;
    s->event_idx = 0;
+
+    defer_call_end();
 }

 static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
@@ -353,7 +358,7 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
    return max_batch;
 }

-static void laio_unplug_fn(void *opaque)
+static void laio_deferred_fn(void *opaque)
 {
    LinuxAioState *s = opaque;

@@ -393,7 +398,7 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
            ioq_submit(s);
        } else {
-            blk_io_plug_call(laio_unplug_fn, s);
+            defer_call(laio_deferred_fn, s);
        }
    }

--- a/block/meson.build
+++ b/block/meson.build
@@ -21,7 +21,6 @@ block_ss.add(files(
  'mirror.c',
  'nbd.c',
  'null.c',
-  'plug.c',
  'preallocate.c',
  'progress_meter.c',
  'qapi.c',
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -55,10 +55,18 @@ typedef struct MirrorBlockJob {
    BlockMirrorBackingMode backing_mode;
    /* Whether the target image requires explicit zero-initialization */
    bool zero_target;
+    /*
+     * To be accessed with atomics. Written only under the BQL (required by the
+     * current implementation of mirror_change()).
+     */
    MirrorCopyMode copy_mode;
    BlockdevOnError on_source_error, on_target_error;
-    /* Set when the target is synced (dirty bitmap is clean, nothing
-     * in flight) and the job is running in active mode */
+    /*
+     * To be accessed with atomics.
+     *
+     * Set when the target is synced (dirty bitmap is clean, nothing in flight)
+     * and the job is running in active mode.
+     */
    bool actively_synced;
    bool should_complete;
    int64_t granularity;
@@ -122,7 +130,7 @@ typedef enum MirrorMethod {
 static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
 {
-    s->actively_synced = false;
+    qatomic_set(&s->actively_synced, false);
    if (read) {
        return block_job_error_action(&s->common, s->on_source_error,
                                      true, error);
@@ -471,7 +479,7 @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
    return bytes_handled;
 }

-static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
+static void coroutine_fn GRAPH_RDLOCK mirror_iteration(MirrorBlockJob *s)
 {
    BlockDriverState *source = s->mirror_top_bs->backing->bs;
    MirrorOp *pseudo_op;
@@ -670,6 +678,7 @@ static int mirror_exit_common(Job *job)
    s->prepared = true;

    aio_context_acquire(qemu_get_aio_context());
+    bdrv_graph_rdlock_main_loop();

    mirror_top_bs = s->mirror_top_bs;
    bs_opaque = mirror_top_bs->opaque;
@@ -688,6 +697,8 @@ static int mirror_exit_common(Job *job)
    bdrv_ref(mirror_top_bs);
    bdrv_ref(target_bs);

+    bdrv_graph_rdunlock_main_loop();
+
    /*
     * Remove target parent that still uses BLK_PERM_WRITE/RESIZE before
     * inserting target_bs at s->to_replace, where we might not be able to get
@@ -701,12 +712,12 @@ static int mirror_exit_common(Job *job)
     * these permissions any more means that we can't allow any new requests on
     * mirror_top_bs from now on, so keep it drained. */
    bdrv_drained_begin(mirror_top_bs);
+    bdrv_drained_begin(target_bs);
    bs_opaque->stop = true;

    bdrv_graph_rdlock_main_loop();
    bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
                             &error_abort);
-    bdrv_graph_rdunlock_main_loop();

    if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
@@ -729,6 +740,7 @@ static int mirror_exit_common(Job *job)
            local_err = NULL;
        }
    }
+    bdrv_graph_rdunlock_main_loop();

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
@@ -746,15 +758,13 @@ static int mirror_exit_common(Job *job)
        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        assert(s->in_drain);
-        bdrv_drained_begin(target_bs);
+        bdrv_drained_begin(to_replace);
        /*
         * Cannot use check_to_replace_node() here, because that would
         * check for an op blocker on @to_replace, and we have our own
         * there.
-         *
-         * TODO Pull out the writer lock from bdrv_replace_node() to here
         */
-        bdrv_graph_rdlock_main_loop();
+        bdrv_graph_wrlock(target_bs);
        if (bdrv_recurse_can_replace(src, to_replace)) {
            bdrv_replace_node(to_replace, target_bs, &local_err);
        } else {
@@ -763,8 +773,8 @@ static int mirror_exit_common(Job *job)
                       "would not lead to an abrupt change of visible data",
                       to_replace->node_name, target_bs->node_name);
        }
-        bdrv_graph_rdunlock_main_loop();
-        bdrv_drained_end(target_bs);
+        bdrv_graph_wrunlock();
+        bdrv_drained_end(to_replace);
        if (local_err) {
            error_report_err(local_err);
            ret = -EPERM;
@@ -779,7 +789,6 @@ static int mirror_exit_common(Job *job)
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
-    bdrv_unref(target_bs);

    /*
     * Remove the mirror filter driver from the graph. Before this, get rid of
@@ -787,7 +796,12 @@ static int mirror_exit_common(Job *job)
     * valid.
     */
    block_job_remove_all_bdrv(bjob);
+    bdrv_graph_wrlock(mirror_top_bs);
    bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort);
+    bdrv_graph_wrunlock();
+
+    bdrv_drained_end(target_bs);
+    bdrv_unref(target_bs);

    bs_opaque->job = NULL;

@@ -825,14 +839,18 @@ static void coroutine_fn mirror_throttle(MirrorBlockJob *s)
    }
 }

-static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
+static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
 {
    int64_t offset;
-    BlockDriverState *bs = s->mirror_top_bs->backing->bs;
+    BlockDriverState *bs;
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret;
    int64_t count;

+    bdrv_graph_co_rdlock();
+    bs = s->mirror_top_bs->backing->bs;
+    bdrv_graph_co_rdunlock();
+
    if (s->zero_target) {
        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length);
@@ -912,7 +930,7 @@ static int coroutine_fn mirror_flush(MirrorBlockJob *s)
 static int coroutine_fn mirror_run(Job *job, Error **errp)
 {
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
-    BlockDriverState *bs = s->mirror_top_bs->backing->bs;
+    BlockDriverState *bs;
    MirrorBDSOpaque *mirror_top_opaque = s->mirror_top_bs->opaque;
    BlockDriverState *target_bs = blk_bs(s->target);
    bool need_drain = true;
@@ -924,6 +942,10 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
                                 checking for a NULL string */
    int ret = 0;

+    bdrv_graph_co_rdlock();
+    bs = bdrv_filter_bs(s->mirror_top_bs);
+    bdrv_graph_co_rdunlock();
+
    if (job_is_cancelled(&s->common.job)) {
        goto immediate_exit;
    }
@@ -962,7 +984,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
    if (s->bdev_length == 0) {
        /* Transition to the READY state and wait for complete. */
        job_transition_to_ready(&s->common.job);
-        s->actively_synced = true;
+        qatomic_set(&s->actively_synced, true);
        while (!job_cancel_requested(&s->common.job) && !s->should_complete) {
            job_yield(&s->common.job);
        }
@@ -984,13 +1006,13 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
    } else {
        s->target_cluster_size = BDRV_SECTOR_SIZE;
    }
-    bdrv_graph_co_rdunlock();
    if (backing_filename[0] && !bdrv_backing_chain_next(target_bs) &&
        s->granularity < s->target_cluster_size) {
        s->buf_size = MAX(s->buf_size, s->target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);
+    bdrv_graph_co_rdunlock();

    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
@@ -1056,7 +1078,9 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
                mirror_wait_for_free_in_flight_slot(s);
                continue;
            } else if (cnt != 0) {
+                bdrv_graph_co_rdlock();
                mirror_iteration(s);
+                bdrv_graph_co_rdunlock();
            }
        }

@@ -1074,9 +1098,9 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
                 * the target in a consistent state.
                 */
                job_transition_to_ready(&s->common.job);
-                if (s->copy_mode != MIRROR_COPY_MODE_BACKGROUND) {
-                    s->actively_synced = true;
-                }
+            }
+            if (qatomic_read(&s->copy_mode) != MIRROR_COPY_MODE_BACKGROUND) {
+                qatomic_set(&s->actively_synced, true);
            }

            should_complete = s->should_complete ||
@@ -1246,6 +1270,48 @@ static bool commit_active_cancel(Job *job, bool force)
    return force || !job_is_ready(job);
 }

+static void mirror_change(BlockJob *job, BlockJobChangeOptions *opts,
+                          Error **errp)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+    BlockJobChangeOptionsMirror *change_opts = &opts->u.mirror;
+    MirrorCopyMode current;
+
+    /*
+     * The implementation relies on the fact that copy_mode is only written
+     * under the BQL. Otherwise, further synchronization would be required.
+     */
+
+    GLOBAL_STATE_CODE();
+
+    if (qatomic_read(&s->copy_mode) == change_opts->copy_mode) {
+        return;
+    }
+
+    if (change_opts->copy_mode != MIRROR_COPY_MODE_WRITE_BLOCKING) {
+        error_setg(errp, "Change to copy mode '%s' is not implemented",
+                   MirrorCopyMode_str(change_opts->copy_mode));
+        return;
+    }
+
+    current = qatomic_cmpxchg(&s->copy_mode, MIRROR_COPY_MODE_BACKGROUND,
+                              change_opts->copy_mode);
+    if (current != MIRROR_COPY_MODE_BACKGROUND) {
+        error_setg(errp, "Expected current copy mode '%s', got '%s'",
+                   MirrorCopyMode_str(MIRROR_COPY_MODE_BACKGROUND),
+                   MirrorCopyMode_str(current));
+    }
+}
+
+static void mirror_query(BlockJob *job, BlockJobInfo *info)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    info->u.mirror = (BlockJobInfoMirror) {
+        .actively_synced = qatomic_read(&s->actively_synced),
+    };
+}
+
 static const BlockJobDriver mirror_job_driver = {
    .job_driver = {
        .instance_size          = sizeof(MirrorBlockJob),
@@ -1260,6 +1326,8 @@ static const BlockJobDriver mirror_job_driver = {
        .cancel                 = mirror_cancel,
    },
    .drained_poll           = mirror_drained_poll,
+    .change                 = mirror_change,
+    .query                  = mirror_query,
 };

 static const BlockJobDriver commit_active_job_driver = {
@@ -1378,7 +1446,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
        bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity);
        bdrv_set_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
                              bitmap_end - bitmap_offset);
-        job->actively_synced = false;
+        qatomic_set(&job->actively_synced, false);

        action = mirror_error_action(job, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT) {
@@ -1437,7 +1505,8 @@ static void coroutine_fn GRAPH_RDLOCK active_write_settle(MirrorOp *op)
    uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes,
                                      op->s->granularity);

-    if (!--op->s->in_active_write_counter && op->s->actively_synced) {
+    if (!--op->s->in_active_write_counter &&
+        qatomic_read(&op->s->actively_synced)) {
        BdrvChild *source = op->s->mirror_top_bs->backing;

        if (QLIST_FIRST(&source->bs->parents) == source &&
@@ -1463,21 +1532,21 @@ bdrv_mirror_top_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
 }

+static bool should_copy_to_target(MirrorBDSOpaque *s)
+{
+    return s->job && s->job->ret >= 0 &&
+        !job_is_cancelled(&s->job->common.job) &&
+        qatomic_read(&s->job->copy_mode) == MIRROR_COPY_MODE_WRITE_BLOCKING;
+}
+
 static int coroutine_fn GRAPH_RDLOCK
 bdrv_mirror_top_do_write(BlockDriverState *bs, MirrorMethod method,
-                         uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
-                         int flags)
+                         bool copy_to_target, uint64_t offset, uint64_t bytes,
+                         QEMUIOVector *qiov, int flags)
 {
    MirrorOp *op = NULL;
    MirrorBDSOpaque *s = bs->opaque;
    int ret = 0;
-    bool copy_to_target = false;
-
-    if (s->job) {
-        copy_to_target = s->job->ret >= 0 &&
-                         !job_is_cancelled(&s->job->common.job) &&
-                         s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
-    }

    if (copy_to_target) {
        op = active_write_prepare(s->job, offset, bytes);
@@ -1500,6 +1569,11 @@ bdrv_mirror_top_do_write(BlockDriverState *bs, MirrorMethod method,
        abort();
    }

+    if (!copy_to_target && s->job && s->job->dirty_bitmap) {
+        qatomic_set(&s->job->actively_synced, false);
+        bdrv_set_dirty_bitmap(s->job->dirty_bitmap, offset, bytes);
+    }
+
    if (ret < 0) {
        goto out;
    }
@@ -1519,17 +1593,10 @@ static int coroutine_fn GRAPH_RDLOCK
 bdrv_mirror_top_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
                        QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
-    MirrorBDSOpaque *s = bs->opaque;
    QEMUIOVector bounce_qiov;
    void *bounce_buf;
    int ret = 0;
-    bool copy_to_target = false;
-
-    if (s->job) {
-        copy_to_target = s->job->ret >= 0 &&
-                         !job_is_cancelled(&s->job->common.job) &&
-                         s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
-    }
+    bool copy_to_target = should_copy_to_target(bs->opaque);

    if (copy_to_target) {
        /* The guest might concurrently modify the data to write; but
@@ -1546,8 +1613,8 @@ bdrv_mirror_top_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
        flags &= ~BDRV_REQ_REGISTERED_BUF;
    }

-    ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov,
-                                   flags);
+    ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, copy_to_target,
+                                   offset, bytes, qiov, flags);

    if (copy_to_target) {
        qemu_iovec_destroy(&bounce_qiov);
@@ -1570,18 +1637,20 @@ static int coroutine_fn GRAPH_RDLOCK
 bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
                              int64_t bytes, BdrvRequestFlags flags)
 {
-    return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_ZERO, offset, bytes, NULL,
-                                    flags);
+    bool copy_to_target = should_copy_to_target(bs->opaque);
+    return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_ZERO, copy_to_target,
+                                    offset, bytes, NULL, flags);
 }

 static int coroutine_fn GRAPH_RDLOCK
 bdrv_mirror_top_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 {
-    return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_DISCARD, offset, bytes,
-                                    NULL, 0);
+    bool copy_to_target = should_copy_to_target(bs->opaque);
+    return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_DISCARD, copy_to_target,
+                                    offset, bytes, NULL, 0);
 }

-static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs)
+static void GRAPH_RDLOCK bdrv_mirror_top_refresh_filename(BlockDriverState *bs)
 {
    if (bs->backing == NULL) {
        /* we can be here after failed bdrv_attach_child in
@@ -1691,12 +1760,15 @@ static BlockJob *mirror_start_job(
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

+    bdrv_graph_rdlock_main_loop();
    if (bdrv_skip_filters(bs) == bdrv_skip_filters(target)) {
        error_setg(errp, "Can't mirror node into itself");
+        bdrv_graph_rdunlock_main_loop();
        return NULL;
    }

    target_is_backing = bdrv_chain_contains(bs, target);
+    bdrv_graph_rdunlock_main_loop();

    /* In the case of active commit, add dummy driver to provide consistent
     * reads on the top, while disabling it in the intermediate nodes, and make
@@ -1779,14 +1851,19 @@ static BlockJob *mirror_start_job(
        }

        target_shared_perms |= BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
-    } else if (bdrv_chain_contains(bs, bdrv_skip_filters(target))) {
-        /*
-         * We may want to allow this in the future, but it would
-         * require taking some extra care.
-         */
-        error_setg(errp, "Cannot mirror to a filter on top of a node in the "
-                   "source's backing chain");
-        goto fail;
+    } else {
+        bdrv_graph_rdlock_main_loop();
+        if (bdrv_chain_contains(bs, bdrv_skip_filters(target))) {
+            /*
+             * We may want to allow this in the future, but it would
+             * require taking some extra care.
+             */
+            error_setg(errp, "Cannot mirror to a filter on top of a node in "
+                       "the source's backing chain");
+            bdrv_graph_rdunlock_main_loop();
+            goto fail;
+        }
+        bdrv_graph_rdunlock_main_loop();
    }

    s->target = blk_new(s->common.job.aio_context,
@@ -1807,13 +1884,14 @@ static BlockJob *mirror_start_job(
    blk_set_allow_aio_context_change(s->target, true);
    blk_set_disable_request_queuing(s->target, true);

+    bdrv_graph_rdlock_main_loop();
    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->is_none_mode = is_none_mode;
    s->backing_mode = backing_mode;
    s->zero_target = zero_target;
-    s->copy_mode = copy_mode;
+    qatomic_set(&s->copy_mode, copy_mode);
    s->base = base;
    s->base_overlay = bdrv_find_overlay(bs, base);
    s->granularity = granularity;
@@ -1822,20 +1900,27 @@ static BlockJob *mirror_start_job(
    if (auto_complete) {
        s->should_complete = true;
    }
+    bdrv_graph_rdunlock_main_loop();

-    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
+    s->dirty_bitmap = bdrv_create_dirty_bitmap(s->mirror_top_bs, granularity,
+                                               NULL, errp);
    if (!s->dirty_bitmap) {
        goto fail;
    }
-    if (s->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING) {
-        bdrv_disable_dirty_bitmap(s->dirty_bitmap);
-    }

+    /*
+     * The dirty bitmap is set by bdrv_mirror_top_do_write() when not in active
+     * mode.
+     */
+    bdrv_disable_dirty_bitmap(s->dirty_bitmap);
+
+    bdrv_graph_wrlock(bs);
    ret = block_job_add_bdrv(&s->common, "source", bs, 0,
                             BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE |
                             BLK_PERM_CONSISTENT_READ,
                             errp);
    if (ret < 0) {
+        bdrv_graph_wrunlock();
        goto fail;
    }

@@ -1880,14 +1965,17 @@ static BlockJob *mirror_start_job(
            ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                     iter_shared_perms, errp);
            if (ret < 0) {
+                bdrv_graph_wrunlock();
                goto fail;
            }
        }

        if (bdrv_freeze_backing_chain(mirror_top_bs, target, errp) < 0) {
+            bdrv_graph_wrunlock();
            goto fail;
        }
    }
+    bdrv_graph_wrunlock();

    QTAILQ_INIT(&s->ops_in_flight);

@@ -1912,11 +2000,14 @@ fail:
    }

    bs_opaque->stop = true;
-    bdrv_graph_rdlock_main_loop();
+    bdrv_drained_begin(bs);
+    bdrv_graph_wrlock(bs);
+    assert(mirror_top_bs->backing->bs == bs);
    bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
                             &error_abort);
-    bdrv_graph_rdunlock_main_loop();
-    bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort);
+    bdrv_replace_node(mirror_top_bs, bs, &error_abort);
+    bdrv_graph_wrunlock();
+    bdrv_drained_end(bs);

    bdrv_unref(mirror_top_bs);

@@ -1945,8 +2036,12 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                   MirrorSyncMode_str(mode));
        return;
    }
+
+    bdrv_graph_rdlock_main_loop();
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? bdrv_backing_chain_next(bs) : NULL;
+    bdrv_graph_rdunlock_main_loop();
+
    mirror_start_job(job_id, bs, creation_flags, target, replaces,
                     speed, granularity, buf_size, backing_mode, zero_target,
                     on_source_error, on_target_error, unmap, NULL, NULL,
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -206,6 +206,9 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
    BlockBackend *blk;
    int ret;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if (!strcmp(device, "all")) {
        ret = blk_commit_all();
    } else {
@@ -846,7 +849,7 @@ void hmp_info_block_jobs(Monitor *mon, const QDict *qdict)
    }

    while (list) {
-        if (strcmp(list->value->type, "stream") == 0) {
+        if (list->value->type == JOB_TYPE_STREAM) {
            monitor_printf(mon, "Streaming device %s: Completed %" PRId64
                           " of %" PRId64 " bytes, speed limit %" PRId64
                           " bytes/s\n",
@@ -858,7 +861,7 @@ void hmp_info_block_jobs(Monitor *mon, const QDict *qdict)
            monitor_printf(mon, "Type %s, device %s: Completed %" PRId64
                           " of %" PRId64 " bytes, speed limit %" PRId64
                           " bytes/s\n",
-                           list->value->type,
+                           JobType_str(list->value->type),
                           list->value->device,
                           list->value->offset,
                           list->value->len,
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -16,6 +16,7 @@
 #include "qapi/error.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
+#include "qemu/defer-call.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "qemu/module.h"
@@ -416,9 +417,10 @@ static bool nvme_process_completion(NVMeQueuePair *q)
            q->cq_phase = !q->cq_phase;
        }
        cid = le16_to_cpu(c->cid);
-        if (cid == 0 || cid > NVME_QUEUE_SIZE) {
-            warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
-                        "queue size: %u", cid, NVME_QUEUE_SIZE);
+        if (cid == 0 || cid > NVME_NUM_REQS) {
+            warn_report("NVMe: Unexpected CID in completion queue: %" PRIu32
+                        ", should be within: 1..%u inclusively", cid,
+                        NVME_NUM_REQS);
            continue;
        }
        trace_nvme_complete_command(s, q->index, cid);
@@ -476,7 +478,7 @@ static void nvme_trace_command(const NvmeCmd *cmd)
    }
 }

-static void nvme_unplug_fn(void *opaque)
+static void nvme_deferred_fn(void *opaque)
 {
    NVMeQueuePair *q = opaque;

@@ -503,7 +505,7 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
    q->need_kick++;
    qemu_mutex_unlock(&q->lock);

-    blk_io_plug_call(nvme_unplug_fn, q);
+    defer_call(nvme_deferred_fn, q);
 }

 static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
--- a/block/parallels-ext.c
+++ b/block/parallels-ext.c
@@ -59,11 +59,10 @@ typedef struct ParallelsDirtyBitmapFeature {
 } QEMU_PACKED ParallelsDirtyBitmapFeature;

 /* Given L1 table read bitmap data from the image and populate @bitmap */
-static int parallels_load_bitmap_data(BlockDriverState *bs,
-                                      const uint64_t *l1_table,
-                                      uint32_t l1_size,
-                                      BdrvDirtyBitmap *bitmap,
-                                      Error **errp)
+static int GRAPH_RDLOCK
+parallels_load_bitmap_data(BlockDriverState *bs, const uint64_t *l1_table,
+                           uint32_t l1_size, BdrvDirtyBitmap *bitmap,
+                           Error **errp)
 {
    BDRVParallelsState *s = bs->opaque;
    int ret = 0;
@@ -120,17 +119,16 @@ finish:
 * @data buffer (of @data_size size) is the Dirty bitmaps feature which
 * consists of ParallelsDirtyBitmapFeature followed by L1 table.
 */
-static BdrvDirtyBitmap *parallels_load_bitmap(BlockDriverState *bs,
-                                              uint8_t *data,
-                                              size_t data_size,
-                                              Error **errp)
+static BdrvDirtyBitmap * GRAPH_RDLOCK
+parallels_load_bitmap(BlockDriverState *bs, uint8_t *data, size_t data_size,
+                      Error **errp)
 {
    int ret;
    ParallelsDirtyBitmapFeature bf;
    g_autofree uint64_t *l1_table = NULL;
    BdrvDirtyBitmap *bitmap;
    QemuUUID uuid;
-    char uuidstr[UUID_FMT_LEN + 1];
+    char uuidstr[UUID_STR_LEN];
    int i;

    if (data_size < sizeof(bf)) {
@@ -183,8 +181,9 @@ static BdrvDirtyBitmap *parallels_load_bitmap(BlockDriverState *bs,
    return bitmap;
 }

-static int parallels_parse_format_extension(BlockDriverState *bs,
-                                            uint8_t *ext_cluster, Error **errp)
+static int GRAPH_RDLOCK
+parallels_parse_format_extension(BlockDriverState *bs, uint8_t *ext_cluster,
+                                 Error **errp)
 {
    BDRVParallelsState *s = bs->opaque;
    int ret;
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -200,7 +200,7 @@ static int mark_used(BlockDriverState *bs, unsigned long *bitmap,
 * bitmap anyway, as much as we can. This information will be used for
 * error resolution.
 */
-static int parallels_fill_used_bitmap(BlockDriverState *bs)
+static int GRAPH_RDLOCK parallels_fill_used_bitmap(BlockDriverState *bs)
 {
    BDRVParallelsState *s = bs->opaque;
    int64_t payload_bytes;
@@ -415,14 +415,10 @@ parallels_co_flush_to_os(BlockDriverState *bs)
    return 0;
 }

-
-static int coroutine_fn parallels_co_block_status(BlockDriverState *bs,
-                                                  bool want_zero,
-                                                  int64_t offset,
-                                                  int64_t bytes,
-                                                  int64_t *pnum,
-                                                  int64_t *map,
-                                                  BlockDriverState **file)
+static int coroutine_fn GRAPH_RDLOCK
+parallels_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
+                          int64_t bytes, int64_t *pnum, int64_t *map,
+                          BlockDriverState **file)
 {
    BDRVParallelsState *s = bs->opaque;
    int count;
@@ -1189,7 +1185,7 @@ static int parallels_probe(const uint8_t *buf, int buf_size,
    return 0;
 }

-static int parallels_update_header(BlockDriverState *bs)
+static int GRAPH_RDLOCK parallels_update_header(BlockDriverState *bs)
 {
    BDRVParallelsState *s = bs->opaque;
    unsigned size = MAX(bdrv_opt_mem_align(bs->file->bs),
@@ -1259,6 +1255,8 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
        return ret;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    file_nb_sectors = bdrv_nb_sectors(bs->file->bs);
    if (file_nb_sectors < 0) {
        return -EINVAL;
@@ -1363,13 +1361,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
        bitmap_new(DIV_ROUND_UP(s->header_size, s->bat_dirty_block));

    /* Disable migration until bdrv_activate method is added */
-    bdrv_graph_rdlock_main_loop();
    error_setg(&s->migration_blocker, "The Parallels format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
-    bdrv_graph_rdunlock_main_loop();

-    ret = migrate_add_blocker(&s->migration_blocker, errp);
+    ret = migrate_add_blocker_normal(&s->migration_blocker, errp);
    if (ret < 0) {
        goto fail;
    }
@@ -1432,6 +1428,8 @@ static void parallels_close(BlockDriverState *bs)
 {
    BDRVParallelsState *s = bs->opaque;

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if ((bs->open_flags & BDRV_O_RDWR) && !(bs->open_flags & BDRV_O_INACTIVE)) {
        s->header->inuse = 0;
        parallels_update_header(bs);
--- a/block/parallels.h
+++ b/block/parallels.h
@@ -90,7 +90,8 @@ typedef struct BDRVParallelsState {
    Error *migration_blocker;
 } BDRVParallelsState;

-int parallels_read_format_extension(BlockDriverState *bs,
-                                    int64_t ext_off, Error **errp);
+int GRAPH_RDLOCK
+parallels_read_format_extension(BlockDriverState *bs, int64_t ext_off,
+                                Error **errp);

 #endif
--- a/block/plug.c
+++ b/block/plug.c
@@ -1,159 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Block I/O plugging
- *
- * Copyright Red Hat.
- *
- * This API defers a function call within a blk_io_plug()/blk_io_unplug()
- * section, allowing multiple calls to batch up. This is a performance
- * optimization that is used in the block layer to submit several I/O requests
- * at once instead of individually:
- *
- *   blk_io_plug(); <-- start of plugged region
- *   ...
- *   blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
- *   blk_io_plug_call(my_func, my_obj); <-- another
- *   blk_io_plug_call(my_func, my_obj); <-- another
- *   ...
- *   blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once
- *
- * This code is actually generic and not tied to the block layer. If another
- * subsystem needs this functionality, it could be renamed.
- */
-
-#include "qemu/osdep.h"
-#include "qemu/coroutine-tls.h"
-#include "qemu/notify.h"
-#include "qemu/thread.h"
-#include "sysemu/block-backend.h"
-
-/* A function call that has been deferred until unplug() */
-typedef struct {
-    void (*fn)(void *);
-    void *opaque;
-} UnplugFn;
-
-/* Per-thread state */
-typedef struct {
-    unsigned count;       /* how many times has plug() been called? */
-    GArray *unplug_fns;   /* functions to call at unplug time */
-} Plug;
-
-/* Use get_ptr_plug() to fetch this thread-local value */
-QEMU_DEFINE_STATIC_CO_TLS(Plug, plug);
-
-/* Called at thread cleanup time */
-static void blk_io_plug_atexit(Notifier *n, void *value)
-{
-    Plug *plug = get_ptr_plug();
-    g_array_free(plug->unplug_fns, TRUE);
-}
-
-/* This won't involve coroutines, so use __thread */
-static __thread Notifier blk_io_plug_atexit_notifier;
-
-/**
- * blk_io_plug_call:
- * @fn: a function pointer to be invoked
- * @opaque: a user-defined argument to @fn()
- *
- * Call @fn(@opaque) immediately if not within a blk_io_plug()/blk_io_unplug()
- * section.
- *
- * Otherwise defer the call until the end of the outermost
- * blk_io_plug()/blk_io_unplug() section in this thread. If the same
- * @fn/@opaque pair has already been deferred, it will only be called once upon
- * blk_io_unplug() so that accumulated calls are batched into a single call.
- *
- * The caller must ensure that @opaque is not freed before @fn() is invoked.
- */
-void blk_io_plug_call(void (*fn)(void *), void *opaque)
-{
-    Plug *plug = get_ptr_plug();
-
-    /* Call immediately if we're not plugged */
-    if (plug->count == 0) {
-        fn(opaque);
-        return;
-    }
-
-    GArray *array = plug->unplug_fns;
-    if (!array) {
-        array = g_array_new(FALSE, FALSE, sizeof(UnplugFn));
-        plug->unplug_fns = array;
-        blk_io_plug_atexit_notifier.notify = blk_io_plug_atexit;
-        qemu_thread_atexit_add(&blk_io_plug_atexit_notifier);
-    }
-
-    UnplugFn *fns = (UnplugFn *)array->data;
-    UnplugFn new_fn = {
-        .fn = fn,
-        .opaque = opaque,
-    };
-
-    /*
-     * There won't be many, so do a linear search. If this becomes a bottleneck
-     * then a binary search (glib 2.62+) or different data structure could be
-     * used.
-     */
-    for (guint i = 0; i < array->len; i++) {
-        if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) {
-            return; /* already exists */
-        }
-    }
-
-    g_array_append_val(array, new_fn);
-}
-
-/**
- * blk_io_plug: Defer blk_io_plug_call() functions until blk_io_unplug()
- *
- * blk_io_plug/unplug are thread-local operations. This means that multiple
- * threads can simultaneously call plug/unplug, but the caller must ensure that
- * each unplug() is called in the same thread of the matching plug().
- *
- * Nesting is supported. blk_io_plug_call() functions are only called at the
- * outermost blk_io_unplug().
- */
-void blk_io_plug(void)
-{
-    Plug *plug = get_ptr_plug();
-
-    assert(plug->count < UINT32_MAX);
-
-    plug->count++;
-}
-
-/**
- * blk_io_unplug: Run any pending blk_io_plug_call() functions
- *
- * There must have been a matching blk_io_plug() call in the same thread prior
- * to this blk_io_unplug() call.
- */
-void blk_io_unplug(void)
-{
-    Plug *plug = get_ptr_plug();
-
-    assert(plug->count > 0);
-
-    if (--plug->count > 0) {
-        return;
-    }
-
-    GArray *array = plug->unplug_fns;
-    if (!array) {
-        return;
-    }
-
-    UnplugFn *fns = (UnplugFn *)array->data;
-
-    for (guint i = 0; i < array->len; i++) {
-        fns[i].fn(fns[i].opaque);
-    }
-
-    /*
-     * This resets the array without freeing memory so that appending is cheap
-     * in the future.
-     */
-    g_array_set_size(array, 0);
-}
--- a/block/preallocate.c
+++ b/block/preallocate.c
@@ -143,6 +143,8 @@ static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
    BDRVPreallocateState *s = bs->opaque;
    int ret;

+    GLOBAL_STATE_CODE();
+
    /*
     * s->data_end and friends should be initialized on permission update.
     * For this to work, mark them invalid.
@@ -155,6 +157,8 @@ static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
        return ret;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
        return -EINVAL;
    }
@@ -169,7 +173,8 @@ static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
    return 0;
 }

-static int preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp)
+static int GRAPH_RDLOCK
+preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp)
 {
    BDRVPreallocateState *s = bs->opaque;
    int ret;
@@ -200,6 +205,9 @@ static void preallocate_close(BlockDriverState *bs)
 {
    BDRVPreallocateState *s = bs->opaque;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    qemu_bh_cancel(s->drop_resize_bh);
    qemu_bh_delete(s->drop_resize_bh);

@@ -223,6 +231,9 @@ static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
    PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
    int ret;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if (!preallocate_absorb_opts(opts, reopen_state->options,
                                 reopen_state->bs->file->bs, errp)) {
        g_free(opts);
@@ -283,7 +294,7 @@ static bool can_write_resize(uint64_t perm)
    return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
 }

-static bool has_prealloc_perms(BlockDriverState *bs)
+static bool GRAPH_RDLOCK has_prealloc_perms(BlockDriverState *bs)
 {
    BDRVPreallocateState *s = bs->opaque;

@@ -499,7 +510,8 @@ preallocate_co_getlength(BlockDriverState *bs)
    return ret;
 }

-static int preallocate_drop_resize(BlockDriverState *bs, Error **errp)
+static int GRAPH_RDLOCK
+preallocate_drop_resize(BlockDriverState *bs, Error **errp)
 {
    BDRVPreallocateState *s = bs->opaque;
    int ret;
@@ -525,15 +537,16 @@ static int preallocate_drop_resize(BlockDriverState *bs, Error **errp)
     */
    s->data_end = s->file_end = s->zero_start = -EINVAL;

-    bdrv_graph_rdlock_main_loop();
    bdrv_child_refresh_perms(bs, bs->file, NULL);
-    bdrv_graph_rdunlock_main_loop();

    return 0;
 }

 static void preallocate_drop_resize_bh(void *opaque)
 {
+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    /*
     * In case of errors, we'll simply keep the exclusive lock on the image
     * indefinitely.
@@ -541,8 +554,8 @@ static void preallocate_drop_resize_bh(void *opaque)
    preallocate_drop_resize(opaque, NULL);
 }

-static void preallocate_set_perm(BlockDriverState *bs,
-                                 uint64_t perm, uint64_t shared)
+static void GRAPH_RDLOCK
+preallocate_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
 {
    BDRVPreallocateState *s = bs->opaque;

--- a/block/qapi-sysemu.c
+++ b/block/qapi-sysemu.c
@@ -237,6 +237,7 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
                                            BlockDriverState *bs, Error **errp)
 {
    Error *local_err = NULL;
+    AioContext *ctx;
    bool has_device;
    int ret;

@@ -258,7 +259,11 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk,
        return;
    }

+    ctx = bdrv_get_aio_context(bs);
+    aio_context_acquire(ctx);
    ret = blk_insert_bs(blk, bs, errp);
+    aio_context_release(ctx);
+
    if (ret < 0) {
        return;
    }
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -124,9 +124,11 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,

    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    if (ret < 0) {
-        goto fail;
+        goto fail_unlocked;
    }

+    bdrv_graph_rdlock_main_loop();
+
    ret = bdrv_pread(bs->file, 0, sizeof(header), &header, 0);
    if (ret < 0) {
        goto fail;
@@ -301,13 +303,11 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* Disable migration when qcow images are used */
-    bdrv_graph_rdlock_main_loop();
    error_setg(&s->migration_blocker, "The qcow format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
-    bdrv_graph_rdunlock_main_loop();

-    ret = migrate_add_blocker(&s->migration_blocker, errp);
+    ret = migrate_add_blocker_normal(&s->migration_blocker, errp);
    if (ret < 0) {
        goto fail;
    }
@@ -315,9 +315,12 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    qobject_unref(encryptopts);
    qapi_free_QCryptoBlockOpenOptions(crypto_opts);
    qemu_co_mutex_init(&s->lock);
+    bdrv_graph_rdunlock_main_loop();
    return 0;

- fail:
+fail:
+    bdrv_graph_rdunlock_main_loop();
+fail_unlocked:
    g_free(s->l1_table);
    qemu_vfree(s->l2_cache);
    g_free(s->cluster_cache);
@@ -1024,7 +1027,7 @@ fail:
    return ret;
 }

-static int qcow_make_empty(BlockDriverState *bs)
+static int GRAPH_RDLOCK qcow_make_empty(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
--- a/block/qcow2-bitmap.c
+++ b/block/qcow2-bitmap.c
@@ -105,7 +105,7 @@ static inline bool can_write(BlockDriverState *bs)
    return !bdrv_is_read_only(bs) && !(bdrv_get_flags(bs) & BDRV_O_INACTIVE);
 }

-static int update_header_sync(BlockDriverState *bs)
+static int GRAPH_RDLOCK update_header_sync(BlockDriverState *bs)
 {
    int ret;

@@ -221,8 +221,9 @@ clear_bitmap_table(BlockDriverState *bs, uint64_t *bitmap_table,
    }
 }

-static int bitmap_table_load(BlockDriverState *bs, Qcow2BitmapTable *tb,
-                             uint64_t **bitmap_table)
+static int GRAPH_RDLOCK
+bitmap_table_load(BlockDriverState *bs, Qcow2BitmapTable *tb,
+                  uint64_t **bitmap_table)
 {
    int ret;
    BDRVQcow2State *s = bs->opaque;
@@ -551,8 +552,9 @@ static uint32_t bitmap_list_count(Qcow2BitmapList *bm_list)
 * Get bitmap list from qcow2 image. Actually reads bitmap directory,
 * checks it and convert to bitmap list.
 */
-static Qcow2BitmapList *bitmap_list_load(BlockDriverState *bs, uint64_t offset,
-                                         uint64_t size, Error **errp)
+static Qcow2BitmapList * GRAPH_RDLOCK
+bitmap_list_load(BlockDriverState *bs, uint64_t offset, uint64_t size,
+                 Error **errp)
 {
    int ret;
    BDRVQcow2State *s = bs->opaque;
@@ -961,7 +963,7 @@ static void set_readonly_helper(gpointer bitmap, gpointer value)
 * If header_updated is not NULL then it is set appropriately regardless of
 * the return value.
 */
-bool coroutine_fn GRAPH_RDLOCK
+bool coroutine_fn
 qcow2_load_dirty_bitmaps(BlockDriverState *bs,
                         bool *header_updated, Error **errp)
 {
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -391,11 +391,10 @@ fail:
 * If the L2 entry is invalid return -errno and set @type to
 * QCOW2_SUBCLUSTER_INVALID.
 */
-static int qcow2_get_subcluster_range_type(BlockDriverState *bs,
-                                           uint64_t l2_entry,
-                                           uint64_t l2_bitmap,
-                                           unsigned sc_from,
-                                           QCow2SubclusterType *type)
+static int GRAPH_RDLOCK
+qcow2_get_subcluster_range_type(BlockDriverState *bs, uint64_t l2_entry,
+                                uint64_t l2_bitmap, unsigned sc_from,
+                                QCow2SubclusterType *type)
 {
    BDRVQcow2State *s = bs->opaque;
    uint32_t val;
@@ -442,9 +441,10 @@ static int qcow2_get_subcluster_range_type(BlockDriverState *bs,
 * On failure return -errno and update @l2_index to point to the
 * invalid entry.
 */
-static int count_contiguous_subclusters(BlockDriverState *bs, int nb_clusters,
-                                        unsigned sc_index, uint64_t *l2_slice,
-                                        unsigned *l2_index)
+static int GRAPH_RDLOCK
+count_contiguous_subclusters(BlockDriverState *bs, int nb_clusters,
+                             unsigned sc_index, uint64_t *l2_slice,
+                             unsigned *l2_index)
 {
    BDRVQcow2State *s = bs->opaque;
    int i, count = 0;
@@ -1329,7 +1329,8 @@ calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset,
 * requires a new allocation (that is, if the cluster is unallocated
 * or has refcount > 1 and therefore cannot be written in-place).
 */
-static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
+static bool GRAPH_RDLOCK
+cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
 {
    switch (qcow2_get_cluster_type(bs, l2_entry)) {
    case QCOW2_CLUSTER_NORMAL:
@@ -1360,9 +1361,9 @@ static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
 * allocated and can be overwritten in-place (this includes clusters
 * of type QCOW2_CLUSTER_ZERO_ALLOC).
 */
-static int count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
-                                       uint64_t *l2_slice, int l2_index,
-                                       bool new_alloc)
+static int GRAPH_RDLOCK
+count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
+                            uint64_t *l2_slice, int l2_index, bool new_alloc)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t l2_entry = get_l2_entry(s, l2_slice, l2_index);
@@ -1983,7 +1984,7 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
            /* If we keep the reference, pass on the discard still */
            bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
                          s->cluster_size);
-       }
+        }
    }

    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
@@ -2061,9 +2062,15 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
        QCow2ClusterType type = qcow2_get_cluster_type(bs, old_l2_entry);
        bool unmap = (type == QCOW2_CLUSTER_COMPRESSED) ||
            ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type));
-        uint64_t new_l2_entry = unmap ? 0 : old_l2_entry;
+        bool keep_reference =
+            (s->discard_no_unref && type != QCOW2_CLUSTER_COMPRESSED);
+        uint64_t new_l2_entry = old_l2_entry;
        uint64_t new_l2_bitmap = old_l2_bitmap;

+        if (unmap && !keep_reference) {
+            new_l2_entry = 0;
+        }
+
        if (has_subclusters(s)) {
            new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
        } else {
@@ -2081,9 +2088,17 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
            set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
        }

-        /* Then decrease the refcount */
        if (unmap) {
-            qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
+            if (!keep_reference) {
+                /* Then decrease the refcount */
+                qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
+            } else if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] &&
+                       (type == QCOW2_CLUSTER_NORMAL ||
+                        type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+                /* If we keep the reference, pass on the discard still */
+                bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
+                            s->cluster_size);
+            }
        }
    }

--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -95,9 +95,10 @@ static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
 }


-static int qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
-                                      uint8_t *buf, size_t buflen,
-                                      void *opaque, Error **errp)
+static int GRAPH_RDLOCK
+qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
+                           uint8_t *buf, size_t buflen,
+                           void *opaque, Error **errp)
 {
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
@@ -156,7 +157,7 @@ qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen, void *opaque,


 /* The graph lock must be held when called in coroutine context */
-static int coroutine_mixed_fn
+static int coroutine_mixed_fn GRAPH_RDLOCK
 qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
                            const uint8_t *buf, size_t buflen,
                            void *opaque, Error **errp)
@@ -2029,6 +2030,8 @@ static void qcow2_reopen_commit(BDRVReopenState *state)
 {
    BDRVQcow2State *s = state->bs->opaque;

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    qcow2_update_options_commit(state->bs, state->opaque);
    if (!s->data_file) {
        /*
@@ -2064,6 +2067,8 @@ static void qcow2_reopen_abort(BDRVReopenState *state)
 {
    BDRVQcow2State *s = state->bs->opaque;

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if (!s->data_file) {
        /*
         * If we don't have an external data file, s->data_file was cleared by
@@ -3155,8 +3160,9 @@ fail:
    return ret;
 }

-static int qcow2_change_backing_file(BlockDriverState *bs,
-    const char *backing_file, const char *backing_fmt)
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
+                             const char *backing_fmt)
 {
    BDRVQcow2State *s = bs->opaque;

@@ -3816,8 +3822,11 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
            backing_format = BlockdevDriver_str(qcow2_opts->backing_fmt);
        }

-        ret = bdrv_change_backing_file(blk_bs(blk), qcow2_opts->backing_file,
-                                       backing_format, false);
+        bdrv_graph_co_rdlock();
+        ret = bdrv_co_change_backing_file(blk_bs(blk), qcow2_opts->backing_file,
+                                          backing_format, false);
+        bdrv_graph_co_rdunlock();
+
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
                             "with format '%s'", qcow2_opts->backing_file,
@@ -5222,8 +5231,8 @@ qcow2_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

-static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs,
-                                                  Error **errp)
+static ImageInfoSpecific * GRAPH_RDLOCK
+qcow2_get_specific_info(BlockDriverState *bs, Error **errp)
 {
    BDRVQcow2State *s = bs->opaque;
    ImageInfoSpecific *spec_info;
@@ -5302,7 +5311,8 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs,
    return spec_info;
 }

-static int coroutine_mixed_fn qcow2_has_zero_init(BlockDriverState *bs)
+static int coroutine_mixed_fn GRAPH_RDLOCK
+qcow2_has_zero_init(BlockDriverState *bs)
 {
    BDRVQcow2State *s = bs->opaque;
    bool preallocated;
@@ -6114,64 +6124,64 @@ static const char *const qcow2_strong_runtime_opts[] = {
 };

 BlockDriver bdrv_qcow2 = {
-    .format_name        = "qcow2",
-    .instance_size      = sizeof(BDRVQcow2State),
-    .bdrv_probe         = qcow2_probe,
-    .bdrv_open          = qcow2_open,
-    .bdrv_close         = qcow2_close,
-    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
-    .bdrv_reopen_commit   = qcow2_reopen_commit,
-    .bdrv_reopen_commit_post = qcow2_reopen_commit_post,
-    .bdrv_reopen_abort    = qcow2_reopen_abort,
-    .bdrv_join_options    = qcow2_join_options,
-    .bdrv_child_perm      = bdrv_default_perms,
-    .bdrv_co_create_opts  = qcow2_co_create_opts,
-    .bdrv_co_create       = qcow2_co_create,
-    .bdrv_has_zero_init   = qcow2_has_zero_init,
-    .bdrv_co_block_status = qcow2_co_block_status,
+    .format_name                        = "qcow2",
+    .instance_size                      = sizeof(BDRVQcow2State),
+    .bdrv_probe                         = qcow2_probe,
+    .bdrv_open                          = qcow2_open,
+    .bdrv_close                         = qcow2_close,
+    .bdrv_reopen_prepare                = qcow2_reopen_prepare,
+    .bdrv_reopen_commit                 = qcow2_reopen_commit,
+    .bdrv_reopen_commit_post            = qcow2_reopen_commit_post,
+    .bdrv_reopen_abort                  = qcow2_reopen_abort,
+    .bdrv_join_options                  = qcow2_join_options,
+    .bdrv_child_perm                    = bdrv_default_perms,
+    .bdrv_co_create_opts                = qcow2_co_create_opts,
+    .bdrv_co_create                     = qcow2_co_create,
+    .bdrv_has_zero_init                 = qcow2_has_zero_init,
+    .bdrv_co_block_status               = qcow2_co_block_status,

-    .bdrv_co_preadv_part    = qcow2_co_preadv_part,
-    .bdrv_co_pwritev_part   = qcow2_co_pwritev_part,
-    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
+    .bdrv_co_preadv_part                = qcow2_co_preadv_part,
+    .bdrv_co_pwritev_part               = qcow2_co_pwritev_part,
+    .bdrv_co_flush_to_os                = qcow2_co_flush_to_os,

-    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
-    .bdrv_co_pdiscard       = qcow2_co_pdiscard,
-    .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
-    .bdrv_co_copy_range_to  = qcow2_co_copy_range_to,
-    .bdrv_co_truncate       = qcow2_co_truncate,
-    .bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part,
-    .bdrv_make_empty        = qcow2_make_empty,
+    .bdrv_co_pwrite_zeroes              = qcow2_co_pwrite_zeroes,
+    .bdrv_co_pdiscard                   = qcow2_co_pdiscard,
+    .bdrv_co_copy_range_from            = qcow2_co_copy_range_from,
+    .bdrv_co_copy_range_to              = qcow2_co_copy_range_to,
+    .bdrv_co_truncate                   = qcow2_co_truncate,
+    .bdrv_co_pwritev_compressed_part    = qcow2_co_pwritev_compressed_part,
+    .bdrv_make_empty                    = qcow2_make_empty,

-    .bdrv_snapshot_create   = qcow2_snapshot_create,
-    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
-    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
-    .bdrv_snapshot_list     = qcow2_snapshot_list,
-    .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
-    .bdrv_measure           = qcow2_measure,
-    .bdrv_co_get_info       = qcow2_co_get_info,
-    .bdrv_get_specific_info = qcow2_get_specific_info,
+    .bdrv_snapshot_create               = qcow2_snapshot_create,
+    .bdrv_snapshot_goto                 = qcow2_snapshot_goto,
+    .bdrv_snapshot_delete               = qcow2_snapshot_delete,
+    .bdrv_snapshot_list                 = qcow2_snapshot_list,
+    .bdrv_snapshot_load_tmp             = qcow2_snapshot_load_tmp,
+    .bdrv_measure                       = qcow2_measure,
+    .bdrv_co_get_info                   = qcow2_co_get_info,
+    .bdrv_get_specific_info             = qcow2_get_specific_info,

-    .bdrv_co_save_vmstate   = qcow2_co_save_vmstate,
-    .bdrv_co_load_vmstate   = qcow2_co_load_vmstate,
+    .bdrv_co_save_vmstate               = qcow2_co_save_vmstate,
+    .bdrv_co_load_vmstate               = qcow2_co_load_vmstate,

-    .is_format                  = true,
-    .supports_backing           = true,
-    .bdrv_change_backing_file   = qcow2_change_backing_file,
+    .is_format                          = true,
+    .supports_backing                   = true,
+    .bdrv_co_change_backing_file        = qcow2_co_change_backing_file,

-    .bdrv_refresh_limits        = qcow2_refresh_limits,
-    .bdrv_co_invalidate_cache   = qcow2_co_invalidate_cache,
-    .bdrv_inactivate            = qcow2_inactivate,
+    .bdrv_refresh_limits                = qcow2_refresh_limits,
+    .bdrv_co_invalidate_cache           = qcow2_co_invalidate_cache,
+    .bdrv_inactivate                    = qcow2_inactivate,

-    .create_opts         = &qcow2_create_opts,
-    .amend_opts          = &qcow2_amend_opts,
-    .strong_runtime_opts = qcow2_strong_runtime_opts,
-    .mutable_opts        = mutable_opts,
-    .bdrv_co_check       = qcow2_co_check,
-    .bdrv_amend_options  = qcow2_amend_options,
-    .bdrv_co_amend       = qcow2_co_amend,
+    .create_opts                        = &qcow2_create_opts,
+    .amend_opts                         = &qcow2_amend_opts,
+    .strong_runtime_opts                = qcow2_strong_runtime_opts,
+    .mutable_opts                       = mutable_opts,
+    .bdrv_co_check                      = qcow2_co_check,
+    .bdrv_amend_options                 = qcow2_amend_options,
+    .bdrv_co_amend                      = qcow2_co_amend,

-    .bdrv_detach_aio_context  = qcow2_detach_aio_context,
-    .bdrv_attach_aio_context  = qcow2_attach_aio_context,
+    .bdrv_detach_aio_context            = qcow2_detach_aio_context,
+    .bdrv_attach_aio_context            = qcow2_attach_aio_context,

    .bdrv_supports_persistent_dirty_bitmap =
            qcow2_supports_persistent_dirty_bitmap,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -641,7 +641,7 @@ static inline void set_l2_bitmap(BDRVQcow2State *s, uint64_t *l2_slice,
    l2_slice[idx + 1] = cpu_to_be64(bitmap);
 }

-static inline bool has_data_file(BlockDriverState *bs)
+static inline bool GRAPH_RDLOCK has_data_file(BlockDriverState *bs)
 {
    BDRVQcow2State *s = bs->opaque;
    return (s->data_file != bs->file);
@@ -709,8 +709,8 @@ static inline int64_t qcow2_vm_state_offset(BDRVQcow2State *s)
    return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
 }

-static inline QCow2ClusterType qcow2_get_cluster_type(BlockDriverState *bs,
-                                                      uint64_t l2_entry)
+static inline QCow2ClusterType GRAPH_RDLOCK
+qcow2_get_cluster_type(BlockDriverState *bs, uint64_t l2_entry)
 {
    BDRVQcow2State *s = bs->opaque;

@@ -743,7 +743,7 @@ static inline QCow2ClusterType qcow2_get_cluster_type(BlockDriverState *bs,
 * (this checks the whole entry and bitmap, not only the bits related
 * to subcluster @sc_index).
 */
-static inline
+static inline GRAPH_RDLOCK
 QCow2SubclusterType qcow2_get_subcluster_type(BlockDriverState *bs,
                                              uint64_t l2_entry,
                                              uint64_t l2_bitmap,
@@ -834,9 +834,9 @@ int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                     int refcount_order, bool generous_increase,
                                     uint64_t *refblock_count);

-int qcow2_mark_dirty(BlockDriverState *bs);
-int qcow2_mark_corrupt(BlockDriverState *bs);
-int qcow2_update_header(BlockDriverState *bs);
+int GRAPH_RDLOCK qcow2_mark_dirty(BlockDriverState *bs);
+int GRAPH_RDLOCK qcow2_mark_corrupt(BlockDriverState *bs);
+int GRAPH_RDLOCK qcow2_update_header(BlockDriverState *bs);

 void GRAPH_RDLOCK
 qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
@@ -890,10 +890,11 @@ int GRAPH_RDLOCK qcow2_write_caches(BlockDriverState *bs);
 int coroutine_fn qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                                       BdrvCheckMode fix);

-void qcow2_process_discards(BlockDriverState *bs, int ret);
+void GRAPH_RDLOCK qcow2_process_discards(BlockDriverState *bs, int ret);

-int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
-                                 int64_t size);
+int GRAPH_RDLOCK
+qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
+                             int64_t size);
 int GRAPH_RDLOCK
 qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
                              int64_t size, bool data_file);
@@ -939,8 +940,9 @@ qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
 int coroutine_fn GRAPH_RDLOCK
 qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset,
                                      int compressed_size, uint64_t *host_offset);
-void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
-                                     uint64_t *coffset, int *csize);
+void GRAPH_RDLOCK
+qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
+                                uint64_t *coffset, int *csize);

 int coroutine_fn GRAPH_RDLOCK
 qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
@@ -972,11 +974,12 @@ int GRAPH_RDLOCK
 qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id,
                          const char *name, Error **errp);

-int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab);
-int qcow2_snapshot_load_tmp(BlockDriverState *bs,
-                            const char *snapshot_id,
-                            const char *name,
-                            Error **errp);
+int GRAPH_RDLOCK
+qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab);
+
+int GRAPH_RDLOCK
+qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_id,
+                        const char *name, Error **errp);

 void qcow2_free_snapshots(BlockDriverState *bs);
 int coroutine_fn GRAPH_RDLOCK
@@ -992,8 +995,9 @@ qcow2_check_fix_snapshot_table(BlockDriverState *bs, BdrvCheckResult *result,
                               BdrvCheckMode fix);

 /* qcow2-cache.c functions */
-Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
-                               unsigned table_size);
+Qcow2Cache * GRAPH_RDLOCK
+qcow2_cache_create(BlockDriverState *bs, int num_tables, unsigned table_size);
+
 int qcow2_cache_destroy(Qcow2Cache *c);

 void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
@@ -1019,17 +1023,24 @@ void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset);
 void qcow2_cache_discard(Qcow2Cache *c, void *table);

 /* qcow2-bitmap.c functions */
-int coroutine_fn
+int coroutine_fn GRAPH_RDLOCK
 qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size);
+
 bool coroutine_fn GRAPH_RDLOCK
-qcow2_load_dirty_bitmaps(BlockDriverState *bs, bool *header_updated, Error **errp);
-bool qcow2_get_bitmap_info_list(BlockDriverState *bs,
-                                Qcow2BitmapInfoList **info_list, Error **errp);
+qcow2_load_dirty_bitmaps(BlockDriverState *bs, bool *header_updated,
+                         Error **errp);
+
+bool GRAPH_RDLOCK
+qcow2_get_bitmap_info_list(BlockDriverState *bs,
+                           Qcow2BitmapInfoList **info_list, Error **errp);
+
 int GRAPH_RDLOCK qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp);
 int GRAPH_RDLOCK qcow2_reopen_bitmaps_ro(BlockDriverState *bs, Error **errp);
-int coroutine_fn qcow2_truncate_bitmaps_check(BlockDriverState *bs, Error **errp);
+
+int coroutine_fn GRAPH_RDLOCK
+qcow2_truncate_bitmaps_check(BlockDriverState *bs, Error **errp);

 bool GRAPH_RDLOCK
 qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, bool release_stored,
--- a/block/qed.c
+++ b/block/qed.c
@@ -612,7 +612,7 @@ static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
    return 0;
 }

-static void bdrv_qed_close(BlockDriverState *bs)
+static void GRAPH_RDLOCK bdrv_qed_do_close(BlockDriverState *bs)
 {
    BDRVQEDState *s = bs->opaque;

@@ -631,6 +631,14 @@ static void bdrv_qed_close(BlockDriverState *bs)
    qemu_vfree(s->l1_table);
 }

+static void GRAPH_UNLOCKED bdrv_qed_close(BlockDriverState *bs)
+{
+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
+    bdrv_qed_do_close(bs);
+}
+
 static int coroutine_fn GRAPH_UNLOCKED
 bdrv_qed_co_create(BlockdevCreateOptions *opts, Error **errp)
 {
@@ -1138,7 +1146,7 @@ out:
 /**
 * Check if the QED_F_NEED_CHECK bit should be set during allocating write
 */
-static bool qed_should_set_need_check(BDRVQEDState *s)
+static bool GRAPH_RDLOCK qed_should_set_need_check(BDRVQEDState *s)
 {
    /* The flush before L2 update path ensures consistency */
    if (s->bs->backing) {
@@ -1443,12 +1451,10 @@ bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
                          QED_AIOCB_WRITE | QED_AIOCB_ZERO);
 }

-static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
-                                             int64_t offset,
-                                             bool exact,
-                                             PreallocMode prealloc,
-                                             BdrvRequestFlags flags,
-                                             Error **errp)
+static int coroutine_fn GRAPH_RDLOCK
+bdrv_qed_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
+                     PreallocMode prealloc, BdrvRequestFlags flags,
+                     Error **errp)
 {
    BDRVQEDState *s = bs->opaque;
    uint64_t old_image_size;
@@ -1498,9 +1504,9 @@ bdrv_qed_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

-static int bdrv_qed_change_backing_file(BlockDriverState *bs,
-                                        const char *backing_file,
-                                        const char *backing_fmt)
+static int coroutine_fn GRAPH_RDLOCK
+bdrv_qed_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
+                                const char *backing_fmt)
 {
    BDRVQEDState *s = bs->opaque;
    QEDHeader new_header, le_header;
@@ -1562,7 +1568,7 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs,
    }

    /* Write new header */
-    ret = bdrv_pwrite_sync(bs->file, 0, buffer_len, buffer, 0);
+    ret = bdrv_co_pwrite_sync(bs->file, 0, buffer_len, buffer, 0);
    g_free(buffer);
    if (ret == 0) {
        memcpy(&s->header, &new_header, sizeof(new_header));
@@ -1576,7 +1582,7 @@ bdrv_qed_co_invalidate_cache(BlockDriverState *bs, Error **errp)
    BDRVQEDState *s = bs->opaque;
    int ret;

-    bdrv_qed_close(bs);
+    bdrv_qed_do_close(bs);

    bdrv_qed_init_state(bs);
    qemu_co_mutex_lock(&s->table_lock);
@@ -1636,34 +1642,34 @@ static QemuOptsList qed_create_opts = {
 };

 static BlockDriver bdrv_qed = {
-    .format_name              = "qed",
-    .instance_size            = sizeof(BDRVQEDState),
-    .create_opts              = &qed_create_opts,
-    .is_format                = true,
-    .supports_backing         = true,
+    .format_name                    = "qed",
+    .instance_size                  = sizeof(BDRVQEDState),
+    .create_opts                    = &qed_create_opts,
+    .is_format                      = true,
+    .supports_backing               = true,

-    .bdrv_probe               = bdrv_qed_probe,
-    .bdrv_open                = bdrv_qed_open,
-    .bdrv_close               = bdrv_qed_close,
-    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
-    .bdrv_child_perm          = bdrv_default_perms,
-    .bdrv_co_create           = bdrv_qed_co_create,
-    .bdrv_co_create_opts      = bdrv_qed_co_create_opts,
-    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
-    .bdrv_co_block_status     = bdrv_qed_co_block_status,
-    .bdrv_co_readv            = bdrv_qed_co_readv,
-    .bdrv_co_writev           = bdrv_qed_co_writev,
-    .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
-    .bdrv_co_truncate         = bdrv_qed_co_truncate,
-    .bdrv_co_getlength        = bdrv_qed_co_getlength,
-    .bdrv_co_get_info         = bdrv_qed_co_get_info,
-    .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
-    .bdrv_change_backing_file = bdrv_qed_change_backing_file,
-    .bdrv_co_invalidate_cache = bdrv_qed_co_invalidate_cache,
-    .bdrv_co_check            = bdrv_qed_co_check,
-    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
-    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
-    .bdrv_drain_begin         = bdrv_qed_drain_begin,
+    .bdrv_probe                     = bdrv_qed_probe,
+    .bdrv_open                      = bdrv_qed_open,
+    .bdrv_close                     = bdrv_qed_close,
+    .bdrv_reopen_prepare            = bdrv_qed_reopen_prepare,
+    .bdrv_child_perm                = bdrv_default_perms,
+    .bdrv_co_create                 = bdrv_qed_co_create,
+    .bdrv_co_create_opts            = bdrv_qed_co_create_opts,
+    .bdrv_has_zero_init             = bdrv_has_zero_init_1,
+    .bdrv_co_block_status           = bdrv_qed_co_block_status,
+    .bdrv_co_readv                  = bdrv_qed_co_readv,
+    .bdrv_co_writev                 = bdrv_qed_co_writev,
+    .bdrv_co_pwrite_zeroes          = bdrv_qed_co_pwrite_zeroes,
+    .bdrv_co_truncate               = bdrv_qed_co_truncate,
+    .bdrv_co_getlength              = bdrv_qed_co_getlength,
+    .bdrv_co_get_info               = bdrv_qed_co_get_info,
+    .bdrv_refresh_limits            = bdrv_qed_refresh_limits,
+    .bdrv_co_change_backing_file    = bdrv_qed_co_change_backing_file,
+    .bdrv_co_invalidate_cache       = bdrv_qed_co_invalidate_cache,
+    .bdrv_co_check                  = bdrv_qed_co_check,
+    .bdrv_detach_aio_context        = bdrv_qed_detach_aio_context,
+    .bdrv_attach_aio_context        = bdrv_qed_attach_aio_context,
+    .bdrv_drain_begin               = bdrv_qed_drain_begin,
 };

 static void bdrv_qed_init(void)
--- a/block/qed.h
+++ b/block/qed.h
@@ -185,7 +185,7 @@ enum {
 /**
 * Header functions
 */
-int qed_write_header_sync(BDRVQEDState *s);
+int GRAPH_RDLOCK qed_write_header_sync(BDRVQEDState *s);

 /**
 * L2 cache functions
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -95,9 +95,9 @@ end:
    return ret;
 }

-static int raw_apply_options(BlockDriverState *bs, BDRVRawState *s,
-                             uint64_t offset, bool has_size, uint64_t size,
-                             Error **errp)
+static int GRAPH_RDLOCK
+raw_apply_options(BlockDriverState *bs, BDRVRawState *s, uint64_t offset,
+                  bool has_size, uint64_t size, Error **errp)
 {
    int64_t real_size = 0;

@@ -145,6 +145,9 @@ static int raw_reopen_prepare(BDRVReopenState *reopen_state,
    uint64_t offset, size;
    int ret;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    assert(reopen_state != NULL);
    assert(reopen_state->bs != NULL);

@@ -279,11 +282,10 @@ fail:
    return ret;
 }

-static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
-                                            bool want_zero, int64_t offset,
-                                            int64_t bytes, int64_t *pnum,
-                                            int64_t *map,
-                                            BlockDriverState **file)
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
+                    int64_t bytes, int64_t *pnum, int64_t *map,
+                    BlockDriverState **file)
 {
    BDRVRawState *s = bs->opaque;
    *pnum = bytes;
@@ -397,7 +399,7 @@ raw_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return bdrv_co_get_info(bs->file->bs, bdi);
 }

-static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
+static void GRAPH_RDLOCK raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    bs->bl.has_variable_length = bs->file->bs->bl.has_variable_length;

@@ -452,7 +454,7 @@ raw_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
    return bdrv_co_ioctl(bs->file->bs, req, buf);
 }

-static int raw_has_zero_init(BlockDriverState *bs)
+static int GRAPH_RDLOCK raw_has_zero_init(BlockDriverState *bs)
 {
    return bdrv_has_zero_init(bs->file->bs);
 }
@@ -474,6 +476,8 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
    BdrvChildRole file_role;
    int ret;

+    GLOBAL_STATE_CODE();
+
    ret = raw_read_options(options, &offset, &has_size, &size, errp);
    if (ret < 0) {
        return ret;
@@ -491,6 +495,8 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,

    bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
                    file_role, false, errp);
+
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
    if (!bs->file) {
        return -EINVAL;
    }
@@ -505,9 +511,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                                   BDRV_REQ_ZERO_WRITE;

    if (bs->probed && !bdrv_is_read_only(bs)) {
-        bdrv_graph_rdlock_main_loop();
        bdrv_refresh_filename(bs->file->bs);
-        bdrv_graph_rdunlock_main_loop();
        fprintf(stderr,
                "WARNING: Image format was not specified for '%s' and probing "
                "guessed raw.\n"
@@ -543,7 +547,8 @@ static int raw_probe(const uint8_t *buf, int buf_size, const char *filename)
    return 1;
 }

-static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
+static int GRAPH_RDLOCK
+raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 {
    BDRVRawState *s = bs->opaque;
    int ret;
@@ -560,7 +565,8 @@ static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
    return 0;
 }

-static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
+static int GRAPH_RDLOCK
+raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 {
    BDRVRawState *s = bs->opaque;
    if (s->offset || s->has_size) {
@@ -610,7 +616,7 @@ static const char *const raw_strong_runtime_opts[] = {
    NULL
 };

-static void raw_cancel_in_flight(BlockDriverState *bs)
+static void GRAPH_RDLOCK raw_cancel_in_flight(BlockDriverState *bs)
 {
    bdrv_cancel_in_flight(bs->file->bs);
 }
--- a/block/replication.c
+++ b/block/replication.c
@@ -311,7 +311,7 @@ static void GRAPH_UNLOCKED
 secondary_do_checkpoint(BlockDriverState *bs, Error **errp)
 {
    BDRVReplicationState *s = bs->opaque;
-    BdrvChild *active_disk = bs->file;
+    BdrvChild *active_disk;
    Error *local_err = NULL;
    int ret;

@@ -328,6 +328,7 @@ secondary_do_checkpoint(BlockDriverState *bs, Error **errp)
        return;
    }

+    active_disk = bs->file;
    if (!active_disk->bs->drv) {
        error_setg(errp, "Active disk %s is ejected",
                   active_disk->bs->node_name);
@@ -363,6 +364,9 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
    BdrvChild *hidden_disk, *secondary_disk;
    BlockReopenQueue *reopen_queue = NULL;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    /*
     * s->hidden_disk and s->secondary_disk may not be set yet, as they will
     * only be set after the children are writable.
@@ -496,9 +500,11 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
    case REPLICATION_MODE_PRIMARY:
        break;
    case REPLICATION_MODE_SECONDARY:
+        bdrv_graph_rdlock_main_loop();
        active_disk = bs->file;
        if (!active_disk || !active_disk->bs || !active_disk->bs->backing) {
            error_setg(errp, "Active disk doesn't have backing file");
+            bdrv_graph_rdunlock_main_loop();
            aio_context_release(aio_context);
            return;
        }
@@ -506,11 +512,11 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
        hidden_disk = active_disk->bs->backing;
        if (!hidden_disk->bs || !hidden_disk->bs->backing) {
            error_setg(errp, "Hidden disk doesn't have backing file");
+            bdrv_graph_rdunlock_main_loop();
            aio_context_release(aio_context);
            return;
        }

-        bdrv_graph_rdlock_main_loop();
        secondary_disk = hidden_disk->bs->backing;
        if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
            error_setg(errp, "The secondary disk doesn't have block backend");
@@ -750,11 +756,13 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
            return;
        }

+        bdrv_graph_rdlock_main_loop();
        s->stage = BLOCK_REPLICATION_FAILOVER;
        s->commit_job = commit_active_start(
                            NULL, bs->file->bs, s->secondary_disk->bs,
                            JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
                            NULL, replication_done, bs, true, errp);
+        bdrv_graph_rdunlock_main_loop();
        break;
    default:
        aio_context_release(aio_context);
--- a/block/snapshot-access.c
+++ b/block/snapshot-access.c
@@ -73,7 +73,7 @@ snapshot_access_co_pwritev_part(BlockDriverState *bs,
 }


-static void snapshot_access_refresh_filename(BlockDriverState *bs)
+static void GRAPH_RDLOCK snapshot_access_refresh_filename(BlockDriverState *bs)
 {
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
            bs->file->bs->filename);
@@ -85,6 +85,9 @@ static int snapshot_access_open(BlockDriverState *bs, QDict *options, int flags,
    bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
                    BDRV_CHILD_DATA | BDRV_CHILD_PRIMARY,
                    false, errp);
+
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    if (!bs->file) {
        return -EINVAL;
    }
--- a/block/stream.c
+++ b/block/stream.c
@@ -53,13 +53,20 @@ static int coroutine_fn stream_populate(BlockBackend *blk,
 static int stream_prepare(Job *job)
 {
    StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
-    BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs);
-    BlockDriverState *unfiltered_bs_cow = bdrv_cow_bs(unfiltered_bs);
+    BlockDriverState *unfiltered_bs;
+    BlockDriverState *unfiltered_bs_cow;
    BlockDriverState *base;
    BlockDriverState *unfiltered_base;
    Error *local_err = NULL;
    int ret = 0;

+    GLOBAL_STATE_CODE();
+
+    bdrv_graph_rdlock_main_loop();
+    unfiltered_bs = bdrv_skip_filters(s->target_bs);
+    unfiltered_bs_cow = bdrv_cow_bs(unfiltered_bs);
+    bdrv_graph_rdunlock_main_loop();
+
    /* We should drop filter at this point, as filter hold the backing chain */
    bdrv_cor_filter_drop(s->cor_filter_bs);
    s->cor_filter_bs = NULL;
@@ -78,10 +85,12 @@ static int stream_prepare(Job *job)
        bdrv_drained_begin(unfiltered_bs_cow);
    }

+    bdrv_graph_rdlock_main_loop();
    base = bdrv_filter_or_cow_bs(s->above_base);
    unfiltered_base = bdrv_skip_filters(base);
+    bdrv_graph_rdunlock_main_loop();

-    if (bdrv_cow_child(unfiltered_bs)) {
+    if (unfiltered_bs_cow) {
        const char *base_id = NULL, *base_fmt = NULL;
        if (unfiltered_base) {
            base_id = s->backing_file_str ?: unfiltered_base->filename;
@@ -90,7 +99,9 @@ static int stream_prepare(Job *job)
            }
        }

+        bdrv_graph_wrlock(base);
        bdrv_set_backing_hd_drained(unfiltered_bs, base, &local_err);
+        bdrv_graph_wrunlock();

        /*
         * This call will do I/O, so the graph can change again from here on.
@@ -138,18 +149,19 @@ static void stream_clean(Job *job)
 static int coroutine_fn stream_run(Job *job, Error **errp)
 {
    StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
-    BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs);
+    BlockDriverState *unfiltered_bs;
    int64_t len;
    int64_t offset = 0;
    int error = 0;
    int64_t n = 0; /* bytes */

-    if (unfiltered_bs == s->base_overlay) {
-        /* Nothing to stream */
-        return 0;
-    }
-
    WITH_GRAPH_RDLOCK_GUARD() {
+        unfiltered_bs = bdrv_skip_filters(s->target_bs);
+        if (unfiltered_bs == s->base_overlay) {
+            /* Nothing to stream */
+            return 0;
+        }
+
        len = bdrv_co_getlength(s->target_bs);
        if (len < 0) {
            return len;
@@ -256,6 +268,8 @@ void stream_start(const char *job_id, BlockDriverState *bs,
    assert(!(base && bottom));
    assert(!(backing_file_str && bottom));

+    bdrv_graph_rdlock_main_loop();
+
    if (bottom) {
        /*
         * New simple interface. The code is written in terms of old interface
@@ -272,7 +286,7 @@ void stream_start(const char *job_id, BlockDriverState *bs,
        if (!base_overlay) {
            error_setg(errp, "'%s' is not in the backing chain of '%s'",
                       base->node_name, bs->node_name);
-            return;
+            goto out_rdlock;
        }

        /*
@@ -294,7 +308,7 @@ void stream_start(const char *job_id, BlockDriverState *bs,
    if (bs_read_only) {
        /* Hold the chain during reopen */
        if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) {
-            return;
+            goto out_rdlock;
        }

        ret = bdrv_reopen_set_read_only(bs, false, errp);
@@ -303,10 +317,12 @@ void stream_start(const char *job_id, BlockDriverState *bs,
        bdrv_unfreeze_backing_chain(bs, above_base);

        if (ret < 0) {
-            return;
+            goto out_rdlock;
        }
    }

+    bdrv_graph_rdunlock_main_loop();
+
    opts = qdict_new();

    qdict_put_str(opts, "driver", "copy-on-read");
@@ -350,8 +366,10 @@ void stream_start(const char *job_id, BlockDriverState *bs,
     * already have our own plans. Also don't allow resize as the image size is
     * queried only at the job start and then cached.
     */
+    bdrv_graph_wrlock(bs);
    if (block_job_add_bdrv(&s->common, "active node", bs, 0,
                           basic_flags | BLK_PERM_WRITE, errp)) {
+        bdrv_graph_wrunlock();
        goto fail;
    }

@@ -371,9 +389,11 @@ void stream_start(const char *job_id, BlockDriverState *bs,
        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 basic_flags, errp);
        if (ret < 0) {
+            bdrv_graph_wrunlock();
            goto fail;
        }
    }
+    bdrv_graph_wrunlock();

    s->base_overlay = base_overlay;
    s->above_base = above_base;
@@ -397,4 +417,8 @@ fail:
    if (bs_read_only) {
        bdrv_reopen_set_read_only(bs, true, NULL);
    }
+    return;
+
+out_rdlock:
+    bdrv_graph_rdunlock_main_loop();
 }
--- a/block/throttle.c
+++ b/block/throttle.c
@@ -84,6 +84,9 @@ static int throttle_open(BlockDriverState *bs, QDict *options,
    if (ret < 0) {
        return ret;
    }
+
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    bs->supported_write_flags = bs->file->bs->supported_write_flags |
                                BDRV_REQ_WRITE_UNCHANGED;
    bs->supported_zero_flags = bs->file->bs->supported_zero_flags |
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -239,7 +239,7 @@ static void vdi_header_to_le(VdiHeader *header)

 static void vdi_header_print(VdiHeader *header)
 {
-    char uuidstr[37];
+    char uuidstr[UUID_STR_LEN];
    QemuUUID uuid;
    logout("text        %s", header->text);
    logout("signature   0x%08x\n", header->signature);
@@ -383,6 +383,8 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
        return ret;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    logout("\n");

    ret = bdrv_pread(bs->file, 0, sizeof(header), &header, 0);
@@ -492,13 +494,11 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* Disable migration when vdi images are used */
-    bdrv_graph_rdlock_main_loop();
    error_setg(&s->migration_blocker, "The vdi format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
-    bdrv_graph_rdunlock_main_loop();

-    ret = migrate_add_blocker(&s->migration_blocker, errp);
+    ret = migrate_add_blocker_normal(&s->migration_blocker, errp);
    if (ret < 0) {
        goto fail_free_bmap;
    }
@@ -520,11 +520,10 @@ static int vdi_reopen_prepare(BDRVReopenState *state,
    return 0;
 }

-static int coroutine_fn vdi_co_block_status(BlockDriverState *bs,
-                                            bool want_zero,
-                                            int64_t offset, int64_t bytes,
-                                            int64_t *pnum, int64_t *map,
-                                            BlockDriverState **file)
+static int coroutine_fn GRAPH_RDLOCK
+vdi_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
+                    int64_t bytes, int64_t *pnum, int64_t *map,
+                    BlockDriverState **file)
 {
    BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
    size_t bmap_index = offset / s->block_size;
@@ -990,7 +989,7 @@ static void vdi_close(BlockDriverState *bs)
    migrate_del_blocker(&s->migration_blocker);
 }

-static int vdi_has_zero_init(BlockDriverState *bs)
+static int GRAPH_RDLOCK vdi_has_zero_init(BlockDriverState *bs)
 {
    BDRVVdiState *s = bs->opaque;

--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -55,8 +55,9 @@ static const MSGUID zero_guid = { 0 };

 /* Allow peeking at the hdr entry at the beginning of the current
 * read index, without advancing the read index */
-static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
-                             VHDXLogEntryHeader *hdr)
+static int GRAPH_RDLOCK
+vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
+                  VHDXLogEntryHeader *hdr)
 {
    int ret = 0;
    uint64_t offset;
@@ -107,7 +108,7 @@ static int vhdx_log_inc_idx(uint32_t idx, uint64_t length)


 /* Reset the log to empty */
-static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s)
+static void GRAPH_RDLOCK vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s)
 {
    MSGUID guid = { 0 };
    s->log.read = s->log.write = 0;
@@ -127,9 +128,10 @@ static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s)
 * not modified.
 *
 * 0 is returned on success, -errno otherwise.  */
-static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log,
-                                 uint32_t *sectors_read, void *buffer,
-                                 uint32_t num_sectors, bool peek)
+static int GRAPH_RDLOCK
+vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log,
+                      uint32_t *sectors_read, void *buffer,
+                      uint32_t num_sectors, bool peek)
 {
    int ret = 0;
    uint64_t offset;
@@ -333,9 +335,9 @@ static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
 * will allocate all the space for buffer, which must be NULL when
 * passed into this function. Each descriptor will also be validated,
 * and error returned if any are invalid. */
-static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
-                              VHDXLogEntries *log, VHDXLogDescEntries **buffer,
-                              bool convert_endian)
+static int GRAPH_RDLOCK
+vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, VHDXLogEntries *log,
+                   VHDXLogDescEntries **buffer, bool convert_endian)
 {
    int ret = 0;
    uint32_t desc_sectors;
@@ -412,8 +414,9 @@ exit:
 * For a zero descriptor, it may describe multiple sectors to fill with zeroes.
 * In this case, it should be noted that zeroes are written to disk, and the
 * image file is not extended as a sparse file.  */
-static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
-                               VHDXLogDataSector *data)
+static int GRAPH_RDLOCK
+vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
+                    VHDXLogDataSector *data)
 {
    int ret = 0;
    uint64_t seq, file_offset;
@@ -484,8 +487,8 @@ exit:
 * file, and then set the log to 'empty' status once complete.
 *
 * The log entries should be validate prior to flushing */
-static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
-                          VHDXLogSequence *logs)
+static int GRAPH_RDLOCK
+vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, VHDXLogSequence *logs)
 {
    int ret = 0;
    int i;
@@ -584,9 +587,10 @@ exit:
    return ret;
 }

-static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
-                                   VHDXLogEntries *log, uint64_t seq,
-                                   bool *valid, VHDXLogEntryHeader *entry)
+static int GRAPH_RDLOCK
+vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
+                        VHDXLogEntries *log, uint64_t seq,
+                        bool *valid, VHDXLogEntryHeader *entry)
 {
    int ret = 0;
    VHDXLogEntryHeader hdr;
@@ -663,8 +667,8 @@ free_and_exit:
 /* Search through the log circular buffer, and find the valid, active
 * log sequence, if any exists
 * */
-static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s,
-                           VHDXLogSequence *logs)
+static int GRAPH_RDLOCK
+vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s, VHDXLogSequence *logs)
 {
    int ret = 0;
    uint32_t tail;
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -353,8 +353,9 @@ exit:
 *
 *  - non-current header is updated with largest sequence number
 */
-static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s,
-                              bool generate_data_write_guid, MSGUID *log_guid)
+static int GRAPH_RDLOCK
+vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s,
+                   bool generate_data_write_guid, MSGUID *log_guid)
 {
    int ret = 0;
    int hdr_idx = 0;
@@ -416,8 +417,8 @@ int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s,
 }

 /* opens the specified header block from the VHDX file header section */
-static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
-                              Error **errp)
+static void GRAPH_RDLOCK
+vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, Error **errp)
 {
    int ret;
    VHDXHeader *header1;
@@ -517,7 +518,8 @@ exit:
 }


-static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
+static int GRAPH_RDLOCK
+vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
 {
    int ret = 0;
    uint8_t *buffer;
@@ -634,7 +636,8 @@ fail:
 * Also, if the File Parameters indicate this is a differencing file,
 * we must also look for the Parent Locator metadata item.
 */
-static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
+static int GRAPH_RDLOCK
+vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
 {
    int ret = 0;
    uint8_t *buffer;
@@ -885,7 +888,8 @@ static void vhdx_calc_bat_entries(BDRVVHDXState *s)

 }

-static int vhdx_check_bat_entries(BlockDriverState *bs, int *errcnt)
+static int coroutine_mixed_fn GRAPH_RDLOCK
+vhdx_check_bat_entries(BlockDriverState *bs, int *errcnt)
 {
    BDRVVHDXState *s = bs->opaque;
    int64_t image_file_size = bdrv_getlength(bs->file->bs);
@@ -1096,7 +1100,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
    error_setg(&s->migration_blocker, "The vhdx format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
-    ret = migrate_add_blocker(&s->migration_blocker, errp);
+    ret = migrate_add_blocker_normal(&s->migration_blocker, errp);
    if (ret < 0) {
        goto fail;
    }
@@ -1695,7 +1699,7 @@ exit:
 *  Fixed images: default state of the BAT is fully populated, with
 *                file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT.
 */
-static int coroutine_fn
+static int coroutine_fn GRAPH_UNLOCKED
 vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
                uint64_t image_size, VHDXImageType type,
                bool use_zero_blocks, uint64_t file_offset,
@@ -1708,6 +1712,7 @@ vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
    uint64_t unused;
    int block_state;
    VHDXSectorInfo sinfo;
+    bool has_zero_init;

    assert(s->bat == NULL);

@@ -1737,9 +1742,13 @@ vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
        goto exit;
    }

+    bdrv_graph_co_rdlock();
+    has_zero_init = bdrv_has_zero_init(blk_bs(blk));
+    bdrv_graph_co_rdunlock();
+
    if (type == VHDX_TYPE_FIXED ||
                use_zero_blocks ||
-                bdrv_has_zero_init(blk_bs(blk)) == 0) {
+                has_zero_init == 0) {
        /* for a fixed file, the default BAT entry is not zero */
        s->bat = g_try_malloc0(length);
        if (length && s->bat == NULL) {
@@ -1782,7 +1791,7 @@ exit:
 * to create the BAT itself, we will also cause the BAT to be
 * created.
 */
-static int coroutine_fn
+static int coroutine_fn GRAPH_UNLOCKED
 vhdx_create_new_region_table(BlockBackend *blk, uint64_t image_size,
                             uint32_t block_size, uint32_t sector_size,
                             uint32_t log_size, bool use_zero_blocks,
@@ -2158,9 +2167,9 @@ fail:
 * r/w and any log has already been replayed, so there is nothing (currently)
 * for us to do here
 */
-static int coroutine_fn vhdx_co_check(BlockDriverState *bs,
-                                      BdrvCheckResult *result,
-                                      BdrvCheckMode fix)
+static int coroutine_fn GRAPH_RDLOCK
+vhdx_co_check(BlockDriverState *bs, BdrvCheckResult *result,
+              BdrvCheckMode fix)
 {
    BDRVVHDXState *s = bs->opaque;

@@ -2173,7 +2182,7 @@ static int coroutine_fn vhdx_co_check(BlockDriverState *bs,
    return 0;
 }

-static int vhdx_has_zero_init(BlockDriverState *bs)
+static int GRAPH_RDLOCK vhdx_has_zero_init(BlockDriverState *bs)
 {
    BDRVVHDXState *s = bs->opaque;
    int state;
--- a/block/vhdx.h
+++ b/block/vhdx.h
@@ -401,8 +401,9 @@ typedef struct BDRVVHDXState {

 void vhdx_guid_generate(MSGUID *guid);

-int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, bool rw,
-                        MSGUID *log_guid);
+int GRAPH_RDLOCK
+vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, bool rw,
+                    MSGUID *log_guid);

 uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset);
 uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
@@ -448,6 +449,8 @@ void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr);
 void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr);
 void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e);
 void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e);
-int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s);
+
+int GRAPH_RDLOCK
+vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s);

 #endif
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -300,7 +300,8 @@ static void vmdk_free_last_extent(BlockDriverState *bs)
 }

 /* Return -ve errno, or 0 on success and write CID into *pcid. */
-static int vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid)
+static int GRAPH_RDLOCK
+vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid)
 {
    char *desc;
    uint32_t cid;
@@ -380,7 +381,7 @@ out:
    return ret;
 }

-static int coroutine_fn vmdk_is_cid_valid(BlockDriverState *bs)
+static int coroutine_fn GRAPH_RDLOCK vmdk_is_cid_valid(BlockDriverState *bs)
 {
    BDRVVmdkState *s = bs->opaque;
    uint32_t cur_pcid;
@@ -415,6 +416,9 @@ static int vmdk_reopen_prepare(BDRVReopenState *state,
    BDRVVmdkReopenState *rs;
    int i;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    assert(state != NULL);
    assert(state->bs != NULL);
    assert(state->opaque == NULL);
@@ -451,6 +455,9 @@ static void vmdk_reopen_commit(BDRVReopenState *state)
    BDRVVmdkReopenState *rs = state->opaque;
    int i;

+    GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    for (i = 0; i < s->num_extents; i++) {
        if (rs->extents_using_bs_file[i]) {
            s->extents[i].file = state->bs->file;
@@ -465,7 +472,7 @@ static void vmdk_reopen_abort(BDRVReopenState *state)
    vmdk_reopen_clean(state);
 }

-static int vmdk_parent_open(BlockDriverState *bs)
+static int GRAPH_RDLOCK vmdk_parent_open(BlockDriverState *bs)
 {
    char *p_name;
    char *desc;
@@ -1386,7 +1393,7 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
    error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
-    ret = migrate_add_blocker(&s->migration_blocker, errp);
+    ret = migrate_add_blocker_normal(&s->migration_blocker, errp);
    if (ret < 0) {
        goto fail;
    }
@@ -2547,7 +2554,10 @@ vmdk_co_do_create(int64_t size,
            ret = -EINVAL;
            goto exit;
        }
+
+        bdrv_graph_co_rdlock();
        ret = vmdk_read_cid(blk_bs(backing), 0, &parent_cid);
+        bdrv_graph_co_rdunlock();
        blk_co_unref(backing);
        if (ret) {
            error_setg(errp, "Failed to read parent CID");
@@ -2894,7 +2904,7 @@ vmdk_co_get_allocated_file_size(BlockDriverState *bs)
    return ret;
 }

-static int vmdk_has_zero_init(BlockDriverState *bs)
+static int GRAPH_RDLOCK vmdk_has_zero_init(BlockDriverState *bs)
 {
    int i;
    BDRVVmdkState *s = bs->opaque;
@@ -3044,8 +3054,9 @@ vmdk_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

-static void vmdk_gather_child_options(BlockDriverState *bs, QDict *target,
-                                      bool backing_overridden)
+static void GRAPH_RDLOCK
+vmdk_gather_child_options(BlockDriverState *bs, QDict *target,
+                          bool backing_overridden)
 {
    /* No children but file and backing can be explicitly specified (TODO) */
    qdict_put(target, "file",
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -238,6 +238,8 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
        return ret;
    }

+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
    opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        ret = -EINVAL;
@@ -446,13 +448,11 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* Disable migration when VHD images are used */
-    bdrv_graph_rdlock_main_loop();
    error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
-    bdrv_graph_rdunlock_main_loop();

-    ret = migrate_add_blocker(&s->migration_blocker, errp);
+    ret = migrate_add_blocker_normal(&s->migration_blocker, errp);
    if (ret < 0) {
        goto fail;
    }
@@ -1170,7 +1170,7 @@ fail:
 }


-static int vpc_has_zero_init(BlockDriverState *bs)
+static int GRAPH_RDLOCK vpc_has_zero_init(BlockDriverState *bs)
 {
    BDRVVPCState *s = bs->opaque;

--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1268,7 +1268,7 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
                   "The vvfat (rw) format used by node '%s' "
                   "does not support live migration",
                   bdrv_get_device_or_node_name(bs));
-        ret = migrate_add_blocker(&s->migration_blocker, errp);
+        ret = migrate_add_blocker_normal(&s->migration_blocker, errp);
        if (ret < 0) {
            goto fail;
        }
--- a/blockdev.c
+++ b/blockdev.c
@@ -255,13 +255,13 @@ void drive_check_orphaned(void)
         * Ignore default drives, because we create certain default
         * drives unconditionally, then leave them unclaimed.  Not the
         * users fault.
-         * Ignore IF_VIRTIO, because it gets desugared into -device,
-         * so we can leave failing to -device.
+         * Ignore IF_VIRTIO or IF_XEN, because they get desugared into
+         * -device, so we can leave failing to -device.
         * Ignore IF_NONE, because leaving unclaimed IF_NONE remains
         * available for device_add is a feature.
         */
        if (dinfo->is_default || dinfo->type == IF_VIRTIO
-            || dinfo->type == IF_NONE) {
+            || dinfo->type == IF_XEN || dinfo->type == IF_NONE) {
            continue;
        }
        if (!blk_get_attached_dev(blk)) {
@@ -977,6 +977,15 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type,
        qemu_opt_set(devopts, "driver", "virtio-blk", &error_abort);
        qemu_opt_set(devopts, "drive", qdict_get_str(bs_opts, "id"),
                     &error_abort);
+    } else if (type == IF_XEN) {
+        QemuOpts *devopts;
+        devopts = qemu_opts_create(qemu_find_opts("device"), NULL, 0,
+                                   &error_abort);
+        qemu_opt_set(devopts, "driver",
+                     (media == MEDIA_CDROM) ? "xen-cdrom" : "xen-disk",
+                     &error_abort);
+        qemu_opt_set(devopts, "drive", qdict_get_str(bs_opts, "id"),
+                     &error_abort);
    }

    filename = qemu_opt_get(legacy_opts, "file");
@@ -1601,7 +1610,12 @@ static void external_snapshot_abort(void *opaque)
                aio_context_acquire(aio_context);
            }

+            bdrv_drained_begin(state->new_bs);
+            bdrv_graph_wrlock(state->old_bs);
            bdrv_replace_node(state->new_bs, state->old_bs, &error_abort);
+            bdrv_graph_wrunlock();
+            bdrv_drained_end(state->new_bs);
+
            bdrv_unref(state->old_bs); /* bdrv_replace_node() ref'ed old_bs */

            aio_context_release(aio_context);
@@ -1701,7 +1715,6 @@ static void drive_backup_action(DriveBackup *backup,
        bdrv_graph_rdunlock_main_loop();
        goto out;
    }
-    bdrv_graph_rdunlock_main_loop();

    flags = bs->open_flags | BDRV_O_RDWR;

@@ -1726,6 +1739,7 @@ static void drive_backup_action(DriveBackup *backup,
        flags |= BDRV_O_NO_BACKING;
        set_backing_hd = true;
    }
+    bdrv_graph_rdunlock_main_loop();

    size = bdrv_getlength(bs);
    if (size < 0) {
@@ -1737,10 +1751,10 @@ static void drive_backup_action(DriveBackup *backup,
        assert(format);
        if (source) {
            /* Implicit filters should not appear in the filename */
-            BlockDriverState *explicit_backing =
-                bdrv_skip_implicit_filters(source);
+            BlockDriverState *explicit_backing;

            bdrv_graph_rdlock_main_loop();
+            explicit_backing = bdrv_skip_implicit_filters(source);
            bdrv_refresh_filename(explicit_backing);
            bdrv_graph_rdunlock_main_loop();

@@ -2441,11 +2455,12 @@ void qmp_block_stream(const char *job_id, const char *device,
    aio_context = bdrv_get_aio_context(bs);
    aio_context_acquire(aio_context);

+    bdrv_graph_rdlock_main_loop();
    if (base) {
        base_bs = bdrv_find_backing_image(bs, base);
        if (base_bs == NULL) {
            error_setg(errp, "Can't find '%s' in the backing chain", base);
-            goto out;
+            goto out_rdlock;
        }
        assert(bdrv_get_aio_context(base_bs) == aio_context);
    }
@@ -2453,38 +2468,36 @@ void qmp_block_stream(const char *job_id, const char *device,
    if (base_node) {
        base_bs = bdrv_lookup_bs(NULL, base_node, errp);
        if (!base_bs) {
-            goto out;
+            goto out_rdlock;
        }
        if (bs == base_bs || !bdrv_chain_contains(bs, base_bs)) {
            error_setg(errp, "Node '%s' is not a backing image of '%s'",
                       base_node, device);
-            goto out;
+            goto out_rdlock;
        }
        assert(bdrv_get_aio_context(base_bs) == aio_context);

-        bdrv_graph_rdlock_main_loop();
        bdrv_refresh_filename(base_bs);
-        bdrv_graph_rdunlock_main_loop();
    }

    if (bottom) {
        bottom_bs = bdrv_lookup_bs(NULL, bottom, errp);
        if (!bottom_bs) {
-            goto out;
+            goto out_rdlock;
        }
        if (!bottom_bs->drv) {
            error_setg(errp, "Node '%s' is not open", bottom);
-            goto out;
+            goto out_rdlock;
        }
        if (bottom_bs->drv->is_filter) {
            error_setg(errp, "Node '%s' is a filter, use a non-filter node "
                       "as 'bottom'", bottom);
-            goto out;
+            goto out_rdlock;
        }
        if (!bdrv_chain_contains(bs, bottom_bs)) {
            error_setg(errp, "Node '%s' is not in a chain starting from '%s'",
                       bottom, device);
-            goto out;
+            goto out_rdlock;
        }
        assert(bdrv_get_aio_context(bottom_bs) == aio_context);
    }
@@ -2493,13 +2506,11 @@ void qmp_block_stream(const char *job_id, const char *device,
     * Check for op blockers in the whole chain between bs and base (or bottom)
     */
    iter_end = bottom ? bdrv_filter_or_cow_bs(bottom_bs) : base_bs;
-    bdrv_graph_rdlock_main_loop();
    for (iter = bs; iter && iter != iter_end;
         iter = bdrv_filter_or_cow_bs(iter))
    {
        if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_STREAM, errp)) {
-            bdrv_graph_rdunlock_main_loop();
-            goto out;
+            goto out_rdlock;
        }
    }
    bdrv_graph_rdunlock_main_loop();
@@ -2531,6 +2542,11 @@ void qmp_block_stream(const char *job_id, const char *device,

 out:
    aio_context_release(aio_context);
+    return;
+
+out_rdlock:
+    bdrv_graph_rdunlock_main_loop();
+    aio_context_release(aio_context);
 }

 void qmp_block_commit(const char *job_id, const char *device,
@@ -2968,6 +2984,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,

    if (replaces) {
        BlockDriverState *to_replace_bs;
+        AioContext *aio_context;
        AioContext *replace_aio_context;
        int64_t bs_size, replace_size;

@@ -2982,10 +2999,19 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
            return;
        }

+        aio_context = bdrv_get_aio_context(bs);
        replace_aio_context = bdrv_get_aio_context(to_replace_bs);
-        aio_context_acquire(replace_aio_context);
+        /*
+         * bdrv_getlength() is a co-wrapper and uses AIO_WAIT_WHILE. Be sure not
+         * to acquire the same AioContext twice.
+         */
+        if (replace_aio_context != aio_context) {
+            aio_context_acquire(replace_aio_context);
+        }
        replace_size = bdrv_getlength(to_replace_bs);
-        aio_context_release(replace_aio_context);
+        if (replace_aio_context != aio_context) {
+            aio_context_release(replace_aio_context);
+        }

        if (replace_size < 0) {
            error_setg_errno(errp, -replace_size,
@@ -3035,7 +3061,6 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
        bdrv_graph_rdunlock_main_loop();
        return;
    }
-    bdrv_graph_rdunlock_main_loop();

    aio_context = bdrv_get_aio_context(bs);
    aio_context_acquire(aio_context);
@@ -3057,6 +3082,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
    if (arg->sync == MIRROR_SYNC_MODE_NONE) {
        target_backing_bs = bs;
    }
+    bdrv_graph_rdunlock_main_loop();

    size = bdrv_getlength(bs);
    if (size < 0) {
@@ -3089,16 +3115,18 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
        bdrv_img_create(arg->target, format,
                        NULL, NULL, NULL, size, flags, false, &local_err);
    } else {
-        /* Implicit filters should not appear in the filename */
-        BlockDriverState *explicit_backing =
-            bdrv_skip_implicit_filters(target_backing_bs);
+        BlockDriverState *explicit_backing;

        switch (arg->mode) {
        case NEW_IMAGE_MODE_EXISTING:
            break;
        case NEW_IMAGE_MODE_ABSOLUTE_PATHS:
-            /* create new image with backing file */
+            /*
+             * Create new image with backing file.
+             * Implicit filters should not appear in the filename.
+             */
            bdrv_graph_rdlock_main_loop();
+            explicit_backing = bdrv_skip_implicit_filters(target_backing_bs);
            bdrv_refresh_filename(explicit_backing);
            bdrv_graph_rdunlock_main_loop();

@@ -3137,9 +3165,11 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
        return;
    }

+    bdrv_graph_rdlock_main_loop();
    zero_target = (arg->sync == MIRROR_SYNC_MODE_FULL &&
                   (arg->mode == NEW_IMAGE_MODE_EXISTING ||
                    !bdrv_has_zero_init(target_bs)));
+    bdrv_graph_rdunlock_main_loop();


    /* Honor bdrv_try_change_aio_context() context acquisition requirements. */
@@ -3382,6 +3412,20 @@ void qmp_block_job_dismiss(const char *id, Error **errp)
    job_dismiss_locked(&job, errp);
 }

+void qmp_block_job_change(BlockJobChangeOptions *opts, Error **errp)
+{
+    BlockJob *job;
+
+    JOB_LOCK_GUARD(); /* NOTE(review): presumably held until return */
+    job = find_block_job_locked(opts->id, errp); /* lookup by job id */
+
+    if (!job) {
+        return; /* no job returned; errp is not touched here */
+    }
+    /* Forward opts/errp for that job; the callee is a *_locked function. */
+    block_job_change_locked(job, opts, errp);
+}
+
 void qmp_change_backing_file(const char *device,
                             const char *image_node_name,
                             const char *backing_file,
@@ -3402,38 +3446,38 @@ void qmp_change_backing_file(const char *device,
    aio_context = bdrv_get_aio_context(bs);
    aio_context_acquire(aio_context);

+    bdrv_graph_rdlock_main_loop();
+
    image_bs = bdrv_lookup_bs(NULL, image_node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
-        goto out;
+        goto out_rdlock;
    }

    if (!image_bs) {
        error_setg(errp, "image file not found");
-        goto out;
+        goto out_rdlock;
    }

    if (bdrv_find_base(image_bs) == image_bs) {
        error_setg(errp, "not allowing backing file change on an image "
                         "without a backing file");
-        goto out;
+        goto out_rdlock;
    }

    /* even though we are not necessarily operating on bs, we need it to
     * determine if block ops are currently prohibited on the chain */
-    bdrv_graph_rdlock_main_loop();
    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_CHANGE, errp)) {
-        bdrv_graph_rdunlock_main_loop();
-        goto out;
+        goto out_rdlock;
    }
-    bdrv_graph_rdunlock_main_loop();

    /* final sanity check */
    if (!bdrv_chain_contains(bs, image_bs)) {
        error_setg(errp, "'%s' and image file are not in the same chain",
                   device);
-        goto out;
+        goto out_rdlock;
    }
+    bdrv_graph_rdunlock_main_loop();

    /* if not r/w, reopen to make r/w */
    ro = bdrv_is_read_only(image_bs);
@@ -3461,6 +3505,11 @@ void qmp_change_backing_file(const char *device,

 out:
    aio_context_release(aio_context);
+    return;
+
+out_rdlock:
+    bdrv_graph_rdunlock_main_loop();
+    aio_context_release(aio_context);
 }

 void qmp_blockdev_add(BlockdevOptions *options, Error **errp)
--- a/blockjob.c
+++ b/blockjob.c
@@ -198,7 +198,9 @@ void block_job_remove_all_bdrv(BlockJob *job)
     * one to make sure that such a concurrent access does not attempt
     * to process an already freed BdrvChild.
     */
+    aio_context_release(job->job.aio_context);
    bdrv_graph_wrlock(NULL);
+    aio_context_acquire(job->job.aio_context);
    while (job->nodes) {
        GSList *l = job->nodes;
        BdrvChild *c = l->data;
@@ -328,6 +330,26 @@ static bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
    return block_job_set_speed_locked(job, speed, errp);
 }

+void block_job_change_locked(BlockJob *job, BlockJobChangeOptions *opts,
+                             Error **errp)
+{
+    const BlockJobDriver *drv = block_job_driver(job);
+
+    GLOBAL_STATE_CODE();
+    /* Stop early if job_apply_verb_locked() returns non-zero for CHANGE. */
+    if (job_apply_verb_locked(&job->job, JOB_VERB_CHANGE, errp)) {
+        return;
+    }
+    /* The optional driver hook is called with the job lock dropped. */
+    if (drv->change) {
+        job_unlock();
+        drv->change(job, opts, errp);
+        job_lock(); /* retake the lock before returning to the caller */
+    } else {
+        error_setg(errp, "Job type does not support change");
+    }
+}
+
 void block_job_ratelimit_processed_bytes(BlockJob *job, uint64_t n)
 {
    IO_CODE();
@@ -356,6 +378,7 @@ BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp)
 {
    BlockJobInfo *info;
    uint64_t progress_current, progress_total;
+    const BlockJobDriver *drv = block_job_driver(job);

    GLOBAL_STATE_CODE();

@@ -368,7 +391,7 @@ BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp)
                          &progress_total);

    info = g_new0(BlockJobInfo, 1);
-    info->type      = g_strdup(job_type_str(&job->job));
+    info->type      = job_type(&job->job);
    info->device    = g_strdup(job->job.id);
    info->busy      = job->job.busy;
    info->paused    = job->job.pause_count > 0;
@@ -385,6 +408,11 @@ BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp)
                        g_strdup(error_get_pretty(job->job.err)) :
                        g_strdup(strerror(-job->job.ret));
    }
+    if (drv->query) {
+        job_unlock();
+        drv->query(job, info);
+        job_lock();
+    }
    return info;
 }

@@ -485,7 +513,8 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
    BlockJob *job;
    int ret;
    GLOBAL_STATE_CODE();
-    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
+    bdrv_graph_wrlock(bs);

    if (job_id == NULL && !(flags & JOB_INTERNAL)) {
        job_id = bdrv_get_device_name(bs);
@@ -494,6 +523,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
    job = job_create(job_id, &driver->job_driver, txn, bdrv_get_aio_context(bs),
                     flags, cb, opaque, errp);
    if (job == NULL) {
+        bdrv_graph_wrunlock();
        return NULL;
    }

@@ -533,9 +563,11 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
        goto fail;
    }

+    bdrv_graph_wrunlock();
    return job;

 fail:
+    bdrv_graph_wrunlock();
    job_early_fail(&job->job);
    return NULL;
 }
--- a/bsd-user/arm/target_arch.h
+++ b/bsd-user/arm/target_arch.h
@@ -21,6 +21,7 @@
 #define TARGET_ARCH_H

 #include "qemu.h"
+#include "target/arm/cpu-features.h"

 void target_cpu_set_tls(CPUARMState *env, target_ulong newtls);
 target_ulong target_cpu_get_tls(CPUARMState *env);
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -118,7 +118,7 @@ void fork_end(int child)
         */
        CPU_FOREACH_SAFE(cpu, next_cpu) {
            if (cpu != thread_cpu) {
-                QTAILQ_REMOVE_RCU(&cpus, cpu, node);
+                QTAILQ_REMOVE_RCU(&cpus_queue, cpu, node);
            }
        }
        mmap_fork_end(child);
--- a/configs/devices/ppc-softmmu/default.mak
+++ b/configs/devices/ppc-softmmu/default.mak
@@ -14,6 +14,7 @@ CONFIG_SAM460EX=y
 CONFIG_MAC_OLDWORLD=y
 CONFIG_MAC_NEWWORLD=y

+CONFIG_AMIGAONE=y
 CONFIG_PEGASOS2=y

 # For PReP
--- a/configs/targets/hppa-linux-user.mak
+++ b/configs/targets/hppa-linux-user.mak
@@ -1,4 +1,5 @@
 TARGET_ARCH=hppa
+TARGET_ABI32=y
 TARGET_SYSTBL_ABI=common,32
 TARGET_SYSTBL=syscall.tbl
 TARGET_BIG_ENDIAN=y
--- a/configs/targets/loongarch64-linux-user.mak
+++ b/configs/targets/loongarch64-linux-user.mak
@@ -1,3 +1,4 @@
 # Default configuration for loongarch64-linux-user
 TARGET_ARCH=loongarch64
 TARGET_BASE_ARCH=loongarch
+TARGET_XML_FILES=gdb-xml/loongarch-base64.xml gdb-xml/loongarch-fpu.xml
--- a/51
+++ b/51
@@ -309,6 +309,7 @@ fi
 ar="${AR-${cross_prefix}ar}"
 as="${AS-${cross_prefix}as}"
 ccas="${CCAS-$cc}"
+dlltool="${DLLTOOL-${cross_prefix}dlltool}"
 objcopy="${OBJCOPY-${cross_prefix}objcopy}"
 ld="${LD-${cross_prefix}ld}"
 ranlib="${RANLIB-${cross_prefix}ranlib}"
@@ -1010,9 +1011,9 @@ if test "$targetos" = "bogus"; then
 fi

 # test for any invalid configuration combinations
-if test "$targetos" = "windows"; then
+if test "$targetos" = "windows" && ! has "$dlltool"; then
  if test "$plugins" = "yes"; then
-    error_exit "TCG plugins not currently supported on Windows platforms"
+    error_exit "TCG plugins requires dlltool to build on Windows platforms"
  fi
  plugins="no"
 fi
@@ -1281,6 +1282,11 @@ probe_target_compiler() {
        container_cross_prefix=aarch64-linux-gnu-
        container_cross_cc=${container_cross_prefix}gcc
        ;;
+      alpha)
+        container_image=debian-legacy-test-cross
+        container_cross_prefix=alpha-linux-gnu-
+        container_cross_cc=${container_cross_prefix}gcc
+        ;;
      arm)
        # We don't have any bigendian build tools so we only use this for ARM
        container_image=debian-armhf-cross
@@ -1295,6 +1301,11 @@ probe_target_compiler() {
        container_cross_prefix=hexagon-unknown-linux-musl-
        container_cross_cc=${container_cross_prefix}clang
        ;;
+      hppa)
+        container_image=debian-all-test-cross
+        container_cross_prefix=hppa-linux-gnu-
+        container_cross_cc=${container_cross_prefix}gcc
+        ;;
      i386)
        container_image=fedora-i386-cross
        container_cross_prefix=
@@ -1303,6 +1314,11 @@ probe_target_compiler() {
        container_image=debian-loongarch-cross
        container_cross_prefix=loongarch64-unknown-linux-gnu-
        ;;
+      m68k)
+        container_image=debian-all-test-cross
+        container_cross_prefix=m68k-linux-gnu-
+        container_cross_cc=${container_cross_prefix}gcc
+        ;;
      microblaze)
        container_image=debian-microblaze-cross
        container_cross_prefix=microblaze-linux-musl-
@@ -1312,22 +1328,37 @@ probe_target_compiler() {
        container_cross_prefix=mips64el-linux-gnuabi64-
        ;;
      mips64)
-        container_image=debian-mips64-cross
+        container_image=debian-all-test-cross
        container_cross_prefix=mips64-linux-gnuabi64-
        ;;
+      mips)
+        container_image=debian-all-test-cross
+        container_cross_prefix=mips-linux-gnu-
+        ;;
      nios2)
        container_image=debian-nios2-cross
        container_cross_prefix=nios2-linux-gnu-
        ;;
      ppc)
-        container_image=debian-powerpc-test-cross
+        container_image=debian-all-test-cross
        container_cross_prefix=powerpc-linux-gnu-
        container_cross_cc=${container_cross_prefix}gcc
        ;;
      ppc64|ppc64le)
-        container_image=debian-powerpc-test-cross
+        container_image=debian-all-test-cross
        container_cross_prefix=powerpc${target_arch#ppc}-linux-gnu-
-        container_cross_cc=${container_cross_prefix}gcc-10
+        ;;
+      riscv64)
+        container_image=debian-all-test-cross
+        container_cross_prefix=riscv64-linux-gnu-
+        ;;
+      sh4)
+        container_image=debian-legacy-test-cross
+        container_cross_prefix=sh4-linux-gnu-
+        ;;
+      sparc64)
+        container_image=debian-all-test-cross
+        container_cross_prefix=sparc64-linux-gnu-
        ;;
      tricore)
        container_image=debian-tricore-cross
@@ -1629,9 +1660,15 @@ echo "SRC_PATH=$source_path/contrib/plugins" >> contrib/plugins/$config_host_mak
 echo "PKG_CONFIG=${pkg_config}" >> contrib/plugins/$config_host_mak
 echo "CC=$cc $CPU_CFLAGS" >> contrib/plugins/$config_host_mak
 echo "CFLAGS=${CFLAGS-$default_cflags} $EXTRA_CFLAGS" >> contrib/plugins/$config_host_mak
+if test "$targetos" = windows; then
+  echo "DLLTOOL=$dlltool" >> contrib/plugins/$config_host_mak
+fi
 if test "$targetos" = darwin; then
  echo "CONFIG_DARWIN=y" >> contrib/plugins/$config_host_mak
 fi
+if test "$targetos" = windows; then
+  echo "CONFIG_WIN32=y" >> contrib/plugins/$config_host_mak
+fi

 # tests/tcg configuration
 (config_host_mak=tests/tcg/config-host.mak
@@ -1734,6 +1771,7 @@ if test "$skip_meson" = no; then
  test -n "$cxx" && echo "cpp = [$(meson_quote $cxx $CPU_CFLAGS)]" >> $cross
  test -n "$objcc" && echo "objc = [$(meson_quote $objcc $CPU_CFLAGS)]" >> $cross
  echo "ar = [$(meson_quote $ar)]" >> $cross
+  echo "dlltool = [$(meson_quote $dlltool)]" >> $cross
  echo "nm = [$(meson_quote $nm)]" >> $cross
  echo "pkgconfig = [$(meson_quote $pkg_config)]" >> $cross
  echo "pkg-config = [$(meson_quote $pkg_config)]" >> $cross
@@ -1839,6 +1877,7 @@ preserve_env CC
 preserve_env CFLAGS
 preserve_env CXX
 preserve_env CXXFLAGS
+preserve_env DLLTOOL
 preserve_env LD
 preserve_env LDFLAGS
 preserve_env LD_LIBRARY_PATH
--- a/contrib/gitdm/domain-map
+++ b/contrib/gitdm/domain-map
@@ -12,15 +12,18 @@ amd.com         AMD
 aspeedtech.com  ASPEED Technology Inc.
 baidu.com       Baidu
 bytedance.com   ByteDance
+cestc.cn        Cestc
 cmss.chinamobile.com China Mobile
 citrix.com      Citrix
 crudebyte.com   Crudebyte
 chinatelecom.cn China Telecom
+daynix.com      Daynix
 eldorado.org.br Instituto de Pesquisas Eldorado
 fb.com          Facebook
 fujitsu.com     Fujitsu
 google.com      Google
 greensocs.com   GreenSocs
+hisilicon.com   Huawei
 huawei.com      Huawei
 ibm.com         IBM
 igalia.com      Igalia
@@ -38,6 +41,7 @@ proxmox.com     Proxmox
 quicinc.com     Qualcomm Innovation Center
 redhat.com      Red Hat
 rev.ng          rev.ng Labs
+rivosinc.com    Rivos Inc
 rt-rk.com       RT-RK
 samsung.com     Samsung
 siemens.com     Siemens
--- a/contrib/plugins/Makefile
+++ b/contrib/plugins/Makefile
@@ -17,12 +17,25 @@ NAMES += execlog
 NAMES += hotblocks
 NAMES += hotpages
 NAMES += howvec
+
+# The lockstep example communicates using unix sockets,
+# and can't be easily made to work on Windows.
+ifneq ($(CONFIG_WIN32),y)
 NAMES += lockstep
+endif
+
 NAMES += hwprofile
 NAMES += cache
 NAMES += drcov

-SONAMES := $(addsuffix .so,$(addprefix lib,$(NAMES)))
+ifeq ($(CONFIG_WIN32),y)
+SO_SUFFIX := .dll
+LDLIBS += $(shell $(PKG_CONFIG) --libs glib-2.0)
+else
+SO_SUFFIX := .so
+endif
+
+SONAMES := $(addsuffix $(SO_SUFFIX),$(addprefix lib,$(NAMES)))

 # The main QEMU uses Glib extensively so it's perfectly fine to use it
 # in plugins (which many example do).
@@ -35,15 +48,20 @@ all: $(SONAMES)
 %.o: %.c
 	$(CC) $(CFLAGS) $(PLUGIN_CFLAGS) -c -o $@ $<

-lib%.so: %.o
-ifeq ($(CONFIG_DARWIN),y)
+ifeq ($(CONFIG_WIN32),y)
+lib%$(SO_SUFFIX): %.o win32_linker.o ../../plugins/qemu_plugin_api.lib
+	$(CC) -shared -o $@ $^ $(LDLIBS)
+else ifeq ($(CONFIG_DARWIN),y)
+lib%$(SO_SUFFIX): %.o
 	$(CC) -bundle -Wl,-undefined,dynamic_lookup -o $@ $^ $(LDLIBS)
 else
+lib%$(SO_SUFFIX): %.o
 	$(CC) -shared -o $@ $^ $(LDLIBS)
 endif

+
 clean:
-	rm -f *.o *.so *.d
+	rm -f *.o *$(SO_SUFFIX) *.d
 	rm -Rf .libs

 .PHONY: all clean
--- a/contrib/plugins/lockstep.c
+++ b/contrib/plugins/lockstep.c
@@ -257,6 +257,7 @@ static bool setup_socket(const char *path)
    sockaddr.sun_family = AF_UNIX;
    if (g_strlcpy(sockaddr.sun_path, path, pathlen) >= pathlen) {
        perror("bad path");
+        close(fd);
        return false;
    }

@@ -303,6 +304,7 @@ static bool connect_socket(const char *path)
    sockaddr.sun_family = AF_UNIX;
    if (g_strlcpy(sockaddr.sun_path, path, pathlen) >= pathlen) {
        perror("bad path");
+        close(fd);
        return false;
    }

--- a/contrib/plugins/win32_linker.c
+++ b/contrib/plugins/win32_linker.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2023, Greg Manning <gmanning@rapitasystems.com>
+ *
+ * This hook, __pfnDliFailureHook2, is documented in the Microsoft documentation here:
+ * https://learn.microsoft.com/en-us/cpp/build/reference/error-handling-and-notification
+ * It gets called when a delay-loaded DLL encounters various errors.
+ * We handle the specific case of a DLL looking for a "qemu.exe",
+ * and give it the running executable (regardless of what it is named).
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include <windows.h>
+#include <delayimp.h>
+
+FARPROC WINAPI dll_failure_hook(unsigned dliNotify, PDelayLoadInfo pdli);
+
+
+PfnDliHook __pfnDliFailureHook2 = dll_failure_hook;
+
+FARPROC WINAPI dll_failure_hook(unsigned dliNotify, PDelayLoadInfo pdli) {
+    if (dliNotify == dliFailLoadLib) {
+        /* If the failing request was for qemu.exe, ... */
+        if (strcmp(pdli->szDll, "qemu.exe") == 0) {
+            /* Then pass back a pointer to the top level module. */
+            HMODULE top = GetModuleHandle(NULL);
+            return (FARPROC) top;
+        }
+    }
+    /* Otherwise we can't do anything special. */
+    return 0;
+}
+
--- a/cpu-common.c
+++ b/cpu-common.c
@@ -73,7 +73,7 @@ static int cpu_get_free_index(void)
    return max_cpu_index;
 }

-CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
+CPUTailQ cpus_queue = QTAILQ_HEAD_INITIALIZER(cpus_queue);
 static unsigned int cpu_list_generation_id;

 unsigned int cpu_list_generation_id_get(void)
@@ -90,7 +90,7 @@ void cpu_list_add(CPUState *cpu)
    } else {
        assert(!cpu_index_auto_assigned);
    }
-    QTAILQ_INSERT_TAIL_RCU(&cpus, cpu, node);
+    QTAILQ_INSERT_TAIL_RCU(&cpus_queue, cpu, node);
    cpu_list_generation_id++;
 }

@@ -102,7 +102,7 @@ void cpu_list_remove(CPUState *cpu)
        return;
    }

-    QTAILQ_REMOVE_RCU(&cpus, cpu, node);
+    QTAILQ_REMOVE_RCU(&cpus_queue, cpu, node);
    cpu->cpu_index = UNASSIGNED_CPU_INDEX;
    cpu_list_generation_id++;
 }
--- a/cpu-target.c
+++ b/cpu-target.c
@@ -42,7 +42,6 @@
 #include "hw/core/accel-cpu.h"
 #include "trace/trace-root.h"
 #include "qemu/accel.h"
-#include "qemu/plugin.h"

 uintptr_t qemu_host_page_size;
 intptr_t qemu_host_page_mask;
@@ -131,23 +130,18 @@ const VMStateDescription vmstate_cpu_common = {
 };
 #endif

-void cpu_exec_realizefn(CPUState *cpu, Error **errp)
+bool cpu_exec_realizefn(CPUState *cpu, Error **errp)
 {
    /* cache the cpu class for the hotpath */
    cpu->cc = CPU_GET_CLASS(cpu);

    if (!accel_cpu_common_realize(cpu, errp)) {
-        return;
+        return false;
    }

    /* Wait until cpu initialization complete before exposing cpu. */
    cpu_list_add(cpu);

-    /* Plugin initialization must wait until cpu_index assigned. */
-    if (tcg_enabled()) {
-        qemu_plugin_vcpu_init_hook(cpu);
-    }
-
 #ifdef CONFIG_USER_ONLY
    assert(qdev_get_vmsd(DEVICE(cpu)) == NULL ||
           qdev_get_vmsd(DEVICE(cpu))->unmigratable);
@@ -159,6 +153,8 @@ void cpu_exec_realizefn(CPUState *cpu, Error **errp)
        vmstate_register(NULL, cpu->cpu_index, cpu->cc->sysemu_ops->legacy_vmsd, cpu);
    }
 #endif /* CONFIG_USER_ONLY */
+
+    return true;
 }

 void cpu_exec_unrealizefn(CPUState *cpu)
@@ -174,11 +170,6 @@ void cpu_exec_unrealizefn(CPUState *cpu)
    }
 #endif

-    /* Call the plugin hook before clearing cpu->cpu_index in cpu_list_remove */
-    if (tcg_enabled()) {
-        qemu_plugin_vcpu_exit_hook(cpu);
-    }
-
    cpu_list_remove(cpu);
    /*
     * Now that the vCPU has been removed from the RCU list, we can call
--- a/crypto/rsakey-builtin.c.inc
+++ b/crypto/rsakey-builtin.c.inc
@@ -88,15 +88,13 @@ static QCryptoAkCipherRSAKey *qcrypto_builtin_rsa_public_key_parse(
        goto error;
    }
    if (seq_length != 0) {
+        error_setg(errp, "Invalid RSA public key");
        goto error;
    }

    return rsa;

 error:
-    if (errp && !*errp) {
-        error_setg(errp, "Invalid RSA public key");
-    }
    qcrypto_akcipher_rsakey_free(rsa);
    return NULL;
 }
@@ -169,15 +167,13 @@ static QCryptoAkCipherRSAKey *qcrypto_builtin_rsa_private_key_parse(
        return rsa;
    }
    if (seq_length != 0) {
+        error_setg(errp, "Invalid RSA private key");
        goto error;
    }

    return rsa;

 error:
-    if (errp && !*errp) {
-        error_setg(errp, "Invalid RSA private key");
-    }
    qcrypto_akcipher_rsakey_free(rsa);
    return NULL;
 }
--- a/disas/riscv.c
+++ b/disas/riscv.c
@@ -862,6 +862,47 @@ typedef enum {
    rv_op_fltq_q = 831,
    rv_op_fleq_h = 832,
    rv_op_fltq_h = 833,
+    rv_op_vaesdf_vv = 834,
+    rv_op_vaesdf_vs = 835,
+    rv_op_vaesdm_vv = 836,
+    rv_op_vaesdm_vs = 837,
+    rv_op_vaesef_vv = 838,
+    rv_op_vaesef_vs = 839,
+    rv_op_vaesem_vv = 840,
+    rv_op_vaesem_vs = 841,
+    rv_op_vaeskf1_vi = 842,
+    rv_op_vaeskf2_vi = 843,
+    rv_op_vaesz_vs = 844,
+    rv_op_vandn_vv = 845,
+    rv_op_vandn_vx = 846,
+    rv_op_vbrev_v = 847,
+    rv_op_vbrev8_v = 848,
+    rv_op_vclmul_vv = 849,
+    rv_op_vclmul_vx = 850,
+    rv_op_vclmulh_vv = 851,
+    rv_op_vclmulh_vx = 852,
+    rv_op_vclz_v = 853,
+    rv_op_vcpop_v = 854,
+    rv_op_vctz_v = 855,
+    rv_op_vghsh_vv = 856,
+    rv_op_vgmul_vv = 857,
+    rv_op_vrev8_v = 858,
+    rv_op_vrol_vv = 859,
+    rv_op_vrol_vx = 860,
+    rv_op_vror_vv = 861,
+    rv_op_vror_vx = 862,
+    rv_op_vror_vi = 863,
+    rv_op_vsha2ch_vv = 864,
+    rv_op_vsha2cl_vv = 865,
+    rv_op_vsha2ms_vv = 866,
+    rv_op_vsm3c_vi = 867,
+    rv_op_vsm3me_vv = 868,
+    rv_op_vsm4k_vi = 869,
+    rv_op_vsm4r_vv = 870,
+    rv_op_vsm4r_vs = 871,
+    rv_op_vwsll_vv = 872,
+    rv_op_vwsll_vx = 873,
+    rv_op_vwsll_vi = 874,
 } rv_op;

 /* register names */
@@ -2008,6 +2049,47 @@ const rv_opcode_data rvi_opcode_data[] = {
    { "fltq.q", rv_codec_r, rv_fmt_rd_frs1_frs2, NULL, 0, 0, 0 },
    { "fleq.h", rv_codec_r, rv_fmt_rd_frs1_frs2, NULL, 0, 0, 0 },
    { "fltq.h", rv_codec_r, rv_fmt_rd_frs1_frs2, NULL, 0, 0, 0 },
+    { "vaesdf.vv", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vaesdf.vs", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vaesdm.vv", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vaesdm.vs", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vaesef.vv", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vaesef.vs", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vaesem.vv", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vaesem.vs", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vaeskf1.vi", rv_codec_v_i, rv_fmt_vd_vs2_uimm, NULL, 0, 0, 0 },
+    { "vaeskf2.vi", rv_codec_v_i, rv_fmt_vd_vs2_uimm, NULL, 0, 0, 0 },
+    { "vaesz.vs", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vandn.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1_vm, NULL, 0, 0, 0 },
+    { "vandn.vx", rv_codec_v_r, rv_fmt_vd_vs2_rs1_vm, NULL, 0, 0, 0 },
+    { "vbrev.v", rv_codec_v_r, rv_fmt_vd_vs2_vm, NULL, 0, 0, 0 },
+    { "vbrev8.v", rv_codec_v_r, rv_fmt_vd_vs2_vm, NULL, 0, 0, 0 },
+    { "vclmul.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1_vm, NULL, 0, 0, 0 },
+    { "vclmul.vx", rv_codec_v_r, rv_fmt_vd_vs2_rs1_vm, NULL, 0, 0, 0 },
+    { "vclmulh.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1_vm, NULL, 0, 0, 0 },
+    { "vclmulh.vx", rv_codec_v_r, rv_fmt_vd_vs2_rs1_vm, NULL, 0, 0, 0 },
+    { "vclz.v", rv_codec_v_r, rv_fmt_vd_vs2_vm, NULL, 0, 0, 0 },
+    { "vcpop.v", rv_codec_v_r, rv_fmt_vd_vs2_vm, NULL, 0, 0, 0 },
+    { "vctz.v", rv_codec_v_r, rv_fmt_vd_vs2_vm, NULL, 0, 0, 0 },
+    { "vghsh.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1, NULL, 0, 0, 0 },
+    { "vgmul.vv", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vrev8.v", rv_codec_v_r, rv_fmt_vd_vs2_vm, NULL, 0, 0, 0 },
+    { "vrol.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1_vm, NULL, 0, 0, 0 },
+    { "vrol.vx", rv_codec_v_r, rv_fmt_vd_vs2_rs1_vm, NULL, 0, 0, 0 },
+    { "vror.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1_vm, NULL, 0, 0, 0 },
+    { "vror.vx", rv_codec_v_r, rv_fmt_vd_vs2_rs1_vm, NULL, 0, 0, 0 },
+    { "vror.vi", rv_codec_vror_vi, rv_fmt_vd_vs2_uimm_vm, NULL, 0, 0, 0 },
+    { "vsha2ch.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1, NULL, 0, 0, 0 },
+    { "vsha2cl.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1, NULL, 0, 0, 0 },
+    { "vsha2ms.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1, NULL, 0, 0, 0 },
+    { "vsm3c.vi", rv_codec_v_i, rv_fmt_vd_vs2_uimm, NULL, 0, 0, 0 },
+    { "vsm3me.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1, NULL, 0, 0, 0 },
+    { "vsm4k.vi", rv_codec_v_i, rv_fmt_vd_vs2_uimm, NULL, 0, 0, 0 },
+    { "vsm4r.vv", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vsm4r.vs", rv_codec_v_r, rv_fmt_vd_vs2, NULL, 0, 0, 0 },
+    { "vwsll.vv", rv_codec_v_r, rv_fmt_vd_vs2_vs1_vm, NULL, 0, 0, 0 },
+    { "vwsll.vx", rv_codec_v_r, rv_fmt_vd_vs2_rs1_vm, NULL, 0, 0, 0 },
+    { "vwsll.vi", rv_codec_v_i, rv_fmt_vd_vs2_uimm_vm, NULL, 0, 0, 0 },
 };

 /* CSR names */
@@ -3054,12 +3136,12 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                }
                break;
            case 89:
-		switch (((inst >> 12) & 0b111)) {
+                switch (((inst >> 12) & 0b111)) {
                case 0: op = rv_op_fmvp_d_x; break;
                }
                break;
            case 91:
-		switch (((inst >> 12) & 0b111)) {
+                switch (((inst >> 12) & 0b111)) {
                case 0: op = rv_op_fmvp_q_x; break;
                }
                break;
@@ -3176,6 +3258,7 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
            case 0:
                switch ((inst >> 26) & 0b111111) {
                case 0: op = rv_op_vadd_vv; break;
+                case 1: op = rv_op_vandn_vv; break;
                case 2: op = rv_op_vsub_vv; break;
                case 4: op = rv_op_vminu_vv; break;
                case 5: op = rv_op_vmin_vv; break;
@@ -3198,6 +3281,8 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                    }
                    break;
                case 19: op = rv_op_vmsbc_vvm; break;
+                case 20: op = rv_op_vror_vv; break;
+                case 21: op = rv_op_vrol_vv; break;
                case 23:
                    if (((inst >> 20) & 0b111111) == 32)
                        op = rv_op_vmv_v_v;
@@ -3226,6 +3311,7 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                case 47: op = rv_op_vnclip_wv; break;
                case 48: op = rv_op_vwredsumu_vs; break;
                case 49: op = rv_op_vwredsum_vs; break;
+                case 53: op = rv_op_vwsll_vv; break;
                }
                break;
            case 1:
@@ -3323,6 +3409,8 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                case 9: op = rv_op_vaadd_vv; break;
                case 10: op = rv_op_vasubu_vv; break;
                case 11: op = rv_op_vasub_vv; break;
+                case 12: op = rv_op_vclmul_vv; break;
+                case 13: op = rv_op_vclmulh_vv; break;
                case 16:
                    switch ((inst >> 15) & 0b11111) {
                    case 0: if ((inst >> 25) & 1) op = rv_op_vmv_x_s; break;
@@ -3338,6 +3426,12 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                    case 5: op = rv_op_vsext_vf4; break;
                    case 6: op = rv_op_vzext_vf2; break;
                    case 7: op = rv_op_vsext_vf2; break;
+                    case 8: op = rv_op_vbrev8_v; break;
+                    case 9: op = rv_op_vrev8_v; break;
+                    case 10: op = rv_op_vbrev_v; break;
+                    case 12: op = rv_op_vclz_v; break;
+                    case 13: op = rv_op_vctz_v; break;
+                    case 14: op = rv_op_vcpop_v; break;
                    }
                    break;
                case 20:
@@ -3406,6 +3500,7 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                    }
                    break;
                case 17: op = rv_op_vmadc_vim; break;
+                case 20: case 21: op = rv_op_vror_vi; break;
                case 23:
                    if (((inst >> 20) & 0b111111) == 32)
                        op = rv_op_vmv_v_i;
@@ -3437,11 +3532,13 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                case 45: op = rv_op_vnsra_wi; break;
                case 46: op = rv_op_vnclipu_wi; break;
                case 47: op = rv_op_vnclip_wi; break;
+                case 53: op = rv_op_vwsll_vi; break;
                }
                break;
            case 4:
                switch ((inst >> 26) & 0b111111) {
                case 0: op = rv_op_vadd_vx; break;
+                case 1: op = rv_op_vandn_vx; break;
                case 2: op = rv_op_vsub_vx; break;
                case 3: op = rv_op_vrsub_vx; break;
                case 4: op = rv_op_vminu_vx; break;
@@ -3466,6 +3563,8 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                    }
                    break;
                case 19: op = rv_op_vmsbc_vxm; break;
+                case 20: op = rv_op_vror_vx; break;
+                case 21: op = rv_op_vrol_vx; break;
                case 23:
                    if (((inst >> 20) & 0b111111) == 32)
                        op = rv_op_vmv_v_x;
@@ -3494,6 +3593,7 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                case 45: op = rv_op_vnsra_wx; break;
                case 46: op = rv_op_vnclipu_wx; break;
                case 47: op = rv_op_vnclip_wx; break;
+                case 53: op = rv_op_vwsll_vx; break;
                }
                break;
            case 5:
@@ -3554,6 +3654,8 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
                case 9: op = rv_op_vaadd_vx; break;
                case 10: op = rv_op_vasubu_vx; break;
                case 11: op = rv_op_vasub_vx; break;
+                case 12: op = rv_op_vclmul_vx; break;
+                case 13: op = rv_op_vclmulh_vx; break;
                case 14: op = rv_op_vslide1up_vx; break;
                case 15: op = rv_op_vslide1down_vx; break;
                case 16:
@@ -3686,6 +3788,41 @@ static void decode_inst_opcode(rv_decode *dec, rv_isa isa)
            case 7: op = rv_op_csrrci; break;
            }
            break;
+        case 29:
+            if (((inst >> 25) & 1) == 1 && ((inst >> 12) & 0b111) == 2) {
+                switch ((inst >> 26) & 0b111111) {
+                case 32: op = rv_op_vsm3me_vv; break;
+                case 33: op = rv_op_vsm4k_vi; break;
+                case 34: op = rv_op_vaeskf1_vi; break;
+                case 40:
+                    switch ((inst >> 15) & 0b11111) {
+                    case 0: op = rv_op_vaesdm_vv; break;
+                    case 1: op = rv_op_vaesdf_vv; break;
+                    case 2: op = rv_op_vaesem_vv; break;
+                    case 3: op = rv_op_vaesef_vv; break;
+                    case 16: op = rv_op_vsm4r_vv; break;
+                    case 17: op = rv_op_vgmul_vv; break;
+                    }
+                    break;
+                case 41:
+                    switch ((inst >> 15) & 0b11111) {
+                    case 0: op = rv_op_vaesdm_vs; break;
+                    case 1: op = rv_op_vaesdf_vs; break;
+                    case 2: op = rv_op_vaesem_vs; break;
+                    case 3: op = rv_op_vaesef_vs; break;
+                    case 7: op = rv_op_vaesz_vs; break;
+                    case 16: op = rv_op_vsm4r_vs; break;
+                    }
+                    break;
+                case 42: op = rv_op_vaeskf2_vi; break;
+                case 43: op = rv_op_vsm3c_vi; break;
+                case 44: op = rv_op_vghsh_vv; break;
+                case 45: op = rv_op_vsha2ms_vv; break;
+                case 46: op = rv_op_vsha2ch_vv; break;
+                case 47: op = rv_op_vsha2cl_vv; break;
+                }
+            }
+            break;
        case 30:
            switch (((inst >> 22) & 0b1111111000) |
                    ((inst >> 12) & 0b0000000111)) {
@@ -4011,6 +4148,12 @@ static uint32_t operand_vzimm10(rv_inst inst)
    return (inst << 34) >> 54;
 }

+static uint32_t operand_vzimm6(rv_inst inst)
+{
+    return ((inst << 37) >> 63) << 5 |
+        ((inst << 44) >> 59);
+}
+
 static uint32_t operand_bs(rv_inst inst)
 {
    return (inst << 32) >> 62;
@@ -4393,6 +4536,12 @@ static void decode_inst_operands(rv_decode *dec, rv_isa isa)
        dec->imm = operand_vimm(inst);
        dec->vm = operand_vm(inst);
        break;
+    case rv_codec_vror_vi:
+        dec->rd = operand_rd(inst);
+        dec->rs2 = operand_rs2(inst);
+        dec->imm = operand_vzimm6(inst);
+        dec->vm = operand_vm(inst);
+        break;
    case rv_codec_vsetvli:
        dec->rd = operand_rd(inst);
        dec->rs1 = operand_rs1(inst);
@@ -4430,7 +4579,7 @@ static void decode_inst_operands(rv_decode *dec, rv_isa isa)
        break;
    case rv_codec_zcmt_jt:
        dec->imm = operand_tbl_index(inst);
-	break;
+        break;
    case rv_codec_fli:
        dec->rd = operand_rd(inst);
        dec->imm = operand_rs1(inst);
@@ -4677,7 +4826,7 @@ static void format_inst(char *buf, size_t buflen, size_t tab, rv_decode *dec)
            append(buf, tmp, buflen);
            break;
        case 'u':
-            snprintf(tmp, sizeof(tmp), "%u", ((uint32_t)dec->imm & 0b11111));
+            snprintf(tmp, sizeof(tmp), "%u", ((uint32_t)dec->imm & 0b111111));
            append(buf, tmp, buflen);
            break;
        case 'j':
--- a/disas/riscv.h
+++ b/disas/riscv.h
@@ -152,6 +152,7 @@ typedef enum {
    rv_codec_v_i,
    rv_codec_vsetvli,
    rv_codec_vsetivli,
+    rv_codec_vror_vi,
    rv_codec_zcb_ext,
    rv_codec_zcb_mul,
    rv_codec_zcb_lb,
@@ -274,6 +275,7 @@ enum {
 #define rv_fmt_vd_vs2_fs1_vm          "O\tD,F,4m"
 #define rv_fmt_vd_vs2_imm_vl          "O\tD,F,il"
 #define rv_fmt_vd_vs2_imm_vm          "O\tD,F,im"
+#define rv_fmt_vd_vs2_uimm            "O\tD,F,u"
 #define rv_fmt_vd_vs2_uimm_vm         "O\tD,F,um"
 #define rv_fmt_vd_vs1_vs2_vm          "O\tD,E,Fm"
 #define rv_fmt_vd_rs1_vs2_vm          "O\tD,1,Fm"
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -413,6 +413,18 @@ Specifying the iSCSI password in plain text on the command line using the
 used instead, to refer to a ``--object secret...`` instance that provides
 a password via a file, or encrypted.

+CPU device properties
+'''''''''''''''''''''
+
+``pmu-num=n`` on RISC-V CPUs (since 8.2)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In order to support more flexible counter configurations this has been replaced
+by a ``pmu-mask`` property. If the set of counters is contiguous, the mask can
+be calculated with ``((2 ^ n) - 1) << 3`` (``^`` denoting exponentiation). The
+least significant three bits must be left clear.
+
+
 Backwards compatibility
 -----------------------

--- a/docs/devel/index-api.rst
+++ b/docs/devel/index-api.rst
@@ -11,6 +11,7 @@ generated from in-code annotations to function prototypes.
   loads-stores
   memory
   modules
+   pci
   qom-api
   qdev-api
   ui
--- a/docs/devel/migration.rst
+++ b/docs/devel/migration.rst
@@ -167,13 +167,17 @@ An example (from hw/input/pckbd.c)
      }
  };

-We are declaring the state with name "pckbd".
-The ``version_id`` is 3, and the fields are 4 uint8_t in a KBDState structure.
-We registered this with:
+We are declaring the state with name "pckbd".  The ``version_id`` is
+3, and there are 4 uint8_t fields in the KBDState structure.  We
+registered this ``VMStateDescription`` with one of the following
+functions.  The first one will generate a device ``instance_id``
+different for each registration.  Use the second one if you already
+have an id that is different for each instance of the device:

 .. code:: c

-    vmstate_register(NULL, 0, &vmstate_kbd, s);
+    vmstate_register_any(NULL, &vmstate_kbd, s);
+    vmstate_register(NULL, instance_id, &vmstate_kbd, s);

 For devices that are ``qdev`` based, we can register the device in the class
 init function:
@@ -590,6 +594,77 @@ path.
     Return path  - opened by main thread, written by main thread AND postcopy
     thread (protected by rp_mutex)

+Dirty limit
+=====================
+The dirty limit, short for dirty page rate upper limit, is a new capability
+introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM
+dirty ring to throttle down the guest during live migration.
+
+The algorithm framework is as follows:
+
+::
+
+  ------------------------------------------------------------------------------
+  main   --------------> throttle thread ------------> PREPARE(1) <--------
+  thread  \                                                |              |
+           \                                               |              |
+            \                                              V              |
+             -\                                        CALCULATE(2)       |
+               \                                           |              |
+                \                                          |              |
+                 \                                         V              |
+                  \                                    SET PENALTY(3) -----
+                   -\                                      |
+                     \                                     |
+                      \                                    V
+                       -> virtual CPU thread -------> ACCEPT PENALTY(4)
+  ------------------------------------------------------------------------------
+
+When the qmp command qmp_set_vcpu_dirty_limit is called for the first time,
+the QEMU main thread starts the throttle thread. The throttle thread, once
+launched, executes the loop, which consists of three steps:
+
+  - PREPARE (1)
+
+     The entire work of PREPARE (1) is preparation for the second stage,
+     CALCULATE(2), as the name implies. It involves preparing the dirty
+     page rate value and the corresponding upper limit of the VM:
+     The dirty page rate is calculated via the KVM dirty ring mechanism,
+     which tells QEMU how many dirty pages a virtual CPU has had since the
+     last KVM_EXIT_DIRTY_RING_FULL exception; the dirty page rate upper
+     limit is specified by the caller, so it is fetched directly.
+
+  - CALCULATE (2)
+
+     Calculate a suitable sleep period for each virtual CPU, which will be
+     used to determine the penalty for the target virtual CPU. The
+     computation must be done carefully in order to reduce the dirty page
+     rate progressively down to the upper limit without oscillation. To
+     achieve this, two strategies are provided: the first is to add or
+     subtract sleep time based on the ratio of the current dirty page rate
+     to the limit, which is used when the current dirty page rate is far
+     from the limit; the second is to add or subtract a fixed time when
+     the current dirty page rate is close to the limit.
+
+  - SET PENALTY (3)
+
+     Set the sleep time for each virtual CPU that should be penalized based
+     on the results of the calculation supplied by step CALCULATE (2).
+
+After completing the three above stages, the throttle thread loops back
+to step PREPARE (1) until the dirty limit is reached.
+
+On the other hand, each virtual CPU thread reads the sleep duration and
+sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that
+is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will
+obviously exit to the path and get penalized, whereas virtual CPUs involved
+with read processes will not.
+
+In summary, thanks to the KVM dirty ring technology, the dirty limit
+algorithm will restrict virtual CPUs as needed to keep their dirty page
+rate inside the limit. This leads to more steady reading performance during
+live migration and can aid in improving large guest responsiveness.
+
 Postcopy
 ========

--- a/docs/devel/pci.rst
+++ b/docs/devel/pci.rst
@@ -0,0 +1,8 @@
+=============
+PCI subsystem
+=============
+
+API Reference
+-------------
+
+.. kernel-doc:: include/hw/pci/pci.h
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -108,6 +108,43 @@ A vring state description

 :num: a 32-bit number

+A vring descriptor index for split virtqueues
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++-------------+---------------------+
+| vring index | index in avail ring |
++-------------+---------------------+
+
+:vring index: 32-bit index of the respective virtqueue
+
+:index in avail ring: 32-bit value, of which currently only the lower 16
+  bits are used:
+
+  - Bits 0–15: Index of the next *Available Ring* descriptor that the
+    back-end will process.  This is a free-running index that is not
+    wrapped by the ring size.
+  - Bits 16–31: Reserved (set to zero)
+
+Vring descriptor indices for packed virtqueues
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++-------------+--------------------+
+| vring index | descriptor indices |
++-------------+--------------------+
+
+:vring index: 32-bit index of the respective virtqueue
+
+:descriptor indices: 32-bit value:
+
+  - Bits 0–14: Index of the next *Available Ring* descriptor that the
+    back-end will process.  This is a free-running index that is not
+    wrapped by the ring size.
+  - Bit 15: Driver (Available) Ring Wrap Counter
+  - Bits 16–30: Index of the entry in the *Used Ring* where the back-end
+    will place the next descriptor.  This is a free-running index that
+    is not wrapped by the ring size.
+  - Bit 31: Device (Used) Ring Wrap Counter
+
 A vring address description
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -285,6 +322,32 @@ VhostUserShared
 :UUID: 16 bytes UUID, whose first three components (a 32-bit value, then
  two 16-bit values) are stored in big endian.

+Device state transfer parameters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------+-----------------+
+| transfer direction | migration phase |
++--------------------+-----------------+
+
+:transfer direction: a 32-bit enum, describing the direction in which
+  the state is transferred:
+
+  - 0: Save: Transfer the state from the back-end to the front-end,
+    which happens on the source side of migration
+  - 1: Load: Transfer the state from the front-end to the back-end,
+    which happens on the destination side of migration
+
+:migration phase: a 32-bit enum, describing the state in which the VM
+  guest and devices are:
+
+  - 0: Stopped (in the period after the transfer of memory-mapped
+    regions before switch-over to the destination): The VM guest is
+    stopped, and the vhost-user device is suspended (see
+    :ref:`Suspended device state <suspended_device_state>`).
+
+  In the future, additional phases might be added e.g. to allow
+  iterative migration while the device is running.
+
 C structure
 -----------

@@ -344,6 +407,7 @@ in the ancillary data:
 * ``VHOST_USER_SET_VRING_ERR``
 * ``VHOST_USER_SET_BACKEND_REQ_FD`` (previous name ``VHOST_USER_SET_SLAVE_REQ_FD``)
 * ``VHOST_USER_SET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``)
+* ``VHOST_USER_SET_DEVICE_STATE_FD``

 If *front-end* is unable to send the full message or receives a wrong
 reply it will close the connection. An optional reconnection mechanism
@@ -374,35 +438,50 @@ negotiation.
 Ring states
 -----------

-Rings can be in one of three states:
+Rings have two independent states: started/stopped, and enabled/disabled.

-* stopped: the back-end must not process the ring at all.
+* While a ring is stopped, the back-end must not process the ring at
+  all, regardless of whether it is enabled or disabled.  The
+  enabled/disabled state should still be tracked, though, so it can come
+  into effect once the ring is started.

-* started but disabled: the back-end must process the ring without
+* started and disabled: The back-end must process the ring without
  causing any side effects.  For example, for a networking device,
  in the disabled state the back-end must not supply any new RX packets,
  but must process and discard any TX packets.

-* started and enabled.
+* started and enabled: The back-end must process the ring normally, i.e.
+  process all requests and execute them.

-Each ring is initialized in a stopped state.  The back-end must start
-ring upon receiving a kick (that is, detecting that file descriptor is
-readable) on the descriptor specified by ``VHOST_USER_SET_VRING_KICK``
-or receiving the in-band message ``VHOST_USER_VRING_KICK`` if negotiated,
-and stop ring upon receiving ``VHOST_USER_GET_VRING_BASE``.
+Each ring is initialized in a stopped and disabled state.  The back-end
+must start a ring upon receiving a kick (that is, detecting that file
+descriptor is readable) on the descriptor specified by
+``VHOST_USER_SET_VRING_KICK`` or receiving the in-band message
+``VHOST_USER_VRING_KICK`` if negotiated, and stop a ring upon receiving
+``VHOST_USER_GET_VRING_BASE``.

 Rings can be enabled or disabled by ``VHOST_USER_SET_VRING_ENABLE``.

-If ``VHOST_USER_F_PROTOCOL_FEATURES`` has not been negotiated, the
-ring starts directly in the enabled state.
-
-If ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, the ring is
-initialized in a disabled state and is enabled by
-``VHOST_USER_SET_VRING_ENABLE`` with parameter 1.
+In addition, upon receiving a ``VHOST_USER_SET_FEATURES`` message from
+the front-end without ``VHOST_USER_F_PROTOCOL_FEATURES`` set, the
+back-end must enable all rings immediately.

 While processing the rings (whether they are enabled or not), the back-end
 must support changing some configuration aspects on the fly.

+.. _suspended_device_state:
+
+Suspended device state
+^^^^^^^^^^^^^^^^^^^^^^
+
+While all vrings are stopped, the device is *suspended*.  In addition to
+not processing any vring (because they are stopped), the device must:
+
+* not write to any guest memory regions,
+* not send any notifications to the guest,
+* not send any messages to the front-end,
+* still process and reply to messages from the front-end.
+
 Multiple queue support
 ----------------------

@@ -490,7 +569,8 @@ ancillary data, it may be used to inform the front-end that the log has
 been modified.

 Once the source has finished migration, rings will be stopped by the
-source. No further update must be done before rings are restarted.
+source (:ref:`Suspended device state <suspended_device_state>`). No
+further update must be done before rings are restarted.

 In postcopy migration the back-end is started before all the memory has
 been received from the source host, and care must be taken to avoid
@@ -502,6 +582,80 @@ it performs WAKE ioctl's on the userfaultfd to wake the stalled
 back-end.  The front-end indicates support for this via the
 ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` feature.

+.. _migrating_backend_state:
+
+Migrating back-end state
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Migrating device state involves transferring the state from one
+back-end, called the source, to another back-end, called the
+destination.  After migration, the destination transparently resumes
+operation without requiring the driver to re-initialize the device at
+the VIRTIO level.  If the migration fails, then the source can
+transparently resume operation until another migration attempt is made.
+
+Generally, the front-end is connected to a virtual machine guest (which
+contains the driver), which has its own state to transfer between source
+and destination, and therefore will have an implementation-specific
+mechanism to do so.  The ``VHOST_USER_PROTOCOL_F_DEVICE_STATE`` feature
+provides functionality to have the front-end include the back-end's
+state in this transfer operation so the back-end does not need to
+implement its own mechanism, and so the virtual machine may have its
+complete state, including vhost-user devices' states, contained within a
+single stream of data.
+
+To do this, the back-end state is transferred from back-end to front-end
+on the source side, and vice versa on the destination side.  This
+transfer happens over a channel that is negotiated using the
+``VHOST_USER_SET_DEVICE_STATE_FD`` message.  This message has two
+parameters:
+
+* Direction of transfer: On the source, the data is saved, transferring
+  it from the back-end to the front-end.  On the destination, the data
+  is loaded, transferring it from the front-end to the back-end.
+
+* Migration phase: Currently, the only supported phase is the period
+  after the transfer of memory-mapped regions before switch-over to the
+  destination, when both the source and destination devices are
+  suspended (:ref:`Suspended device state <suspended_device_state>`).
+  In the future, additional phases might be supported to allow iterative
+  migration while the device is running.
+
+The nature of the channel is implementation-defined, but it must
+generally behave like a pipe: The writing end will write all the data it
+has into it, signalling the end of data by closing its end.  The reading
+end must read all of this data (until encountering the end of file) and
+process it.
+
+* When saving, the writing end is the source back-end, and the reading
+  end is the source front-end.  After reading the state data from the
+  channel, the source front-end must transfer it to the destination
+  front-end through an implementation-defined mechanism.
+
+* When loading, the writing end is the destination front-end, and the
+  reading end is the destination back-end.  After reading the state data
+  from the channel, the destination back-end must deserialize its
+  internal state from that data and set itself up to allow the driver to
+  seamlessly resume operation on the VIRTIO level.
+
+Seamlessly resuming operation means that the migration must be
+transparent to the guest driver, which operates on the VIRTIO level.
+This driver will not perform any re-initialization steps, but continue
+to use the device as if no migration had occurred.  The vhost-user
+front-end, however, will re-initialize the vhost state on the
+destination, following the usual protocol for establishing a connection
+to a vhost-user back-end: This includes, for example, setting up memory
+mappings and kick and call FDs as necessary, negotiating protocol
+features, or setting the initial vring base indices (to the same value
+as on the source side, so that operation can resume).
+
+Both on the source and on the destination side, after the respective
+front-end has seen all data transferred (when the transfer FD has been
+closed), it sends the ``VHOST_USER_CHECK_DEVICE_STATE`` message to
+verify that data transfer was successful in the back-end, too.  The
+back-end responds once it knows whether the transfer and processing were
+successful or not.
+
 Memory access
 -------------

@@ -896,6 +1050,7 @@ Protocol features
  #define VHOST_USER_PROTOCOL_F_STATUS               16
  #define VHOST_USER_PROTOCOL_F_XEN_MMAP             17
  #define VHOST_USER_PROTOCOL_F_SHARED_OBJECT        18
+  #define VHOST_USER_PROTOCOL_F_DEVICE_STATE         19

 Front-end message types
 -----------------------
@@ -1042,18 +1197,54 @@ Front-end message types
 ``VHOST_USER_SET_VRING_BASE``
  :id: 10
  :equivalent ioctl: ``VHOST_SET_VRING_BASE``
-  :request payload: vring state description
+  :request payload: vring descriptor index/indices
  :reply payload: N/A

-  Sets the base offset in the available vring.
+  Sets the next index to use for descriptors in this vring:
+
+  * For a split virtqueue, sets only the next descriptor index to
+    process in the *Available Ring*.  The device is supposed to read the
+    next index in the *Used Ring* from the respective vring structure in
+    guest memory.
+
+  * For a packed virtqueue, both indices are supplied, as they are not
+    explicitly available in memory.
+
+  Consequently, the payload type is specific to the type of virtqueue
+  (*a vring descriptor index for split virtqueues* vs. *vring descriptor
+  indices for packed virtqueues*).

 ``VHOST_USER_GET_VRING_BASE``
  :id: 11
  :equivalent ioctl: ``VHOST_USER_GET_VRING_BASE``
  :request payload: vring state description
-  :reply payload: vring state description
+  :reply payload: vring descriptor index/indices

-  Get the available vring base offset.
+  Stops the vring and returns the current descriptor index or indices:
+
+    * For a split virtqueue, returns only the 16-bit next descriptor
+      index to process in the *Available Ring*.  Note that this may
+      differ from the available ring index in the vring structure in
+      memory, which points to where the driver will put new available
+      descriptors.  For the *Used Ring*, the device only needs the next
+      descriptor index at which to put new descriptors, which is the
+      value in the vring structure in memory, so this value is not
+      covered by this message.
+
+    * For a packed virtqueue, neither index is explicitly available to
+      read from memory, so both indices (as maintained by the device) are
+      returned.
+
+  Consequently, the payload type is specific to the type of virtqueue
+  (*a vring descriptor index for split virtqueues* vs. *vring descriptor
+  indices for packed virtqueues*).
+
+  When and as long as all of a device’s vrings are stopped, it is
+  *suspended*, see :ref:`Suspended device state
+  <suspended_device_state>`.
+
+  The request payload’s *num* field is currently reserved and must be
+  set to 0.

 ``VHOST_USER_SET_VRING_KICK``
  :id: 12
@@ -1464,6 +1655,76 @@ Front-end message types
  the requested UUID. Back-end will reply passing the fd when the operation
  is successful, or no fd otherwise.

+``VHOST_USER_SET_DEVICE_STATE_FD``
+  :id: 42
+  :equivalent ioctl: N/A
+  :request payload: device state transfer parameters
+  :reply payload: ``u64``
+
+  Front-end and back-end negotiate a channel over which to transfer the
+  back-end’s internal state during migration.  Either side (front-end or
+  back-end) may create the channel.  The nature of this channel is not
+  restricted or defined in this document, but whichever side creates it
+  must create a file descriptor that is provided to the respective
+  other side, allowing access to the channel.  This FD must behave as
+  follows:
+
+  * For the writing end, it must allow writing the whole back-end state
+    sequentially.  Closing the file descriptor signals the end of
+    transfer.
+
+  * For the reading end, it must allow reading the whole back-end state
+    sequentially.  The end of file signals the end of the transfer.
+
+  For example, the channel may be a pipe, in which case the two ends of
+  the pipe fulfill these requirements respectively.
+
+  Initially, the front-end creates a channel along with such an FD.  It
+  passes the FD to the back-end as ancillary data of a
+  ``VHOST_USER_SET_DEVICE_STATE_FD`` message.  The back-end may create a
+  different transfer channel, passing the respective FD back to the
+  front-end as ancillary data of the reply.  If so, the front-end must
+  then discard its channel and use the one provided by the back-end.
+
+  Whether the back-end should use its own channel is decided
+  based on efficiency: If the channel is a pipe, both ends will most
+  likely need to copy data into and out of it.  Any channel that allows
+  for more efficient processing on at least one end, e.g. through
+  zero-copy, is considered more efficient and thus preferred.  If the
+  back-end can provide such a channel, it should decide to use it.
+
+  The request payload contains parameters for the subsequent data
+  transfer, as described in the :ref:`Migrating back-end state
+  <migrating_backend_state>` section.
+
+  The value returned is both an indication for success, and whether a
+  file descriptor for a back-end-provided channel is returned: Bits 0–7
+  are 0 on success, and non-zero on error.  Bit 8 is the invalid FD
+  flag; this flag is set when there is no file descriptor returned.
+  When this flag is not set, the front-end must use the returned file
+  descriptor as its end of the transfer channel.  The back-end must not
+  both indicate an error and return a file descriptor.
+
+  Using this function requires prior negotiation of the
+  ``VHOST_USER_PROTOCOL_F_DEVICE_STATE`` feature.
+
+``VHOST_USER_CHECK_DEVICE_STATE``
+  :id: 43
+  :equivalent ioctl: N/A
+  :request payload: N/A
+  :reply payload: ``u64``
+
+  After transferring the back-end’s internal state during migration (see
+  the :ref:`Migrating back-end state <migrating_backend_state>`
+  section), check whether the back-end was able to successfully fully
+  process the state.
+
+  The value returned indicates success or error; 0 is success, any
+  non-zero value is an error.
+
+  Using this function requires prior negotiation of the
+  ``VHOST_USER_PROTOCOL_F_DEVICE_STATE`` feature.
+
 Back-end message types
 ----------------------

--- a/docs/specs/edu.rst
+++ b/docs/specs/edu.rst
@@ -2,9 +2,10 @@
 EDU device
 ==========

-Copyright (c) 2014-2015 Jiri Slaby
+..
+   Copyright (c) 2014-2015 Jiri Slaby

-This document is licensed under the GPLv2 (or later).
+   This document is licensed under the GPLv2 (or later).

 This is an educational device for writing (kernel) drivers. Its original
 intention was to support the Linux kernel lectures taught at the Masaryk
@@ -15,10 +16,11 @@ The devices behaves very similar to the PCI bridge present in the COMBO6 cards
 developed under the Liberouter wings. Both PCI device ID and PCI space is
 inherited from that device.

-Command line switches:
-    -device edu[,dma_mask=mask]
+Command line switches
+---------------------

-    dma_mask makes the virtual device work with DMA addresses with the given
+``-device edu[,dma_mask=mask]``
+    ``dma_mask`` makes the virtual device work with DMA addresses with the given
    mask. For educational purposes, the device supports only 28 bits (256 MiB)
    by default. Students shall set dma_mask for the device in the OS driver
    properly.
@@ -26,7 +28,8 @@ Command line switches:
 PCI specs
 ---------

-PCI ID: 1234:11e8
+PCI ID:
+   ``1234:11e8``

 PCI Region 0:
   I/O memory, 1 MB in size. Users are supposed to communicate with the card
@@ -35,24 +38,29 @@ PCI Region 0:
 MMIO area spec
 --------------

-Only size == 4 accesses are allowed for addresses < 0x80. size == 4 or
-size == 8 for the rest.
+Only ``size == 4`` accesses are allowed for addresses ``< 0x80``.
+``size == 4`` or ``size == 8`` for the rest.

-0x00 (RO) : identification (0xRRrr00edu)
-	    RR -- major version
-	    rr -- minor version
+0x00 (RO) : identification
+            Value is in the form ``0xRRrr00edu`` where:
+	    - ``RR`` -- major version
+	    - ``rr`` -- minor version

 0x04 (RW) : card liveness check
-	    It is a simple value inversion (~ C operator).
+	    It is a simple value inversion (``~`` C operator).

 0x08 (RW) : factorial computation
 	    The stored value is taken and factorial of it is put back here.
 	    This happens only after factorial bit in the status register (0x20
 	    below) is cleared.

-0x20 (RW) : status register, bitwise OR
-	    0x01 -- computing factorial (RO)
-	    0x80 -- raise interrupt after finishing factorial computation
+0x20 (RW) : status register
+            Bitwise OR of:
+
+            0x01
+              computing factorial (RO)
+	    0x80
+              raise interrupt after finishing factorial computation

 0x24 (RO) : interrupt status register
 	    It contains values which raised the interrupt (see interrupt raise
@@ -76,13 +84,19 @@ size == 8 for the rest.
 0x90 (RW) : DMA transfer count
 	    The size of the area to perform the DMA on.

-0x98 (RW) : DMA command register, bitwise OR
-	    0x01 -- start transfer
-	    0x02 -- direction (0: from RAM to EDU, 1: from EDU to RAM)
-	    0x04 -- raise interrupt 0x100 after finishing the DMA
+0x98 (RW) : DMA command register
+            Bitwise OR of:
+
+            0x01
+              start transfer
+	    0x02
+              direction (0: from RAM to EDU, 1: from EDU to RAM)
+	    0x04
+              raise interrupt 0x100 after finishing the DMA

 IRQ controller
 --------------
+
 An IRQ is generated when written to the interrupt raise register. The value
 appears in interrupt status register when the interrupt is raised and has to
 be written to the interrupt acknowledge register to lower it.
@@ -94,22 +108,28 @@ routine.

 DMA controller
 --------------
+
 One has to specify, source, destination, size, and start the transfer. One
 4096 bytes long buffer at offset 0x40000 is available in the EDU device. I.e.
 one can perform DMA to/from this space when programmed properly.

 Example of transferring a 100 byte block to and from the buffer using a given
-PCI address 'addr':
-addr     -> DMA source address
-0x40000  -> DMA destination address
-100      -> DMA transfer count
-1        -> DMA command register
-while (DMA command register & 1)
-	;
+PCI address ``addr``:

-0x40000  -> DMA source address
-addr+100 -> DMA destination address
-100      -> DMA transfer count
-3        -> DMA command register
-while (DMA command register & 1)
-	;
+::
+
+  addr     -> DMA source address
+  0x40000  -> DMA destination address
+  100      -> DMA transfer count
+  1        -> DMA command register
+  while (DMA command register & 1)
+      ;
+
+::
+
+  0x40000  -> DMA source address
+  addr+100 -> DMA destination address
+  100      -> DMA transfer count
+  3        -> DMA command register
+  while (DMA command register & 1)
+      ;
--- a/docs/specs/index.rst
+++ b/docs/specs/index.rst
@@ -24,3 +24,11 @@ guest hardware that is specific to QEMU.
   acpi_erst
   sev-guest-firmware
   fw_cfg
+   vmw_pvscsi-spec
+   edu
+   ivshmem-spec
+   pvpanic
+   standard-vga
+   virt-ctlr
+   vmcoreinfo
+   vmgenid
--- a/docs/specs/ivshmem-spec.rst
+++ b/docs/specs/ivshmem-spec.rst
@@ -1,4 +1,6 @@
-= Device Specification for Inter-VM shared memory device =
+======================================================
+Device Specification for Inter-VM shared memory device
+======================================================

 The Inter-VM shared memory device (ivshmem) is designed to share a
 memory region between multiple QEMU processes running different guests
@@ -12,42 +14,17 @@ can obtain one from an ivshmem server.
 In the latter case, the device can additionally interrupt its peers, and
 get interrupted by its peers.

+For information on configuring the ivshmem device on the QEMU
+command line, see :doc:`../system/devices/ivshmem`.

-== Configuring the ivshmem PCI device ==
-
-There are two basic configurations:
-
- Just shared memory:
-
-      -device ivshmem-plain,memdev=HMB,...
-
-  This uses host memory backend HMB.  It should have option "share"
-  set.
-
- Shared memory plus interrupts:
-
-      -device ivshmem-doorbell,chardev=CHR,vectors=N,...
-
-  An ivshmem server must already be running on the host.  The device
-  connects to the server's UNIX domain socket via character device
-  CHR.
-
-  Each peer gets assigned a unique ID by the server.  IDs must be
-  between 0 and 65535.
-
-  Interrupts are message-signaled (MSI-X).  vectors=N configures the
-  number of vectors to use.
-
-For more details on ivshmem device properties, see the QEMU Emulator
-user documentation.
-
-
-== The ivshmem PCI device's guest interface ==
+The ivshmem PCI device's guest interface
+========================================

 The device has vendor ID 1af4, device ID 1110, revision 1.  Before
 QEMU 2.6.0, it had revision 0.

-=== PCI BARs ===
+PCI BARs
+--------

 The ivshmem PCI device has two or three BARs:

@@ -59,8 +36,7 @@ There are two ways to use this device:

 - If you only need the shared memory part, BAR2 suffices.  This way,
  you have access to the shared memory in the guest and can use it as
-  you see fit.  Memnic, for example, uses ivshmem this way from guest
-  user space (see http://dpdk.org/browse/memnic).
+  you see fit.

 - If you additionally need the capability for peers to interrupt each
  other, you need BAR0 and BAR1.  You will most likely want to write a
@@ -77,10 +53,13 @@ accessing BAR2.
 Revision 0 of the device is not capable to tell guest software whether
 it is configured for interrupts.

-=== PCI device registers ===
+PCI device registers
+--------------------

 BAR 0 contains the following registers:

+::
+
    Offset  Size  Access      On reset  Function
        0     4   read/write        0   Interrupt Mask
                                        bit 0: peer interrupt (rev 0)
@@ -145,18 +124,20 @@ With multiple MSI-X vectors, different vectors can be used to indicate
 different events have occurred.  The semantics of interrupt vectors
 are left to the application.

-
-== Interrupt infrastructure ==
+Interrupt infrastructure
+========================

 When configured for interrupts, the peers share eventfd objects in
 addition to shared memory.  The shared resources are managed by an
 ivshmem server.

-=== The ivshmem server ===
+The ivshmem server
+------------------

 The server listens on a UNIX domain socket.

 For each new client that connects to the server, the server
+
 - picks an ID,
 - creates eventfd file descriptors for the interrupt vectors,
 - sends the ID and the file descriptor for the shared memory to the
@@ -189,7 +170,8 @@ vectors.
 A standalone client is in contrib/ivshmem-client/.  It can be useful
 for debugging.

-=== The ivshmem Client-Server Protocol ===
+The ivshmem Client-Server Protocol
+----------------------------------

 An ivshmem device configured for interrupts connects to an ivshmem
 server.  This section details the protocol between the two.
@@ -245,7 +227,8 @@ Known bugs:

 * The protocol is poorly designed.

-=== The ivshmem Client-Client Protocol ===
+The ivshmem Client-Client Protocol
+----------------------------------

 An ivshmem device configured for interrupts receives eventfd file
 descriptors for interrupting peers and getting interrupted by peers
--- a/docs/specs/pci-ids.rst
+++ b/docs/specs/pci-ids.rst
@@ -50,7 +50,7 @@ maintained as part of the virtio specification.
  by QEMU.

 1af4:1110
-  ivshmem device (shared memory, ``docs/specs/ivshmem-spec.txt``)
+  ivshmem device (:doc:`ivshmem-spec`)

 All other device IDs are reserved.

--- a/docs/specs/pvpanic.rst
+++ b/docs/specs/pvpanic.rst
@@ -21,18 +21,21 @@ recognize. On write, the bits not recognized by the device are ignored.
 Software should set only bits both itself and the device recognize.

 Bit Definition
--------------
-bit 0: a guest panic has happened and should be processed by the host
-bit 1: a guest panic has happened and will be handled by the guest;
-       the host should record it or report it, but should not affect
-       the execution of the guest.
+~~~~~~~~~~~~~~
+
+bit 0
+  a guest panic has happened and should be processed by the host
+bit 1
+  a guest panic has happened and will be handled by the guest;
+  the host should record it or report it, but should not affect
+  the execution of the guest.

 PCI Interface
 -------------

 The PCI interface is similar to the ISA interface except that it uses an MMIO
 address space provided by its BAR0, 1 byte long. Any machine with a PCI bus
-can enable a pvpanic device by adding '-device pvpanic-pci' to the command
+can enable a pvpanic device by adding ``-device pvpanic-pci`` to the command
 line.

 ACPI Interface
@@ -40,15 +43,25 @@ ACPI Interface

 pvpanic device is defined with ACPI ID "QEMU0001". Custom methods:

-RDPT:       To determine whether guest panic notification is supported.
-Arguments:  None
-Return:     Returns a byte, with the same semantics as the I/O port
-            interface.
+RDPT
+~~~~

-WRPT:       To send a guest panic event
-Arguments:  Arg0 is a byte to be written, with the same semantics as
-            the I/O interface.
-Return:     None
+To determine whether guest panic notification is supported.
+
+Arguments
+  None
+Return
+  Returns a byte, with the same semantics as the I/O port interface.
+
+WRPT
+~~~~
+
+To send a guest panic event.
+
+Arguments
+  Arg0 is a byte to be written, with the same semantics as the I/O interface.
+Return
+  None

 The ACPI device will automatically refer to the right port in case it
 is modified.
--- a/docs/specs/standard-vga.rst
+++ b/docs/specs/standard-vga.rst
@@ -0,0 +1,94 @@
+
+QEMU Standard VGA
+=================
+
+Exists in two variants, for isa and pci.
+
+Command line switches:
+
+``-vga std``
+   picks isa for -M isapc, otherwise pci
+``-device VGA``
+   pci variant
+``-device isa-vga``
+   isa variant
+``-device secondary-vga``
+   legacy-free pci variant
+
+
+PCI spec
+--------
+
+Applies to the pci variant only for obvious reasons.
+
+PCI ID
+   ``1234:1111``
+
+PCI Region 0
+   Framebuffer memory, 16 MB in size (by default).
+   Size is tunable via the ``vga_mem_mb`` property.
+
+PCI Region 1
+   Reserved (so we have the option to make the framebuffer bar 64bit).
+
+PCI Region 2
+   MMIO bar, 4096 bytes in size (QEMU 1.3+)
+
+PCI ROM Region
+   Holds the vgabios (QEMU 0.14+).
+
+
+The legacy-free variant has no ROM and has ``PCI_CLASS_DISPLAY_OTHER``
+instead of ``PCI_CLASS_DISPLAY_VGA``.
+
+
+IO ports used
+-------------
+
+Doesn't apply to the legacy-free pci variant; use the MMIO bar instead.
+
+``03c0 - 03df``
+   standard vga ports
+``01ce``
+   bochs vbe interface index port
+``01cf``
+   bochs vbe interface data port (x86 only)
+``01d0``
+   bochs vbe interface data port
+
+
+Memory regions used
+-------------------
+
+``0xe0000000``
+  Framebuffer memory, isa variant only.
+
+The pci variant used to mirror the framebuffer bar here; QEMU 0.14+
+no longer does that (except when in ``-M pc-$old`` compat mode).
+
+
+MMIO area spec
+--------------
+
+Likewise applies to the pci variant only for obvious reasons.
+
+``0000 - 03ff``
+  edid data blob.
+``0400 - 041f``
+  vga ioports (``0x3c0`` to ``0x3df``), remapped 1:1. Word access
+  is supported, bytes are written in little endian order (aka index
+  port first), so indexed registers can be updated with a single
+  mmio write (and thus only one vmexit).
+``0500 - 0515``
+  bochs dispi interface registers, mapped flat without index/data ports.
+  Use ``(index << 1)`` as offset for (16bit) register access.
+``0600 - 0607``
+  QEMU extended registers.  QEMU 2.2+ only.
+  The pci revision is 2 (or greater) when these registers are present.
+  The registers are 32bit.
+``0600``
+  QEMU extended register region size, in bytes.
+``0604``
+  framebuffer endianness register:
+  ``0xbebebebe`` indicates big endian,
+  ``0x1e1e1e1e`` indicates little endian.
--- a/docs/specs/standard-vga.txt
+++ b/docs/specs/standard-vga.txt
@@ -1,81 +0,0 @@
-
-QEMU Standard VGA
-=================
-
-Exists in two variants, for isa and pci.
-
-command line switches:
-    -vga std               [ picks isa for -M isapc, otherwise pci ]
-    -device VGA            [ pci variant ]
-    -device isa-vga        [ isa variant ]
-    -device secondary-vga  [ legacy-free pci variant ]
-
-
-PCI spec
--------
-
-Applies to the pci variant only for obvious reasons.
-
-PCI ID: 1234:1111
-
-PCI Region 0:
-   Framebuffer memory, 16 MB in size (by default).
-   Size is tunable via vga_mem_mb property.
-
-PCI Region 1:
-   Reserved (so we have the option to make the framebuffer bar 64bit).
-
-PCI Region 2:
-   MMIO bar, 4096 bytes in size (qemu 1.3+)
-
-PCI ROM Region:
-   Holds the vgabios (qemu 0.14+).
-
-
-The legacy-free variant has no ROM and has PCI_CLASS_DISPLAY_OTHER
-instead of PCI_CLASS_DISPLAY_VGA.
-
-
-IO ports used
-------------
-
-Doesn't apply to the legacy-free pci variant, use the MMIO bar instead.
-
-03c0 - 03df : standard vga ports
-01ce        : bochs vbe interface index port
-01cf        : bochs vbe interface data port (x86 only)
-01d0        : bochs vbe interface data port
-
-
-Memory regions used
-------------------
-
-0xe0000000 : Framebuffer memory, isa variant only.
-
-The pci variant used to mirror the framebuffer bar here, qemu 0.14+
-stops doing that (except when in -M pc-$old compat mode).
-
-
-MMIO area spec
--------------
-
-Likewise applies to the pci variant only for obvious reasons.
-
-0000 - 03ff : edid data blob.
-0400 - 041f : vga ioports (0x3c0 -> 0x3df), remapped 1:1.
-              word access is supported, bytes are written
-              in little endia order (aka index port first),
-              so indexed registers can be updated with a
-              single mmio write (and thus only one vmexit).
-0500 - 0515 : bochs dispi interface registers, mapped flat
-              without index/data ports.  Use (index << 1)
-              as offset for (16bit) register access.
-
-0600 - 0607 : qemu extended registers.  qemu 2.2+ only.
-              The pci revision is 2 (or greater) when
-              these registers are present.  The registers
-              are 32bit.
-  0600      : qemu extended register region size, in bytes.
-  0604      : framebuffer endianness register.
-              - 0xbebebebe indicates big endian.
-              - 0x1e1e1e1e indicates little endian.
--- a/Show More
+++ b/Show More