include/block/block_int: Document protocol related functions

Clarify that: - for protocols the brdv_file_open function is used instead of bdrv_open; - when protocol_name is set, a driver should expect to be given only a filename and no other options. Signed-off-by: Fabiano Rosas <farosas@linux.vnet.ibm.com>
block/blkreplay: Remove protocol-related fields
2018-03-12 18:11:01 -03:00 · 2018-03-12 18:11:01 -03:00 · 2018-03-12 18:11:01 -03:00 · 2018-03-12 18:11:01 -03:00 · 2018-03-12 18:11:01 -03:00 · 2018-03-12 18:35:37 +00:00
1914 changed files with 192114 additions and 34490 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -27,13 +27,79 @@
 /libuser
 /linux-headers/asm
 /qga/qapi-generated
-/qapi-generated
-/qapi-types.[ch]
-/qapi-visit.[ch]
-/qapi-event.[ch]
-/qmp-commands.h
-/qmp-introspect.[ch]
-/qmp-marshal.c
+/qapi-gen-timestamp
+/qapi/qapi-builtin-types.[ch]
+/qapi/qapi-builtin-visit.[ch]
+/qapi/qapi-commands-block-core.[ch]
+/qapi/qapi-commands-block.[ch]
+/qapi/qapi-commands-char.[ch]
+/qapi/qapi-commands-common.[ch]
+/qapi/qapi-commands-crypto.[ch]
+/qapi/qapi-commands-introspect.[ch]
+/qapi/qapi-commands-migration.[ch]
+/qapi/qapi-commands-misc.[ch]
+/qapi/qapi-commands-net.[ch]
+/qapi/qapi-commands-rocker.[ch]
+/qapi/qapi-commands-run-state.[ch]
+/qapi/qapi-commands-sockets.[ch]
+/qapi/qapi-commands-tpm.[ch]
+/qapi/qapi-commands-trace.[ch]
+/qapi/qapi-commands-transaction.[ch]
+/qapi/qapi-commands-ui.[ch]
+/qapi/qapi-commands.[ch]
+/qapi/qapi-events-block-core.[ch]
+/qapi/qapi-events-block.[ch]
+/qapi/qapi-events-char.[ch]
+/qapi/qapi-events-common.[ch]
+/qapi/qapi-events-crypto.[ch]
+/qapi/qapi-events-introspect.[ch]
+/qapi/qapi-events-migration.[ch]
+/qapi/qapi-events-misc.[ch]
+/qapi/qapi-events-net.[ch]
+/qapi/qapi-events-rocker.[ch]
+/qapi/qapi-events-run-state.[ch]
+/qapi/qapi-events-sockets.[ch]
+/qapi/qapi-events-tpm.[ch]
+/qapi/qapi-events-trace.[ch]
+/qapi/qapi-events-transaction.[ch]
+/qapi/qapi-events-ui.[ch]
+/qapi/qapi-events.[ch]
+/qapi/qapi-introspect.[ch]
+/qapi/qapi-types-block-core.[ch]
+/qapi/qapi-types-block.[ch]
+/qapi/qapi-types-char.[ch]
+/qapi/qapi-types-common.[ch]
+/qapi/qapi-types-crypto.[ch]
+/qapi/qapi-types-introspect.[ch]
+/qapi/qapi-types-migration.[ch]
+/qapi/qapi-types-misc.[ch]
+/qapi/qapi-types-net.[ch]
+/qapi/qapi-types-rocker.[ch]
+/qapi/qapi-types-run-state.[ch]
+/qapi/qapi-types-sockets.[ch]
+/qapi/qapi-types-tpm.[ch]
+/qapi/qapi-types-trace.[ch]
+/qapi/qapi-types-transaction.[ch]
+/qapi/qapi-types-ui.[ch]
+/qapi/qapi-types.[ch]
+/qapi/qapi-visit-block-core.[ch]
+/qapi/qapi-visit-block.[ch]
+/qapi/qapi-visit-char.[ch]
+/qapi/qapi-visit-common.[ch]
+/qapi/qapi-visit-crypto.[ch]
+/qapi/qapi-visit-introspect.[ch]
+/qapi/qapi-visit-migration.[ch]
+/qapi/qapi-visit-misc.[ch]
+/qapi/qapi-visit-net.[ch]
+/qapi/qapi-visit-rocker.[ch]
+/qapi/qapi-visit-run-state.[ch]
+/qapi/qapi-visit-sockets.[ch]
+/qapi/qapi-visit-tpm.[ch]
+/qapi/qapi-visit-trace.[ch]
+/qapi/qapi-visit-transaction.[ch]
+/qapi/qapi-visit-ui.[ch]
+/qapi/qapi-visit.[ch]
+/qapi/qapi-doc.texi
 /qemu-doc.html
 /qemu-doc.info
 /qemu-doc.txt
@@ -53,8 +119,8 @@
 /qemu-version.h.tmp
 /module_block.h
 /scsi/qemu-pr-helper
-/vscclient
 /vhost-user-scsi
+/vhost-user-blk
 /fsdev/virtfs-proxy-helper
 *.tmp
 *.[1-9]
--- a/.gitmodules
+++ b/.gitmodules
@@ -40,3 +40,9 @@
 [submodule "capstone"]
 	path = capstone
 	url = git://git.qemu.org/capstone.git
+[submodule "roms/seabios-hppa"]
+	path = roms/seabios-hppa
+	url = git://github.com/hdeller/seabios-hppa.git
+[submodule "roms/u-boot-sam460ex"]
+	path = roms/u-boot-sam460ex
+	url = git://github.com/zbalaton/u-boot-sam460ex
--- a/.gitpublish
+++ b/.gitpublish
@@ -0,0 +1,51 @@
+#
+# Common git-publish profiles that can be used to send patches to QEMU upstream.
+#
+# See https://github.com/stefanha/git-publish for more information
+#
+[gitpublishprofile "default"]
+base = master
+to = qemu-devel@nongnu.org
+cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null
+
+[gitpublishprofile "rfc"]
+base = master
+prefix = RFC PATCH
+to = qemu-devel@nongnu.org
+cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null
+
+[gitpublishprofile "stable"]
+base = master
+to = qemu-devel@nongnu.org
+cc = qemu-stable@nongnu.org
+cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null
+
+[gitpublishprofile "trivial"]
+base = master
+to = qemu-devel@nongnu.org
+cc = qemu-trivial@nongnu.org
+cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null
+
+[gitpublishprofile "block"]
+base = master
+to = qemu-devel@nongnu.org
+cc = qemu-block@nongnu.org
+cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null
+
+[gitpublishprofile "arm"]
+base = master
+to = qemu-devel@nongnu.org
+cc = qemu-arm@nongnu.org
+cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null
+
+[gitpublishprofile "s390"]
+base = master
+to = qemu-devel@nongnu.org
+cc = qemu-s390@nongnu.org
+cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null
+
+[gitpublishprofile "ppc"]
+base = master
+to = qemu-devel@nongnu.org
+cc = qemu-ppc@nongnu.org
+cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null
--- a/.mailmap
+++ b/.mailmap
@@ -18,3 +18,7 @@ malc <av1474@comtv.ru> malc <malc@c046a42c-6fe2-441c-8c8c-71466251a162>
 # There is also a:
 #    (no author) <(no author)@c046a42c-6fe2-441c-8c8c-71466251a162>
 # for the cvs2svn initialization commit e63c3dc74bf.
+#
+# Also list preferred name forms where people have changed their
+# git author config
+Daniel P. Berrangé <berrange@redhat.com>
--- a/.shippable.yml
+++ b/.shippable.yml
@@ -23,8 +23,6 @@ env:
      TARGET_LIST=mips-softmmu,mipsel-linux-user
    - IMAGE=debian-mips64el-cross
      TARGET_LIST=mips64el-softmmu,mips64el-linux-user
-    - IMAGE=debian-powerpc-cross
-      TARGET_LIST=ppc-softmmu,ppcemb-softmmu,ppc-linux-user
    - IMAGE=debian-ppc64el-cross
      TARGET_LIST=ppc64-softmmu,ppc64-linux-user,ppc64abi32-linux-user
 build:
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 sudo: false
 language: c
 python:
-  - "2.4"
+  - "2.6"
 compiler:
  - gcc
 cache: ccache
@@ -13,12 +13,13 @@ addons:
      - libattr1-dev
      - libbrlapi-dev
      - libcap-ng-dev
+      - libgcc-4.8-dev
      - libgnutls-dev
      - libgtk-3-dev
      - libiscsi-dev
      - liblttng-ust-dev
-      - libnfs-dev
      - libncurses5-dev
+      - libnfs-dev
      - libnss3-dev
      - libpixman-1-dev
      - libpng12-dev
@@ -51,9 +52,9 @@ env:
    - CONFIG=""
    - CONFIG="--enable-debug --enable-debug-tcg --enable-trace-backends=log"
    - CONFIG="--disable-linux-aio --disable-cap-ng --disable-attr --disable-brlapi --disable-uuid --disable-libusb"
-    - CONFIG="--enable-modules"
-    - CONFIG="--with-coroutine=ucontext"
-    - CONFIG="--with-coroutine=sigaltstack"
+    - CONFIG="--enable-modules --disable-linux-user"
+    - CONFIG="--with-coroutine=ucontext --disable-linux-user"
+    - CONFIG="--with-coroutine=sigaltstack --disable-linux-user"
 git:
  # we want to do this ourselves
  submodules: false
@@ -115,15 +116,17 @@ matrix:
        - sudo apt-get build-dep -qq qemu
        - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
        - git submodule update --init --recursive
-    # Trusty System build with latest stable clang
+    # Trusty System build with latest stable clang & python 3.0
    - sudo: required
      addons:
      dist: trusty
      language: generic
      compiler: none
+      python:
+        - "3.0"
      env:
        - COMPILER_NAME=clang CXX=clang++-3.9 CC=clang-3.9
-        - CONFIG="--disable-linux-user --cc=clang-3.9 --cxx=clang++-3.9"
+        - CONFIG="--disable-linux-user --cc=clang-3.9 --cxx=clang++-3.9 --python=/usr/bin/python3"
      before_install:
        - wget -nv -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add -
        - sudo apt-add-repository -y 'deb http://llvm.org/apt/trusty llvm-toolchain-trusty-3.9 main'
@@ -134,15 +137,17 @@ matrix:
        - git submodule update --init --recursive
      before_script:
        - ./configure ${CONFIG} || cat config.log
-    # Trusty Linux User build with latest stable clang
+    # Trusty Linux User build with latest stable clang & python 3.6
    - sudo: required
      addons:
      dist: trusty
      language: generic
      compiler: none
+      python:
+        - "3.6"
      env:
        - COMPILER_NAME=clang CXX=clang++-3.9 CC=clang-3.9
-        - CONFIG="--disable-system --cc=clang-3.9 --cxx=clang++-3.9"
+        - CONFIG="--disable-system --cc=clang-3.9 --cxx=clang++-3.9 --python=/usr/bin/python3"
      before_install:
        - wget -nv -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add -
        - sudo apt-add-repository -y 'deb http://llvm.org/apt/trusty llvm-toolchain-trusty-3.9 main'
--- a/135
+++ b/135
@@ -76,6 +76,29 @@ K: ^Subject:.*(?i)trivial
 T: git git://git.corpit.ru/qemu.git trivial-patches
 T: git git://github.com/vivier/qemu.git trivial-patches

+Architecture support
+--------------------
+S390
+M: Cornelia Huck <cohuck@redhat.com>
+S: Supported
+F: default-configs/s390x-softmmu.mak
+F: gdb-xml/s390*.xml
+F: hw/char/sclp*.[hc]
+F: hw/char/terminal3270.c
+F: hw/intc/s390_flic.c
+F: hw/intc/s390_flic_kvm.c
+F: hw/s390x/
+F: hw/vfio/ccw.c
+F: hw/watchdog/wdt_diag288.c
+F: include/hw/s390x/
+F: include/hw/watchdog/wdt_diag288.h
+F: pc-bios/s390-ccw/
+F: pc-bios/s390-ccw.img
+F: target/s390x/
+K: ^Subject:.*(?i)s390x?
+T: git git://github.com/cohuck/qemu.git s390-next
+L: qemu-s390x@nongnu.org
+
 Guest CPU cores (TCG):
 ----------------------
 Overall
@@ -133,6 +156,7 @@ HPPA (PA-RISC)
 M: Richard Henderson <rth@twiddle.net>
 S: Maintained
 F: target/hppa/
+F: hw/hppa/
 F: disas/hppa.c

 LM32
@@ -209,9 +233,21 @@ F: hw/ppc/
 F: include/hw/ppc/
 F: disas/ppc.c

+RISC-V
+M: Michael Clark <mjc@sifive.com>
+M: Palmer Dabbelt <palmer@sifive.com>
+M: Sagar Karandikar <sagark@eecs.berkeley.edu>
+M: Bastian Koppelmann <kbastian@mail.uni-paderborn.de>
+S: Maintained
+F: target/riscv/
+F: hw/riscv/
+F: include/hw/riscv/
+F: disas/riscv.c
+
 S390
 M: Richard Henderson <rth@twiddle.net>
 M: Alexander Graf <agraf@suse.de>
+M: David Hildenbrand <david@redhat.com>
 S: Maintained
 F: target/s390x/
 F: hw/s390x/
@@ -259,6 +295,7 @@ S: Maintained
 F: target/xtensa/
 F: hw/xtensa/
 F: tests/tcg/xtensa/
+F: disas/xtensa.c

 TriCore
 M: Bastian Koppelmann <kbastian@mail.uni-paderborn.de>
@@ -360,6 +397,12 @@ M: Kamil Rytarowski <kamil@netbsd.org>
 S: Maintained
 K: ^Subject:.*(?i)NetBSD

+OPENBSD
+L: qemu-devel@nongnu.org
+M: Brad Smith <brad@comstyle.com>
+S: Maintained
+K: ^Subject:.*(?i)OpenBSD
+
 W32, W64
 L: qemu-devel@nongnu.org
 M: Stefan Weil <sw@weilnetz.de>
@@ -524,7 +567,7 @@ F: hw/misc/arm_sysctl.c

 Xilinx Zynq
 M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/xilinx_*
@@ -534,7 +577,7 @@ F: include/hw/misc/zynq*
 X: hw/ssi/xilinx_*

 Xilinx ZynqMP
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
 L: qemu-arm@nongnu.org
 S: Maintained
@@ -543,7 +586,7 @@ F: include/hw/*/xlnx*.h

 ARM ACPI Subsystem
 M: Shannon Zhao <zhaoshenglong@huawei.com>
-M: Shannon Zhao <shannon.zhao@linaro.org>
+M: Shannon Zhao <shannon.zhaosl@gmail.com>
 L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/virt-acpi-build.c
@@ -732,7 +775,11 @@ F: hw/ppc/prep.c
 F: hw/ppc/prep_systemio.c
 F: hw/ppc/rs6000_mc.c
 F: hw/pci-host/prep.[hc]
+F: hw/isa/i82378.c
 F: hw/isa/pc87312.[hc]
+F: hw/dma/i82374.c
+F: hw/timer/m48t59-isa.c
+F: include/hw/timer/m48t59.h
 F: pc-bios/ppc_rom.bin

 sPAPR
@@ -761,6 +808,12 @@ L: qemu-ppc@nongnu.org
 S: Odd Fixes
 F: hw/ppc/virtex_ml507.c

+sam460ex
+M: BALATON Zoltan <balaton@eik.bme.hu>
+L: qemu-ppc@nongnu.org
+S: Maintained
+F: hw/ide/sii3112.c
+
 SH4 Machines
 ------------
 R2D
@@ -820,15 +873,22 @@ F: hw/char/sclp*.[hc]
 F: hw/char/terminal3270.c
 F: hw/s390x/
 F: include/hw/s390x/
-F: pc-bios/s390-ccw/
 F: hw/watchdog/wdt_diag288.c
 F: include/hw/watchdog/wdt_diag288.h
-F: pc-bios/s390-ccw.img
 F: default-configs/s390x-softmmu.mak
 T: git git://github.com/cohuck/qemu.git s390-next
 T: git git://github.com/borntraeger/qemu.git s390-next
 L: qemu-s390x@nongnu.org

+S390-ccw Bios
+M: Christian Borntraeger <borntraeger@de.ibm.com>
+M: Thomas Huth <thuth@redhat.com>
+S: Supported
+F: pc-bios/s390-ccw/
+F: pc-bios/s390-ccw.img
+T: git git://github.com/borntraeger/qemu.git s390-next
+L: qemu-s390x@nongnu.org
+
 UniCore32 Machines
 -------------
 PKUnity-3 SoC initramfs-with-busybox
@@ -841,6 +901,7 @@ X86 Machines
 ------------
 PC
 M: Michael S. Tsirkin <mst@redhat.com>
+M: Marcel Apfelbaum <marcel@redhat.com>
 S: Supported
 F: include/hw/i386/
 F: hw/i386/
@@ -861,12 +922,13 @@ F: hw/misc/sga.c
 PC Chipset
 M: Michael S. Tsirkin <mst@redhat.com>
 M: Paolo Bonzini <pbonzini@redhat.com>
-S: Support
+S: Supported
 F: hw/char/debugcon.c
 F: hw/char/parallel.c
 F: hw/char/serial*
 F: hw/dma/i8257*
 F: hw/i2c/pm_smbus.c
+F: hw/input/pckbd.c
 F: hw/intc/apic*
 F: hw/intc/ioapic*
 F: hw/intc/i8259*
@@ -875,7 +937,10 @@ F: hw/misc/pc-testdev.c
 F: hw/timer/hpet*
 F: hw/timer/i8254*
 F: hw/timer/mc146818rtc*
+F: hw/watchdog/wdt_ib700.c
+F: include/hw/display/vga.h
 F: include/hw/i2c/pm_smbus.h
+F: include/hw/isa/i8257.h
 F: include/hw/timer/hpet.h
 F: include/hw/timer/i8254*
 F: include/hw/timer/mc146818rtc*
@@ -924,6 +989,15 @@ F: tests/ahci-test.c
 F: tests/libqos/ahci*
 T: git git://github.com/jnsnow/qemu.git ide

+IPMI
+M: Corey Minyard <minyard@acm.org>
+S: Maintained
+F: include/hw/ipmi/*
+F: hw/ipmi/*
+F: hw/smbios/smbios_type_38.c
+F: tests/ipmi*
+T: git git://github.com/cminyard/qemu.git master-ipmi-rebase
+
 Floppy
 M: John Snow <jsnow@redhat.com>
 L: qemu-block@nongnu.org
@@ -976,7 +1050,9 @@ M: Alexander Graf <agraf@suse.de>
 L: qemu-ppc@nongnu.org
 S: Odd Fixes
 F: hw/ppc/ppc4*.c
+F: hw/i2c/ppc4xx_i2c.c
 F: include/hw/ppc/ppc4xx.h
+F: include/hw/i2c/ppc4xx_i2c.h

 ppce500
 M: Alexander Graf <agraf@suse.de>
@@ -995,11 +1071,13 @@ Network devices
 M: Jason Wang <jasowang@redhat.com>
 S: Odd Fixes
 F: hw/net/
+F: include/hw/net/
 F: tests/virtio-net-test.c
 T: git git://github.com/jasowang/qemu.git net

 SCSI
 M: Paolo Bonzini <pbonzini@redhat.com>
+R: Fam Zheng <famz@redhat.com>
 S: Supported
 F: include/hw/scsi/*
 F: hw/scsi/*
@@ -1008,7 +1086,7 @@ T: git git://github.com/bonzini/qemu.git scsi-next

 SSI
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 S: Maintained
 F: hw/ssi/*
 F: hw/block/m25p80.c
@@ -1017,11 +1095,19 @@ X: hw/ssi/xilinx_*
 F: tests/m25p80-test.c

 Xilinx SPI
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 S: Maintained
 F: hw/ssi/xilinx_*

+SD (Secure Card)
+M: Philippe Mathieu-Daudé <f4bug@amsat.org>
+S: Odd Fixes
+F: include/hw/sd/sd*
+F: hw/sd/core.c
+F: hw/sd/sd*
+F: tests/sd*
+
 USB
 M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
@@ -1071,13 +1157,11 @@ F: include/hw/virtio/
 F: tests/virtio-balloon-test.c

 virtio-9p
-M: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 M: Greg Kurz <groug@kaod.org>
 S: Supported
 F: hw/9pfs/
 F: fsdev/
 F: tests/virtio-9p-test.c
-T: git git://github.com/kvaneesh/QEMU.git
 T: git git://github.com/gkurz/qemu.git 9p-next

 virtio-blk
@@ -1145,7 +1229,7 @@ F: hw/scsi/mfi.h
 F: tests/megasas-test.c

 Network packet abstractions
-M: Dmitry Fleytman <dmitry@daynix.com>
+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
 S: Maintained
 F: include/net/eth.h
 F: net/eth.c
@@ -1153,7 +1237,7 @@ F: hw/net/net_rx_pkt*
 F: hw/net/net_tx_pkt*

 Vmware
-M: Dmitry Fleytman <dmitry@daynix.com>
+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
 S: Maintained
 F: hw/net/vmxnet*
 F: hw/scsi/vmw_pvscsi*
@@ -1174,12 +1258,12 @@ F: hw/mem/nvdimm.c
 F: include/hw/mem/nvdimm.h

 e1000x
-M: Dmitry Fleytman <dmitry@daynix.com>
+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
 S: Maintained
 F: hw/net/e1000x*

 e1000e
-M: Dmitry Fleytman <dmitry@daynix.com>
+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
 S: Maintained
 F: hw/net/e1000e*

@@ -1189,7 +1273,7 @@ S: Maintained
 F: hw/net/eepro100.c

 Generic Loader
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 S: Maintained
 F: hw/core/generic-loader.c
 F: include/hw/core/generic-loader.h
@@ -1260,6 +1344,7 @@ T: git git://github.com/stefanha/qemu.git block

 Block SCSI subsystem
 M: Paolo Bonzini <pbonzini@redhat.com>
+R: Fam Zheng <famz@redhat.com>
 L: qemu-block@nongnu.org
 S: Supported
 F: include/scsi/*
@@ -1534,10 +1619,11 @@ F: tests/qmp-test.c
 T: git git://repo.or.cz/qemu/armbru.git qapi-next

 Register API
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 S: Maintained
 F: hw/core/register.c
 F: include/hw/register.h
+F: include/hw/registerfields.h

 SLIRP
 M: Samuel Thibault <samuel.thibault@ens-lyon.org>
@@ -1572,6 +1658,8 @@ F: include/hw/acpi/tpm.h
 F: include/sysemu/tpm*
 F: qapi/tpm.json
 F: backends/tpm.c
+F: tests/*tpm*
+T: git git://github.com/stefanberger/qemu-tpm.git tpm-next

 Checkpatch
 S: Odd Fixes
@@ -1699,6 +1787,7 @@ R: Laurent Vivier <laurent@vivier.eu>
 S: Maintained
 F: linux-user/
 F: default-configs/*-linux-user.mak
+F: scripts/qemu-binfmt-conf.sh

 Tiny Code Generator (TCG)
 -------------------------
@@ -1857,6 +1946,12 @@ L: qemu-block@nongnu.org
 S: Supported
 F: block/null.c

+NVMe Block Driver
+M: Fam Zheng <famz@redhat.com>
+L: qemu-block@nongnu.org
+S: Supported
+F: block/nvme*
+
 Bootdevice
 M: Gonglei <arei.gonglei@huawei.com>
 S: Maintained
@@ -1965,6 +2060,14 @@ F: block/replication.c
 F: tests/test-replication.c
 F: docs/block-replication.txt

+PVRDMA
+M: Yuval Shaia <yuval.shaia@oracle.com>
+M: Marcel Apfelbaum <marcel@redhat.com>
+S: Maintained
+F: hw/rdma/*
+F: hw/rdma/vmw/*
+F: docs/pvrdma.txt
+
 Build and test automation
 -------------------------
 Build and test automation
--- a/284
+++ b/284
@@ -6,7 +6,13 @@ BUILD_DIR=$(CURDIR)
 # Before including a proper config-host.mak, assume we are in the source tree
 SRC_PATH=.

-UNCHECKED_GOALS := %clean TAGS cscope ctags docker docker-% help
+UNCHECKED_GOALS := %clean TAGS cscope ctags dist \
+    html info pdf txt \
+    help check-help print-% \
+    docker docker-% vm-test vm-build-%
+
+print-%:
+	@echo '$*=$($*)'

 # All following code might depend on configuration variables
 ifneq ($(wildcard config-host.mak),)
@@ -14,6 +20,8 @@ ifneq ($(wildcard config-host.mak),)
 all:
 include config-host.mak

+PYTHON_UTF8 = LC_ALL= LANG=C LC_CTYPE=en_US.UTF-8 $(PYTHON)
+
 git-submodule-update:

 .PHONY: git-submodule-update
@@ -50,7 +58,7 @@ ifneq ($(realpath $(SRC_PATH)),$(realpath .))
 ifneq ($(wildcard $(SRC_PATH)/config-host.mak),)
 $(error This is an out of tree build but your source tree ($(SRC_PATH)) \
 seems to have been used for an in-tree build. You can fix this by running \
-"make distclean && rm -rf *-linux-user *-softmmu" in your source tree)
+"$(MAKE) distclean && rm -rf *-linux-user *-softmmu" in your source tree)
 endif
 endif

@@ -82,10 +90,78 @@ endif
 include $(SRC_PATH)/rules.mak

 GENERATED_FILES = qemu-version.h config-host.h qemu-options.def
-GENERATED_FILES += qmp-commands.h qapi-types.h qapi-visit.h qapi-event.h
-GENERATED_FILES += qmp-marshal.c qapi-types.c qapi-visit.c qapi-event.c
-GENERATED_FILES += qmp-introspect.h
-GENERATED_FILES += qmp-introspect.c
+GENERATED_FILES += qapi/qapi-builtin-types.h qapi/qapi-builtin-types.c
+GENERATED_FILES += qapi/qapi-types.h qapi/qapi-types.c
+GENERATED_FILES += qapi/qapi-types-block-core.h qapi/qapi-types-block-core.c
+GENERATED_FILES += qapi/qapi-types-block.h qapi/qapi-types-block.c
+GENERATED_FILES += qapi/qapi-types-char.h qapi/qapi-types-char.c
+GENERATED_FILES += qapi/qapi-types-common.h qapi/qapi-types-common.c
+GENERATED_FILES += qapi/qapi-types-crypto.h qapi/qapi-types-crypto.c
+GENERATED_FILES += qapi/qapi-types-introspect.h qapi/qapi-types-introspect.c
+GENERATED_FILES += qapi/qapi-types-migration.h qapi/qapi-types-migration.c
+GENERATED_FILES += qapi/qapi-types-misc.h qapi/qapi-types-misc.c
+GENERATED_FILES += qapi/qapi-types-net.h qapi/qapi-types-net.c
+GENERATED_FILES += qapi/qapi-types-rocker.h qapi/qapi-types-rocker.c
+GENERATED_FILES += qapi/qapi-types-run-state.h qapi/qapi-types-run-state.c
+GENERATED_FILES += qapi/qapi-types-sockets.h qapi/qapi-types-sockets.c
+GENERATED_FILES += qapi/qapi-types-tpm.h qapi/qapi-types-tpm.c
+GENERATED_FILES += qapi/qapi-types-trace.h qapi/qapi-types-trace.c
+GENERATED_FILES += qapi/qapi-types-transaction.h qapi/qapi-types-transaction.c
+GENERATED_FILES += qapi/qapi-types-ui.h qapi/qapi-types-ui.c
+GENERATED_FILES += qapi/qapi-builtin-visit.h qapi/qapi-builtin-visit.c
+GENERATED_FILES += qapi/qapi-visit.h qapi/qapi-visit.c
+GENERATED_FILES += qapi/qapi-visit-block-core.h qapi/qapi-visit-block-core.c
+GENERATED_FILES += qapi/qapi-visit-block.h qapi/qapi-visit-block.c
+GENERATED_FILES += qapi/qapi-visit-char.h qapi/qapi-visit-char.c
+GENERATED_FILES += qapi/qapi-visit-common.h qapi/qapi-visit-common.c
+GENERATED_FILES += qapi/qapi-visit-crypto.h qapi/qapi-visit-crypto.c
+GENERATED_FILES += qapi/qapi-visit-introspect.h qapi/qapi-visit-introspect.c
+GENERATED_FILES += qapi/qapi-visit-migration.h qapi/qapi-visit-migration.c
+GENERATED_FILES += qapi/qapi-visit-misc.h qapi/qapi-visit-misc.c
+GENERATED_FILES += qapi/qapi-visit-net.h qapi/qapi-visit-net.c
+GENERATED_FILES += qapi/qapi-visit-rocker.h qapi/qapi-visit-rocker.c
+GENERATED_FILES += qapi/qapi-visit-run-state.h qapi/qapi-visit-run-state.c
+GENERATED_FILES += qapi/qapi-visit-sockets.h qapi/qapi-visit-sockets.c
+GENERATED_FILES += qapi/qapi-visit-tpm.h qapi/qapi-visit-tpm.c
+GENERATED_FILES += qapi/qapi-visit-trace.h qapi/qapi-visit-trace.c
+GENERATED_FILES += qapi/qapi-visit-transaction.h qapi/qapi-visit-transaction.c
+GENERATED_FILES += qapi/qapi-visit-ui.h qapi/qapi-visit-ui.c
+GENERATED_FILES += qapi/qapi-commands.h qapi/qapi-commands.c
+GENERATED_FILES += qapi/qapi-commands-block-core.h qapi/qapi-commands-block-core.c
+GENERATED_FILES += qapi/qapi-commands-block.h qapi/qapi-commands-block.c
+GENERATED_FILES += qapi/qapi-commands-char.h qapi/qapi-commands-char.c
+GENERATED_FILES += qapi/qapi-commands-common.h qapi/qapi-commands-common.c
+GENERATED_FILES += qapi/qapi-commands-crypto.h qapi/qapi-commands-crypto.c
+GENERATED_FILES += qapi/qapi-commands-introspect.h qapi/qapi-commands-introspect.c
+GENERATED_FILES += qapi/qapi-commands-migration.h qapi/qapi-commands-migration.c
+GENERATED_FILES += qapi/qapi-commands-misc.h qapi/qapi-commands-misc.c
+GENERATED_FILES += qapi/qapi-commands-net.h qapi/qapi-commands-net.c
+GENERATED_FILES += qapi/qapi-commands-rocker.h qapi/qapi-commands-rocker.c
+GENERATED_FILES += qapi/qapi-commands-run-state.h qapi/qapi-commands-run-state.c
+GENERATED_FILES += qapi/qapi-commands-sockets.h qapi/qapi-commands-sockets.c
+GENERATED_FILES += qapi/qapi-commands-tpm.h qapi/qapi-commands-tpm.c
+GENERATED_FILES += qapi/qapi-commands-trace.h qapi/qapi-commands-trace.c
+GENERATED_FILES += qapi/qapi-commands-transaction.h qapi/qapi-commands-transaction.c
+GENERATED_FILES += qapi/qapi-commands-ui.h qapi/qapi-commands-ui.c
+GENERATED_FILES += qapi/qapi-events.h qapi/qapi-events.c
+GENERATED_FILES += qapi/qapi-events-block-core.h qapi/qapi-events-block-core.c
+GENERATED_FILES += qapi/qapi-events-block.h qapi/qapi-events-block.c
+GENERATED_FILES += qapi/qapi-events-char.h qapi/qapi-events-char.c
+GENERATED_FILES += qapi/qapi-events-common.h qapi/qapi-events-common.c
+GENERATED_FILES += qapi/qapi-events-crypto.h qapi/qapi-events-crypto.c
+GENERATED_FILES += qapi/qapi-events-introspect.h qapi/qapi-events-introspect.c
+GENERATED_FILES += qapi/qapi-events-migration.h qapi/qapi-events-migration.c
+GENERATED_FILES += qapi/qapi-events-misc.h qapi/qapi-events-misc.c
+GENERATED_FILES += qapi/qapi-events-net.h qapi/qapi-events-net.c
+GENERATED_FILES += qapi/qapi-events-rocker.h qapi/qapi-events-rocker.c
+GENERATED_FILES += qapi/qapi-events-run-state.h qapi/qapi-events-run-state.c
+GENERATED_FILES += qapi/qapi-events-sockets.h qapi/qapi-events-sockets.c
+GENERATED_FILES += qapi/qapi-events-tpm.h qapi/qapi-events-tpm.c
+GENERATED_FILES += qapi/qapi-events-trace.h qapi/qapi-events-trace.c
+GENERATED_FILES += qapi/qapi-events-transaction.h qapi/qapi-events-transaction.c
+GENERATED_FILES += qapi/qapi-events-ui.h qapi/qapi-events-ui.c
+GENERATED_FILES += qapi/qapi-introspect.c qapi/qapi-introspect.h
+GENERATED_FILES += qapi/qapi-doc.texi

 GENERATED_FILES += trace/generated-tcg-tracers.h

@@ -226,17 +302,29 @@ KEYCODEMAP_GEN = $(SRC_PATH)/ui/keycodemapdb/tools/keymap-gen
 KEYCODEMAP_CSV = $(SRC_PATH)/ui/keycodemapdb/data/keymaps.csv

 KEYCODEMAP_FILES = \
+		 ui/input-keymap-atset1-to-qcode.c \
 		 ui/input-keymap-linux-to-qcode.c \
+		 ui/input-keymap-qcode-to-atset1.c \
+		 ui/input-keymap-qcode-to-atset2.c \
+		 ui/input-keymap-qcode-to-atset3.c \
+		 ui/input-keymap-qcode-to-linux.c \
 		 ui/input-keymap-qcode-to-qnum.c \
+		 ui/input-keymap-qcode-to-sun.c \
 		 ui/input-keymap-qnum-to-qcode.c \
+		 ui/input-keymap-usb-to-qcode.c \
+		 ui/input-keymap-win32-to-qcode.c \
+		 ui/input-keymap-x11-to-qcode.c \
+		 ui/input-keymap-xorgevdev-to-qcode.c \
+		 ui/input-keymap-xorgkbd-to-qcode.c \
+		 ui/input-keymap-xorgxquartz-to-qcode.c \
+		 ui/input-keymap-xorgxwin-to-qcode.c \
 		 $(NULL)

 GENERATED_FILES += $(KEYCODEMAP_FILES)

 ui/input-keymap-%.c: $(KEYCODEMAP_GEN) $(KEYCODEMAP_CSV) $(SRC_PATH)/ui/Makefile.objs
 	$(call quiet-command,\
-	    src=$$(echo $@ | sed -E -e "s,^ui/input-keymap-(.+)-to-(.+)\.c$$,\1,") && \
-	    dst=$$(echo $@ | sed -E -e "s,^ui/input-keymap-(.+)-to-(.+)\.c$$,\2,") && \
+	    stem=$* && src=$${stem%-to-*} dst=$${stem#*-to-} && \
 	    test -e $(KEYCODEMAP_GEN) && \
 	    $(PYTHON) $(KEYCODEMAP_GEN) \
 	          --lang glib2 \
@@ -273,7 +361,7 @@ else
 DOCS=
 endif

-SUBDIR_MAKEFLAGS=$(if $(V),,--no-print-directory) BUILD_DIR=$(BUILD_DIR)
+SUBDIR_MAKEFLAGS=$(if $(V),,--no-print-directory --quiet) BUILD_DIR=$(BUILD_DIR)
 SUBDIR_DEVICES_MAK=$(patsubst %, %/config-devices.mak, $(TARGET_DIRS))
 SUBDIR_DEVICES_MAK_DEP=$(patsubst %, %-config-devices.mak.d, $(TARGET_DIRS))

@@ -303,7 +391,7 @@ endif
 	    else \
 	      echo "WARNING: $@ out of date.";\
 	    fi; \
-	    echo "Run \"make defconfig\" to regenerate."; \
+	    echo "Run \"$(MAKE) defconfig\" to regenerate."; \
 	    rm $@.tmp; \
 	  fi; \
 	 else \
@@ -327,6 +415,7 @@ dummy := $(call unnest-vars,, \
                ivshmem-server-obj-y \
                libvhost-user-obj-y \
                vhost-user-scsi-obj-y \
+                vhost-user-blk-obj-y \
                qga-vss-dll-obj-y \
                block-obj-y \
                block-obj-m \
@@ -336,6 +425,10 @@ dummy := $(call unnest-vars,, \
                io-obj-y \
                common-obj-y \
                common-obj-m \
+                ui-obj-y \
+                ui-obj-m \
+                audio-obj-y \
+                audio-obj-m \
                trace-obj-y)

 include $(SRC_PATH)/tests/Makefile.include
@@ -461,32 +554,34 @@ qemu-ga$(EXESUF): QEMU_CFLAGS += -I qga/qapi-generated
 qemu-keymap$(EXESUF): LIBS += $(XKBCOMMON_LIBS)
 qemu-keymap$(EXESUF): QEMU_CFLAGS += $(XKBCOMMON_CFLAGS)

-gen-out-type = $(subst .,-,$(suffix $@))
+qapi-py = $(SRC_PATH)/scripts/qapi/commands.py \
+$(SRC_PATH)/scripts/qapi/events.py \
+$(SRC_PATH)/scripts/qapi/introspect.py \
+$(SRC_PATH)/scripts/qapi/types.py \
+$(SRC_PATH)/scripts/qapi/visit.py \
+$(SRC_PATH)/scripts/qapi/common.py \
+$(SRC_PATH)/scripts/qapi/doc.py \
+$(SRC_PATH)/scripts/ordereddict.py \
+$(SRC_PATH)/scripts/qapi-gen.py

-qapi-py = $(SRC_PATH)/scripts/qapi.py $(SRC_PATH)/scripts/ordereddict.py
+qga/qapi-generated/qga-qapi-types.c qga/qapi-generated/qga-qapi-types.h \
+qga/qapi-generated/qga-qapi-visit.c qga/qapi-generated/qga-qapi-visit.h \
+qga/qapi-generated/qga-qapi-commands.h qga/qapi-generated/qga-qapi-commands.c \
+qga/qapi-generated/qga-qapi-doc.texi: \
+qga/qapi-generated/qapi-gen-timestamp ;
+qga/qapi-generated/qapi-gen-timestamp: $(SRC_PATH)/qga/qapi-schema.json $(qapi-py)
+	$(call quiet-command,$(PYTHON_UTF8) $(SRC_PATH)/scripts/qapi-gen.py \
+		-o qga/qapi-generated -p "qga-" $<, \
+		"GEN","$(@:%-timestamp=%)")
+	@>$@

-qga/qapi-generated/qga-qapi-types.c qga/qapi-generated/qga-qapi-types.h :\
-$(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py \
-		$(gen-out-type) -o qga/qapi-generated -p "qga-" $<, \
-		"GEN","$@")
-qga/qapi-generated/qga-qapi-visit.c qga/qapi-generated/qga-qapi-visit.h :\
-$(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-visit.py $(qapi-py)
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-visit.py \
-		$(gen-out-type) -o qga/qapi-generated -p "qga-" $<, \
-		"GEN","$@")
-qga/qapi-generated/qga-qmp-commands.h qga/qapi-generated/qga-qmp-marshal.c :\
-$(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py \
-		$(gen-out-type) -o qga/qapi-generated -p "qga-" $<, \
-		"GEN","$@")
-
-qapi-modules = $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/qapi/common.json \
+qapi-modules = $(SRC_PATH)/qapi/qapi-schema.json $(SRC_PATH)/qapi/common.json \
               $(SRC_PATH)/qapi/block.json $(SRC_PATH)/qapi/block-core.json \
               $(SRC_PATH)/qapi/char.json \
               $(SRC_PATH)/qapi/crypto.json \
               $(SRC_PATH)/qapi/introspect.json \
               $(SRC_PATH)/qapi/migration.json \
+               $(SRC_PATH)/qapi/misc.json \
               $(SRC_PATH)/qapi/net.json \
               $(SRC_PATH)/qapi/rocker.json \
               $(SRC_PATH)/qapi/run-state.json \
@@ -496,33 +591,86 @@ qapi-modules = $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/qapi/common.json \
               $(SRC_PATH)/qapi/transaction.json \
               $(SRC_PATH)/qapi/ui.json

-qapi-types.c qapi-types.h :\
-$(qapi-modules) $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py \
-		$(gen-out-type) -o "." -b $<, \
-		"GEN","$@")
-qapi-visit.c qapi-visit.h :\
-$(qapi-modules) $(SRC_PATH)/scripts/qapi-visit.py $(qapi-py)
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-visit.py \
-		$(gen-out-type) -o "." -b $<, \
-		"GEN","$@")
-qapi-event.c qapi-event.h :\
-$(qapi-modules) $(SRC_PATH)/scripts/qapi-event.py $(qapi-py)
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-event.py \
-		$(gen-out-type) -o "." $<, \
-		"GEN","$@")
-qmp-commands.h qmp-marshal.c :\
-$(qapi-modules) $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py \
-		$(gen-out-type) -o "." $<, \
-		"GEN","$@")
-qmp-introspect.h qmp-introspect.c :\
-$(qapi-modules) $(SRC_PATH)/scripts/qapi-introspect.py $(qapi-py)
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-introspect.py \
-		$(gen-out-type) -o "." $<, \
-		"GEN","$@")
+qapi/qapi-builtin-types.c qapi/qapi-builtin-types.h \
+qapi/qapi-types.c qapi/qapi-types.h \
+qapi/qapi-types-block-core.c qapi/qapi-types-block-core.h \
+qapi/qapi-types-block.c qapi/qapi-types-block.h \
+qapi/qapi-types-char.c qapi/qapi-types-char.h \
+qapi/qapi-types-common.c qapi/qapi-types-common.h \
+qapi/qapi-types-crypto.c qapi/qapi-types-crypto.h \
+qapi/qapi-types-introspect.c qapi/qapi-types-introspect.h \
+qapi/qapi-types-migration.c qapi/qapi-types-migration.h \
+qapi/qapi-types-misc.c qapi/qapi-types-misc.h \
+qapi/qapi-types-net.c qapi/qapi-types-net.h \
+qapi/qapi-types-rocker.c qapi/qapi-types-rocker.h \
+qapi/qapi-types-run-state.c qapi/qapi-types-run-state.h \
+qapi/qapi-types-sockets.c qapi/qapi-types-sockets.h \
+qapi/qapi-types-tpm.c qapi/qapi-types-tpm.h \
+qapi/qapi-types-trace.c qapi/qapi-types-trace.h \
+qapi/qapi-types-transaction.c qapi/qapi-types-transaction.h \
+qapi/qapi-types-ui.c qapi/qapi-types-ui.h \
+qapi/qapi-builtin-visit.c qapi/qapi-builtin-visit.h \
+qapi/qapi-visit.c qapi/qapi-visit.h \
+qapi/qapi-visit-block-core.c qapi/qapi-visit-block-core.h \
+qapi/qapi-visit-block.c qapi/qapi-visit-block.h \
+qapi/qapi-visit-char.c qapi/qapi-visit-char.h \
+qapi/qapi-visit-common.c qapi/qapi-visit-common.h \
+qapi/qapi-visit-crypto.c qapi/qapi-visit-crypto.h \
+qapi/qapi-visit-introspect.c qapi/qapi-visit-introspect.h \
+qapi/qapi-visit-migration.c qapi/qapi-visit-migration.h \
+qapi/qapi-visit-misc.c qapi/qapi-visit-misc.h \
+qapi/qapi-visit-net.c qapi/qapi-visit-net.h \
+qapi/qapi-visit-rocker.c qapi/qapi-visit-rocker.h \
+qapi/qapi-visit-run-state.c qapi/qapi-visit-run-state.h \
+qapi/qapi-visit-sockets.c qapi/qapi-visit-sockets.h \
+qapi/qapi-visit-tpm.c qapi/qapi-visit-tpm.h \
+qapi/qapi-visit-trace.c qapi/qapi-visit-trace.h \
+qapi/qapi-visit-transaction.c qapi/qapi-visit-transaction.h \
+qapi/qapi-visit-ui.c qapi/qapi-visit-ui.h \
+qapi/qapi-commands.h qapi/qapi-commands.c \
+qapi/qapi-commands-block-core.c qapi/qapi-commands-block-core.h \
+qapi/qapi-commands-block.c qapi/qapi-commands-block.h \
+qapi/qapi-commands-char.c qapi/qapi-commands-char.h \
+qapi/qapi-commands-common.c qapi/qapi-commands-common.h \
+qapi/qapi-commands-crypto.c qapi/qapi-commands-crypto.h \
+qapi/qapi-commands-introspect.c qapi/qapi-commands-introspect.h \
+qapi/qapi-commands-migration.c qapi/qapi-commands-migration.h \
+qapi/qapi-commands-misc.c qapi/qapi-commands-misc.h \
+qapi/qapi-commands-net.c qapi/qapi-commands-net.h \
+qapi/qapi-commands-rocker.c qapi/qapi-commands-rocker.h \
+qapi/qapi-commands-run-state.c qapi/qapi-commands-run-state.h \
+qapi/qapi-commands-sockets.c qapi/qapi-commands-sockets.h \
+qapi/qapi-commands-tpm.c qapi/qapi-commands-tpm.h \
+qapi/qapi-commands-trace.c qapi/qapi-commands-trace.h \
+qapi/qapi-commands-transaction.c qapi/qapi-commands-transaction.h \
+qapi/qapi-commands-ui.c qapi/qapi-commands-ui.h \
+qapi/qapi-events.c qapi/qapi-events.h \
+qapi/qapi-events-block-core.c qapi/qapi-events-block-core.h \
+qapi/qapi-events-block.c qapi/qapi-events-block.h \
+qapi/qapi-events-char.c qapi/qapi-events-char.h \
+qapi/qapi-events-common.c qapi/qapi-events-common.h \
+qapi/qapi-events-crypto.c qapi/qapi-events-crypto.h \
+qapi/qapi-events-introspect.c qapi/qapi-events-introspect.h \
+qapi/qapi-events-migration.c qapi/qapi-events-migration.h \
+qapi/qapi-events-misc.c qapi/qapi-events-misc.h \
+qapi/qapi-events-net.c qapi/qapi-events-net.h \
+qapi/qapi-events-rocker.c qapi/qapi-events-rocker.h \
+qapi/qapi-events-run-state.c qapi/qapi-events-run-state.h \
+qapi/qapi-events-sockets.c qapi/qapi-events-sockets.h \
+qapi/qapi-events-tpm.c qapi/qapi-events-tpm.h \
+qapi/qapi-events-trace.c qapi/qapi-events-trace.h \
+qapi/qapi-events-transaction.c qapi/qapi-events-transaction.h \
+qapi/qapi-events-ui.c qapi/qapi-events-ui.h \
+qapi/qapi-introspect.h qapi/qapi-introspect.c \
+qapi/qapi-doc.texi: \
+qapi-gen-timestamp ;
+qapi-gen-timestamp: $(qapi-modules) $(qapi-py)
+	$(call quiet-command,$(PYTHON_UTF8) $(SRC_PATH)/scripts/qapi-gen.py \
+		-o "qapi" -b $<, \
+		"GEN","$(@:%-timestamp=%)")
+	@>$@

-QGALIB_GEN=$(addprefix qga/qapi-generated/, qga-qapi-types.h qga-qapi-visit.h qga-qmp-commands.h)
+QGALIB_GEN=$(addprefix qga/qapi-generated/, qga-qapi-types.h qga-qapi-visit.h qga-qapi-commands.h)
 $(qga-obj-y): $(QGALIB_GEN)

 qemu-ga$(EXESUF): $(qga-obj-y) $(COMMON_LDADDS)
@@ -558,6 +706,8 @@ ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) $(COMMON_LDADDS)
 endif
 vhost-user-scsi$(EXESUF): $(vhost-user-scsi-obj-y) libvhost-user.a
 	$(call LINK, $^)
+vhost-user-blk$(EXESUF): $(vhost-user-blk-obj-y) libvhost-user.a
+	$(call LINK, $^)

 module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak
 	$(call quiet-command,$(PYTHON) $< $@ \
@@ -578,7 +728,7 @@ clean:
 	rm -f trace/generated-tracers-dtrace.dtrace*
 	rm -f trace/generated-tracers-dtrace.h*
 	rm -f $(foreach f,$(GENERATED_FILES),$(f) $(f)-timestamp)
-	rm -rf qapi-generated
+	rm -f qapi-gen-timestamp
 	rm -rf qga/qapi-generated
 	for d in $(ALL_SUBDIRS); do \
 	if test -d $$d; then $(MAKE) -C $$d $@ || exit 1; fi; \
@@ -633,13 +783,14 @@ efi-e1000.rom efi-eepro100.rom efi-ne2k_pci.rom \
 efi-pcnet.rom efi-rtl8139.rom efi-virtio.rom \
 efi-e1000e.rom efi-vmxnet3.rom \
 qemu-icon.bmp qemu_logo_no_text.svg \
-bamboo.dtb petalogix-s3adsp1800.dtb petalogix-ml605.dtb \
+bamboo.dtb canyonlands.dtb petalogix-s3adsp1800.dtb petalogix-ml605.dtb \
 multiboot.bin linuxboot.bin linuxboot_dma.bin kvmvapic.bin \
 s390-ccw.img s390-netboot.img \
 spapr-rtas.bin slof.bin skiboot.lid \
 palcode-clipper \
-u-boot.e500 \
-qemu_vga.ndrv
+u-boot.e500 u-boot-sam460-20100605.bin \
+qemu_vga.ndrv \
+hppa-firmware.img
 else
 BLOBS=
 endif
@@ -704,7 +855,7 @@ ifneq ($(BLOBS),)
 		$(INSTALL_DATA) $(SRC_PATH)/pc-bios/$$x "$(DESTDIR)$(qemu_datadir)"; \
 	done
 endif
-ifeq ($(CONFIG_GTK),y)
+ifeq ($(CONFIG_GTK),m)
 	$(MAKE) -C po $@
 endif
 	$(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)/keymaps"
@@ -785,13 +936,11 @@ qemu-monitor-info.texi: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxt
 qemu-img-cmds.texi: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")

-docs/interop/qemu-qmp-qapi.texi docs/interop/qemu-ga-qapi.texi: $(SRC_PATH)/scripts/qapi2texi.py $(qapi-py)
+docs/interop/qemu-qmp-qapi.texi: qapi/qapi-doc.texi
+	@cp -p $< $@

-docs/interop/qemu-qmp-qapi.texi: $(qapi-modules)
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi2texi.py $< > $@,"GEN","$@")
-
-docs/interop/qemu-ga-qapi.texi: $(SRC_PATH)/qga/qapi-schema.json
-	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi2texi.py $< > $@,"GEN","$@")
+docs/interop/qemu-ga-qapi.texi: qga/qapi-generated/qga-qapi-doc.texi
+	@cp -p $< $@

 qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi
 qemu.1: qemu-option-trace.texi
@@ -933,4 +1082,5 @@ ifdef QEMU_GA_MSI_ENABLED
 endif
 	@echo  ''
 endif
-	@echo  '  make V=0|1 [targets] 0 => quiet build (default), 1 => verbose build'
+	@echo  '  $(MAKE) [targets]      (quiet build, default)'
+	@echo  '  $(MAKE) V=1 [targets]  (verbose build)'
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -2,7 +2,60 @@
 # Common libraries for tools and emulators
 stub-obj-y = stubs/ crypto/
 util-obj-y = util/ qobject/ qapi/
-util-obj-y += qmp-introspect.o qapi-types.o qapi-visit.o qapi-event.o
+util-obj-y += qapi/qapi-builtin-types.o
+util-obj-y += qapi/qapi-types.o
+util-obj-y += qapi/qapi-types-block-core.o
+util-obj-y += qapi/qapi-types-block.o
+util-obj-y += qapi/qapi-types-char.o
+util-obj-y += qapi/qapi-types-common.o
+util-obj-y += qapi/qapi-types-crypto.o
+util-obj-y += qapi/qapi-types-introspect.o
+util-obj-y += qapi/qapi-types-migration.o
+util-obj-y += qapi/qapi-types-misc.o
+util-obj-y += qapi/qapi-types-net.o
+util-obj-y += qapi/qapi-types-rocker.o
+util-obj-y += qapi/qapi-types-run-state.o
+util-obj-y += qapi/qapi-types-sockets.o
+util-obj-y += qapi/qapi-types-tpm.o
+util-obj-y += qapi/qapi-types-trace.o
+util-obj-y += qapi/qapi-types-transaction.o
+util-obj-y += qapi/qapi-types-ui.o
+util-obj-y += qapi/qapi-builtin-visit.o
+util-obj-y += qapi/qapi-visit.o
+util-obj-y += qapi/qapi-visit-block-core.o
+util-obj-y += qapi/qapi-visit-block.o
+util-obj-y += qapi/qapi-visit-char.o
+util-obj-y += qapi/qapi-visit-common.o
+util-obj-y += qapi/qapi-visit-crypto.o
+util-obj-y += qapi/qapi-visit-introspect.o
+util-obj-y += qapi/qapi-visit-migration.o
+util-obj-y += qapi/qapi-visit-misc.o
+util-obj-y += qapi/qapi-visit-net.o
+util-obj-y += qapi/qapi-visit-rocker.o
+util-obj-y += qapi/qapi-visit-run-state.o
+util-obj-y += qapi/qapi-visit-sockets.o
+util-obj-y += qapi/qapi-visit-tpm.o
+util-obj-y += qapi/qapi-visit-trace.o
+util-obj-y += qapi/qapi-visit-transaction.o
+util-obj-y += qapi/qapi-visit-ui.o
+util-obj-y += qapi/qapi-events.o
+util-obj-y += qapi/qapi-events-block-core.o
+util-obj-y += qapi/qapi-events-block.o
+util-obj-y += qapi/qapi-events-char.o
+util-obj-y += qapi/qapi-events-common.o
+util-obj-y += qapi/qapi-events-crypto.o
+util-obj-y += qapi/qapi-events-introspect.o
+util-obj-y += qapi/qapi-events-migration.o
+util-obj-y += qapi/qapi-events-misc.o
+util-obj-y += qapi/qapi-events-net.o
+util-obj-y += qapi/qapi-events-rocker.o
+util-obj-y += qapi/qapi-events-run-state.o
+util-obj-y += qapi/qapi-events-sockets.o
+util-obj-y += qapi/qapi-events-tpm.o
+util-obj-y += qapi/qapi-events-trace.o
+util-obj-y += qapi/qapi-events-transaction.o
+util-obj-y += qapi/qapi-events-ui.o
+util-obj-y += qapi/qapi-introspect.o

 chardev-obj-y = chardev/

@@ -51,11 +104,13 @@ common-obj-$(CONFIG_LINUX) += fsdev/
 common-obj-y += migration/

 common-obj-y += audio/
+common-obj-m += audio/
 common-obj-y += hw/

 common-obj-y += replay/

 common-obj-y += ui/
+common-obj-m += ui/
 common-obj-y += bt-host.o bt-vhci.o
 bt-host.o-cflags := $(BLUEZ_CFLAGS)

@@ -78,8 +133,24 @@ common-obj-$(CONFIG_FDT) += device_tree.o
 ######################################################################
 # qapi

-common-obj-y += qmp-marshal.o
-common-obj-y += qmp-introspect.o
+common-obj-y += qapi/qapi-commands.o
+common-obj-y += qapi/qapi-commands-block-core.o
+common-obj-y += qapi/qapi-commands-block.o
+common-obj-y += qapi/qapi-commands-char.o
+common-obj-y += qapi/qapi-commands-common.o
+common-obj-y += qapi/qapi-commands-crypto.o
+common-obj-y += qapi/qapi-commands-introspect.o
+common-obj-y += qapi/qapi-commands-migration.o
+common-obj-y += qapi/qapi-commands-misc.o
+common-obj-y += qapi/qapi-commands-net.o
+common-obj-y += qapi/qapi-commands-rocker.o
+common-obj-y += qapi/qapi-commands-run-state.o
+common-obj-y += qapi/qapi-commands-sockets.o
+common-obj-y += qapi/qapi-commands-tpm.o
+common-obj-y += qapi/qapi-commands-trace.o
+common-obj-y += qapi/qapi-commands-transaction.o
+common-obj-y += qapi/qapi-commands-ui.o
+common-obj-y += qapi/qapi-introspect.o
 common-obj-y += qmp.o hmp.o
 endif

@@ -102,8 +173,9 @@ target-obj-y += trace/
 ######################################################################
 # guest agent

-# FIXME: a few definitions from qapi-types.o/qapi-visit.o are needed
-# by libqemuutil.a.  These should be moved to a separate .json schema.
+# FIXME: a few definitions from qapi/qapi-types.o and
+# qapi/qapi-visit.o are needed by libqemuutil.a.  These should be
+# extracted into a QAPI schema module, or perhaps a separate schema.
 qga-obj-y = qga/
 qga-vss-dll-obj-y = qga/

@@ -115,6 +187,7 @@ libvhost-user-obj-y = contrib/libvhost-user/
 vhost-user-scsi.o-cflags := $(LIBISCSI_CFLAGS)
 vhost-user-scsi.o-libs := $(LIBISCSI_LIBS)
 vhost-user-scsi-obj-y = contrib/vhost-user-scsi/
+vhost-user-blk-obj-y = contrib/vhost-user-blk/

 ######################################################################
 trace-events-subdirs =
@@ -129,9 +202,12 @@ trace-events-subdirs += hw/block/dataplane
 trace-events-subdirs += hw/char
 trace-events-subdirs += hw/intc
 trace-events-subdirs += hw/net
+trace-events-subdirs += hw/rdma
+trace-events-subdirs += hw/rdma/vmw
 trace-events-subdirs += hw/virtio
 trace-events-subdirs += hw/audio
 trace-events-subdirs += hw/misc
+trace-events-subdirs += hw/misc/macio
 trace-events-subdirs += hw/usb
 trace-events-subdirs += hw/scsi
 trace-events-subdirs += hw/nvram
@@ -140,6 +216,7 @@ trace-events-subdirs += hw/input
 trace-events-subdirs += hw/timer
 trace-events-subdirs += hw/dma
 trace-events-subdirs += hw/sparc
+trace-events-subdirs += hw/sparc64
 trace-events-subdirs += hw/sd
 trace-events-subdirs += hw/isa
 trace-events-subdirs += hw/mem
@@ -148,13 +225,16 @@ trace-events-subdirs += hw/i386/xen
 trace-events-subdirs += hw/9pfs
 trace-events-subdirs += hw/ppc
 trace-events-subdirs += hw/pci
+trace-events-subdirs += hw/pci-host
 trace-events-subdirs += hw/s390x
 trace-events-subdirs += hw/vfio
 trace-events-subdirs += hw/acpi
 trace-events-subdirs += hw/arm
 trace-events-subdirs += hw/alpha
+trace-events-subdirs += hw/hppa
 trace-events-subdirs += hw/xen
 trace-events-subdirs += hw/ide
+trace-events-subdirs += hw/tpm
 trace-events-subdirs += ui
 trace-events-subdirs += audio
 trace-events-subdirs += net
--- a/Makefile.target
+++ b/Makefile.target
@@ -93,8 +93,8 @@ all: $(PROGS) stap
 # cpu emulator library
 obj-y += exec.o
 obj-y += accel/
-obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o
-obj-$(CONFIG_TCG) += tcg/tcg-common.o
+obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o tcg/tcg-op-gvec.o
+obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/optimize.o
 obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
 obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
 obj-y += fpu/softfloat.o
--- a/35
+++ b/35
@@ -56,7 +56,7 @@ The QEMU source code is maintained under the GIT version control system.

   git clone git://git.qemu.org/qemu.git

-When submitting patches, the preferred approach is to use 'git
+When submitting patches, one common approach is to use 'git
 format-patch' and/or 'git send-email' to format & send the mail to the
 qemu-devel@nongnu.org mailing list. All patches submitted must contain
 a 'Signed-off-by' line from the author. Patches should follow the
@@ -68,6 +68,39 @@ the QEMU website
  https://qemu.org/Contribute/SubmitAPatch
  https://qemu.org/Contribute/TrivialPatches

+The QEMU website is also maintained under source control.
+
+  git clone git://git.qemu.org/qemu-web.git
+  https://www.qemu.org/2017/02/04/the-new-qemu-website-is-up/
+
+A 'git-publish' utility was created to make above process less
+cumbersome, and is highly recommended for making regular contributions,
+or even just for sending consecutive patch series revisions. It also
+requires a working 'git send-email' setup, and by default doesn't
+automate everything, so you may want to go through the above steps
+manually for once.
+
+For installation instructions, please go to
+
+  https://github.com/stefanha/git-publish
+
+The workflow with 'git-publish' is:
+
+  $ git checkout master -b my-feature
+  $ # work on new commits, add your 'Signed-off-by' lines to each
+  $ git publish
+
+Your patch series will be sent and tagged as my-feature-v1 if you need to refer
+back to it in the future.
+
+Sending v2:
+
+  $ git checkout my-feature # same topic branch
+  $ # making changes to the commits (using 'git rebase', for example)
+  $ git publish
+
+Your patch series will be sent with 'v2' tag in the subject and the git tip
+will be tagged as my-feature-v2.

 Bug reporting
 =============
--- a/2
+++ b/2
@@ -1 +1 @@
-2.11.0
+2.11.50
--- a/accel/accel.c
+++ b/accel/accel.c
@@ -26,7 +26,6 @@
 #include "qemu/osdep.h"
 #include "sysemu/accel.h"
 #include "hw/boards.h"
-#include "qemu-common.h"
 #include "sysemu/arch_init.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/kvm.h"
@@ -34,6 +33,7 @@
 #include "hw/xen/xen.h"
 #include "qom/object.h"
 #include "qemu/error-report.h"
+#include "qemu/option.h"

 static const TypeInfo accel_type = {
    .name = TYPE_ACCEL,
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -235,6 +235,7 @@ static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
 {
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region mem;
+    int ret;

    mem.slot = slot->slot | (kml->as_id << 16);
    mem.guest_phys_addr = slot->start_addr;
@@ -248,7 +249,10 @@ static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
        kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    }
    mem.memory_size = slot->memory_size;
-    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
+    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
+    trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
+                              mem.memory_size, mem.userspace_addr, ret);
+    return ret;
 }

 int kvm_destroy_vcpu(CPUState *cpu)
--- a/accel/kvm/trace-events
+++ b/accel/kvm/trace-events
@@ -12,4 +12,5 @@ kvm_irqchip_commit_routes(void) ""
 kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d"
 kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d"
 kvm_irqchip_release_virq(int virq) "virq %d"
+kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"

--- a/accel/stubs/Makefile.objs
+++ b/accel/stubs/Makefile.objs
@@ -1,3 +1,5 @@
 obj-$(call lnot,$(CONFIG_HAX))  += hax-stub.o
+obj-$(call lnot,$(CONFIG_HVF))  += hvf-stub.o
+obj-$(call lnot,$(CONFIG_WHPX)) += whpx-stub.o
 obj-$(call lnot,$(CONFIG_KVM))  += kvm-stub.o
 obj-$(call lnot,$(CONFIG_TCG))  += tcg-stub.o
--- a/accel/stubs/hvf-stub.c
+++ b/accel/stubs/hvf-stub.c
@@ -0,0 +1,31 @@
+/*
+ * QEMU HVF support
+ *
+ * Copyright 2017 Red Hat, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2 or later, as published by the Free Software Foundation,
+ * and may be copied, distributed, and modified under those terms.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "cpu.h"
+#include "sysemu/hvf.h"
+
+int hvf_init_vcpu(CPUState *cpu)
+{
+    return -ENOSYS;
+}
+
+int hvf_vcpu_exec(CPUState *cpu)
+{
+    return -ENOSYS;
+}
+
+void hvf_vcpu_destroy(CPUState *cpu)
+{
+}
--- a/accel/stubs/whpx-stub.c
+++ b/accel/stubs/whpx-stub.c
@@ -0,0 +1,48 @@
+/*
+ * QEMU Windows Hypervisor Platform accelerator (WHPX) stub
+ *
+ * Copyright Microsoft Corp. 2017
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "cpu.h"
+#include "sysemu/whpx.h"
+
+int whpx_init_vcpu(CPUState *cpu)
+{
+    return -1;
+}
+
+int whpx_vcpu_exec(CPUState *cpu)
+{
+    return -1;
+}
+
+void whpx_destroy_vcpu(CPUState *cpu)
+{
+}
+
+void whpx_vcpu_kick(CPUState *cpu)
+{
+}
+
+void whpx_cpu_synchronize_state(CPUState *cpu)
+{
+}
+
+void whpx_cpu_synchronize_post_reset(CPUState *cpu)
+{
+}
+
+void whpx_cpu_synchronize_post_init(CPUState *cpu)
+{
+}
+
+void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
+{
+}
--- a/accel/tcg/Makefile.objs
+++ b/accel/tcg/Makefile.objs
@@ -1,6 +1,6 @@
 obj-$(CONFIG_SOFTMMU) += tcg-all.o
 obj-$(CONFIG_SOFTMMU) += cputlb.o
-obj-y += tcg-runtime.o
+obj-y += tcg-runtime.o tcg-runtime-gvec.o
 obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
 obj-y += translator.o

--- a/accel/tcg/cpu-exec-common.c
+++ b/accel/tcg/cpu-exec-common.c
@@ -21,7 +21,6 @@
 #include "cpu.h"
 #include "sysemu/cpus.h"
 #include "exec/exec-all.h"
-#include "exec/memory-internal.h"

 bool tcg_allowed;

--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -146,8 +146,10 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
    uint8_t *tb_ptr = itb->tc.ptr;

    qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
-                           "Trace %p [%d: " TARGET_FMT_lx "] %s\n",
-                           itb->tc.ptr, cpu->cpu_index, itb->pc,
+                           "Trace %d: %p ["
+                           TARGET_FMT_lx "/" TARGET_FMT_lx "/%#x] %s\n",
+                           cpu->cpu_index, itb->tc.ptr,
+                           itb->cs_base, itb->pc, itb->flags,
                           lookup_symbol(itb->pc));

 #if defined(DEBUG_DISAS)
@@ -525,19 +527,13 @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
                                        TranslationBlock **last_tb)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
-    int32_t insns_left;

    /* Clear the interrupt flag now since we're processing
     * cpu->interrupt_request and cpu->exit_request.
+     * Ensure zeroing happens before reading cpu->exit_request or
+     * cpu->interrupt_request (see also smp_wmb in cpu_exit())
     */
-    insns_left = atomic_read(&cpu->icount_decr.u32);
-    atomic_set(&cpu->icount_decr.u16.high, 0);
-    if (unlikely(insns_left < 0)) {
-        /* Ensure the zeroing of icount_decr comes before the next read
-         * of cpu->exit_request or cpu->interrupt_request.
-         */
-        smp_mb();
-    }
+    atomic_mb_set(&cpu->icount_decr.u16.high, 0);

    if (unlikely(atomic_read(&cpu->interrupt_request))) {
        int interrupt_request;
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -880,7 +880,7 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
    if (unlikely(env->tlb_table[mmu_idx][index].addr_code !=
                 (addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK)))) {
        if (!VICTIM_TLB_HIT(addr_read, addr)) {
-            tlb_fill(ENV_GET_CPU(env), addr, MMU_INST_FETCH, mmu_idx, 0);
+            tlb_fill(ENV_GET_CPU(env), addr, 0, MMU_INST_FETCH, mmu_idx, 0);
        }
    }
    iotlbentry = &env->iotlb[mmu_idx][index];
@@ -928,7 +928,7 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
 * Otherwise the function will return, and there will be a valid
 * entry in the TLB for this access.
 */
-void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
+void probe_write(CPUArchState *env, target_ulong addr, int size, int mmu_idx,
                 uintptr_t retaddr)
 {
    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
@@ -938,7 +938,8 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
        /* TLB entry is for a different page */
        if (!VICTIM_TLB_HIT(addr_write, addr)) {
-            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+            tlb_fill(ENV_GET_CPU(env), addr, size, MMU_DATA_STORE,
+                     mmu_idx, retaddr);
        }
    }
 }
@@ -981,7 +982,8 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
    if ((addr & TARGET_PAGE_MASK)
        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
        if (!VICTIM_TLB_HIT(addr_write, addr)) {
-            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+            tlb_fill(ENV_GET_CPU(env), addr, 1 << s_bits, MMU_DATA_STORE,
+                     mmu_idx, retaddr);
        }
        tlb_addr = tlbe->addr_write & ~TLB_INVALID_MASK;
    }
@@ -995,7 +997,8 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,

    /* Let the guest notice RMW on a write-only page.  */
    if (unlikely(tlbe->addr_read != (tlb_addr & ~TLB_NOTDIRTY))) {
-        tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_LOAD, mmu_idx, retaddr);
+        tlb_fill(ENV_GET_CPU(env), addr, 1 << s_bits, MMU_DATA_LOAD,
+                 mmu_idx, retaddr);
        /* Since we don't support reads and writes to different addresses,
           and we do have the proper page loaded for write, this shouldn't
           ever return.  But just in case, handle via stop-the-world.  */
--- a/accel/tcg/softmmu_template.h
+++ b/accel/tcg/softmmu_template.h
@@ -124,7 +124,7 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
    if ((addr & TARGET_PAGE_MASK)
         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
        if (!VICTIM_TLB_HIT(ADDR_READ, addr)) {
-            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+            tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, READ_ACCESS_TYPE,
                     mmu_idx, retaddr);
        }
        tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
@@ -191,7 +191,7 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
    if ((addr & TARGET_PAGE_MASK)
         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
        if (!VICTIM_TLB_HIT(ADDR_READ, addr)) {
-            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+            tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, READ_ACCESS_TYPE,
                     mmu_idx, retaddr);
        }
        tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
@@ -283,7 +283,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
    if ((addr & TARGET_PAGE_MASK)
        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
        if (!VICTIM_TLB_HIT(addr_write, addr)) {
-            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+            tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, MMU_DATA_STORE,
+                     mmu_idx, retaddr);
        }
        tlb_addr = env->tlb_table[mmu_idx][index].addr_write & ~TLB_INVALID_MASK;
    }
@@ -316,7 +317,7 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
        tlb_addr2 = env->tlb_table[mmu_idx][index2].addr_write;
        if (page2 != (tlb_addr2 & (TARGET_PAGE_MASK | TLB_INVALID_MASK))
            && !VICTIM_TLB_HIT(addr_write, page2)) {
-            tlb_fill(ENV_GET_CPU(env), page2, MMU_DATA_STORE,
+            tlb_fill(ENV_GET_CPU(env), page2, DATA_SIZE, MMU_DATA_STORE,
                     mmu_idx, retaddr);
        }

@@ -359,7 +360,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
    if ((addr & TARGET_PAGE_MASK)
        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
        if (!VICTIM_TLB_HIT(addr_write, addr)) {
-            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+            tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, MMU_DATA_STORE,
+                     mmu_idx, retaddr);
        }
        tlb_addr = env->tlb_table[mmu_idx][index].addr_write & ~TLB_INVALID_MASK;
    }
@@ -392,7 +394,7 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
        tlb_addr2 = env->tlb_table[mmu_idx][index2].addr_write;
        if (page2 != (tlb_addr2 & (TARGET_PAGE_MASK | TLB_INVALID_MASK))
            && !VICTIM_TLB_HIT(addr_write, page2)) {
-            tlb_fill(ENV_GET_CPU(env), page2, MMU_DATA_STORE,
+            tlb_fill(ENV_GET_CPU(env), page2, DATA_SIZE, MMU_DATA_STORE,
                     mmu_idx, retaddr);
        }

--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -0,0 +1,997 @@
+/*
+ * Generic vectorized operation runtime
+ *
+ * Copyright (c) 2018 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "cpu.h"
+#include "exec/helper-proto.h"
+#include "tcg-gvec-desc.h"
+
+
+/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
+ * them via GCC's generic vector extension.  This turns out to be simpler and
+ * more reliable than getting the compiler to autovectorize.
+ *
+ * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
+ * are multiples of 16.
+ *
+ * When the compiler does not support all of the operations we require, the
+ * loops are written so that we can always fall back on the base types.
+ */
+#ifdef CONFIG_VECTOR16
+typedef uint8_t vec8 __attribute__((vector_size(16)));
+typedef uint16_t vec16 __attribute__((vector_size(16)));
+typedef uint32_t vec32 __attribute__((vector_size(16)));
+typedef uint64_t vec64 __attribute__((vector_size(16)));
+
+typedef int8_t svec8 __attribute__((vector_size(16)));
+typedef int16_t svec16 __attribute__((vector_size(16)));
+typedef int32_t svec32 __attribute__((vector_size(16)));
+typedef int64_t svec64 __attribute__((vector_size(16)));
+
+#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
+#define DUP8(X)   { X, X, X, X, X, X, X, X }
+#define DUP4(X)   { X, X, X, X }
+#define DUP2(X)   { X, X }
+#else
+typedef uint8_t vec8;
+typedef uint16_t vec16;
+typedef uint32_t vec32;
+typedef uint64_t vec64;
+
+typedef int8_t svec8;
+typedef int16_t svec16;
+typedef int32_t svec32;
+typedef int64_t svec64;
+
+#define DUP16(X)  X
+#define DUP8(X)   X
+#define DUP4(X)   X
+#define DUP2(X)   X
+#endif /* CONFIG_VECTOR16 */
+
+static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
+{
+    intptr_t maxsz = simd_maxsz(desc);
+    intptr_t i;
+
+    if (unlikely(maxsz > oprsz)) {
+        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
+            *(uint64_t *)(d + i) = 0;
+        }
+    }
+}
+
+void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec8 vecb = (vec8)DUP16(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec16 vecb = (vec16)DUP8(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec32 vecb = (vec32)DUP4(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec64 vecb = (vec64)DUP2(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec8 vecb = (vec8)DUP16(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec16 vecb = (vec16)DUP8(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec32 vecb = (vec32)DUP4(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec64 vecb = (vec64)DUP2(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec8 vecb = (vec8)DUP16(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec16 vecb = (vec16)DUP8(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec32 vecb = (vec32)DUP4(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec64 vecb = (vec64)DUP2(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+
+    memcpy(d, a, oprsz);
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    if (c == 0) {
+        oprsz = 0;
+    } else {
+        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+            *(uint64_t *)(d + i) = c;
+        }
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    if (c == 0) {
+        oprsz = 0;
+    } else {
+        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+            *(uint32_t *)(d + i) = c;
+        }
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
+{
+    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
+}
+
+void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
+{
+    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
+}
+
+void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec64 vecb = (vec64)DUP2(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec64 vecb = (vec64)DUP2(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    vec64 vecb = (vec64)DUP2(b);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    int shift = simd_data(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+/* If vectors are enabled, the compiler fills in -1 for true.
+   Otherwise, we must take care of this by hand.  */
+#ifdef CONFIG_VECTOR16
+# define DO_CMP0(X)  X
+#else
+# define DO_CMP0(X)  -(X)
+#endif
+
+#define DO_CMP1(NAME, TYPE, OP)                                            \
+void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
+{                                                                          \
+    intptr_t oprsz = simd_oprsz(desc);                                     \
+    intptr_t i;                                                            \
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {                           \
+        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
+    }                                                                      \
+    clear_high(d, oprsz, desc);                                            \
+}
+
+#define DO_CMP2(SZ) \
+    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)    \
+    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)    \
+    DO_CMP1(gvec_lt##SZ, svec##SZ, <)    \
+    DO_CMP1(gvec_le##SZ, svec##SZ, <=)   \
+    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)    \
+    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)
+
+DO_CMP2(8)
+DO_CMP2(16)
+DO_CMP2(32)
+DO_CMP2(64)
+
+#undef DO_CMP0
+#undef DO_CMP1
+#undef DO_CMP2
+
+void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
+        int r = *(int8_t *)(a + i) + *(int8_t *)(b + i);
+        if (r > INT8_MAX) {
+            r = INT8_MAX;
+        } else if (r < INT8_MIN) {
+            r = INT8_MIN;
+        }
+        *(int8_t *)(d + i) = r;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_ssadd16)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
+        int r = *(int16_t *)(a + i) + *(int16_t *)(b + i);
+        if (r > INT16_MAX) {
+            r = INT16_MAX;
+        } else if (r < INT16_MIN) {
+            r = INT16_MIN;
+        }
+        *(int16_t *)(d + i) = r;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
+        int32_t ai = *(int32_t *)(a + i);
+        int32_t bi = *(int32_t *)(b + i);
+        int32_t di = ai + bi;
+        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
+            /* Signed overflow.  */
+            di = (di < 0 ? INT32_MAX : INT32_MIN);
+        }
+        *(int32_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
+        int64_t ai = *(int64_t *)(a + i);
+        int64_t bi = *(int64_t *)(b + i);
+        int64_t di = ai + bi;
+        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
+            /* Signed overflow.  */
+            di = (di < 0 ? INT64_MAX : INT64_MIN);
+        }
+        *(int64_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sssub8)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        int r = *(int8_t *)(a + i) - *(int8_t *)(b + i);
+        if (r > INT8_MAX) {
+            r = INT8_MAX;
+        } else if (r < INT8_MIN) {
+            r = INT8_MIN;
+        }
+        *(uint8_t *)(d + i) = r;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sssub16)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
+        int r = *(int16_t *)(a + i) - *(int16_t *)(b + i);
+        if (r > INT16_MAX) {
+            r = INT16_MAX;
+        } else if (r < INT16_MIN) {
+            r = INT16_MIN;
+        }
+        *(int16_t *)(d + i) = r;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
+        int32_t ai = *(int32_t *)(a + i);
+        int32_t bi = *(int32_t *)(b + i);
+        int32_t di = ai - bi;
+        if (((di ^ ai) & (ai ^ bi)) < 0) {
+            /* Signed overflow.  */
+            di = (di < 0 ? INT32_MAX : INT32_MIN);
+        }
+        *(int32_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
+        int64_t ai = *(int64_t *)(a + i);
+        int64_t bi = *(int64_t *)(b + i);
+        int64_t di = ai - bi;
+        if (((di ^ ai) & (ai ^ bi)) < 0) {
+            /* Signed overflow.  */
+            di = (di < 0 ? INT64_MAX : INT64_MIN);
+        }
+        *(int64_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_usadd8)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        unsigned r = *(uint8_t *)(a + i) + *(uint8_t *)(b + i);
+        if (r > UINT8_MAX) {
+            r = UINT8_MAX;
+        }
+        *(uint8_t *)(d + i) = r;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_usadd16)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        unsigned r = *(uint16_t *)(a + i) + *(uint16_t *)(b + i);
+        if (r > UINT16_MAX) {
+            r = UINT16_MAX;
+        }
+        *(uint16_t *)(d + i) = r;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        uint32_t ai = *(uint32_t *)(a + i);
+        uint32_t bi = *(uint32_t *)(b + i);
+        uint32_t di = ai + bi;
+        if (di < ai) {
+            di = UINT32_MAX;
+        }
+        *(uint32_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        uint64_t ai = *(uint64_t *)(a + i);
+        uint64_t bi = *(uint64_t *)(b + i);
+        uint64_t di = ai + bi;
+        if (di < ai) {
+            di = UINT64_MAX;
+        }
+        *(uint64_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_ussub8)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        int r = *(uint8_t *)(a + i) - *(uint8_t *)(b + i);
+        if (r < 0) {
+            r = 0;
+        }
+        *(uint8_t *)(d + i) = r;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_ussub16)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        int r = *(uint16_t *)(a + i) - *(uint16_t *)(b + i);
+        if (r < 0) {
+            r = 0;
+        }
+        *(uint16_t *)(d + i) = r;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        uint32_t ai = *(uint32_t *)(a + i);
+        uint32_t bi = *(uint32_t *)(b + i);
+        uint32_t di = ai - bi;
+        if (ai < bi) {
+            di = 0;
+        }
+        *(uint32_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        uint64_t ai = *(uint64_t *)(a + i);
+        uint64_t bi = *(uint64_t *)(b + i);
+        uint64_t di = ai - bi;
+        if (ai < bi) {
+            di = 0;
+        }
+        *(uint64_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -156,8 +156,9 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env)
        return tcg_ctx->code_gen_epilogue;
    }
    qemu_log_mask_and_addr(CPU_LOG_EXEC, pc,
-                           "Chain %p [%d: " TARGET_FMT_lx "] %s\n",
-                           tb->tc.ptr, cpu->cpu_index, pc,
+                           "Chain %d: %p ["
+                           TARGET_FMT_lx "/" TARGET_FMT_lx "/%#x] %s\n",
+                           cpu->cpu_index, tb->tc.ptr, cs_base, pc, flags,
                           lookup_symbol(pc));
    return tb->tc.ptr;
 }
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -134,3 +134,121 @@ GEN_ATOMIC_HELPERS(xor_fetch)
 GEN_ATOMIC_HELPERS(xchg)

 #undef GEN_ATOMIC_HELPERS
+
+DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_dup8, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup16, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup32, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup64, TCG_CALL_NO_RWG, void, ptr, i32, i64)
+
+DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_adds8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_adds16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_adds32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_adds64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_subs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_subs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_subs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_subs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_mul8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_mul16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_mul32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_mul64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_muls8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_muls16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_muls32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_muls64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ssadd8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ssadd16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ssadd32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ssadd64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sssub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sssub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sssub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sssub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_usadd8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_usadd16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_usadd32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_usadd64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ussub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ussub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ussub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ussub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_3(gvec_shl8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shl16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shl32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shl64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_shr8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shr16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shr32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shr64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_sar8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sar16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sar32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sar64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_eq8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_eq16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_eq32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_eq64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ne8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ne16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ne32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ne64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_lt8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_lt16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_lt32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_lt64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_le8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_le16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_le32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_le64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ltu8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ltu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ltu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ltu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_leu8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -31,7 +31,6 @@
 #include "tcg.h"
 #if defined(CONFIG_USER_ONLY)
 #include "qemu.h"
-#include "exec/exec-all.h"
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 #include <sys/param.h>
 #if __FreeBSD_version >= 700104
@@ -257,7 +256,7 @@ static target_long decode_sleb128(uint8_t **pp)
 /* Encode the data collected about the instructions while compiling TB.
   Place the data at BLOCK, and return the number of bytes consumed.

-   The logical table consisits of TARGET_INSN_START_WORDS target_ulong's,
+   The logical table consists of TARGET_INSN_START_WORDS target_ulong's,
   which come from the target's insn_start data, followed by a uintptr_t
   which comes from the host pc of the end of the code implementing the insn.

@@ -2182,13 +2181,25 @@ int page_unprotect(target_ulong address, uintptr_t pc)

    /* if the page was really writable, then we change its
       protection back to writable */
-    if ((p->flags & PAGE_WRITE_ORG) && !(p->flags & PAGE_WRITE)) {
+    if (p->flags & PAGE_WRITE_ORG) {
+        current_tb_invalidated = false;
+        if (p->flags & PAGE_WRITE) {
+            /* If the page is actually marked WRITE then assume this is because
+             * this thread raced with another one which got here first and
+             * set the page to PAGE_WRITE and did the TB invalidate for us.
+             */
+#ifdef TARGET_HAS_PRECISE_SMC
+            TranslationBlock *current_tb = tb_find_pc(pc);
+            if (current_tb) {
+                current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
+            }
+#endif
+        } else {
            host_start = address & qemu_host_page_mask;
            host_end = host_start + qemu_host_page_size;

            prot = 0;
-        current_tb_invalidated = false;
-        for (addr = host_start ; addr < host_end ; addr += TARGET_PAGE_SIZE) {
+            for (addr = host_start; addr < host_end; addr += TARGET_PAGE_SIZE) {
                p = page_find(addr >> TARGET_PAGE_BITS);
                p->flags |= PAGE_WRITE;
                prot |= p->flags;
@@ -2204,7 +2215,7 @@ int page_unprotect(target_ulong address, uintptr_t pc)
            }
            mprotect((void *)g2h(host_start), qemu_host_page_size,
                     prot & PAGE_BITS);
-
+        }
        mmap_unlock();
        /* If current TB was invalidated return to main loop */
        return current_tb_invalidated ? 2 : 1;
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -57,12 +57,13 @@ static void cpu_exit_tb_from_sighandler(CPUState *cpu, sigset_t *old_set)
   the effective address of the memory exception. 'is_write' is 1 if a
   write caused the exception and otherwise 0'. 'old_set' is the
   signal set which should be restored */
-static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
+static inline int handle_cpu_signal(uintptr_t pc, siginfo_t *info,
                                    int is_write, sigset_t *old_set)
 {
    CPUState *cpu = current_cpu;
    CPUClass *cc;
    int ret;
+    unsigned long address = (unsigned long)info->si_addr;

    /* We must handle PC addresses from two different sources:
     * a call return address and a signal frame address.
@@ -103,7 +104,18 @@ static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
           pc, address, is_write, *(unsigned long *)old_set);
 #endif
    /* XXX: locking issue */
-    if (is_write && h2g_valid(address)) {
+    /* Note that it is important that we don't call page_unprotect() unless
+     * this is really a "write to nonwriteable page" fault, because
+     * page_unprotect() assumes that if it is called for an access to
+     * a page that's writeable this means we had two threads racing and
+     * another thread got there first and already made the page writeable;
+     * so we will retry the access. If we were to call page_unprotect()
+     * for some other kind of fault that should really be passed to the
+     * guest, we'd end up in an infinite loop of retrying the faulting
+     * access.
+     */
+    if (is_write && info->si_signo == SIGSEGV && info->si_code == SEGV_ACCERR &&
+        h2g_valid(address)) {
        switch (page_unprotect(h2g(address), pc)) {
        case 0:
            /* Fault not caused by a page marked unwritable to protect
@@ -137,7 +149,7 @@ static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
    cc = CPU_GET_CLASS(cpu);
    /* see if it is an MMU fault */
    g_assert(cc->handle_mmu_fault);
-    ret = cc->handle_mmu_fault(cpu, address, is_write, MMU_USER_IDX);
+    ret = cc->handle_mmu_fault(cpu, address, 0, is_write, MMU_USER_IDX);

    if (ret == 0) {
        /* The MMU fault was handled without causing real CPU fault.
@@ -215,9 +227,8 @@ int cpu_signal_handler(int host_signum, void *pinfo,
 #endif
    pc = EIP_sig(uc);
    trapno = TRAP_sig(uc);
-    return handle_cpu_signal(pc, (unsigned long)info->si_addr,
-                             trapno == 0xe ?
-                             (ERROR_sig(uc) >> 1) & 1 : 0,
+    return handle_cpu_signal(pc, info,
+                             trapno == 0xe ? (ERROR_sig(uc) >> 1) & 1 : 0,
                             &MASK_sig(uc));
 }

@@ -261,9 +272,8 @@ int cpu_signal_handler(int host_signum, void *pinfo,
 #endif

    pc = PC_sig(uc);
-    return handle_cpu_signal(pc, (unsigned long)info->si_addr,
-                             TRAP_sig(uc) == 0xe ?
-                             (ERROR_sig(uc) >> 1) & 1 : 0,
+    return handle_cpu_signal(pc, info,
+                             TRAP_sig(uc) == 0xe ? (ERROR_sig(uc) >> 1) & 1 : 0,
                             &MASK_sig(uc));
 }

@@ -341,8 +351,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
        is_write = 1;
    }
 #endif
-    return handle_cpu_signal(pc, (unsigned long)info->si_addr,
-                             is_write, &uc->uc_sigmask);
+    return handle_cpu_signal(pc, info, is_write, &uc->uc_sigmask);
 }

 #elif defined(__alpha__)
@@ -372,8 +381,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
        is_write = 1;
    }

-    return handle_cpu_signal(pc, (unsigned long)info->si_addr,
-                             is_write, &uc->uc_sigmask);
+    return handle_cpu_signal(pc, info, is_write, &uc->uc_sigmask);
 }
 #elif defined(__sparc__)

@@ -432,8 +440,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
            break;
        }
    }
-    return handle_cpu_signal(pc, (unsigned long)info->si_addr,
-                             is_write, sigmask);
+    return handle_cpu_signal(pc, info, is_write, sigmask);
 }

 #elif defined(__arm__)
@@ -466,9 +473,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
     * later processor; on v5 we will always report this as a read).
     */
    is_write = extract32(uc->uc_mcontext.error_code, 11, 1);
-    return handle_cpu_signal(pc, (unsigned long)info->si_addr,
-                             is_write,
-                             &uc->uc_sigmask);
+    return handle_cpu_signal(pc, info, is_write, &uc->uc_sigmask);
 }

 #elif defined(__aarch64__)
@@ -495,43 +500,7 @@ int cpu_signal_handler(int host_signum, void *pinfo, void *puc)
                /* Ignore bits 23 & 24, controlling indexing.  */
                || (insn & 0x3a400000) == 0x28000000); /* C3.3.7,14-16 */

-    return handle_cpu_signal(pc, (uintptr_t)info->si_addr,
-                             is_write, &uc->uc_sigmask);
-}
-
-#elif defined(__ia64)
-
-#ifndef __ISR_VALID
-  /* This ought to be in <bits/siginfo.h>... */
-# define __ISR_VALID    1
-#endif
-
-int cpu_signal_handler(int host_signum, void *pinfo, void *puc)
-{
-    siginfo_t *info = pinfo;
-    ucontext_t *uc = puc;
-    unsigned long ip;
-    int is_write = 0;
-
-    ip = uc->uc_mcontext.sc_ip;
-    switch (host_signum) {
-    case SIGILL:
-    case SIGFPE:
-    case SIGSEGV:
-    case SIGBUS:
-    case SIGTRAP:
-        if (info->si_code && (info->si_segvflags & __ISR_VALID)) {
-            /* ISR.W (write-access) is bit 33:  */
-            is_write = (info->si_isr >> 33) & 1;
-        }
-        break;
-
-    default:
-        break;
-    }
-    return handle_cpu_signal(ip, (unsigned long)info->si_addr,
-                             is_write,
-                             (sigset_t *)&uc->uc_sigmask);
+    return handle_cpu_signal(pc, info, is_write, &uc->uc_sigmask);
 }

 #elif defined(__s390__)
@@ -583,8 +552,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
        }
        break;
    }
-    return handle_cpu_signal(pc, (unsigned long)info->si_addr,
-                             is_write, &uc->uc_sigmask);
+    return handle_cpu_signal(pc, info, is_write, &uc->uc_sigmask);
 }

 #elif defined(__mips__)
@@ -599,8 +567,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,

    /* XXX: compute is_write */
    is_write = 0;
-    return handle_cpu_signal(pc, (unsigned long)info->si_addr,
-                             is_write, &uc->uc_sigmask);
+    return handle_cpu_signal(pc, info, is_write, &uc->uc_sigmask);
 }

 #else
--- a/arch_init.c
+++ b/arch_init.c
@@ -28,9 +28,9 @@
 #include "sysemu/arch_init.h"
 #include "hw/pci/pci.h"
 #include "hw/audio/soundhw.h"
+#include "qapi/qapi-commands-misc.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
-#include "qmp-commands.h"
 #include "hw/acpi/acpi.h"
 #include "qemu/help_option.h"

@@ -53,6 +53,8 @@ int graphic_depth = 32;
 #define QEMU_ARCH QEMU_ARCH_CRIS
 #elif defined(TARGET_I386)
 #define QEMU_ARCH QEMU_ARCH_I386
+#elif defined(TARGET_HPPA)
+#define QEMU_ARCH QEMU_ARCH_HPPA
 #elif defined(TARGET_M68K)
 #define QEMU_ARCH QEMU_ARCH_M68K
 #elif defined(TARGET_LM32)
@@ -69,6 +71,8 @@ int graphic_depth = 32;
 #define QEMU_ARCH QEMU_ARCH_OPENRISC
 #elif defined(TARGET_PPC)
 #define QEMU_ARCH QEMU_ARCH_PPC
+#elif defined(TARGET_RISCV)
+#define QEMU_ARCH QEMU_ARCH_RISCV
 #elif defined(TARGET_S390X)
 #define QEMU_ARCH QEMU_ARCH_S390X
 #elif defined(TARGET_SH4)
--- a/audio/Makefile.objs
+++ b/audio/Makefile.objs
@@ -1,19 +1,31 @@
 common-obj-y = audio.o noaudio.o wavaudio.o mixeng.o
-common-obj-$(CONFIG_SDL) += sdlaudio.o
-common-obj-$(CONFIG_OSS) += ossaudio.o
 common-obj-$(CONFIG_SPICE) += spiceaudio.o
-common-obj-$(CONFIG_COREAUDIO) += coreaudio.o
-common-obj-$(CONFIG_ALSA) += alsaaudio.o
-common-obj-$(CONFIG_DSOUND) += dsoundaudio.o
-common-obj-$(CONFIG_PA) += paaudio.o
+common-obj-$(CONFIG_AUDIO_COREAUDIO) += coreaudio.o
+common-obj-$(CONFIG_AUDIO_DSOUND) += dsoundaudio.o
 common-obj-$(CONFIG_AUDIO_PT_INT) += audio_pt_int.o
 common-obj-$(CONFIG_AUDIO_WIN_INT) += audio_win_int.o
 common-obj-y += wavcapture.o

-sdlaudio.o-cflags := $(SDL_CFLAGS)
-sdlaudio.o-libs := $(SDL_LIBS)
-alsaaudio.o-libs := $(ALSA_LIBS)
-paaudio.o-libs := $(PULSE_LIBS)
 coreaudio.o-libs := $(COREAUDIO_LIBS)
 dsoundaudio.o-libs := $(DSOUND_LIBS)
-ossaudio.o-libs := $(OSS_LIBS)
+
+# alsa module
+common-obj-$(CONFIG_AUDIO_ALSA) += alsa.mo
+alsa.mo-objs = alsaaudio.o
+alsa.mo-libs := $(ALSA_LIBS)
+
+# oss module
+common-obj-$(CONFIG_AUDIO_OSS) += oss.mo
+oss.mo-objs = ossaudio.o
+oss.mo-libs := $(OSS_LIBS)
+
+# pulseaudio module
+common-obj-$(CONFIG_AUDIO_PA) += pa.mo
+pa.mo-objs = paaudio.o
+pa.mo-libs := $(PULSE_LIBS)
+
+# sdl module
+common-obj-$(CONFIG_AUDIO_SDL) += sdl.mo
+sdl.mo-objs = sdlaudio.o
+sdl.mo-cflags := $(SDL_CFLAGS)
+sdl.mo-libs := $(SDL_LIBS)
--- a/audio/alsaaudio.c
+++ b/audio/alsaaudio.c
@@ -823,7 +823,7 @@ static int alsa_init_out(HWVoiceOut *hw, struct audsettings *as,
    audio_pcm_init_info (&hw->info, &obt_as);
    hw->samples = obt.samples;

-    alsa->pcm_buf = audio_calloc (AUDIO_FUNC, obt.samples, 1 << hw->info.shift);
+    alsa->pcm_buf = audio_calloc(__func__, obt.samples, 1 << hw->info.shift);
    if (!alsa->pcm_buf) {
        dolog ("Could not allocate DAC buffer (%d samples, each %d bytes)\n",
               hw->samples, 1 << hw->info.shift);
@@ -934,7 +934,7 @@ static int alsa_init_in(HWVoiceIn *hw, struct audsettings *as, void *drv_opaque)
    audio_pcm_init_info (&hw->info, &obt_as);
    hw->samples = obt.samples;

-    alsa->pcm_buf = audio_calloc (AUDIO_FUNC, hw->samples, 1 << hw->info.shift);
+    alsa->pcm_buf = audio_calloc(__func__, hw->samples, 1 << hw->info.shift);
    if (!alsa->pcm_buf) {
        dolog ("Could not allocate ADC buffer (%d samples, each %d bytes)\n",
               hw->samples, 1 << hw->info.shift);
@@ -1213,7 +1213,7 @@ static struct audio_pcm_ops alsa_pcm_ops = {
    .ctl_in   = alsa_ctl_in,
 };

-struct audio_driver alsa_audio_driver = {
+static struct audio_driver alsa_audio_driver = {
    .name           = "alsa",
    .descr          = "ALSA http://www.alsa-project.org",
    .options        = alsa_options,
@@ -1226,3 +1226,9 @@ struct audio_driver alsa_audio_driver = {
    .voice_size_out = sizeof (ALSAVoiceOut),
    .voice_size_in  = sizeof (ALSAVoiceIn)
 };
+
+static void register_audio_alsa(void)
+{
+    audio_driver_register(&alsa_audio_driver);
+}
+type_init(register_audio_alsa);
--- a/audio/audio.c
+++ b/audio/audio.c
@@ -45,15 +45,49 @@
   The 1st one is the one used by default, that is the reason
    that we generate the list.
 */
-static struct audio_driver *drvtab[] = {
-#ifdef CONFIG_SPICE
-    &spice_audio_driver,
-#endif
+static const char *audio_prio_list[] = {
+    "spice",
    CONFIG_AUDIO_DRIVERS
-    &no_audio_driver,
-    &wav_audio_driver
+    "none",
+    "wav",
 };

+static QLIST_HEAD(, audio_driver) audio_drivers;
+
+void audio_driver_register(audio_driver *drv)
+{
+    QLIST_INSERT_HEAD(&audio_drivers, drv, next);
+}
+
+audio_driver *audio_driver_lookup(const char *name)
+{
+    struct audio_driver *d;
+
+    QLIST_FOREACH(d, &audio_drivers, next) {
+        if (strcmp(name, d->name) == 0) {
+            return d;
+        }
+    }
+
+    audio_module_load_one(name);
+    QLIST_FOREACH(d, &audio_drivers, next) {
+        if (strcmp(name, d->name) == 0) {
+            return d;
+        }
+    }
+
+    return NULL;
+}
+
+static void audio_module_load_all(void)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(audio_prio_list); i++) {
+        audio_driver_lookup(audio_prio_list[i]);
+    }
+}
+
 struct fixed_settings {
    int enabled;
    int nb_voices;
@@ -424,12 +458,12 @@ static void audio_process_options (const char *prefix,
    const char qemu_prefix[] = "QEMU_";
    size_t preflen, optlen;

-    if (audio_bug (AUDIO_FUNC, !prefix)) {
+    if (audio_bug(__func__, !prefix)) {
        dolog ("prefix = NULL\n");
        return;
    }

-    if (audio_bug (AUDIO_FUNC, !opt)) {
+    if (audio_bug(__func__, !opt)) {
        dolog ("opt = NULL\n");
        return;
    }
@@ -792,7 +826,7 @@ static int audio_attach_capture (HWVoiceOut *hw)
        SWVoiceOut *sw;
        HWVoiceOut *hw_cap = &cap->hw;

-        sc = audio_calloc (AUDIO_FUNC, 1, sizeof (*sc));
+        sc = audio_calloc(__func__, 1, sizeof(*sc));
        if (!sc) {
            dolog ("Could not allocate soft capture voice (%zu bytes)\n",
                   sizeof (*sc));
@@ -848,7 +882,7 @@ static int audio_pcm_hw_find_min_in (HWVoiceIn *hw)
 int audio_pcm_hw_get_live_in (HWVoiceIn *hw)
 {
    int live = hw->total_samples_captured - audio_pcm_hw_find_min_in (hw);
-    if (audio_bug (AUDIO_FUNC, live < 0 || live > hw->samples)) {
+    if (audio_bug(__func__, live < 0 || live > hw->samples)) {
        dolog ("live=%d hw->samples=%d\n", live, hw->samples);
        return 0;
    }
@@ -886,7 +920,7 @@ static int audio_pcm_sw_get_rpos_in (SWVoiceIn *sw)
    int live = hw->total_samples_captured - sw->total_hw_samples_acquired;
    int rpos;

-    if (audio_bug (AUDIO_FUNC, live < 0 || live > hw->samples)) {
+    if (audio_bug(__func__, live < 0 || live > hw->samples)) {
        dolog ("live=%d hw->samples=%d\n", live, hw->samples);
        return 0;
    }
@@ -909,7 +943,7 @@ int audio_pcm_sw_read (SWVoiceIn *sw, void *buf, int size)
    rpos = audio_pcm_sw_get_rpos_in (sw) % hw->samples;

    live = hw->total_samples_captured - sw->total_hw_samples_acquired;
-    if (audio_bug (AUDIO_FUNC, live < 0 || live > hw->samples)) {
+    if (audio_bug(__func__, live < 0 || live > hw->samples)) {
        dolog ("live_in=%d hw->samples=%d\n", live, hw->samples);
        return 0;
    }
@@ -935,7 +969,7 @@ int audio_pcm_sw_read (SWVoiceIn *sw, void *buf, int size)
        }
        osamp = swlim;

-        if (audio_bug (AUDIO_FUNC, osamp < 0)) {
+        if (audio_bug(__func__, osamp < 0)) {
            dolog ("osamp=%d\n", osamp);
            return 0;
        }
@@ -990,7 +1024,7 @@ static int audio_pcm_hw_get_live_out (HWVoiceOut *hw, int *nb_live)
    if (nb_live1) {
        int live = smin;

-        if (audio_bug (AUDIO_FUNC, live < 0 || live > hw->samples)) {
+        if (audio_bug(__func__, live < 0 || live > hw->samples)) {
            dolog ("live=%d hw->samples=%d\n", live, hw->samples);
            return 0;
        }
@@ -1014,7 +1048,7 @@ int audio_pcm_sw_write (SWVoiceOut *sw, void *buf, int size)
    hwsamples = sw->hw->samples;

    live = sw->total_hw_samples_mixed;
-    if (audio_bug (AUDIO_FUNC, live < 0 || live > hwsamples)){
+    if (audio_bug(__func__, live < 0 || live > hwsamples)) {
        dolog ("live=%d hw->samples=%d\n", live, hwsamples);
        return 0;
    }
@@ -1263,7 +1297,7 @@ static int audio_get_avail (SWVoiceIn *sw)
    }

    live = sw->hw->total_samples_captured - sw->total_hw_samples_acquired;
-    if (audio_bug (AUDIO_FUNC, live < 0 || live > sw->hw->samples)) {
+    if (audio_bug(__func__, live < 0 || live > sw->hw->samples)) {
        dolog ("live=%d sw->hw->samples=%d\n", live, sw->hw->samples);
        return 0;
    }
@@ -1287,7 +1321,7 @@ static int audio_get_free (SWVoiceOut *sw)

    live = sw->total_hw_samples_mixed;

-    if (audio_bug (AUDIO_FUNC, live < 0 || live > sw->hw->samples)) {
+    if (audio_bug(__func__, live < 0 || live > sw->hw->samples)) {
        dolog ("live=%d sw->hw->samples=%d\n", live, sw->hw->samples);
        return 0;
    }
@@ -1354,7 +1388,7 @@ static void audio_run_out (AudioState *s)
            live = 0;
        }

-        if (audio_bug (AUDIO_FUNC, live < 0 || live > hw->samples)) {
+        if (audio_bug(__func__, live < 0 || live > hw->samples)) {
            dolog ("live=%d hw->samples=%d\n", live, hw->samples);
            continue;
        }
@@ -1389,7 +1423,7 @@ static void audio_run_out (AudioState *s)
        prev_rpos = hw->rpos;
        played = hw->pcm_ops->run_out (hw, live);
        replay_audio_out(&played);
-        if (audio_bug (AUDIO_FUNC, hw->rpos >= hw->samples)) {
+        if (audio_bug(__func__, hw->rpos >= hw->samples)) {
            dolog ("hw->rpos=%d hw->samples=%d played=%d\n",
                   hw->rpos, hw->samples, played);
            hw->rpos = 0;
@@ -1410,7 +1444,7 @@ static void audio_run_out (AudioState *s)
                continue;
            }

-            if (audio_bug (AUDIO_FUNC, played > sw->total_hw_samples_mixed)) {
+            if (audio_bug(__func__, played > sw->total_hw_samples_mixed)) {
                dolog ("played=%d sw->total_hw_samples_mixed=%d\n",
                       played, sw->total_hw_samples_mixed);
                played = sw->total_hw_samples_mixed;
@@ -1513,7 +1547,7 @@ static void audio_run_capture (AudioState *s)
                continue;
            }

-            if (audio_bug (AUDIO_FUNC, captured > sw->total_hw_samples_mixed)) {
+            if (audio_bug(__func__, captured > sw->total_hw_samples_mixed)) {
                dolog ("captured=%d sw->total_hw_samples_mixed=%d\n",
                       captured, sw->total_hw_samples_mixed);
                captured = sw->total_hw_samples_mixed;
@@ -1656,11 +1690,13 @@ static void audio_pp_nb_voices (const char *typ, int nb)

 void AUD_help (void)
 {
-    size_t i;
+    struct audio_driver *d;
+
+    /* make sure we print the help text for modular drivers too */
+    audio_module_load_all();

    audio_process_options ("AUDIO", audio_options);
-    for (i = 0; i < ARRAY_SIZE (drvtab); i++) {
-        struct audio_driver *d = drvtab[i];
+    QLIST_FOREACH(d, &audio_drivers, next) {
        if (d->options) {
            audio_process_options (d->name, d->options);
        }
@@ -1672,8 +1708,7 @@ void AUD_help (void)

    printf ("Available drivers:\n");

-    for (i = 0; i < ARRAY_SIZE (drvtab); i++) {
-        struct audio_driver *d = drvtab[i];
+    QLIST_FOREACH(d, &audio_drivers, next) {

        printf ("Name: %s\n", d->name);
        printf ("Description: %s\n", d->descr);
@@ -1807,6 +1842,7 @@ static void audio_init (void)
    const char *drvname;
    VMChangeStateEntry *e;
    AudioState *s = &glob_audio_state;
+    struct audio_driver *driver;

    if (s->drv) {
        return;
@@ -1842,32 +1878,27 @@ static void audio_init (void)
    }

    if (drvname) {
-        int found = 0;
-
-        for (i = 0; i < ARRAY_SIZE (drvtab); i++) {
-            if (!strcmp (drvname, drvtab[i]->name)) {
-                done = !audio_driver_init (s, drvtab[i]);
-                found = 1;
-                break;
-            }
-        }
-
-        if (!found) {
+        driver = audio_driver_lookup(drvname);
+        if (driver) {
+            done = !audio_driver_init(s, driver);
+        } else {
            dolog ("Unknown audio driver `%s'\n", drvname);
            dolog ("Run with -audio-help to list available drivers\n");
        }
    }

    if (!done) {
-        for (i = 0; !done && i < ARRAY_SIZE (drvtab); i++) {
-            if (drvtab[i]->can_be_default) {
-                done = !audio_driver_init (s, drvtab[i]);
+        for (i = 0; !done && i < ARRAY_SIZE(audio_prio_list); i++) {
+            driver = audio_driver_lookup(audio_prio_list[i]);
+            if (driver && driver->can_be_default) {
+                done = !audio_driver_init(s, driver);
            }
        }
    }

    if (!done) {
-        done = !audio_driver_init (s, &no_audio_driver);
+        driver = audio_driver_lookup("none");
+        done = !audio_driver_init(s, driver);
        assert(done);
        dolog("warning: Using timer based audio emulation\n");
    }
@@ -1924,7 +1955,7 @@ CaptureVoiceOut *AUD_add_capture (
        goto err0;
    }

-    cb = audio_calloc (AUDIO_FUNC, 1, sizeof (*cb));
+    cb = audio_calloc(__func__, 1, sizeof(*cb));
    if (!cb) {
        dolog ("Could not allocate capture callback information, size %zu\n",
               sizeof (*cb));
@@ -1942,7 +1973,7 @@ CaptureVoiceOut *AUD_add_capture (
        HWVoiceOut *hw;
        CaptureVoiceOut *cap;

-        cap = audio_calloc (AUDIO_FUNC, 1, sizeof (*cap));
+        cap = audio_calloc(__func__, 1, sizeof(*cap));
        if (!cap) {
            dolog ("Could not allocate capture voice, size %zu\n",
                   sizeof (*cap));
@@ -1955,8 +1986,8 @@ CaptureVoiceOut *AUD_add_capture (

        /* XXX find a more elegant way */
        hw->samples = 4096 * 4;
-        hw->mix_buf = audio_calloc (AUDIO_FUNC, hw->samples,
-                                    sizeof (struct st_sample));
+        hw->mix_buf = audio_calloc(__func__, hw->samples,
+                                   sizeof(struct st_sample));
        if (!hw->mix_buf) {
            dolog ("Could not allocate capture mix buffer (%d samples)\n",
                   hw->samples);
@@ -1965,7 +1996,7 @@ CaptureVoiceOut *AUD_add_capture (

        audio_pcm_init_info (&hw->info, as);

-        cap->buf = audio_calloc (AUDIO_FUNC, hw->samples, 1 << hw->info.shift);
+        cap->buf = audio_calloc(__func__, hw->samples, 1 << hw->info.shift);
        if (!cap->buf) {
            dolog ("Could not allocate capture buffer "
                   "(%d samples, each %d bytes)\n",
--- a/audio/audio_int.h
+++ b/audio/audio_int.h
@@ -25,7 +25,7 @@
 #ifndef QEMU_AUDIO_INT_H
 #define QEMU_AUDIO_INT_H

-#ifdef CONFIG_COREAUDIO
+#ifdef CONFIG_AUDIO_COREAUDIO
 #define FLOAT_MIXENG
 /* #define RECIPROCAL */
 #endif
@@ -141,6 +141,7 @@ struct SWVoiceIn {
    QLIST_ENTRY (SWVoiceIn) entries;
 };

+typedef struct audio_driver audio_driver;
 struct audio_driver {
    const char *name;
    const char *descr;
@@ -154,6 +155,7 @@ struct audio_driver {
    int voice_size_out;
    int voice_size_in;
    int ctl_caps;
+    QLIST_ENTRY(audio_driver) next;
 };

 struct audio_pcm_ops {
@@ -203,17 +205,11 @@ struct AudioState {
    int vm_running;
 };

-extern struct audio_driver no_audio_driver;
-extern struct audio_driver oss_audio_driver;
-extern struct audio_driver sdl_audio_driver;
-extern struct audio_driver wav_audio_driver;
-extern struct audio_driver alsa_audio_driver;
-extern struct audio_driver coreaudio_audio_driver;
-extern struct audio_driver dsound_audio_driver;
-extern struct audio_driver pa_audio_driver;
-extern struct audio_driver spice_audio_driver;
 extern const struct mixeng_volume nominal_volume;

+void audio_driver_register(audio_driver *drv);
+audio_driver *audio_driver_lookup(const char *name);
+
 void audio_pcm_init_info (struct audio_pcm_info *info, struct audsettings *as);
 void audio_pcm_info_clear_buf (struct audio_pcm_info *info, void *buf, int len);

@@ -252,10 +248,4 @@ static inline int audio_ring_dist (int dst, int src, int len)
 #define AUDIO_STRINGIFY_(n) #n
 #define AUDIO_STRINGIFY(n) AUDIO_STRINGIFY_(n)

-#if defined _MSC_VER || defined __GNUC__
-#define AUDIO_FUNC __FUNCTION__
-#else
-#define AUDIO_FUNC __FILE__ ":" AUDIO_STRINGIFY (__LINE__)
-#endif
-
 #endif /* QEMU_AUDIO_INT_H */
--- a/audio/audio_pt_int.c
+++ b/audio/audio_pt_int.c
@@ -31,7 +31,7 @@ int audio_pt_init (struct audio_pt *p, void *(*func) (void *),

    err = sigfillset (&set);
    if (err) {
-        logerr (p, errno, "%s(%s): sigfillset failed", cap, AUDIO_FUNC);
+        logerr(p, errno, "%s(%s): sigfillset failed", cap, __func__);
        return -1;
    }

@@ -57,8 +57,8 @@ int audio_pt_init (struct audio_pt *p, void *(*func) (void *),

    err2 = pthread_sigmask (SIG_SETMASK, &old_set, NULL);
    if (err2) {
-        logerr (p, err2, "%s(%s): pthread_sigmask (restore) failed",
-                cap, AUDIO_FUNC);
+        logerr(p, err2, "%s(%s): pthread_sigmask (restore) failed",
+               cap, __func__);
        /* We have failed to restore original signal mask, all bets are off,
           so terminate the process */
        exit (EXIT_FAILURE);
@@ -74,17 +74,17 @@ int audio_pt_init (struct audio_pt *p, void *(*func) (void *),
 err2:
    err2 = pthread_cond_destroy (&p->cond);
    if (err2) {
-        logerr (p, err2, "%s(%s): pthread_cond_destroy failed", cap, AUDIO_FUNC);
+        logerr(p, err2, "%s(%s): pthread_cond_destroy failed", cap, __func__);
    }

 err1:
    err2 = pthread_mutex_destroy (&p->mutex);
    if (err2) {
-        logerr (p, err2, "%s(%s): pthread_mutex_destroy failed", cap, AUDIO_FUNC);
+        logerr(p, err2, "%s(%s): pthread_mutex_destroy failed", cap, __func__);
    }

 err0:
-    logerr (p, err, "%s(%s): %s failed", cap, AUDIO_FUNC, efunc);
+    logerr(p, err, "%s(%s): %s failed", cap, __func__, efunc);
    return -1;
 }

@@ -94,13 +94,13 @@ int audio_pt_fini (struct audio_pt *p, const char *cap)

    err = pthread_cond_destroy (&p->cond);
    if (err) {
-        logerr (p, err, "%s(%s): pthread_cond_destroy failed", cap, AUDIO_FUNC);
+        logerr(p, err, "%s(%s): pthread_cond_destroy failed", cap, __func__);
        ret = -1;
    }

    err = pthread_mutex_destroy (&p->mutex);
    if (err) {
-        logerr (p, err, "%s(%s): pthread_mutex_destroy failed", cap, AUDIO_FUNC);
+        logerr(p, err, "%s(%s): pthread_mutex_destroy failed", cap, __func__);
        ret = -1;
    }
    return ret;
@@ -112,7 +112,7 @@ int audio_pt_lock (struct audio_pt *p, const char *cap)

    err = pthread_mutex_lock (&p->mutex);
    if (err) {
-        logerr (p, err, "%s(%s): pthread_mutex_lock failed", cap, AUDIO_FUNC);
+        logerr(p, err, "%s(%s): pthread_mutex_lock failed", cap, __func__);
        return -1;
    }
    return 0;
@@ -124,7 +124,7 @@ int audio_pt_unlock (struct audio_pt *p, const char *cap)

    err = pthread_mutex_unlock (&p->mutex);
    if (err) {
-        logerr (p, err, "%s(%s): pthread_mutex_unlock failed", cap, AUDIO_FUNC);
+        logerr(p, err, "%s(%s): pthread_mutex_unlock failed", cap, __func__);
        return -1;
    }
    return 0;
@@ -136,7 +136,7 @@ int audio_pt_wait (struct audio_pt *p, const char *cap)

    err = pthread_cond_wait (&p->cond, &p->mutex);
    if (err) {
-        logerr (p, err, "%s(%s): pthread_cond_wait failed", cap, AUDIO_FUNC);
+        logerr(p, err, "%s(%s): pthread_cond_wait failed", cap, __func__);
        return -1;
    }
    return 0;
@@ -148,12 +148,12 @@ int audio_pt_unlock_and_signal (struct audio_pt *p, const char *cap)

    err = pthread_mutex_unlock (&p->mutex);
    if (err) {
-        logerr (p, err, "%s(%s): pthread_mutex_unlock failed", cap, AUDIO_FUNC);
+        logerr(p, err, "%s(%s): pthread_mutex_unlock failed", cap, __func__);
        return -1;
    }
    err = pthread_cond_signal (&p->cond);
    if (err) {
-        logerr (p, err, "%s(%s): pthread_cond_signal failed", cap, AUDIO_FUNC);
+        logerr(p, err, "%s(%s): pthread_cond_signal failed", cap, __func__);
        return -1;
    }
    return 0;
@@ -166,7 +166,7 @@ int audio_pt_join (struct audio_pt *p, void **arg, const char *cap)

    err = pthread_join (p->thread, &ret);
    if (err) {
-        logerr (p, err, "%s(%s): pthread_join failed", cap, AUDIO_FUNC);
+        logerr(p, err, "%s(%s): pthread_join failed", cap, __func__);
        return -1;
    }
    *arg = ret;
--- a/audio/audio_template.h
+++ b/audio/audio_template.h
@@ -57,13 +57,13 @@ static void glue (audio_init_nb_voices_, TYPE) (struct audio_driver *drv)
        glue (s->nb_hw_voices_, TYPE) = max_voices;
    }

-    if (audio_bug (AUDIO_FUNC, !voice_size && max_voices)) {
+    if (audio_bug(__func__, !voice_size && max_voices)) {
        dolog ("drv=`%s' voice_size=0 max_voices=%d\n",
               drv->name, max_voices);
        glue (s->nb_hw_voices_, TYPE) = 0;
    }

-    if (audio_bug (AUDIO_FUNC, voice_size && !max_voices)) {
+    if (audio_bug(__func__, voice_size && !max_voices)) {
        dolog ("drv=`%s' voice_size=%d max_voices=0\n",
               drv->name, voice_size);
    }
@@ -77,7 +77,7 @@ static void glue (audio_pcm_hw_free_resources_, TYPE) (HW *hw)

 static int glue (audio_pcm_hw_alloc_resources_, TYPE) (HW *hw)
 {
-    HWBUF = audio_calloc (AUDIO_FUNC, hw->samples, sizeof (struct st_sample));
+    HWBUF = audio_calloc(__func__, hw->samples, sizeof(struct st_sample));
    if (!HWBUF) {
        dolog ("Could not allocate " NAME " buffer (%d samples)\n",
               hw->samples);
@@ -105,7 +105,7 @@ static int glue (audio_pcm_sw_alloc_resources_, TYPE) (SW *sw)

    samples = ((int64_t) sw->hw->samples << 32) / sw->ratio;

-    sw->buf = audio_calloc (AUDIO_FUNC, samples, sizeof (struct st_sample));
+    sw->buf = audio_calloc(__func__, samples, sizeof(struct st_sample));
    if (!sw->buf) {
        dolog ("Could not allocate buffer for `%s' (%d samples)\n",
               SW_NAME (sw), samples);
@@ -238,17 +238,17 @@ static HW *glue (audio_pcm_hw_add_new_, TYPE) (struct audsettings *as)
        return NULL;
    }

-    if (audio_bug (AUDIO_FUNC, !drv)) {
+    if (audio_bug(__func__, !drv)) {
        dolog ("No host audio driver\n");
        return NULL;
    }

-    if (audio_bug (AUDIO_FUNC, !drv->pcm_ops)) {
+    if (audio_bug(__func__, !drv->pcm_ops)) {
        dolog ("Host audio driver without pcm_ops\n");
        return NULL;
    }

-    hw = audio_calloc (AUDIO_FUNC, 1, glue (drv->voice_size_, TYPE));
+    hw = audio_calloc(__func__, 1, glue(drv->voice_size_, TYPE));
    if (!hw) {
        dolog ("Can not allocate voice `%s' size %d\n",
               drv->name, glue (drv->voice_size_, TYPE));
@@ -266,7 +266,7 @@ static HW *glue (audio_pcm_hw_add_new_, TYPE) (struct audsettings *as)
        goto err0;
    }

-    if (audio_bug (AUDIO_FUNC, hw->samples <= 0)) {
+    if (audio_bug(__func__, hw->samples <= 0)) {
        dolog ("hw->samples=%d\n", hw->samples);
        goto err1;
    }
@@ -339,7 +339,7 @@ static SW *glue (audio_pcm_create_voice_pair_, TYPE) (
        hw_as = *as;
    }

-    sw = audio_calloc (AUDIO_FUNC, 1, sizeof (*sw));
+    sw = audio_calloc(__func__, 1, sizeof(*sw));
    if (!sw) {
        dolog ("Could not allocate soft voice `%s' (%zu bytes)\n",
               sw_name ? sw_name : "unknown", sizeof (*sw));
@@ -379,7 +379,7 @@ static void glue (audio_close_, TYPE) (SW *sw)
 void glue (AUD_close_, TYPE) (QEMUSoundCard *card, SW *sw)
 {
    if (sw) {
-        if (audio_bug (AUDIO_FUNC, !card)) {
+        if (audio_bug(__func__, !card)) {
            dolog ("card=%p\n", card);
            return;
        }
@@ -399,7 +399,7 @@ SW *glue (AUD_open_, TYPE) (
 {
    AudioState *s = &glob_audio_state;

-    if (audio_bug (AUDIO_FUNC, !card || !name || !callback_fn || !as)) {
+    if (audio_bug(__func__, !card || !name || !callback_fn || !as)) {
        dolog ("card=%p name=%p callback_fn=%p as=%p\n",
               card, name, callback_fn, as);
        goto fail;
@@ -408,12 +408,12 @@ SW *glue (AUD_open_, TYPE) (
    ldebug ("open %s, freq %d, nchannels %d, fmt %d\n",
            name, as->freq, as->nchannels, as->fmt);

-    if (audio_bug (AUDIO_FUNC, audio_validate_settings (as))) {
+    if (audio_bug(__func__, audio_validate_settings(as))) {
        audio_print_settings (as);
        goto fail;
    }

-    if (audio_bug (AUDIO_FUNC, !s->drv)) {
+    if (audio_bug(__func__, !s->drv)) {
        dolog ("Can not open `%s' (no host audio driver)\n", name);
        goto fail;
    }
--- a/audio/coreaudio.c
+++ b/audio/coreaudio.c
@@ -722,7 +722,7 @@ static struct audio_pcm_ops coreaudio_pcm_ops = {
    .ctl_out  = coreaudio_ctl_out
 };

-struct audio_driver coreaudio_audio_driver = {
+static struct audio_driver coreaudio_audio_driver = {
    .name           = "coreaudio",
    .descr          = "CoreAudio http://developer.apple.com/audio/coreaudio.html",
    .options        = coreaudio_options,
@@ -735,3 +735,9 @@ struct audio_driver coreaudio_audio_driver = {
    .voice_size_out = sizeof (coreaudioVoiceOut),
    .voice_size_in  = 0
 };
+
+static void register_audio_coreaudio(void)
+{
+    audio_driver_register(&coreaudio_audio_driver);
+}
+type_init(register_audio_coreaudio);
--- a/audio/dsoundaudio.c
+++ b/audio/dsoundaudio.c
@@ -543,7 +543,7 @@ static int dsound_run_out (HWVoiceOut *hw, int live)
        }
    }

-    if (audio_bug (AUDIO_FUNC, len < 0 || len > bufsize)) {
+    if (audio_bug(__func__, len < 0 || len > bufsize)) {
        dolog ("len=%d bufsize=%d old_pos=%ld ppos=%ld\n",
               len, bufsize, old_pos, ppos);
        return 0;
@@ -890,7 +890,7 @@ static struct audio_pcm_ops dsound_pcm_ops = {
    .ctl_in   = dsound_ctl_in
 };

-struct audio_driver dsound_audio_driver = {
+static struct audio_driver dsound_audio_driver = {
    .name           = "dsound",
    .descr          = "DirectSound http://wikipedia.org/wiki/DirectSound",
    .options        = dsound_options,
@@ -903,3 +903,9 @@ struct audio_driver dsound_audio_driver = {
    .voice_size_out = sizeof (DSoundVoiceOut),
    .voice_size_in  = sizeof (DSoundVoiceIn)
 };
+
+static void register_audio_dsound(void)
+{
+    audio_driver_register(&dsound_audio_driver);
+}
+type_init(register_audio_dsound);
--- a/audio/mixeng.c
+++ b/audio/mixeng.c
@@ -344,7 +344,7 @@ struct rate {
 */
 void *st_rate_start (int inrate, int outrate)
 {
-    struct rate *rate = audio_calloc (AUDIO_FUNC, 1, sizeof (*rate));
+    struct rate *rate = audio_calloc(__func__, 1, sizeof(*rate));

    if (!rate) {
        dolog ("Could not allocate resampler (%zu bytes)\n", sizeof (*rate));
--- a/audio/noaudio.c
+++ b/audio/noaudio.c
@@ -160,7 +160,7 @@ static struct audio_pcm_ops no_pcm_ops = {
    .ctl_in   = no_ctl_in
 };

-struct audio_driver no_audio_driver = {
+static struct audio_driver no_audio_driver = {
    .name           = "none",
    .descr          = "Timer based audio emulation",
    .options        = NULL,
@@ -173,3 +173,9 @@ struct audio_driver no_audio_driver = {
    .voice_size_out = sizeof (NoVoiceOut),
    .voice_size_in  = sizeof (NoVoiceIn)
 };
+
+static void register_audio_none(void)
+{
+    audio_driver_register(&no_audio_driver);
+}
+type_init(register_audio_none);
--- a/audio/ossaudio.c
+++ b/audio/ossaudio.c
@@ -582,11 +582,9 @@ static int oss_init_out(HWVoiceOut *hw, struct audsettings *as,
    }

    if (!oss->mmapped) {
-        oss->pcm_buf = audio_calloc (
-            AUDIO_FUNC,
+        oss->pcm_buf = audio_calloc(__func__,
                                    hw->samples,
-            1 << hw->info.shift
-            );
+                                    1 << hw->info.shift);
        if (!oss->pcm_buf) {
            dolog (
                "Could not allocate DAC buffer (%d samples, each %d bytes)\n",
@@ -705,7 +703,7 @@ static int oss_init_in(HWVoiceIn *hw, struct audsettings *as, void *drv_opaque)
    }

    hw->samples = (obt.nfrags * obt.fragsize) >> hw->info.shift;
-    oss->pcm_buf = audio_calloc (AUDIO_FUNC, hw->samples, 1 << hw->info.shift);
+    oss->pcm_buf = audio_calloc(__func__, hw->samples, 1 << hw->info.shift);
    if (!oss->pcm_buf) {
        dolog ("Could not allocate ADC buffer (%d samples, each %d bytes)\n",
               hw->samples, 1 << hw->info.shift);
@@ -924,7 +922,7 @@ static struct audio_pcm_ops oss_pcm_ops = {
    .ctl_in   = oss_ctl_in
 };

-struct audio_driver oss_audio_driver = {
+static struct audio_driver oss_audio_driver = {
    .name           = "oss",
    .descr          = "OSS http://www.opensound.com",
    .options        = oss_options,
@@ -937,3 +935,9 @@ struct audio_driver oss_audio_driver = {
    .voice_size_out = sizeof (OSSVoiceOut),
    .voice_size_in  = sizeof (OSSVoiceIn)
 };
+
+static void register_audio_oss(void)
+{
+    audio_driver_register(&oss_audio_driver);
+}
+type_init(register_audio_oss);
--- a/audio/paaudio.c
+++ b/audio/paaudio.c
@@ -89,7 +89,7 @@ static inline int PA_STREAM_IS_GOOD(pa_stream_state_t x)
            }                                                   \
            goto label;                                         \
        }                                                       \
-    } while (0);
+    } while (0)

 #define CHECK_DEAD_GOTO(c, stream, rerror, label)                       \
    do {                                                                \
@@ -107,7 +107,7 @@ static inline int PA_STREAM_IS_GOOD(pa_stream_state_t x)
            }                                                           \
            goto label;                                                 \
        }                                                               \
-    } while (0);
+    } while (0)

 static int qpa_simple_read (PAVoiceIn *p, void *data, size_t length, int *rerror)
 {
@@ -206,7 +206,7 @@ static void *qpa_thread_out (void *arg)
    PAVoiceOut *pa = arg;
    HWVoiceOut *hw = &pa->hw;

-    if (audio_pt_lock (&pa->pt, AUDIO_FUNC)) {
+    if (audio_pt_lock(&pa->pt, __func__)) {
        return NULL;
    }

@@ -222,7 +222,7 @@ static void *qpa_thread_out (void *arg)
                break;
            }

-            if (audio_pt_wait (&pa->pt, AUDIO_FUNC)) {
+            if (audio_pt_wait(&pa->pt, __func__)) {
                goto exit;
            }
        }
@@ -230,7 +230,7 @@ static void *qpa_thread_out (void *arg)
        decr = to_mix = audio_MIN (pa->live, pa->g->conf.samples >> 2);
        rpos = pa->rpos;

-        if (audio_pt_unlock (&pa->pt, AUDIO_FUNC)) {
+        if (audio_pt_unlock(&pa->pt, __func__)) {
            return NULL;
        }

@@ -251,7 +251,7 @@ static void *qpa_thread_out (void *arg)
            to_mix -= chunk;
        }

-        if (audio_pt_lock (&pa->pt, AUDIO_FUNC)) {
+        if (audio_pt_lock(&pa->pt, __func__)) {
            return NULL;
        }

@@ -261,7 +261,7 @@ static void *qpa_thread_out (void *arg)
    }

 exit:
-    audio_pt_unlock (&pa->pt, AUDIO_FUNC);
+    audio_pt_unlock(&pa->pt, __func__);
    return NULL;
 }

@@ -270,7 +270,7 @@ static int qpa_run_out (HWVoiceOut *hw, int live)
    int decr;
    PAVoiceOut *pa = (PAVoiceOut *) hw;

-    if (audio_pt_lock (&pa->pt, AUDIO_FUNC)) {
+    if (audio_pt_lock(&pa->pt, __func__)) {
        return 0;
    }

@@ -279,10 +279,10 @@ static int qpa_run_out (HWVoiceOut *hw, int live)
    pa->live = live - decr;
    hw->rpos = pa->rpos;
    if (pa->live > 0) {
-        audio_pt_unlock_and_signal (&pa->pt, AUDIO_FUNC);
+        audio_pt_unlock_and_signal(&pa->pt, __func__);
    }
    else {
-        audio_pt_unlock (&pa->pt, AUDIO_FUNC);
+        audio_pt_unlock(&pa->pt, __func__);
    }
    return decr;
 }
@@ -298,7 +298,7 @@ static void *qpa_thread_in (void *arg)
    PAVoiceIn *pa = arg;
    HWVoiceIn *hw = &pa->hw;

-    if (audio_pt_lock (&pa->pt, AUDIO_FUNC)) {
+    if (audio_pt_lock(&pa->pt, __func__)) {
        return NULL;
    }

@@ -314,7 +314,7 @@ static void *qpa_thread_in (void *arg)
                break;
            }

-            if (audio_pt_wait (&pa->pt, AUDIO_FUNC)) {
+            if (audio_pt_wait(&pa->pt, __func__)) {
                goto exit;
            }
        }
@@ -322,7 +322,7 @@ static void *qpa_thread_in (void *arg)
        incr = to_grab = audio_MIN (pa->dead, pa->g->conf.samples >> 2);
        wpos = pa->wpos;

-        if (audio_pt_unlock (&pa->pt, AUDIO_FUNC)) {
+        if (audio_pt_unlock(&pa->pt, __func__)) {
            return NULL;
        }

@@ -342,7 +342,7 @@ static void *qpa_thread_in (void *arg)
            to_grab -= chunk;
        }

-        if (audio_pt_lock (&pa->pt, AUDIO_FUNC)) {
+        if (audio_pt_lock(&pa->pt, __func__)) {
            return NULL;
        }

@@ -352,7 +352,7 @@ static void *qpa_thread_in (void *arg)
    }

 exit:
-    audio_pt_unlock (&pa->pt, AUDIO_FUNC);
+    audio_pt_unlock(&pa->pt, __func__);
    return NULL;
 }

@@ -361,7 +361,7 @@ static int qpa_run_in (HWVoiceIn *hw)
    int live, incr, dead;
    PAVoiceIn *pa = (PAVoiceIn *) hw;

-    if (audio_pt_lock (&pa->pt, AUDIO_FUNC)) {
+    if (audio_pt_lock(&pa->pt, __func__)) {
        return 0;
    }

@@ -372,10 +372,10 @@ static int qpa_run_in (HWVoiceIn *hw)
    pa->dead = dead - incr;
    hw->wpos = pa->wpos;
    if (pa->dead > 0) {
-        audio_pt_unlock_and_signal (&pa->pt, AUDIO_FUNC);
+        audio_pt_unlock_and_signal(&pa->pt, __func__);
    }
    else {
-        audio_pt_unlock (&pa->pt, AUDIO_FUNC);
+        audio_pt_unlock(&pa->pt, __func__);
    }
    return incr;
 }
@@ -579,7 +579,7 @@ static int qpa_init_out(HWVoiceOut *hw, struct audsettings *as,

    audio_pcm_init_info (&hw->info, &obt_as);
    hw->samples = g->conf.samples;
-    pa->pcm_buf = audio_calloc (AUDIO_FUNC, hw->samples, 1 << hw->info.shift);
+    pa->pcm_buf = audio_calloc(__func__, hw->samples, 1 << hw->info.shift);
    pa->rpos = hw->rpos;
    if (!pa->pcm_buf) {
        dolog ("Could not allocate buffer (%d bytes)\n",
@@ -587,7 +587,7 @@ static int qpa_init_out(HWVoiceOut *hw, struct audsettings *as,
        goto fail2;
    }

-    if (audio_pt_init (&pa->pt, qpa_thread_out, hw, AUDIO_CAP, AUDIO_FUNC)) {
+    if (audio_pt_init(&pa->pt, qpa_thread_out, hw, AUDIO_CAP, __func__)) {
        goto fail3;
    }

@@ -636,7 +636,7 @@ static int qpa_init_in(HWVoiceIn *hw, struct audsettings *as, void *drv_opaque)

    audio_pcm_init_info (&hw->info, &obt_as);
    hw->samples = g->conf.samples;
-    pa->pcm_buf = audio_calloc (AUDIO_FUNC, hw->samples, 1 << hw->info.shift);
+    pa->pcm_buf = audio_calloc(__func__, hw->samples, 1 << hw->info.shift);
    pa->wpos = hw->wpos;
    if (!pa->pcm_buf) {
        dolog ("Could not allocate buffer (%d bytes)\n",
@@ -644,7 +644,7 @@ static int qpa_init_in(HWVoiceIn *hw, struct audsettings *as, void *drv_opaque)
        goto fail2;
    }

-    if (audio_pt_init (&pa->pt, qpa_thread_in, hw, AUDIO_CAP, AUDIO_FUNC)) {
+    if (audio_pt_init(&pa->pt, qpa_thread_in, hw, AUDIO_CAP, __func__)) {
        goto fail3;
    }

@@ -667,17 +667,17 @@ static void qpa_fini_out (HWVoiceOut *hw)
    void *ret;
    PAVoiceOut *pa = (PAVoiceOut *) hw;

-    audio_pt_lock (&pa->pt, AUDIO_FUNC);
+    audio_pt_lock(&pa->pt, __func__);
    pa->done = 1;
-    audio_pt_unlock_and_signal (&pa->pt, AUDIO_FUNC);
-    audio_pt_join (&pa->pt, &ret, AUDIO_FUNC);
+    audio_pt_unlock_and_signal(&pa->pt, __func__);
+    audio_pt_join(&pa->pt, &ret, __func__);

    if (pa->stream) {
        pa_stream_unref (pa->stream);
        pa->stream = NULL;
    }

-    audio_pt_fini (&pa->pt, AUDIO_FUNC);
+    audio_pt_fini(&pa->pt, __func__);
    g_free (pa->pcm_buf);
    pa->pcm_buf = NULL;
 }
@@ -687,17 +687,17 @@ static void qpa_fini_in (HWVoiceIn *hw)
    void *ret;
    PAVoiceIn *pa = (PAVoiceIn *) hw;

-    audio_pt_lock (&pa->pt, AUDIO_FUNC);
+    audio_pt_lock(&pa->pt, __func__);
    pa->done = 1;
-    audio_pt_unlock_and_signal (&pa->pt, AUDIO_FUNC);
-    audio_pt_join (&pa->pt, &ret, AUDIO_FUNC);
+    audio_pt_unlock_and_signal(&pa->pt, __func__);
+    audio_pt_join(&pa->pt, &ret, __func__);

    if (pa->stream) {
        pa_stream_unref (pa->stream);
        pa->stream = NULL;
    }

-    audio_pt_fini (&pa->pt, AUDIO_FUNC);
+    audio_pt_fini(&pa->pt, __func__);
    g_free (pa->pcm_buf);
    pa->pcm_buf = NULL;
 }
@@ -937,7 +937,7 @@ static struct audio_pcm_ops qpa_pcm_ops = {
    .ctl_in   = qpa_ctl_in
 };

-struct audio_driver pa_audio_driver = {
+static struct audio_driver pa_audio_driver = {
    .name           = "pa",
    .descr          = "http://www.pulseaudio.org/",
    .options        = qpa_options,
@@ -951,3 +951,9 @@ struct audio_driver pa_audio_driver = {
    .voice_size_in  = sizeof (PAVoiceIn),
    .ctl_caps       = VOICE_VOLUME_CAP
 };
+
+static void register_audio_pa(void)
+{
+    audio_driver_register(&pa_audio_driver);
+}
+type_init(register_audio_pa);
--- a/audio/sdlaudio.c
+++ b/audio/sdlaudio.c
@@ -277,7 +277,7 @@ static void sdl_callback (void *opaque, Uint8 *buf, int len)
            return;
        }

-        if (audio_bug (AUDIO_FUNC, sdl->live < 0 || sdl->live > hw->samples)) {
+        if (audio_bug(__func__, sdl->live < 0 || sdl->live > hw->samples)) {
            dolog ("sdl->live=%d hw->samples=%d\n",
                   sdl->live, hw->samples);
            return;
@@ -500,7 +500,7 @@ static struct audio_pcm_ops sdl_pcm_ops = {
    .ctl_out  = sdl_ctl_out,
 };

-struct audio_driver sdl_audio_driver = {
+static struct audio_driver sdl_audio_driver = {
    .name           = "sdl",
    .descr          = "SDL http://www.libsdl.org",
    .options        = sdl_options,
@@ -513,3 +513,9 @@ struct audio_driver sdl_audio_driver = {
    .voice_size_out = sizeof (SDLVoiceOut),
    .voice_size_in  = 0
 };
+
+static void register_audio_sdl(void)
+{
+    audio_driver_register(&sdl_audio_driver);
+}
+type_init(register_audio_sdl);
--- a/audio/spiceaudio.c
+++ b/audio/spiceaudio.c
@@ -391,7 +391,7 @@ static struct audio_pcm_ops audio_callbacks = {
    .ctl_in   = line_in_ctl,
 };

-struct audio_driver spice_audio_driver = {
+static struct audio_driver spice_audio_driver = {
    .name           = "spice",
    .descr          = "spice audio driver",
    .options        = audio_options,
@@ -411,3 +411,9 @@ void qemu_spice_audio_init (void)
 {
    spice_audio_driver.can_be_default = 1;
 }
+
+static void register_audio_spice(void)
+{
+    audio_driver_register(&spice_audio_driver);
+}
+type_init(register_audio_spice);
--- a/audio/wavaudio.c
+++ b/audio/wavaudio.c
@@ -139,7 +139,7 @@ static int wav_init_out(HWVoiceOut *hw, struct audsettings *as,
    audio_pcm_init_info (&hw->info, &wav_as);

    hw->samples = 1024;
-    wav->pcm_buf = audio_calloc (AUDIO_FUNC, hw->samples, 1 << hw->info.shift);
+    wav->pcm_buf = audio_calloc(__func__, hw->samples, 1 << hw->info.shift);
    if (!wav->pcm_buf) {
        dolog ("Could not allocate buffer (%d bytes)\n",
               hw->samples << hw->info.shift);
@@ -278,7 +278,7 @@ static struct audio_pcm_ops wav_pcm_ops = {
    .ctl_out  = wav_ctl_out,
 };

-struct audio_driver wav_audio_driver = {
+static struct audio_driver wav_audio_driver = {
    .name           = "wav",
    .descr          = "WAV renderer http://wikipedia.org/wiki/WAV",
    .options        = wav_options,
@@ -291,3 +291,9 @@ struct audio_driver wav_audio_driver = {
    .voice_size_out = sizeof (WAVVoiceOut),
    .voice_size_in  = 0
 };
+
+static void register_audio_wav(void)
+{
+    audio_driver_register(&wav_audio_driver);
+}
+type_init(register_audio_wav);
--- a/audio/wavcapture.c
+++ b/audio/wavcapture.c
@@ -1,6 +1,7 @@
 #include "qemu/osdep.h"
 #include "hw/hw.h"
 #include "monitor/monitor.h"
+#include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "audio.h"

--- a/backends/Makefile.objs
+++ b/backends/Makefile.objs
@@ -8,3 +8,11 @@ common-obj-$(CONFIG_LINUX) += hostmem-file.o

 common-obj-y += cryptodev.o
 common-obj-y += cryptodev-builtin.o
+
+ifeq ($(CONFIG_VIRTIO),y)
+common-obj-y += cryptodev-vhost.o
+common-obj-$(call land,$(CONFIG_VHOST_USER),$(CONFIG_LINUX)) += \
+    cryptodev-vhost-user.o
+endif
+
+common-obj-$(CONFIG_LINUX) += hostmem-memfd.o
--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -78,6 +78,7 @@ static void cryptodev_builtin_init(
              "cryptodev-builtin", NULL);
    cc->info_str = g_strdup_printf("cryptodev-builtin0");
    cc->queue_index = 0;
+    cc->type = CRYPTODEV_BACKEND_TYPE_BUILTIN;
    backend->conf.peers.ccs[0] = cc;

    backend->conf.crypto_services =
--- a/backends/cryptodev-vhost-user.c
+++ b/backends/cryptodev-vhost-user.c
@@ -0,0 +1,377 @@
+/*
+ * QEMU Cryptodev backend for QEMU cipher APIs
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ *
+ * Authors:
+ *    Gonglei <arei.gonglei@huawei.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "hw/boards.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/virtio_crypto.h"
+#include "sysemu/cryptodev-vhost.h"
+#include "chardev/char-fe.h"
+#include "sysemu/cryptodev-vhost-user.h"
+
+
+/**
+ * @TYPE_CRYPTODEV_BACKEND_VHOST_USER:
+ * name of backend that uses vhost user server
+ */
+#define TYPE_CRYPTODEV_BACKEND_VHOST_USER "cryptodev-vhost-user"
+
+#define CRYPTODEV_BACKEND_VHOST_USER(obj) \
+    OBJECT_CHECK(CryptoDevBackendVhostUser, \
+                 (obj), TYPE_CRYPTODEV_BACKEND_VHOST_USER)
+
+
+typedef struct CryptoDevBackendVhostUser {
+    CryptoDevBackend parent_obj;
+
+    CharBackend chr;
+    char *chr_name;
+    bool opened;
+    CryptoDevBackendVhost *vhost_crypto[MAX_CRYPTO_QUEUE_NUM];
+} CryptoDevBackendVhostUser;
+
+static int
+cryptodev_vhost_user_running(
+             CryptoDevBackendVhost *crypto)
+{
+    return crypto ? 1 : 0;
+}
+
+CryptoDevBackendVhost *
+cryptodev_vhost_user_get_vhost(
+                         CryptoDevBackendClient *cc,
+                         CryptoDevBackend *b,
+                         uint16_t queue)
+{
+    CryptoDevBackendVhostUser *s =
+                      CRYPTODEV_BACKEND_VHOST_USER(b);
+    assert(cc->type == CRYPTODEV_BACKEND_TYPE_VHOST_USER);
+    assert(queue < MAX_CRYPTO_QUEUE_NUM);
+
+    return s->vhost_crypto[queue];
+}
+
+static void cryptodev_vhost_user_stop(int queues,
+                          CryptoDevBackendVhostUser *s)
+{
+    size_t i;
+
+    for (i = 0; i < queues; i++) {
+        if (!cryptodev_vhost_user_running(s->vhost_crypto[i])) {
+            continue;
+        }
+
+        cryptodev_vhost_cleanup(s->vhost_crypto[i]);
+        s->vhost_crypto[i] = NULL;
+    }
+}
+
+static int
+cryptodev_vhost_user_start(int queues,
+                         CryptoDevBackendVhostUser *s)
+{
+    CryptoDevBackendVhostOptions options;
+    CryptoDevBackend *b = CRYPTODEV_BACKEND(s);
+    int max_queues;
+    size_t i;
+
+    for (i = 0; i < queues; i++) {
+        if (cryptodev_vhost_user_running(s->vhost_crypto[i])) {
+            continue;
+        }
+
+        options.opaque = &s->chr;
+        options.backend_type = VHOST_BACKEND_TYPE_USER;
+        options.cc = b->conf.peers.ccs[i];
+        s->vhost_crypto[i] = cryptodev_vhost_init(&options);
+        if (!s->vhost_crypto[i]) {
+            error_report("failed to init vhost_crypto for queue %zu", i);
+            goto err;
+        }
+
+        if (i == 0) {
+            max_queues =
+              cryptodev_vhost_get_max_queues(s->vhost_crypto[i]);
+            if (queues > max_queues) {
+                error_report("you are asking more queues than supported: %d",
+                             max_queues);
+                goto err;
+            }
+        }
+    }
+
+    return 0;
+
+err:
+    cryptodev_vhost_user_stop(i + 1, s);
+    return -1;
+}
+
+static Chardev *
+cryptodev_vhost_claim_chardev(CryptoDevBackendVhostUser *s,
+                                    Error **errp)
+{
+    Chardev *chr;
+
+    if (s->chr_name == NULL) {
+        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+                   "chardev", "a valid character device");
+        return NULL;
+    }
+
+    chr = qemu_chr_find(s->chr_name);
+    if (chr == NULL) {
+        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+                  "Device '%s' not found", s->chr_name);
+        return NULL;
+    }
+
+    return chr;
+}
+
+static void cryptodev_vhost_user_event(void *opaque, int event)
+{
+    CryptoDevBackendVhostUser *s = opaque;
+    CryptoDevBackend *b = CRYPTODEV_BACKEND(s);
+    Error *err = NULL;
+    int queues = b->conf.peers.queues;
+
+    assert(queues < MAX_CRYPTO_QUEUE_NUM);
+
+    switch (event) {
+    case CHR_EVENT_OPENED:
+        if (cryptodev_vhost_user_start(queues, s) < 0) {
+            exit(1);
+        }
+        b->ready = true;
+        break;
+    case CHR_EVENT_CLOSED:
+        b->ready = false;
+        cryptodev_vhost_user_stop(queues, s);
+        break;
+    }
+
+    if (err) {
+        error_report_err(err);
+    }
+}
+
+static void cryptodev_vhost_user_init(
+             CryptoDevBackend *backend, Error **errp)
+{
+    int queues = backend->conf.peers.queues;
+    size_t i;
+    Error *local_err = NULL;
+    Chardev *chr;
+    CryptoDevBackendClient *cc;
+    CryptoDevBackendVhostUser *s =
+                      CRYPTODEV_BACKEND_VHOST_USER(backend);
+
+    chr = cryptodev_vhost_claim_chardev(s, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    s->opened = true;
+
+    for (i = 0; i < queues; i++) {
+        cc = cryptodev_backend_new_client(
+                  "cryptodev-vhost-user", NULL);
+        cc->info_str = g_strdup_printf("cryptodev-vhost-user%zu to %s ",
+                                       i, chr->label);
+        cc->queue_index = i;
+        cc->type = CRYPTODEV_BACKEND_TYPE_VHOST_USER;
+
+        backend->conf.peers.ccs[i] = cc;
+
+        if (i == 0) {
+            if (!qemu_chr_fe_init(&s->chr, chr, &local_err)) {
+                error_propagate(errp, local_err);
+                return;
+            }
+        }
+    }
+
+    qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
+                     cryptodev_vhost_user_event, NULL, s, NULL, true);
+
+    backend->conf.crypto_services =
+                         1u << VIRTIO_CRYPTO_SERVICE_CIPHER |
+                         1u << VIRTIO_CRYPTO_SERVICE_HASH |
+                         1u << VIRTIO_CRYPTO_SERVICE_MAC;
+    backend->conf.cipher_algo_l = 1u << VIRTIO_CRYPTO_CIPHER_AES_CBC;
+    backend->conf.hash_algo = 1u << VIRTIO_CRYPTO_HASH_SHA1;
+
+    backend->conf.max_size = UINT64_MAX;
+    backend->conf.max_cipher_key_len = VHOST_USER_MAX_CIPHER_KEY_LEN;
+    backend->conf.max_auth_key_len = VHOST_USER_MAX_AUTH_KEY_LEN;
+}
+
+static int64_t cryptodev_vhost_user_sym_create_session(
+           CryptoDevBackend *backend,
+           CryptoDevBackendSymSessionInfo *sess_info,
+           uint32_t queue_index, Error **errp)
+{
+    CryptoDevBackendClient *cc =
+                   backend->conf.peers.ccs[queue_index];
+    CryptoDevBackendVhost *vhost_crypto;
+    uint64_t session_id = 0;
+    int ret;
+
+    vhost_crypto = cryptodev_vhost_user_get_vhost(cc, backend, queue_index);
+    if (vhost_crypto) {
+        struct vhost_dev *dev = &(vhost_crypto->dev);
+        ret = dev->vhost_ops->vhost_crypto_create_session(dev,
+                                                          sess_info,
+                                                          &session_id);
+        if (ret < 0) {
+            return -1;
+        } else {
+            return session_id;
+        }
+    }
+    return -1;
+}
+
+static int cryptodev_vhost_user_sym_close_session(
+           CryptoDevBackend *backend,
+           uint64_t session_id,
+           uint32_t queue_index, Error **errp)
+{
+    CryptoDevBackendClient *cc =
+                  backend->conf.peers.ccs[queue_index];
+    CryptoDevBackendVhost *vhost_crypto;
+    int ret;
+
+    vhost_crypto = cryptodev_vhost_user_get_vhost(cc, backend, queue_index);
+    if (vhost_crypto) {
+        struct vhost_dev *dev = &(vhost_crypto->dev);
+        ret = dev->vhost_ops->vhost_crypto_close_session(dev,
+                                                         session_id);
+        if (ret < 0) {
+            return -1;
+        } else {
+            return 0;
+        }
+    }
+    return -1;
+}
+
+static void cryptodev_vhost_user_cleanup(
+             CryptoDevBackend *backend,
+             Error **errp)
+{
+    CryptoDevBackendVhostUser *s =
+                      CRYPTODEV_BACKEND_VHOST_USER(backend);
+    size_t i;
+    int queues = backend->conf.peers.queues;
+    CryptoDevBackendClient *cc;
+
+    cryptodev_vhost_user_stop(queues, s);
+
+    for (i = 0; i < queues; i++) {
+        cc = backend->conf.peers.ccs[i];
+        if (cc) {
+            cryptodev_backend_free_client(cc);
+            backend->conf.peers.ccs[i] = NULL;
+        }
+    }
+}
+
+static void cryptodev_vhost_user_set_chardev(Object *obj,
+                                    const char *value, Error **errp)
+{
+    CryptoDevBackendVhostUser *s =
+                      CRYPTODEV_BACKEND_VHOST_USER(obj);
+
+    if (s->opened) {
+        error_setg(errp, QERR_PERMISSION_DENIED);
+    } else {
+        g_free(s->chr_name);
+        s->chr_name = g_strdup(value);
+    }
+}
+
+static char *
+cryptodev_vhost_user_get_chardev(Object *obj, Error **errp)
+{
+    CryptoDevBackendVhostUser *s =
+                      CRYPTODEV_BACKEND_VHOST_USER(obj);
+    Chardev *chr = qemu_chr_fe_get_driver(&s->chr);
+
+    if (chr && chr->label) {
+        return g_strdup(chr->label);
+    }
+
+    return NULL;
+}
+
+static void cryptodev_vhost_user_instance_int(Object *obj)
+{
+    object_property_add_str(obj, "chardev",
+                            cryptodev_vhost_user_get_chardev,
+                            cryptodev_vhost_user_set_chardev,
+                            NULL);
+}
+
+static void cryptodev_vhost_user_finalize(Object *obj)
+{
+    CryptoDevBackendVhostUser *s =
+                      CRYPTODEV_BACKEND_VHOST_USER(obj);
+
+    qemu_chr_fe_deinit(&s->chr, false);
+
+    g_free(s->chr_name);
+}
+
+static void
+cryptodev_vhost_user_class_init(ObjectClass *oc, void *data)
+{
+    CryptoDevBackendClass *bc = CRYPTODEV_BACKEND_CLASS(oc);
+
+    bc->init = cryptodev_vhost_user_init;
+    bc->cleanup = cryptodev_vhost_user_cleanup;
+    bc->create_session = cryptodev_vhost_user_sym_create_session;
+    bc->close_session = cryptodev_vhost_user_sym_close_session;
+    bc->do_sym_op = NULL;
+}
+
+static const TypeInfo cryptodev_vhost_user_info = {
+    .name = TYPE_CRYPTODEV_BACKEND_VHOST_USER,
+    .parent = TYPE_CRYPTODEV_BACKEND,
+    .class_init = cryptodev_vhost_user_class_init,
+    .instance_init = cryptodev_vhost_user_instance_int,
+    .instance_finalize = cryptodev_vhost_user_finalize,
+    .instance_size = sizeof(CryptoDevBackendVhostUser),
+};
+
+static void
+cryptodev_vhost_user_register_types(void)
+{
+    type_register_static(&cryptodev_vhost_user_info);
+}
+
+type_init(cryptodev_vhost_user_register_types);
--- a/backends/cryptodev-vhost.c
+++ b/backends/cryptodev-vhost.c
@@ -0,0 +1,347 @@
+/*
+ * QEMU Cryptodev backend for QEMU cipher APIs
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ *
+ * Authors:
+ *    Gonglei <arei.gonglei@huawei.com>
+ *    Jay Zhou <jianjay.zhou@huawei.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "hw/virtio/virtio-bus.h"
+#include "sysemu/cryptodev-vhost.h"
+
+#ifdef CONFIG_VHOST_CRYPTO
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/virtio-crypto.h"
+#include "sysemu/cryptodev-vhost-user.h"
+
+uint64_t
+cryptodev_vhost_get_max_queues(
+                        CryptoDevBackendVhost *crypto)
+{
+    return crypto->dev.max_queues;
+}
+
+void cryptodev_vhost_cleanup(CryptoDevBackendVhost *crypto)
+{
+    vhost_dev_cleanup(&crypto->dev);
+    g_free(crypto);
+}
+
+struct CryptoDevBackendVhost *
+cryptodev_vhost_init(
+             CryptoDevBackendVhostOptions *options)
+{
+    int r;
+    CryptoDevBackendVhost *crypto;
+
+    crypto = g_new(CryptoDevBackendVhost, 1);
+    crypto->dev.max_queues = 1;
+    crypto->dev.nvqs = 1;
+    crypto->dev.vqs = crypto->vqs;
+
+    crypto->cc = options->cc;
+
+    crypto->dev.protocol_features = 0;
+    crypto->backend = -1;
+
+    /* vhost-user needs vq_index to initiate a specific queue pair */
+    crypto->dev.vq_index = crypto->cc->queue_index * crypto->dev.nvqs;
+
+    r = vhost_dev_init(&crypto->dev, options->opaque, options->backend_type, 0);
+    if (r < 0) {
+        goto fail;
+    }
+
+    return crypto;
+fail:
+    g_free(crypto);
+    return NULL;
+}
+
+static int
+cryptodev_vhost_start_one(CryptoDevBackendVhost *crypto,
+                                  VirtIODevice *dev)
+{
+    int r;
+
+    crypto->dev.nvqs = 1;
+    crypto->dev.vqs = crypto->vqs;
+
+    r = vhost_dev_enable_notifiers(&crypto->dev, dev);
+    if (r < 0) {
+        goto fail_notifiers;
+    }
+
+    r = vhost_dev_start(&crypto->dev, dev);
+    if (r < 0) {
+        goto fail_start;
+    }
+
+    return 0;
+
+fail_start:
+    vhost_dev_disable_notifiers(&crypto->dev, dev);
+fail_notifiers:
+    return r;
+}
+
+static void
+cryptodev_vhost_stop_one(CryptoDevBackendVhost *crypto,
+                                 VirtIODevice *dev)
+{
+    vhost_dev_stop(&crypto->dev, dev);
+    vhost_dev_disable_notifiers(&crypto->dev, dev);
+}
+
+CryptoDevBackendVhost *
+cryptodev_get_vhost(CryptoDevBackendClient *cc,
+                            CryptoDevBackend *b,
+                            uint16_t queue)
+{
+    CryptoDevBackendVhost *vhost_crypto = NULL;
+
+    if (!cc) {
+        return NULL;
+    }
+
+    switch (cc->type) {
+#if defined(CONFIG_VHOST_USER) && defined(CONFIG_LINUX)
+    case CRYPTODEV_BACKEND_TYPE_VHOST_USER:
+        vhost_crypto = cryptodev_vhost_user_get_vhost(cc, b, queue);
+        break;
+#endif
+    default:
+        break;
+    }
+
+    return vhost_crypto;
+}
+
+static void
+cryptodev_vhost_set_vq_index(CryptoDevBackendVhost *crypto,
+                                     int vq_index)
+{
+    crypto->dev.vq_index = vq_index;
+}
+
+static int
+vhost_set_vring_enable(CryptoDevBackendClient *cc,
+                            CryptoDevBackend *b,
+                            uint16_t queue, int enable)
+{
+    CryptoDevBackendVhost *crypto =
+                       cryptodev_get_vhost(cc, b, queue);
+    const VhostOps *vhost_ops;
+
+    cc->vring_enable = enable;
+
+    if (!crypto) {
+        return 0;
+    }
+
+    vhost_ops = crypto->dev.vhost_ops;
+    if (vhost_ops->vhost_set_vring_enable) {
+        return vhost_ops->vhost_set_vring_enable(&crypto->dev, enable);
+    }
+
+    return 0;
+}
+
+int cryptodev_vhost_start(VirtIODevice *dev, int total_queues)
+{
+    VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(dev);
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(dev)));
+    VirtioBusState *vbus = VIRTIO_BUS(qbus);
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
+    int r, e;
+    int i;
+    CryptoDevBackend *b = vcrypto->cryptodev;
+    CryptoDevBackendVhost *vhost_crypto;
+    CryptoDevBackendClient *cc;
+
+    if (!k->set_guest_notifiers) {
+        error_report("binding does not support guest notifiers");
+        return -ENOSYS;
+    }
+
+    for (i = 0; i < total_queues; i++) {
+        cc = b->conf.peers.ccs[i];
+
+        vhost_crypto = cryptodev_get_vhost(cc, b, i);
+        cryptodev_vhost_set_vq_index(vhost_crypto, i);
+
+        /* Suppress the masking guest notifiers on vhost user
+         * because vhost user doesn't interrupt masking/unmasking
+         * properly.
+         */
+        if (cc->type == CRYPTODEV_BACKEND_TYPE_VHOST_USER) {
+            dev->use_guest_notifier_mask = false;
+        }
+     }
+
+    r = k->set_guest_notifiers(qbus->parent, total_queues, true);
+    if (r < 0) {
+        error_report("error binding guest notifier: %d", -r);
+        goto err;
+    }
+
+    for (i = 0; i < total_queues; i++) {
+        cc = b->conf.peers.ccs[i];
+
+        vhost_crypto = cryptodev_get_vhost(cc, b, i);
+        r = cryptodev_vhost_start_one(vhost_crypto, dev);
+
+        if (r < 0) {
+            goto err_start;
+        }
+
+        if (cc->vring_enable) {
+            /* restore vring enable state */
+            r = vhost_set_vring_enable(cc, b, i, cc->vring_enable);
+
+            if (r < 0) {
+                goto err_start;
+            }
+        }
+    }
+
+    return 0;
+
+err_start:
+    while (--i >= 0) {
+        cc = b->conf.peers.ccs[i];
+        vhost_crypto = cryptodev_get_vhost(cc, b, i);
+        cryptodev_vhost_stop_one(vhost_crypto, dev);
+    }
+    e = k->set_guest_notifiers(qbus->parent, total_queues, false);
+    if (e < 0) {
+        error_report("vhost guest notifier cleanup failed: %d", e);
+    }
+err:
+    return r;
+}
+
+void cryptodev_vhost_stop(VirtIODevice *dev, int total_queues)
+{
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(dev)));
+    VirtioBusState *vbus = VIRTIO_BUS(qbus);
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
+    VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(dev);
+    CryptoDevBackend *b = vcrypto->cryptodev;
+    CryptoDevBackendVhost *vhost_crypto;
+    CryptoDevBackendClient *cc;
+    size_t i;
+    int r;
+
+    for (i = 0; i < total_queues; i++) {
+        cc = b->conf.peers.ccs[i];
+
+        vhost_crypto = cryptodev_get_vhost(cc, b, i);
+        cryptodev_vhost_stop_one(vhost_crypto, dev);
+    }
+
+    r = k->set_guest_notifiers(qbus->parent, total_queues, false);
+    if (r < 0) {
+        error_report("vhost guest notifier cleanup failed: %d", r);
+    }
+    assert(r >= 0);
+}
+
+void cryptodev_vhost_virtqueue_mask(VirtIODevice *dev,
+                                           int queue,
+                                           int idx, bool mask)
+{
+    VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(dev);
+    CryptoDevBackend *b = vcrypto->cryptodev;
+    CryptoDevBackendVhost *vhost_crypto;
+    CryptoDevBackendClient *cc;
+
+    assert(queue < MAX_CRYPTO_QUEUE_NUM);
+
+    cc = b->conf.peers.ccs[queue];
+    vhost_crypto = cryptodev_get_vhost(cc, b, queue);
+
+    vhost_virtqueue_mask(&vhost_crypto->dev, dev, idx, mask);
+}
+
+bool cryptodev_vhost_virtqueue_pending(VirtIODevice *dev,
+                                              int queue, int idx)
+{
+    VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(dev);
+    CryptoDevBackend *b = vcrypto->cryptodev;
+    CryptoDevBackendVhost *vhost_crypto;
+    CryptoDevBackendClient *cc;
+
+    assert(queue < MAX_CRYPTO_QUEUE_NUM);
+
+    cc = b->conf.peers.ccs[queue];
+    vhost_crypto = cryptodev_get_vhost(cc, b, queue);
+
+    return vhost_virtqueue_pending(&vhost_crypto->dev, idx);
+}
+
+#else
+uint64_t
+cryptodev_vhost_get_max_queues(CryptoDevBackendVhost *crypto)
+{
+    return 0;
+}
+
+void cryptodev_vhost_cleanup(CryptoDevBackendVhost *crypto)
+{
+}
+
+struct CryptoDevBackendVhost *
+cryptodev_vhost_init(CryptoDevBackendVhostOptions *options)
+{
+    return NULL;
+}
+
+CryptoDevBackendVhost *
+cryptodev_get_vhost(CryptoDevBackendClient *cc,
+                    CryptoDevBackend *b,
+                    uint16_t queue)
+{
+    return NULL;
+}
+
+int cryptodev_vhost_start(VirtIODevice *dev, int total_queues)
+{
+    return -1;
+}
+
+void cryptodev_vhost_stop(VirtIODevice *dev, int total_queues)
+{
+}
+
+void cryptodev_vhost_virtqueue_mask(VirtIODevice *dev,
+                                    int queue,
+                                    int idx, bool mask)
+{
+}
+
+bool cryptodev_vhost_virtqueue_pending(VirtIODevice *dev,
+                                       int queue, int idx)
+{
+    return false;
+}
+#endif
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -26,8 +26,6 @@
 #include "hw/boards.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
-#include "qapi-types.h"
-#include "qapi-visit.h"
 #include "qemu/config-file.h"
 #include "qom/object_interfaces.h"
 #include "hw/virtio/virtio-crypto.h"
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -31,9 +31,9 @@ typedef struct HostMemoryBackendFile HostMemoryBackendFile;
 struct HostMemoryBackendFile {
    HostMemoryBackend parent_obj;

-    bool share;
    bool discard_data;
    char *mem_path;
+    uint64_t align;
 };

 static void
@@ -58,7 +58,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
        path = object_get_canonical_path(OBJECT(backend));
        memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
                                 path,
-                                 backend->size, fb->share,
+                                 backend->size, fb->align, backend->share,
                                 fb->mem_path, errp);
        g_free(path);
    }
@@ -85,25 +85,6 @@ static void set_mem_path(Object *o, const char *str, Error **errp)
    fb->mem_path = g_strdup(str);
 }

-static bool file_memory_backend_get_share(Object *o, Error **errp)
-{
-    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
-
-    return fb->share;
-}
-
-static void file_memory_backend_set_share(Object *o, bool value, Error **errp)
-{
-    HostMemoryBackend *backend = MEMORY_BACKEND(o);
-    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
-
-    if (host_memory_backend_mr_inited(backend)) {
-        error_setg(errp, "cannot change property value");
-        return;
-    }
-    fb->share = value;
-}
-
 static bool file_memory_backend_get_discard_data(Object *o, Error **errp)
 {
    return MEMORY_BACKEND_FILE(o)->discard_data;
@@ -115,6 +96,40 @@ static void file_memory_backend_set_discard_data(Object *o, bool value,
    MEMORY_BACKEND_FILE(o)->discard_data = value;
 }

+static void file_memory_backend_get_align(Object *o, Visitor *v,
+                                          const char *name, void *opaque,
+                                          Error **errp)
+{
+    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
+    uint64_t val = fb->align;
+
+    visit_type_size(v, name, &val, errp);
+}
+
+static void file_memory_backend_set_align(Object *o, Visitor *v,
+                                          const char *name, void *opaque,
+                                          Error **errp)
+{
+    HostMemoryBackend *backend = MEMORY_BACKEND(o);
+    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
+    Error *local_err = NULL;
+    uint64_t val;
+
+    if (host_memory_backend_mr_inited(backend)) {
+        error_setg(&local_err, "cannot change property value");
+        goto out;
+    }
+
+    visit_type_size(v, name, &val, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    fb->align = val;
+
+ out:
+    error_propagate(errp, local_err);
+}
+
 static void file_backend_unparent(Object *obj)
 {
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
@@ -136,15 +151,16 @@ file_backend_class_init(ObjectClass *oc, void *data)
    bc->alloc = file_backend_memory_alloc;
    oc->unparent = file_backend_unparent;

-    object_class_property_add_bool(oc, "share",
-        file_memory_backend_get_share, file_memory_backend_set_share,
-        &error_abort);
    object_class_property_add_bool(oc, "discard-data",
        file_memory_backend_get_discard_data, file_memory_backend_set_discard_data,
        &error_abort);
    object_class_property_add_str(oc, "mem-path",
        get_mem_path, set_mem_path,
        &error_abort);
+    object_class_property_add(oc, "align", "int",
+        file_memory_backend_get_align,
+        file_memory_backend_set_align,
+        NULL, NULL, &error_abort);
 }

 static void file_backend_instance_finalize(Object *o)
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -0,0 +1,170 @@
+/*
+ * QEMU host memfd memory backend
+ *
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ *   Marc-André Lureau <marcandre.lureau@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "sysemu/hostmem.h"
+#include "sysemu/sysemu.h"
+#include "qom/object_interfaces.h"
+#include "qemu/memfd.h"
+#include "qapi/error.h"
+
+#define TYPE_MEMORY_BACKEND_MEMFD "memory-backend-memfd"
+
+#define MEMORY_BACKEND_MEMFD(obj)                                        \
+    OBJECT_CHECK(HostMemoryBackendMemfd, (obj), TYPE_MEMORY_BACKEND_MEMFD)
+
+typedef struct HostMemoryBackendMemfd HostMemoryBackendMemfd;
+
+struct HostMemoryBackendMemfd {
+    HostMemoryBackend parent_obj;
+
+    bool hugetlb;
+    uint64_t hugetlbsize;
+    bool seal;
+};
+
+static void
+memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
+{
+    HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(backend);
+    char *name;
+    int fd;
+
+    if (!backend->size) {
+        error_setg(errp, "can't create backend with size 0");
+        return;
+    }
+
+    if (host_memory_backend_mr_inited(backend)) {
+        return;
+    }
+
+    backend->force_prealloc = mem_prealloc;
+    fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size,
+                           m->hugetlb, m->hugetlbsize, m->seal ?
+                           F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0,
+                           errp);
+    if (fd == -1) {
+        return;
+    }
+
+    name = object_get_canonical_path(OBJECT(backend));
+    memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend),
+                                   name, backend->size, true, fd, errp);
+    g_free(name);
+}
+
+static bool
+memfd_backend_get_hugetlb(Object *o, Error **errp)
+{
+    return MEMORY_BACKEND_MEMFD(o)->hugetlb;
+}
+
+static void
+memfd_backend_set_hugetlb(Object *o, bool value, Error **errp)
+{
+    MEMORY_BACKEND_MEMFD(o)->hugetlb = value;
+}
+
+static void
+memfd_backend_set_hugetlbsize(Object *obj, Visitor *v, const char *name,
+                              void *opaque, Error **errp)
+{
+    HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(obj);
+    Error *local_err = NULL;
+    uint64_t value;
+
+    if (host_memory_backend_mr_inited(MEMORY_BACKEND(obj))) {
+        error_setg(&local_err, "cannot change property value");
+        goto out;
+    }
+
+    visit_type_size(v, name, &value, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    if (!value) {
+        error_setg(&local_err, "Property '%s.%s' doesn't take value '%"
+                   PRIu64 "'", object_get_typename(obj), name, value);
+        goto out;
+    }
+    m->hugetlbsize = value;
+out:
+    error_propagate(errp, local_err);
+}
+
+static void
+memfd_backend_get_hugetlbsize(Object *obj, Visitor *v, const char *name,
+                              void *opaque, Error **errp)
+{
+    HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(obj);
+    uint64_t value = m->hugetlbsize;
+
+    visit_type_size(v, name, &value, errp);
+}
+
+static bool
+memfd_backend_get_seal(Object *o, Error **errp)
+{
+    return MEMORY_BACKEND_MEMFD(o)->seal;
+}
+
+static void
+memfd_backend_set_seal(Object *o, bool value, Error **errp)
+{
+    MEMORY_BACKEND_MEMFD(o)->seal = value;
+}
+
+static void
+memfd_backend_instance_init(Object *obj)
+{
+    HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(obj);
+
+    /* default to sealed file */
+    m->seal = true;
+}
+
+static void
+memfd_backend_class_init(ObjectClass *oc, void *data)
+{
+    HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc);
+
+    bc->alloc = memfd_backend_memory_alloc;
+
+    object_class_property_add_bool(oc, "hugetlb",
+                                   memfd_backend_get_hugetlb,
+                                   memfd_backend_set_hugetlb,
+                                   &error_abort);
+    object_class_property_add(oc, "hugetlbsize", "int",
+                              memfd_backend_get_hugetlbsize,
+                              memfd_backend_set_hugetlbsize,
+                              NULL, NULL, &error_abort);
+    object_class_property_add_bool(oc, "seal",
+                                   memfd_backend_get_seal,
+                                   memfd_backend_set_seal,
+                                   &error_abort);
+}
+
+static const TypeInfo memfd_backend_info = {
+    .name = TYPE_MEMORY_BACKEND_MEMFD,
+    .parent = TYPE_MEMORY_BACKEND,
+    .instance_init = memfd_backend_instance_init,
+    .class_init = memfd_backend_class_init,
+    .instance_size = sizeof(HostMemoryBackendMemfd),
+};
+
+static void register_types(void)
+{
+    type_register_static(&memfd_backend_info);
+}
+
+type_init(register_types);
--- a/backends/hostmem-ram.c
+++ b/backends/hostmem-ram.c
@@ -28,8 +28,8 @@ ram_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
    }

    path = object_get_canonical_path_component(OBJECT(backend));
-    memory_region_init_ram_nomigrate(&backend->mr, OBJECT(backend), path,
-                           backend->size, errp);
+    memory_region_init_ram_shared_nomigrate(&backend->mr, OBJECT(backend), path,
+                           backend->size, backend->share, errp);
    g_free(path);
 }

--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -9,13 +9,13 @@
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
+
 #include "qemu/osdep.h"
 #include "sysemu/hostmem.h"
 #include "hw/boards.h"
 #include "qapi/error.h"
+#include "qapi/qapi-builtin-visit.h"
 #include "qapi/visitor.h"
-#include "qapi-types.h"
-#include "qapi-visit.h"
 #include "qemu/config-file.h"
 #include "qom/object_interfaces.h"

@@ -369,6 +369,24 @@ static void set_id(Object *o, const char *str, Error **errp)
    backend->id = g_strdup(str);
 }

+static bool host_memory_backend_get_share(Object *o, Error **errp)
+{
+    HostMemoryBackend *backend = MEMORY_BACKEND(o);
+
+    return backend->share;
+}
+
+static void host_memory_backend_set_share(Object *o, bool value, Error **errp)
+{
+    HostMemoryBackend *backend = MEMORY_BACKEND(o);
+
+    if (host_memory_backend_mr_inited(backend)) {
+        error_setg(errp, "cannot change property value");
+        return;
+    }
+    backend->share = value;
+}
+
 static void
 host_memory_backend_class_init(ObjectClass *oc, void *data)
 {
@@ -399,6 +417,9 @@ host_memory_backend_class_init(ObjectClass *oc, void *data)
        host_memory_backend_get_policy,
        host_memory_backend_set_policy, &error_abort);
    object_class_property_add_str(oc, "id", get_id, set_id, &error_abort);
+    object_class_property_add_bool(oc, "share",
+        host_memory_backend_get_share, host_memory_backend_set_share,
+        &error_abort);
 }

 static void host_memory_backend_finalize(Object *o)
--- a/backends/tpm.c
+++ b/backends/tpm.c
@@ -15,25 +15,43 @@
 #include "qemu/osdep.h"
 #include "sysemu/tpm_backend.h"
 #include "qapi/error.h"
-#include "qapi/qmp/qerror.h"
 #include "sysemu/tpm.h"
-#include "hw/tpm/tpm_int.h"
 #include "qemu/thread.h"
+#include "qemu/main-loop.h"
+#include "block/thread-pool.h"
+#include "qemu/error-report.h"

-static void tpm_backend_worker_thread(gpointer data, gpointer user_data)
+static void tpm_backend_request_completed(void *opaque, int ret)
 {
-    TPMBackend *s = TPM_BACKEND(user_data);
-    TPMBackendClass *k  = TPM_BACKEND_GET_CLASS(s);
+    TPMBackend *s = TPM_BACKEND(opaque);
+    TPMIfClass *tic = TPM_IF_GET_CLASS(s->tpmif);

-    assert(k->handle_request != NULL);
-    k->handle_request(s, (TPMBackendCmd *)data);
+    tic->request_completed(s->tpmif, ret);
+
+    /* no need for atomic, as long the BQL is taken */
+    s->cmd = NULL;
+    object_unref(OBJECT(s));
 }

-static void tpm_backend_thread_end(TPMBackend *s)
+static int tpm_backend_worker_thread(gpointer data)
 {
-    if (s->thread_pool) {
-        g_thread_pool_free(s->thread_pool, FALSE, TRUE);
-        s->thread_pool = NULL;
+    TPMBackend *s = TPM_BACKEND(data);
+    TPMBackendClass *k = TPM_BACKEND_GET_CLASS(s);
+    Error *err = NULL;
+
+    k->handle_request(s, s->cmd, &err);
+    if (err) {
+        error_report_err(err);
+        return -1;
+    }
+
+    return 0;
+}
+
+void tpm_backend_finish_sync(TPMBackend *s)
+{
+    while (s->cmd) {
+        aio_poll(qemu_get_aio_context(), true);
    }
 }

@@ -44,26 +62,30 @@ enum TpmType tpm_backend_get_type(TPMBackend *s)
    return k->type;
 }

-int tpm_backend_init(TPMBackend *s, TPMState *state)
+int tpm_backend_init(TPMBackend *s, TPMIf *tpmif, Error **errp)
 {
-    s->tpm_state = state;
+    if (s->tpmif) {
+        error_setg(errp, "TPM backend '%s' is already initialized", s->id);
+        return -1;
+    }
+
+    s->tpmif = tpmif;
+    object_ref(OBJECT(tpmif));
+
    s->had_startup_error = false;

    return 0;
 }

-int tpm_backend_startup_tpm(TPMBackend *s)
+int tpm_backend_startup_tpm(TPMBackend *s, size_t buffersize)
 {
    int res = 0;
    TPMBackendClass *k = TPM_BACKEND_GET_CLASS(s);

    /* terminate a running TPM */
-    tpm_backend_thread_end(s);
+    tpm_backend_finish_sync(s);

-    s->thread_pool = g_thread_pool_new(tpm_backend_worker_thread, s, 1, TRUE,
-                                       NULL);
-
-    res = k->startup_tpm ? k->startup_tpm(s) : 0;
+    res = k->startup_tpm ? k->startup_tpm(s, buffersize) : 0;

    s->had_startup_error = (res != 0);

@@ -77,7 +99,17 @@ bool tpm_backend_had_startup_error(TPMBackend *s)

 void tpm_backend_deliver_request(TPMBackend *s, TPMBackendCmd *cmd)
 {
-    g_thread_pool_push(s->thread_pool, cmd, NULL);
+    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+
+    if (s->cmd != NULL) {
+        error_report("There is a TPM request pending");
+        return;
+    }
+
+    s->cmd = cmd;
+    object_ref(OBJECT(s));
+    thread_pool_submit_aio(pool, tpm_backend_worker_thread, s,
+                           tpm_backend_request_completed, s);
 }

 void tpm_backend_reset(TPMBackend *s)
@@ -88,7 +120,7 @@ void tpm_backend_reset(TPMBackend *s)
        k->reset(s);
    }

-    tpm_backend_thread_end(s);
+    tpm_backend_finish_sync(s);

    s->had_startup_error = false;
 }
@@ -97,8 +129,6 @@ void tpm_backend_cancel_cmd(TPMBackend *s)
 {
    TPMBackendClass *k = TPM_BACKEND_GET_CLASS(s);

-    assert(k->cancel_cmd);
-
    k->cancel_cmd(s);
 }

@@ -122,87 +152,41 @@ TPMVersion tpm_backend_get_tpm_version(TPMBackend *s)
 {
    TPMBackendClass *k = TPM_BACKEND_GET_CLASS(s);

-    assert(k->get_tpm_version);
-
    return k->get_tpm_version(s);
 }

+size_t tpm_backend_get_buffer_size(TPMBackend *s)
+{
+    TPMBackendClass *k = TPM_BACKEND_GET_CLASS(s);
+
+    return k->get_buffer_size(s);
+}
+
 TPMInfo *tpm_backend_query_tpm(TPMBackend *s)
 {
    TPMInfo *info = g_new0(TPMInfo, 1);
    TPMBackendClass *k = TPM_BACKEND_GET_CLASS(s);
+    TPMIfClass *tic = TPM_IF_GET_CLASS(s->tpmif);

    info->id = g_strdup(s->id);
-    info->model = s->fe_model;
-    if (k->get_tpm_options) {
+    info->model = tic->model;
    info->options = k->get_tpm_options(s);
-    }

    return info;
 }

-static bool tpm_backend_prop_get_opened(Object *obj, Error **errp)
-{
-    TPMBackend *s = TPM_BACKEND(obj);
-
-    return s->opened;
-}
-
-void tpm_backend_open(TPMBackend *s, Error **errp)
-{
-    object_property_set_bool(OBJECT(s), true, "opened", errp);
-}
-
-static void tpm_backend_prop_set_opened(Object *obj, bool value, Error **errp)
-{
-    TPMBackend *s = TPM_BACKEND(obj);
-    TPMBackendClass *k = TPM_BACKEND_GET_CLASS(s);
-    Error *local_err = NULL;
-
-    if (value == s->opened) {
-        return;
-    }
-
-    if (!value && s->opened) {
-        error_setg(errp, QERR_PERMISSION_DENIED);
-        return;
-    }
-
-    if (k->opened) {
-        k->opened(s, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            return;
-        }
-    }
-
-    s->opened = true;
-}
-
-static void tpm_backend_instance_init(Object *obj)
-{
-    TPMBackend *s = TPM_BACKEND(obj);
-
-    object_property_add_bool(obj, "opened",
-                             tpm_backend_prop_get_opened,
-                             tpm_backend_prop_set_opened,
-                             NULL);
-    s->fe_model = -1;
-}
-
 static void tpm_backend_instance_finalize(Object *obj)
 {
    TPMBackend *s = TPM_BACKEND(obj);

+    object_unref(OBJECT(s->tpmif));
    g_free(s->id);
-    tpm_backend_thread_end(s);
 }

 static const TypeInfo tpm_backend_info = {
    .name = TYPE_TPM_BACKEND,
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(TPMBackend),
-    .instance_init = tpm_backend_instance_init,
    .instance_finalize = tpm_backend_instance_finalize,
    .class_size = sizeof(TPMBackendClass),
    .abstract = true,
--- a/balloon.c
+++ b/balloon.c
@@ -30,9 +30,9 @@
 #include "sysemu/kvm.h"
 #include "sysemu/balloon.h"
 #include "trace-root.h"
-#include "qmp-commands.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-misc.h"
 #include "qapi/qmp/qerror.h"
-#include "qapi/qmp/qjson.h"

 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
--- a/block.c
+++ b/block.c
@@ -21,6 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+
 #include "qemu/osdep.h"
 #include "block/trace.h"
 #include "block/block_int.h"
@@ -29,17 +30,19 @@
 #include "qemu/error-report.h"
 #include "module_block.h"
 #include "qemu/module.h"
-#include "qapi/qmp/qerror.h"
-#include "qapi/qmp/qbool.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qjson.h"
+#include "qapi/qmp/qstring.h"
+#include "qapi/qobject-output-visitor.h"
+#include "qapi/qapi-visit-block-core.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/sysemu.h"
 #include "qemu/notify.h"
+#include "qemu/option.h"
 #include "qemu/coroutine.h"
 #include "block/qapi.h"
-#include "qmp-commands.h"
 #include "qemu/timer.h"
-#include "qapi-event.h"
 #include "qemu/cutils.h"
 #include "qemu/id.h"

@@ -367,7 +370,7 @@ BlockDriver *bdrv_find_format(const char *format_name)
    return bdrv_do_find_format(format_name);
 }

-static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
+int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
 {
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
@@ -417,7 +420,7 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque)
    CreateCo *cco = opaque;
    assert(cco->drv);

-    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
+    ret = cco->drv->bdrv_co_create_opts(cco->filename, cco->opts, &local_err);
    error_propagate(&cco->err, local_err);
    cco->ret = ret;
 }
@@ -436,7 +439,7 @@ int bdrv_create(BlockDriver *drv, const char* filename,
        .err = NULL,
    };

-    if (!drv->bdrv_create) {
+    if (!drv->bdrv_co_create_opts) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
@@ -822,6 +825,18 @@ static void bdrv_child_cb_drained_end(BdrvChild *child)
    bdrv_drained_end(bs);
 }

+static void bdrv_child_cb_attach(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_apply_subtree_drain(child, bs);
+}
+
+static void bdrv_child_cb_detach(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_unapply_subtree_drain(child, bs);
+}
+
 static int bdrv_child_cb_inactivate(BdrvChild *child)
 {
    BlockDriverState *bs = child->opaque;
@@ -889,6 +904,8 @@ const BdrvChildRole child_file = {
    .inherit_options = bdrv_inherited_options,
    .drained_begin   = bdrv_child_cb_drained_begin,
    .drained_end     = bdrv_child_cb_drained_end,
+    .attach          = bdrv_child_cb_attach,
+    .detach          = bdrv_child_cb_detach,
    .inactivate      = bdrv_child_cb_inactivate,
 };

@@ -911,6 +928,8 @@ const BdrvChildRole child_format = {
    .inherit_options = bdrv_inherited_fmt_options,
    .drained_begin   = bdrv_child_cb_drained_begin,
    .drained_end     = bdrv_child_cb_drained_end,
+    .attach          = bdrv_child_cb_attach,
+    .detach          = bdrv_child_cb_detach,
    .inactivate      = bdrv_child_cb_inactivate,
 };

@@ -953,6 +972,8 @@ static void bdrv_backing_attach(BdrvChild *c)
                    parent->backing_blocker);
    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
                    parent->backing_blocker);
+
+    bdrv_child_cb_attach(c);
 }

 static void bdrv_backing_detach(BdrvChild *c)
@@ -963,6 +984,8 @@ static void bdrv_backing_detach(BdrvChild *c)
    bdrv_op_unblock_all(c->bs, parent->backing_blocker);
    error_free(parent->backing_blocker);
    parent->backing_blocker = NULL;
+
+    bdrv_child_cb_detach(c);
 }

 /*
@@ -1924,6 +1947,8 @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
    assert(role == &child_backing || role == &child_file);

    if (!backing) {
+        int flags = bdrv_reopen_get_flags(reopen_queue, bs);
+
        /* Apart from the modifications below, the same permissions are
         * forwarded and left alone as for filters */
        bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
@@ -1936,7 +1961,9 @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,

        /* bs->file always needs to be consistent because of the metadata. We
         * can never allow other users to resize or write to it. */
+        if (!(flags & BDRV_O_NO_IO)) {
            perm |= BLK_PERM_CONSISTENT_READ;
+        }
        shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
    } else {
        /* We want consistent read from backing files if the parent needs it.
@@ -1968,17 +1995,23 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs)
 {
    BlockDriverState *old_bs = child->bs;
+    int i;

    if (old_bs && new_bs) {
        assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
    }
    if (old_bs) {
-        if (old_bs->quiesce_counter && child->role->drained_end) {
-            child->role->drained_end(child);
-        }
+        /* Detach first so that the recursive drain sections coming from @child
+         * are already gone and we only end the drain sections that came from
+         * elsewhere. */
        if (child->role->detach) {
            child->role->detach(child);
        }
+        if (old_bs->quiesce_counter && child->role->drained_end) {
+            for (i = 0; i < old_bs->quiesce_counter; i++) {
+                child->role->drained_end(child);
+            }
+        }
        QLIST_REMOVE(child, next_parent);
    }

@@ -1987,9 +2020,14 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
    if (new_bs) {
        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
        if (new_bs->quiesce_counter && child->role->drained_begin) {
+            for (i = 0; i < new_bs->quiesce_counter; i++) {
                child->role->drained_begin(child);
            }
+        }

+        /* Attach only after starting new drained sections, so that recursive
+         * drain sections coming from @child don't get an extra .drained_begin
+         * callback. */
        if (child->role->attach) {
            child->role->attach(child);
        }
@@ -2370,6 +2408,51 @@ BdrvChild *bdrv_open_child(const char *filename,
    return c;
 }

+/* TODO Future callers may need to specify parent/child_role in order for
+ * option inheritance to work. Existing callers use it for the root node. */
+BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
+{
+    BlockDriverState *bs = NULL;
+    Error *local_err = NULL;
+    QObject *obj = NULL;
+    QDict *qdict = NULL;
+    const char *reference = NULL;
+    Visitor *v = NULL;
+
+    if (ref->type == QTYPE_QSTRING) {
+        reference = ref->u.reference;
+    } else {
+        BlockdevOptions *options = &ref->u.definition;
+        assert(ref->type == QTYPE_QDICT);
+
+        v = qobject_output_visitor_new(&obj);
+        visit_type_BlockdevOptions(v, NULL, &options, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            goto fail;
+        }
+        visit_complete(v, &obj);
+
+        qdict = qobject_to_qdict(obj);
+        qdict_flatten(qdict);
+
+        /* bdrv_open_inherit() defaults to the values in bdrv_flags (for
+         * compatibility with other callers) rather than what we want as the
+         * real defaults. Apply the defaults here instead. */
+        qdict_set_default_str(qdict, BDRV_OPT_CACHE_DIRECT, "off");
+        qdict_set_default_str(qdict, BDRV_OPT_CACHE_NO_FLUSH, "off");
+        qdict_set_default_str(qdict, BDRV_OPT_READ_ONLY, "off");
+    }
+
+    bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, errp);
+    obj = NULL;
+
+fail:
+    qobject_decref(obj);
+    visit_free(v);
+    return bs;
+}
+
 static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
                                                   int flags,
                                                   QDict *snapshot_options,
@@ -2731,6 +2814,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
+ * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
 */
 static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                 BlockDriverState *bs,
@@ -2746,6 +2830,11 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
    BdrvChild *child;
    QDict *old_options, *explicit_options;

+    /* Make sure that the caller remembered to use a drained section. This is
+     * important to avoid graph changes between the recursive queuing here and
+     * bdrv_reopen_multiple(). */
+    assert(bs->quiesce_counter > 0);
+
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
@@ -2870,6 +2959,8 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
+ * All affected nodes must be drained between bdrv_reopen_queue() and
+ * bdrv_reopen_multiple().
 */
 int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
 {
@@ -2879,11 +2970,8 @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er

    assert(bs_queue != NULL);

-    aio_context_release(ctx);
-    bdrv_drain_all_begin();
-    aio_context_acquire(ctx);
-
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
+        assert(bs_entry->state.bs->quiesce_counter > 0);
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
@@ -2912,8 +3000,6 @@ cleanup:
    }
    g_free(bs_queue);

-    bdrv_drain_all_end();
-
    return ret;
 }

@@ -2923,12 +3009,18 @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
 {
    int ret = -1;
    Error *local_err = NULL;
-    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
+    BlockReopenQueue *queue;

+    bdrv_subtree_drained_begin(bs);
+
+    queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
    ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
+
+    bdrv_subtree_drained_end(bs);
+
    return ret;
 }

@@ -3410,17 +3502,54 @@ static void bdrv_delete(BlockDriverState *bs)
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
-int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
+static int coroutine_fn bdrv_co_check(BlockDriverState *bs,
+                                      BdrvCheckResult *res, BdrvCheckMode fix)
 {
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
-    if (bs->drv->bdrv_check == NULL) {
+    if (bs->drv->bdrv_co_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
-    return bs->drv->bdrv_check(bs, res, fix);
+    return bs->drv->bdrv_co_check(bs, res, fix);
+}
+
+typedef struct CheckCo {
+    BlockDriverState *bs;
+    BdrvCheckResult *res;
+    BdrvCheckMode fix;
+    int ret;
+} CheckCo;
+
+static void bdrv_check_co_entry(void *opaque)
+{
+    CheckCo *cco = opaque;
+    cco->ret = bdrv_co_check(cco->bs, cco->res, cco->fix);
+}
+
+int bdrv_check(BlockDriverState *bs,
+               BdrvCheckResult *res, BdrvCheckMode fix)
+{
+    Coroutine *co;
+    CheckCo cco = {
+        .bs = bs,
+        .res = res,
+        .ret = -EINPROGRESS,
+        .fix = fix,
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_check_co_entry(&cco);
+    } else {
+        co = qemu_coroutine_create(bdrv_check_co_entry, &cco);
+        qemu_coroutine_enter(co);
+        BDRV_POLL_WHILE(bs, cco.ret == -EINPROGRESS);
+    }
+
+    return cco.ret;
 }

 /*
@@ -3590,6 +3719,11 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
        error_setg(errp, "No medium inserted");
        return -ENOMEDIUM;
    }
+    if (offset < 0) {
+        error_setg(errp, "Image size cannot be negative");
+        return -EINVAL;
+    }
+
    if (!drv->bdrv_truncate) {
        if (bs->file && drv->is_filter) {
            return bdrv_truncate(bs->file, offset, prealloc, errp);
@@ -3963,17 +4097,11 @@ bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)

 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
 {
-    BlockDriverInfo bdi;
-
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

-    if (bdrv_get_info(bs, &bdi) == 0) {
-        return bdi.can_write_zeroes_with_unmap;
-    }
-
-    return false;
+    return bs->supported_zero_flags & BDRV_REQ_MAY_UNMAP;
 }

 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
@@ -4170,7 +4298,8 @@ void bdrv_init_with_whitelist(void)
    bdrv_init();
 }

-void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
+static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs,
+                                                  Error **errp)
 {
    BdrvChild *child, *parent;
    uint64_t perm, shared_perm;
@@ -4186,7 +4315,7 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
    }

    QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_invalidate_cache(child->bs, &local_err);
+        bdrv_co_invalidate_cache(child->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
@@ -4216,8 +4345,8 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
    }
    bdrv_set_perm(bs, perm, shared_perm);

-    if (bs->drv->bdrv_invalidate_cache) {
-        bs->drv->bdrv_invalidate_cache(bs, &local_err);
+    if (bs->drv->bdrv_co_invalidate_cache) {
+        bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
        if (local_err) {
            bs->open_flags |= BDRV_O_INACTIVE;
            error_propagate(errp, local_err);
@@ -4243,6 +4372,38 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
    }
 }

+typedef struct InvalidateCacheCo {
+    BlockDriverState *bs;
+    Error **errp;
+    bool done;
+} InvalidateCacheCo;
+
+static void coroutine_fn bdrv_invalidate_cache_co_entry(void *opaque)
+{
+    InvalidateCacheCo *ico = opaque;
+    bdrv_co_invalidate_cache(ico->bs, ico->errp);
+    ico->done = true;
+}
+
+void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
+{
+    Coroutine *co;
+    InvalidateCacheCo ico = {
+        .bs = bs,
+        .done = false,
+        .errp = errp
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_invalidate_cache_co_entry(&ico);
+    } else {
+        co = qemu_coroutine_create(bdrv_invalidate_cache_co_entry, &ico);
+        qemu_coroutine_enter(co);
+        BDRV_POLL_WHILE(bs, !ico.done);
+    }
+}
+
 void bdrv_invalidate_cache_all(Error **errp)
 {
    BlockDriverState *bs;
@@ -4320,9 +4481,15 @@ int bdrv_inactivate_all(void)
    BdrvNextIterator it;
    int ret = 0;
    int pass;
+    GSList *aio_ctxs = NULL, *ctx;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
-        aio_context_acquire(bdrv_get_aio_context(bs));
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
+        if (!g_slist_find(aio_ctxs, aio_context)) {
+            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
+            aio_context_acquire(aio_context);
+        }
    }

    /* We do two passes of inactivation. The first pass calls to drivers'
@@ -4340,9 +4507,11 @@ int bdrv_inactivate_all(void)
    }

 out:
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
-        aio_context_release(bdrv_get_aio_context(bs));
+    for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
+        AioContext *aio_context = ctx->data;
+        aio_context_release(aio_context);
    }
+    g_slist_free(aio_ctxs);

    return ret;
 }
@@ -4593,10 +4762,11 @@ void bdrv_img_create(const char *filename, const char *fmt,
        back_flags = flags;
        back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

-        if (backing_fmt) {
        backing_options = qdict_new();
+        if (backing_fmt) {
            qdict_put_str(backing_options, "driver", backing_fmt);
        }
+        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);

        bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
                       &local_err);
@@ -4663,7 +4833,12 @@ out:

 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
 {
-    return bs->aio_context;
+    return bs ? bs->aio_context : qemu_get_aio_context();
+}
+
+AioWait *bdrv_get_aio_wait(BlockDriverState *bs)
+{
+    return bs ? &bs->wait : NULL;
 }

 void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co)
@@ -4746,7 +4921,7 @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
    AioContext *ctx = bdrv_get_aio_context(bs);

    aio_disable_external(ctx);
-    bdrv_parent_drained_begin(bs);
+    bdrv_parent_drained_begin(bs, NULL);
    bdrv_drain(bs); /* ensure there are no in-flight requests */

    while (aio_poll(ctx, false)) {
@@ -4760,7 +4935,7 @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
-    bdrv_parent_drained_end(bs);
+    bdrv_parent_drained_end(bs, NULL);
    aio_enable_external(ctx);
    aio_context_release(new_context);
 }
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -9,8 +9,9 @@ block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += file-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
-block-obj-y += null.o mirror.o commit.o io.o
+block-obj-y += null.o mirror.o commit.o io.o create.o
 block-obj-y += throttle-groups.o
+block-obj-$(CONFIG_LINUX) += nvme.o

 block-obj-y += nbd.o nbd-client.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
@@ -47,3 +48,5 @@ block-obj-$(if $(CONFIG_BZIP2),m,n) += dmg-bz2.o
 dmg-bz2.o-libs     := $(BZIP2_LIBS)
 qcow.o-libs        := -lz
 linux-aio.o-libs   := -laio
+parallels.o-cflags := $(LIBXML2_CFLAGS)
+parallels.o-libs   := $(LIBXML2_LIBS)
--- a/block/backup.c
+++ b/block/backup.c
@@ -40,11 +40,12 @@ typedef struct BackupBlockJob {
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t bytes_read;
-    unsigned long *done_bitmap;
    int64_t cluster_size;
    bool compress;
    NotifierWithReturn before_write;
    QLIST_HEAD(, CowRequest) inflight_reqs;
+
+    HBitmap *copy_bitmap;
 } BackupBlockJob;

 /* See if in-flight requests overlap and wait for them to complete */
@@ -109,10 +110,11 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start += job->cluster_size) {
-        if (test_bit(start / job->cluster_size, job->done_bitmap)) {
+        if (!hbitmap_get(job->copy_bitmap, start / job->cluster_size)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }
+        hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1);

        trace_backup_do_cow_process(job, start);

@@ -132,6 +134,7 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
            if (error_is_read) {
                *error_is_read = true;
            }
+            hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1);
            goto out;
        }

@@ -148,11 +151,10 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
            if (error_is_read) {
                *error_is_read = false;
            }
+            hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1);
            goto out;
        }

-        set_bit(start / job->cluster_size, job->done_bitmap);
-
        /* Publish progress, guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
         */
@@ -260,7 +262,7 @@ void backup_do_checkpoint(BlockJob *job, Error **errp)
    }

    len = DIV_ROUND_UP(backup_job->common.len, backup_job->cluster_size);
-    bitmap_zero(backup_job->done_bitmap, len);
+    hbitmap_set(backup_job->copy_bitmap, 0, len);
 }

 void backup_wait_for_overlapping_requests(BlockJob *job, int64_t offset,
@@ -360,64 +362,68 @@ static bool coroutine_fn yield_and_check(BackupBlockJob *job)

 static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
 {
+    int ret;
    bool error_is_read;
-    int ret = 0;
-    int clusters_per_iter;
-    uint32_t granularity;
-    int64_t offset;
    int64_t cluster;
-    int64_t end;
-    int64_t last_cluster = -1;
-    BdrvDirtyBitmapIter *dbi;
+    HBitmapIter hbi;

-    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
-    clusters_per_iter = MAX((granularity / job->cluster_size), 1);
-    dbi = bdrv_dirty_iter_new(job->sync_bitmap);
-
-    /* Find the next dirty sector(s) */
-    while ((offset = bdrv_dirty_iter_next(dbi)) >= 0) {
-        cluster = offset / job->cluster_size;
-
-        /* Fake progress updates for any clusters we skipped */
-        if (cluster != last_cluster + 1) {
-            job->common.offset += ((cluster - last_cluster - 1) *
-                                   job->cluster_size);
-        }
-
-        for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
+    hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
+    while ((cluster = hbitmap_iter_next(&hbi)) != -1) {
        do {
            if (yield_and_check(job)) {
-                    goto out;
+                return 0;
            }
            ret = backup_do_cow(job, cluster * job->cluster_size,
-                                    job->cluster_size, &error_is_read,
-                                    false);
-                if ((ret < 0) &&
-                    backup_error_action(job, error_is_read, -ret) ==
-                    BLOCK_ERROR_ACTION_REPORT) {
-                    goto out;
+                                job->cluster_size, &error_is_read, false);
+            if (ret < 0 && backup_error_action(job, error_is_read, -ret) ==
+                           BLOCK_ERROR_ACTION_REPORT)
+            {
+                return ret;
            }
        } while (ret < 0);
    }

-        /* If the bitmap granularity is smaller than the backup granularity,
-         * we need to advance the iterator pointer to the next cluster. */
-        if (granularity < job->cluster_size) {
-            bdrv_set_dirty_iter(dbi, cluster * job->cluster_size);
+    return 0;
+}
+
+/* init copy_bitmap from sync_bitmap */
+static void backup_incremental_init_copy_bitmap(BackupBlockJob *job)
+{
+    BdrvDirtyBitmapIter *dbi;
+    int64_t offset;
+    int64_t end = DIV_ROUND_UP(bdrv_dirty_bitmap_size(job->sync_bitmap),
+                               job->cluster_size);
+
+    dbi = bdrv_dirty_iter_new(job->sync_bitmap);
+    while ((offset = bdrv_dirty_iter_next(dbi)) != -1) {
+        int64_t cluster = offset / job->cluster_size;
+        int64_t next_cluster;
+
+        offset += bdrv_dirty_bitmap_granularity(job->sync_bitmap);
+        if (offset >= bdrv_dirty_bitmap_size(job->sync_bitmap)) {
+            hbitmap_set(job->copy_bitmap, cluster, end - cluster);
+            break;
        }

-        last_cluster = cluster - 1;
+        offset = bdrv_dirty_bitmap_next_zero(job->sync_bitmap, offset);
+        if (offset == -1) {
+            hbitmap_set(job->copy_bitmap, cluster, end - cluster);
+            break;
        }

-    /* Play some final catchup with the progress meter */
-    end = DIV_ROUND_UP(job->common.len, job->cluster_size);
-    if (last_cluster + 1 < end) {
-        job->common.offset += ((end - last_cluster - 1) * job->cluster_size);
+        next_cluster = DIV_ROUND_UP(offset, job->cluster_size);
+        hbitmap_set(job->copy_bitmap, cluster, next_cluster - cluster);
+        if (next_cluster >= end) {
+            break;
        }

-out:
+        bdrv_set_dirty_iter(dbi, next_cluster * job->cluster_size);
+    }
+
+    job->common.offset = job->common.len -
+                         hbitmap_count(job->copy_bitmap) * job->cluster_size;
+
    bdrv_dirty_iter_free(dbi);
-    return ret;
 }

 static void coroutine_fn backup_run(void *opaque)
@@ -425,19 +431,27 @@ static void coroutine_fn backup_run(void *opaque)
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
    BlockDriverState *bs = blk_bs(job->common.blk);
-    int64_t offset;
+    int64_t offset, nb_clusters;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

-    job->done_bitmap = bitmap_new(DIV_ROUND_UP(job->common.len,
-                                               job->cluster_size));
+    nb_clusters = DIV_ROUND_UP(job->common.len, job->cluster_size);
+    job->copy_bitmap = hbitmap_alloc(nb_clusters, 0);
+    if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
+        backup_incremental_init_copy_bitmap(job);
+    } else {
+        hbitmap_set(job->copy_bitmap, 0, nb_clusters);
+    }
+

    job->before_write.notify = backup_before_write_notify;
    bdrv_add_before_write_notifier(bs, &job->before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
+        /* All bits are set in copy_bitmap to allow any cluster to be copied.
+         * This does not actually require them to be copied. */
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
@@ -512,7 +526,7 @@ static void coroutine_fn backup_run(void *opaque)
    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
-    g_free(job->done_bitmap);
+    hbitmap_free(job->copy_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -29,7 +29,7 @@
 #include "qemu/config-file.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
-#include "qapi/qmp/qbool.h"
+#include "qemu/option.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
 #include "sysemu/qtest.h"
@@ -627,15 +627,17 @@ static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs,
    return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
 }

-static int64_t coroutine_fn blkdebug_co_get_block_status(
-    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
+static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs,
+                                                 bool want_zero,
+                                                 int64_t offset,
+                                                 int64_t bytes,
+                                                 int64_t *pnum,
+                                                 int64_t *map,
                                                 BlockDriverState **file)
 {
-    assert(QEMU_IS_ALIGNED(sector_num | nb_sectors,
-                           DIV_ROUND_UP(bs->bl.request_alignment,
-                                        BDRV_SECTOR_SIZE)));
-    return bdrv_co_get_block_status_from_file(bs, sector_num, nb_sectors,
-                                              pnum, file);
+    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+    return bdrv_co_block_status_from_file(bs, want_zero, offset, bytes,
+                                          pnum, map, file);
 }

 static void blkdebug_close(BlockDriverState *bs)
@@ -907,7 +909,7 @@ static BlockDriver bdrv_blkdebug = {
    .bdrv_co_flush_to_disk  = blkdebug_co_flush,
    .bdrv_co_pwrite_zeroes  = blkdebug_co_pwrite_zeroes,
    .bdrv_co_pdiscard       = blkdebug_co_pdiscard,
-    .bdrv_co_get_block_status = blkdebug_co_get_block_status,
+    .bdrv_co_block_status   = blkdebug_co_block_status,

    .bdrv_debug_event           = blkdebug_debug_event,
    .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint,
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -129,10 +129,9 @@ static int coroutine_fn blkreplay_co_flush(BlockDriverState *bs)

 static BlockDriver bdrv_blkreplay = {
    .format_name            = "blkreplay",
-    .protocol_name          = "blkreplay",
    .instance_size          = 0,

-    .bdrv_file_open         = blkreplay_open,
+    .bdrv_open              = blkreplay_open,
    .bdrv_close             = blkreplay_close,
    .bdrv_child_perm        = bdrv_filter_default_perms,
    .bdrv_getlength         = blkreplay_getlength,
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -14,6 +14,7 @@
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
 #include "qemu/cutils.h"
+#include "qemu/option.h"

 typedef struct {
    BdrvChild *test_file;
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -17,8 +17,10 @@
 #include "block/throttle-groups.h"
 #include "sysemu/blockdev.h"
 #include "sysemu/sysemu.h"
-#include "qapi-event.h"
+#include "qapi/error.h"
+#include "qapi/qapi-events-block.h"
 #include "qemu/id.h"
+#include "qemu/option.h"
 #include "trace.h"
 #include "migration/misc.h"

@@ -71,6 +73,14 @@ struct BlockBackend {
    int quiesce_counter;
    VMChangeStateEntry *vmsh;
    bool force_allow_inactivate;
+
+    /* Number of in-flight aio requests.  BlockDriverState also counts
+     * in-flight requests but aio requests can exist even when blk->root is
+     * NULL, so we cannot rely on its counter for that case.
+     * Accessed with atomic ops.
+     */
+    unsigned int in_flight;
+    AioWait wait;
 };

 typedef struct BlockBackendAIOCB {
@@ -1140,7 +1150,7 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
 typedef struct BlkRwCo {
    BlockBackend *blk;
    int64_t offset;
-    QEMUIOVector *qiov;
+    void *iobuf;
    int ret;
    BdrvRequestFlags flags;
 } BlkRwCo;
@@ -1148,17 +1158,19 @@ typedef struct BlkRwCo {
 static void blk_read_entry(void *opaque)
 {
    BlkRwCo *rwco = opaque;
+    QEMUIOVector *qiov = rwco->iobuf;

-    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, rwco->qiov->size,
-                              rwco->qiov, rwco->flags);
+    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size,
+                              qiov, rwco->flags);
 }

 static void blk_write_entry(void *opaque)
 {
    BlkRwCo *rwco = opaque;
+    QEMUIOVector *qiov = rwco->iobuf;

-    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, rwco->qiov->size,
-                               rwco->qiov, rwco->flags);
+    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size,
+                               qiov, rwco->flags);
 }

 static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
@@ -1178,7 +1190,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
    rwco = (BlkRwCo) {
        .blk    = blk,
        .offset = offset,
-        .qiov   = &qiov,
+        .iobuf  = &qiov,
        .flags  = flags,
        .ret    = NOT_DONE,
    };
@@ -1223,11 +1235,22 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
    return bdrv_make_zero(blk->root, flags);
 }

+static void blk_inc_in_flight(BlockBackend *blk)
+{
+    atomic_inc(&blk->in_flight);
+}
+
+static void blk_dec_in_flight(BlockBackend *blk)
+{
+    atomic_dec(&blk->in_flight);
+    aio_wait_kick(&blk->wait);
+}
+
 static void error_callback_bh(void *opaque)
 {
    struct BlockBackendAIOCB *acb = opaque;

-    bdrv_dec_in_flight(acb->common.bs);
+    blk_dec_in_flight(acb->blk);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_unref(acb);
 }
@@ -1238,7 +1261,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
 {
    struct BlockBackendAIOCB *acb;

-    bdrv_inc_in_flight(blk_bs(blk));
+    blk_inc_in_flight(blk);
    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
    acb->blk = blk;
    acb->ret = ret;
@@ -1261,7 +1284,7 @@ static const AIOCBInfo blk_aio_em_aiocb_info = {
 static void blk_aio_complete(BlkAioEmAIOCB *acb)
 {
    if (acb->has_returned) {
-        bdrv_dec_in_flight(acb->common.bs);
+        blk_dec_in_flight(acb->rwco.blk);
        acb->common.cb(acb->common.opaque, acb->rwco.ret);
        qemu_aio_unref(acb);
    }
@@ -1275,19 +1298,19 @@ static void blk_aio_complete_bh(void *opaque)
 }

 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
-                                QEMUIOVector *qiov, CoroutineEntry co_entry,
+                                void *iobuf, CoroutineEntry co_entry,
                                BdrvRequestFlags flags,
                                BlockCompletionFunc *cb, void *opaque)
 {
    BlkAioEmAIOCB *acb;
    Coroutine *co;

-    bdrv_inc_in_flight(blk_bs(blk));
+    blk_inc_in_flight(blk);
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
    acb->rwco = (BlkRwCo) {
        .blk    = blk,
        .offset = offset,
-        .qiov   = qiov,
+        .iobuf  = iobuf,
        .flags  = flags,
        .ret    = NOT_DONE,
    };
@@ -1310,10 +1333,11 @@ static void blk_aio_read_entry(void *opaque)
 {
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;
+    QEMUIOVector *qiov = rwco->iobuf;

-    assert(rwco->qiov->size == acb->bytes);
+    assert(qiov->size == acb->bytes);
    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes,
-                              rwco->qiov, rwco->flags);
+                              qiov, rwco->flags);
    blk_aio_complete(acb);
 }

@@ -1321,10 +1345,11 @@ static void blk_aio_write_entry(void *opaque)
 {
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;
+    QEMUIOVector *qiov = rwco->iobuf;

-    assert(!rwco->qiov || rwco->qiov->size == acb->bytes);
+    assert(!qiov || qiov->size == acb->bytes);
    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes,
-                               rwco->qiov, rwco->flags);
+                               qiov, rwco->flags);
    blk_aio_complete(acb);
 }

@@ -1453,8 +1478,10 @@ int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 static void blk_ioctl_entry(void *opaque)
 {
    BlkRwCo *rwco = opaque;
+    QEMUIOVector *qiov = rwco->iobuf;
+
    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
-                             rwco->qiov->iov[0].iov_base);
+                             qiov->iov[0].iov_base);
 }

 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
@@ -1467,24 +1494,15 @@ static void blk_aio_ioctl_entry(void *opaque)
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;

-    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
-                             rwco->qiov->iov[0].iov_base);
+    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, rwco->iobuf);
+
    blk_aio_complete(acb);
 }

 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
                          BlockCompletionFunc *cb, void *opaque)
 {
-    QEMUIOVector qiov;
-    struct iovec iov;
-
-    iov = (struct iovec) {
-        .iov_base = buf,
-        .iov_len = 0,
-    };
-    qemu_iovec_init_external(&qiov, &iov, 1);
-
-    return blk_aio_prwv(blk, req, 0, &qiov, blk_aio_ioctl_entry, 0, cb, opaque);
+    return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
 }

 int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
@@ -1519,14 +1537,41 @@ int blk_flush(BlockBackend *blk)

 void blk_drain(BlockBackend *blk)
 {
-    if (blk_bs(blk)) {
-        bdrv_drain(blk_bs(blk));
+    BlockDriverState *bs = blk_bs(blk);
+
+    if (bs) {
+        bdrv_drained_begin(bs);
+    }
+
+    /* We may have -ENOMEDIUM completions in flight */
+    AIO_WAIT_WHILE(&blk->wait,
+            blk_get_aio_context(blk),
+            atomic_mb_read(&blk->in_flight) > 0);
+
+    if (bs) {
+        bdrv_drained_end(bs);
    }
 }

 void blk_drain_all(void)
 {
-    bdrv_drain_all();
+    BlockBackend *blk = NULL;
+
+    bdrv_drain_all_begin();
+
+    while ((blk = blk_all_next(blk)) != NULL) {
+        AioContext *ctx = blk_get_aio_context(blk);
+
+        aio_context_acquire(ctx);
+
+        /* We may have -ENOMEDIUM completions in flight */
+        AIO_WAIT_WHILE(&blk->wait, ctx,
+                atomic_mb_read(&blk->in_flight) > 0);
+
+        aio_context_release(ctx);
+    }
+
+    bdrv_drain_all_end();
 }

 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
@@ -1567,10 +1612,11 @@ static void send_qmp_error_event(BlockBackend *blk,
                                 bool is_read, int error)
 {
    IoOperationType optype;
+    BlockDriverState *bs = blk_bs(blk);

    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
-    qapi_event_send_block_io_error(blk_name(blk),
-                                   bdrv_get_node_name(blk_bs(blk)), optype,
+    qapi_event_send_block_io_error(blk_name(blk), !!bs,
+                                   bs ? bdrv_get_node_name(bs) : NULL, optype,
                                   action, blk_iostatus_is_enabled(blk),
                                   error == ENOSPC, strerror(error),
                                   &error_abort);
@@ -1900,7 +1946,9 @@ int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc,
 static void blk_pdiscard_entry(void *opaque)
 {
    BlkRwCo *rwco = opaque;
-    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, rwco->qiov->size);
+    QEMUIOVector *qiov = rwco->iobuf;
+
+    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size);
 }

 int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
@@ -2096,3 +2144,13 @@ static void blk_root_drained_end(BdrvChild *child)
        }
    }
 }
+
+void blk_register_buf(BlockBackend *blk, void *host, size_t size)
+{
+    bdrv_register_buf(blk_bs(blk), host, size);
+}
+
+void blk_unregister_buf(BlockBackend *blk, void *host)
+{
+    bdrv_unregister_buf(blk_bs(blk), host);
+}
--- a/block/commit.c
+++ b/block/commit.c
@@ -265,7 +265,7 @@ static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
 static BlockDriver bdrv_commit_top = {
    .format_name                = "commit_top",
    .bdrv_co_preadv             = bdrv_commit_top_preadv,
-    .bdrv_co_get_block_status   = bdrv_co_get_block_status_from_backing,
+    .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
    .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
    .bdrv_close                 = bdrv_commit_top_close,
    .bdrv_child_perm            = bdrv_commit_top_child_perm,
@@ -277,7 +277,6 @@ void commit_start(const char *job_id, BlockDriverState *bs,
                  const char *filter_node_name, Error **errp)
 {
    CommitBlockJob *s;
-    BlockReopenQueue *reopen_queue = NULL;
    int orig_base_flags;
    BlockDriverState *iter;
    BlockDriverState *commit_top_bs = NULL;
@@ -299,12 +298,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
    /* convert base to r/w, if necessary */
    orig_base_flags = bdrv_get_flags(base);
    if (!(orig_base_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
-                                         orig_base_flags | BDRV_O_RDWR);
-    }
-
-    if (reopen_queue) {
-        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
+        bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);
        if (local_err != NULL) {
            error_propagate(errp, local_err);
            goto fail;
--- a/block/create.c
+++ b/block/create.c
@@ -0,0 +1,76 @@
+/*
+ * Block layer code related to image creation
+ *
+ * Copyright (c) 2018 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block_int.h"
+#include "qapi/qapi-commands-block-core.h"
+#include "qapi/error.h"
+
+typedef struct BlockdevCreateCo {
+    BlockDriver *drv;
+    BlockdevCreateOptions *opts;
+    int ret;
+    Error **errp;
+} BlockdevCreateCo;
+
+static void coroutine_fn bdrv_co_create_co_entry(void *opaque)
+{
+    BlockdevCreateCo *cco = opaque;
+    cco->ret = cco->drv->bdrv_co_create(cco->opts, cco->errp);
+}
+
+void qmp_x_blockdev_create(BlockdevCreateOptions *options, Error **errp)
+{
+    const char *fmt = BlockdevDriver_str(options->driver);
+    BlockDriver *drv = bdrv_find_format(fmt);
+    Coroutine *co;
+    BlockdevCreateCo cco;
+
+    /* If the driver is in the schema, we know that it exists. But it may not
+     * be whitelisted. */
+    assert(drv);
+    if (bdrv_uses_whitelist() && !bdrv_is_whitelisted(drv, false)) {
+        error_setg(errp, "Driver is not whitelisted");
+        return;
+    }
+
+    /* Call callback if it exists */
+    if (!drv->bdrv_co_create) {
+        error_setg(errp, "Driver does not support blockdev-create");
+        return;
+    }
+
+    cco = (BlockdevCreateCo) {
+        .drv = drv,
+        .opts = options,
+        .ret = -EINPROGRESS,
+        .errp = errp,
+    };
+
+    co = qemu_coroutine_create(bdrv_co_create_co_entry, &cco);
+    qemu_coroutine_enter(co);
+    while (cco.ret == -EINPROGRESS) {
+        aio_poll(qemu_get_aio_context(), true);
+    }
+}
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -24,9 +24,11 @@
 #include "sysemu/block-backend.h"
 #include "crypto/block.h"
 #include "qapi/opts-visitor.h"
+#include "qapi/qapi-visit-crypto.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qobject-input-visitor.h"
-#include "qapi-visit.h"
 #include "qapi/error.h"
+#include "qemu/option.h"
 #include "block/crypto.h"

 typedef struct BlockCrypto BlockCrypto;
@@ -382,6 +384,12 @@ static void block_crypto_close(BlockDriverState *bs)
    qcrypto_block_free(crypto->block);
 }

+static int block_crypto_reopen_prepare(BDRVReopenState *state,
+                                       BlockReopenQueue *queue, Error **errp)
+{
+    /* nothing needs checking */
+    return 0;
+}

 /*
 * 1 MB bounce buffer gives good performance / memory tradeoff
@@ -554,7 +562,7 @@ static int block_crypto_open_luks(BlockDriverState *bs,
                                     bs, options, flags, errp);
 }

-static int block_crypto_create_luks(const char *filename,
+static int coroutine_fn block_crypto_co_create_opts_luks(const char *filename,
                                                         QemuOpts *opts,
                                                         Error **errp)
 {
@@ -574,7 +582,6 @@ static int block_crypto_get_info_luks(BlockDriverState *bs,
    }

    bdi->unallocated_blocks_are_zero = false;
-    bdi->can_write_zeroes_with_unmap = false;
    bdi->cluster_size = subbdi.cluster_size;

    return 0;
@@ -616,10 +623,11 @@ BlockDriver bdrv_crypto_luks = {
    .bdrv_open          = block_crypto_open_luks,
    .bdrv_close         = block_crypto_close,
    .bdrv_child_perm    = bdrv_format_default_perms,
-    .bdrv_create        = block_crypto_create_luks,
+    .bdrv_co_create_opts = block_crypto_co_create_opts_luks,
    .bdrv_truncate      = block_crypto_truncate,
    .create_opts        = &block_crypto_create_opts_luks,

+    .bdrv_reopen_prepare = block_crypto_reopen_prepare,
    .bdrv_refresh_limits = block_crypto_refresh_limits,
    .bdrv_co_preadv     = block_crypto_co_preadv,
    .bdrv_co_pwritev    = block_crypto_co_pwritev,
--- a/block/curl.c
+++ b/block/curl.c
@@ -21,12 +21,13 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+
 #include "qemu/osdep.h"
 #include "qapi/error.h"
-#include "qemu-common.h"
 #include "qemu/error-report.h"
+#include "qemu/option.h"
 #include "block/block_int.h"
-#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
 #include "crypto/secret.h"
 #include <curl/curl.h>
@@ -89,6 +90,8 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,

 struct BDRVCURLState;

+static bool libcurl_initialized;
+
 typedef struct CURLAIOCB {
    Coroutine *co;
    QEMUIOVector *qiov;
@@ -99,8 +102,6 @@ typedef struct CURLAIOCB {

    size_t start;
    size_t end;
-
-    QSIMPLEQ_ENTRY(CURLAIOCB) next;
 } CURLAIOCB;

 typedef struct CURLSocket {
@@ -136,7 +137,7 @@ typedef struct BDRVCURLState {
    bool accept_range;
    AioContext *aio_context;
    QemuMutex mutex;
-    QSIMPLEQ_HEAD(, CURLAIOCB) free_state_waitq;
+    CoQueue free_state_waitq;
    char *username;
    char *password;
    char *proxyusername;
@@ -536,7 +537,6 @@ static int curl_init_state(BDRVCURLState *s, CURLState *state)
 /* Called with s->mutex held.  */
 static void curl_clean_state(CURLState *s)
 {
-    CURLAIOCB *next;
    int j;
    for (j = 0; j < CURL_NUM_ACB; j++) {
        assert(!s->acb[j]);
@@ -554,13 +554,7 @@ static void curl_clean_state(CURLState *s)

    s->in_use = 0;

-    next = QSIMPLEQ_FIRST(&s->s->free_state_waitq);
-    if (next) {
-        QSIMPLEQ_REMOVE_HEAD(&s->s->free_state_waitq, next);
-        qemu_mutex_unlock(&s->s->mutex);
-        aio_co_wake(next->co);
-        qemu_mutex_lock(&s->s->mutex);
-    }
+    qemu_co_enter_next(&s->s->free_state_waitq, &s->s->mutex);
 }

 static void curl_parse_filename(const char *filename, QDict *options,
@@ -686,14 +680,23 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    double d;
    const char *secretid;
    const char *protocol_delimiter;
+    int ret;

-    static int inited = 0;

    if (flags & BDRV_O_RDWR) {
        error_setg(errp, "curl block device does not support writes");
        return -EROFS;
    }

+    if (!libcurl_initialized) {
+        ret = curl_global_init(CURL_GLOBAL_ALL);
+        if (ret) {
+            error_setg(errp, "libcurl initialization failed with %d", ret);
+            return -EIO;
+        }
+        libcurl_initialized = true;
+    }
+
    qemu_mutex_init(&s->mutex);
    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -772,13 +775,8 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

-    if (!inited) {
-        curl_global_init(CURL_GLOBAL_ALL);
-        inited = 1;
-    }
-
    DPRINTF("CURL: Opening %s\n", file);
-    QSIMPLEQ_INIT(&s->free_state_waitq);
+    qemu_co_queue_init(&s->free_state_waitq);
    s->aio_context = bdrv_get_aio_context(bs);
    s->url = g_strdup(file);
    qemu_mutex_lock(&s->mutex);
@@ -851,6 +849,9 @@ out_noclean:
    qemu_mutex_destroy(&s->mutex);
    g_free(s->cookie);
    g_free(s->url);
+    g_free(s->username);
+    g_free(s->proxyusername);
+    g_free(s->proxypassword);
    qemu_opts_del(opts);
    return -EINVAL;
 }
@@ -879,10 +880,7 @@ static void curl_setup_preadv(BlockDriverState *bs, CURLAIOCB *acb)
        if (state) {
            break;
        }
-        QSIMPLEQ_INSERT_TAIL(&s->free_state_waitq, acb, next);
-        qemu_mutex_unlock(&s->mutex);
-        qemu_coroutine_yield();
-        qemu_mutex_lock(&s->mutex);
+        qemu_co_queue_wait(&s->free_state_waitq, &s->mutex);
    }

    if (curl_init_state(s, state) < 0) {
@@ -949,6 +947,9 @@ static void curl_close(BlockDriverState *bs)

    g_free(s->cookie);
    g_free(s->url);
+    g_free(s->username);
+    g_free(s->proxyusername);
+    g_free(s->proxypassword);
 }

 static int64_t curl_getlength(BlockDriverState *bs)
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -52,8 +52,6 @@ struct BdrvDirtyBitmap {
                                   Such operations must fail and both the image
                                   and this bitmap must remain unchanged while
                                   this flag is set. */
-    bool autoload;              /* For persistent bitmaps: bitmap must be
-                                   autoloaded on image opening */
    bool persistent;            /* bitmap must be saved to owner disk image */
    QLIST_ENTRY(BdrvDirtyBitmap) list;
 };
@@ -104,7 +102,6 @@ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
    g_free(bitmap->name);
    bitmap->name = NULL;
    bitmap->persistent = false;
-    bitmap->autoload = false;
 }

 /* Called with BQL taken.  */
@@ -261,8 +258,6 @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
    bitmap->successor = NULL;
    successor->persistent = bitmap->persistent;
    bitmap->persistent = false;
-    successor->autoload = bitmap->autoload;
-    bitmap->autoload = false;
    bdrv_release_dirty_bitmap(bs, bitmap);

    return successor;
@@ -666,19 +661,6 @@ bool bdrv_has_readonly_bitmaps(BlockDriverState *bs)
    return false;
 }

-/* Called with BQL taken. */
-void bdrv_dirty_bitmap_set_autoload(BdrvDirtyBitmap *bitmap, bool autoload)
-{
-    qemu_mutex_lock(bitmap->mutex);
-    bitmap->autoload = autoload;
-    qemu_mutex_unlock(bitmap->mutex);
-}
-
-bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap)
-{
-    return bitmap->autoload;
-}
-
 /* Called with BQL taken. */
 void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap *bitmap, bool persistent)
 {
@@ -715,3 +697,8 @@ char *bdrv_dirty_bitmap_sha256(const BdrvDirtyBitmap *bitmap, Error **errp)
 {
    return hbitmap_sha256(bitmap->bitmap, errp);
 }
+
+int64_t bdrv_dirty_bitmap_next_zero(BdrvDirtyBitmap *bitmap, uint64_t offset)
+{
+    return hbitmap_next_zero(bitmap->bitmap, offset);
+}
--- a/block/dmg.h
+++ b/block/dmg.h
@@ -26,7 +26,6 @@
 #ifndef BLOCK_DMG_H
 #define BLOCK_DMG_H

-#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include <zlib.h>
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -21,16 +21,19 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/cutils.h"
 #include "qemu/error-report.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/option.h"
 #include "trace.h"
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
 #include "block/raw-aio.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"

 #include "scsi/pr-manager.h"
@@ -546,7 +549,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,

    s->has_discard = true;
    s->has_write_zeroes = true;
-    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
        s->needs_alignment = true;
    }
@@ -596,6 +598,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
    }
 #endif

+    bs->supported_zero_flags = s->discard_zeroes ? BDRV_REQ_MAY_UNMAP : 0;
    ret = 0;
 fail:
    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
@@ -1683,12 +1686,16 @@ static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc,
         * file systems that do not support fallocate(), trying to check if a
         * block is allocated before allocating it, so don't do that here.
         */
+        if (offset != current_length) {
            result = -posix_fallocate(fd, current_length, offset - current_length);
            if (result != 0) {
                /* posix_fallocate() doesn't set errno. */
                error_setg_errno(errp, -result,
                                 "Could not preallocate new data");
            }
+        } else {
+            result = 0;
+        }
        goto out;
 #endif
    case PREALLOC_MODE_FULL:
@@ -1979,33 +1986,25 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
    return (int64_t)st.st_blocks * 512;
 }

-static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
+static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
 {
+    BlockdevCreateOptionsFile *file_opts;
    int fd;
    int result = 0;
-    int64_t total_size = 0;
-    bool nocow = false;
-    PreallocMode prealloc;
-    char *buf = NULL;
-    Error *local_err = NULL;

-    strstart(filename, "file:", &filename);
+    /* Validate options and set default values */
+    assert(options->driver == BLOCKDEV_DRIVER_FILE);
+    file_opts = &options->u.file;

-    /* Read out options */
-    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
-                          BDRV_SECTOR_SIZE);
-    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
-    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
-    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
-                               PREALLOC_MODE_OFF, &local_err);
-    g_free(buf);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        result = -EINVAL;
-        goto out;
+    if (!file_opts->has_nocow) {
+        file_opts->nocow = false;
+    }
+    if (!file_opts->has_preallocation) {
+        file_opts->preallocation = PREALLOC_MODE_OFF;
    }

-    fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
+    /* Create file */
+    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
                   0644);
    if (fd < 0) {
        result = -errno;
@@ -2013,7 +2012,7 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
        goto out;
    }

-    if (nocow) {
+    if (file_opts->nocow) {
 #ifdef __linux__
        /* Set NOCOW flag to solve performance issue on fs like btrfs.
         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
@@ -2028,7 +2027,8 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
 #endif
    }

-    result = raw_regular_truncate(fd, total_size, prealloc, errp);
+    result = raw_regular_truncate(fd, file_opts->size, file_opts->preallocation,
+                                  errp);
    if (result < 0) {
        goto out_close;
    }
@@ -2042,6 +2042,46 @@ out:
    return result;
 }

+static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
+                                           Error **errp)
+{
+    BlockdevCreateOptions options;
+    int64_t total_size = 0;
+    bool nocow = false;
+    PreallocMode prealloc;
+    char *buf = NULL;
+    Error *local_err = NULL;
+
+    /* Skip file: protocol prefix */
+    strstart(filename, "file:", &filename);
+
+    /* Read out options */
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
+    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
+    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
+    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
+                               PREALLOC_MODE_OFF, &local_err);
+    g_free(buf);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return -EINVAL;
+    }
+
+    options = (BlockdevCreateOptions) {
+        .driver     = BLOCKDEV_DRIVER_FILE,
+        .u.file     = {
+            .filename           = (char *) filename,
+            .size               = total_size,
+            .has_preallocation  = true,
+            .preallocation      = prealloc,
+            .has_nocow          = true,
+            .nocow              = nocow,
+        },
+    };
+    return raw_co_create(&options, errp);
+}
+
 /*
 * Find allocation range in @bs around offset @start.
 * May change underlying file descriptor's file offset.
@@ -2128,25 +2168,24 @@ static int find_allocation(BlockDriverState *bs, off_t start,
 }

 /*
- * Returns the allocation status of the specified sectors.
+ * Returns the allocation status of the specified offset.
 *
- * If 'sector_num' is beyond the end of the disk image the return value is 0
- * and 'pnum' is set to 0.
+ * The block layer guarantees 'offset' and 'bytes' are within bounds.
 *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
+ * 'pnum' is set to the number of bytes (including and immediately following
+ * the specified offset) that are known to be in the same
 * allocated/unallocated state.
 *
- * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
- * beyond the end of the disk image it will be clamped.
+ * 'bytes' is the max value 'pnum' should be set to.
 */
-static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
-                                                    int64_t sector_num,
-                                                    int nb_sectors, int *pnum,
+static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
+                                            bool want_zero,
+                                            int64_t offset,
+                                            int64_t bytes, int64_t *pnum,
+                                            int64_t *map,
                                            BlockDriverState **file)
 {
-    off_t start, data = 0, hole = 0;
-    int64_t total_size;
+    off_t data = 0, hole = 0;
    int ret;

    ret = fd_open(bs);
@@ -2154,39 +2193,36 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
        return ret;
    }

-    start = sector_num * BDRV_SECTOR_SIZE;
-    total_size = bdrv_getlength(bs);
-    if (total_size < 0) {
-        return total_size;
-    } else if (start >= total_size) {
-        *pnum = 0;
-        return 0;
-    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
-        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
+    if (!want_zero) {
+        *pnum = bytes;
+        *map = offset;
+        *file = bs;
+        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
    }

-    ret = find_allocation(bs, start, &data, &hole);
+    ret = find_allocation(bs, offset, &data, &hole);
    if (ret == -ENXIO) {
        /* Trailing hole */
-        *pnum = nb_sectors;
+        *pnum = bytes;
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
-        *pnum = nb_sectors;
+        *pnum = bytes;
        ret = BDRV_BLOCK_DATA;
-    } else if (data == start) {
-        /* On a data extent, compute sectors to the end of the extent,
+    } else if (data == offset) {
+        /* On a data extent, compute bytes to the end of the extent,
         * possibly including a partial sector at EOF. */
-        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
+        *pnum = MIN(bytes, hole - offset);
        ret = BDRV_BLOCK_DATA;
    } else {
-        /* On a hole, compute sectors to the beginning of the next extent.  */
-        assert(hole == start);
-        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
+        /* On a hole, compute bytes to the beginning of the next extent.  */
+        assert(hole == offset);
+        *pnum = MIN(bytes, data - offset);
        ret = BDRV_BLOCK_ZERO;
    }
+    *map = offset;
    *file = bs;
-    return ret | BDRV_BLOCK_OFFSET_VALID | start;
+    return ret | BDRV_BLOCK_OFFSET_VALID;
 }

 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
@@ -2220,7 +2256,6 @@ static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    BDRVRawState *s = bs->opaque;

    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
-    bdi->can_write_zeroes_with_unmap = s->discard_zeroes;
    return 0;
 }

@@ -2278,9 +2313,10 @@ BlockDriver bdrv_file = {
    .bdrv_reopen_commit = raw_reopen_commit,
    .bdrv_reopen_abort = raw_reopen_abort,
    .bdrv_close = raw_close,
-    .bdrv_create = raw_create,
+    .bdrv_co_create = raw_co_create,
+    .bdrv_co_create_opts = raw_co_create_opts,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
-    .bdrv_co_get_block_status = raw_co_get_block_status,
+    .bdrv_co_block_status = raw_co_block_status,
    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,

    .bdrv_co_preadv         = raw_co_preadv,
@@ -2682,7 +2718,7 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
    return -ENOTSUP;
 }

-static int hdev_create(const char *filename, QemuOpts *opts,
+static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts,
                                            Error **errp)
 {
    int fd;
@@ -2756,7 +2792,7 @@ static BlockDriver bdrv_host_device = {
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
-    .bdrv_create         = hdev_create,
+    .bdrv_co_create_opts = hdev_co_create_opts,
    .create_opts         = &raw_create_opts,
    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,

@@ -2878,7 +2914,7 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
-    .bdrv_create         = hdev_create,
+    .bdrv_co_create_opts = hdev_co_create_opts,
    .create_opts         = &raw_create_opts,


@@ -3009,7 +3045,7 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
-    .bdrv_create        = hdev_create,
+    .bdrv_co_create_opts = hdev_co_create_opts,
    .create_opts        = &raw_create_opts,

    .bdrv_co_preadv         = raw_co_preadv,
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -21,15 +21,18 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/cutils.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/option.h"
 #include "block/raw-aio.h"
 #include "trace.h"
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
 #include <windows.h>
 #include <winioctl.h>
@@ -550,9 +553,40 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
    return st.st_size;
 }

-static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
+static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
 {
+    BlockdevCreateOptionsFile *file_opts;
    int fd;
+
+    assert(options->driver == BLOCKDEV_DRIVER_FILE);
+    file_opts = &options->u.file;
+
+    if (file_opts->has_preallocation) {
+        error_setg(errp, "Preallocation is not supported on Windows");
+        return -EINVAL;
+    }
+    if (file_opts->has_nocow) {
+        error_setg(errp, "nocow is not supported on Windows");
+        return -EINVAL;
+    }
+
+    fd = qemu_open(file_opts->filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+                   0644);
+    if (fd < 0) {
+        error_setg_errno(errp, errno, "Could not create file");
+        return -EIO;
+    }
+    set_sparse(fd);
+    ftruncate(fd, file_opts->size);
+    qemu_close(fd);
+
+    return 0;
+}
+
+static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
+                                           Error **errp)
+{
+    BlockdevCreateOptions options;
    int64_t total_size = 0;

    strstart(filename, "file:", &filename);
@@ -561,19 +595,18 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);

-    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
-                   0644);
-    if (fd < 0) {
-        error_setg_errno(errp, errno, "Could not create file");
-        return -EIO;
-    }
-    set_sparse(fd);
-    ftruncate(fd, total_size);
-    qemu_close(fd);
-    return 0;
+    options = (BlockdevCreateOptions) {
+        .driver     = BLOCKDEV_DRIVER_FILE,
+        .u.file     = {
+            .filename           = (char *) filename,
+            .size               = total_size,
+            .has_preallocation  = false,
+            .has_nocow          = false,
+        },
+    };
+    return raw_co_create(&options, errp);
 }

-
 static QemuOptsList raw_create_opts = {
    .name = "raw-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
@@ -596,7 +629,7 @@ BlockDriver bdrv_file = {
    .bdrv_file_open     = raw_open,
    .bdrv_refresh_limits = raw_probe_alignment,
    .bdrv_close         = raw_close,
-    .bdrv_create        = raw_create,
+    .bdrv_co_create_opts = raw_co_create_opts,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,

    .bdrv_aio_readv     = raw_aio_readv,
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -7,13 +7,16 @@
 * See the COPYING file in the top-level directory.
 *
 */
+
 #include "qemu/osdep.h"
 #include <glusterfs/api/glfs.h>
 #include "block/block_int.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/uri.h"
 #include "qemu/error-report.h"
+#include "qemu/option.h"
 #include "qemu/cutils.h"

 #define GLUSTER_OPT_FILENAME        "filename"
@@ -652,7 +655,9 @@ out:
    return -errno;
 }

-static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
+/* Converts options given in @filename and the @options QDict into the QAPI
+ * object @gconf. */
+static int qemu_gluster_parse(BlockdevOptionsGluster *gconf,
                              const char *filename,
                              QDict *options, Error **errp)
 {
@@ -665,8 +670,7 @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
                                    "[host[:port]]volume/path[?socket=...]"
                                    "[,file.debug=N]"
                                    "[,file.logfile=/path/filename.log]\n");
-            errno = -ret;
-            return NULL;
+            return ret;
        }
    } else {
        ret = qemu_gluster_parse_json(gconf, options, errp);
@@ -682,10 +686,23 @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
                             "file.server.1.transport=unix,"
                             "file.server.1.socket=/var/run/glusterd.socket ..."
                             "\n");
-            errno = -ret;
-            return NULL;
+            return ret;
+        }
    }

+    return 0;
+}
+
+static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
+                                      const char *filename,
+                                      QDict *options, Error **errp)
+{
+    int ret;
+
+    ret = qemu_gluster_parse(gconf, filename, options, errp);
+    if (ret < 0) {
+        errno = -ret;
+        return NULL;
    }

    return qemu_gluster_glfs_init(gconf, errp);
@@ -962,19 +979,128 @@ static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
 }
 #endif

-static int qemu_gluster_create(const char *filename,
-                               QemuOpts *opts, Error **errp)
+static int qemu_gluster_do_truncate(struct glfs_fd *fd, int64_t offset,
+                                    PreallocMode prealloc, Error **errp)
 {
-    BlockdevOptionsGluster *gconf;
+    int64_t current_length;
+
+    current_length = glfs_lseek(fd, 0, SEEK_END);
+    if (current_length < 0) {
+        error_setg_errno(errp, errno, "Failed to determine current size");
+        return -errno;
+    }
+
+    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
+        error_setg(errp, "Cannot use preallocation for shrinking files");
+        return -ENOTSUP;
+    }
+
+    if (current_length == offset) {
+        return 0;
+    }
+
+    switch (prealloc) {
+#ifdef CONFIG_GLUSTERFS_FALLOCATE
+    case PREALLOC_MODE_FALLOC:
+        if (glfs_fallocate(fd, 0, current_length, offset - current_length)) {
+            error_setg_errno(errp, errno, "Could not preallocate data");
+            return -errno;
+        }
+        break;
+#endif /* CONFIG_GLUSTERFS_FALLOCATE */
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    case PREALLOC_MODE_FULL:
+        if (glfs_ftruncate(fd, offset)) {
+            error_setg_errno(errp, errno, "Could not resize file");
+            return -errno;
+        }
+        if (glfs_zerofill(fd, current_length, offset - current_length)) {
+            error_setg_errno(errp, errno, "Could not zerofill the new area");
+            return -errno;
+        }
+        break;
+#endif /* CONFIG_GLUSTERFS_ZEROFILL */
+    case PREALLOC_MODE_OFF:
+        if (glfs_ftruncate(fd, offset)) {
+            error_setg_errno(errp, errno, "Could not resize file");
+            return -errno;
+        }
+        break;
+    default:
+        error_setg(errp, "Unsupported preallocation mode: %s",
+                   PreallocMode_str(prealloc));
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static int qemu_gluster_co_create(BlockdevCreateOptions *options,
+                                  Error **errp)
+{
+    BlockdevCreateOptionsGluster *opts = &options->u.gluster;
    struct glfs *glfs;
-    struct glfs_fd *fd;
+    struct glfs_fd *fd = NULL;
    int ret = 0;
-    PreallocMode prealloc;
-    int64_t total_size = 0;
+
+    assert(options->driver == BLOCKDEV_DRIVER_GLUSTER);
+
+    glfs = qemu_gluster_glfs_init(opts->location, errp);
+    if (!glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    fd = glfs_creat(glfs, opts->location->path,
+                    O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
+    if (!fd) {
+        ret = -errno;
+        goto out;
+    }
+
+    ret = qemu_gluster_do_truncate(fd, opts->size, opts->preallocation, errp);
+
+out:
+    if (fd) {
+        if (glfs_close(fd) != 0 && ret == 0) {
+            ret = -errno;
+        }
+    }
+    glfs_clear_preopened(glfs);
+    return ret;
+}
+
+static int coroutine_fn qemu_gluster_co_create_opts(const char *filename,
+                                                    QemuOpts *opts,
+                                                    Error **errp)
+{
+    BlockdevCreateOptions *options;
+    BlockdevCreateOptionsGluster *gopts;
+    BlockdevOptionsGluster *gconf;
    char *tmp = NULL;
    Error *local_err = NULL;
+    int ret;
+
+    options = g_new0(BlockdevCreateOptions, 1);
+    options->driver = BLOCKDEV_DRIVER_GLUSTER;
+    gopts = &options->u.gluster;

    gconf = g_new0(BlockdevOptionsGluster, 1);
+    gopts->location = gconf;
+
+    gopts->size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                           BDRV_SECTOR_SIZE);
+
+    tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
+    gopts->preallocation = qapi_enum_parse(&PreallocMode_lookup, tmp,
+                                           PREALLOC_MODE_OFF, &local_err);
+    g_free(tmp);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
    gconf->debug = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG,
                                           GLUSTER_DEBUG_DEFAULT);
    if (gconf->debug < 0) {
@@ -990,73 +1116,19 @@ static int qemu_gluster_create(const char *filename,
    }
    gconf->has_logfile = true;

-    glfs = qemu_gluster_init(gconf, filename, NULL, errp);
-    if (!glfs) {
-        ret = -errno;
-        goto out;
+    ret = qemu_gluster_parse(gconf, filename, NULL, errp);
+    if (ret < 0) {
+        goto fail;
    }

-    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
-                          BDRV_SECTOR_SIZE);
-
-    tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
-    prealloc = qapi_enum_parse(&PreallocMode_lookup, tmp, PREALLOC_MODE_OFF,
-                               &local_err);
-    g_free(tmp);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        ret = -EINVAL;
-        goto out;
+    ret = qemu_gluster_co_create(options, errp);
+    if (ret < 0) {
+        goto fail;
    }

-    fd = glfs_creat(glfs, gconf->path,
-                    O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
-    if (!fd) {
-        ret = -errno;
-        goto out;
-    }
-
-    switch (prealloc) {
-#ifdef CONFIG_GLUSTERFS_FALLOCATE
-    case PREALLOC_MODE_FALLOC:
-        if (glfs_fallocate(fd, 0, 0, total_size)) {
-            error_setg(errp, "Could not preallocate data for the new file");
-            ret = -errno;
-        }
-        break;
-#endif /* CONFIG_GLUSTERFS_FALLOCATE */
-#ifdef CONFIG_GLUSTERFS_ZEROFILL
-    case PREALLOC_MODE_FULL:
-        if (!glfs_ftruncate(fd, total_size)) {
-            if (glfs_zerofill(fd, 0, total_size)) {
-                error_setg(errp, "Could not zerofill the new file");
-                ret = -errno;
-            }
-        } else {
-            error_setg(errp, "Could not resize file");
-            ret = -errno;
-        }
-        break;
-#endif /* CONFIG_GLUSTERFS_ZEROFILL */
-    case PREALLOC_MODE_OFF:
-        if (glfs_ftruncate(fd, total_size) != 0) {
-            ret = -errno;
-            error_setg(errp, "Could not resize file");
-        }
-        break;
-    default:
-        ret = -EINVAL;
-        error_setg(errp, "Unsupported preallocation mode: %s",
-                   PreallocMode_str(prealloc));
-        break;
-    }
-
-    if (glfs_close(fd) != 0) {
-        ret = -errno;
-    }
-out:
-    qapi_free_BlockdevOptionsGluster(gconf);
-    glfs_clear_preopened(glfs);
+    ret = 0;
+fail:
+    qapi_free_BlockdevCreateOptions(options);
    return ret;
 }

@@ -1094,23 +1166,8 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
 static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset,
                                 PreallocMode prealloc, Error **errp)
 {
-    int ret;
    BDRVGlusterState *s = bs->opaque;
-
-    if (prealloc != PREALLOC_MODE_OFF) {
-        error_setg(errp, "Unsupported preallocation mode '%s'",
-                   PreallocMode_str(prealloc));
-        return -ENOTSUP;
-    }
-
-    ret = glfs_ftruncate(s->fd, offset);
-    if (ret < 0) {
-        ret = -errno;
-        error_setg_errno(errp, -ret, "Failed to truncate file");
-        return ret;
-    }
-
-    return 0;
+    return qemu_gluster_do_truncate(s->fd, offset, prealloc, errp);
 }

 static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
@@ -1349,68 +1406,66 @@ exit:
 }

 /*
- * Returns the allocation status of the specified sectors.
+ * Returns the allocation status of the specified offset.
 *
- * If 'sector_num' is beyond the end of the disk image the return value is 0
- * and 'pnum' is set to 0.
+ * The block layer guarantees 'offset' and 'bytes' are within bounds.
 *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
+ * 'pnum' is set to the number of bytes (including and immediately following
+ * the specified offset) that are known to be in the same
 * allocated/unallocated state.
 *
- * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
- * beyond the end of the disk image it will be clamped.
+ * 'bytes' is the max value 'pnum' should be set to.
 *
- * (Based on raw_co_get_block_status() from file-posix.c.)
+ * (Based on raw_co_block_status() from file-posix.c.)
 */
-static int64_t coroutine_fn qemu_gluster_co_get_block_status(
-        BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
+static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
+                                                     bool want_zero,
+                                                     int64_t offset,
+                                                     int64_t bytes,
+                                                     int64_t *pnum,
+                                                     int64_t *map,
                                                     BlockDriverState **file)
 {
    BDRVGlusterState *s = bs->opaque;
-    off_t start, data = 0, hole = 0;
-    int64_t total_size;
+    off_t data = 0, hole = 0;
    int ret = -EINVAL;

    if (!s->fd) {
        return ret;
    }

-    start = sector_num * BDRV_SECTOR_SIZE;
-    total_size = bdrv_getlength(bs);
-    if (total_size < 0) {
-        return total_size;
-    } else if (start >= total_size) {
-        *pnum = 0;
-        return 0;
-    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
-        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
+    if (!want_zero) {
+        *pnum = bytes;
+        *map = offset;
+        *file = bs;
+        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
    }

-    ret = find_allocation(bs, start, &data, &hole);
+    ret = find_allocation(bs, offset, &data, &hole);
    if (ret == -ENXIO) {
        /* Trailing hole */
-        *pnum = nb_sectors;
+        *pnum = bytes;
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
-        *pnum = nb_sectors;
+        *pnum = bytes;
        ret = BDRV_BLOCK_DATA;
-    } else if (data == start) {
-        /* On a data extent, compute sectors to the end of the extent,
+    } else if (data == offset) {
+        /* On a data extent, compute bytes to the end of the extent,
         * possibly including a partial sector at EOF. */
-        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
+        *pnum = MIN(bytes, hole - offset);
        ret = BDRV_BLOCK_DATA;
    } else {
-        /* On a hole, compute sectors to the beginning of the next extent.  */
-        assert(hole == start);
-        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
+        /* On a hole, compute bytes to the beginning of the next extent.  */
+        assert(hole == offset);
+        *pnum = MIN(bytes, data - offset);
        ret = BDRV_BLOCK_ZERO;
    }

+    *map = offset;
    *file = bs;

-    return ret | BDRV_BLOCK_OFFSET_VALID | start;
+    return ret | BDRV_BLOCK_OFFSET_VALID;
 }


@@ -1424,7 +1479,8 @@ static BlockDriver bdrv_gluster = {
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
-    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_co_create               = qemu_gluster_co_create,
+    .bdrv_co_create_opts          = qemu_gluster_co_create_opts,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
@@ -1438,7 +1494,7 @@ static BlockDriver bdrv_gluster = {
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
-    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
+    .bdrv_co_block_status         = qemu_gluster_co_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

@@ -1452,7 +1508,8 @@ static BlockDriver bdrv_gluster_tcp = {
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
-    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_co_create               = qemu_gluster_co_create,
+    .bdrv_co_create_opts          = qemu_gluster_co_create_opts,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
@@ -1466,7 +1523,7 @@ static BlockDriver bdrv_gluster_tcp = {
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
-    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
+    .bdrv_co_block_status         = qemu_gluster_co_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

@@ -1480,7 +1537,8 @@ static BlockDriver bdrv_gluster_unix = {
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
-    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_co_create               = qemu_gluster_co_create,
+    .bdrv_co_create_opts          = qemu_gluster_co_create_opts,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
@@ -1494,7 +1552,7 @@ static BlockDriver bdrv_gluster_unix = {
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
-    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
+    .bdrv_co_block_status         = qemu_gluster_co_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

@@ -1514,7 +1572,8 @@ static BlockDriver bdrv_gluster_rdma = {
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
-    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_co_create               = qemu_gluster_co_create,
+    .bdrv_co_create_opts          = qemu_gluster_co_create_opts,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
@@ -1528,7 +1587,7 @@ static BlockDriver bdrv_gluster_rdma = {
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
-    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
+    .bdrv_co_block_status         = qemu_gluster_co_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

--- a/block/io.c
+++ b/block/io.c
@@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include "trace.h"
 #include "sysemu/block-backend.h"
+#include "block/aio-wait.h"
 #include "block/blockjob.h"
 #include "block/blockjob_int.h"
 #include "block/block_int.h"
@@ -40,22 +41,28 @@
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

-void bdrv_parent_drained_begin(BlockDriverState *bs)
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
 {
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
+        if (c == ignore) {
+            continue;
+        }
        if (c->role->drained_begin) {
            c->role->drained_begin(c);
        }
    }
 }

-void bdrv_parent_drained_end(BlockDriverState *bs)
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
 {
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
+        if (c == ignore) {
+            continue;
+        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
@@ -134,29 +141,13 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
    assert(old >= 1);
 }

-/* Check if any requests are in-flight (including throttled requests) */
-bool bdrv_requests_pending(BlockDriverState *bs)
-{
-    BdrvChild *child;
-
-    if (atomic_read(&bs->in_flight)) {
-        return true;
-    }
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        if (bdrv_requests_pending(child->bs)) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
 typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
+    bool recursive;
+    BdrvChild *parent;
 } BdrvCoDrainData;

 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
@@ -175,8 +166,10 @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
    bdrv_wakeup(bs);
 }

-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
+/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
 {
+    BdrvChild *child, *tmp;
    BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
@@ -187,16 +180,19 @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
    bdrv_coroutine_enter(bs, data.co);
    BDRV_POLL_WHILE(bs, !data.done);
+
+    if (recursive) {
+        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+            bdrv_drain_invoke(child->bs, begin, true);
+        }
+    }
 }

-static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
    BdrvChild *child, *tmp;
    bool waited;

-    /* Ensure any pending metadata writes are submitted to bs->file.  */
-    bdrv_drain_invoke(bs, begin);
-
    /* Wait for drained requests to finish */
    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

@@ -215,7 +211,7 @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
             */
            bdrv_ref(bs);
        }
-        waited |= bdrv_drain_recurse(bs, begin);
+        waited |= bdrv_drain_recurse(bs);
        if (in_main_loop) {
            bdrv_unref(bs);
        }
@@ -224,6 +220,11 @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
    return waited;
 }

+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                                  BdrvChild *parent);
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                                BdrvChild *parent);
+
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
    BdrvCoDrainData *data = opaque;
@@ -232,9 +233,9 @@ static void bdrv_co_drain_bh_cb(void *opaque)

    bdrv_dec_in_flight(bs);
    if (data->begin) {
-        bdrv_drained_begin(bs);
+        bdrv_do_drained_begin(bs, data->recursive, data->parent);
    } else {
-        bdrv_drained_end(bs);
+        bdrv_do_drained_end(bs, data->recursive, data->parent);
    }

    data->done = true;
@@ -242,7 +243,8 @@ static void bdrv_co_drain_bh_cb(void *opaque)
 }

 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
-                                                bool begin)
+                                                bool begin, bool recursive,
+                                                BdrvChild *parent)
 {
    BdrvCoDrainData data;

@@ -256,6 +258,8 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
        .bs = bs,
        .done = false,
        .begin = begin,
+        .recursive = recursive,
+        .parent = parent,
    };
    bdrv_inc_in_flight(bs);
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
@@ -267,35 +271,97 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
    assert(data.done);
 }

-void bdrv_drained_begin(BlockDriverState *bs)
+void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                           BdrvChild *parent)
 {
+    BdrvChild *child, *next;
+
    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true);
+        bdrv_co_yield_to_drain(bs, true, recursive, parent);
        return;
    }

+    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
-        bdrv_parent_drained_begin(bs);
    }

-    bdrv_drain_recurse(bs, true);
+    bdrv_parent_drained_begin(bs, parent);
+    bdrv_drain_invoke(bs, true, false);
+    bdrv_drain_recurse(bs);
+
+    if (recursive) {
+        bs->recursive_quiesce_counter++;
+        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
+            bdrv_do_drained_begin(child->bs, true, child);
+        }
+    }
+}
+
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+    bdrv_do_drained_begin(bs, false, NULL);
+}
+
+void bdrv_subtree_drained_begin(BlockDriverState *bs)
+{
+    bdrv_do_drained_begin(bs, true, NULL);
+}
+
+void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                         BdrvChild *parent)
+{
+    BdrvChild *child, *next;
+    int old_quiesce_counter;
+
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(bs, false, recursive, parent);
+        return;
+    }
+    assert(bs->quiesce_counter > 0);
+    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
+
+    /* Re-enable things in child-to-parent order */
+    bdrv_drain_invoke(bs, false, false);
+    bdrv_parent_drained_end(bs, parent);
+    if (old_quiesce_counter == 1) {
+        aio_enable_external(bdrv_get_aio_context(bs));
+    }
+
+    if (recursive) {
+        bs->recursive_quiesce_counter--;
+        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
+            bdrv_do_drained_end(child->bs, true, child);
+        }
+    }
 }

 void bdrv_drained_end(BlockDriverState *bs)
 {
-    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false);
-        return;
-    }
-    assert(bs->quiesce_counter > 0);
-    if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
-        return;
-    }
+    bdrv_do_drained_end(bs, false, NULL);
+}

-    bdrv_parent_drained_end(bs);
-    bdrv_drain_recurse(bs, false);
-    aio_enable_external(bdrv_get_aio_context(bs));
+void bdrv_subtree_drained_end(BlockDriverState *bs)
+{
+    bdrv_do_drained_end(bs, true, NULL);
+}
+
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
+{
+    int i;
+
+    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
+        bdrv_do_drained_begin(child->bs, true, child);
+    }
+}
+
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
+{
+    int i;
+
+    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
+        bdrv_do_drained_end(child->bs, true, child);
+    }
 }

 /*
@@ -342,14 +408,20 @@ void bdrv_drain_all_begin(void)
    BdrvNextIterator it;
    GSList *aio_ctxs = NULL, *ctx;

-    block_job_pause_all();
+    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
+     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
+     * nodes in several different AioContexts, so make sure we're in the main
+     * context. */
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

+        /* Stop things in parent-to-child order */
        aio_context_acquire(aio_context);
-        bdrv_parent_drained_begin(bs);
        aio_disable_external(aio_context);
+        bdrv_parent_drained_begin(bs, NULL);
+        bdrv_drain_invoke(bs, true, true);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -372,7 +444,7 @@ void bdrv_drain_all_begin(void)
            aio_context_acquire(aio_context);
            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                if (aio_context == bdrv_get_aio_context(bs)) {
-                    waited |= bdrv_drain_recurse(bs, true);
+                    waited |= bdrv_drain_recurse(bs);
                }
            }
            aio_context_release(aio_context);
@@ -390,14 +462,13 @@ void bdrv_drain_all_end(void)
    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

+        /* Re-enable things in child-to-parent order */
        aio_context_acquire(aio_context);
+        bdrv_drain_invoke(bs, false, true);
+        bdrv_parent_drained_end(bs, NULL);
        aio_enable_external(aio_context);
-        bdrv_parent_drained_end(bs);
-        bdrv_drain_recurse(bs, false);
        aio_context_release(aio_context);
    }
-
-    block_job_resume_all();
 }

 void bdrv_drain_all(void)
@@ -517,16 +588,9 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
    atomic_inc(&bs->in_flight);
 }

-static void dummy_bh_cb(void *opaque)
-{
-}
-
 void bdrv_wakeup(BlockDriverState *bs)
 {
-    /* The barrier (or an atomic op) is in the caller.  */
-    if (atomic_read(&bs->wakeup)) {
-        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
-    }
+    aio_wait_kick(bdrv_get_aio_wait(bs));
 }

 void bdrv_dec_in_flight(BlockDriverState *bs)
@@ -1631,7 +1695,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

-    if (!qiov) {
+    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }
@@ -1798,30 +1862,34 @@ typedef struct BdrvCoBlockStatusData {
    bool done;
 } BdrvCoBlockStatusData;

-int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
-                                                        int64_t sector_num,
-                                                        int nb_sectors,
-                                                        int *pnum,
+int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
+                                                bool want_zero,
+                                                int64_t offset,
+                                                int64_t bytes,
+                                                int64_t *pnum,
+                                                int64_t *map,
                                                BlockDriverState **file)
 {
    assert(bs->file && bs->file->bs);
-    *pnum = nb_sectors;
+    *pnum = bytes;
+    *map = offset;
    *file = bs->file->bs;
-    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
-           (sector_num << BDRV_SECTOR_BITS);
+    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
 }

-int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
-                                                           int64_t sector_num,
-                                                           int nb_sectors,
-                                                           int *pnum,
+int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
+                                                   bool want_zero,
+                                                   int64_t offset,
+                                                   int64_t bytes,
+                                                   int64_t *pnum,
+                                                   int64_t *map,
                                                   BlockDriverState **file)
 {
    assert(bs->backing && bs->backing->bs);
-    *pnum = nb_sectors;
+    *pnum = bytes;
+    *map = offset;
    *file = bs->backing->bs;
-    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
-           (sector_num << BDRV_SECTOR_BITS);
+    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
 }

 /*
@@ -1829,10 +1897,10 @@ int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
- * If 'want_zero' is true, the caller is querying for mapping purposes,
- * and the result should include BDRV_BLOCK_OFFSET_VALID and
- * BDRV_BLOCK_ZERO where possible; otherwise, the result may omit those
- * bits particularly if it allows for a larger value in 'pnum'.
+ * If 'want_zero' is true, the caller is querying for mapping
+ * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
+ * _ZERO where possible; otherwise, the result favors larger 'pnum',
+ * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
@@ -1889,7 +1957,7 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
-    if (!bs->drv->bdrv_co_get_block_status) {
+    if (!bs->drv->bdrv_co_block_status) {
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
@@ -1906,44 +1974,24 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
-    /* TODO: until we have a byte-based driver callback, we also have to
-     * round out to sectors, even if that is bigger than request_alignment */
-    align = MAX(bs->bl.request_alignment, BDRV_SECTOR_SIZE);
+    align = bs->bl.request_alignment;
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

-    {
-        int count; /* sectors */
-        int64_t longret;
-
-        assert(QEMU_IS_ALIGNED(aligned_offset | aligned_bytes,
-                               BDRV_SECTOR_SIZE));
-        /*
-         * The contract allows us to return pnum smaller than bytes, even
-         * if the next query would see the same status; we truncate the
-         * request to avoid overflowing the driver's 32-bit interface.
-         */
-        longret = bs->drv->bdrv_co_get_block_status(
-            bs, aligned_offset >> BDRV_SECTOR_BITS,
-            MIN(INT_MAX, aligned_bytes) >> BDRV_SECTOR_BITS, &count,
+    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
+                                        aligned_bytes, pnum, &local_map,
                                        &local_file);
-        if (longret < 0) {
-            assert(INT_MIN <= longret);
-            ret = longret;
+    if (ret < 0) {
+        *pnum = 0;
        goto out;
    }
-        if (longret & BDRV_BLOCK_OFFSET_VALID) {
-            local_map = longret & BDRV_BLOCK_OFFSET_MASK;
-        }
-        ret = longret & ~BDRV_BLOCK_OFFSET_MASK;
-        *pnum = count * BDRV_SECTOR_SIZE;
-    }

    /*
-     * The driver's result must be a multiple of request_alignment.
+     * The driver's result must be a non-zero multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
     */
-    assert(QEMU_IS_ALIGNED(*pnum, align) && align > offset - aligned_offset);
+    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
+           align > offset - aligned_offset);
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
@@ -2755,3 +2803,27 @@ void bdrv_io_unplug(BlockDriverState *bs)
        bdrv_io_unplug(child->bs);
    }
 }
+
+void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
+{
+    BdrvChild *child;
+
+    if (bs->drv && bs->drv->bdrv_register_buf) {
+        bs->drv->bdrv_register_buf(bs, host, size);
+    }
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_register_buf(child->bs, host, size);
+    }
+}
+
+void bdrv_unregister_buf(BlockDriverState *bs, void *host)
+{
+    BdrvChild *child;
+
+    if (bs->drv && bs->drv->bdrv_unregister_buf) {
+        bs->drv->bdrv_unregister_buf(bs, host);
+    }
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_unregister_buf(child->bs, host);
+    }
+}
--- a/block/iscsi-opts.c
+++ b/block/iscsi-opts.c
@@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "qemu/config-file.h"
+#include "qemu/option.h"

 static QemuOptsList qemu_iscsi_opts = {
    .name = "iscsi",
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,7 +2,7 @@
 * QEMU Block driver for iSCSI images
 *
 * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
- * Copyright (c) 2012-2016 Peter Lieven <pl@kamp.de>
+ * Copyright (c) 2012-2017 Peter Lieven <pl@kamp.de>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -28,7 +28,6 @@
 #include <poll.h>
 #include <math.h>
 #include <arpa/inet.h>
-#include "qemu-common.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
 #include "qemu/bitops.h"
@@ -36,8 +35,11 @@
 #include "block/block_int.h"
 #include "scsi/constants.h"
 #include "qemu/iov.h"
+#include "qemu/option.h"
 #include "qemu/uuid.h"
-#include "qmp-commands.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-misc.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
 #include "crypto/secret.h"
 #include "scsi/utils.h"
@@ -84,7 +86,7 @@ typedef struct IscsiLun {
    unsigned long *allocmap;
    unsigned long *allocmap_valid;
    long allocmap_size;
-    int cluster_sectors;
+    int cluster_size;
    bool use_16_for_rw;
    bool write_protected;
    bool lbpme;
@@ -104,6 +106,7 @@ typedef struct IscsiTask {
    IscsiLun *iscsilun;
    QEMUTimer retry_timer;
    int err_code;
+    char *err_str;
 } IscsiTask;

 typedef struct IscsiAIOCB {
@@ -265,7 +268,7 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
            }
        }
        iTask->err_code = iscsi_translate_sense(&task->sense);
-        error_report("iSCSI Failure: %s", iscsi_get_error(iscsi));
+        iTask->err_str = g_strdup(iscsi_get_error(iscsi));
    }

 out:
@@ -427,9 +430,10 @@ static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
 {
    iscsi_allocmap_free(iscsilun);

+    assert(iscsilun->cluster_size);
    iscsilun->allocmap_size =
-        DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
-                     iscsilun->cluster_sectors);
+        DIV_ROUND_UP(iscsilun->num_blocks * iscsilun->block_size,
+                     iscsilun->cluster_size);

    iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
    if (!iscsilun->allocmap) {
@@ -437,7 +441,7 @@ static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
    }

    if (open_flags & BDRV_O_NOCACHE) {
-        /* in case that cache.direct = on all allocmap entries are
+        /* when cache.direct = on all allocmap entries are
         * treated as invalid to force a relookup of the block
         * status on every read request */
        return 0;
@@ -454,8 +458,8 @@ static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
 }

 static void
-iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
-                      int nb_sectors, bool allocated, bool valid)
+iscsi_allocmap_update(IscsiLun *iscsilun, int64_t offset,
+                      int64_t bytes, bool allocated, bool valid)
 {
    int64_t cl_num_expanded, nb_cls_expanded, cl_num_shrunk, nb_cls_shrunk;

@@ -463,13 +467,13 @@ iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
        return;
    }
    /* expand to entirely contain all affected clusters */
-    cl_num_expanded = sector_num / iscsilun->cluster_sectors;
-    nb_cls_expanded = DIV_ROUND_UP(sector_num + nb_sectors,
-                                   iscsilun->cluster_sectors) - cl_num_expanded;
+    assert(iscsilun->cluster_size);
+    cl_num_expanded = offset / iscsilun->cluster_size;
+    nb_cls_expanded = DIV_ROUND_UP(offset + bytes,
+                                   iscsilun->cluster_size) - cl_num_expanded;
    /* shrink to touch only completely contained clusters */
-    cl_num_shrunk = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
-    nb_cls_shrunk = (sector_num + nb_sectors) / iscsilun->cluster_sectors
-                      - cl_num_shrunk;
+    cl_num_shrunk = DIV_ROUND_UP(offset, iscsilun->cluster_size);
+    nb_cls_shrunk = (offset + bytes) / iscsilun->cluster_size - cl_num_shrunk;
    if (allocated) {
        bitmap_set(iscsilun->allocmap, cl_num_expanded, nb_cls_expanded);
    } else {
@@ -492,26 +496,26 @@ iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
 }

 static void
-iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t sector_num,
-                             int nb_sectors)
+iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t offset,
+                             int64_t bytes)
 {
-    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, true, true);
+    iscsi_allocmap_update(iscsilun, offset, bytes, true, true);
 }

 static void
-iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t sector_num,
-                               int nb_sectors)
+iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t offset,
+                               int64_t bytes)
 {
    /* Note: if cache.direct=on the fifth argument to iscsi_allocmap_update
     * is ignored, so this will in effect be an iscsi_allocmap_set_invalid.
     */
-    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, true);
+    iscsi_allocmap_update(iscsilun, offset, bytes, false, true);
 }

-static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t sector_num,
-                                       int nb_sectors)
+static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t offset,
+                                       int64_t bytes)
 {
-    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, false);
+    iscsi_allocmap_update(iscsilun, offset, bytes, false, false);
 }

 static void iscsi_allocmap_invalidate(IscsiLun *iscsilun)
@@ -525,28 +529,30 @@ static void iscsi_allocmap_invalidate(IscsiLun *iscsilun)
 }

 static inline bool
-iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num,
-                            int nb_sectors)
+iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t offset,
+                            int64_t bytes)
 {
    unsigned long size;
    if (iscsilun->allocmap == NULL) {
        return true;
    }
-    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
+    assert(iscsilun->cluster_size);
+    size = DIV_ROUND_UP(offset + bytes, iscsilun->cluster_size);
    return !(find_next_bit(iscsilun->allocmap, size,
-                           sector_num / iscsilun->cluster_sectors) == size);
+                           offset / iscsilun->cluster_size) == size);
 }

 static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
-                                           int64_t sector_num, int nb_sectors)
+                                           int64_t offset, int64_t bytes)
 {
    unsigned long size;
    if (iscsilun->allocmap_valid == NULL) {
        return false;
    }
-    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
+    assert(iscsilun->cluster_size);
+    size = DIV_ROUND_UP(offset + bytes, iscsilun->cluster_size);
    return (find_next_zero_bit(iscsilun->allocmap_valid, size,
-                               sector_num / iscsilun->cluster_sectors) == size);
+                               offset / iscsilun->cluster_size) == size);
 }

 static int coroutine_fn
@@ -628,53 +634,60 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
+        iscsi_allocmap_set_invalid(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                   nb_sectors * BDRV_SECTOR_SIZE);
+        error_report("iSCSI WRITE10/16 failed at lba %" PRIu64 ": %s", lba,
+                     iTask.err_str);
        r = iTask.err_code;
        goto out_unlock;
    }

-    iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors);
+    iscsi_allocmap_set_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                 nb_sectors * BDRV_SECTOR_SIZE);

 out_unlock:
    qemu_mutex_unlock(&iscsilun->mutex);
+    g_free(iTask.err_str);
    return r;
 }



-static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
-                                                  int64_t sector_num,
-                                                  int nb_sectors, int *pnum,
+static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs,
+                                              bool want_zero, int64_t offset,
+                                              int64_t bytes, int64_t *pnum,
+                                              int64_t *map,
                                              BlockDriverState **file)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct scsi_get_lba_status *lbas = NULL;
    struct scsi_lba_status_descriptor *lbasd = NULL;
    struct IscsiTask iTask;
-    int64_t ret;
+    uint64_t lba;
+    int ret;

    iscsi_co_init_iscsitask(iscsilun, &iTask);

-    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
-        ret = -EINVAL;
-        goto out;
-    }
+    assert(QEMU_IS_ALIGNED(offset | bytes, iscsilun->block_size));

    /* default to all sectors allocated */
-    ret = BDRV_BLOCK_DATA;
-    ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID;
-    *pnum = nb_sectors;
+    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+    if (map) {
+        *map = offset;
+    }
+    *pnum = bytes;

    /* LUN does not support logical block provisioning */
    if (!iscsilun->lbpme) {
        goto out;
    }

+    lba = offset / iscsilun->block_size;
+
    qemu_mutex_lock(&iscsilun->mutex);
 retry:
    if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun,
-                                  sector_qemu2lun(sector_num, iscsilun),
-                                  8 + 16, iscsi_co_generic_cb,
+                                  lba, 8 + 16, iscsi_co_generic_cb,
                                  &iTask) == NULL) {
        ret = -ENOMEM;
        goto out_unlock;
@@ -701,6 +714,8 @@ retry:
         * because the device is busy or the cmd is not
         * supported) we pretend all blocks are allocated
         * for backwards compatibility */
+        error_report("iSCSI GET_LBA_STATUS failed at lba %" PRIu64 ": %s",
+                     lba, iTask.err_str);
        goto out_unlock;
    }

@@ -712,12 +727,12 @@ retry:

    lbasd = &lbas->descriptors[0];

-    if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) {
+    if (lba != lbasd->lba) {
        ret = -EIO;
        goto out_unlock;
    }

-    *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun);
+    *pnum = lbasd->num_blocks * iscsilun->block_size;

    if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED ||
        lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) {
@@ -728,21 +743,22 @@ retry:
    }

    if (ret & BDRV_BLOCK_ZERO) {
-        iscsi_allocmap_set_unallocated(iscsilun, sector_num, *pnum);
+        iscsi_allocmap_set_unallocated(iscsilun, offset, *pnum);
    } else {
-        iscsi_allocmap_set_allocated(iscsilun, sector_num, *pnum);
+        iscsi_allocmap_set_allocated(iscsilun, offset, *pnum);
    }

-    if (*pnum > nb_sectors) {
-        *pnum = nb_sectors;
+    if (*pnum > bytes) {
+        *pnum = bytes;
    }
 out_unlock:
    qemu_mutex_unlock(&iscsilun->mutex);
+    g_free(iTask.err_str);
 out:
    if (iTask.task != NULL) {
        scsi_free_scsi_task(iTask.task);
    }
-    if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
+    if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID && file) {
        *file = bs;
    }
    return ret;
@@ -756,6 +772,7 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
    struct IscsiTask iTask;
    uint64_t lba;
    uint32_t num_sectors;
+    int r = 0;

    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        return -EINVAL;
@@ -768,29 +785,37 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
    /* if cache.direct is off and we have a valid entry in our allocation map
     * we can skip checking the block status and directly return zeroes if
     * the request falls within an unallocated area */
-    if (iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
-        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
+    if (iscsi_allocmap_is_valid(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                nb_sectors * BDRV_SECTOR_SIZE) &&
+        !iscsi_allocmap_is_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                     nb_sectors * BDRV_SECTOR_SIZE)) {
            qemu_iovec_memset(iov, 0, 0x00, iov->size);
            return 0;
    }

    if (nb_sectors >= ISCSI_CHECKALLOC_THRES &&
-        !iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
-        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
-        int pnum;
-        BlockDriverState *file;
+        !iscsi_allocmap_is_valid(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                 nb_sectors * BDRV_SECTOR_SIZE) &&
+        !iscsi_allocmap_is_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                     nb_sectors * BDRV_SECTOR_SIZE)) {
+        int64_t pnum;
        /* check the block status from the beginning of the cluster
         * containing the start sector */
-        int64_t ret = iscsi_co_get_block_status(bs,
-                          sector_num - sector_num % iscsilun->cluster_sectors,
-                          BDRV_REQUEST_MAX_SECTORS, &pnum, &file);
+        int64_t head;
+        int ret;
+
+        assert(iscsilun->cluster_size);
+        head = (sector_num * BDRV_SECTOR_SIZE) % iscsilun->cluster_size;
+        ret = iscsi_co_block_status(bs, true,
+                                    sector_num * BDRV_SECTOR_SIZE - head,
+                                    BDRV_REQUEST_MAX_BYTES, &pnum, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        /* if the whole request falls into an unallocated area we can avoid
-         * to read and directly return zeroes instead */
+         * reading and directly return zeroes instead */
        if (ret & BDRV_BLOCK_ZERO &&
-            pnum >= nb_sectors + sector_num % iscsilun->cluster_sectors) {
+            pnum >= nb_sectors * BDRV_SECTOR_SIZE + head) {
            qemu_iovec_memset(iov, 0, 0x00, iov->size);
            return 0;
        }
@@ -853,19 +878,23 @@ retry:
        iTask.complete = 0;
        goto retry;
    }
-    qemu_mutex_unlock(&iscsilun->mutex);

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return iTask.err_code;
+        error_report("iSCSI READ10/16 failed at lba %" PRIu64 ": %s",
+                     lba, iTask.err_str);
+        r = iTask.err_code;
    }

-    return 0;
+    qemu_mutex_unlock(&iscsilun->mutex);
+    g_free(iTask.err_str);
+    return r;
 }

 static int coroutine_fn iscsi_co_flush(BlockDriverState *bs)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct IscsiTask iTask;
+    int r = 0;

    iscsi_co_init_iscsitask(iscsilun, &iTask);
    qemu_mutex_lock(&iscsilun->mutex);
@@ -892,13 +921,15 @@ retry:
        iTask.complete = 0;
        goto retry;
    }
-    qemu_mutex_unlock(&iscsilun->mutex);

    if (iTask.status != SCSI_STATUS_GOOD) {
-        return iTask.err_code;
+        error_report("iSCSI SYNCHRONIZECACHE10 failed: %s", iTask.err_str);
+        r = iTask.err_code;
    }

-    return 0;
+    qemu_mutex_unlock(&iscsilun->mutex);
+    g_free(iTask.err_str);
+    return r;
 }

 #ifdef __linux__
@@ -1128,6 +1159,8 @@ retry:
        goto retry;
    }

+    iscsi_allocmap_set_invalid(iscsilun, offset, bytes);
+
    if (iTask.status == SCSI_STATUS_CHECK_CONDITION) {
        /* the target might fail with a check condition if it
           is not happy with the alignment of the UNMAP request
@@ -1136,15 +1169,15 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
+        error_report("iSCSI UNMAP failed at lba %" PRIu64 ": %s",
+                     list.lba, iTask.err_str);
        r = iTask.err_code;
        goto out_unlock;
    }

-    iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                               bytes >> BDRV_SECTOR_BITS);
-
 out_unlock:
    qemu_mutex_unlock(&iscsilun->mutex);
+    g_free(iTask.err_str);
    return r;
 }

@@ -1239,22 +1272,22 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                                   bytes >> BDRV_SECTOR_BITS);
+        iscsi_allocmap_set_invalid(iscsilun, offset, bytes);
+        error_report("iSCSI WRITESAME10/16 failed at lba %" PRIu64 ": %s",
+                     lba, iTask.err_str);
        r = iTask.err_code;
        goto out_unlock;
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
-        iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                                   bytes >> BDRV_SECTOR_BITS);
+        iscsi_allocmap_set_invalid(iscsilun, offset, bytes);
    } else {
-        iscsi_allocmap_set_allocated(iscsilun, offset >> BDRV_SECTOR_BITS,
-                                     bytes >> BDRV_SECTOR_BITS);
+        iscsi_allocmap_set_allocated(iscsilun, offset, bytes);
    }

 out_unlock:
    qemu_mutex_unlock(&iscsilun->mutex);
+    g_free(iTask.err_str);
    return r;
 }

@@ -1853,7 +1886,6 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    if (iscsilun->dpofua) {
        bs->supported_write_flags = BDRV_REQ_FUA;
    }
-    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;

    /* Check the write protect flag of the LUN if we want to write */
    if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
@@ -1930,13 +1962,17 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
     * reasonable size */
    if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 4 * 1024 &&
        iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
-        iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
-                                     iscsilun->block_size) >> BDRV_SECTOR_BITS;
+        iscsilun->cluster_size = iscsilun->bl.opt_unmap_gran *
+            iscsilun->block_size;
        if (iscsilun->lbprz) {
            ret = iscsi_allocmap_init(iscsilun, bs->open_flags);
        }
    }

+    if (iscsilun->lbprz && iscsilun->lbp.lbpws) {
+        bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
+    }
+
 out:
    qemu_opts_del(opts);
    g_free(initiator_name);
@@ -2081,7 +2117,8 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset,
    return 0;
 }

-static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn iscsi_co_create_opts(const char *filename, QemuOpts *opts,
+                                             Error **errp)
 {
    int ret = 0;
    int64_t total_size = 0;
@@ -2136,12 +2173,11 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
    IscsiLun *iscsilun = bs->opaque;
    bdi->unallocated_blocks_are_zero = iscsilun->lbprz;
-    bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws;
-    bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE;
+    bdi->cluster_size = iscsilun->cluster_size;
    return 0;
 }

-static void iscsi_invalidate_cache(BlockDriverState *bs,
+static void coroutine_fn iscsi_co_invalidate_cache(BlockDriverState *bs,
                                                   Error **errp)
 {
    IscsiLun *iscsilun = bs->opaque;
@@ -2169,18 +2205,18 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_parse_filename    = iscsi_parse_filename,
    .bdrv_file_open         = iscsi_open,
    .bdrv_close             = iscsi_close,
-    .bdrv_create            = iscsi_create,
+    .bdrv_co_create_opts    = iscsi_co_create_opts,
    .create_opts            = &iscsi_create_opts,
    .bdrv_reopen_prepare    = iscsi_reopen_prepare,
    .bdrv_reopen_commit     = iscsi_reopen_commit,
-    .bdrv_invalidate_cache  = iscsi_invalidate_cache,
+    .bdrv_co_invalidate_cache = iscsi_co_invalidate_cache,

    .bdrv_getlength  = iscsi_getlength,
    .bdrv_get_info   = iscsi_get_info,
    .bdrv_truncate   = iscsi_truncate,
    .bdrv_refresh_limits = iscsi_refresh_limits,

-    .bdrv_co_get_block_status = iscsi_co_get_block_status,
+    .bdrv_co_block_status  = iscsi_co_block_status,
    .bdrv_co_pdiscard      = iscsi_co_pdiscard,
    .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
    .bdrv_co_readv         = iscsi_co_readv,
@@ -2204,7 +2240,7 @@ static BlockDriver bdrv_iser = {
    .bdrv_parse_filename    = iscsi_parse_filename,
    .bdrv_file_open         = iscsi_open,
    .bdrv_close             = iscsi_close,
-    .bdrv_create            = iscsi_create,
+    .bdrv_co_create_opts    = iscsi_co_create_opts,
    .create_opts            = &iscsi_create_opts,
    .bdrv_reopen_prepare    = iscsi_reopen_prepare,
    .bdrv_reopen_commit     = iscsi_reopen_commit,
@@ -2215,7 +2251,7 @@ static BlockDriver bdrv_iser = {
    .bdrv_truncate   = iscsi_truncate,
    .bdrv_refresh_limits = iscsi_refresh_limits,

-    .bdrv_co_get_block_status = iscsi_co_get_block_status,
+    .bdrv_co_block_status  = iscsi_co_block_status,
    .bdrv_co_pdiscard      = iscsi_co_pdiscard,
    .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
    .bdrv_co_readv         = iscsi_co_readv,
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1094,7 +1094,7 @@ static BlockDriver bdrv_mirror_top = {
    .bdrv_co_pwrite_zeroes      = bdrv_mirror_top_pwrite_zeroes,
    .bdrv_co_pdiscard           = bdrv_mirror_top_pdiscard,
    .bdrv_co_flush              = bdrv_mirror_top_flush,
-    .bdrv_co_get_block_status   = bdrv_co_get_block_status_from_backing,
+    .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
    .bdrv_refresh_filename      = bdrv_mirror_top_refresh_filename,
    .bdrv_close                 = bdrv_mirror_top_close,
    .bdrv_child_perm            = bdrv_mirror_top_child_perm,
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -846,9 +846,6 @@ int nbd_client_init(BlockDriverState *bs,
    if (client->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) {
        bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP;
    }
-    if (client->info.min_block > bs->bl.request_alignment) {
-        bs->bl.request_alignment = client->info.min_block;
-    }

    qemu_co_mutex_init(&client->send_mutex);
    qemu_co_queue_init(&client->free_sema);
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -32,11 +32,11 @@
 #include "qemu/uri.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
-#include "qapi-visit.h"
+#include "qemu/option.h"
+#include "qapi/qapi-visit-sockets.h"
 #include "qapi/qobject-input-visitor.h"
 #include "qapi/qobject-output-visitor.h"
 #include "qapi/qmp/qdict.h"
-#include "qapi/qmp/qjson.h"
 #include "qapi/qmp/qstring.h"
 #include "qemu/cutils.h"

@@ -388,6 +388,7 @@ static QemuOptsList nbd_runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "ID of the TLS credentials to use",
        },
+        { /* end of list */ }
    },
 };

@@ -473,8 +474,10 @@ static int nbd_co_flush(BlockDriverState *bs)
 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    NBDClientSession *s = nbd_get_client_session(bs);
+    uint32_t min = s->info.min_block;
    uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block);

+    bs->bl.request_alignment = min ? min : BDRV_SECTOR_SIZE;
    bs->bl.max_pdiscard = max;
    bs->bl.max_pwrite_zeroes = max;
    bs->bl.max_transfer = max;
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -25,19 +25,19 @@
 #include "qemu/osdep.h"

 #include <poll.h>
-#include "qemu-common.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "block/block_int.h"
 #include "trace.h"
 #include "qemu/iov.h"
+#include "qemu/option.h"
 #include "qemu/uri.h"
 #include "qemu/cutils.h"
 #include "sysemu/sysemu.h"
+#include "qapi/qapi-visit-block-core.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
-#include "qapi-visit.h"
 #include "qapi/qobject-input-visitor.h"
 #include "qapi/qobject-output-visitor.h"
 #include <nfsc/libnfs.h>
@@ -367,49 +367,6 @@ static int coroutine_fn nfs_co_flush(BlockDriverState *bs)
    return task.ret;
 }

-static QemuOptsList runtime_opts = {
-    .name = "nfs",
-    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
-    .desc = {
-        {
-            .name = "path",
-            .type = QEMU_OPT_STRING,
-            .help = "Path of the image on the host",
-        },
-        {
-            .name = "user",
-            .type = QEMU_OPT_NUMBER,
-            .help = "UID value to use when talking to the server",
-        },
-        {
-            .name = "group",
-            .type = QEMU_OPT_NUMBER,
-            .help = "GID value to use when talking to the server",
-        },
-        {
-            .name = "tcp-syn-count",
-            .type = QEMU_OPT_NUMBER,
-            .help = "Number of SYNs to send during the session establish",
-        },
-        {
-            .name = "readahead-size",
-            .type = QEMU_OPT_NUMBER,
-            .help = "Set the readahead size in bytes",
-        },
-        {
-            .name = "page-cache-size",
-            .type = QEMU_OPT_NUMBER,
-            .help = "Set the pagecache size in bytes",
-        },
-        {
-            .name = "debug",
-            .type = QEMU_OPT_NUMBER,
-            .help = "Set the NFS debug level (max 2)",
-        },
-        { /* end of list */ }
-    },
-};
-
 static void nfs_detach_aio_context(BlockDriverState *bs)
 {
    NFSClient *client = bs->opaque;
@@ -452,71 +409,16 @@ static void nfs_file_close(BlockDriverState *bs)
    nfs_client_close(client);
 }

-static NFSServer *nfs_config(QDict *options, Error **errp)
-{
-    NFSServer *server = NULL;
-    QDict *addr = NULL;
-    QObject *crumpled_addr = NULL;
-    Visitor *iv = NULL;
-    Error *local_error = NULL;
-
-    qdict_extract_subqdict(options, &addr, "server.");
-    if (!qdict_size(addr)) {
-        error_setg(errp, "NFS server address missing");
-        goto out;
-    }
-
-    crumpled_addr = qdict_crumple(addr, errp);
-    if (!crumpled_addr) {
-        goto out;
-    }
-
-    /*
-     * Caution: this works only because all scalar members of
-     * NFSServer are QString in @crumpled_addr.  The visitor expects
-     * @crumpled_addr to be typed according to the QAPI schema.  It
-     * is when @options come from -blockdev or blockdev_add.  But when
-     * they come from -drive, they're all QString.
-     */
-    iv = qobject_input_visitor_new(crumpled_addr);
-    visit_type_NFSServer(iv, NULL, &server, &local_error);
-    if (local_error) {
-        error_propagate(errp, local_error);
-        goto out;
-    }
-
-out:
-    QDECREF(addr);
-    qobject_decref(crumpled_addr);
-    visit_free(iv);
-    return server;
-}
-
-
-static int64_t nfs_client_open(NFSClient *client, QDict *options,
+static int64_t nfs_client_open(NFSClient *client, BlockdevOptionsNfs *opts,
                               int flags, int open_flags, Error **errp)
 {
    int64_t ret = -EINVAL;
-    QemuOpts *opts = NULL;
-    Error *local_err = NULL;
    struct stat st;
    char *file = NULL, *strp = NULL;

    qemu_mutex_init(&client->mutex);
-    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
-    qemu_opts_absorb_qdict(opts, options, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        ret = -EINVAL;
-        goto fail;
-    }

-    client->path = g_strdup(qemu_opt_get(opts, "path"));
-    if (!client->path) {
-        ret = -EINVAL;
-        error_setg(errp, "No path was specified");
-        goto fail;
-    }
+    client->path = g_strdup(opts->path);

    strp = strrchr(client->path, '/');
    if (strp == NULL) {
@@ -526,12 +428,10 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options,
    file = g_strdup(strp);
    *strp = 0;

-    /* Pop the config into our state object, Exit if invalid */
-    client->server = nfs_config(options, errp);
-    if (!client->server) {
-        ret = -EINVAL;
-        goto fail;
-    }
+    /* Steal the NFSServer object from opts; set the original pointer to NULL
+     * to avoid use after free and double free. */
+    client->server = opts->server;
+    opts->server = NULL;

    client->context = nfs_init_context();
    if (client->context == NULL) {
@@ -539,29 +439,29 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options,
        goto fail;
    }

-    if (qemu_opt_get(opts, "user")) {
-        client->uid = qemu_opt_get_number(opts, "user", 0);
+    if (opts->has_user) {
+        client->uid = opts->user;
        nfs_set_uid(client->context, client->uid);
    }

-    if (qemu_opt_get(opts, "group")) {
-        client->gid = qemu_opt_get_number(opts, "group", 0);
+    if (opts->has_group) {
+        client->gid = opts->group;
        nfs_set_gid(client->context, client->gid);
    }

-    if (qemu_opt_get(opts, "tcp-syn-count")) {
-        client->tcp_syncnt = qemu_opt_get_number(opts, "tcp-syn-count", 0);
+    if (opts->has_tcp_syn_count) {
+        client->tcp_syncnt = opts->tcp_syn_count;
        nfs_set_tcp_syncnt(client->context, client->tcp_syncnt);
    }

 #ifdef LIBNFS_FEATURE_READAHEAD
-    if (qemu_opt_get(opts, "readahead-size")) {
+    if (opts->has_readahead_size) {
        if (open_flags & BDRV_O_NOCACHE) {
            error_setg(errp, "Cannot enable NFS readahead "
                             "if cache.direct = on");
            goto fail;
        }
-        client->readahead = qemu_opt_get_number(opts, "readahead-size", 0);
+        client->readahead = opts->readahead_size;
        if (client->readahead > QEMU_NFS_MAX_READAHEAD_SIZE) {
            warn_report("Truncating NFS readahead size to %d",
                        QEMU_NFS_MAX_READAHEAD_SIZE);
@@ -576,13 +476,13 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options,
 #endif

 #ifdef LIBNFS_FEATURE_PAGECACHE
-    if (qemu_opt_get(opts, "page-cache-size")) {
+    if (opts->has_page_cache_size) {
        if (open_flags & BDRV_O_NOCACHE) {
            error_setg(errp, "Cannot enable NFS pagecache "
                             "if cache.direct = on");
            goto fail;
        }
-        client->pagecache = qemu_opt_get_number(opts, "page-cache-size", 0);
+        client->pagecache = opts->page_cache_size;
        if (client->pagecache > QEMU_NFS_MAX_PAGECACHE_SIZE) {
            warn_report("Truncating NFS pagecache size to %d pages",
                        QEMU_NFS_MAX_PAGECACHE_SIZE);
@@ -595,8 +495,8 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options,
 #endif

 #ifdef LIBNFS_FEATURE_DEBUG
-    if (qemu_opt_get(opts, "debug")) {
-        client->debug = qemu_opt_get_number(opts, "debug", 0);
+    if (opts->has_debug) {
+        client->debug = opts->debug;
        /* limit the maximum debug level to avoid potential flooding
         * of our log files. */
        if (client->debug > QEMU_NFS_MAX_DEBUG_LEVEL) {
@@ -647,11 +547,53 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options,
 fail:
    nfs_client_close(client);
 out:
-    qemu_opts_del(opts);
    g_free(file);
    return ret;
 }

+static BlockdevOptionsNfs *nfs_options_qdict_to_qapi(QDict *options,
+                                                     Error **errp)
+{
+    BlockdevOptionsNfs *opts = NULL;
+    QObject *crumpled = NULL;
+    Visitor *v;
+    Error *local_err = NULL;
+
+    crumpled = qdict_crumple(options, errp);
+    if (crumpled == NULL) {
+        return NULL;
+    }
+
+    v = qobject_input_visitor_new_keyval(crumpled);
+    visit_type_BlockdevOptionsNfs(v, NULL, &opts, &local_err);
+    visit_free(v);
+    qobject_decref(crumpled);
+
+    if (local_err) {
+        return NULL;
+    }
+
+    return opts;
+}
+
+static int64_t nfs_client_open_qdict(NFSClient *client, QDict *options,
+                                     int flags, int open_flags, Error **errp)
+{
+    BlockdevOptionsNfs *opts;
+    int ret;
+
+    opts = nfs_options_qdict_to_qapi(options, errp);
+    if (opts == NULL) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    ret = nfs_client_open(client, opts, flags, open_flags, errp);
+fail:
+    qapi_free_BlockdevOptionsNfs(opts);
+    return ret;
+}
+
 static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp) {
    NFSClient *client = bs->opaque;
@@ -659,7 +601,7 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags,

    client->aio_context = bdrv_get_aio_context(bs);

-    ret = nfs_client_open(client, options,
+    ret = nfs_client_open_qdict(client, options,
                                (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY,
                                bs->open_flags, errp);
    if (ret < 0) {
@@ -684,16 +626,42 @@ static QemuOptsList nfs_create_opts = {
    }
 };

-static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
+static int nfs_file_co_create(BlockdevCreateOptions *options, Error **errp)
 {
-    int64_t ret, total_size;
+    BlockdevCreateOptionsNfs *opts = &options->u.nfs;
    NFSClient *client = g_new0(NFSClient, 1);
-    QDict *options = NULL;
+    int ret;
+
+    assert(options->driver == BLOCKDEV_DRIVER_NFS);

    client->aio_context = qemu_get_aio_context();

+    ret = nfs_client_open(client, opts->location, O_CREAT, 0, errp);
+    if (ret < 0) {
+        goto out;
+    }
+    ret = nfs_ftruncate(client->context, client->fh, opts->size);
+    nfs_client_close(client);
+
+out:
+    g_free(client);
+    return ret;
+}
+
+static int coroutine_fn nfs_file_co_create_opts(const char *url, QemuOpts *opts,
+                                                Error **errp)
+{
+    BlockdevCreateOptions *create_options;
+    BlockdevCreateOptionsNfs *nfs_opts;
+    QDict *options;
+    int ret;
+
+    create_options = g_new0(BlockdevCreateOptions, 1);
+    create_options->driver = BLOCKDEV_DRIVER_NFS;
+    nfs_opts = &create_options->u.nfs;
+
    /* Read out options */
-    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+    nfs_opts->size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                              BDRV_SECTOR_SIZE);

    options = qdict_new();
@@ -702,15 +670,21 @@ static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
        goto out;
    }

-    ret = nfs_client_open(client, options, O_CREAT, 0, errp);
+    nfs_opts->location = nfs_options_qdict_to_qapi(options, errp);
+    if (nfs_opts->location == NULL) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = nfs_file_co_create(create_options, errp);
    if (ret < 0) {
        goto out;
    }
-    ret = nfs_ftruncate(client->context, client->fh, total_size);
-    nfs_client_close(client);
+
+    ret = 0;
 out:
    QDECREF(options);
-    g_free(client);
+    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }

@@ -875,7 +849,7 @@ static void nfs_refresh_filename(BlockDriverState *bs, QDict *options)
 }

 #ifdef LIBNFS_FEATURE_PAGECACHE
-static void nfs_invalidate_cache(BlockDriverState *bs,
+static void coroutine_fn nfs_co_invalidate_cache(BlockDriverState *bs,
                                                 Error **errp)
 {
    NFSClient *client = bs->opaque;
@@ -897,7 +871,8 @@ static BlockDriver bdrv_nfs = {

    .bdrv_file_open                 = nfs_file_open,
    .bdrv_close                     = nfs_file_close,
-    .bdrv_create                    = nfs_file_create,
+    .bdrv_co_create                 = nfs_file_co_create,
+    .bdrv_co_create_opts            = nfs_file_co_create_opts,
    .bdrv_reopen_prepare            = nfs_reopen_prepare,

    .bdrv_co_preadv                 = nfs_co_preadv,
@@ -909,7 +884,7 @@ static BlockDriver bdrv_nfs = {
    .bdrv_refresh_filename          = nfs_refresh_filename,

 #ifdef LIBNFS_FEATURE_PAGECACHE
-    .bdrv_invalidate_cache          = nfs_invalidate_cache,
+    .bdrv_co_invalidate_cache       = nfs_co_invalidate_cache,
 #endif
 };

--- a/block/null.c
+++ b/block/null.c
@@ -14,6 +14,7 @@
 #include "qapi/error.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
+#include "qemu/option.h"
 #include "block/block_int.h"

 #define NULL_OPT_LATENCY "latency-ns"
@@ -110,8 +111,7 @@ static coroutine_fn int null_co_common(BlockDriverState *bs)
    BDRVNullState *s = bs->opaque;

    if (s->latency_ns) {
-        co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME,
-                        s->latency_ns);
+        qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, s->latency_ns);
    }
    return 0;
 }
@@ -223,22 +223,23 @@ static int null_reopen_prepare(BDRVReopenState *reopen_state,
    return 0;
 }

-static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs,
-                                                     int64_t sector_num,
-                                                     int nb_sectors, int *pnum,
+static int coroutine_fn null_co_block_status(BlockDriverState *bs,
+                                             bool want_zero, int64_t offset,
+                                             int64_t bytes, int64_t *pnum,
+                                             int64_t *map,
                                             BlockDriverState **file)
 {
    BDRVNullState *s = bs->opaque;
-    off_t start = sector_num * BDRV_SECTOR_SIZE;
+    int ret = BDRV_BLOCK_OFFSET_VALID;

-    *pnum = nb_sectors;
+    *pnum = bytes;
+    *map = offset;
    *file = bs;

    if (s->read_zeroes) {
-        return BDRV_BLOCK_OFFSET_VALID | start | BDRV_BLOCK_ZERO;
-    } else {
-        return BDRV_BLOCK_OFFSET_VALID | start;
+        ret |= BDRV_BLOCK_ZERO;
    }
+    return ret;
 }

 static void null_refresh_filename(BlockDriverState *bs, QDict *opts)
@@ -270,7 +271,7 @@ static BlockDriver bdrv_null_co = {
    .bdrv_co_flush_to_disk  = null_co_flush,
    .bdrv_reopen_prepare    = null_reopen_prepare,

-    .bdrv_co_get_block_status   = null_co_get_block_status,
+    .bdrv_co_block_status   = null_co_block_status,

    .bdrv_refresh_filename  = null_refresh_filename,
 };
@@ -290,7 +291,7 @@ static BlockDriver bdrv_null_aio = {
    .bdrv_aio_flush         = null_aio_flush,
    .bdrv_reopen_prepare    = null_reopen_prepare,

-    .bdrv_co_get_block_status   = null_co_get_block_status,
+    .bdrv_co_block_status   = null_co_block_status,

    .bdrv_refresh_filename  = null_refresh_filename,
 };
--- a/block/nvme.c
+++ b/block/nvme.c
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -27,15 +27,17 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+
 #include "qemu/osdep.h"
 #include "qapi/error.h"
-#include "qemu-common.h"
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
+#include "qemu/option.h"
 #include "qemu/bswap.h"
 #include "qemu/bitmap.h"
 #include "migration/blocker.h"
+#include "parallels.h"

 /**************************************************************/

@@ -45,30 +47,6 @@
 #define HEADER_INUSE_MAGIC  (0x746F6E59)
 #define MAX_PARALLELS_IMAGE_FACTOR (1ull << 32)

-#define DEFAULT_CLUSTER_SIZE 1048576        /* 1 MiB */
-
-
-// always little-endian
-typedef struct ParallelsHeader {
-    char magic[16]; // "WithoutFreeSpace"
-    uint32_t version;
-    uint32_t heads;
-    uint32_t cylinders;
-    uint32_t tracks;
-    uint32_t bat_entries;
-    uint64_t nb_sectors;
-    uint32_t inuse;
-    uint32_t data_off;
-    char padding[12];
-} QEMU_PACKED ParallelsHeader;
-
-
-typedef enum ParallelsPreallocMode {
-    PRL_PREALLOC_MODE_FALLOCATE = 0,
-    PRL_PREALLOC_MODE_TRUNCATE = 1,
-    PRL_PREALLOC_MODE__MAX = 2,
-} ParallelsPreallocMode;
-
 static QEnumLookup prealloc_mode_lookup = {
    .array = (const char *const[]) {
        "falloc",
@@ -77,34 +55,6 @@ static QEnumLookup prealloc_mode_lookup = {
    .size = PRL_PREALLOC_MODE__MAX
 };

-typedef struct BDRVParallelsState {
-    /** Locking is conservative, the lock protects
-     *   - image file extending (truncate, fallocate)
-     *   - any access to block allocation table
-     */
-    CoMutex lock;
-
-    ParallelsHeader *header;
-    uint32_t header_size;
-    bool header_unclean;
-
-    unsigned long *bat_dirty_bmap;
-    unsigned int  bat_dirty_block;
-
-    uint32_t *bat_bitmap;
-    unsigned int bat_size;
-
-    int64_t  data_end;
-    uint64_t prealloc_size;
-    ParallelsPreallocMode prealloc_mode;
-
-    unsigned int tracks;
-
-    unsigned int off_multiplier;
-    Error *migration_blocker;
-} BDRVParallelsState;
-
-
 #define PARALLELS_OPT_PREALLOC_MODE     "prealloc-mode"
 #define PARALLELS_OPT_PREALLOC_SIZE     "prealloc-size"

@@ -193,6 +143,7 @@ static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
 static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors, int *pnum)
 {
+    int ret;
    BDRVParallelsState *s = bs->opaque;
    int64_t pos, space, idx, to_allocate, i, len;

@@ -221,7 +172,6 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
        return len;
    }
    if (s->data_end + space > (len >> BDRV_SECTOR_BITS)) {
-        int ret;
        space += s->prealloc_size;
        if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
            ret = bdrv_pwrite_zeroes(bs->file,
@@ -237,6 +187,37 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
        }
    }

+    /* Try to read from backing to fill empty clusters
+     * FIXME: 1. previous write_zeroes may be redundant
+     *        2. most of data we read from backing will be rewritten by
+     *           parallels_co_writev. On aligned-to-cluster write we do not need
+     *           this read at all.
+     *        3. it would be good to combine write of data from backing and new
+     *           data into one write call */
+    if (bs->backing) {
+        int64_t nb_cow_sectors = to_allocate * s->tracks;
+        int64_t nb_cow_bytes = nb_cow_sectors << BDRV_SECTOR_BITS;
+        QEMUIOVector qiov;
+        struct iovec iov = {
+            .iov_len = nb_cow_bytes,
+            .iov_base = qemu_blockalign(bs, nb_cow_bytes)
+        };
+        qemu_iovec_init_external(&qiov, &iov, 1);
+
+        ret = bdrv_co_readv(bs->backing, idx * s->tracks, nb_cow_sectors,
+                            &qiov);
+        if (ret < 0) {
+            qemu_vfree(iov.iov_base);
+            return ret;
+        }
+
+        ret = bdrv_co_writev(bs->file, s->data_end, nb_cow_sectors, &qiov);
+        qemu_vfree(iov.iov_base);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
    for (i = 0; i < to_allocate; i++) {
        s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
        s->data_end += s->tracks;
@@ -280,23 +261,31 @@ static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs)
 }


-static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+static int coroutine_fn parallels_co_block_status(BlockDriverState *bs,
+                                                  bool want_zero,
+                                                  int64_t offset,
+                                                  int64_t bytes,
+                                                  int64_t *pnum,
+                                                  int64_t *map,
+                                                  BlockDriverState **file)
 {
    BDRVParallelsState *s = bs->opaque;
-    int64_t offset;
+    int count;

+    assert(QEMU_IS_ALIGNED(offset | bytes, BDRV_SECTOR_SIZE));
    qemu_co_mutex_lock(&s->lock);
-    offset = block_status(s, sector_num, nb_sectors, pnum);
+    offset = block_status(s, offset >> BDRV_SECTOR_BITS,
+                          bytes >> BDRV_SECTOR_BITS, &count);
    qemu_co_mutex_unlock(&s->lock);

+    *pnum = count * BDRV_SECTOR_SIZE;
    if (offset < 0) {
        return 0;
    }

+    *map = offset * BDRV_SECTOR_SIZE;
    *file = bs->file->bs;
-    return (offset << BDRV_SECTOR_BITS) |
-        BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 }

 static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
@@ -360,12 +349,19 @@ static coroutine_fn int parallels_co_readv(BlockDriverState *bs,

        nbytes = n << BDRV_SECTOR_BITS;

-        if (position < 0) {
-            qemu_iovec_memset(qiov, bytes_done, 0, nbytes);
-        } else {
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes);

+        if (position < 0) {
+            if (bs->backing) {
+                ret = bdrv_co_readv(bs->backing, sector_num, n, &hd_qiov);
+                if (ret < 0) {
+                    break;
+                }
+            } else {
+                qemu_iovec_memset(&hd_qiov, 0, 0, nbytes);
+            }
+        } else {
            ret = bdrv_co_readv(bs->file, position, n, &hd_qiov);
            if (ret < 0) {
                break;
@@ -382,7 +378,8 @@ static coroutine_fn int parallels_co_readv(BlockDriverState *bs,
 }


-static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res,
+static int coroutine_fn parallels_co_check(BlockDriverState *bs,
+                                           BdrvCheckResult *res,
                                           BdrvCheckMode fix)
 {
    BDRVParallelsState *s = bs->opaque;
@@ -398,6 +395,7 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res,
        return size;
    }

+    qemu_co_mutex_lock(&s->lock);
    if (s->header_unclean) {
        fprintf(stderr, "%s image was not closed correctly\n",
                fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
@@ -446,11 +444,12 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res,
        prev_off = off;
    }

+    ret = 0;
    if (flush_bat) {
        ret = bdrv_pwrite_sync(bs->file, 0, s->header, s->header_size);
        if (ret < 0) {
            res->check_errors++;
-            return ret;
+            goto out;
        }
    }

@@ -469,17 +468,21 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res,
            if (ret < 0) {
                error_report_err(local_err);
                res->check_errors++;
-                return ret;
+                goto out;
            }
            res->leaks_fixed += count;
        }
    }

-    return 0;
+out:
+    qemu_co_mutex_unlock(&s->lock);
+    return ret;
 }


-static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn parallels_co_create_opts(const char *filename,
+                                                 QemuOpts *opts,
+                                                 Error **errp)
 {
    int64_t total_size, cl_size;
    uint8_t tmp[BDRV_SECTOR_SIZE];
@@ -527,8 +530,9 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    memcpy(header.magic, HEADER_MAGIC2, sizeof(header.magic));
    header.version = cpu_to_le32(HEADER_VERSION);
    /* don't care much about geometry, it is not used on image level */
-    header.heads = cpu_to_le32(16);
-    header.cylinders = cpu_to_le32(total_size / BDRV_SECTOR_SIZE / 16 / 32);
+    header.heads = cpu_to_le32(HEADS_NUMBER);
+    header.cylinders = cpu_to_le32(total_size / BDRV_SECTOR_SIZE
+                                   / HEADS_NUMBER / SEC_IN_CYL);
    header.tracks = cpu_to_le32(cl_size >> BDRV_SECTOR_BITS);
    header.bat_entries = cpu_to_le32(bat_entries);
    header.nb_sectors = cpu_to_le64(DIV_ROUND_UP(total_size, BDRV_SECTOR_SIZE));
@@ -793,14 +797,14 @@ static BlockDriver bdrv_parallels = {
    .bdrv_open		= parallels_open,
    .bdrv_close		= parallels_close,
    .bdrv_child_perm          = bdrv_format_default_perms,
-    .bdrv_co_get_block_status = parallels_co_get_block_status,
+    .bdrv_co_block_status     = parallels_co_block_status,
    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
    .bdrv_co_flush_to_os      = parallels_co_flush_to_os,
    .bdrv_co_readv  = parallels_co_readv,
    .bdrv_co_writev = parallels_co_writev,
-
-    .bdrv_create    = parallels_create,
-    .bdrv_check     = parallels_check,
+    .supports_backing = true,
+    .bdrv_co_create_opts = parallels_co_create_opts,
+    .bdrv_co_check  = parallels_co_check,
    .create_opts    = &parallels_create_opts,
 };

--- a/block/parallels.h
+++ b/block/parallels.h
@@ -0,0 +1,87 @@
+/*
+* Block driver for Parallels disk image format
+*
+* Copyright (c) 2015-2017 Virtuozzo, Inc.
+* Authors:
+*         2016-2017 Klim S. Kireev <klim.kireev@virtuozzo.com>
+*         2015 Denis V. Lunev <den@openvz.org>
+*
+* This code was originally based on comparing different disk images created
+* by Parallels. Currently it is based on opened OpenVZ sources
+* available at
+*     https://github.com/OpenVZ/ploop
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*/
+#ifndef BLOCK_PARALLELS_H
+#define BLOCK_PARALLELS_H
+#include "qemu/coroutine.h"
+
+#define HEADS_NUMBER 16
+#define SEC_IN_CYL 32
+#define DEFAULT_CLUSTER_SIZE 1048576        /* 1 MiB */
+
+/* always little-endian */
+typedef struct ParallelsHeader {
+    char magic[16]; /* "WithoutFreeSpace" */
+    uint32_t version;
+    uint32_t heads;
+    uint32_t cylinders;
+    uint32_t tracks;
+    uint32_t bat_entries;
+    uint64_t nb_sectors;
+    uint32_t inuse;
+    uint32_t data_off;
+    char padding[12];
+} QEMU_PACKED ParallelsHeader;
+
+typedef enum ParallelsPreallocMode {
+    PRL_PREALLOC_MODE_FALLOCATE = 0,
+    PRL_PREALLOC_MODE_TRUNCATE = 1,
+    PRL_PREALLOC_MODE__MAX = 2,
+} ParallelsPreallocMode;
+
+typedef struct BDRVParallelsState {
+    /** Locking is conservative, the lock protects
+     *   - image file extending (truncate, fallocate)
+     *   - any access to block allocation table
+     */
+    CoMutex lock;
+
+    ParallelsHeader *header;
+    uint32_t header_size;
+    bool header_unclean;
+
+    unsigned long *bat_dirty_bmap;
+    unsigned int  bat_dirty_block;
+
+    uint32_t *bat_bitmap;
+    unsigned int bat_size;
+
+    int64_t  data_end;
+    uint64_t prealloc_size;
+    ParallelsPreallocMode prealloc_mode;
+
+    unsigned int tracks;
+
+    unsigned int off_multiplier;
+    Error *migration_blocker;
+} BDRVParallelsState;
+
+#endif
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -27,10 +27,15 @@
 #include "block/block_int.h"
 #include "block/throttle-groups.h"
 #include "block/write-threshold.h"
-#include "qmp-commands.h"
-#include "qapi-visit.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-block-core.h"
 #include "qapi/qobject-output-visitor.h"
-#include "qapi/qmp/types.h"
+#include "qapi/qapi-visit-block-core.h"
+#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qnum.h"
+#include "qapi/qmp/qstring.h"
 #include "sysemu/block-backend.h"
 #include "qemu/cutils.h"

--- a/block/qcow.c
+++ b/block/qcow.c
@@ -21,16 +21,17 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+
 #include "qemu/osdep.h"
 #include "qapi/error.h"
-#include "qemu-common.h"
 #include "qemu/error-report.h"
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
+#include "qemu/option.h"
 #include "qemu/bswap.h"
 #include <zlib.h>
-#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
 #include "crypto/block.h"
 #include "migration/blocker.h"
@@ -379,6 +380,7 @@ static int get_cluster_offset(BlockDriverState *bs,
        /* update the L1 entry */
        s->l1_table[l1_index] = l2_offset;
        tmp = cpu_to_be64(l2_offset);
+        BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
        ret = bdrv_pwrite_sync(bs->file,
                               s->l1_table_offset + l1_index * sizeof(tmp),
                               &tmp, sizeof(tmp));
@@ -409,6 +411,7 @@ static int get_cluster_offset(BlockDriverState *bs,
        }
    }
    l2_table = s->l2_cache + (min_index << s->l2_bits);
+    BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
    if (new_l2_table) {
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
        ret = bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
@@ -432,6 +435,7 @@ static int get_cluster_offset(BlockDriverState *bs,
        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
        if (!allocate)
            return 0;
+        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
        /* allocate a new cluster */
        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
            (n_end - n_start) < s->cluster_sectors) {
@@ -447,6 +451,7 @@ static int get_cluster_offset(BlockDriverState *bs,
            }
            cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size);
            /* write the cluster content */
+            BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
            ret = bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache,
                              s->cluster_size);
            if (ret < 0) {
@@ -486,6 +491,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                                                      NULL) < 0) {
                                return -EIO;
                            }
+                            BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
                            ret = bdrv_pwrite(bs->file,
                                              cluster_offset + i * 512,
                                              s->cluster_data, 512);
@@ -503,6 +509,11 @@ static int get_cluster_offset(BlockDriverState *bs,
        /* update L2 table */
        tmp = cpu_to_be64(cluster_offset);
        l2_table[l2_index] = tmp;
+        if (allocate == 2) {
+            BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
+        } else {
+            BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
+        }
        ret = bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
                               &tmp, sizeof(tmp));
        if (ret < 0) {
@@ -513,23 +524,28 @@ static int get_cluster_offset(BlockDriverState *bs,
    return 1;
 }

-static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+static int coroutine_fn qcow_co_block_status(BlockDriverState *bs,
+                                             bool want_zero,
+                                             int64_t offset, int64_t bytes,
+                                             int64_t *pnum, int64_t *map,
+                                             BlockDriverState **file)
 {
    BDRVQcowState *s = bs->opaque;
-    int index_in_cluster, n, ret;
+    int index_in_cluster, ret;
+    int64_t n;
    uint64_t cluster_offset;

    qemu_co_mutex_lock(&s->lock);
-    ret = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0, &cluster_offset);
+    ret = get_cluster_offset(bs, offset, 0, 0, 0, 0, &cluster_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        return ret;
    }
-    index_in_cluster = sector_num & (s->cluster_sectors - 1);
-    n = s->cluster_sectors - index_in_cluster;
-    if (n > nb_sectors)
-        n = nb_sectors;
+    index_in_cluster = offset & (s->cluster_size - 1);
+    n = s->cluster_size - index_in_cluster;
+    if (n > bytes) {
+        n = bytes;
+    }
    *pnum = n;
    if (!cluster_offset) {
        return 0;
@@ -537,9 +553,9 @@ static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
    if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypto) {
        return BDRV_BLOCK_DATA;
    }
-    cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
+    *map = cluster_offset | index_in_cluster;
    *file = bs->file->bs;
-    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset;
+    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 }

 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
@@ -579,6 +595,7 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
    if (s->cluster_cache_offset != coffset) {
        csize = cluster_offset >> (63 - s->cluster_bits);
        csize &= (s->cluster_size - 1);
+        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
        if (ret != csize)
            return -1;
@@ -635,6 +652,8 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
                hd_iov.iov_len = n * 512;
                qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
                qemu_co_mutex_unlock(&s->lock);
+                /* qcow2 emits this on bs->file instead of bs->backing */
+                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                ret = bdrv_co_readv(bs->backing, sector_num, n, &hd_qiov);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
@@ -661,6 +680,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
            hd_iov.iov_len = n * 512;
            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
            qemu_co_mutex_unlock(&s->lock);
+            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
            ret = bdrv_co_readv(bs->file,
                                (cluster_offset >> 9) + index_in_cluster,
                                n, &hd_qiov);
@@ -754,6 +774,7 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
        hd_iov.iov_len = n * 512;
        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
        qemu_co_mutex_unlock(&s->lock);
+        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        ret = bdrv_co_writev(bs->file,
                             (cluster_offset >> 9) + index_in_cluster,
                             n, &hd_qiov);
@@ -789,7 +810,8 @@ static void qcow_close(BlockDriverState *bs)
    error_free(s->migration_blocker);
 }

-static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn qcow_co_create_opts(const char *filename, QemuOpts *opts,
+                                            Error **errp)
 {
    int header_size, backing_filename_len, l1_size, shift, i;
    QCowHeader header;
@@ -1048,6 +1070,7 @@ qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
        .iov_len    = out_len,
    };
    qemu_iovec_init_external(&hd_qiov, &iov, 1);
+    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
    ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
    if (ret < 0) {
        goto fail;
@@ -1105,13 +1128,13 @@ static BlockDriver bdrv_qcow = {
    .bdrv_close		= qcow_close,
    .bdrv_child_perm        = bdrv_format_default_perms,
    .bdrv_reopen_prepare    = qcow_reopen_prepare,
-    .bdrv_create            = qcow_create,
+    .bdrv_co_create_opts    = qcow_co_create_opts,
    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
    .supports_backing       = true,

    .bdrv_co_readv          = qcow_co_readv,
    .bdrv_co_writev         = qcow_co_writev,
-    .bdrv_co_get_block_status   = qcow_co_get_block_status,
+    .bdrv_co_block_status   = qcow_co_block_status,

    .bdrv_make_empty        = qcow_make_empty,
    .bdrv_co_pwritev_compressed = qcow_co_pwritev_compressed,
--- a/block/qcow2-bitmap.c
+++ b/block/qcow2-bitmap.c
@@ -110,7 +110,7 @@ static int update_header_sync(BlockDriverState *bs)
        return ret;
    }

-    return bdrv_flush(bs);
+    return bdrv_flush(bs->file->bs);
 }

 static inline void bitmap_table_to_be(uint64_t *bitmap_table, size_t size)
@@ -413,8 +413,8 @@ static inline void bitmap_dir_entry_to_be(Qcow2BitmapDirEntry *entry)

 static inline int calc_dir_entry_size(size_t name_size, size_t extra_data_size)
 {
-    return align_offset(sizeof(Qcow2BitmapDirEntry) +
-                        name_size + extra_data_size, 8);
+    int size = sizeof(Qcow2BitmapDirEntry) + name_size + extra_data_size;
+    return ROUND_UP(size, 8);
 }

 static inline int dir_entry_size(Qcow2BitmapDirEntry *entry)
@@ -882,7 +882,7 @@ static int update_ext_header_and_dir(BlockDriverState *bs,
            return ret;
        }

-        ret = bdrv_flush(bs->file->bs);
+        ret = qcow2_flush_caches(bs);
        if (ret < 0) {
            goto fail;
        }
@@ -933,14 +933,14 @@ static void set_readonly_helper(gpointer bitmap, gpointer value)
    bdrv_dirty_bitmap_set_readonly(bitmap, (bool)value);
 }

-/* qcow2_load_autoloading_dirty_bitmaps()
+/* qcow2_load_dirty_bitmaps()
 * Return value is a hint for caller: true means that the Qcow2 header was
 * updated. (false doesn't mean that the header should be updated by the
 * caller, it just means that updating was not needed or the image cannot be
 * written to).
 * On failure the function returns false.
 */
-bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp)
+bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, Error **errp)
 {
    BDRVQcow2State *s = bs->opaque;
    Qcow2BitmapList *bm_list;
@@ -960,14 +960,16 @@ bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp)
    }

    QSIMPLEQ_FOREACH(bm, bm_list, entry) {
-        if ((bm->flags & BME_FLAG_AUTO) && !(bm->flags & BME_FLAG_IN_USE)) {
+        if (!(bm->flags & BME_FLAG_IN_USE)) {
            BdrvDirtyBitmap *bitmap = load_bitmap(bs, bm, errp);
            if (bitmap == NULL) {
                goto fail;
            }

+            if (!(bm->flags & BME_FLAG_AUTO)) {
+                bdrv_disable_dirty_bitmap(bitmap);
+            }
            bdrv_dirty_bitmap_set_persistance(bitmap, true);
-            bdrv_dirty_bitmap_set_autoload(bitmap, true);
            bm->flags |= BME_FLAG_IN_USE;
            created_dirty_bitmaps =
                    g_slist_append(created_dirty_bitmaps, bitmap);
@@ -1369,7 +1371,7 @@ void qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, Error **errp)
            bm->table.size = 0;
            QSIMPLEQ_INSERT_TAIL(&drop_tables, tb, entry);
        }
-        bm->flags = bdrv_dirty_bitmap_get_autoload(bitmap) ? BME_FLAG_AUTO : 0;
+        bm->flags = bdrv_dirty_bitmap_enabled(bitmap) ? BME_FLAG_AUTO : 0;
        bm->granularity_bits = ctz32(bdrv_dirty_bitmap_granularity(bitmap));
        bm->dirty_bitmap = bitmap;
    }
@@ -1449,6 +1451,16 @@ bool qcow2_can_store_new_dirty_bitmap(BlockDriverState *bs,
    bool found;
    Qcow2BitmapList *bm_list;

+    if (s->qcow_version < 3) {
+        /* Without autoclear_features, we would always have to assume
+         * that a program without persistent dirty bitmap support has
+         * accessed this qcow2 file when opening it, and would thus
+         * have to drop all dirty bitmaps (defeating their purpose).
+         */
+        error_setg(errp, "Cannot store dirty bitmaps in qcow2 v2 files");
+        goto fail;
+    }
+
    if (check_constraints_on_bitmap(bs, name, granularity, errp) != 0) {
        goto fail;
    }
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -39,26 +39,23 @@ struct Qcow2Cache {
    Qcow2CachedTable       *entries;
    struct Qcow2Cache      *depends;
    int                     size;
+    int                     table_size;
    bool                    depends_on_flush;
    void                   *table_array;
    uint64_t                lru_counter;
    uint64_t                cache_clean_lru_counter;
 };

-static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
-                    Qcow2Cache *c, int table)
+static inline void *qcow2_cache_get_table_addr(Qcow2Cache *c, int table)
 {
-    BDRVQcow2State *s = bs->opaque;
-    return (uint8_t *) c->table_array + (size_t) table * s->cluster_size;
+    return (uint8_t *) c->table_array + (size_t) table * c->table_size;
 }

-static inline int qcow2_cache_get_table_idx(BlockDriverState *bs,
-                  Qcow2Cache *c, void *table)
+static inline int qcow2_cache_get_table_idx(Qcow2Cache *c, void *table)
 {
-    BDRVQcow2State *s = bs->opaque;
    ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array;
-    int idx = table_offset / s->cluster_size;
-    assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0);
+    int idx = table_offset / c->table_size;
+    assert(idx >= 0 && idx < c->size && table_offset % c->table_size == 0);
    return idx;
 }

@@ -74,15 +71,13 @@ static inline const char *qcow2_cache_get_name(BDRVQcow2State *s, Qcow2Cache *c)
    }
 }

-static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
-                                      int i, int num_tables)
+static void qcow2_cache_table_release(Qcow2Cache *c, int i, int num_tables)
 {
 /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */
 #ifdef CONFIG_LINUX
-    BDRVQcow2State *s = bs->opaque;
-    void *t = qcow2_cache_get_table_addr(bs, c, i);
+    void *t = qcow2_cache_get_table_addr(c, i);
    int align = getpagesize();
-    size_t mem_size = (size_t) s->cluster_size * num_tables;
+    size_t mem_size = (size_t) c->table_size * num_tables;
    size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t;
    size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align);
    if (mem_size > offset && length > 0) {
@@ -98,7 +93,7 @@ static inline bool can_clean_entry(Qcow2Cache *c, int i)
        t->lru_counter <= c->cache_clean_lru_counter;
 }

-void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c)
+void qcow2_cache_clean_unused(Qcow2Cache *c)
 {
    int i = 0;
    while (i < c->size) {
@@ -118,23 +113,30 @@ void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c)
        }

        if (to_clean > 0) {
-            qcow2_cache_table_release(bs, c, i - to_clean, to_clean);
+            qcow2_cache_table_release(c, i - to_clean, to_clean);
        }
    }

    c->cache_clean_lru_counter = c->lru_counter;
 }

-Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
+                               unsigned table_size)
 {
    BDRVQcow2State *s = bs->opaque;
    Qcow2Cache *c;

+    assert(num_tables > 0);
+    assert(is_power_of_2(table_size));
+    assert(table_size >= (1 << MIN_CLUSTER_BITS));
+    assert(table_size <= s->cluster_size);
+
    c = g_new0(Qcow2Cache, 1);
    c->size = num_tables;
+    c->table_size = table_size;
    c->entries = g_try_new0(Qcow2CachedTable, num_tables);
    c->table_array = qemu_try_blockalign(bs->file->bs,
-                                         (size_t) num_tables * s->cluster_size);
+                                         (size_t) num_tables * c->table_size);

    if (!c->entries || !c->table_array) {
        qemu_vfree(c->table_array);
@@ -146,7 +148,7 @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
    return c;
 }

-int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c)
+int qcow2_cache_destroy(Qcow2Cache *c)
 {
    int i;

@@ -203,13 +205,13 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)

    if (c == s->refcount_block_cache) {
        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK,
-                c->entries[i].offset, s->cluster_size);
+                c->entries[i].offset, c->table_size);
    } else if (c == s->l2_table_cache) {
        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
-                c->entries[i].offset, s->cluster_size);
+                c->entries[i].offset, c->table_size);
    } else {
        ret = qcow2_pre_write_overlap_check(bs, 0,
-                c->entries[i].offset, s->cluster_size);
+                c->entries[i].offset, c->table_size);
    }

    if (ret < 0) {
@@ -223,7 +225,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
    }

    ret = bdrv_pwrite(bs->file, c->entries[i].offset,
-                      qcow2_cache_get_table_addr(bs, c, i), s->cluster_size);
+                      qcow2_cache_get_table_addr(c, i), c->table_size);
    if (ret < 0) {
        return ret;
    }
@@ -309,7 +311,7 @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c)
        c->entries[i].lru_counter = 0;
    }

-    qcow2_cache_table_release(bs, c, 0, c->size);
+    qcow2_cache_table_release(c, 0, c->size);

    c->lru_counter = 0;

@@ -331,7 +333,7 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
    trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache,
                          offset, read_from_disk);

-    if (offset_into_cluster(s, offset)) {
+    if (!QEMU_IS_ALIGNED(offset, c->table_size)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Cannot get entry from %s "
                                "cache: Offset %#" PRIx64 " is unaligned",
                                qcow2_cache_get_name(s, c), offset);
@@ -339,7 +341,7 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
    }

    /* Check if the table is already cached */
-    i = lookup_index = (offset / s->cluster_size * 4) % c->size;
+    i = lookup_index = (offset / c->table_size * 4) % c->size;
    do {
        const Qcow2CachedTable *t = &c->entries[i];
        if (t->offset == offset) {
@@ -379,8 +381,8 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
        }

        ret = bdrv_pread(bs->file, offset,
-                         qcow2_cache_get_table_addr(bs, c, i),
-                         s->cluster_size);
+                         qcow2_cache_get_table_addr(c, i),
+                         c->table_size);
        if (ret < 0) {
            return ret;
        }
@@ -391,7 +393,7 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
    /* And return the right table */
 found:
    c->entries[i].ref++;
-    *table = qcow2_cache_get_table_addr(bs, c, i);
+    *table = qcow2_cache_get_table_addr(c, i);

    trace_qcow2_cache_get_done(qemu_coroutine_self(),
                               c == s->l2_table_cache, i);
@@ -411,9 +413,9 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
    return qcow2_cache_do_get(bs, c, offset, table, false);
 }

-void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
+void qcow2_cache_put(Qcow2Cache *c, void **table)
 {
-    int i = qcow2_cache_get_table_idx(bs, c, *table);
+    int i = qcow2_cache_get_table_idx(c, *table);

    c->entries[i].ref--;
    *table = NULL;
@@ -425,30 +427,28 @@ void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
    assert(c->entries[i].ref >= 0);
 }

-void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
-     void *table)
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
 {
-    int i = qcow2_cache_get_table_idx(bs, c, table);
+    int i = qcow2_cache_get_table_idx(c, table);
    assert(c->entries[i].offset != 0);
    c->entries[i].dirty = true;
 }

-void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
-                                  uint64_t offset)
+void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset)
 {
    int i;

    for (i = 0; i < c->size; i++) {
        if (c->entries[i].offset == offset) {
-            return qcow2_cache_get_table_addr(bs, c, i);
+            return qcow2_cache_get_table_addr(c, i);
        }
    }
    return NULL;
 }

-void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table)
+void qcow2_cache_discard(Qcow2Cache *c, void *table)
 {
-    int i = qcow2_cache_get_table_idx(bs, c, table);
+    int i = qcow2_cache_get_table_idx(c, table);

    assert(c->entries[i].ref == 0);

@@ -456,5 +456,5 @@ void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table)
    c->entries[i].lru_counter = 0;
    c->entries[i].dirty = false;

-    qcow2_cache_table_release(bs, c, i, 1);
+    qcow2_cache_table_release(c, i, 1);
 }
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -127,11 +127,11 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs,
-                                       align_offset(new_l1_size2, 512));
+                                       ROUND_UP(new_l1_size2, 512));
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
-    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
+    memset(new_l1_table, 0, ROUND_UP(new_l1_size2, 512));

    if (s->l1_size) {
        memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
@@ -196,20 +196,26 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
 /*
 * l2_load
 *
- * Loads a L2 table into memory. If the table is in the cache, the cache
- * is used; otherwise the L2 table is loaded from the image file.
+ * @bs: The BlockDriverState
+ * @offset: A guest offset, used to calculate what slice of the L2
+ *          table to load.
+ * @l2_offset: Offset to the L2 table in the image file.
+ * @l2_slice: Location to store the pointer to the L2 slice.
 *
- * Returns a pointer to the L2 table on success, or NULL if the read from
- * the image file failed.
+ * Loads a L2 slice into memory (L2 slices are the parts of L2 tables
+ * that are loaded by the qcow2 cache). If the slice is in the cache,
+ * the cache is used; otherwise the L2 slice is loaded from the image
+ * file.
 */
-
-static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
-    uint64_t **l2_table)
+static int l2_load(BlockDriverState *bs, uint64_t offset,
+                   uint64_t l2_offset, uint64_t **l2_slice)
 {
    BDRVQcow2State *s = bs->opaque;
+    int start_of_slice = sizeof(uint64_t) *
+        (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));

-    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
-                           (void **)l2_table);
+    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
+                           (void **)l2_slice);
 }

 /*
@@ -258,11 +264,12 @@ int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
 *
 */

-static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
+static int l2_allocate(BlockDriverState *bs, int l1_index)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_l2_offset;
-    uint64_t *l2_table = NULL;
+    uint64_t *l2_slice = NULL;
+    unsigned slice, slice_size2, n_slices;
    int64_t l2_offset;
    int ret;

@@ -293,39 +300,47 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)

    /* allocate a new entry in the l2 cache */

+    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
+    n_slices = s->cluster_size / slice_size2;
+
    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
-    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
+    for (slice = 0; slice < n_slices; slice++) {
+        ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
+                                    l2_offset + slice * slice_size2,
+                                    (void **) &l2_slice);
        if (ret < 0) {
            goto fail;
        }

-    l2_table = *table;
-
        if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
-        /* if there was no old l2 table, clear the new table */
-        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+            /* if there was no old l2 table, clear the new slice */
+            memset(l2_slice, 0, slice_size2);
        } else {
-        uint64_t* old_table;
+            uint64_t *old_slice;
+            uint64_t old_l2_slice_offset =
+                (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;

-        /* if there was an old l2 table, read it from the disk */
+            /* if there was an old l2 table, read a slice from the disk */
            BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
-        ret = qcow2_cache_get(bs, s->l2_table_cache,
-            old_l2_offset & L1E_OFFSET_MASK,
-            (void**) &old_table);
+            ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
+                                  (void **) &old_slice);
            if (ret < 0) {
                goto fail;
            }

-        memcpy(l2_table, old_table, s->cluster_size);
+            memcpy(l2_slice, old_slice, slice_size2);

-        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
+            qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
        }

-    /* write the l2 table to the file */
+        /* write the l2 slice to the file */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

        trace_qcow2_l2_allocate_write_l2(bs, l1_index);
-    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
+    }
+
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
@@ -339,14 +354,13 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
        goto fail;
    }

-    *table = l2_table;
    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

 fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
-    if (l2_table != NULL) {
-        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
+    if (l2_slice != NULL) {
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }
    s->l1_table[l1_index] = old_l2_offset;
    if (l2_offset > 0) {
@@ -357,19 +371,19 @@ fail:
 }

 /*
- * Checks how many clusters in a given L2 table are contiguous in the image
+ * Checks how many clusters in a given L2 slice are contiguous in the image
 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 * to the first cluster, the search is stopped and the cluster is not counted
 * as contiguous. (This allows it, for example, to stop at the first compressed
 * cluster which may require a different handling)
 */
 static int count_contiguous_clusters(int nb_clusters, int cluster_size,
-        uint64_t *l2_table, uint64_t stop_flags)
+        uint64_t *l2_slice, uint64_t stop_flags)
 {
    int i;
    QCow2ClusterType first_cluster_type;
    uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
-    uint64_t first_entry = be64_to_cpu(l2_table[0]);
+    uint64_t first_entry = be64_to_cpu(l2_slice[0]);
    uint64_t offset = first_entry & mask;

    if (!offset) {
@@ -382,7 +396,7 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
           first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC);

    for (i = 0; i < nb_clusters; i++) {
-        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
+        uint64_t l2_entry = be64_to_cpu(l2_slice[i]) & mask;
        if (offset + (uint64_t) i * cluster_size != l2_entry) {
            break;
        }
@@ -393,10 +407,10 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,

 /*
 * Checks how many consecutive unallocated clusters in a given L2
- * table have the same cluster type.
+ * slice have the same cluster type.
 */
 static int count_contiguous_clusters_unallocated(int nb_clusters,
-                                                 uint64_t *l2_table,
+                                                 uint64_t *l2_slice,
                                                 QCow2ClusterType wanted_type)
 {
    int i;
@@ -404,7 +418,7 @@ static int count_contiguous_clusters_unallocated(int nb_clusters,
    assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN ||
           wanted_type == QCOW2_CLUSTER_UNALLOCATED);
    for (i = 0; i < nb_clusters; i++) {
-        uint64_t entry = be64_to_cpu(l2_table[i]);
+        uint64_t entry = be64_to_cpu(l2_slice[i]);
        QCow2ClusterType type = qcow2_get_cluster_type(entry);

        if (type != wanted_type) {
@@ -516,8 +530,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
 {
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
-    uint64_t l1_index, l2_offset, *l2_table;
-    int l1_bits, c;
+    uint64_t l1_index, l2_offset, *l2_slice;
+    int c;
    unsigned int offset_in_cluster;
    uint64_t bytes_available, bytes_needed, nb_clusters;
    QCow2ClusterType type;
@@ -526,12 +540,12 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
    offset_in_cluster = offset_into_cluster(s, offset);
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

-    l1_bits = s->l2_bits + s->cluster_bits;
-
    /* compute how many bytes there are between the start of the cluster
-     * containing offset and the end of the l1 entry */
-    bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
-                    + offset_in_cluster;
+     * containing offset and the end of the l2 slice that contains
+     * the entry pointing to it */
+    bytes_available =
+        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
+        << s->cluster_bits;

    if (bytes_needed > bytes_available) {
        bytes_needed = bytes_available;
@@ -541,7 +555,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,

    /* seek to the l2 offset in the l1 table */

-    l1_index = offset >> l1_bits;
+    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        type = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
@@ -560,17 +574,17 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
        return -EIO;
    }

-    /* load the l2 table in memory */
+    /* load the l2 slice in memory */

-    ret = l2_load(bs, l2_offset, &l2_table);
+    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

-    l2_index = offset_to_l2_index(s, offset);
-    *cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    l2_index = offset_to_l2_slice_index(s, offset);
+    *cluster_offset = be64_to_cpu(l2_slice[l2_index]);

    nb_clusters = size_to_clusters(s, bytes_needed);
    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
@@ -597,14 +611,14 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
    case QCOW2_CLUSTER_UNALLOCATED:
        /* how many empty clusters ? */
        c = count_contiguous_clusters_unallocated(nb_clusters,
-                                                  &l2_table[l2_index], type);
+                                                  &l2_slice[l2_index], type);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_ZERO_ALLOC:
    case QCOW2_CLUSTER_NORMAL:
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
-                                      &l2_table[l2_index], QCOW_OFLAG_ZERO);
+                                      &l2_slice[l2_index], QCOW_OFLAG_ZERO);
        *cluster_offset &= L2E_OFFSET_MASK;
        if (offset_into_cluster(s, *cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1,
@@ -620,7 +634,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
        abort();
    }

-    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    bytes_available = (int64_t)c * s->cluster_size;

@@ -638,7 +652,7 @@ out:
    return type;

 fail:
-    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
    return ret;
 }

@@ -646,26 +660,25 @@ fail:
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
- * the l2 table.
+ * the appropriate slice of its l2 table.
 *
- * the l2 table offset in the qcow2 file and the cluster index
- * in the l2 table are given to the caller.
+ * the cluster index in the l2 slice is given to the caller.
 *
 * Returns 0 on success, -errno in failure case
 */
 static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
-                             uint64_t **new_l2_table,
+                             uint64_t **new_l2_slice,
                             int *new_l2_index)
 {
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
-    uint64_t *l2_table = NULL;
+    uint64_t *l2_slice = NULL;
    int ret;

    /* seek to the l2 offset in the l1 table */

-    l1_index = offset >> (s->l2_bits + s->cluster_bits);
+    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
@@ -682,17 +695,9 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
        return -EIO;
    }

-    /* seek the l2 table of the given l2 offset */
-
-    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
-        /* load the l2 table in memory */
-        ret = l2_load(bs, l2_offset, &l2_table);
-        if (ret < 0) {
-            return ret;
-        }
-    } else {
+    if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
        /* First allocate a new L2 table (and do COW if needed) */
-        ret = l2_allocate(bs, l1_index, &l2_table);
+        ret = l2_allocate(bs, l1_index);
        if (ret < 0) {
            return ret;
        }
@@ -702,13 +707,23 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
+
+        /* Get the offset of the newly-allocated l2 table */
+        l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+        assert(offset_into_cluster(s, l2_offset) == 0);
+    }
+
+    /* load the l2 slice in memory */
+    ret = l2_load(bs, offset, l2_offset, &l2_slice);
+    if (ret < 0) {
+        return ret;
    }

    /* find the cluster offset for the given disk offset */

-    l2_index = offset_to_l2_index(s, offset);
+    l2_index = offset_to_l2_slice_index(s, offset);

-    *new_l2_table = l2_table;
+    *new_l2_slice = l2_slice;
    *new_l2_index = l2_index;

    return 0;
@@ -733,26 +748,26 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 {
    BDRVQcow2State *s = bs->opaque;
    int l2_index, ret;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
    int64_t cluster_offset;
    int nb_csectors;

-    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return 0;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated. */
-    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    cluster_offset = be64_to_cpu(l2_slice[l2_index]);
    if (cluster_offset & L2E_OFFSET_MASK) {
-        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        return 0;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
-        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        return 0;
    }

@@ -767,9 +782,9 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
    /* compressed clusters never have the copied flag */

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
-    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
-    l2_table[l2_index] = cpu_to_be64(cluster_offset);
-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
+    l2_slice[l2_index] = cpu_to_be64(cluster_offset);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return cluster_offset;
 }
@@ -908,7 +923,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 {
    BDRVQcow2State *s = bs->opaque;
    int i, j = 0, l2_index, ret;
-    uint64_t *old_cluster, *l2_table;
+    uint64_t *old_cluster, *l2_slice;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
@@ -935,13 +950,13 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
                                   s->refcount_block_cache);
    }

-    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
    if (ret < 0) {
        goto err;
    }
-    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);

-    assert(l2_index + m->nb_clusters <= s->l2_size);
+    assert(l2_index + m->nb_clusters <= s->l2_slice_size);
    for (i = 0; i < m->nb_clusters; i++) {
        /* if two concurrent writes happen to the same unallocated cluster
         * each write allocates separate cluster and writes data concurrently.
@@ -949,16 +964,16 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
         * cluster the second one has to do RMW (which is done above by
         * perform_cow()), update l2 table with its cluster pointer and free
         * old cluster. This is what this loop does */
-        if (l2_table[l2_index + i] != 0) {
-            old_cluster[j++] = l2_table[l2_index + i];
+        if (l2_slice[l2_index + i] != 0) {
+            old_cluster[j++] = l2_slice[l2_index + i];
        }

-        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
+        l2_slice[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
     }


-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
@@ -985,12 +1000,12 @@ err:
 * which must copy from the backing file)
 */
 static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
-    uint64_t *l2_table, int l2_index)
+    uint64_t *l2_slice, int l2_index)
 {
    int i;

    for (i = 0; i < nb_clusters; i++) {
-        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
+        uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
        QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

        switch(cluster_type) {
@@ -1105,7 +1120,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t cluster_offset;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
    uint64_t nb_clusters;
    unsigned int keep_clusters;
    int ret;
@@ -1117,23 +1132,23 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
                                == offset_into_cluster(s, *host_offset));

    /*
-     * Calculate the number of clusters to look for. We stop at L2 table
+     * Calculate the number of clusters to look for. We stop at L2 slice
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

-    l2_index = offset_to_l2_index(s, guest_offset);
-    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+    l2_index = offset_to_l2_slice_index(s, guest_offset);
+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
-    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

-    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    cluster_offset = be64_to_cpu(l2_slice[l2_index]);

    /* Check how many clusters are already allocated and don't need COW */
    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
@@ -1161,7 +1176,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
        /* We keep all QCOW_OFLAG_COPIED clusters */
        keep_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
-                                      &l2_table[l2_index],
+                                      &l2_slice[l2_index],
                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
        assert(keep_clusters <= nb_clusters);

@@ -1176,7 +1191,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,

    /* Cleanup */
 out:
-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
@@ -1260,7 +1275,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
 {
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
    uint64_t entry;
    uint64_t nb_clusters;
    int ret;
@@ -1273,29 +1288,29 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    assert(*bytes > 0);

    /*
-     * Calculate the number of clusters to look for. We stop at L2 table
+     * Calculate the number of clusters to look for. We stop at L2 slice
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

-    l2_index = offset_to_l2_index(s, guest_offset);
-    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+    l2_index = offset_to_l2_slice_index(s, guest_offset);
+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
-    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

-    entry = be64_to_cpu(l2_table[l2_index]);
+    entry = be64_to_cpu(l2_slice[l2_index]);

    /* For the moment, overwrite compressed clusters one by one */
    if (entry & QCOW_OFLAG_COMPRESSED) {
        nb_clusters = 1;
    } else {
-        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
+        nb_clusters = count_cow_clusters(s, nb_clusters, l2_slice, l2_index);
    }

    /* This function is only called when there were no non-COW clusters, so if
@@ -1324,7 +1339,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
         * nb_clusters already to a range of COW clusters */
        preallocated_nb_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
-                                      &l2_table[l2_index], QCOW_OFLAG_COPIED);
+                                      &l2_slice[l2_index], QCOW_OFLAG_COPIED);
        assert(preallocated_nb_clusters > 0);

        nb_clusters = preallocated_nb_clusters;
@@ -1335,7 +1350,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
        keep_old_clusters = true;
    }

-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    if (!alloc_cluster_offset) {
        /* Allocate, if necessary at a given offset in the image file */
@@ -1617,32 +1632,32 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)

 /*
 * This discards as many clusters of nb_clusters as possible at once (i.e.
- * all clusters in the same L2 table) and returns the number of discarded
+ * all clusters in the same L2 slice) and returns the number of discarded
 * clusters.
 */
-static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
-                             uint64_t nb_clusters, enum qcow2_discard_type type,
-                             bool full_discard)
+static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
+                               uint64_t nb_clusters,
+                               enum qcow2_discard_type type, bool full_discard)
 {
    BDRVQcow2State *s = bs->opaque;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
    int l2_index;
    int ret;
    int i;

-    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

-    /* Limit nb_clusters to one L2 table */
-    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+    /* Limit nb_clusters to one L2 slice */
+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry;

-        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);
+        old_l2_entry = be64_to_cpu(l2_slice[l2_index + i]);

        /*
         * If full_discard is false, make sure that a discarded area reads back
@@ -1680,18 +1695,18 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
        }

        /* First remove L2 entries */
-        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        if (!full_discard && s->qcow_version >= 3) {
-            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
+            l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
        } else {
-            l2_table[l2_index + i] = cpu_to_be64(0);
+            l2_slice[l2_index + i] = cpu_to_be64(0);
        }

        /* Then decrease the refcount */
        qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
    }

-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return nb_clusters;
 }
@@ -1715,9 +1730,9 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,

    s->cache_discards = true;

-    /* Each L2 table is handled by its own loop iteration */
+    /* Each L2 slice is handled by its own loop iteration */
    while (nb_clusters > 0) {
-        cleared = discard_single_l2(bs, offset, nb_clusters, type,
+        cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
                                      full_discard);
        if (cleared < 0) {
            ret = cleared;
@@ -1738,33 +1753,33 @@ fail:

 /*
 * This zeroes as many clusters of nb_clusters as possible at once (i.e.
- * all clusters in the same L2 table) and returns the number of zeroed
+ * all clusters in the same L2 slice) and returns the number of zeroed
 * clusters.
 */
-static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
+static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
                            uint64_t nb_clusters, int flags)
 {
    BDRVQcow2State *s = bs->opaque;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
    int l2_index;
    int ret;
    int i;
    bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP);

-    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

-    /* Limit nb_clusters to one L2 table */
-    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+    /* Limit nb_clusters to one L2 slice */
+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_offset;
        QCow2ClusterType cluster_type;

-        old_offset = be64_to_cpu(l2_table[l2_index + i]);
+        old_offset = be64_to_cpu(l2_slice[l2_index + i]);

        /*
         * Minimize L2 changes if the cluster already reads back as
@@ -1776,16 +1791,16 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
            continue;
        }

-        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
-            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
+            l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
        } else {
-            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
+            l2_slice[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
        }
    }

-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return nb_clusters;
 }
@@ -1809,13 +1824,13 @@ int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
        return -ENOTSUP;
    }

-    /* Each L2 table is handled by its own loop iteration */
+    /* Each L2 slice is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, bytes);

    s->cache_discards = true;

    while (nb_clusters > 0) {
-        cleared = zero_single_l2(bs, offset, nb_clusters, flags);
+        cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags);
        if (cleared < 0) {
            ret = cleared;
            goto fail;
@@ -1849,22 +1864,25 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
 {
    BDRVQcow2State *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
-    uint64_t *l2_table = NULL;
+    uint64_t *l2_slice = NULL;
+    unsigned slice, slice_size2, n_slices;
    int ret;
    int i, j;

+    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
+    n_slices = s->cluster_size / slice_size2;
+
    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
-        l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size);
-        if (l2_table == NULL) {
+        l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2);
+        if (l2_slice == NULL) {
            return -ENOMEM;
        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
-        bool l2_dirty = false;
        uint64_t l2_refcount;

        if (!l2_offset) {
@@ -1884,29 +1902,32 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
            goto fail;
        }

-        if (is_active_l1) {
-            /* get active L2 tables from cache */
-            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
-                    (void **)&l2_table);
-        } else {
-            /* load inactive L2 tables from disk */
-            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
-                            (void *)l2_table, s->cluster_sectors);
-        }
-        if (ret < 0) {
-            goto fail;
-        }
-
        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &l2_refcount);
        if (ret < 0) {
            goto fail;
        }

-        for (j = 0; j < s->l2_size; j++) {
-            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
+        for (slice = 0; slice < n_slices; slice++) {
+            uint64_t slice_offset = l2_offset + slice * slice_size2;
+            bool l2_dirty = false;
+            if (is_active_l1) {
+                /* get active L2 tables from cache */
+                ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset,
+                                      (void **)&l2_slice);
+            } else {
+                /* load inactive L2 tables from disk */
+                ret = bdrv_pread(bs->file, slice_offset, l2_slice, slice_size2);
+            }
+            if (ret < 0) {
+                goto fail;
+            }
+
+            for (j = 0; j < s->l2_slice_size; j++) {
+                uint64_t l2_entry = be64_to_cpu(l2_slice[j]);
                int64_t offset = l2_entry & L2E_OFFSET_MASK;
-            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
+                QCow2ClusterType cluster_type =
+                    qcow2_get_cluster_type(l2_entry);

                if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
                    cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
@@ -1917,7 +1938,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                    if (!bs->backing) {
                        /* not backed; therefore we can simply deallocate the
                         * cluster */
-                    l2_table[j] = 0;
+                        l2_slice[j] = 0;
                        l2_dirty = true;
                        continue;
                    }
@@ -1929,10 +1950,10 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                    }

                    if (l2_refcount > 1) {
-                    /* For shared L2 tables, set the refcount accordingly (it is
-                     * already 1 and needs to be l2_refcount) */
-                    ret = qcow2_update_cluster_refcount(bs,
-                            offset >> s->cluster_bits,
+                        /* For shared L2 tables, set the refcount accordingly
+                         * (it is already 1 and needs to be l2_refcount) */
+                        ret = qcow2_update_cluster_refcount(
+                            bs, offset >> s->cluster_bits,
                            refcount_diff(1, l2_refcount), false,
                            QCOW2_DISCARD_OTHER);
                        if (ret < 0) {
@@ -1944,11 +1965,13 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                }

                if (offset_into_cluster(s, offset)) {
-                qcow2_signal_corruption(bs, true, -1, -1,
+                    int l2_index = slice * s->l2_slice_size + j;
+                    qcow2_signal_corruption(
+                        bs, true, -1, -1,
                        "Cluster allocation offset "
                        "%#" PRIx64 " unaligned (L2 offset: %#"
                        PRIx64 ", L2 index: %#x)", offset,
-                                        l2_offset, j);
+                        l2_offset, l2_index);
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
@@ -1957,7 +1980,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                    goto fail;
                }

-            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
+                ret = qcow2_pre_write_overlap_check(bs, 0, offset,
+                                                    s->cluster_size);
                if (ret < 0) {
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
@@ -1976,35 +2000,36 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                }

                if (l2_refcount == 1) {
-                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
+                    l2_slice[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
                } else {
-                l2_table[j] = cpu_to_be64(offset);
+                    l2_slice[j] = cpu_to_be64(offset);
                }
                l2_dirty = true;
            }

            if (is_active_l1) {
                if (l2_dirty) {
-                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
                    qcow2_cache_depends_on_flush(s->l2_table_cache);
                }
-            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            } else {
                if (l2_dirty) {
-                ret = qcow2_pre_write_overlap_check(bs,
-                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
-                        s->cluster_size);
+                    ret = qcow2_pre_write_overlap_check(
+                        bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
+                        slice_offset, slice_size2);
                    if (ret < 0) {
                        goto fail;
                    }

-                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
-                                 (void *)l2_table, s->cluster_sectors);
+                    ret = bdrv_pwrite(bs->file, slice_offset,
+                                      l2_slice, slice_size2);
                    if (ret < 0) {
                        goto fail;
                    }
                }
            }
+        }

        (*visited_l1_entries)++;
        if (status_cb) {
@@ -2015,11 +2040,11 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
    ret = 0;

 fail:
-    if (l2_table) {
+    if (l2_slice) {
        if (!is_active_l1) {
-            qemu_vfree(l2_table);
+            qemu_vfree(l2_slice);
        } else {
-            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+            qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        }
    }
    return ret;
@@ -2068,14 +2093,31 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs,
    }

    for (i = 0; i < s->nb_snapshots; i++) {
-        int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size *
-                                      sizeof(uint64_t), BDRV_SECTOR_SIZE);
+        int l1_size2;
+        uint64_t *new_l1_table;
+        Error *local_err = NULL;

-        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
+        ret = qcow2_validate_table(bs, s->snapshots[i].l1_table_offset,
+                                   s->snapshots[i].l1_size, sizeof(uint64_t),
+                                   QCOW_MAX_L1_SIZE, "Snapshot L1 table",
+                                   &local_err);
+        if (ret < 0) {
+            error_report_err(local_err);
+            goto fail;
+        }

-        ret = bdrv_read(bs->file,
-                        s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
-                        (void *)l1_table, l1_sectors);
+        l1_size2 = s->snapshots[i].l1_size * sizeof(uint64_t);
+        new_l1_table = g_try_realloc(l1_table, l1_size2);
+
+        if (!new_l1_table) {
+            ret = -ENOMEM;
+            goto fail;
+        }
+
+        l1_table = new_l1_table;
+
+        ret = bdrv_pread(bs->file, s->snapshots[i].l1_table_offset,
+                         l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -277,7 +277,7 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

-    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
 }
@@ -421,7 +421,7 @@ static int alloc_refcount_block(BlockDriverState *bs,

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
-    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
+    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
@@ -449,7 +449,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
        return -EAGAIN;
    }

-    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
+    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
@@ -501,7 +501,7 @@ static int alloc_refcount_block(BlockDriverState *bs,

 fail:
    if (*refcount_block != NULL) {
-        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
+        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
 }
@@ -623,7 +623,7 @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
-            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
+            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
@@ -656,11 +656,11 @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                s->set_refcount(refblock_data, j, 1);
            }

-            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
+            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

-        qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data);
+        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);
@@ -836,7 +836,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
-                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
@@ -845,8 +845,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
        }
        old_table_index = table_index;

-        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
-                                     refcount_block);
+        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);
@@ -872,16 +871,16 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
        if (refcount == 0) {
            void *table;

-            table = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
+            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                offset);
            if (table != NULL) {
-                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
-                qcow2_cache_discard(bs, s->refcount_block_cache, table);
+                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
+                qcow2_cache_discard(s->refcount_block_cache, table);
            }

-            table = qcow2_cache_is_table_offset(bs, s->l2_table_cache, offset);
+            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
            if (table != NULL) {
-                qcow2_cache_discard(bs, s->l2_table_cache, table);
+                qcow2_cache_discard(s->l2_table_cache, table);
            }

            if (s->discard_passthrough[type]) {
@@ -898,7 +897,7 @@ fail:

    /* Write last changed block to disk */
    if (refcount_block) {
-        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    }

    /*
@@ -1172,7 +1171,35 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
    }
 }

+int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int ret;

+    ret = qcow2_cache_write(bs, s->l2_table_cache);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (qcow2_need_accurate_refcounts(s)) {
+        ret = qcow2_cache_write(bs, s->refcount_block_cache);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
+{
+    int ret = qcow2_write_caches(bs);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return bdrv_flush(bs->file->bs);
+}

 /*********************************************************/
 /* snapshots and image creation */
@@ -1184,17 +1211,20 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
 {
    BDRVQcow2State *s = bs->opaque;
-    uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
+    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
+    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

-    l2_table = NULL;
+    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
+    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
+    n_slices = s->cluster_size / slice_size2;

    s->cache_discards = true;

@@ -1202,7 +1232,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
-        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
+        l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
@@ -1237,17 +1267,19 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                goto fail;
            }

-            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
-                (void**) &l2_table);
+            for (slice = 0; slice < n_slices; slice++) {
+                ret = qcow2_cache_get(bs, s->l2_table_cache,
+                                      l2_offset + slice * slice_size2,
+                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

-            for (j = 0; j < s->l2_size; j++) {
+                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

-                entry = be64_to_cpu(l2_table[j]);
+                    entry = be64_to_cpu(l2_slice[j]);
                    old_entry = entry;
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;
@@ -1257,8 +1289,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                        nb_csectors = ((entry >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
-                        ret = update_refcount(bs,
-                                (entry & s->cluster_offset_mask) & ~511,
+                            ret = update_refcount(
+                                bs, (entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
@@ -1272,11 +1304,14 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
-                        qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
+                            /* Here l2_index means table (not slice) index */
+                            int l2_index = slice * s->l2_slice_size + j;
+                            qcow2_signal_corruption(
+                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
-                                                offset, l2_offset, j);
+                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }
@@ -1284,8 +1319,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
-                        ret = qcow2_update_cluster_refcount(bs,
-                                    cluster_index, abs(addend), addend < 0,
+                            ret = qcow2_update_cluster_refcount(
+                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
@@ -1315,13 +1350,14 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
-                    l2_table[j] = cpu_to_be64(entry);
-                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
-                                                 l2_table);
+                        l2_slice[j] = cpu_to_be64(entry);
+                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
+                                                     l2_slice);
                    }
                }

-            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
+            }

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
@@ -1348,8 +1384,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,

    ret = bdrv_flush(bs);
 fail:
-    if (l2_table) {
-        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    if (l2_slice) {
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    s->cache_discards = false;
@@ -1508,7 +1544,7 @@ enum {
 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size, int64_t l2_offset,
-                              int flags)
+                              int flags, BdrvCheckMode fix)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table, l2_entry;
@@ -1579,6 +1615,57 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                next_contiguous_offset = offset + s->cluster_size;
            }

+            /* Correct offsets are cluster aligned */
+            if (offset_into_cluster(s, offset)) {
+                if (qcow2_get_cluster_type(l2_entry) ==
+                    QCOW2_CLUSTER_ZERO_ALLOC)
+                {
+                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
+                            "cluster is not properly aligned; L2 entry "
+                            "corrupted.\n",
+                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
+                            offset);
+                    if (fix & BDRV_FIX_ERRORS) {
+                        uint64_t l2e_offset =
+                            l2_offset + (uint64_t)i * sizeof(uint64_t);
+
+                        l2_entry = QCOW_OFLAG_ZERO;
+                        l2_table[i] = cpu_to_be64(l2_entry);
+                        ret = qcow2_pre_write_overlap_check(bs,
+                                QCOW2_OL_ACTIVE_L2 | QCOW2_OL_INACTIVE_L2,
+                                l2e_offset, sizeof(uint64_t));
+                        if (ret < 0) {
+                            fprintf(stderr, "ERROR: Overlap check failed\n");
+                            res->check_errors++;
+                            /* Something is seriously wrong, so abort checking
+                             * this L2 table */
+                            goto fail;
+                        }
+
+                        ret = bdrv_pwrite_sync(bs->file, l2e_offset,
+                                               &l2_table[i], sizeof(uint64_t));
+                        if (ret < 0) {
+                            fprintf(stderr, "ERROR: Failed to overwrite L2 "
+                                    "table entry: %s\n", strerror(-ret));
+                            res->check_errors++;
+                            /* Do not abort, continue checking the rest of this
+                             * L2 table's entries */
+                        } else {
+                            res->corruptions_fixed++;
+                            /* Skip marking the cluster as used
+                             * (it is unused now) */
+                            continue;
+                        }
+                    } else {
+                        res->corruptions++;
+                    }
+                } else {
+                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
+                        "not properly aligned; L2 entry corrupted.\n", offset);
+                    res->corruptions++;
+                }
+            }
+
            /* Mark cluster as used */
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
@@ -1586,13 +1673,6 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
            if (ret < 0) {
                goto fail;
            }
-
-            /* Correct offsets are cluster aligned */
-            if (offset_into_cluster(s, offset)) {
-                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
-                    "properly aligned; L2 entry corrupted.\n", offset);
-                res->corruptions++;
-            }
            break;
        }

@@ -1626,7 +1706,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
                              void **refcount_table,
                              int64_t *refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
-                              int flags)
+                              int flags, BdrvCheckMode fix)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL, l2_offset, l1_size2;
@@ -1681,7 +1761,8 @@ static int check_refcounts_l1(BlockDriverState *bs,

            /* Process and check L2 entries */
            ret = check_refcounts_l2(bs, res, refcount_table,
-                                     refcount_table_size, l2_offset, flags);
+                                     refcount_table_size, l2_offset, flags,
+                                     fix);
            if (ret < 0) {
                goto fail;
            }
@@ -1957,7 +2038,8 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,

    /* current L1 table */
    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
-                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
+                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
+                             fix);
    if (ret < 0) {
        return ret;
    }
@@ -1965,8 +2047,22 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
    /* snapshots */
    for (i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
+        if (offset_into_cluster(s, sn->l1_table_offset)) {
+            fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
+                    "L1 table is not cluster aligned; snapshot table entry "
+                    "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
+            res->corruptions++;
+            continue;
+        }
+        if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
+            fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
+                    "L1 table is too large; snapshot table entry corrupted\n",
+                    sn->id_str, sn->name, sn->l1_size);
+            res->corruptions++;
+            continue;
+        }
        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
-                                 sn->l1_table_offset, sn->l1_size, 0);
+                                 sn->l1_table_offset, sn->l1_size, 0, fix);
        if (ret < 0) {
            return ret;
        }
@@ -2499,7 +2595,7 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
    }

    /* align range to test to cluster boundaries */
-    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
+    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
    offset = start_of_cluster(s, offset);

    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
@@ -2560,9 +2656,17 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
            uint32_t l1_sz  = s->snapshots[i].l1_size;
            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
-            uint64_t *l1 = g_try_malloc(l1_sz2);
+            uint64_t *l1;
            int ret;

+            ret = qcow2_validate_table(bs, l1_ofs, l1_sz, sizeof(uint64_t),
+                                       QCOW_MAX_L1_SIZE, "", NULL);
+            if (ret < 0) {
+                return ret;
+            }
+
+            l1 = g_try_malloc(l1_sz2);
+
            if (l1_sz2 && l1 == NULL) {
                return -ENOMEM;
            }
@@ -2803,7 +2907,7 @@ static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                                    new_reftable_size, new_refblock,
                                    new_refblock_empty, allocated, errp);
                    if (ret < 0) {
-                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+                        qcow2_cache_put(s->refcount_block_cache, &refblock);
                        return ret;
                    }

@@ -2816,7 +2920,7 @@ static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
                    uint64_t offset;

-                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+                    qcow2_cache_put(s->refcount_block_cache, &refblock);

                    offset = ((reftable_index << s->refcount_block_bits)
                              + refblock_index) << s->cluster_bits;
@@ -2837,7 +2941,7 @@ static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                new_refblock_empty = new_refblock_empty && refcount == 0;
            }

-            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+            qcow2_cache_put(s->refcount_block_cache, &refblock);
        } else {
            /* No refblock means every refcount is 0 */
            for (refblock_index = 0; refblock_index < s->refcount_block_size;
@@ -3129,24 +3233,24 @@ static int qcow2_discard_refcount_block(BlockDriverState *bs,
                                offset_to_reftable_index(s, discard_block_offs),
                                discard_block_offs,
                                s->get_refcount(refblock, block_index));
-        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+        qcow2_cache_put(s->refcount_block_cache, &refblock);
        return -EINVAL;
    }
    s->set_refcount(refblock, block_index, 0);

-    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, refblock);
+    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);

-    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+    qcow2_cache_put(s->refcount_block_cache, &refblock);

    if (cluster_index < s->free_cluster_index) {
        s->free_cluster_index = cluster_index;
    }

-    refblock = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
+    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                           discard_block_offs);
    if (refblock) {
        /* discard refblock from the cache if refblock is cached */
-        qcow2_cache_discard(bs, s->refcount_block_cache, refblock);
+        qcow2_cache_discard(s->refcount_block_cache, refblock);
    }
    update_refcount_discard(bs, discard_block_offs, s->cluster_size);

@@ -3189,7 +3293,7 @@ int qcow2_shrink_reftable(BlockDriverState *bs)
        } else {
            unused_block = buffer_is_zero(refblock, s->cluster_size);
        }
-        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+        qcow2_cache_put(s->refcount_block_cache, &refblock);

        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
    }
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -66,7 +66,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)

    for(i = 0; i < s->nb_snapshots; i++) {
        /* Read statically sized part of the snapshot header */
-        offset = align_offset(offset, 8);
+        offset = ROUND_UP(offset, 8);
        ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
        if (ret < 0) {
            goto fail;
@@ -155,7 +155,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
    offset = 0;
    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
-        offset = align_offset(offset, 8);
+        offset = ROUND_UP(offset, 8);
        offset += sizeof(h);
        offset += sizeof(extra);
        offset += strlen(sn->id_str);
@@ -215,7 +215,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
        assert(id_str_size <= UINT16_MAX && name_size <= UINT16_MAX);
        h.id_str_size = cpu_to_be16(id_str_size);
        h.name_size = cpu_to_be16(name_size);
-        offset = align_offset(offset, 8);
+        offset = ROUND_UP(offset, 8);

        ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h));
        if (ret < 0) {
@@ -441,7 +441,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    /* The VM state isn't needed any more in the active L1 table; in fact, it
     * hurts by causing expensive COW for the next snapshot. */
    qcow2_cluster_discard(bs, qcow2_vm_state_offset(s),
-                          align_offset(sn->vm_state_size, s->cluster_size),
+                          ROUND_UP(sn->vm_state_size, s->cluster_size),
                          QCOW2_DISCARD_NEVER, false);

 #ifdef DEBUG_ALLOC
@@ -465,6 +465,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
 {
    BDRVQcow2State *s = bs->opaque;
    QCowSnapshot *sn;
+    Error *local_err = NULL;
    int i, snapshot_index;
    int cur_l1_bytes, sn_l1_bytes;
    int ret;
@@ -477,6 +478,14 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
    }
    sn = &s->snapshots[snapshot_index];

+    ret = qcow2_validate_table(bs, sn->l1_table_offset, sn->l1_size,
+                               sizeof(uint64_t), QCOW_MAX_L1_SIZE,
+                               "Snapshot L1 table", &local_err);
+    if (ret < 0) {
+        error_report_err(local_err);
+        goto fail;
+    }
+
    if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) {
        error_report("qcow2: Loading snapshots with different disk "
            "size is not implemented");
@@ -602,6 +611,13 @@ int qcow2_snapshot_delete(BlockDriverState *bs,
    }
    sn = s->snapshots[snapshot_index];

+    ret = qcow2_validate_table(bs, sn.l1_table_offset, sn.l1_size,
+                               sizeof(uint64_t), QCOW_MAX_L1_SIZE,
+                               "Snapshot L1 table", errp);
+    if (ret < 0) {
+        return ret;
+    }
+
    /* Remove it from the snapshot list */
    memmove(s->snapshots + snapshot_index,
            s->snapshots + snapshot_index + 1,
@@ -704,13 +720,15 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
    sn = &s->snapshots[snapshot_index];

    /* Allocate and read in the snapshot's L1 table */
-    if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
-        error_setg(errp, "Snapshot L1 table too large");
-        return -EFBIG;
+    ret = qcow2_validate_table(bs, sn->l1_table_offset, sn->l1_size,
+                               sizeof(uint64_t), QCOW_MAX_L1_SIZE,
+                               "Snapshot L1 table", errp);
+    if (ret < 0) {
+        return ret;
    }
    new_l1_bytes = sn->l1_size * sizeof(uint64_t);
    new_l1_table = qemu_try_blockalign(bs->file->bs,
-                                       align_offset(new_l1_bytes, 512));
+                                       ROUND_UP(new_l1_bytes, 512));
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
--- a/block/qcow2.c
+++ b/block/qcow2.c
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -68,7 +68,7 @@
 #define MAX_CLUSTER_BITS 21

 /* Must be at least 2 to cover COW */
-#define MIN_L2_CACHE_SIZE 2 /* clusters */
+#define MIN_L2_CACHE_SIZE 2 /* cache entries */

 /* Must be at least 4 to cover all cases of refcount table growth */
 #define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
@@ -100,6 +100,7 @@
 #define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2"
 #define QCOW2_OPT_CACHE_SIZE "cache-size"
 #define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
+#define QCOW2_OPT_L2_CACHE_ENTRY_SIZE "l2-cache-entry-size"
 #define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"
 #define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval"

@@ -251,6 +252,7 @@ typedef struct BDRVQcow2State {
    int cluster_bits;
    int cluster_size;
    int cluster_sectors;
+    int l2_slice_size;
    int l2_bits;
    int l2_size;
    int l1_size;
@@ -463,15 +465,19 @@ static inline int64_t size_to_l1(BDRVQcow2State *s, int64_t size)
    return (size + (1ULL << shift) - 1) >> shift;
 }

+static inline int offset_to_l1_index(BDRVQcow2State *s, uint64_t offset)
+{
+    return offset >> (s->l2_bits + s->cluster_bits);
+}
+
 static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset)
 {
    return (offset >> s->cluster_bits) & (s->l2_size - 1);
 }

-static inline int64_t align_offset(int64_t offset, int n)
+static inline int offset_to_l2_slice_index(BDRVQcow2State *s, int64_t offset)
 {
-    offset = (offset + n - 1) & ~(n - 1);
-    return offset;
+    return (offset >> s->cluster_bits) & (s->l2_slice_size - 1);
 }

 static inline int64_t qcow2_vm_state_offset(BDRVQcow2State *s)
@@ -479,11 +485,6 @@ static inline int64_t qcow2_vm_state_offset(BDRVQcow2State *s)
    return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
 }

-static inline uint64_t qcow2_max_refcount_clusters(BDRVQcow2State *s)
-{
-    return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits;
-}
-
 static inline QCow2ClusterType qcow2_get_cluster_type(uint64_t l2_entry)
 {
    if (l2_entry & QCOW_OFLAG_COMPRESSED) {
@@ -528,9 +529,6 @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
 }

 /* qcow2.c functions */
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                  int64_t sector_num, int nb_sectors);
-
 int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                     int refcount_order, bool generous_increase,
                                     uint64_t *refblock_count);
@@ -544,6 +542,11 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
                             int64_t size, const char *message_format, ...)
                             GCC_FMT_ATTR(5, 6);

+int qcow2_validate_table(BlockDriverState *bs, uint64_t offset,
+                         uint64_t entries, size_t entry_len,
+                         int64_t max_size_bytes, const char *table_name,
+                         Error **errp);
+
 /* qcow2-refcount.c functions */
 int qcow2_refcount_init(BlockDriverState *bs);
 void qcow2_refcount_close(BlockDriverState *bs);
@@ -573,6 +576,8 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend);

+int coroutine_fn qcow2_flush_caches(BlockDriverState *bs);
+int coroutine_fn qcow2_write_caches(BlockDriverState *bs);
 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix);

@@ -639,34 +644,33 @@ void qcow2_free_snapshots(BlockDriverState *bs);
 int qcow2_read_snapshots(BlockDriverState *bs);

 /* qcow2-cache.c functions */
-Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
-int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
+                               unsigned table_size);
+int qcow2_cache_destroy(Qcow2Cache *c);

-void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
-     void *table);
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
 int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
 int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c);
 int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
    Qcow2Cache *dependency);
 void qcow2_cache_depends_on_flush(Qcow2Cache *c);

-void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c);
+void qcow2_cache_clean_unused(Qcow2Cache *c);
 int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c);

 int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
    void **table);
 int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
    void **table);
-void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
-void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
-                                  uint64_t offset);
-void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table);
+void qcow2_cache_put(Qcow2Cache *c, void **table);
+void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset);
+void qcow2_cache_discard(Qcow2Cache *c, void *table);

 /* qcow2-bitmap.c functions */
 int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                                  void **refcount_table,
                                  int64_t *refcount_table_size);
-bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp);
+bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, Error **errp);
 int qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp);
 void qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, Error **errp);
 int qcow2_reopen_bitmaps_ro(BlockDriverState *bs, Error **errp);
--- a/block/qed-check.c
+++ b/block/qed-check.c
@@ -217,6 +217,7 @@ static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result)
    qed_write_header_sync(s);
 }

+/* Called with table_lock held.  */
 int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
 {
    QEDCheck check = {
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -18,7 +18,7 @@
 #include "qed.h"
 #include "qemu/bswap.h"

-/* Called either from qed_check or with table_lock held.  */
+/* Called with table_lock held.  */
 static int qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table)
 {
    QEMUIOVector qiov;
@@ -33,13 +33,9 @@ static int qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table)

    trace_qed_read_table(s, offset, table);

-    if (qemu_in_coroutine()) {
    qemu_co_mutex_unlock(&s->table_lock);
-    }
    ret = bdrv_preadv(s->bs->file, offset, &qiov);
-    if (qemu_in_coroutine()) {
    qemu_co_mutex_lock(&s->table_lock);
-    }
    if (ret < 0) {
        goto out;
    }
@@ -67,7 +63,7 @@ out:
 * @n:          Number of elements
 * @flush:      Whether or not to sync to disk
 *
- * Called either from qed_check or with table_lock held.
+ * Called with table_lock held.
 */
 static int qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
                           unsigned int index, unsigned int n, bool flush)
@@ -104,13 +100,9 @@ static int qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
    /* Adjust for offset into table */
    offset += start * sizeof(uint64_t);

-    if (qemu_in_coroutine()) {
    qemu_co_mutex_unlock(&s->table_lock);
-    }
    ret = bdrv_pwritev(s->bs->file, offset, &qiov);
-    if (qemu_in_coroutine()) {
    qemu_co_mutex_lock(&s->table_lock);
-    }
    trace_qed_write_table_cb(s, table, flush, ret);
    if (ret < 0) {
        goto out;
@@ -134,7 +126,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s)
    return qed_read_table(s, s->header.l1_table_offset, s->l1_table);
 }

-/* Called either from qed_check or with table_lock held.  */
+/* Called with table_lock held.  */
 int qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n)
 {
    BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
@@ -148,7 +140,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
    return qed_write_l1_table(s, index, n);
 }

-/* Called either from qed_check or with table_lock held.  */
+/* Called with table_lock held.  */
 int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
 {
    int ret;
@@ -191,7 +183,7 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset
    return qed_read_l2_table(s, request, offset);
 }

-/* Called either from qed_check or with table_lock held.  */
+/* Called with table_lock held.  */
 int qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
                       unsigned int index, unsigned int n, bool flush)
 {
--- a/block/qed.c
+++ b/block/qed.c
@@ -16,9 +16,9 @@
 #include "qapi/error.h"
 #include "qemu/timer.h"
 #include "qemu/bswap.h"
+#include "qemu/option.h"
 #include "trace.h"
 #include "qed.h"
-#include "qapi/qmp/qerror.h"
 #include "sysemu/block-backend.h"

 static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
@@ -381,8 +381,9 @@ static void bdrv_qed_init_state(BlockDriverState *bs)
    qemu_co_queue_init(&s->allocating_write_reqs);
 }

-static int bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags,
-                            Error **errp)
+/* Called with table_lock held.  */
+static int coroutine_fn bdrv_qed_do_open(BlockDriverState *bs, QDict *options,
+                                         int flags, Error **errp)
 {
    BDRVQEDState *s = bs->opaque;
    QEDHeader le_header;
@@ -513,9 +514,35 @@ out:
    return ret;
 }

+typedef struct QEDOpenCo {
+    BlockDriverState *bs;
+    QDict *options;
+    int flags;
+    Error **errp;
+    int ret;
+} QEDOpenCo;
+
+static void coroutine_fn bdrv_qed_open_entry(void *opaque)
+{
+    QEDOpenCo *qoc = opaque;
+    BDRVQEDState *s = qoc->bs->opaque;
+
+    qemu_co_mutex_lock(&s->table_lock);
+    qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
+    qemu_co_mutex_unlock(&s->table_lock);
+}
+
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
 {
+    QEDOpenCo qoc = {
+        .bs = bs,
+        .options = options,
+        .flags = flags,
+        .errp = errp,
+        .ret = -EINPROGRESS
+    };
+
    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
                               false, errp);
    if (!bs->file) {
@@ -523,7 +550,14 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
    }

    bdrv_qed_init_state(bs);
-    return bdrv_qed_do_open(bs, options, flags, errp);
+    if (qemu_in_coroutine()) {
+        bdrv_qed_open_entry(&qoc);
+    } else {
+        qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
+        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
+    }
+    BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
+    return qoc.ret;
 }

 static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
@@ -638,7 +672,9 @@ out:
    return ret;
 }

-static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn bdrv_qed_co_create_opts(const char *filename,
+                                                QemuOpts *opts,
+                                                Error **errp)
 {
    uint64_t image_size = 0;
    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
@@ -688,74 +724,46 @@ finish:
    return ret;
 }

-typedef struct {
-    BlockDriverState *bs;
-    Coroutine *co;
-    uint64_t pos;
-    int64_t status;
-    int *pnum;
-    BlockDriverState **file;
-} QEDIsAllocatedCB;
-
-/* Called with table_lock held.  */
-static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
-{
-    QEDIsAllocatedCB *cb = opaque;
-    BDRVQEDState *s = cb->bs->opaque;
-    *cb->pnum = len / BDRV_SECTOR_SIZE;
-    switch (ret) {
-    case QED_CLUSTER_FOUND:
-        offset |= qed_offset_into_cluster(s, cb->pos);
-        cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
-        *cb->file = cb->bs->file->bs;
-        break;
-    case QED_CLUSTER_ZERO:
-        cb->status = BDRV_BLOCK_ZERO;
-        break;
-    case QED_CLUSTER_L2:
-    case QED_CLUSTER_L1:
-        cb->status = 0;
-        break;
-    default:
-        assert(ret < 0);
-        cb->status = ret;
-        break;
-    }
-
-    if (cb->co) {
-        aio_co_wake(cb->co);
-    }
-}
-
-static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
-                                                 int64_t sector_num,
-                                                 int nb_sectors, int *pnum,
+static int coroutine_fn bdrv_qed_co_block_status(BlockDriverState *bs,
+                                                 bool want_zero,
+                                                 int64_t pos, int64_t bytes,
+                                                 int64_t *pnum, int64_t *map,
                                                 BlockDriverState **file)
 {
    BDRVQEDState *s = bs->opaque;
-    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
-    QEDIsAllocatedCB cb = {
-        .bs = bs,
-        .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE,
-        .status = BDRV_BLOCK_OFFSET_MASK,
-        .pnum = pnum,
-        .file = file,
-    };
+    size_t len = MIN(bytes, SIZE_MAX);
+    int status;
    QEDRequest request = { .l2_table = NULL };
    uint64_t offset;
    int ret;

    qemu_co_mutex_lock(&s->table_lock);
-    ret = qed_find_cluster(s, &request, cb.pos, &len, &offset);
-    qed_is_allocated_cb(&cb, ret, offset, len);
+    ret = qed_find_cluster(s, &request, pos, &len, &offset);

-    /* The callback was invoked immediately */
-    assert(cb.status != BDRV_BLOCK_OFFSET_MASK);
+    *pnum = len;
+    switch (ret) {
+    case QED_CLUSTER_FOUND:
+        *map = offset | qed_offset_into_cluster(s, pos);
+        status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+        *file = bs->file->bs;
+        break;
+    case QED_CLUSTER_ZERO:
+        status = BDRV_BLOCK_ZERO;
+        break;
+    case QED_CLUSTER_L2:
+    case QED_CLUSTER_L1:
+        status = 0;
+        break;
+    default:
+        assert(ret < 0);
+        status = ret;
+        break;
+    }

    qed_unref_l2_cache_entry(request.l2_table);
    qemu_co_mutex_unlock(&s->table_lock);

-    return cb.status;
+    return status;
 }

 static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
@@ -1438,7 +1446,6 @@ static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    bdi->cluster_size = s->header.cluster_size;
    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
    bdi->unallocated_blocks_are_zero = true;
-    bdi->can_write_zeroes_with_unmap = true;
    return 0;
 }

@@ -1514,7 +1521,8 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs,
    return ret;
 }

-static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)
+static void coroutine_fn bdrv_qed_co_invalidate_cache(BlockDriverState *bs,
+                                                      Error **errp)
 {
    BDRVQEDState *s = bs->opaque;
    Error *local_err = NULL;
@@ -1523,13 +1531,9 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)
    bdrv_qed_close(bs);

    bdrv_qed_init_state(bs);
-    if (qemu_in_coroutine()) {
    qemu_co_mutex_lock(&s->table_lock);
-    }
    ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, &local_err);
-    if (qemu_in_coroutine()) {
    qemu_co_mutex_unlock(&s->table_lock);
-    }
    if (local_err) {
        error_propagate(errp, local_err);
        error_prepend(errp, "Could not reopen qed layer: ");
@@ -1540,12 +1544,17 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)
    }
 }

-static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
+static int bdrv_qed_co_check(BlockDriverState *bs, BdrvCheckResult *result,
                             BdrvCheckMode fix)
 {
    BDRVQEDState *s = bs->opaque;
+    int ret;

-    return qed_check(s, result, !!fix);
+    qemu_co_mutex_lock(&s->table_lock);
+    ret = qed_check(s, result, !!fix);
+    qemu_co_mutex_unlock(&s->table_lock);
+
+    return ret;
 }

 static QemuOptsList qed_create_opts = {
@@ -1593,9 +1602,9 @@ static BlockDriver bdrv_qed = {
    .bdrv_close               = bdrv_qed_close,
    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
    .bdrv_child_perm          = bdrv_format_default_perms,
-    .bdrv_create              = bdrv_qed_create,
+    .bdrv_co_create_opts      = bdrv_qed_co_create_opts,
    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
-    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
+    .bdrv_co_block_status     = bdrv_qed_co_block_status,
    .bdrv_co_readv            = bdrv_qed_co_readv,
    .bdrv_co_writev           = bdrv_qed_co_writev,
    .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
@@ -1604,8 +1613,8 @@ static BlockDriver bdrv_qed = {
    .bdrv_get_info            = bdrv_qed_get_info,
    .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
    .bdrv_change_backing_file = bdrv_qed_change_backing_file,
-    .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache,
-    .bdrv_check               = bdrv_qed_check,
+    .bdrv_co_invalidate_cache = bdrv_qed_co_invalidate_cache,
+    .bdrv_co_check            = bdrv_qed_co_check,
    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
    .bdrv_co_drain_begin      = bdrv_qed_co_drain_begin,
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -15,14 +15,14 @@

 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
+#include "qemu/option.h"
 #include "block/block_int.h"
-#include "qapi/qmp/qbool.h"
+#include "qapi/error.h"
+#include "qapi/qapi-events-block.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qerror.h"
-#include "qapi/qmp/qjson.h"
 #include "qapi/qmp/qlist.h"
 #include "qapi/qmp/qstring.h"
-#include "qapi-event.h"
 #include "crypto/hash.h"

 #define HASH_LENGTH 32
@@ -1098,11 +1098,10 @@ static void quorum_refresh_filename(BlockDriverState *bs, QDict *options)

 static BlockDriver bdrv_quorum = {
    .format_name                        = "quorum",
-    .protocol_name                      = "quorum",

    .instance_size                      = sizeof(BDRVQuorumState),

-    .bdrv_file_open                     = quorum_open,
+    .bdrv_open                          = quorum_open,
    .bdrv_close                         = quorum_close,
    .bdrv_refresh_filename              = quorum_refresh_filename,

--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -250,17 +250,17 @@ fail:
    return ret;
 }

-static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
-                                            int64_t sector_num,
-                                            int nb_sectors, int *pnum,
+static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
+                                            bool want_zero, int64_t offset,
+                                            int64_t bytes, int64_t *pnum,
+                                            int64_t *map,
                                            BlockDriverState **file)
 {
    BDRVRawState *s = bs->opaque;
-    *pnum = nb_sectors;
+    *pnum = bytes;
    *file = bs->file->bs;
-    sector_num += s->offset / BDRV_SECTOR_SIZE;
-    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
-           (sector_num << BDRV_SECTOR_BITS);
+    *map = offset + s->offset;
+    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
 }

 static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
@@ -396,7 +396,8 @@ static int raw_has_zero_init(BlockDriverState *bs)
    return bdrv_has_zero_init(bs->file->bs);
 }

-static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
+                                           Error **errp)
 {
    return bdrv_create_file(filename, opts, errp);
 }
@@ -491,12 +492,12 @@ BlockDriver bdrv_raw = {
    .bdrv_open            = &raw_open,
    .bdrv_close           = &raw_close,
    .bdrv_child_perm      = bdrv_filter_default_perms,
-    .bdrv_create          = &raw_create,
+    .bdrv_co_create_opts  = &raw_co_create_opts,
    .bdrv_co_preadv       = &raw_co_preadv,
    .bdrv_co_pwritev      = &raw_co_pwritev,
    .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
    .bdrv_co_pdiscard     = &raw_co_pdiscard,
-    .bdrv_co_get_block_status = &raw_co_get_block_status,
+    .bdrv_co_block_status = &raw_co_block_status,
    .bdrv_truncate        = &raw_truncate,
    .bdrv_getlength       = &raw_getlength,
    .has_variable_length  = true,
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -16,11 +16,16 @@
 #include <rbd/librbd.h>
 #include "qapi/error.h"
 #include "qemu/error-report.h"
+#include "qemu/option.h"
 #include "block/block_int.h"
 #include "crypto/secret.h"
 #include "qemu/cutils.h"
 #include "qapi/qmp/qstring.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qjson.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qapi/qapi-visit-block-core.h"

 /*
 * When specifying the image filename use:
@@ -98,6 +103,11 @@ typedef struct BDRVRBDState {
    char *snap;
 } BDRVRBDState;

+static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
+                            BlockdevOptionsRbd *opts, bool cache,
+                            const char *keypairs, const char *secretid,
+                            Error **errp);
+
 static char *qemu_rbd_next_tok(char *src, char delim, char **p)
 {
    char *end;
@@ -265,13 +275,14 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json,
        key = qstring_get_str(name);

        ret = rados_conf_set(cluster, key, qstring_get_str(value));
-        QDECREF(name);
        QDECREF(value);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "invalid conf option %s", key);
+            QDECREF(name);
            ret = -EINVAL;
            break;
        }
+        QDECREF(name);
    }

    QDECREF(keypairs);
@@ -322,65 +333,93 @@ static QemuOptsList runtime_opts = {
        /*
         * server.* extracted manually, see qemu_rbd_mon_host()
         */
-        {
-            .name = "password-secret",
-            .type = QEMU_OPT_STRING,
-            .help = "ID of secret providing the password",
-        },
-
-        /*
-         * Keys for qemu_rbd_parse_filename(), not in the QAPI schema
-         */
-        {
-            /*
-             * HACK: name starts with '=' so that qemu_opts_parse()
-             * can't set it
-             */
-            .name = "=keyvalue-pairs",
-            .type = QEMU_OPT_STRING,
-            .help = "Legacy rados key/value option parameters",
-        },
-        {
-            .name = "filename",
-            .type = QEMU_OPT_STRING,
-        },
        { /* end of list */ }
    },
 };

-static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
+/* FIXME Deprecate and remove keypairs or make it available in QMP.
+ * password_secret should eventually be configurable in opts->location. Support
+ * for it in .bdrv_open will make it work here as well. */
+static int qemu_rbd_do_create(BlockdevCreateOptions *options,
+                              const char *keypairs, const char *password_secret,
+                              Error **errp)
 {
-    Error *local_err = NULL;
-    int64_t bytes = 0;
-    int64_t objsize;
-    int obj_order = 0;
-    const char *pool, *image_name, *conf, *user, *keypairs;
-    const char *secretid;
+    BlockdevCreateOptionsRbd *opts = &options->u.rbd;
    rados_t cluster;
    rados_ioctx_t io_ctx;
-    QDict *options = NULL;
-    int ret = 0;
+    int obj_order = 0;
+    int ret;

-    secretid = qemu_opt_get(opts, "password-secret");
+    assert(options->driver == BLOCKDEV_DRIVER_RBD);
+    if (opts->location->has_snapshot) {
+        error_setg(errp, "Can't use snapshot name for image creation");
+        return -EINVAL;
+    }

-    /* Read out options */
-    bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
-                     BDRV_SECTOR_SIZE);
-    objsize = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, 0);
-    if (objsize) {
+    if (opts->has_cluster_size) {
+        int64_t objsize = opts->cluster_size;
        if ((objsize - 1) & objsize) {    /* not a power of 2? */
            error_setg(errp, "obj size needs to be power of 2");
-            ret = -EINVAL;
-            goto exit;
+            return -EINVAL;
        }
        if (objsize < 4096) {
            error_setg(errp, "obj size too small");
-            ret = -EINVAL;
-            goto exit;
+            return -EINVAL;
        }
        obj_order = ctz32(objsize);
    }

+    ret = qemu_rbd_connect(&cluster, &io_ctx, opts->location, false, keypairs,
+                           password_secret, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = rbd_create(io_ctx, opts->location->image, opts->size, &obj_order);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error rbd create");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    rados_ioctx_destroy(io_ctx);
+    rados_shutdown(cluster);
+    return ret;
+}
+
+static int qemu_rbd_co_create(BlockdevCreateOptions *options, Error **errp)
+{
+    return qemu_rbd_do_create(options, NULL, NULL, errp);
+}
+
+static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
+                                                QemuOpts *opts,
+                                                Error **errp)
+{
+    BlockdevCreateOptions *create_options;
+    BlockdevCreateOptionsRbd *rbd_opts;
+    BlockdevOptionsRbd *loc;
+    Error *local_err = NULL;
+    const char *keypairs, *password_secret;
+    QDict *options = NULL;
+    int ret = 0;
+
+    create_options = g_new0(BlockdevCreateOptions, 1);
+    create_options->driver = BLOCKDEV_DRIVER_RBD;
+    rbd_opts = &create_options->u.rbd;
+
+    rbd_opts->location = g_new0(BlockdevOptionsRbd, 1);
+
+    password_secret = qemu_opt_get(opts, "password-secret");
+
+    /* Read out options */
+    rbd_opts->size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                              BDRV_SECTOR_SIZE);
+    rbd_opts->cluster_size = qemu_opt_get_size_del(opts,
+                                                   BLOCK_OPT_CLUSTER_SIZE, 0);
+    rbd_opts->has_cluster_size = (rbd_opts->cluster_size != 0);
+
    options = qdict_new();
    qemu_rbd_parse_filename(filename, options, &local_err);
    if (local_err) {
@@ -395,61 +434,23 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
     * or blockdev_add, its members are typed according to the QAPI
     * schema, but when they come from -drive, they're all QString.
     */
-    pool       = qdict_get_try_str(options, "pool");
-    conf       = qdict_get_try_str(options, "conf");
-    user       = qdict_get_try_str(options, "user");
-    image_name = qdict_get_try_str(options, "image");
+    loc = rbd_opts->location;
+    loc->pool     = g_strdup(qdict_get_try_str(options, "pool"));
+    loc->conf     = g_strdup(qdict_get_try_str(options, "conf"));
+    loc->has_conf = !!loc->conf;
+    loc->user     = g_strdup(qdict_get_try_str(options, "user"));
+    loc->has_user = !!loc->user;
+    loc->image    = g_strdup(qdict_get_try_str(options, "image"));
    keypairs      = qdict_get_try_str(options, "=keyvalue-pairs");

-    ret = rados_create(&cluster, user);
+    ret = qemu_rbd_do_create(create_options, keypairs, password_secret, errp);
    if (ret < 0) {
-        error_setg_errno(errp, -ret, "error initializing");
        goto exit;
    }

-    /* try default location when conf=NULL, but ignore failure */
-    ret = rados_conf_read_file(cluster, conf);
-    if (conf && ret < 0) {
-        error_setg_errno(errp, -ret, "error reading conf file %s", conf);
-        ret = -EIO;
-        goto shutdown;
-    }
-
-    ret = qemu_rbd_set_keypairs(cluster, keypairs, errp);
-    if (ret < 0) {
-        ret = -EIO;
-        goto shutdown;
-    }
-
-    if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) {
-        ret = -EIO;
-        goto shutdown;
-    }
-
-    ret = rados_connect(cluster);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "error connecting");
-        goto shutdown;
-    }
-
-    ret = rados_ioctx_create(cluster, pool, &io_ctx);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "error opening pool %s", pool);
-        goto shutdown;
-    }
-
-    ret = rbd_create(io_ctx, image_name, bytes, &obj_order);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "error rbd create");
-    }
-
-    rados_ioctx_destroy(io_ctx);
-
-shutdown:
-    rados_shutdown(cluster);
-
 exit:
    QDECREF(options);
+    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
 }

@@ -500,131 +501,83 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
    qemu_aio_unref(acb);
 }

-static char *qemu_rbd_mon_host(QDict *options, Error **errp)
+static char *qemu_rbd_mon_host(BlockdevOptionsRbd *opts, Error **errp)
 {
-    const char **vals = g_new(const char *, qdict_size(options) + 1);
-    char keybuf[32];
+    const char **vals;
    const char *host, *port;
    char *rados_str;
-    int i;
+    InetSocketAddressBaseList *p;
+    int i, cnt;

-    for (i = 0;; i++) {
-        sprintf(keybuf, "server.%d.host", i);
-        host = qdict_get_try_str(options, keybuf);
-        qdict_del(options, keybuf);
-        sprintf(keybuf, "server.%d.port", i);
-        port = qdict_get_try_str(options, keybuf);
-        qdict_del(options, keybuf);
-        if (!host && !port) {
-            break;
+    if (!opts->has_server) {
+        return NULL;
    }
-        if (!host) {
-            error_setg(errp, "Parameter server.%d.host is missing", i);
-            rados_str = NULL;
-            goto out;
+
+    for (cnt = 0, p = opts->server; p; p = p->next) {
+        cnt++;
    }

+    vals = g_new(const char *, cnt + 1);
+
+    for (i = 0, p = opts->server; p; p = p->next, i++) {
+        host = p->value->host;
+        port = p->value->port;
+
        if (strchr(host, ':')) {
-            vals[i] = port ? g_strdup_printf("[%s]:%s", host, port)
-                : g_strdup_printf("[%s]", host);
+            vals[i] = g_strdup_printf("[%s]:%s", host, port);
        } else {
-            vals[i] = port ? g_strdup_printf("%s:%s", host, port)
-                : g_strdup(host);
+            vals[i] = g_strdup_printf("%s:%s", host, port);
        }
    }
    vals[i] = NULL;

    rados_str = i ? g_strjoinv(";", (char **)vals) : NULL;
-out:
    g_strfreev((char **)vals);
    return rados_str;
 }

-static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
+static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
+                            BlockdevOptionsRbd *opts, bool cache,
+                            const char *keypairs, const char *secretid,
                            Error **errp)
 {
-    BDRVRBDState *s = bs->opaque;
-    const char *pool, *snap, *conf, *user, *image_name, *keypairs;
-    const char *secretid, *filename;
-    QemuOpts *opts;
-    Error *local_err = NULL;
    char *mon_host = NULL;
+    Error *local_err = NULL;
    int r;

-    /* If we are given a filename, parse the filename, with precedence given to
-     * filename encoded options */
-    filename = qdict_get_try_str(options, "filename");
-    if (filename) {
-        warn_report("'filename' option specified. "
-                    "This is an unsupported option, and may be deprecated "
-                    "in the future");
-        qemu_rbd_parse_filename(filename, options, &local_err);
-        if (local_err) {
-            r = -EINVAL;
-            error_propagate(errp, local_err);
-            goto exit;
-        }
-    }
-
-    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
-    qemu_opts_absorb_qdict(opts, options, &local_err);
+    mon_host = qemu_rbd_mon_host(opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        r = -EINVAL;
        goto failed_opts;
    }

-    mon_host = qemu_rbd_mon_host(options, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        r = -EINVAL;
-        goto failed_opts;
-    }
-
-    secretid = qemu_opt_get(opts, "password-secret");
-
-    pool           = qemu_opt_get(opts, "pool");
-    conf           = qemu_opt_get(opts, "conf");
-    snap           = qemu_opt_get(opts, "snapshot");
-    user           = qemu_opt_get(opts, "user");
-    image_name     = qemu_opt_get(opts, "image");
-    keypairs       = qemu_opt_get(opts, "=keyvalue-pairs");
-
-    if (!pool || !image_name) {
-        error_setg(errp, "Parameters 'pool' and 'image' are required");
-        r = -EINVAL;
-        goto failed_opts;
-    }
-
-    r = rados_create(&s->cluster, user);
+    r = rados_create(cluster, opts->user);
    if (r < 0) {
        error_setg_errno(errp, -r, "error initializing");
        goto failed_opts;
    }

-    s->snap = g_strdup(snap);
-    s->image_name = g_strdup(image_name);
-
    /* try default location when conf=NULL, but ignore failure */
-    r = rados_conf_read_file(s->cluster, conf);
-    if (conf && r < 0) {
-        error_setg_errno(errp, -r, "error reading conf file %s", conf);
+    r = rados_conf_read_file(*cluster, opts->conf);
+    if (opts->has_conf && r < 0) {
+        error_setg_errno(errp, -r, "error reading conf file %s", opts->conf);
        goto failed_shutdown;
    }

-    r = qemu_rbd_set_keypairs(s->cluster, keypairs, errp);
+    r = qemu_rbd_set_keypairs(*cluster, keypairs, errp);
    if (r < 0) {
        goto failed_shutdown;
    }

    if (mon_host) {
-        r = rados_conf_set(s->cluster, "mon_host", mon_host);
+        r = rados_conf_set(*cluster, "mon_host", mon_host);
        if (r < 0) {
            goto failed_shutdown;
        }
    }

-    if (qemu_rbd_set_auth(s->cluster, secretid, errp) < 0) {
+    if (qemu_rbd_set_auth(*cluster, secretid, errp) < 0) {
        r = -EIO;
        goto failed_shutdown;
    }
@@ -636,24 +589,97 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
     * librbd defaults to no caching. If write through caching cannot
     * be set up, fall back to no caching.
     */
-    if (flags & BDRV_O_NOCACHE) {
-        rados_conf_set(s->cluster, "rbd_cache", "false");
+    if (cache) {
+        rados_conf_set(*cluster, "rbd_cache", "true");
    } else {
-        rados_conf_set(s->cluster, "rbd_cache", "true");
+        rados_conf_set(*cluster, "rbd_cache", "false");
    }

-    r = rados_connect(s->cluster);
+    r = rados_connect(*cluster);
    if (r < 0) {
        error_setg_errno(errp, -r, "error connecting");
        goto failed_shutdown;
    }

-    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);
+    r = rados_ioctx_create(*cluster, opts->pool, io_ctx);
    if (r < 0) {
-        error_setg_errno(errp, -r, "error opening pool %s", pool);
+        error_setg_errno(errp, -r, "error opening pool %s", opts->pool);
        goto failed_shutdown;
    }

+    return 0;
+
+failed_shutdown:
+    rados_shutdown(*cluster);
+failed_opts:
+    g_free(mon_host);
+    return r;
+}
+
+static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
+                         Error **errp)
+{
+    BDRVRBDState *s = bs->opaque;
+    BlockdevOptionsRbd *opts = NULL;
+    Visitor *v;
+    QObject *crumpled = NULL;
+    Error *local_err = NULL;
+    const char *filename;
+    char *keypairs, *secretid;
+    int r;
+
+    /* If we are given a filename, parse the filename, with precedence given to
+     * filename encoded options */
+    filename = qdict_get_try_str(options, "filename");
+    if (filename) {
+        warn_report("'filename' option specified. "
+                    "This is an unsupported option, and may be deprecated "
+                    "in the future");
+        qemu_rbd_parse_filename(filename, options, &local_err);
+        qdict_del(options, "filename");
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return -EINVAL;
+        }
+    }
+
+    keypairs = g_strdup(qdict_get_try_str(options, "=keyvalue-pairs"));
+    if (keypairs) {
+        qdict_del(options, "=keyvalue-pairs");
+    }
+
+    secretid = g_strdup(qdict_get_try_str(options, "password-secret"));
+    if (secretid) {
+        qdict_del(options, "password-secret");
+    }
+
+    /* Convert the remaining options into a QAPI object */
+    crumpled = qdict_crumple(options, errp);
+    if (crumpled == NULL) {
+        r = -EINVAL;
+        goto out;
+    }
+
+    v = qobject_input_visitor_new_keyval(crumpled);
+    visit_type_BlockdevOptionsRbd(v, NULL, &opts, &local_err);
+    visit_free(v);
+    qobject_decref(crumpled);
+
+    if (local_err) {
+        error_propagate(errp, local_err);
+        r = -EINVAL;
+        goto out;
+    }
+
+    r = qemu_rbd_connect(&s->cluster, &s->io_ctx, opts,
+                         !(flags & BDRV_O_NOCACHE), keypairs, secretid, errp);
+    if (r < 0) {
+        goto out;
+    }
+
+    s->snap = g_strdup(opts->snapshot);
+    s->image_name = g_strdup(opts->image);
+
    /* rbd_open is always r/w */
    r = rbd_open(s->io_ctx, s->image_name, &s->image, s->snap);
    if (r < 0) {
@@ -678,19 +704,18 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

-    qemu_opts_del(opts);
-    return 0;
+    r = 0;
+    goto out;

 failed_open:
    rados_ioctx_destroy(s->io_ctx);
-failed_shutdown:
-    rados_shutdown(s->cluster);
    g_free(s->snap);
    g_free(s->image_name);
-failed_opts:
-    qemu_opts_del(opts);
-    g_free(mon_host);
-exit:
+    rados_shutdown(s->cluster);
+out:
+    qapi_free_BlockdevOptionsRbd(opts);
+    g_free(keypairs);
+    g_free(secretid);
    return r;
 }

@@ -1088,7 +1113,7 @@ static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs,
 #endif

 #ifdef LIBRBD_SUPPORTS_INVALIDATE
-static void qemu_rbd_invalidate_cache(BlockDriverState *bs,
+static void coroutine_fn qemu_rbd_co_invalidate_cache(BlockDriverState *bs,
                                                      Error **errp)
 {
    BDRVRBDState *s = bs->opaque;
@@ -1129,7 +1154,8 @@ static BlockDriver bdrv_rbd = {
    .bdrv_file_open         = qemu_rbd_open,
    .bdrv_close             = qemu_rbd_close,
    .bdrv_reopen_prepare    = qemu_rbd_reopen_prepare,
-    .bdrv_create            = qemu_rbd_create,
+    .bdrv_co_create         = qemu_rbd_co_create,
+    .bdrv_co_create_opts    = qemu_rbd_co_create_opts,
    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
    .bdrv_get_info          = qemu_rbd_getinfo,
    .create_opts            = &qemu_rbd_create_opts,
@@ -1155,7 +1181,7 @@ static BlockDriver bdrv_rbd = {
    .bdrv_snapshot_list     = qemu_rbd_snap_list,
    .bdrv_snapshot_goto     = qemu_rbd_snap_rollback,
 #ifdef LIBRBD_SUPPORTS_INVALIDATE
-    .bdrv_invalidate_cache  = qemu_rbd_invalidate_cache,
+    .bdrv_co_invalidate_cache = qemu_rbd_co_invalidate_cache,
 #endif
 };

--- a/block/replication.c
+++ b/block/replication.c
@@ -13,7 +13,7 @@
 */

 #include "qemu/osdep.h"
-#include "qemu-common.h"
+#include "qemu/option.h"
 #include "block/nbd.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
@@ -394,6 +394,9 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
        new_secondary_flags = s->orig_secondary_flags;
    }

+    bdrv_subtree_drained_begin(s->hidden_disk->bs);
+    bdrv_subtree_drained_begin(s->secondary_disk->bs);
+
    if (orig_hidden_flags != new_hidden_flags) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
                                         new_hidden_flags);
@@ -409,6 +412,9 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
                             reopen_queue, &local_err);
        error_propagate(errp, local_err);
    }
+
+    bdrv_subtree_drained_end(s->hidden_disk->bs);
+    bdrv_subtree_drained_end(s->secondary_disk->bs);
 }

 static void backup_job_cleanup(BlockDriverState *bs)
@@ -697,7 +703,6 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)

 BlockDriver bdrv_replication = {
    .format_name                = "replication",
-    .protocol_name              = "replication",
    .instance_size              = sizeof(BDRVReplicationState),

    .bdrv_open                  = replication_open,
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -13,12 +13,15 @@
 */

 #include "qemu/osdep.h"
-#include "qapi-visit.h"
 #include "qapi/error.h"
+#include "qapi/qapi-visit-sockets.h"
+#include "qapi/qapi-visit-block-core.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qobject-input-visitor.h"
+#include "qapi/qobject-output-visitor.h"
 #include "qemu/uri.h"
 #include "qemu/error-report.h"
+#include "qemu/option.h"
 #include "qemu/sockets.h"
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
@@ -400,7 +403,7 @@ typedef struct BDRVSheepdogReopenState {
    int cache_flags;
 } BDRVSheepdogReopenState;

-static const char * sd_strerror(int err)
+static const char *sd_strerror(int err)
 {
    int i;

@@ -532,23 +535,6 @@ static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
    qemu_co_mutex_unlock(&s->queue_lock);
 }

-static SocketAddress *sd_socket_address(const char *path,
-                                        const char *host, const char *port)
-{
-    SocketAddress *addr = g_new0(SocketAddress, 1);
-
-    if (path) {
-        addr->type = SOCKET_ADDRESS_TYPE_UNIX;
-        addr->u.q_unix.path = g_strdup(path);
-    } else {
-        addr->type = SOCKET_ADDRESS_TYPE_INET;
-        addr->u.inet.host = g_strdup(host ?: SD_DEFAULT_ADDR);
-        addr->u.inet.port = g_strdup(port ?: stringify(SD_DEFAULT_PORT));
-    }
-
-    return addr;
-}
-
 static SocketAddress *sd_server_config(QDict *options, Error **errp)
 {
    QDict *server = NULL;
@@ -776,8 +762,7 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
        if (s->fd < 0) {
            DPRINTF("Wait for connection to be established\n");
            error_report_err(local_err);
-            co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME,
-                            1000000000ULL);
+            qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000000ULL);
        }
    };

@@ -1632,7 +1617,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
    if (!tag) {
        tag = "";
    }
-    if (tag && strlen(tag) >= SD_MAX_VDI_TAG_LEN) {
+    if (strlen(tag) >= SD_MAX_VDI_TAG_LEN) {
        error_setg(errp, "value of parameter 'tag' is too long");
        ret = -EINVAL;
        goto err_no_fd;
@@ -1826,40 +1811,34 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
    return 0;
 }

-static int sd_prealloc(const char *filename, Error **errp)
+static int sd_prealloc(BlockDriverState *bs, int64_t old_size, int64_t new_size,
+                       Error **errp)
 {
    BlockBackend *blk = NULL;
-    BDRVSheepdogState *base = NULL;
+    BDRVSheepdogState *base = bs->opaque;
    unsigned long buf_size;
    uint32_t idx, max_idx;
    uint32_t object_size;
-    int64_t vdi_size;
    void *buf = NULL;
    int ret;

-    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
-    if (blk == NULL) {
-        ret = -EIO;
+    blk = blk_new(BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
+                  BLK_PERM_ALL);
+
+    ret = blk_insert_bs(blk, bs, errp);
+    if (ret < 0) {
        goto out_with_err_set;
    }

    blk_set_allow_write_beyond_eof(blk, true);

-    vdi_size = blk_getlength(blk);
-    if (vdi_size < 0) {
-        ret = vdi_size;
-        goto out;
-    }
-
-    base = blk_bs(blk)->opaque;
    object_size = (UINT32_C(1) << base->inode.block_size_shift);
    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
    buf = g_malloc0(buf_size);

-    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
+    max_idx = DIV_ROUND_UP(new_size, buf_size);

-    for (idx = 0; idx < max_idx; idx++) {
+    for (idx = old_size / buf_size; idx < max_idx; idx++) {
        /*
         * The created image can be a cloned image, so we need to read
         * a data from the source image.
@@ -1888,48 +1867,66 @@ out_with_err_set:
    return ret;
 }

-/*
- * Sheepdog support two kinds of redundancy, full replication and erasure
- * coding.
- *
- * # create a fully replicated vdi with x copies
- * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
- *
- * # create a erasure coded vdi with x data strips and y parity strips
- * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
- */
-static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
+static int sd_create_prealloc(BlockdevOptionsSheepdog *location, int64_t size,
+                              Error **errp)
+{
+    BlockDriverState *bs;
+    Visitor *v;
+    QObject *obj = NULL;
+    QDict *qdict;
+    Error *local_err = NULL;
+    int ret;
+
+    v = qobject_output_visitor_new(&obj);
+    visit_type_BlockdevOptionsSheepdog(v, NULL, &location, &local_err);
+    visit_free(v);
+
+    if (local_err) {
+        error_propagate(errp, local_err);
+        qobject_decref(obj);
+        return -EINVAL;
+    }
+
+    qdict = qobject_to_qdict(obj);
+    qdict_flatten(qdict);
+
+    qdict_put_str(qdict, "driver", "sheepdog");
+
+    bs = bdrv_open(NULL, NULL, qdict, BDRV_O_PROTOCOL | BDRV_O_RDWR, errp);
+    if (bs == NULL) {
+        ret = -EIO;
+        goto fail;
+    }
+
+    ret = sd_prealloc(bs, 0, size, errp);
+fail:
+    bdrv_unref(bs);
+    QDECREF(qdict);
+    return ret;
+}
+
+static int parse_redundancy(BDRVSheepdogState *s, SheepdogRedundancy *opt)
 {
    struct SheepdogInode *inode = &s->inode;
-    const char *n1, *n2;
-    long copy, parity;
-    char p[10];

-    pstrcpy(p, sizeof(p), opt);
-    n1 = strtok(p, ":");
-    n2 = strtok(NULL, ":");
-
-    if (!n1) {
+    switch (opt->type) {
+    case SHEEPDOG_REDUNDANCY_TYPE_FULL:
+        if (opt->u.full.copies > SD_MAX_COPIES || opt->u.full.copies < 1) {
            return -EINVAL;
        }
-
-    copy = strtol(n1, NULL, 10);
-    /* FIXME fix error checking by switching to qemu_strtol() */
-    if (copy > SD_MAX_COPIES || copy < 1) {
-        return -EINVAL;
-    }
-    if (!n2) {
        inode->copy_policy = 0;
-        inode->nr_copies = copy;
+        inode->nr_copies = opt->u.full.copies;
        return 0;
-    }
+
+    case SHEEPDOG_REDUNDANCY_TYPE_ERASURE_CODED:
+    {
+        int64_t copy = opt->u.erasure_coded.data_strips;
+        int64_t parity = opt->u.erasure_coded.parity_strips;

        if (copy != 2 && copy != 4 && copy != 8 && copy != 16) {
            return -EINVAL;
        }

-    parity = strtol(n2, NULL, 10);
-    /* FIXME fix error checking by switching to qemu_strtol() */
        if (parity >= SD_EC_MAX_STRIP || parity < 1) {
            return -EINVAL;
        }
@@ -1940,18 +1937,81 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
         */
        inode->copy_policy = ((copy / 2) << 4) + parity;
        inode->nr_copies = copy + parity;
-
        return 0;
+    }
+
+    default:
+        g_assert_not_reached();
+    }
+
+    return -EINVAL;
 }

-static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
+/*
+ * Sheepdog support two kinds of redundancy, full replication and erasure
+ * coding.
+ *
+ * # create a fully replicated vdi with x copies
+ * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
+ *
+ * # create a erasure coded vdi with x data strips and y parity strips
+ * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
+ */
+static SheepdogRedundancy *parse_redundancy_str(const char *opt)
+{
+    SheepdogRedundancy *redundancy;
+    const char *n1, *n2;
+    long copy, parity;
+    char p[10];
+    int ret;
+
+    pstrcpy(p, sizeof(p), opt);
+    n1 = strtok(p, ":");
+    n2 = strtok(NULL, ":");
+
+    if (!n1) {
+        return NULL;
+    }
+
+    ret = qemu_strtol(n1, NULL, 10, &copy);
+    if (ret < 0) {
+        return NULL;
+    }
+
+    redundancy = g_new0(SheepdogRedundancy, 1);
+    if (!n2) {
+        *redundancy = (SheepdogRedundancy) {
+            .type               = SHEEPDOG_REDUNDANCY_TYPE_FULL,
+            .u.full.copies      = copy,
+        };
+    } else {
+        ret = qemu_strtol(n2, NULL, 10, &parity);
+        if (ret < 0) {
+            return NULL;
+        }
+
+        *redundancy = (SheepdogRedundancy) {
+            .type               = SHEEPDOG_REDUNDANCY_TYPE_ERASURE_CODED,
+            .u.erasure_coded    = {
+                .data_strips    = copy,
+                .parity_strips  = parity,
+            },
+        };
+    }
+
+    return redundancy;
+}
+
+static int parse_block_size_shift(BDRVSheepdogState *s,
+                                  BlockdevCreateOptionsSheepdog *opts)
 {
    struct SheepdogInode *inode = &s->inode;
    uint64_t object_size;
    int obj_order;

-    object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0);
-    if (object_size) {
+    if (opts->has_object_size) {
+        object_size = opts->object_size;
+
        if ((object_size - 1) & object_size) {    /* not a power of 2? */
            return -EINVAL;
        }
@@ -1965,57 +2025,55 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
    return 0;
 }

-static int sd_create(const char *filename, QemuOpts *opts,
-                     Error **errp)
+static int sd_co_create(BlockdevCreateOptions *options, Error **errp)
 {
-    Error *err = NULL;
+    BlockdevCreateOptionsSheepdog *opts = &options->u.sheepdog;
    int ret = 0;
    uint32_t vid = 0;
    char *backing_file = NULL;
    char *buf = NULL;
    BDRVSheepdogState *s;
-    SheepdogConfig cfg;
    uint64_t max_vdi_size;
    bool prealloc = false;

+    assert(options->driver == BLOCKDEV_DRIVER_SHEEPDOG);
+
    s = g_new0(BDRVSheepdogState, 1);

-    if (strstr(filename, "://")) {
-        sd_parse_uri(&cfg, filename, &err);
-    } else {
-        parse_vdiname(&cfg, filename, &err);
-    }
-    if (err) {
-        error_propagate(errp, err);
+    /* Steal SocketAddress from QAPI, set NULL to prevent double free */
+    s->addr = opts->location->server;
+    opts->location->server = NULL;
+
+    if (strlen(opts->location->vdi) >= sizeof(s->name)) {
+        error_setg(errp, "'vdi' string too long");
+        ret = -EINVAL;
        goto out;
    }
+    pstrcpy(s->name, sizeof(s->name), opts->location->vdi);

-    buf = cfg.port ? g_strdup_printf("%d", cfg.port) : NULL;
-    s->addr = sd_socket_address(cfg.path, cfg.host, buf);
-    g_free(buf);
-    strcpy(s->name, cfg.vdi);
-    sd_config_done(&cfg);
+    s->inode.vdi_size = opts->size;
+    backing_file = opts->backing_file;

-    s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
-                                 BDRV_SECTOR_SIZE);
-    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
-    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
-    if (!buf || !strcmp(buf, "off")) {
+    if (!opts->has_preallocation) {
+        opts->preallocation = PREALLOC_MODE_OFF;
+    }
+    switch (opts->preallocation) {
+    case PREALLOC_MODE_OFF:
        prealloc = false;
-    } else if (!strcmp(buf, "full")) {
+        break;
+    case PREALLOC_MODE_FULL:
        prealloc = true;
-    } else {
-        error_setg(errp, "Invalid preallocation mode: '%s'", buf);
+        break;
+    default:
+        error_setg(errp, "Preallocation mode not supported for Sheepdog");
        ret = -EINVAL;
        goto out;
    }

-    g_free(buf);
-    buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY);
-    if (buf) {
-        ret = parse_redundancy(s, buf);
+    if (opts->has_redundancy) {
+        ret = parse_redundancy(s, opts->redundancy);
        if (ret < 0) {
-            error_setg(errp, "Invalid redundancy mode: '%s'", buf);
+            error_setg(errp, "Invalid redundancy mode");
            goto out;
        }
    }
@@ -2027,20 +2085,20 @@ static int sd_create(const char *filename, QemuOpts *opts,
        goto out;
    }

-    if (backing_file) {
+    if (opts->has_backing_file) {
        BlockBackend *blk;
        BDRVSheepdogState *base;
        BlockDriver *drv;

        /* Currently, only Sheepdog backing image is supported. */
-        drv = bdrv_find_protocol(backing_file, true, NULL);
+        drv = bdrv_find_protocol(opts->backing_file, true, NULL);
        if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
            error_setg(errp, "backing_file must be a sheepdog image");
            ret = -EINVAL;
            goto out;
        }

-        blk = blk_new_open(backing_file, NULL, NULL,
+        blk = blk_new_open(opts->backing_file, NULL, NULL,
                           BDRV_O_PROTOCOL, errp);
        if (blk == NULL) {
            ret = -EIO;
@@ -2108,15 +2166,96 @@ static int sd_create(const char *filename, QemuOpts *opts,
    }

    if (prealloc) {
-        ret = sd_prealloc(filename, errp);
+        ret = sd_create_prealloc(opts->location, opts->size, errp);
    }
 out:
    g_free(backing_file);
    g_free(buf);
+    g_free(s->addr);
    g_free(s);
    return ret;
 }

+static int coroutine_fn sd_co_create_opts(const char *filename, QemuOpts *opts,
+                                          Error **errp)
+{
+    BlockdevCreateOptions *create_options = NULL;
+    QDict *qdict, *location_qdict;
+    QObject *crumpled;
+    Visitor *v;
+    const char *redundancy;
+    Error *local_err = NULL;
+    int ret;
+
+    redundancy = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY);
+
+    qdict = qemu_opts_to_qdict(opts, NULL);
+    qdict_put_str(qdict, "driver", "sheepdog");
+
+    location_qdict = qdict_new();
+    qdict_put(qdict, "location", location_qdict);
+
+    sd_parse_filename(filename, location_qdict, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    qdict_flatten(qdict);
+
+    /* Change legacy command line options into QMP ones */
+    static const QDictRenames opt_renames[] = {
+        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
+        { BLOCK_OPT_OBJECT_SIZE,        "object-size" },
+        { NULL, NULL },
+    };
+
+    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* Get the QAPI object */
+    crumpled = qdict_crumple(qdict, errp);
+    if (crumpled == NULL) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    v = qobject_input_visitor_new_keyval(crumpled);
+    visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
+    visit_free(v);
+    qobject_decref(crumpled);
+
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    assert(create_options->driver == BLOCKDEV_DRIVER_SHEEPDOG);
+    create_options->u.sheepdog.size =
+        ROUND_UP(create_options->u.sheepdog.size, BDRV_SECTOR_SIZE);
+
+    if (redundancy) {
+        create_options->u.sheepdog.has_redundancy = true;
+        create_options->u.sheepdog.redundancy =
+            parse_redundancy_str(redundancy);
+        if (create_options->u.sheepdog.redundancy == NULL) {
+            error_setg(errp, "Invalid redundancy mode");
+            ret = -EINVAL;
+            goto fail;
+        }
+    }
+
+    ret = sd_co_create(create_options, errp);
+fail:
+    qapi_free_BlockdevCreateOptions(create_options);
+    QDECREF(qdict);
+    return ret;
+}
+
 static void sd_close(BlockDriverState *bs)
 {
    Error *local_err = NULL;
@@ -2173,15 +2312,16 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset,
    int ret, fd;
    unsigned int datalen;
    uint64_t max_vdi_size;
+    int64_t old_size = s->inode.vdi_size;

-    if (prealloc != PREALLOC_MODE_OFF) {
+    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_FULL) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
-    if (offset < s->inode.vdi_size) {
+    if (offset < old_size) {
        error_setg(errp, "shrinking is not supported");
        return -EINVAL;
    } else if (offset > max_vdi_size) {
@@ -2204,9 +2344,17 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset,

    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to update an inode");
+        return ret;
    }

+    if (prealloc == PREALLOC_MODE_FULL) {
+        ret = sd_prealloc(bs, old_size, offset, errp);
+        if (ret < 0) {
            return ret;
+        }
+    }
+
+    return 0;
 }

 /*
@@ -2988,19 +3136,19 @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
    return acb.ret;
 }

-static coroutine_fn int64_t
-sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                       int *pnum, BlockDriverState **file)
+static coroutine_fn int
+sd_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
+                   int64_t bytes, int64_t *pnum, int64_t *map,
+                   BlockDriverState **file)
 {
    BDRVSheepdogState *s = bs->opaque;
    SheepdogInode *inode = &s->inode;
    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
-    uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
    unsigned long start = offset / object_size,
-                  end = DIV_ROUND_UP((sector_num + nb_sectors) *
-                                     BDRV_SECTOR_SIZE, object_size);
+                  end = DIV_ROUND_UP(offset + bytes, object_size);
    unsigned long idx;
-    int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
+    *map = offset;
+    int ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;

    for (idx = start; idx < end; idx++) {
        if (inode->data_vdi_id[idx] == 0) {
@@ -3017,9 +3165,9 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
        }
    }

-    *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
-    if (*pnum > nb_sectors) {
-        *pnum = nb_sectors;
+    *pnum = (idx - start) * object_size;
+    if (*pnum > bytes) {
+        *pnum = bytes;
    }
    if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
        *file = bs;
@@ -3087,7 +3235,8 @@ static BlockDriver bdrv_sheepdog = {
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
-    .bdrv_create    = sd_create,
+    .bdrv_co_create               = sd_co_create,
+    .bdrv_co_create_opts          = sd_co_create_opts,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
@@ -3097,7 +3246,7 @@ static BlockDriver bdrv_sheepdog = {
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
-    .bdrv_co_get_block_status = sd_co_get_block_status,
+    .bdrv_co_block_status         = sd_co_block_status,

    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
@@ -3123,7 +3272,8 @@ static BlockDriver bdrv_sheepdog_tcp = {
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
-    .bdrv_create    = sd_create,
+    .bdrv_co_create               = sd_co_create,
+    .bdrv_co_create_opts          = sd_co_create_opts,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
@@ -3133,7 +3283,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
-    .bdrv_co_get_block_status = sd_co_get_block_status,
+    .bdrv_co_block_status         = sd_co_block_status,

    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
@@ -3159,7 +3309,8 @@ static BlockDriver bdrv_sheepdog_unix = {
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
-    .bdrv_create    = sd_create,
+    .bdrv_co_create               = sd_co_create,
+    .bdrv_co_create_opts          = sd_co_create_opts,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
@@ -3169,7 +3320,7 @@ static BlockDriver bdrv_sheepdog_unix = {
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
-    .bdrv_co_get_block_status = sd_co_get_block_status,
+    .bdrv_co_block_status         = sd_co_block_status,

    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .11.0
 .11.50