spice: Disallow use of gl + TCP port

Currently, virgl support has to go through a local unix socket, trying to connect to a VM using -spice gl through spice://localhost:5900 will only result in a black screen. This commit errors out when the user tries to start a VM with both GL support and a port/tls-port set. This would fit better in spice-server, but currently QEMU does not call into spice-server when parsing 'gl' on its command line, so we have to do this check in QEMU instead. Signed-off-by: Christophe Fergeau <cfergeau@redhat.com> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com> Message-id: 1457955672-28758-1-git-send-email-cfergeau@redhat.com [ applied codestyle fix: break long line ] Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
input-linux: fix Coverity warning
2016-03-24 08:04:01 +01:00 · 2016-03-24 07:58:20 +01:00 · 2016-03-24 07:58:20 +01:00 · 2016-03-23 12:57:44 +00:00 · 2016-03-22 20:27:55 +00:00 · 2016-03-22 19:17:38 +01:00
1389 changed files with 50641 additions and 18574 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,9 +1,38 @@
+sudo: false
 language: c
 python:
  - "2.4"
 compiler:
  - gcc
  - clang
+cache: ccache
+addons:
+  apt:
+    packages:
+      - libaio-dev
+      - libattr1-dev
+      - libbrlapi-dev
+      - libcap-ng-dev
+      - libgnutls-dev
+      - libgtk-3-dev
+      - libiscsi-dev
+      - liblttng-ust-dev
+      - libncurses5-dev
+      - libnss3-dev
+      - libpixman-1-dev
+      - libpng12-dev
+      - librados-dev
+      - libsdl1.2-dev
+      - libseccomp-dev
+      - libspice-protocol-dev
+      - libspice-server-dev
+      - libssh2-1-dev
+      - liburcu-dev
+      - libusb-1.0-0-dev
+      - libvte-2.90-dev
+      - sparse
+      - uuid-dev
+
 notifications:
  irc:
    channels:
@@ -12,29 +41,16 @@ notifications:
    on_failure: always
 env:
  global:
-    - TEST_CMD=""
+    - TEST_CMD="make check"
    - EXTRA_CONFIG=""
-    # Development packages, EXTRA_PKGS saved for additional builds
-    - CORE_PKGS="libusb-1.0-0-dev libiscsi-dev librados-dev libncurses5-dev"
-    - NET_PKGS="libseccomp-dev libgnutls-dev libssh2-1-dev  libspice-server-dev libspice-protocol-dev libnss3-dev"
-    - GUI_PKGS="libgtk-3-dev libvte-2.90-dev libsdl1.2-dev libpng12-dev libpixman-1-dev"
-    - EXTRA_PKGS=""
  matrix:
    # Group major targets together with their linux-user counterparts
-    - TARGETS=alpha-softmmu,alpha-linux-user
+    - TARGETS=alpha-softmmu,alpha-linux-user,cris-softmmu,cris-linux-user,m68k-softmmu,m68k-linux-user,microblaze-softmmu,microblazeel-softmmu,microblaze-linux-user,microblazeel-linux-user
    - TARGETS=arm-softmmu,arm-linux-user,armeb-linux-user,aarch64-softmmu,aarch64-linux-user
-    - TARGETS=cris-softmmu,cris-linux-user
    - TARGETS=i386-softmmu,i386-linux-user,x86_64-softmmu,x86_64-linux-user
-    - TARGETS=m68k-softmmu,m68k-linux-user
-    - TARGETS=microblaze-softmmu,microblazeel-softmmu,microblaze-linux-user,microblazeel-linux-user
-    - TARGETS=mips-softmmu,mips64-softmmu,mips64el-softmmu,mipsel-softmmu
-    - TARGETS=mips-linux-user,mips64-linux-user,mips64el-linux-user,mipsel-linux-user,mipsn32-linux-user,mipsn32el-linux-user
-    - TARGETS=or32-softmmu,or32-linux-user
-    - TARGETS=ppc-softmmu,ppc64-softmmu,ppcemb-softmmu,ppc-linux-user,ppc64-linux-user,ppc64abi32-linux-user,ppc64le-linux-user
-    - TARGETS=s390x-softmmu,s390x-linux-user
-    - TARGETS=sh4-softmmu,sh4eb-softmmu,sh4-linux-user sh4eb-linux-user
-    - TARGETS=sparc-softmmu,sparc64-softmmu,sparc-linux-user,sparc32plus-linux-user,sparc64-linux-user
-    - TARGETS=unicore32-softmmu,unicore32-linux-user
+    - TARGETS=mips-softmmu,mips64-softmmu,mips64el-softmmu,mipsel-softmmu,mips-linux-user,mips64-linux-user,mips64el-linux-user,mipsel-linux-user,mipsn32-linux-user,mipsn32el-linux-user
+    - TARGETS=or32-softmmu,or32-linux-user,ppc-softmmu,ppc64-softmmu,ppcemb-softmmu,ppc-linux-user,ppc64-linux-user,ppc64abi32-linux-user,ppc64le-linux-user
+    - TARGETS=s390x-softmmu,s390x-linux-user,sh4-softmmu,sh4eb-softmmu,sh4-linux-user,sh4eb-linux-user,sparc-softmmu,sparc64-softmmu,sparc-linux-user,sparc32plus-linux-user,sparc64-linux-user,unicore32-softmmu,unicore32-linux-user
    # Group remaining softmmu only targets into one build
    - TARGETS=lm32-softmmu,moxie-softmmu,tricore-softmmu,xtensa-softmmu,xtensaeb-softmmu
 git:
@@ -43,8 +59,6 @@ git:
 before_install:
  - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
  - git submodule update --init --recursive
-  - sudo apt-get update -qq
-  - sudo apt-get install -qq ${CORE_PKGS} ${NET_PKGS} ${GUI_PKGS} ${EXTRA_PKGS}
 before_script:
  - ./configure --target-list=${TARGETS} --enable-debug-tcg ${EXTRA_CONFIG}
 script:
@@ -52,44 +66,59 @@ script:
 matrix:
  # We manually include a number of additional build for non-standard bits
  include:
-    # Make check target (we only do this once)
-    - env:
-        - TARGETS=alpha-softmmu,arm-softmmu,aarch64-softmmu,cris-softmmu,i386-softmmu,x86_64-softmmu,m68k-softmmu,microblaze-softmmu,microblazeel-softmmu,mips-softmmu,mips64-softmmu,mips64el-softmmu,mipsel-softmmu,or32-softmmu,ppc-softmmu,ppc64-softmmu,ppcemb-softmmu,s390x-softmmu,sh4-softmmu,sh4eb-softmmu,sparc-softmmu,sparc64-softmmu,unicore32-softmmu,unicore32-linux-user,lm32-softmmu,moxie-softmmu,tricore-softmmu,xtensa-softmmu,xtensaeb-softmmu
-          TEST_CMD="make check"
-      compiler: gcc
    # Debug related options
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
+    - env: TARGETS=x86_64-softmmu
           EXTRA_CONFIG="--enable-debug"
      compiler: gcc
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
+    # We currently disable "make check"
+    - env: TARGETS=alpha-softmmu
           EXTRA_CONFIG="--enable-debug --enable-tcg-interpreter"
+           TEST_CMD=""
      compiler: gcc
-    # All the extra -dev packages
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
-           EXTRA_PKGS="libaio-dev libcap-ng-dev libattr1-dev libbrlapi-dev uuid-dev libusb-1.0.0-dev"
+    # Disable a few of the optional features
+    - env: TARGETS=x86_64-softmmu
+           EXTRA_CONFIG="--disable-linux-aio --disable-cap-ng --disable-attr --disable-brlapi --disable-uuid --disable-libusb"
      compiler: gcc
    # Currently configure doesn't force --disable-pie
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
+    - env: TARGETS=x86_64-softmmu
           EXTRA_CONFIG="--enable-gprof --enable-gcov --disable-pie"
      compiler: gcc
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
-           EXTRA_PKGS="sparse"
+    # Sparse
+    - env: TARGETS=x86_64-softmmu
           EXTRA_CONFIG="--enable-sparse"
      compiler: gcc
-    # All the trace backends (apart from dtrace)
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
-           EXTRA_CONFIG="--enable-trace-backends=stderr"
-      compiler: gcc
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
-           EXTRA_CONFIG="--enable-trace-backends=simple"
-      compiler: gcc
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
-           EXTRA_CONFIG="--enable-trace-backends=ftrace"
-      compiler: gcc
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
-          EXTRA_PKGS="liblttng-ust-dev liburcu-dev"
-          EXTRA_CONFIG="--enable-trace-backends=ust"
-      compiler: gcc
-    - env: TARGETS=i386-softmmu,x86_64-softmmu
+    # Modules
+    - env: TARGETS=arm-softmmu,x86_64-softmmu
           EXTRA_CONFIG="--enable-modules"
      compiler: gcc
+    # All the trace backends (apart from dtrace)
+    - env: TARGETS=i386-softmmu
+           EXTRA_CONFIG="--enable-trace-backends=log"
+      compiler: gcc
+    # We currently disable "make check" (until 41fc57e44ed regression fixed)
+    - env: TARGETS=x86_64-softmmu
+           EXTRA_CONFIG="--enable-trace-backends=simple"
+           TEST_CMD=""
+      compiler: gcc
+    # We currently disable "make check"
+    - env: TARGETS=x86_64-softmmu
+           EXTRA_CONFIG="--enable-trace-backends=ftrace"
+           TEST_CMD=""
+      compiler: gcc
+    # We currently disable "make check"
+    - env: TARGETS=x86_64-softmmu
+           EXTRA_CONFIG="--enable-trace-backends=ust"
+           TEST_CMD=""
+      compiler: gcc
+    # All the co-routine backends (apart from windows)
+    # We currently disable "make check"
+    - env: TARGETS=x86_64-softmmu
+           EXTRA_CONFIG="--with-coroutine=gthread"
+           TEST_CMD=""
+      compiler: gcc
+    - env: TARGETS=x86_64-softmmu
+           EXTRA_CONFIG="--with-coroutine=ucontext"
+      compiler: gcc
+    - env: TARGETS=x86_64-softmmu
+           EXTRA_CONFIG="--with-coroutine=sigaltstack"
+      compiler: gcc
--- a/55
+++ b/55
@@ -157,3 +157,58 @@ painful. These are:
 * you may assume that integers are 2s complement representation
 * you may assume that right shift of a signed integer duplicates
   the sign bit (ie it is an arithmetic shift, not a logical shift)
+
+7. Error handling and reporting
+
+7.1 Reporting errors to the human user
+
+Do not use printf(), fprintf() or monitor_printf().  Instead, use
+error_report() or error_vreport() from error-report.h.  This ensures the
+error is reported in the right place (current monitor or stderr), and in
+a uniform format.
+
+Use error_printf() & friends to print additional information.
+
+error_report() prints the current location.  In certain common cases
+like command line parsing, the current location is tracked
+automatically.  To manipulate it manually, use the loc_*() from
+error-report.h.
+
+7.2 Propagating errors
+
+An error can't always be reported to the user right where it's detected,
+but often needs to be propagated up the call chain to a place that can
+handle it.  This can be done in various ways.
+
+The most flexible one is Error objects.  See error.h for usage
+information.
+
+Use the simplest suitable method to communicate success / failure to
+callers.  Stick to common methods: non-negative on success / -1 on
+error, non-negative / -errno, non-null / null, or Error objects.
+
+Example: when a function returns a non-null pointer on success, and it
+can fail only in one way (as far as the caller is concerned), returning
+null on failure is just fine, and certainly simpler and a lot easier on
+the eyes than propagating an Error object through an Error ** parameter.
+
+Example: when a function's callers need to report details on failure
+only the function really knows, use Error **, and set suitable errors.
+
+Do not report an error to the user when you're also returning an error
+for somebody else to handle.  Leave the reporting to the place that
+consumes the error returned.
+
+7.3 Handling errors
+
+Calling exit() is fine when handling configuration errors during
+startup.  It's problematic during normal operation.  In particular,
+monitor commands should never exit().
+
+Do not call exit() or abort() to handle an error that can be triggered
+by the guest (e.g., some unimplemented corner case in guest code
+translation or device emulation).  Guests should not be able to
+terminate QEMU.
+
+Note that &error_fatal is just another way to exit(1), and &error_abort
+is just another way to abort().
--- a/60
+++ b/60
@@ -52,6 +52,11 @@ General Project Administration
 ------------------------------
 M: Peter Maydell <peter.maydell@linaro.org>

+All patches CC here
+L: qemu-devel@nongnu.org
+F: *
+F: */
+
 Responsible Disclosure, Reporting Security Issues
 ------------------------------
 W: http://wiki.qemu.org/SecurityProcess
@@ -79,6 +84,13 @@ F: include/exec/exec-all.h
 F: include/exec/helper*.h
 F: include/exec/tb-hash.h

+FPU emulation
+M: Aurelien Jarno <aurelien@aurel32.net>
+M: Peter Maydell <peter.maydell@linaro.org>
+S: Odd Fixes
+F: fpu/
+F: include/fpu/
+
 Alpha
 M: Richard Henderson <rth@twiddle.net>
 S: Maintained
@@ -222,6 +234,7 @@ L: kvm@vger.kernel.org
 S: Supported
 F: kvm-*
 F: */kvm.*
+F: include/sysemu/kvm*.h

 ARM
 M: Peter Maydell <peter.maydell@linaro.org>
@@ -351,6 +364,7 @@ M: Dmitry Solodkiy <d.solodkiy@samsung.com>
 L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/exynos*
+F: include/hw/arm/exynos4210.h

 Calxeda Highbank
 M: Rob Herring <robh@kernel.org>
@@ -378,6 +392,7 @@ L: qemu-arm@nongnu.org
 S: Odd fixes
 F: hw/*/imx*
 F: hw/arm/kzm.c
+F: include/hw/arm/fsl-imx31.h

 Integrator CP
 M: Peter Maydell <peter.maydell@linaro.org>
@@ -420,6 +435,7 @@ F: hw/arm/spitz.c
 F: hw/arm/tosa.c
 F: hw/arm/z2.c
 F: hw/*/pxa2xx*
+F: include/hw/arm/pxa.h

 Stellaris
 M: Peter Maydell <peter.maydell@linaro.org>
@@ -641,12 +657,6 @@ F: hw/*/grlib*

 S390 Machines
 -------------
-S390 Virtio
-M: Alexander Graf <agraf@suse.de>
-S: Maintained
-F: hw/s390x/s390-*.c
-X: hw/s390x/*pci*.[hc]
-
 S390 Virtio-ccw
 M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Christian Borntraeger <borntraeger@de.ibm.com>
@@ -654,7 +664,6 @@ M: Alexander Graf <agraf@suse.de>
 S: Supported
 F: hw/char/sclp*.[hc]
 F: hw/s390x/
-X: hw/s390x/s390-virtio-bus.[ch]
 F: include/hw/s390x/
 F: pc-bios/s390-ccw/
 F: hw/watchdog/wdt_diag288.c
@@ -708,6 +717,12 @@ F: hw/timer/hpet*
 F: hw/timer/i8254*
 F: hw/timer/mc146818rtc*

+Machine core
+M: Eduardo Habkost <ehabkost@redhat.com>
+M: Marcel Apfelbaum <marcel@redhat.com>
+S: Supported
+F: hw/core/machine.c
+F: include/hw/boards.h

 Xtensa Machines
 ---------------
@@ -756,6 +771,7 @@ OMAP
 M: Peter Maydell <peter.maydell@linaro.org>
 S: Maintained
 F: hw/*/omap*
+F: include/hw/arm/omap.h

 IPack
 M: Alberto Garcia <berto@igalia.com>
@@ -841,6 +857,10 @@ M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
 F: hw/usb/*
 F: tests/usb-*-test.c
+F: docs/usb2.txt
+F: docs/usb-storage.txt
+F: include/hw/usb.h
+F: include/hw/usb/

 USB (serial adapter)
 M: Gerd Hoffmann <kraxel@redhat.com>
@@ -852,6 +872,7 @@ VFIO
 M: Alex Williamson <alex.williamson@redhat.com>
 S: Supported
 F: hw/vfio/*
+F: include/hw/vfio/

 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
@@ -863,6 +884,7 @@ M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
 F: hw/*/virtio*
 F: net/vhost-user.c
+F: include/hw/virtio/

 virtio-9p
 M: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
@@ -908,6 +930,7 @@ M: Amit Shah <amit.shah@redhat.com>
 S: Supported
 F: hw/virtio/virtio-rng.c
 F: include/hw/virtio/virtio-rng.h
+F: include/sysemu/rng*.h
 F: backends/rng*.c

 nvme
@@ -993,7 +1016,7 @@ F: blockjob.c
 F: include/block/blockjob.h
 F: block/backup.c
 F: block/commit.c
-F: block/stream.h
+F: block/stream.c
 F: block/mirror.c
 T: git git://github.com/codyprime/qemu-kvm-jtc.git block

@@ -1071,6 +1094,7 @@ SPICE
 M: Gerd Hoffmann <kraxel@redhat.com>
 S: Supported
 F: include/ui/qemu-spice.h
+F: include/ui/spice-display.h
 F: ui/spice-*.c
 F: audio/spiceaudio.c
 F: hw/display/qxl*
@@ -1079,6 +1103,7 @@ Graphics
 M: Gerd Hoffmann <kraxel@redhat.com>
 S: Odd Fixes
 F: ui/
+F: include/ui/

 Cocoa graphics
 M: Andreas Färber <andreas.faerber@web.de>
@@ -1106,6 +1131,7 @@ Network device backends
 M: Jason Wang <jasowang@redhat.com>
 S: Maintained
 F: net/
+F: include/net/
 T: git git://github.com/jasowang/qemu.git net

 Netmap network backend
@@ -1201,10 +1227,12 @@ F: scripts/qmp/
 T: git git://repo.or.cz/qemu/armbru.git qapi-next

 SLIRP
+M: Samuel Thibault <samuel.thibault@ens-lyon.org>
 M: Jan Kiszka <jan.kiszka@siemens.com>
 S: Maintained
 F: slirp/
 F: net/slirp.c
+F: include/net/slirp.h
 T: git git://git.kiszka.org/qemu.git queues/slirp

 Tracing
@@ -1229,6 +1257,7 @@ F: include/migration/
 F: migration/
 F: scripts/vmstate-static-checker.py
 F: tests/vmstate-static-checker-data/
+F: docs/migration.txt

 Seccomp
 M: Eduardo Otubo <eduardo.otubo@profitbricks.com>
@@ -1271,6 +1300,15 @@ S: Maintained
 F: include/qemu/sockets.h
 F: util/qemu-sockets.c

+Throttling infrastructure
+M: Alberto Garcia <berto@igalia.com>
+S: Supported
+F: block/throttle-groups.c
+F: include/block/throttle-groups.h
+F: include/qemu/throttle.h
+F: util/throttle.c
+L: qemu-block@nongnu.org
+
 Usermode Emulation
 ------------------
 Overall
@@ -1566,6 +1604,12 @@ L: qemu-block@nongnu.org
 S: Supported
 F: tests/image-fuzzer/

+Build and test automation
+-------------------------
+M: Alex Bennée <alex.bennee@linaro.org>
+L: qemu-devel@nongnu.org
+S: Supported
+F: .travis.yml

 Documentation
 -------------
--- a/16
+++ b/16
@@ -234,11 +234,11 @@ util/module.o-cflags = -D'CONFIG_BLOCK_MODULES=$(block-modules)'

 qemu-img.o: qemu-img-cmds.h

-qemu-img$(EXESUF): qemu-img.o $(block-obj-y) $(crypto-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a
-qemu-nbd$(EXESUF): qemu-nbd.o $(block-obj-y) $(crypto-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a
-qemu-io$(EXESUF): qemu-io.o $(block-obj-y) $(crypto-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a
+qemu-img$(EXESUF): qemu-img.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a
+qemu-nbd$(EXESUF): qemu-nbd.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a
+qemu-io$(EXESUF): qemu-io.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a

-qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o
+qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o libqemuutil.a libqemustub.a

 fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal.o fsdev/9p-iov-marshal.o libqemuutil.a libqemustub.a
 fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap
@@ -272,7 +272,8 @@ $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
 qapi-modules = $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/qapi/common.json \
               $(SRC_PATH)/qapi/block.json $(SRC_PATH)/qapi/block-core.json \
               $(SRC_PATH)/qapi/event.json $(SRC_PATH)/qapi/introspect.json \
-               $(SRC_PATH)/qapi/crypto.json
+               $(SRC_PATH)/qapi/crypto.json $(SRC_PATH)/qapi/rocker.json \
+               $(SRC_PATH)/qapi/trace.json

 qapi-types.c qapi-types.h :\
 $(qapi-modules) $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
@@ -328,7 +329,7 @@ ifneq ($(EXESUF),)
 qemu-ga: qemu-ga$(EXESUF) $(QGA_VSS_PROVIDER) $(QEMU_GA_MSI)
 endif

-ivshmem-client$(EXESUF): $(ivshmem-client-obj-y)
+ivshmem-client$(EXESUF): $(ivshmem-client-obj-y) libqemuutil.a libqemustub.a
 	$(call LINK, $^)
 ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) libqemuutil.a libqemustub.a
 	$(call LINK, $^)
@@ -390,7 +391,7 @@ bepo    cz
 ifdef INSTALL_BLOBS
 BLOBS=bios.bin bios-256k.bin sgabios.bin vgabios.bin vgabios-cirrus.bin \
 vgabios-stdvga.bin vgabios-vmware.bin vgabios-qxl.bin vgabios-virtio.bin \
-acpi-dsdt.aml q35-acpi-dsdt.aml \
+acpi-dsdt.aml \
 ppc_rom.bin openbios-sparc32 openbios-sparc64 openbios-ppc QEMU,tcx.bin QEMU,cgthree.bin \
 pxe-e1000.rom pxe-eepro100.rom pxe-ne2k_pci.rom \
 pxe-pcnet.rom pxe-rtl8139.rom pxe-virtio.rom \
@@ -399,7 +400,6 @@ efi-pcnet.rom efi-rtl8139.rom efi-virtio.rom \
 qemu-icon.bmp qemu_logo_no_text.svg \
 bamboo.dtb petalogix-s3adsp1800.dtb petalogix-ml605.dtb \
 multiboot.bin linuxboot.bin kvmvapic.bin \
-s390-zipl.rom \
 s390-ccw.img \
 spapr-rtas.bin slof.bin \
 palcode-clipper \
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -1,6 +1,6 @@
 #######################################################################
 # Common libraries for tools and emulators
-stub-obj-y = stubs/
+stub-obj-y = stubs/ crypto/
 util-obj-y = util/ qobject/ qapi/
 util-obj-y += qmp-introspect.o qapi-types.o qapi-visit.o qapi-event.o

@@ -89,7 +89,6 @@ endif

 #######################################################################
 # Target-independent parts used in system and user emulation
-common-obj-y += qemu-log.o
 common-obj-y += tcg-runtime.o
 common-obj-y += hw/
 common-obj-y += qom/
--- a/accel.c
+++ b/accel.c
@@ -23,6 +23,7 @@
 * THE SOFTWARE.
 */

+#include "qemu/osdep.h"
 #include "sysemu/accel.h"
 #include "hw/boards.h"
 #include "qemu-common.h"
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -13,11 +13,12 @@
 * GNU GPL, version 2 or (at your option) any later version.
 */

+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
-#ifdef CONFIG_EPOLL
+#ifdef CONFIG_EPOLL_CREATE1
 #include <sys/epoll.h>
 #endif

@@ -32,7 +33,7 @@ struct AioHandler
    QLIST_ENTRY(AioHandler) node;
 };

-#ifdef CONFIG_EPOLL
+#ifdef CONFIG_EPOLL_CREATE1

 /* The fd number threashold to switch to epoll */
 #define EPOLL_ENABLE_THRESHOLD 64
@@ -482,7 +483,7 @@ bool aio_poll(AioContext *ctx, bool blocking)

 void aio_context_setup(AioContext *ctx, Error **errp)
 {
-#ifdef CONFIG_EPOLL
+#ifdef CONFIG_EPOLL_CREATE1
    assert(!ctx->epollfd);
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -15,6 +15,7 @@
 * GNU GPL, version 2 or (at your option) any later version.
 */

+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block.h"
 #include "qemu/queue.h"
--- a/arch_init.c
+++ b/arch_init.c
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <stdint.h>
+#include "qemu/osdep.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/arch_init.h"
 #include "hw/pci/pci.h"
--- a/async.c
+++ b/async.c
@@ -22,6 +22,7 @@
 * THE SOFTWARE.
 */

+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/aio.h"
 #include "block/thread-pool.h"
--- a/audio/audio.h
+++ b/audio/audio.h
@@ -24,7 +24,6 @@
 #ifndef QEMU_AUDIO_H
 #define QEMU_AUDIO_H

-#include "config-host.h"
 #include "qemu/queue.h"

 typedef void (*audio_callback_fn) (void *opaque, int avail);
--- a/backends/baum.c
+++ b/backends/baum.c
@@ -21,6 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "sysemu/char.h"
 #include "qemu/timer.h"
@@ -566,7 +567,7 @@ static CharDriverState *chr_baum_init(const char *id,
                                      ChardevReturn *ret,
                                      Error **errp)
 {
-    ChardevCommon *common = qapi_ChardevDummy_base(backend->u.braille);
+    ChardevCommon *common = backend->u.braille.data;
    BaumDriverState *baum;
    CharDriverState *chr;
    brlapi_handle_t *handle;
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -9,6 +9,7 @@
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "sysemu/hostmem.h"
 #include "sysemu/sysemu.h"
--- a/backends/hostmem-ram.c
+++ b/backends/hostmem-ram.c
@@ -9,6 +9,7 @@
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
+#include "qemu/osdep.h"
 #include "sysemu/hostmem.h"
 #include "qom/object_interfaces.h"

--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -9,6 +9,7 @@
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
+#include "qemu/osdep.h"
 #include "sysemu/hostmem.h"
 #include "hw/boards.h"
 #include "qapi/visitor.h"
@@ -26,18 +27,18 @@ QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
 #endif

 static void
-host_memory_backend_get_size(Object *obj, Visitor *v, void *opaque,
-                             const char *name, Error **errp)
+host_memory_backend_get_size(Object *obj, Visitor *v, const char *name,
+                             void *opaque, Error **errp)
 {
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint64_t value = backend->size;

-    visit_type_size(v, &value, name, errp);
+    visit_type_size(v, name, &value, errp);
 }

 static void
-host_memory_backend_set_size(Object *obj, Visitor *v, void *opaque,
-                             const char *name, Error **errp)
+host_memory_backend_set_size(Object *obj, Visitor *v, const char *name,
+                             void *opaque, Error **errp)
 {
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    Error *local_err = NULL;
@@ -48,7 +49,7 @@ host_memory_backend_set_size(Object *obj, Visitor *v, void *opaque,
        goto out;
    }

-    visit_type_size(v, &value, name, &local_err);
+    visit_type_size(v, name, &value, &local_err);
    if (local_err) {
        goto out;
    }
@@ -63,8 +64,8 @@ out:
 }

 static void
-host_memory_backend_get_host_nodes(Object *obj, Visitor *v, void *opaque,
-                                   const char *name, Error **errp)
+host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
+                                   void *opaque, Error **errp)
 {
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint16List *host_nodes = NULL;
@@ -91,18 +92,18 @@ host_memory_backend_get_host_nodes(Object *obj, Visitor *v, void *opaque,
        node = &(*node)->next;
    } while (true);

-    visit_type_uint16List(v, &host_nodes, name, errp);
+    visit_type_uint16List(v, name, &host_nodes, errp);
 }

 static void
-host_memory_backend_set_host_nodes(Object *obj, Visitor *v, void *opaque,
-                                   const char *name, Error **errp)
+host_memory_backend_set_host_nodes(Object *obj, Visitor *v, const char *name,
+                                   void *opaque, Error **errp)
 {
 #ifdef CONFIG_NUMA
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint16List *l = NULL;

-    visit_type_uint16List(v, &l, name, errp);
+    visit_type_uint16List(v, name, &l, errp);

    while (l) {
        bitmap_set(backend->host_nodes, l->value, 1);
--- a/backends/msmouse.c
+++ b/backends/msmouse.c
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <stdlib.h>
+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "sysemu/char.h"
 #include "ui/console.h"
@@ -68,7 +68,7 @@ static CharDriverState *qemu_chr_open_msmouse(const char *id,
                                              ChardevReturn *ret,
                                              Error **errp)
 {
-    ChardevCommon *common = qapi_ChardevDummy_base(backend->u.msmouse);
+    ChardevCommon *common = backend->u.msmouse.data;
    CharDriverState *chr;

    chr = qemu_chr_alloc(common, errp);
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@@ -10,6 +10,7 @@
 * See the COPYING file in the top-level directory.
 */

+#include "qemu/osdep.h"
 #include "sysemu/rng.h"
 #include "sysemu/char.h"
 #include "qapi/qmp/qerror.h"
@@ -24,33 +25,12 @@ typedef struct RngEgd

    CharDriverState *chr;
    char *chr_name;
-
-    GSList *requests;
 } RngEgd;

-typedef struct RngRequest
-{
-    EntropyReceiveFunc *receive_entropy;
-    uint8_t *data;
-    void *opaque;
-    size_t offset;
-    size_t size;
-} RngRequest;
-
-static void rng_egd_request_entropy(RngBackend *b, size_t size,
-                                    EntropyReceiveFunc *receive_entropy,
-                                    void *opaque)
+static void rng_egd_request_entropy(RngBackend *b, RngRequest *req)
 {
    RngEgd *s = RNG_EGD(b);
-    RngRequest *req;
-
-    req = g_malloc(sizeof(*req));
-
-    req->offset = 0;
-    req->size = size;
-    req->receive_entropy = receive_entropy;
-    req->opaque = opaque;
-    req->data = g_malloc(req->size);
+    size_t size = req->size;

    while (size > 0) {
        uint8_t header[2];
@@ -64,24 +44,15 @@ static void rng_egd_request_entropy(RngBackend *b, size_t size,

        size -= len;
    }
-
-    s->requests = g_slist_append(s->requests, req);
-}
-
-static void rng_egd_free_request(RngRequest *req)
-{
-    g_free(req->data);
-    g_free(req);
 }

 static int rng_egd_chr_can_read(void *opaque)
 {
    RngEgd *s = RNG_EGD(opaque);
-    GSList *i;
+    RngRequest *req;
    int size = 0;

-    for (i = s->requests; i; i = i->next) {
-        RngRequest *req = i->data;
+    QSIMPLEQ_FOREACH(req, &s->parent.requests, next) {
        size += req->size - req->offset;
    }

@@ -93,8 +64,8 @@ static void rng_egd_chr_read(void *opaque, const uint8_t *buf, int size)
    RngEgd *s = RNG_EGD(opaque);
    size_t buf_offset = 0;

-    while (size > 0 && s->requests) {
-        RngRequest *req = s->requests->data;
+    while (size > 0 && !QSIMPLEQ_EMPTY(&s->parent.requests)) {
+        RngRequest *req = QSIMPLEQ_FIRST(&s->parent.requests);
        int len = MIN(size, req->size - req->offset);

        memcpy(req->data + req->offset, buf + buf_offset, len);
@@ -103,38 +74,13 @@ static void rng_egd_chr_read(void *opaque, const uint8_t *buf, int size)
        size -= len;

        if (req->offset == req->size) {
-            s->requests = g_slist_remove_link(s->requests, s->requests);
-
            req->receive_entropy(req->opaque, req->data, req->size);

-            rng_egd_free_request(req);
+            rng_backend_finalize_request(&s->parent, req);
        }
    }
 }

-static void rng_egd_free_requests(RngEgd *s)
-{
-    GSList *i;
-
-    for (i = s->requests; i; i = i->next) {
-        rng_egd_free_request(i->data);
-    }
-
-    g_slist_free(s->requests);
-    s->requests = NULL;
-}
-
-static void rng_egd_cancel_requests(RngBackend *b)
-{
-    RngEgd *s = RNG_EGD(b);
-
-    /* We simply delete the list of pending requests.  If there is data in the 
-     * queue waiting to be read, this is okay, because there will always be
-     * more data than we requested originally
-     */
-    rng_egd_free_requests(s);
-}
-
 static void rng_egd_opened(RngBackend *b, Error **errp)
 {
    RngEgd *s = RNG_EGD(b);
@@ -203,8 +149,6 @@ static void rng_egd_finalize(Object *obj)
    }

    g_free(s->chr_name);
-
-    rng_egd_free_requests(s);
 }

 static void rng_egd_class_init(ObjectClass *klass, void *data)
@@ -212,7 +156,6 @@ static void rng_egd_class_init(ObjectClass *klass, void *data)
    RngBackendClass *rbc = RNG_BACKEND_CLASS(klass);

    rbc->request_entropy = rng_egd_request_entropy;
-    rbc->cancel_requests = rng_egd_cancel_requests;
    rbc->opened = rng_egd_opened;
 }

--- a/backends/rng-random.c
+++ b/backends/rng-random.c
@@ -10,6 +10,7 @@
 * See the COPYING file in the top-level directory.
 */

+#include "qemu/osdep.h"
 #include "sysemu/rng-random.h"
 #include "sysemu/rng.h"
 #include "qapi/qmp/qerror.h"
@@ -21,10 +22,6 @@ struct RndRandom

    int fd;
    char *filename;
-
-    EntropyReceiveFunc *receive_func;
-    void *opaque;
-    size_t size;
 };

 /**
@@ -37,36 +34,35 @@ struct RndRandom
 static void entropy_available(void *opaque)
 {
    RndRandom *s = RNG_RANDOM(opaque);
-    uint8_t buffer[s->size];
-    ssize_t len;

-    len = read(s->fd, buffer, s->size);
-    if (len < 0 && errno == EAGAIN) {
-        return;
+    while (!QSIMPLEQ_EMPTY(&s->parent.requests)) {
+        RngRequest *req = QSIMPLEQ_FIRST(&s->parent.requests);
+        ssize_t len;
+
+        len = read(s->fd, req->data, req->size);
+        if (len < 0 && errno == EAGAIN) {
+            return;
+        }
+        g_assert(len != -1);
+
+        req->receive_entropy(req->opaque, req->data, len);
+
+        rng_backend_finalize_request(&s->parent, req);
    }
-    g_assert(len != -1);
-
-    s->receive_func(s->opaque, buffer, len);
-    s->receive_func = NULL;

+    /* We've drained all requests, the fd handler can be reset. */
    qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
 }

-static void rng_random_request_entropy(RngBackend *b, size_t size,
-                                        EntropyReceiveFunc *receive_entropy,
-                                        void *opaque)
+static void rng_random_request_entropy(RngBackend *b, RngRequest *req)
 {
    RndRandom *s = RNG_RANDOM(b);

-    if (s->receive_func) {
-        s->receive_func(s->opaque, NULL, 0);
+    if (QSIMPLEQ_EMPTY(&s->parent.requests)) {
+        /* If there are no pending requests yet, we need to
+         * install our fd handler. */
+        qemu_set_fd_handler(s->fd, entropy_available, NULL, s);
    }
-
-    s->receive_func = receive_entropy;
-    s->opaque = opaque;
-    s->size = size;
-
-    qemu_set_fd_handler(s->fd, entropy_available, NULL, s);
 }

 static void rng_random_opened(RngBackend *b, Error **errp)
--- a/backends/rng.c
+++ b/backends/rng.c
@@ -10,6 +10,7 @@
 * See the COPYING file in the top-level directory.
 */

+#include "qemu/osdep.h"
 #include "sysemu/rng.h"
 #include "qapi/qmp/qerror.h"
 #include "qom/object_interfaces.h"
@@ -19,18 +20,20 @@ void rng_backend_request_entropy(RngBackend *s, size_t size,
                                 void *opaque)
 {
    RngBackendClass *k = RNG_BACKEND_GET_CLASS(s);
+    RngRequest *req;

    if (k->request_entropy) {
-        k->request_entropy(s, size, receive_entropy, opaque);
-    }
-}
+        req = g_malloc(sizeof(*req));

-void rng_backend_cancel_requests(RngBackend *s)
-{
-    RngBackendClass *k = RNG_BACKEND_GET_CLASS(s);
+        req->offset = 0;
+        req->size = size;
+        req->receive_entropy = receive_entropy;
+        req->opaque = opaque;
+        req->data = g_malloc(req->size);

-    if (k->cancel_requests) {
-        k->cancel_requests(s);
+        k->request_entropy(s, req);
+
+        QSIMPLEQ_INSERT_TAIL(&s->requests, req, next);
    }
 }

@@ -72,14 +75,48 @@ static void rng_backend_prop_set_opened(Object *obj, bool value, Error **errp)
    s->opened = true;
 }

+static void rng_backend_free_request(RngRequest *req)
+{
+    g_free(req->data);
+    g_free(req);
+}
+
+static void rng_backend_free_requests(RngBackend *s)
+{
+    RngRequest *req, *next;
+
+    QSIMPLEQ_FOREACH_SAFE(req, &s->requests, next, next) {
+        rng_backend_free_request(req);
+    }
+
+    QSIMPLEQ_INIT(&s->requests);
+}
+
+void rng_backend_finalize_request(RngBackend *s, RngRequest *req)
+{
+    QSIMPLEQ_REMOVE(&s->requests, req, RngRequest, next);
+    rng_backend_free_request(req);
+}
+
 static void rng_backend_init(Object *obj)
 {
+    RngBackend *s = RNG_BACKEND(obj);
+
+    QSIMPLEQ_INIT(&s->requests);
+
    object_property_add_bool(obj, "opened",
                             rng_backend_prop_get_opened,
                             rng_backend_prop_set_opened,
                             NULL);
 }

+static void rng_backend_finalize(Object *obj)
+{
+    RngBackend *s = RNG_BACKEND(obj);
+
+    rng_backend_free_requests(s);
+}
+
 static void rng_backend_class_init(ObjectClass *oc, void *data)
 {
    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
@@ -92,6 +129,7 @@ static const TypeInfo rng_backend_info = {
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(RngBackend),
    .instance_init = rng_backend_init,
+    .instance_finalize = rng_backend_finalize,
    .class_size = sizeof(RngBackendClass),
    .class_init = rng_backend_class_init,
    .abstract = true,
--- a/backends/testdev.c
+++ b/backends/testdev.c
@@ -23,6 +23,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "sysemu/char.h"

--- a/backends/tpm.c
+++ b/backends/tpm.c
@@ -12,6 +12,7 @@
 * Based on backends/rng.c by Anthony Liguori
 */

+#include "qemu/osdep.h"
 #include "sysemu/tpm_backend.h"
 #include "qapi/qmp/qerror.h"
 #include "sysemu/tpm.h"
--- a/balloon.c
+++ b/balloon.c
@@ -24,6 +24,7 @@
 * THE SOFTWARE.
 */

+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "exec/cpu-common.h"
 #include "sysemu/kvm.h"
--- a/block.c
+++ b/block.c
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include "config-host.h"
+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "trace.h"
 #include "block/block_int.h"
@@ -42,8 +42,6 @@
 #include "block/throttle-groups.h"

 #ifdef CONFIG_BSD
-#include <sys/types.h>
-#include <sys/stat.h>
 #include <sys/ioctl.h>
 #include <sys/queue.h>
 #ifndef __DragonFly__
@@ -55,30 +53,14 @@
 #include <windows.h>
 #endif

-/**
- * A BdrvDirtyBitmap can be in three possible states:
- * (1) successor is NULL and disabled is false: full r/w mode
- * (2) successor is NULL and disabled is true: read only mode ("disabled")
- * (3) successor is set: frozen mode.
- *     A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set,
- *     or enabled. A frozen bitmap can only abdicate() or reclaim().
- */
-struct BdrvDirtyBitmap {
-    HBitmap *bitmap;            /* Dirty sector bitmap implementation */
-    BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
-    char *name;                 /* Optional non-empty unique ID */
-    int64_t size;               /* Size of the bitmap (Number of sectors) */
-    bool disabled;              /* Bitmap is read-only */
-    QLIST_ENTRY(BdrvDirtyBitmap) list;
-};
-
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

-struct BdrvStates bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states);
-
 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

+static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
+    QTAILQ_HEAD_INITIALIZER(all_bdrv_states);
+
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

@@ -87,10 +69,11 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
                             BlockDriverState *parent,
                             const BdrvChildRole *child_role, Error **errp);

-static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
 /* If non-zero, use only whitelisted block drivers */
 static int use_bdrv_whitelist;

+static void bdrv_close(BlockDriverState *bs);
+
 #ifdef _WIN32
 static int is_windows_drive_prefix(const char *filename)
 {
@@ -241,10 +224,7 @@ void bdrv_register(BlockDriver *bdrv)

 BlockDriverState *bdrv_new_root(void)
 {
-    BlockDriverState *bs = bdrv_new();
-
-    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
-    return bs;
+    return bdrv_new();
 }

 BlockDriverState *bdrv_new(void)
@@ -257,19 +237,15 @@ BlockDriverState *bdrv_new(void)
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
-    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

-    return bs;
-}
+    QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);

-void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
-{
-    notifier_list_add(&bs->close_notifiers, notify);
+    return bs;
 }

 BlockDriver *bdrv_find_format(const char *format_name)
@@ -686,13 +662,19 @@ int bdrv_parse_cache_flags(const char *mode, int *flags)
 }

 /*
- * Returns the flags that a temporary snapshot should get, based on the
- * originally requested flags (the originally requested image will have flags
- * like a backing file)
+ * Returns the options and flags that a temporary snapshot should get, based on
+ * the originally requested flags (the originally requested image will have
+ * flags like a backing file)
 */
-static int bdrv_temp_snapshot_flags(int flags)
+static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
+                                       int parent_flags, QDict *parent_options)
 {
-    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
+    *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
+
+    /* For temporary files, unconditional cache=unsafe is fine */
+    qdict_set_default_str(child_options, BDRV_OPT_CACHE_WB, "on");
+    qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
+    qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
 }

 /*
@@ -1190,17 +1172,12 @@ static int bdrv_fill_options(QDict **options, const char *filename,
        }
    }

-    if (runstate_check(RUN_STATE_INMIGRATE)) {
-        *flags |= BDRV_O_INACTIVE;
-    }
-
    return 0;
 }

-static BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
-                                    BlockDriverState *child_bs,
-                                    const char *child_name,
-                                    const BdrvChildRole *child_role)
+BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
+                                  const char *child_name,
+                                  const BdrvChildRole *child_role)
 {
    BdrvChild *child = g_new(BdrvChild, 1);
    *child = (BdrvChild) {
@@ -1209,24 +1186,43 @@ static BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
        .role   = child_role,
    };

-    QLIST_INSERT_HEAD(&parent_bs->children, child, next);
    QLIST_INSERT_HEAD(&child_bs->parents, child, next_parent);

    return child;
 }

+static BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
+                                    BlockDriverState *child_bs,
+                                    const char *child_name,
+                                    const BdrvChildRole *child_role)
+{
+    BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role);
+    QLIST_INSERT_HEAD(&parent_bs->children, child, next);
+    return child;
+}
+
 static void bdrv_detach_child(BdrvChild *child)
 {
-    QLIST_REMOVE(child, next);
+    if (child->next.le_prev) {
+        QLIST_REMOVE(child, next);
+        child->next.le_prev = NULL;
+    }
    QLIST_REMOVE(child, next_parent);
    g_free(child->name);
    g_free(child);
 }

-void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
+void bdrv_root_unref_child(BdrvChild *child)
 {
    BlockDriverState *child_bs;

+    child_bs = child->bs;
+    bdrv_detach_child(child);
+    bdrv_unref(child_bs);
+}
+
+void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
+{
    if (child == NULL) {
        return;
    }
@@ -1235,9 +1231,7 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
        child->bs->inherits_from = NULL;
    }

-    child_bs = child->bs;
-    bdrv_detach_child(child);
-    bdrv_unref(child_bs);
+    bdrv_root_unref_child(child);
 }

 /*
@@ -1427,13 +1421,13 @@ done:
    return c;
 }

-int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
+static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
+                                     QDict *snapshot_options, Error **errp)
 {
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
-    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;
@@ -1467,8 +1461,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
        goto out;
    }

-    /* Prepare a new options QDict for the temporary file */
-    snapshot_options = qdict_new();
+    /* Prepare options QDict for the temporary file */
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
@@ -1480,6 +1473,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &local_err);
+    snapshot_options = NULL;
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
@@ -1488,6 +1482,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
    bdrv_append(bs_snapshot, bs);

 out:
+    QDECREF(snapshot_options);
    g_free(tmp_filename);
    return ret;
 }
@@ -1519,6 +1514,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
    const char *drvname;
    const char *backing;
    Error *local_err = NULL;
+    QDict *snapshot_options = NULL;
    int snapshot_flags = 0;

    assert(pbs);
@@ -1610,7 +1606,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
-            snapshot_flags = bdrv_temp_snapshot_flags(flags);
+            snapshot_options = qdict_new();
+            bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
+                                       flags, options);
            bdrv_backing_options(&flags, options, flags, options);
        }

@@ -1684,9 +1682,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
-            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
-                       "support the option '%s'", drv->format_name,
-                       bdrv_get_device_name(bs), entry->key);
+            error_setg(errp,
+                       "Block format '%s' does not support the option '%s'",
+                       drv->format_name, entry->key);
        }

        ret = -EINVAL;
@@ -1712,7 +1710,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
-        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
+        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options,
+                                        &local_err);
+        snapshot_options = NULL;
        if (local_err) {
            goto close_and_fail;
        }
@@ -1724,6 +1724,7 @@ fail:
    if (file != NULL) {
        bdrv_unref_child(bs, file);
    }
+    QDECREF(snapshot_options);
    QDECREF(bs->explicit_options);
    QDECREF(bs->options);
    QDECREF(options);
@@ -1746,6 +1747,7 @@ close_and_fail:
    } else {
        bdrv_unref(bs);
    }
+    QDECREF(snapshot_options);
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
@@ -2138,13 +2140,11 @@ void bdrv_reopen_abort(BDRVReopenState *reopen_state)
 }


-void bdrv_close(BlockDriverState *bs)
+static void bdrv_close(BlockDriverState *bs)
 {
    BdrvAioNotifier *ban, *ban_next;

-    if (bs->job) {
-        block_job_cancel_sync(bs->job);
-    }
+    assert(!bs->job);

    /* Disable I/O limits and drain all pending throttled requests */
    if (bs->throttle_state) {
@@ -2155,7 +2155,8 @@ void bdrv_close(BlockDriverState *bs)
    bdrv_flush(bs);
    bdrv_drain(bs); /* in case flush left pending I/O */

-    notifier_list_notify(&bs->close_notifiers, bs);
+    bdrv_release_named_dirty_bitmaps(bs);
+    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
@@ -2210,32 +2211,40 @@ void bdrv_close(BlockDriverState *bs)
 void bdrv_close_all(void)
 {
    BlockDriverState *bs;
+    AioContext *aio_context;

-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
+    /* Drop references from requests still in flight, such as canceled block
+     * jobs whose AIO context has not been polled yet */
+    bdrv_drain_all();

-        aio_context_acquire(aio_context);
-        bdrv_close(bs);
-        aio_context_release(aio_context);
+    blk_remove_all_bs();
+    blockdev_close_all_bdrv_states();
+
+    /* Cancel all block jobs */
+    while (!QTAILQ_EMPTY(&all_bdrv_states)) {
+        QTAILQ_FOREACH(bs, &all_bdrv_states, bs_list) {
+            aio_context = bdrv_get_aio_context(bs);
+
+            aio_context_acquire(aio_context);
+            if (bs->job) {
+                block_job_cancel_sync(bs->job);
+                aio_context_release(aio_context);
+                break;
+            }
+            aio_context_release(aio_context);
+        }
+
+        /* All the remaining BlockDriverStates are referenced directly or
+         * indirectly from block jobs, so there needs to be at least one BDS
+         * directly used by a block job */
+        assert(bs);
    }
 }

-/* make a BlockDriverState anonymous by removing from bdrv_state and
- * graph_bdrv_state list.
-   Also, NULL terminate the device_name to prevent double remove */
+/* make a BlockDriverState anonymous by removing from graph_bdrv_state list.
+ * Also, NULL terminate the device_name to prevent double remove */
 void bdrv_make_anon(BlockDriverState *bs)
 {
-    /*
-     * Take care to remove bs from bdrv_states only when it's actually
-     * in it.  Note that bs->device_list.tqe_prev is initially null,
-     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
-     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
-     * resetting it to null on remove.
-     */
-    if (bs->device_list.tqe_prev) {
-        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
-        bs->device_list.tqe_prev = NULL;
-    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
@@ -2262,6 +2271,14 @@ static void change_parent_backing_link(BlockDriverState *from,
 {
    BdrvChild *c, *next;

+    if (from->blk) {
+        /* FIXME We bypass blk_set_bs(), so we need to make these updates
+         * manually. The root problem is not in this change function, but the
+         * existence of BlockDriverState.blk. */
+        to->blk = from->blk;
+        from->blk = NULL;
+    }
+
    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
        assert(c->role != &child_backing);
        c->bs = to;
@@ -2270,13 +2287,6 @@ static void change_parent_backing_link(BlockDriverState *from,
        bdrv_ref(to);
        bdrv_unref(from);
    }
-    if (from->blk) {
-        blk_set_bs(from->blk, to);
-        if (!to->device_list.tqe_prev) {
-            QTAILQ_INSERT_BEFORE(from, to, device_list);
-        }
-        QTAILQ_REMOVE(&bdrv_states, from, device_list);
-    }
 }

 static void swap_feature_fields(BlockDriverState *bs_top,
@@ -2366,13 +2376,14 @@ static void bdrv_delete(BlockDriverState *bs)
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
-    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

+    QTAILQ_REMOVE(&all_bdrv_states, bs, bs_list);
+
    g_free(bs);
 }

@@ -2506,26 +2517,6 @@ ro_cleanup:
    return ret;
 }

-int bdrv_commit_all(void)
-{
-    BlockDriverState *bs;
-
-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
-        aio_context_acquire(aio_context);
-        if (bs->drv && bs->backing) {
-            int ret = bdrv_commit(bs);
-            if (ret < 0) {
-                aio_context_release(aio_context);
-                return ret;
-            }
-        }
-        aio_context_release(aio_context);
-    }
-    return 0;
-}
-
 /*
 * Return values:
 * 0        - success
@@ -2974,12 +2965,23 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
    return QTAILQ_NEXT(bs, node_list);
 }

+/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
+ * the monitor or attached to a BlockBackend */
 BlockDriverState *bdrv_next(BlockDriverState *bs)
 {
-    if (!bs) {
-        return QTAILQ_FIRST(&bdrv_states);
+    if (!bs || bs->blk) {
+        bs = blk_next_root_bs(bs);
+        if (bs) {
+            return bs;
+        }
    }
-    return QTAILQ_NEXT(bs, device_list);
+
+    /* Ignore all BDSs that are attached to a BlockBackend here; they have been
+     * handled by the above block already */
+    do {
+        bs = bdrv_next_monitor_owned(bs);
+    } while (bs && bs->blk);
+    return bs;
 }

 const char *bdrv_get_node_name(const BlockDriverState *bs)
@@ -3287,10 +3289,10 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)

 void bdrv_invalidate_cache_all(Error **errp)
 {
-    BlockDriverState *bs;
+    BlockDriverState *bs = NULL;
    Error *local_err = NULL;

-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+    while ((bs = bdrv_next(bs)) != NULL) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
@@ -3320,10 +3322,10 @@ static int bdrv_inactivate(BlockDriverState *bs)

 int bdrv_inactivate_all(void)
 {
-    BlockDriverState *bs;
+    BlockDriverState *bs = NULL;
    int ret;

-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+    while ((bs = bdrv_next(bs)) != NULL) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
@@ -3410,327 +3412,6 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked)
    }
 }

-BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
-{
-    BdrvDirtyBitmap *bm;
-
-    assert(name);
-    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
-        if (bm->name && !strcmp(name, bm->name)) {
-            return bm;
-        }
-    }
-    return NULL;
-}
-
-void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
-{
-    assert(!bdrv_dirty_bitmap_frozen(bitmap));
-    g_free(bitmap->name);
-    bitmap->name = NULL;
-}
-
-BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
-                                          uint32_t granularity,
-                                          const char *name,
-                                          Error **errp)
-{
-    int64_t bitmap_size;
-    BdrvDirtyBitmap *bitmap;
-    uint32_t sector_granularity;
-
-    assert((granularity & (granularity - 1)) == 0);
-
-    if (name && bdrv_find_dirty_bitmap(bs, name)) {
-        error_setg(errp, "Bitmap already exists: %s", name);
-        return NULL;
-    }
-    sector_granularity = granularity >> BDRV_SECTOR_BITS;
-    assert(sector_granularity);
-    bitmap_size = bdrv_nb_sectors(bs);
-    if (bitmap_size < 0) {
-        error_setg_errno(errp, -bitmap_size, "could not get length of device");
-        errno = -bitmap_size;
-        return NULL;
-    }
-    bitmap = g_new0(BdrvDirtyBitmap, 1);
-    bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity));
-    bitmap->size = bitmap_size;
-    bitmap->name = g_strdup(name);
-    bitmap->disabled = false;
-    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
-    return bitmap;
-}
-
-bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
-{
-    return bitmap->successor;
-}
-
-bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
-{
-    return !(bitmap->disabled || bitmap->successor);
-}
-
-DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap)
-{
-    if (bdrv_dirty_bitmap_frozen(bitmap)) {
-        return DIRTY_BITMAP_STATUS_FROZEN;
-    } else if (!bdrv_dirty_bitmap_enabled(bitmap)) {
-        return DIRTY_BITMAP_STATUS_DISABLED;
-    } else {
-        return DIRTY_BITMAP_STATUS_ACTIVE;
-    }
-}
-
-/**
- * Create a successor bitmap destined to replace this bitmap after an operation.
- * Requires that the bitmap is not frozen and has no successor.
- */
-int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
-                                       BdrvDirtyBitmap *bitmap, Error **errp)
-{
-    uint64_t granularity;
-    BdrvDirtyBitmap *child;
-
-    if (bdrv_dirty_bitmap_frozen(bitmap)) {
-        error_setg(errp, "Cannot create a successor for a bitmap that is "
-                   "currently frozen");
-        return -1;
-    }
-    assert(!bitmap->successor);
-
-    /* Create an anonymous successor */
-    granularity = bdrv_dirty_bitmap_granularity(bitmap);
-    child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
-    if (!child) {
-        return -1;
-    }
-
-    /* Successor will be on or off based on our current state. */
-    child->disabled = bitmap->disabled;
-
-    /* Install the successor and freeze the parent */
-    bitmap->successor = child;
-    return 0;
-}
-
-/**
- * For a bitmap with a successor, yield our name to the successor,
- * delete the old bitmap, and return a handle to the new bitmap.
- */
-BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
-                                            BdrvDirtyBitmap *bitmap,
-                                            Error **errp)
-{
-    char *name;
-    BdrvDirtyBitmap *successor = bitmap->successor;
-
-    if (successor == NULL) {
-        error_setg(errp, "Cannot relinquish control if "
-                   "there's no successor present");
-        return NULL;
-    }
-
-    name = bitmap->name;
-    bitmap->name = NULL;
-    successor->name = name;
-    bitmap->successor = NULL;
-    bdrv_release_dirty_bitmap(bs, bitmap);
-
-    return successor;
-}
-
-/**
- * In cases of failure where we can no longer safely delete the parent,
- * we may wish to re-join the parent and child/successor.
- * The merged parent will be un-frozen, but not explicitly re-enabled.
- */
-BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
-                                           BdrvDirtyBitmap *parent,
-                                           Error **errp)
-{
-    BdrvDirtyBitmap *successor = parent->successor;
-
-    if (!successor) {
-        error_setg(errp, "Cannot reclaim a successor when none is present");
-        return NULL;
-    }
-
-    if (!hbitmap_merge(parent->bitmap, successor->bitmap)) {
-        error_setg(errp, "Merging of parent and successor bitmap failed");
-        return NULL;
-    }
-    bdrv_release_dirty_bitmap(bs, successor);
-    parent->successor = NULL;
-
-    return parent;
-}
-
-/**
- * Truncates _all_ bitmaps attached to a BDS.
- */
-static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
-{
-    BdrvDirtyBitmap *bitmap;
-    uint64_t size = bdrv_nb_sectors(bs);
-
-    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
-        assert(!bdrv_dirty_bitmap_frozen(bitmap));
-        hbitmap_truncate(bitmap->bitmap, size);
-        bitmap->size = size;
-    }
-}
-
-void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
-{
-    BdrvDirtyBitmap *bm, *next;
-    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
-        if (bm == bitmap) {
-            assert(!bdrv_dirty_bitmap_frozen(bm));
-            QLIST_REMOVE(bitmap, list);
-            hbitmap_free(bitmap->bitmap);
-            g_free(bitmap->name);
-            g_free(bitmap);
-            return;
-        }
-    }
-}
-
-void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
-{
-    assert(!bdrv_dirty_bitmap_frozen(bitmap));
-    bitmap->disabled = true;
-}
-
-void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
-{
-    assert(!bdrv_dirty_bitmap_frozen(bitmap));
-    bitmap->disabled = false;
-}
-
-BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
-{
-    BdrvDirtyBitmap *bm;
-    BlockDirtyInfoList *list = NULL;
-    BlockDirtyInfoList **plist = &list;
-
-    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
-        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
-        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
-        info->count = bdrv_get_dirty_count(bm);
-        info->granularity = bdrv_dirty_bitmap_granularity(bm);
-        info->has_name = !!bm->name;
-        info->name = g_strdup(bm->name);
-        info->status = bdrv_dirty_bitmap_status(bm);
-        entry->value = info;
-        *plist = entry;
-        plist = &entry->next;
-    }
-
-    return list;
-}
-
-int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
-{
-    if (bitmap) {
-        return hbitmap_get(bitmap->bitmap, sector);
-    } else {
-        return 0;
-    }
-}
-
-/**
- * Chooses a default granularity based on the existing cluster size,
- * but clamped between [4K, 64K]. Defaults to 64K in the case that there
- * is no cluster size information available.
- */
-uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs)
-{
-    BlockDriverInfo bdi;
-    uint32_t granularity;
-
-    if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) {
-        granularity = MAX(4096, bdi.cluster_size);
-        granularity = MIN(65536, granularity);
-    } else {
-        granularity = 65536;
-    }
-
-    return granularity;
-}
-
-uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
-{
-    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
-}
-
-void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
-{
-    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
-}
-
-void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
-                           int64_t cur_sector, int nr_sectors)
-{
-    assert(bdrv_dirty_bitmap_enabled(bitmap));
-    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
-}
-
-void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
-                             int64_t cur_sector, int nr_sectors)
-{
-    assert(bdrv_dirty_bitmap_enabled(bitmap));
-    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
-}
-
-void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out)
-{
-    assert(bdrv_dirty_bitmap_enabled(bitmap));
-    if (!out) {
-        hbitmap_reset_all(bitmap->bitmap);
-    } else {
-        HBitmap *backup = bitmap->bitmap;
-        bitmap->bitmap = hbitmap_alloc(bitmap->size,
-                                       hbitmap_granularity(backup));
-        *out = backup;
-    }
-}
-
-void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
-{
-    HBitmap *tmp = bitmap->bitmap;
-    assert(bdrv_dirty_bitmap_enabled(bitmap));
-    bitmap->bitmap = in;
-    hbitmap_free(tmp);
-}
-
-void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
-                    int nr_sectors)
-{
-    BdrvDirtyBitmap *bitmap;
-    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
-        if (!bdrv_dirty_bitmap_enabled(bitmap)) {
-            continue;
-        }
-        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
-    }
-}
-
-/**
- * Advance an HBitmapIter to an arbitrary offset.
- */
-void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
-{
-    assert(hbi->hb);
-    hbitmap_iter_init(hbi, hbi->hb, offset);
-}
-
-int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
-{
-    return hbitmap_count(bitmap->bitmap);
-}
-
 /* Get a reference to bs */
 void bdrv_ref(BlockDriverState *bs)
 {
@@ -4150,10 +3831,10 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
 */
 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
 {
-    BlockDriverState *bs;
+    BlockDriverState *bs = NULL;

    /* walk down the bs forest recursively */
-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+    while ((bs = bdrv_next(bs)) != NULL) {
        bool perm;

        /* try to recurse in this top level bs */
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -20,7 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
 block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
-block-obj-y += accounting.o
+block-obj-y += accounting.o dirty-bitmap.o
 block-obj-y += write-threshold.o

 common-obj-y += stream.o
--- a/block/backup.c
+++ b/block/backup.c
@@ -20,11 +20,9 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "sysemu/block-backend.h"
+#include "qemu/bitmap.h"

-#define BACKUP_CLUSTER_BITS 16
-#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
-#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
-
+#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
 #define SLICE_TIME 100000000ULL /* ns */

 typedef struct CowRequest {
@@ -45,10 +43,17 @@ typedef struct BackupBlockJob {
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
-    HBitmap *bitmap;
+    unsigned long *done_bitmap;
+    int64_t cluster_size;
    QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;

+/* Size of a cluster in sectors, instead of bytes. */
+static inline int64_t cluster_size_sectors(BackupBlockJob *job)
+{
+  return job->cluster_size / BDRV_SECTOR_SIZE;
+}
+
 /* See if in-flight requests overlap and wait for them to complete */
 static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
@@ -97,13 +102,14 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
+    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int64_t start, end;
    int n;

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

-    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
-    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);
+    start = sector_num / sectors_per_cluster;
+    end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster);

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

@@ -111,19 +117,19 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start++) {
-        if (hbitmap_get(job->bitmap, start)) {
+        if (test_bit(start, job->done_bitmap)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, start);

-        n = MIN(BACKUP_SECTORS_PER_CLUSTER,
+        n = MIN(sectors_per_cluster,
                job->common.len / BDRV_SECTOR_SIZE -
-                start * BACKUP_SECTORS_PER_CLUSTER);
+                start * sectors_per_cluster);

        if (!bounce_buffer) {
-            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
+            bounce_buffer = qemu_blockalign(bs, job->cluster_size);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
@@ -131,10 +137,10 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,

        if (is_write_notifier) {
            ret = bdrv_co_readv_no_serialising(bs,
-                                           start * BACKUP_SECTORS_PER_CLUSTER,
+                                           start * sectors_per_cluster,
                                           n, &bounce_qiov);
        } else {
-            ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
+            ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
                                &bounce_qiov);
        }
        if (ret < 0) {
@@ -147,11 +153,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target,
-                                       start * BACKUP_SECTORS_PER_CLUSTER,
+                                       start * sectors_per_cluster,
                                       n, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = bdrv_co_writev(job->target,
-                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
+                                 start * sectors_per_cluster, n,
                                 &bounce_qiov);
        }
        if (ret < 0) {
@@ -162,7 +168,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
            goto out;
        }

-        hbitmap_set(job->bitmap, start, 1);
+        set_bit(start, job->done_bitmap);

        /* Publish progress, guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
@@ -322,21 +328,22 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
    int64_t cluster;
    int64_t end;
    int64_t last_cluster = -1;
+    int64_t sectors_per_cluster = cluster_size_sectors(job);
    BlockDriverState *bs = job->common.bs;
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
-    clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1);
+    clusters_per_iter = MAX((granularity / job->cluster_size), 1);
    bdrv_dirty_iter_init(job->sync_bitmap, &hbi);

    /* Find the next dirty sector(s) */
    while ((sector = hbitmap_iter_next(&hbi)) != -1) {
-        cluster = sector / BACKUP_SECTORS_PER_CLUSTER;
+        cluster = sector / sectors_per_cluster;

        /* Fake progress updates for any clusters we skipped */
        if (cluster != last_cluster + 1) {
            job->common.offset += ((cluster - last_cluster - 1) *
-                                   BACKUP_CLUSTER_SIZE);
+                                   job->cluster_size);
        }

        for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
@@ -344,8 +351,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                if (yield_and_check(job)) {
                    return ret;
                }
-                ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
-                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
+                ret = backup_do_cow(bs, cluster * sectors_per_cluster,
+                                    sectors_per_cluster, &error_is_read,
                                    false);
                if ((ret < 0) &&
                    backup_error_action(job, error_is_read, -ret) ==
@@ -357,17 +364,17 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)

        /* If the bitmap granularity is smaller than the backup granularity,
         * we need to advance the iterator pointer to the next cluster. */
-        if (granularity < BACKUP_CLUSTER_SIZE) {
-            bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER);
+        if (granularity < job->cluster_size) {
+            bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster);
        }

        last_cluster = cluster - 1;
    }

    /* Play some final catchup with the progress meter */
-    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
+    end = DIV_ROUND_UP(job->common.len, job->cluster_size);
    if (last_cluster + 1 < end) {
-        job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE);
+        job->common.offset += ((end - last_cluster - 1) * job->cluster_size);
    }

    return ret;
@@ -384,15 +391,16 @@ static void coroutine_fn backup_run(void *opaque)
        .notify = backup_before_write_notify,
    };
    int64_t start, end;
+    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    start = 0;
-    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
+    end = DIV_ROUND_UP(job->common.len, job->cluster_size);

-    job->bitmap = hbitmap_alloc(end, 0);
+    job->done_bitmap = bitmap_new(end);

    bdrv_set_enable_write_cache(target, true);
    if (target->blk) {
@@ -427,7 +435,7 @@ static void coroutine_fn backup_run(void *opaque)
                /* Check to see if these blocks are already in the
                 * backing file. */

-                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
+                for (i = 0; i < sectors_per_cluster;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are are all in the same state.
@@ -436,8 +444,8 @@ static void coroutine_fn backup_run(void *opaque)
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs,
-                                start * BACKUP_SECTORS_PER_CLUSTER + i,
-                                BACKUP_SECTORS_PER_CLUSTER - i, &n);
+                                start * sectors_per_cluster + i,
+                                sectors_per_cluster - i, &n);
                    i += n;

                    if (alloced == 1 || n == 0) {
@@ -452,8 +460,8 @@ static void coroutine_fn backup_run(void *opaque)
                }
            }
            /* FULL sync mode we copy the whole drive. */
-            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
-                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false);
+            ret = backup_do_cow(bs, start * sectors_per_cluster,
+                                sectors_per_cluster, &error_is_read, false);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
@@ -473,7 +481,7 @@ static void coroutine_fn backup_run(void *opaque)
    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
-    hbitmap_free(job->bitmap);
+    g_free(job->done_bitmap);

    if (target->blk) {
        blk_iostatus_disable(target->blk);
@@ -494,6 +502,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  BlockJobTxn *txn, Error **errp)
 {
    int64_t len;
+    BlockDriverInfo bdi;
+    int ret;

    assert(bs);
    assert(target);
@@ -563,14 +573,32 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
        goto error;
    }

-    bdrv_op_block_all(target, job->common.blocker);
-
    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
+
+    /* If there is no backing file on the target, we cannot rely on COW if our
+     * backup cluster size is smaller than the target cluster size. Even for
+     * targets with a backing file, try to avoid COW if possible. */
+    ret = bdrv_get_info(job->target, &bdi);
+    if (ret < 0 && !target->backing) {
+        error_setg_errno(errp, -ret,
+            "Couldn't determine the cluster size of the target image, "
+            "which has no backing file");
+        error_append_hint(errp,
+            "Aborting, since this may create an unusable destination image\n");
+        goto error;
+    } else if (ret < 0 && target->backing) {
+        /* Not fatal; just trudge on ahead. */
+        job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT;
+    } else {
+        job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
+    }
+
+    bdrv_op_block_all(target, job->common.blocker);
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
    block_job_txn_add_job(txn, &job->common);
--- a/block/block-backend.c
+++ b/block/block-backend.c
--- a/block/curl.c
+++ b/block/curl.c
@@ -27,6 +27,7 @@
 #include "block/block_int.h"
 #include "qapi/qmp/qbool.h"
 #include "qapi/qmp/qstring.h"
+#include "crypto/secret.h"
 #include <curl/curl.h>

 // #define DEBUG_CURL
@@ -78,6 +79,10 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_BLOCK_OPT_SSLVERIFY "sslverify"
 #define CURL_BLOCK_OPT_TIMEOUT "timeout"
 #define CURL_BLOCK_OPT_COOKIE    "cookie"
+#define CURL_BLOCK_OPT_USERNAME "username"
+#define CURL_BLOCK_OPT_PASSWORD_SECRET "password-secret"
+#define CURL_BLOCK_OPT_PROXY_USERNAME "proxy-username"
+#define CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET "proxy-password-secret"

 struct BDRVCURLState;

@@ -120,6 +125,10 @@ typedef struct BDRVCURLState {
    char *cookie;
    bool accept_range;
    AioContext *aio_context;
+    char *username;
+    char *password;
+    char *proxyusername;
+    char *proxypassword;
 } BDRVCURLState;

 static void curl_clean_state(CURLState *s);
@@ -419,6 +428,21 @@ static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s)
        curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg);
        curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1);

+        if (s->username) {
+            curl_easy_setopt(state->curl, CURLOPT_USERNAME, s->username);
+        }
+        if (s->password) {
+            curl_easy_setopt(state->curl, CURLOPT_PASSWORD, s->password);
+        }
+        if (s->proxyusername) {
+            curl_easy_setopt(state->curl,
+                             CURLOPT_PROXYUSERNAME, s->proxyusername);
+        }
+        if (s->proxypassword) {
+            curl_easy_setopt(state->curl,
+                             CURLOPT_PROXYPASSWORD, s->proxypassword);
+        }
+
        /* Restrict supported protocols to avoid security issues in the more
         * obscure protocols.  For example, do not allow POP3/SMTP/IMAP see
         * CVE-2013-0249.
@@ -525,10 +549,31 @@ static QemuOptsList runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "Pass the cookie or list of cookies with each request"
        },
+        {
+            .name = CURL_BLOCK_OPT_USERNAME,
+            .type = QEMU_OPT_STRING,
+            .help = "Username for HTTP auth"
+        },
+        {
+            .name = CURL_BLOCK_OPT_PASSWORD_SECRET,
+            .type = QEMU_OPT_STRING,
+            .help = "ID of secret used as password for HTTP auth",
+        },
+        {
+            .name = CURL_BLOCK_OPT_PROXY_USERNAME,
+            .type = QEMU_OPT_STRING,
+            .help = "Username for HTTP proxy auth"
+        },
+        {
+            .name = CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET,
+            .type = QEMU_OPT_STRING,
+            .help = "ID of secret used as password for HTTP proxy auth",
+        },
        { /* end of list */ }
    },
 };

+
 static int curl_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
@@ -539,6 +584,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    const char *file;
    const char *cookie;
    double d;
+    const char *secretid;

    static int inited = 0;

@@ -580,6 +626,26 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
        goto out_noclean;
    }

+    s->username = g_strdup(qemu_opt_get(opts, CURL_BLOCK_OPT_USERNAME));
+    secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PASSWORD_SECRET);
+
+    if (secretid) {
+        s->password = qcrypto_secret_lookup_as_utf8(secretid, errp);
+        if (!s->password) {
+            goto out_noclean;
+        }
+    }
+
+    s->proxyusername = g_strdup(
+        qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_USERNAME));
+    secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET);
+    if (secretid) {
+        s->proxypassword = qcrypto_secret_lookup_as_utf8(secretid, errp);
+        if (!s->proxypassword) {
+            goto out_noclean;
+        }
+    }
+
    if (!inited) {
        curl_global_init(CURL_GLOBAL_ALL);
        inited = 1;
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -0,0 +1,387 @@
+/*
+ * Block Dirty Bitmap
+ *
+ * Copyright (c) 2016 Red Hat. Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "config-host.h"
+#include "qemu-common.h"
+#include "trace.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+
+/**
+ * A BdrvDirtyBitmap can be in three possible states:
+ * (1) successor is NULL and disabled is false: full r/w mode
+ * (2) successor is NULL and disabled is true: read only mode ("disabled")
+ * (3) successor is set: frozen mode.
+ *     A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set,
+ *     or enabled. A frozen bitmap can only abdicate() or reclaim().
+ */
+struct BdrvDirtyBitmap {
+    HBitmap *bitmap;            /* Dirty sector bitmap implementation */
+    BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
+    char *name;                 /* Optional non-empty unique ID */
+    int64_t size;               /* Size of the bitmap (Number of sectors) */
+    bool disabled;              /* Bitmap is read-only */
+    QLIST_ENTRY(BdrvDirtyBitmap) list;
+};
+
+BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
+{
+    BdrvDirtyBitmap *bm;
+
+    assert(name);
+    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
+        if (bm->name && !strcmp(name, bm->name)) {
+            return bm;
+        }
+    }
+    return NULL;
+}
+
+void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
+{
+    assert(!bdrv_dirty_bitmap_frozen(bitmap));
+    g_free(bitmap->name);
+    bitmap->name = NULL;
+}
+
+BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
+                                          uint32_t granularity,
+                                          const char *name,
+                                          Error **errp)
+{
+    int64_t bitmap_size;
+    BdrvDirtyBitmap *bitmap;
+    uint32_t sector_granularity;
+
+    assert((granularity & (granularity - 1)) == 0);
+
+    if (name && bdrv_find_dirty_bitmap(bs, name)) {
+        error_setg(errp, "Bitmap already exists: %s", name);
+        return NULL;
+    }
+    sector_granularity = granularity >> BDRV_SECTOR_BITS;
+    assert(sector_granularity);
+    bitmap_size = bdrv_nb_sectors(bs);
+    if (bitmap_size < 0) {
+        error_setg_errno(errp, -bitmap_size, "could not get length of device");
+        errno = -bitmap_size;
+        return NULL;
+    }
+    bitmap = g_new0(BdrvDirtyBitmap, 1);
+    bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity));
+    bitmap->size = bitmap_size;
+    bitmap->name = g_strdup(name);
+    bitmap->disabled = false;
+    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
+    return bitmap;
+}
+
+bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
+{
+    return bitmap->successor;
+}
+
+bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
+{
+    return !(bitmap->disabled || bitmap->successor);
+}
+
+DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap)
+{
+    if (bdrv_dirty_bitmap_frozen(bitmap)) {
+        return DIRTY_BITMAP_STATUS_FROZEN;
+    } else if (!bdrv_dirty_bitmap_enabled(bitmap)) {
+        return DIRTY_BITMAP_STATUS_DISABLED;
+    } else {
+        return DIRTY_BITMAP_STATUS_ACTIVE;
+    }
+}
+
+/**
+ * Create a successor bitmap destined to replace this bitmap after an operation.
+ * Requires that the bitmap is not frozen and has no successor.
+ */
+int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
+                                       BdrvDirtyBitmap *bitmap, Error **errp)
+{
+    uint64_t granularity;
+    BdrvDirtyBitmap *child;
+
+    if (bdrv_dirty_bitmap_frozen(bitmap)) {
+        error_setg(errp, "Cannot create a successor for a bitmap that is "
+                   "currently frozen");
+        return -1;
+    }
+    assert(!bitmap->successor);
+
+    /* Create an anonymous successor */
+    granularity = bdrv_dirty_bitmap_granularity(bitmap);
+    child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
+    if (!child) {
+        return -1;
+    }
+
+    /* Successor will be on or off based on our current state. */
+    child->disabled = bitmap->disabled;
+
+    /* Install the successor and freeze the parent */
+    bitmap->successor = child;
+    return 0;
+}
+
+/**
+ * For a bitmap with a successor, yield our name to the successor,
+ * delete the old bitmap, and return a handle to the new bitmap.
+ */
+BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
+                                            BdrvDirtyBitmap *bitmap,
+                                            Error **errp)
+{
+    char *name;
+    BdrvDirtyBitmap *successor = bitmap->successor;
+
+    if (successor == NULL) {
+        error_setg(errp, "Cannot relinquish control if "
+                   "there's no successor present");
+        return NULL;
+    }
+
+    name = bitmap->name;
+    bitmap->name = NULL;
+    successor->name = name;
+    bitmap->successor = NULL;
+    bdrv_release_dirty_bitmap(bs, bitmap);
+
+    return successor;
+}
+
+/**
+ * In cases of failure where we can no longer safely delete the parent,
+ * we may wish to re-join the parent and child/successor.
+ * The merged parent will be un-frozen, but not explicitly re-enabled.
+ */
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
+                                           BdrvDirtyBitmap *parent,
+                                           Error **errp)
+{
+    BdrvDirtyBitmap *successor = parent->successor;
+
+    if (!successor) {
+        error_setg(errp, "Cannot reclaim a successor when none is present");
+        return NULL;
+    }
+
+    if (!hbitmap_merge(parent->bitmap, successor->bitmap)) {
+        error_setg(errp, "Merging of parent and successor bitmap failed");
+        return NULL;
+    }
+    bdrv_release_dirty_bitmap(bs, successor);
+    parent->successor = NULL;
+
+    return parent;
+}
+
+/**
+ * Truncates _all_ bitmaps attached to a BDS.
+ */
+void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
+{
+    BdrvDirtyBitmap *bitmap;
+    uint64_t size = bdrv_nb_sectors(bs);
+
+    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
+        assert(!bdrv_dirty_bitmap_frozen(bitmap));
+        hbitmap_truncate(bitmap->bitmap, size);
+        bitmap->size = size;
+    }
+}
+
+static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
+                                                  BdrvDirtyBitmap *bitmap,
+                                                  bool only_named)
+{
+    BdrvDirtyBitmap *bm, *next;
+    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
+        if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
+            assert(!bdrv_dirty_bitmap_frozen(bm));
+            QLIST_REMOVE(bm, list);
+            hbitmap_free(bm->bitmap);
+            g_free(bm->name);
+            g_free(bm);
+
+            if (bitmap) {
+                return;
+            }
+        }
+    }
+}
+
+void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
+{
+    bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false);
+}
+
+/**
+ * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()).
+ * There must not be any frozen bitmaps attached.
+ */
+void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs)
+{
+    bdrv_do_release_matching_dirty_bitmap(bs, NULL, true);
+}
+
+void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+    assert(!bdrv_dirty_bitmap_frozen(bitmap));
+    bitmap->disabled = true;
+}
+
+void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+    assert(!bdrv_dirty_bitmap_frozen(bitmap));
+    bitmap->disabled = false;
+}
+
+BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
+{
+    BdrvDirtyBitmap *bm;
+    BlockDirtyInfoList *list = NULL;
+    BlockDirtyInfoList **plist = &list;
+
+    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
+        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
+        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
+        info->count = bdrv_get_dirty_count(bm);
+        info->granularity = bdrv_dirty_bitmap_granularity(bm);
+        info->has_name = !!bm->name;
+        info->name = g_strdup(bm->name);
+        info->status = bdrv_dirty_bitmap_status(bm);
+        entry->value = info;
+        *plist = entry;
+        plist = &entry->next;
+    }
+
+    return list;
+}
+
+int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+                   int64_t sector)
+{
+    if (bitmap) {
+        return hbitmap_get(bitmap->bitmap, sector);
+    } else {
+        return 0;
+    }
+}
+
+/**
+ * Chooses a default granularity based on the existing cluster size,
+ * but clamped between [4K, 64K]. Defaults to 64K in the case that there
+ * is no cluster size information available.
+ */
+uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs)
+{
+    BlockDriverInfo bdi;
+    uint32_t granularity;
+
+    if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) {
+        granularity = MAX(4096, bdi.cluster_size);
+        granularity = MIN(65536, granularity);
+    } else {
+        granularity = 65536;
+    }
+
+    return granularity;
+}
+
+uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
+{
+    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
+}
+
+void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
+{
+    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
+}
+
+void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+                           int64_t cur_sector, int nr_sectors)
+{
+    assert(bdrv_dirty_bitmap_enabled(bitmap));
+    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
+}
+
+void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+                             int64_t cur_sector, int nr_sectors)
+{
+    assert(bdrv_dirty_bitmap_enabled(bitmap));
+    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
+}
+
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out)
+{
+    assert(bdrv_dirty_bitmap_enabled(bitmap));
+    if (!out) {
+        hbitmap_reset_all(bitmap->bitmap);
+    } else {
+        HBitmap *backup = bitmap->bitmap;
+        bitmap->bitmap = hbitmap_alloc(bitmap->size,
+                                       hbitmap_granularity(backup));
+        *out = backup;
+    }
+}
+
+void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
+{
+    HBitmap *tmp = bitmap->bitmap;
+    assert(bdrv_dirty_bitmap_enabled(bitmap));
+    bitmap->bitmap = in;
+    hbitmap_free(tmp);
+}
+
+void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
+                    int nr_sectors)
+{
+    BdrvDirtyBitmap *bitmap;
+    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
+        if (!bdrv_dirty_bitmap_enabled(bitmap)) {
+            continue;
+        }
+        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
+    }
+}
+
+/**
+ * Advance an HBitmapIter to an arbitrary offset.
+ */
+void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
+{
+    assert(hbi->hb);
+    hbitmap_iter_init(hbi, hbi->hb, offset);
+}
+
+int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
+{
+    return hbitmap_count(bitmap->bitmap);
+}
--- a/block/io.c
+++ b/block/io.c
@@ -44,12 +44,6 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
-static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
-    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
-    BdrvRequestFlags flags);
-static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
-    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
-    BdrvRequestFlags flags);
 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
@@ -301,6 +295,7 @@ void bdrv_drain_all(void)
        if (bs->job) {
            block_job_pause(bs->job);
        }
+        bdrv_drain_recurse(bs);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -620,20 +615,6 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num,
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
 }

-/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
-int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
-                          uint8_t *buf, int nb_sectors)
-{
-    bool enabled;
-    int ret;
-
-    enabled = bs->io_limits_enabled;
-    bs->io_limits_enabled = false;
-    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
-    bs->io_limits_enabled = enabled;
-    return ret;
-}
-
 /* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
@@ -664,6 +645,7 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
 {
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
+    BlockDriverState *file;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
@@ -676,7 +658,7 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
        if (nb_sectors <= 0) {
            return 0;
        }
-        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
+        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
@@ -937,7 +919,7 @@ out:
 /*
 * Handle a read request in coroutine context
 */
-static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
 {
@@ -1282,7 +1264,7 @@ fail:
 /*
 * Handle a write request in coroutine context
 */
-static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
 {
@@ -1443,29 +1425,10 @@ int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                             BDRV_REQ_ZERO_WRITE | flags);
 }

-int bdrv_flush_all(void)
-{
-    BlockDriverState *bs = NULL;
-    int result = 0;
-
-    while ((bs = bdrv_next(bs))) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-        int ret;
-
-        aio_context_acquire(aio_context);
-        ret = bdrv_flush(bs);
-        if (ret < 0 && !result) {
-            result = ret;
-        }
-        aio_context_release(aio_context);
-    }
-
-    return result;
-}
-
 typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
+    BlockDriverState **file;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
@@ -1487,10 +1450,14 @@ typedef struct BdrvCoGetBlockStatusData {
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
+ *
+ * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
+ * points to the BDS which the sector range is allocated in.
 */
 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
-                                                     int nb_sectors, int *pnum)
+                                                     int nb_sectors, int *pnum,
+                                                     BlockDriverState **file)
 {
    int64_t total_sectors;
    int64_t n;
@@ -1520,7 +1487,9 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
        return ret;
    }

-    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
+    *file = NULL;
+    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
+                                            file);
    if (ret < 0) {
        *pnum = 0;
        return ret;
@@ -1529,7 +1498,7 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
-                                     *pnum, pnum);
+                                     *pnum, pnum, file);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
@@ -1546,13 +1515,14 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
        }
    }

-    if (bs->file &&
+    if (*file && *file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
+        BlockDriverState *file2;
        int file_pnum;

-        ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
-                                        *pnum, &file_pnum);
+        ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
+                                        *pnum, &file_pnum, &file2);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
@@ -1577,14 +1547,15 @@ static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
        BlockDriverState *base,
        int64_t sector_num,
        int nb_sectors,
-        int *pnum)
+        int *pnum,
+        BlockDriverState **file)
 {
    BlockDriverState *p;
    int64_t ret = 0;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
-        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum);
+        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
        if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
            break;
        }
@@ -1603,7 +1574,8 @@ static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
    data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
                                               data->sector_num,
                                               data->nb_sectors,
-                                               data->pnum);
+                                               data->pnum,
+                                               data->file);
    data->done = true;
 }

@@ -1615,12 +1587,14 @@ static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
 int64_t bdrv_get_block_status_above(BlockDriverState *bs,
                                    BlockDriverState *base,
                                    int64_t sector_num,
-                                    int nb_sectors, int *pnum)
+                                    int nb_sectors, int *pnum,
+                                    BlockDriverState **file)
 {
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .base = base,
+        .file = file,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
@@ -1644,16 +1618,19 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs,

 int64_t bdrv_get_block_status(BlockDriverState *bs,
                              int64_t sector_num,
-                              int nb_sectors, int *pnum)
+                              int nb_sectors, int *pnum,
+                              BlockDriverState **file)
 {
    return bdrv_get_block_status_above(bs, backing_bs(bs),
-                                       sector_num, nb_sectors, pnum);
+                                       sector_num, nb_sectors, pnum, file);
 }

 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
 {
-    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
+    BlockDriverState *file;
+    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
+                                        &file);
    if (ret < 0) {
        return ret;
    }
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -39,6 +39,7 @@
 #include "sysemu/sysemu.h"
 #include "qmp-commands.h"
 #include "qapi/qmp/qstring.h"
+#include "crypto/secret.h"

 #include <iscsi/iscsi.h>
 #include <iscsi/scsi-lowlevel.h>
@@ -532,7 +533,8 @@ static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun,

 static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
                                                  int64_t sector_num,
-                                                  int nb_sectors, int *pnum)
+                                                  int nb_sectors, int *pnum,
+                                                  BlockDriverState **file)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct scsi_get_lba_status *lbas = NULL;
@@ -624,6 +626,9 @@ out:
    if (iTask.task != NULL) {
        scsi_free_scsi_task(iTask.task);
    }
+    if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
+        *file = bs;
+    }
    return ret;
 }

@@ -650,7 +655,8 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
        int64_t ret;
        int pnum;
-        ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum);
+        BlockDriverState *file;
+        ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum, &file);
        if (ret < 0) {
            return ret;
        }
@@ -1075,6 +1081,8 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target,
    QemuOpts *opts;
    const char *user = NULL;
    const char *password = NULL;
+    const char *secretid;
+    char *secret = NULL;

    list = qemu_find_opts("iscsi");
    if (!list) {
@@ -1094,8 +1102,20 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target,
        return;
    }

+    secretid = qemu_opt_get(opts, "password-secret");
    password = qemu_opt_get(opts, "password");
-    if (!password) {
+    if (secretid && password) {
+        error_setg(errp, "'password' and 'password-secret' properties are "
+                   "mutually exclusive");
+        return;
+    }
+    if (secretid) {
+        secret = qcrypto_secret_lookup_as_utf8(secretid, errp);
+        if (!secret) {
+            return;
+        }
+        password = secret;
+    } else if (!password) {
        error_setg(errp, "CHAP username specified but no password was given");
        return;
    }
@@ -1103,6 +1123,8 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target,
    if (iscsi_set_initiator_username_pwd(iscsi, user, password)) {
        error_setg(errp, "Failed to set initiator username and password");
    }
+
+    g_free(secret);
 }

 static void parse_header_digest(struct iscsi_context *iscsi, const char *target,
@@ -1852,6 +1874,11 @@ static QemuOptsList qemu_iscsi_opts = {
            .name = "password",
            .type = QEMU_OPT_STRING,
            .help = "password for CHAP authentication to target",
+        },{
+            .name = "password-secret",
+            .type = QEMU_OPT_STRING,
+            .help = "ID of the secret providing password for CHAP "
+                    "authentication to target",
        },{
            .name = "header-digest",
            .type = QEMU_OPT_STRING,
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -47,7 +47,6 @@ typedef struct MirrorBlockJob {
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
-    int64_t sector_num;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
@@ -64,6 +63,8 @@ typedef struct MirrorBlockJob {
    int ret;
    bool unmap;
    bool waiting_for_io;
+    int target_cluster_sectors;
+    int max_iov;
 } MirrorBlockJob;

 typedef struct MirrorOp {
@@ -159,114 +160,84 @@ static void mirror_read_complete(void *opaque, int ret)
                    mirror_write_complete, op);
 }

-static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
+/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
+ * return the offset of the adjusted tail sector against original. */
+static int mirror_cow_align(MirrorBlockJob *s,
+                            int64_t *sector_num,
+                            int *nb_sectors)
+{
+    bool need_cow;
+    int ret = 0;
+    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
+    int64_t align_sector_num = *sector_num;
+    int align_nb_sectors = *nb_sectors;
+    int max_sectors = chunk_sectors * s->max_iov;
+
+    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
+    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
+                          s->cow_bitmap);
+    if (need_cow) {
+        bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors,
+                               &align_sector_num, &align_nb_sectors);
+    }
+
+    if (align_nb_sectors > max_sectors) {
+        align_nb_sectors = max_sectors;
+        if (need_cow) {
+            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
+                                               s->target_cluster_sectors);
+        }
+    }
+
+    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
+    *sector_num = align_sector_num;
+    *nb_sectors = align_nb_sectors;
+    assert(ret >= 0);
+    return ret;
+}
+
+static inline void mirror_wait_for_io(MirrorBlockJob *s)
+{
+    assert(!s->waiting_for_io);
+    s->waiting_for_io = true;
+    qemu_coroutine_yield();
+    s->waiting_for_io = false;
+}
+
+/* Submit async read while handling COW.
+ * Returns: nb_sectors if no alignment is necessary, or
+ *          (new_end - sector_num) if tail is rounded up or down due to
+ *          alignment or buffer limit.
+ */
+static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
+                          int nb_sectors)
 {
    BlockDriverState *source = s->common.bs;
-    int nb_sectors, sectors_per_chunk, nb_chunks, max_iov;
-    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
-    uint64_t delay_ns = 0;
+    int sectors_per_chunk, nb_chunks;
+    int ret = nb_sectors;
    MirrorOp *op;
-    int pnum;
-    int64_t ret;

-    max_iov = MIN(source->bl.max_iov, s->target->bl.max_iov);
-
-    s->sector_num = hbitmap_iter_next(&s->hbi);
-    if (s->sector_num < 0) {
-        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
-        s->sector_num = hbitmap_iter_next(&s->hbi);
-        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
-        assert(s->sector_num >= 0);
-    }
-
-    hbitmap_next_sector = s->sector_num;
-    sector_num = s->sector_num;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
-    end = s->bdev_length / BDRV_SECTOR_SIZE;

-    /* Extend the QEMUIOVector to include all adjacent blocks that will
-     * be copied in this operation.
-     *
-     * We have to do this if we have no backing file yet in the destination,
-     * and the cluster size is very large.  Then we need to do COW ourselves.
-     * The first time a cluster is copied, copy it entirely.  Note that,
-     * because both the granularity and the cluster size are powers of two,
-     * the number of sectors to copy cannot exceed one cluster.
-     *
-     * We also want to extend the QEMUIOVector to include more adjacent
-     * dirty blocks if possible, to limit the number of I/O operations and
-     * run efficiently even with a small granularity.
-     */
-    nb_chunks = 0;
-    nb_sectors = 0;
-    next_sector = sector_num;
-    next_chunk = sector_num / sectors_per_chunk;
+    /* We can only handle as much as buf_size at a time. */
+    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
+    assert(nb_sectors);

-    /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
-    while (test_bit(next_chunk, s->in_flight_bitmap)) {
-        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
-        s->waiting_for_io = true;
-        qemu_coroutine_yield();
-        s->waiting_for_io = false;
+    if (s->cow_bitmap) {
+        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
+    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
+    /* The sector range must meet granularity because:
+     * 1) Caller passes in aligned values;
+     * 2) mirror_cow_align is used only when target cluster is larger. */
+    assert(!(nb_sectors % sectors_per_chunk));
+    assert(!(sector_num % sectors_per_chunk));
+    nb_chunks = nb_sectors / sectors_per_chunk;

-    do {
-        int added_sectors, added_chunks;
-
-        if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) ||
-            test_bit(next_chunk, s->in_flight_bitmap)) {
-            assert(nb_sectors > 0);
-            break;
-        }
-
-        added_sectors = sectors_per_chunk;
-        if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
-            bdrv_round_to_clusters(s->target,
-                                   next_sector, added_sectors,
-                                   &next_sector, &added_sectors);
-
-            /* On the first iteration, the rounding may make us copy
-             * sectors before the first dirty one.
-             */
-            if (next_sector < sector_num) {
-                assert(nb_sectors == 0);
-                sector_num = next_sector;
-                next_chunk = next_sector / sectors_per_chunk;
-            }
-        }
-
-        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
-        added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;
-
-        /* When doing COW, it may happen that there is not enough space for
-         * a full cluster.  Wait if that is the case.
-         */
-        while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
-            trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
-            s->waiting_for_io = true;
-            qemu_coroutine_yield();
-            s->waiting_for_io = false;
-        }
-        if (s->buf_free_count < nb_chunks + added_chunks) {
-            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
-            break;
-        }
-        if (max_iov < nb_chunks + added_chunks) {
-            trace_mirror_break_iov_max(s, nb_chunks, added_chunks);
-            break;
-        }
-
-        /* We have enough free space to copy these sectors.  */
-        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);
-
-        nb_sectors += added_sectors;
-        nb_chunks += added_chunks;
-        next_sector += added_sectors;
-        next_chunk += added_chunks;
-        if (!s->synced && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors);
-        }
-    } while (delay_ns == 0 && next_sector < end);
+    while (s->buf_free_count < nb_chunks) {
+        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+        mirror_wait_for_io(s);
+    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
@@ -278,47 +249,151 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
-    next_sector = sector_num;
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
-        size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size;
+        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
-
-        /* Advance the HBitmapIter in parallel, so that we do not examine
-         * the same sector twice.
-         */
-        if (next_sector > hbitmap_next_sector
-            && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
-            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
-        }
-
-        next_sector += sectors_per_chunk;
    }

-    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);
-
    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

-    ret = bdrv_get_block_status_above(source, NULL, sector_num,
-                                      nb_sectors, &pnum);
-    if (ret < 0 || pnum < nb_sectors ||
-            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
-        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
-                       mirror_read_complete, op);
-    } else if (ret & BDRV_BLOCK_ZERO) {
+    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+                   mirror_read_complete, op);
+    return ret;
+}
+
+static void mirror_do_zero_or_discard(MirrorBlockJob *s,
+                                      int64_t sector_num,
+                                      int nb_sectors,
+                                      bool is_discard)
+{
+    MirrorOp *op;
+
+    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
+     * so the freeing in mirror_iteration_done is nop. */
+    op = g_new0(MirrorOp, 1);
+    op->s = s;
+    op->sector_num = sector_num;
+    op->nb_sectors = nb_sectors;
+
+    s->in_flight++;
+    s->sectors_in_flight += nb_sectors;
+    if (is_discard) {
+        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
+                         mirror_write_complete, op);
+    } else {
        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
-    } else {
-        assert(!(ret & BDRV_BLOCK_DATA));
-        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
-                         mirror_write_complete, op);
+    }
+}
+
+static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
+{
+    BlockDriverState *source = s->common.bs;
+    int64_t sector_num;
+    uint64_t delay_ns = 0;
+    /* At least the first dirty chunk is mirrored in one iteration. */
+    int nb_chunks = 1;
+    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
+    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+
+    sector_num = hbitmap_iter_next(&s->hbi);
+    if (sector_num < 0) {
+        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
+        sector_num = hbitmap_iter_next(&s->hbi);
+        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
+        assert(sector_num >= 0);
+    }
+
+    /* Find the number of consective dirty chunks following the first dirty
+     * one, and wait for in flight requests in them. */
+    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
+        int64_t hbitmap_next;
+        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
+        int64_t next_chunk = next_sector / sectors_per_chunk;
+        if (next_sector >= end ||
+            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
+            break;
+        }
+        if (test_bit(next_chunk, s->in_flight_bitmap)) {
+            if (nb_chunks > 0) {
+                break;
+            }
+            trace_mirror_yield_in_flight(s, next_sector, s->in_flight);
+            mirror_wait_for_io(s);
+            /* Now retry.  */
+        } else {
+            hbitmap_next = hbitmap_iter_next(&s->hbi);
+            assert(hbitmap_next == next_sector);
+            nb_chunks++;
+        }
+    }
+
+    /* Clear dirty bits before querying the block status, because
+     * calling bdrv_get_block_status_above could yield - if some blocks are
+     * marked dirty in this window, we need to know.
+     */
+    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
+                            nb_chunks * sectors_per_chunk);
+    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
+    while (nb_chunks > 0 && sector_num < end) {
+        int ret;
+        int io_sectors;
+        BlockDriverState *file;
+        enum MirrorMethod {
+            MIRROR_METHOD_COPY,
+            MIRROR_METHOD_ZERO,
+            MIRROR_METHOD_DISCARD
+        } mirror_method = MIRROR_METHOD_COPY;
+
+        assert(!(sector_num % sectors_per_chunk));
+        ret = bdrv_get_block_status_above(source, NULL, sector_num,
+                                          nb_chunks * sectors_per_chunk,
+                                          &io_sectors, &file);
+        if (ret < 0) {
+            io_sectors = nb_chunks * sectors_per_chunk;
+        }
+
+        io_sectors -= io_sectors % sectors_per_chunk;
+        if (io_sectors < sectors_per_chunk) {
+            io_sectors = sectors_per_chunk;
+        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
+            int64_t target_sector_num;
+            int target_nb_sectors;
+            bdrv_round_to_clusters(s->target, sector_num, io_sectors,
+                                   &target_sector_num, &target_nb_sectors);
+            if (target_sector_num == sector_num &&
+                target_nb_sectors == io_sectors) {
+                mirror_method = ret & BDRV_BLOCK_ZERO ?
+                                    MIRROR_METHOD_ZERO :
+                                    MIRROR_METHOD_DISCARD;
+            }
+        }
+
+        switch (mirror_method) {
+        case MIRROR_METHOD_COPY:
+            io_sectors = mirror_do_read(s, sector_num, io_sectors);
+            break;
+        case MIRROR_METHOD_ZERO:
+            mirror_do_zero_or_discard(s, sector_num, io_sectors, false);
+            break;
+        case MIRROR_METHOD_DISCARD:
+            mirror_do_zero_or_discard(s, sector_num, io_sectors, true);
+            break;
+        default:
+            abort();
+        }
+        assert(io_sectors);
+        sector_num += io_sectors;
+        nb_chunks -= io_sectors / sectors_per_chunk;
+        delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors);
    }
    return delay_ns;
 }
@@ -343,9 +418,7 @@ static void mirror_free_init(MirrorBlockJob *s)
 static void mirror_drain(MirrorBlockJob *s)
 {
    while (s->in_flight > 0) {
-        s->waiting_for_io = true;
-        qemu_coroutine_yield();
-        s->waiting_for_io = false;
+        mirror_wait_for_io(s);
    }
 }

@@ -419,6 +492,7 @@ static void coroutine_fn mirror_run(void *opaque)
                                 checking for a NULL string */
    int ret = 0;
    int n;
+    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
@@ -448,16 +522,16 @@ static void coroutine_fn mirror_run(void *opaque)
     */
    bdrv_get_backing_filename(s->target, backing_filename,
                              sizeof(backing_filename));
-    if (backing_filename[0] && !s->target->backing) {
-        ret = bdrv_get_info(s->target, &bdi);
-        if (ret < 0) {
-            goto immediate_exit;
-        }
-        if (s->granularity < bdi.cluster_size) {
-            s->buf_size = MAX(s->buf_size, bdi.cluster_size);
-            s->cow_bitmap = bitmap_new(length);
-        }
+    if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) {
+        target_cluster_size = bdi.cluster_size;
    }
+    if (backing_filename[0] && !s->target->backing
+        && s->granularity < target_cluster_size) {
+        s->buf_size = MAX(s->buf_size, target_cluster_size);
+        s->cow_bitmap = bitmap_new(length);
+    }
+    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
+    s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov);

    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
@@ -532,9 +606,7 @@ static void coroutine_fn mirror_run(void *opaque)
            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
-                s->waiting_for_io = true;
-                qemu_coroutine_yield();
-                s->waiting_for_io = false;
+                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -28,7 +28,6 @@

 #include "qemu/osdep.h"
 #include "nbd-client.h"
-#include "qemu/sockets.h"

 #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
 #define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
@@ -48,13 +47,21 @@ static void nbd_teardown_connection(BlockDriverState *bs)
 {
    NbdClientSession *client = nbd_get_client_session(bs);

+    if (!client->ioc) { /* Already closed */
+        return;
+    }
+
    /* finish any pending coroutines */
-    shutdown(client->sock, 2);
+    qio_channel_shutdown(client->ioc,
+                         QIO_CHANNEL_SHUTDOWN_BOTH,
+                         NULL);
    nbd_recv_coroutines_enter_all(client);

    nbd_client_detach_aio_context(bs);
-    closesocket(client->sock);
-    client->sock = -1;
+    object_unref(OBJECT(client->sioc));
+    client->sioc = NULL;
+    object_unref(OBJECT(client->ioc));
+    client->ioc = NULL;
 }

 static void nbd_reply_ready(void *opaque)
@@ -64,12 +71,16 @@ static void nbd_reply_ready(void *opaque)
    uint64_t i;
    int ret;

+    if (!s->ioc) { /* Already closed */
+        return;
+    }
+
    if (s->reply.handle == 0) {
        /* No reply already in flight.  Fetch a header.  It is possible
         * that another thread has done the same thing in parallel, so
         * the socket is not readable anymore.
         */
-        ret = nbd_receive_reply(s->sock, &s->reply);
+        ret = nbd_receive_reply(s->ioc, &s->reply);
        if (ret == -EAGAIN) {
            return;
        }
@@ -120,32 +131,35 @@ static int nbd_co_send_request(BlockDriverState *bs,
        }
    }

+    g_assert(qemu_in_coroutine());
    assert(i < MAX_NBD_REQUESTS);
    request->handle = INDEX_TO_HANDLE(s, i);
+
+    if (!s->ioc) {
+        qemu_co_mutex_unlock(&s->send_mutex);
+        return -EPIPE;
+    }
+
    s->send_coroutine = qemu_coroutine_self();
    aio_context = bdrv_get_aio_context(bs);

-    aio_set_fd_handler(aio_context, s->sock, false,
+    aio_set_fd_handler(aio_context, s->sioc->fd, false,
                       nbd_reply_ready, nbd_restart_write, bs);
    if (qiov) {
-        if (!s->is_unix) {
-            socket_set_cork(s->sock, 1);
-        }
-        rc = nbd_send_request(s->sock, request);
+        qio_channel_set_cork(s->ioc, true);
+        rc = nbd_send_request(s->ioc, request);
        if (rc >= 0) {
-            ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
-                                offset, request->len);
+            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov,
+                               offset, request->len, 0);
            if (ret != request->len) {
                rc = -EIO;
            }
        }
-        if (!s->is_unix) {
-            socket_set_cork(s->sock, 0);
-        }
+        qio_channel_set_cork(s->ioc, false);
    } else {
-        rc = nbd_send_request(s->sock, request);
+        rc = nbd_send_request(s->ioc, request);
    }
-    aio_set_fd_handler(aio_context, s->sock, false,
+    aio_set_fd_handler(aio_context, s->sioc->fd, false,
                       nbd_reply_ready, NULL, bs);
    s->send_coroutine = NULL;
    qemu_co_mutex_unlock(&s->send_mutex);
@@ -162,12 +176,13 @@ static void nbd_co_receive_reply(NbdClientSession *s,
     * peek at the next reply and avoid yielding if it's ours?  */
    qemu_coroutine_yield();
    *reply = s->reply;
-    if (reply->handle != request->handle) {
+    if (reply->handle != request->handle ||
+        !s->ioc) {
        reply->error = EIO;
    } else {
        if (qiov && reply->error == 0) {
-            ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov,
-                                offset, request->len);
+            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov,
+                               offset, request->len, 1);
            if (ret != request->len) {
                reply->error = EIO;
            }
@@ -350,14 +365,14 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
 void nbd_client_detach_aio_context(BlockDriverState *bs)
 {
    aio_set_fd_handler(bdrv_get_aio_context(bs),
-                       nbd_get_client_session(bs)->sock,
+                       nbd_get_client_session(bs)->sioc->fd,
                       false, NULL, NULL, NULL);
 }

 void nbd_client_attach_aio_context(BlockDriverState *bs,
                                   AioContext *new_context)
 {
-    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sock,
+    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd,
                       false, nbd_reply_ready, NULL, bs);
 }

@@ -370,16 +385,20 @@ void nbd_client_close(BlockDriverState *bs)
        .len = 0
    };

-    if (client->sock == -1) {
+    if (client->ioc == NULL) {
        return;
    }

-    nbd_send_request(client->sock, &request);
+    nbd_send_request(client->ioc, &request);

    nbd_teardown_connection(bs);
 }

-int nbd_client_init(BlockDriverState *bs, int sock, const char *export,
+int nbd_client_init(BlockDriverState *bs,
+                    QIOChannelSocket *sioc,
+                    const char *export,
+                    QCryptoTLSCreds *tlscreds,
+                    const char *hostname,
                    Error **errp)
 {
    NbdClientSession *client = nbd_get_client_session(bs);
@@ -387,22 +406,32 @@ int nbd_client_init(BlockDriverState *bs, int sock, const char *export,

    /* NBD handshake */
    logout("session init %s\n", export);
-    qemu_set_block(sock);
-    ret = nbd_receive_negotiate(sock, export,
-                                &client->nbdflags, &client->size, errp);
+    qio_channel_set_blocking(QIO_CHANNEL(sioc), true, NULL);
+
+    ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), export,
+                                &client->nbdflags,
+                                tlscreds, hostname,
+                                &client->ioc,
+                                &client->size, errp);
    if (ret < 0) {
        logout("Failed to negotiate with the NBD server\n");
-        closesocket(sock);
        return ret;
    }

    qemu_co_mutex_init(&client->send_mutex);
    qemu_co_mutex_init(&client->free_sema);
-    client->sock = sock;
+    client->sioc = sioc;
+    object_ref(OBJECT(client->sioc));
+
+    if (!client->ioc) {
+        client->ioc = QIO_CHANNEL(sioc);
+        object_ref(OBJECT(client->ioc));
+    }

    /* Now that we're connected, set the socket to be non-blocking and
     * kick the reply mechanism.  */
-    qemu_set_nonblock(sock);
+    qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
+
    nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));

    logout("Established connection with NBD server\n");
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -4,6 +4,7 @@
 #include "qemu-common.h"
 #include "block/nbd.h"
 #include "block/block_int.h"
+#include "io/channel-socket.h"

 /* #define DEBUG_NBD */

@@ -17,7 +18,8 @@
 #define MAX_NBD_REQUESTS    16

 typedef struct NbdClientSession {
-    int sock;
+    QIOChannelSocket *sioc; /* The master data channel */
+    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
    uint32_t nbdflags;
    off_t size;

@@ -34,7 +36,11 @@ typedef struct NbdClientSession {

 NbdClientSession *nbd_get_client_session(BlockDriverState *bs);

-int nbd_client_init(BlockDriverState *bs, int sock, const char *export_name,
+int nbd_client_init(BlockDriverState *bs,
+                    QIOChannelSocket *sock,
+                    const char *export_name,
+                    QCryptoTLSCreds *tlscreds,
+                    const char *hostname,
                    Error **errp);
 void nbd_client_close(BlockDriverState *bs);

--- a/block/nbd.c
+++ b/block/nbd.c
@@ -31,7 +31,6 @@
 #include "qemu/uri.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
-#include "qemu/sockets.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qjson.h"
 #include "qapi/qmp/qint.h"
@@ -205,18 +204,20 @@ static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export,
    saddr = g_new0(SocketAddress, 1);

    if (qdict_haskey(options, "path")) {
+        UnixSocketAddress *q_unix;
        saddr->type = SOCKET_ADDRESS_KIND_UNIX;
-        saddr->u.q_unix = g_new0(UnixSocketAddress, 1);
-        saddr->u.q_unix->path = g_strdup(qdict_get_str(options, "path"));
+        q_unix = saddr->u.q_unix.data = g_new0(UnixSocketAddress, 1);
+        q_unix->path = g_strdup(qdict_get_str(options, "path"));
        qdict_del(options, "path");
    } else {
+        InetSocketAddress *inet;
        saddr->type = SOCKET_ADDRESS_KIND_INET;
-        saddr->u.inet = g_new0(InetSocketAddress, 1);
-        saddr->u.inet->host = g_strdup(qdict_get_str(options, "host"));
+        inet = saddr->u.inet.data = g_new0(InetSocketAddress, 1);
+        inet->host = g_strdup(qdict_get_str(options, "host"));
        if (!qdict_get_try_str(options, "port")) {
-            saddr->u.inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
+            inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
        } else {
-            saddr->u.inet->port = g_strdup(qdict_get_str(options, "port"));
+            inet->port = g_strdup(qdict_get_str(options, "port"));
        }
        qdict_del(options, "host");
        qdict_del(options, "port");
@@ -238,55 +239,113 @@ NbdClientSession *nbd_get_client_session(BlockDriverState *bs)
    return &s->client;
 }

-static int nbd_establish_connection(BlockDriverState *bs,
-                                    SocketAddress *saddr,
-                                    Error **errp)
+static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr,
+                                                  Error **errp)
 {
-    BDRVNBDState *s = bs->opaque;
-    int sock;
+    QIOChannelSocket *sioc;
+    Error *local_err = NULL;

-    sock = socket_connect(saddr, errp, NULL, NULL);
+    sioc = qio_channel_socket_new();

-    if (sock < 0) {
-        logout("Failed to establish connection to NBD server\n");
-        return -EIO;
+    qio_channel_socket_connect_sync(sioc,
+                                    saddr,
+                                    &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return NULL;
    }

-    if (!s->client.is_unix) {
-        socket_set_nodelay(sock);
-    }
+    qio_channel_set_delay(QIO_CHANNEL(sioc), false);

-    return sock;
+    return sioc;
 }

+
+static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp)
+{
+    Object *obj;
+    QCryptoTLSCreds *creds;
+
+    obj = object_resolve_path_component(
+        object_get_objects_root(), id);
+    if (!obj) {
+        error_setg(errp, "No TLS credentials with id '%s'",
+                   id);
+        return NULL;
+    }
+    creds = (QCryptoTLSCreds *)
+        object_dynamic_cast(obj, TYPE_QCRYPTO_TLS_CREDS);
+    if (!creds) {
+        error_setg(errp, "Object with id '%s' is not TLS credentials",
+                   id);
+        return NULL;
+    }
+
+    if (creds->endpoint != QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT) {
+        error_setg(errp,
+                   "Expecting TLS credentials with a client endpoint");
+        return NULL;
+    }
+    object_ref(obj);
+    return creds;
+}
+
+
 static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    BDRVNBDState *s = bs->opaque;
    char *export = NULL;
-    int result, sock;
+    QIOChannelSocket *sioc = NULL;
    SocketAddress *saddr;
+    const char *tlscredsid;
+    QCryptoTLSCreds *tlscreds = NULL;
+    const char *hostname = NULL;
+    int ret = -EINVAL;

    /* Pop the config into our state object. Exit if invalid. */
    saddr = nbd_config(s, options, &export, errp);
    if (!saddr) {
-        return -EINVAL;
+        goto error;
+    }
+
+    tlscredsid = g_strdup(qdict_get_try_str(options, "tls-creds"));
+    if (tlscredsid) {
+        qdict_del(options, "tls-creds");
+        tlscreds = nbd_get_tls_creds(tlscredsid, errp);
+        if (!tlscreds) {
+            goto error;
+        }
+
+        if (saddr->type != SOCKET_ADDRESS_KIND_INET) {
+            error_setg(errp, "TLS only supported over IP sockets");
+            goto error;
+        }
+        hostname = saddr->u.inet.data->host;
    }

    /* establish TCP connection, return error if it fails
     * TODO: Configurable retry-until-timeout behaviour.
     */
-    sock = nbd_establish_connection(bs, saddr, errp);
-    qapi_free_SocketAddress(saddr);
-    if (sock < 0) {
-        g_free(export);
-        return sock;
+    sioc = nbd_establish_connection(saddr, errp);
+    if (!sioc) {
+        ret = -ECONNREFUSED;
+        goto error;
    }

    /* NBD handshake */
-    result = nbd_client_init(bs, sock, export, errp);
+    ret = nbd_client_init(bs, sioc, export,
+                          tlscreds, hostname, errp);
+ error:
+    if (sioc) {
+        object_unref(OBJECT(sioc));
+    }
+    if (tlscreds) {
+        object_unref(OBJECT(tlscreds));
+    }
+    qapi_free_SocketAddress(saddr);
    g_free(export);
-    return result;
+    return ret;
 }

 static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
@@ -348,6 +407,7 @@ static void nbd_refresh_filename(BlockDriverState *bs, QDict *options)
    const char *host   = qdict_get_try_str(options, "host");
    const char *port   = qdict_get_try_str(options, "port");
    const char *export = qdict_get_try_str(options, "export");
+    const char *tlscreds = qdict_get_try_str(options, "tls-creds");

    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd")));

@@ -382,6 +442,9 @@ static void nbd_refresh_filename(BlockDriverState *bs, QDict *options)
    if (export) {
        qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export)));
    }
+    if (tlscreds) {
+        qdict_put_obj(opts, "tls-creds", QOBJECT(qstring_from_str(tlscreds)));
+    }

    bs->full_open_options = opts;
 }
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -36,6 +36,7 @@
 #include <nfsc/libnfs.h>

 #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576
+#define QEMU_NFS_MAX_DEBUG_LEVEL 2

 typedef struct NFSClient {
    struct nfs_context *context;
@@ -333,6 +334,17 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
                val = QEMU_NFS_MAX_READAHEAD_SIZE;
            }
            nfs_set_readahead(client->context, val);
+#endif
+#ifdef LIBNFS_FEATURE_DEBUG
+        } else if (!strcmp(qp->p[i].name, "debug")) {
+            /* limit the maximum debug level to avoid potential flooding
+             * of our log files. */
+            if (val > QEMU_NFS_MAX_DEBUG_LEVEL) {
+                error_report("NFS Warning: Limiting NFS debug level"
+                             " to %d", QEMU_NFS_MAX_DEBUG_LEVEL);
+                val = QEMU_NFS_MAX_DEBUG_LEVEL;
+            }
+            nfs_set_debug(client->context, val);
 #endif
        } else {
            error_setg(errp, "Unknown NFS parameter name: %s",
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -30,6 +30,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "qemu/bitmap.h"
 #include "qapi/util.h"
@@ -261,7 +262,7 @@ static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs)


 static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum)
+        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
 {
    BDRVParallelsState *s = bs->opaque;
    int64_t offset;
@@ -274,6 +275,7 @@ static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs,
        return 0;
    }

+    *file = bs->file->bs;
    return (offset << BDRV_SECTOR_BITS) |
        BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 }
@@ -460,7 +462,7 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    int64_t total_size, cl_size;
    uint8_t tmp[BDRV_SECTOR_SIZE];
    Error *local_err = NULL;
-    BlockDriverState *file;
+    BlockBackend *file;
    uint32_t bat_entries, bat_sectors;
    ParallelsHeader header;
    int ret;
@@ -476,14 +478,17 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
        return ret;
    }

-    file = NULL;
-    ret = bdrv_open(&file, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
-    if (ret < 0) {
+    file = blk_new_open(filename, NULL, NULL,
+                        BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                        &local_err);
+    if (file == NULL) {
        error_propagate(errp, local_err);
-        return ret;
+        return -EIO;
    }
-    ret = bdrv_truncate(file, 0);
+
+    blk_set_allow_write_beyond_eof(file, true);
+
+    ret = blk_truncate(file, 0);
    if (ret < 0) {
        goto exit;
    }
@@ -507,18 +512,18 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    memset(tmp, 0, sizeof(tmp));
    memcpy(tmp, &header, sizeof(header));

-    ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE);
+    ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE);
    if (ret < 0) {
        goto exit;
    }
-    ret = bdrv_write_zeroes(file, 1, bat_sectors - 1, 0);
+    ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0);
    if (ret < 0) {
        goto exit;
    }
    ret = 0;

 done:
-    bdrv_unref(file);
+    blk_unref(file);
    return ret;

 exit:
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -92,6 +92,26 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
        info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max;
        info->iops_wr_max     = cfg.buckets[THROTTLE_OPS_WRITE].max;

+        info->has_bps_max_length     = info->has_bps_max;
+        info->bps_max_length         =
+            cfg.buckets[THROTTLE_BPS_TOTAL].burst_length;
+        info->has_bps_rd_max_length  = info->has_bps_rd_max;
+        info->bps_rd_max_length      =
+            cfg.buckets[THROTTLE_BPS_READ].burst_length;
+        info->has_bps_wr_max_length  = info->has_bps_wr_max;
+        info->bps_wr_max_length      =
+            cfg.buckets[THROTTLE_BPS_WRITE].burst_length;
+
+        info->has_iops_max_length    = info->has_iops_max;
+        info->iops_max_length        =
+            cfg.buckets[THROTTLE_OPS_TOTAL].burst_length;
+        info->has_iops_rd_max_length = info->has_iops_rd_max;
+        info->iops_rd_max_length     =
+            cfg.buckets[THROTTLE_OPS_READ].burst_length;
+        info->has_iops_wr_max_length = info->has_iops_wr_max;
+        info->iops_wr_max_length     =
+            cfg.buckets[THROTTLE_OPS_WRITE].burst_length;
+
        info->has_iops_size = cfg.op_size;
        info->iops_size = cfg.op_size;

@@ -211,11 +231,13 @@ void bdrv_query_image_info(BlockDriverState *bs,
    Error *err = NULL;
    ImageInfo *info;

+    aio_context_acquire(bdrv_get_aio_context(bs));
+
    size = bdrv_getlength(bs);
    if (size < 0) {
        error_setg_errno(errp, -size, "Can't get size of device '%s'",
                         bdrv_get_device_name(bs));
-        return;
+        goto out;
    }

    info = g_new0(ImageInfo, 1);
@@ -283,10 +305,13 @@ void bdrv_query_image_info(BlockDriverState *bs,
    default:
        error_propagate(errp, err);
        qapi_free_ImageInfo(info);
-        return;
+        goto out;
    }

    *p_info = info;
+
+out:
+    aio_context_release(bdrv_get_aio_context(bs));
 }

 /* @p_info will be set only on success. */
@@ -300,7 +325,7 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
    info->locked = blk_dev_is_medium_locked(blk);
    info->removable = blk_dev_has_removable_media(blk);

-    if (blk_dev_has_removable_media(blk)) {
+    if (blk_dev_has_tray(blk)) {
        info->has_tray_open = true;
        info->tray_open = blk_dev_is_tray_open(blk);
    }
@@ -330,100 +355,116 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
    qapi_free_BlockInfo(info);
 }

-static BlockStats *bdrv_query_stats(const BlockDriverState *bs,
-                                    bool query_backing)
+static BlockStats *bdrv_query_stats(BlockBackend *blk,
+                                    const BlockDriverState *bs,
+                                    bool query_backing);
+
+static void bdrv_query_blk_stats(BlockStats *s, BlockBackend *blk)
 {
-    BlockStats *s;
+    BlockAcctStats *stats = blk_get_stats(blk);
+    BlockAcctTimedStats *ts = NULL;

-    s = g_malloc0(sizeof(*s));
+    s->has_device = true;
+    s->device = g_strdup(blk_name(blk));

-    if (bdrv_get_device_name(bs)[0]) {
-        s->has_device = true;
-        s->device = g_strdup(bdrv_get_device_name(bs));
+    s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
+    s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
+    s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
+    s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
+
+    s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
+    s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
+    s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
+
+    s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
+    s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
+    s->stats->invalid_flush_operations =
+        stats->invalid_ops[BLOCK_ACCT_FLUSH];
+
+    s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ];
+    s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
+    s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
+    s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
+    s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
+    s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
+
+    s->stats->has_idle_time_ns = stats->last_access_time_ns > 0;
+    if (s->stats->has_idle_time_ns) {
+        s->stats->idle_time_ns = block_acct_idle_time_ns(stats);
    }

+    s->stats->account_invalid = stats->account_invalid;
+    s->stats->account_failed = stats->account_failed;
+
+    while ((ts = block_acct_interval_next(stats, ts))) {
+        BlockDeviceTimedStatsList *timed_stats =
+            g_malloc0(sizeof(*timed_stats));
+        BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats));
+        timed_stats->next = s->stats->timed_stats;
+        timed_stats->value = dev_stats;
+        s->stats->timed_stats = timed_stats;
+
+        TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
+        TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
+        TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
+
+        dev_stats->interval_length = ts->interval_length;
+
+        dev_stats->min_rd_latency_ns = timed_average_min(rd);
+        dev_stats->max_rd_latency_ns = timed_average_max(rd);
+        dev_stats->avg_rd_latency_ns = timed_average_avg(rd);
+
+        dev_stats->min_wr_latency_ns = timed_average_min(wr);
+        dev_stats->max_wr_latency_ns = timed_average_max(wr);
+        dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
+
+        dev_stats->min_flush_latency_ns = timed_average_min(fl);
+        dev_stats->max_flush_latency_ns = timed_average_max(fl);
+        dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
+
+        dev_stats->avg_rd_queue_depth =
+            block_acct_queue_depth(ts, BLOCK_ACCT_READ);
+        dev_stats->avg_wr_queue_depth =
+            block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
+    }
+}
+
+static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs,
+                                 bool query_backing)
+{
    if (bdrv_get_node_name(bs)[0]) {
        s->has_node_name = true;
        s->node_name = g_strdup(bdrv_get_node_name(bs));
    }

-    s->stats = g_malloc0(sizeof(*s->stats));
-    if (bs->blk) {
-        BlockAcctStats *stats = blk_get_stats(bs->blk);
-        BlockAcctTimedStats *ts = NULL;
-
-        s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
-        s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
-        s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
-        s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
-
-        s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
-        s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
-        s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
-
-        s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
-        s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
-        s->stats->invalid_flush_operations =
-            stats->invalid_ops[BLOCK_ACCT_FLUSH];
-
-        s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ];
-        s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
-        s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
-        s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
-        s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
-        s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
-
-        s->stats->has_idle_time_ns = stats->last_access_time_ns > 0;
-        if (s->stats->has_idle_time_ns) {
-            s->stats->idle_time_ns = block_acct_idle_time_ns(stats);
-        }
-
-        s->stats->account_invalid = stats->account_invalid;
-        s->stats->account_failed = stats->account_failed;
-
-        while ((ts = block_acct_interval_next(stats, ts))) {
-            BlockDeviceTimedStatsList *timed_stats =
-                g_malloc0(sizeof(*timed_stats));
-            BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats));
-            timed_stats->next = s->stats->timed_stats;
-            timed_stats->value = dev_stats;
-            s->stats->timed_stats = timed_stats;
-
-            TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
-            TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
-            TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
-
-            dev_stats->interval_length = ts->interval_length;
-
-            dev_stats->min_rd_latency_ns = timed_average_min(rd);
-            dev_stats->max_rd_latency_ns = timed_average_max(rd);
-            dev_stats->avg_rd_latency_ns = timed_average_avg(rd);
-
-            dev_stats->min_wr_latency_ns = timed_average_min(wr);
-            dev_stats->max_wr_latency_ns = timed_average_max(wr);
-            dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
-
-            dev_stats->min_flush_latency_ns = timed_average_min(fl);
-            dev_stats->max_flush_latency_ns = timed_average_max(fl);
-            dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
-
-            dev_stats->avg_rd_queue_depth =
-                block_acct_queue_depth(ts, BLOCK_ACCT_READ);
-            dev_stats->avg_wr_queue_depth =
-                block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
-        }
-    }
-
    s->stats->wr_highest_offset = bs->wr_highest_offset;

    if (bs->file) {
        s->has_parent = true;
-        s->parent = bdrv_query_stats(bs->file->bs, query_backing);
+        s->parent = bdrv_query_stats(NULL, bs->file->bs, query_backing);
    }

    if (query_backing && bs->backing) {
        s->has_backing = true;
-        s->backing = bdrv_query_stats(bs->backing->bs, query_backing);
+        s->backing = bdrv_query_stats(NULL, bs->backing->bs, query_backing);
+    }
+
+}
+
+static BlockStats *bdrv_query_stats(BlockBackend *blk,
+                                    const BlockDriverState *bs,
+                                    bool query_backing)
+{
+    BlockStats *s;
+
+    s = g_malloc0(sizeof(*s));
+    s->stats = g_malloc0(sizeof(*s->stats));
+
+    if (blk) {
+        bdrv_query_blk_stats(s, blk);
+    }
+    if (bs) {
+        bdrv_query_bds_stats(s, bs, query_backing);
    }

    return s;
@@ -452,22 +493,38 @@ BlockInfoList *qmp_query_block(Error **errp)
    return head;
 }

+static bool next_query_bds(BlockBackend **blk, BlockDriverState **bs,
+                           bool query_nodes)
+{
+    if (query_nodes) {
+        *bs = bdrv_next_node(*bs);
+        return !!*bs;
+    }
+
+    *blk = blk_next(*blk);
+    *bs = *blk ? blk_bs(*blk) : NULL;
+
+    return !!*blk;
+}
+
 BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
                                     bool query_nodes,
                                     Error **errp)
 {
    BlockStatsList *head = NULL, **p_next = &head;
+    BlockBackend *blk = NULL;
    BlockDriverState *bs = NULL;

    /* Just to be safe if query_nodes is not always initialized */
    query_nodes = has_query_nodes && query_nodes;

-    while ((bs = query_nodes ? bdrv_next_node(bs) : bdrv_next(bs))) {
+    while (next_query_bds(&blk, &bs, query_nodes)) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
-        AioContext *ctx = bdrv_get_aio_context(bs);
+        AioContext *ctx = blk ? blk_get_aio_context(blk)
+                              : bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
-        info->value = bdrv_query_stats(bs, !query_nodes);
+        info->value = bdrv_query_stats(blk, bs, !query_nodes);
        aio_context_release(ctx);

        *p_next = info;
@@ -636,7 +693,7 @@ void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f,
    QmpOutputVisitor *ov = qmp_output_visitor_new();
    QObject *obj, *data;

-    visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), &info_spec, NULL,
+    visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), NULL, &info_spec,
                                 &error_abort);
    obj = qmp_output_get_qobject(ov);
    assert(qobject_type(obj) == QTYPE_QDICT);
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -24,6 +24,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include <zlib.h>
 #include "qapi/qmp/qerror.h"
@@ -120,11 +121,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }
    if (header.version != QCOW_VERSION) {
-        char version[64];
-        snprintf(version, sizeof(version), "QCOW version %" PRIu32,
-                 header.version);
-        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                   bdrv_get_device_or_node_name(bs), "qcow", version);
+        error_setg(errp, "Unsupported qcow version %" PRIu32, header.version);
        ret = -ENOTSUP;
        goto fail;
    }
@@ -489,7 +486,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
 }

 static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum)
+        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
 {
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster, n;
@@ -510,6 +507,7 @@ static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
        return BDRV_BLOCK_DATA;
    }
    cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
+    *file = bs->file->bs;
    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset;
 }

@@ -779,7 +777,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    int flags = 0;
    Error *local_err = NULL;
    int ret;
-    BlockDriverState *qcow_bs;
+    BlockBackend *qcow_blk;

    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
@@ -795,15 +793,18 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
        goto cleanup;
    }

-    qcow_bs = NULL;
-    ret = bdrv_open(&qcow_bs, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
-    if (ret < 0) {
+    qcow_blk = blk_new_open(filename, NULL, NULL,
+                            BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                            &local_err);
+    if (qcow_blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto cleanup;
    }

-    ret = bdrv_truncate(qcow_bs, 0);
+    blk_set_allow_write_beyond_eof(qcow_blk, true);
+
+    ret = blk_truncate(qcow_blk, 0);
    if (ret < 0) {
        goto exit;
    }
@@ -843,13 +844,13 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    /* write all the data */
-    ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header));
+    ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header));
    if (ret != sizeof(header)) {
        goto exit;
    }

    if (backing_file) {
-        ret = bdrv_pwrite(qcow_bs, sizeof(header),
+        ret = blk_pwrite(qcow_blk, sizeof(header),
            backing_file, backing_filename_len);
        if (ret != backing_filename_len) {
            goto exit;
@@ -859,7 +860,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    tmp = g_malloc0(BDRV_SECTOR_SIZE);
    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
        BDRV_SECTOR_SIZE); i++) {
-        ret = bdrv_pwrite(qcow_bs, header_size +
+        ret = blk_pwrite(qcow_blk, header_size +
            BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
        if (ret != BDRV_SECTOR_SIZE) {
            g_free(tmp);
@@ -870,7 +871,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    g_free(tmp);
    ret = 0;
 exit:
-    bdrv_unref(qcow_bs);
+    blk_unref(qcow_blk);
 cleanup:
    g_free(backing_file);
    return ret;
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -24,6 +24,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include <zlib.h>
 #include "block/qcow2.h"
@@ -197,22 +198,8 @@ static void cleanup_unknown_header_ext(BlockDriverState *bs)
    }
 }

-static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
-    Error **errp, const char *fmt, ...)
-{
-    char msg[64];
-    va_list ap;
-
-    va_start(ap, fmt);
-    vsnprintf(msg, sizeof(msg), fmt, ap);
-    va_end(ap);
-
-    error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-               bdrv_get_device_or_node_name(bs), "qcow2", msg);
-}
-
-static void report_unsupported_feature(BlockDriverState *bs,
-    Error **errp, Qcow2Feature *table, uint64_t mask)
+static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
+                                       uint64_t mask)
 {
    char *features = g_strdup("");
    char *old;
@@ -237,7 +224,7 @@ static void report_unsupported_feature(BlockDriverState *bs,
        g_free(old);
    }

-    report_unsupported(bs, errp, "%s", features);
+    error_setg(errp, "Unsupported qcow2 feature(s): %s", features);
    g_free(features);
 }

@@ -854,7 +841,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }
    if (header.version < 2 || header.version > 3) {
-        report_unsupported(bs, errp, "QCOW version %" PRIu32, header.version);
+        error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
        ret = -ENOTSUP;
        goto fail;
    }
@@ -934,7 +921,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        void *feature_table = NULL;
        qcow2_read_extensions(bs, header.header_length, ext_end,
                              &feature_table, NULL);
-        report_unsupported_feature(bs, errp, feature_table,
+        report_unsupported_feature(errp, feature_table,
                                   s->incompatible_features &
                                   ~QCOW2_INCOMPAT_MASK);
        ret = -ENOTSUP;
@@ -1330,7 +1317,7 @@ static void qcow2_join_options(QDict *options, QDict *old_options)
 }

 static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum)
+        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_offset;
@@ -1349,6 +1336,7 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
        !s->cipher) {
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
+        *file = bs->file->bs;
        status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
    }
    if (ret == QCOW2_CLUSTER_ZERO) {
@@ -2096,7 +2084,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
     * size for any qcow2 image.
     */
-    BlockDriverState* bs;
+    BlockBackend *blk;
    QCowHeader *header;
    uint64_t* refcount_table;
    Error *local_err = NULL;
@@ -2171,14 +2159,16 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        return ret;
    }

-    bs = NULL;
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
-        return ret;
+        return -EIO;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /* Write the header */
    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
    header = g_malloc0(cluster_size);
@@ -2206,7 +2196,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
    }

-    ret = bdrv_pwrite(bs, 0, header, cluster_size);
+    ret = blk_pwrite(blk, 0, header, cluster_size);
    g_free(header);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
@@ -2216,7 +2206,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    /* Write a refcount table with one refcount block */
    refcount_table = g_malloc0(2 * cluster_size);
    refcount_table[0] = cpu_to_be64(2 * cluster_size);
-    ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size);
+    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size);
    g_free(refcount_table);

    if (ret < 0) {
@@ -2224,8 +2214,8 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        goto out;
    }

-    bdrv_unref(bs);
-    bs = NULL;
+    blk_unref(blk);
+    blk = NULL;

    /*
     * And now open the image and make it consistent first (i.e. increase the
@@ -2234,15 +2224,16 @@ static int qcow2_create2(const char *filename, int64_t total_size,
     */
    options = qdict_new();
    qdict_put(options, "driver", qstring_from_str("qcow2"));
-    ret = bdrv_open(&bs, filename, NULL, options,
-                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, options,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto out;
    }

-    ret = qcow2_alloc_clusters(bs, 3 * cluster_size);
+    ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
                         "header and refcount table");
@@ -2254,14 +2245,14 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    }

    /* Create a full header (including things like feature table) */
-    ret = qcow2_update_header(bs);
+    ret = qcow2_update_header(blk_bs(blk));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not update qcow2 header");
        goto out;
    }

    /* Okay, now that we have a valid image, let's give it the right size */
-    ret = bdrv_truncate(bs, total_size);
+    ret = blk_truncate(blk, total_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not resize image");
        goto out;
@@ -2269,7 +2260,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,

    /* Want a backing file? There you go.*/
    if (backing_file) {
-        ret = bdrv_change_backing_file(bs, backing_file, backing_format);
+        ret = bdrv_change_backing_file(blk_bs(blk), backing_file, backing_format);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
                             "with format '%s'", backing_file, backing_format);
@@ -2279,9 +2270,9 @@ static int qcow2_create2(const char *filename, int64_t total_size,

    /* And if we're supposed to preallocate metadata, do that now */
    if (prealloc != PREALLOC_MODE_OFF) {
-        BDRVQcow2State *s = bs->opaque;
+        BDRVQcow2State *s = blk_bs(blk)->opaque;
        qemu_co_mutex_lock(&s->lock);
-        ret = preallocate(bs);
+        ret = preallocate(blk_bs(blk));
        qemu_co_mutex_unlock(&s->lock);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not preallocate metadata");
@@ -2289,24 +2280,25 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        }
    }

-    bdrv_unref(bs);
-    bs = NULL;
+    blk_unref(blk);
+    blk = NULL;

    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */
    options = qdict_new();
    qdict_put(options, "driver", qstring_from_str("qcow2"));
-    ret = bdrv_open(&bs, filename, NULL, options,
-                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING,
-                    &local_err);
-    if (local_err) {
+    blk = blk_new_open(filename, NULL, options,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto out;
    }

    ret = 0;
 out:
-    if (bs) {
-        bdrv_unref(bs);
+    if (blk) {
+        blk_unref(blk);
    }
    return ret;
 }
@@ -2808,15 +2800,15 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)

    *spec_info = (ImageInfoSpecific){
        .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
-        .u.qcow2 = g_new(ImageInfoSpecificQCow2, 1),
+        .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1),
    };
    if (s->qcow_version == 2) {
-        *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){
+        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("0.10"),
            .refcount_bits      = s->refcount_bits,
        };
    } else if (s->qcow_version == 3) {
-        *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){
+        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("1.1"),
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
--- a/block/qed.c
+++ b/block/qed.c
@@ -18,6 +18,7 @@
 #include "qed.h"
 #include "qapi/qmp/qerror.h"
 #include "migration/migration.h"
+#include "sysemu/block-backend.h"

 static const AIOCBInfo qed_aiocb_info = {
    .aiocb_size         = sizeof(QEDAIOCB),
@@ -376,18 +377,6 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
    }
 }

-static void bdrv_qed_drain(BlockDriverState *bs)
-{
-    BDRVQEDState *s = bs->opaque;
-
-    /* Cancel timer and start doing I/O that were meant to happen as if it
-     * fired, that way we get bdrv_drain() taking care of the ongoing requests
-     * correctly. */
-    qed_cancel_need_check_timer(s);
-    qed_plug_allocating_write_reqs(s);
-    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
-}
-
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
 {
@@ -411,11 +400,8 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
    }
    if (s->header.features & ~QED_FEATURE_MASK) {
        /* image uses unsupported feature bits */
-        char buf[64];
-        snprintf(buf, sizeof(buf), "%" PRIx64,
-            s->header.features & ~QED_FEATURE_MASK);
-        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                   bdrv_get_device_or_node_name(bs), "QED", buf);
+        error_setg(errp, "Unsupported QED features: %" PRIx64,
+                   s->header.features & ~QED_FEATURE_MASK);
        return -ENOTSUP;
    }
    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
@@ -580,7 +566,7 @@ static int qed_create(const char *filename, uint32_t cluster_size,
    size_t l1_size = header.cluster_size * header.table_size;
    Error *local_err = NULL;
    int ret = 0;
-    BlockDriverState *bs;
+    BlockBackend *blk;

    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
@@ -588,17 +574,18 @@ static int qed_create(const char *filename, uint32_t cluster_size,
        return ret;
    }

-    bs = NULL;
-    ret = bdrv_open(&bs, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
-        return ret;
+        return -EIO;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /* File must start empty and grow, check truncate is supported */
-    ret = bdrv_truncate(bs, 0);
+    ret = blk_truncate(blk, 0);
    if (ret < 0) {
        goto out;
    }
@@ -614,18 +601,18 @@ static int qed_create(const char *filename, uint32_t cluster_size,
    }

    qed_header_cpu_to_le(&header, &le_header);
-    ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
+    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header));
    if (ret < 0) {
        goto out;
    }
-    ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
-                      header.backing_filename_size);
+    ret = blk_pwrite(blk, sizeof(le_header), backing_file,
+                     header.backing_filename_size);
    if (ret < 0) {
        goto out;
    }

    l1_table = g_malloc0(l1_size);
-    ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
+    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size);
    if (ret < 0) {
        goto out;
    }
@@ -633,7 +620,7 @@ static int qed_create(const char *filename, uint32_t cluster_size,
    ret = 0; /* success */
 out:
    g_free(l1_table);
-    bdrv_unref(bs);
+    blk_unref(blk);
    return ret;
 }

@@ -693,6 +680,7 @@ typedef struct {
    uint64_t pos;
    int64_t status;
    int *pnum;
+    BlockDriverState **file;
 } QEDIsAllocatedCB;

 static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
@@ -704,6 +692,7 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
    case QED_CLUSTER_FOUND:
        offset |= qed_offset_into_cluster(s, cb->pos);
        cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
+        *cb->file = cb->bs->file->bs;
        break;
    case QED_CLUSTER_ZERO:
        cb->status = BDRV_BLOCK_ZERO;
@@ -725,7 +714,8 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l

 static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
                                                 int64_t sector_num,
-                                                 int nb_sectors, int *pnum)
+                                                 int nb_sectors, int *pnum,
+                                                 BlockDriverState **file)
 {
    BDRVQEDState *s = bs->opaque;
    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
@@ -734,6 +724,7 @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
        .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE,
        .status = BDRV_BLOCK_OFFSET_MASK,
        .pnum = pnum,
+        .file = file,
    };
    QEDRequest request = { .l2_table = NULL };

@@ -1688,7 +1679,6 @@ static BlockDriver bdrv_qed = {
    .bdrv_check               = bdrv_qed_check,
    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
-    .bdrv_drain               = bdrv_qed_drain,
 };

 static void bdrv_qed_init(void)
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -215,14 +215,16 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
    return acb;
 }

-static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret)
+static void quorum_report_bad(QuorumOpType type, uint64_t sector_num,
+                              int nb_sectors, char *node_name, int ret)
 {
    const char *msg = NULL;
    if (ret < 0) {
        msg = strerror(-ret);
    }
-    qapi_event_send_quorum_report_bad(!!msg, msg, node_name,
-                                      acb->sector_num, acb->nb_sectors, &error_abort);
+
+    qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name,
+                                      sector_num, nb_sectors, &error_abort);
 }

 static void quorum_report_failure(QuorumAIOCB *acb)
@@ -284,9 +286,19 @@ static void quorum_aio_cb(void *opaque, int ret)
    BDRVQuorumState *s = acb->common.bs->opaque;
    bool rewrite = false;

+    if (ret == 0) {
+        acb->success_count++;
+    } else {
+        QuorumOpType type;
+        type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
+        quorum_report_bad(type, acb->sector_num, acb->nb_sectors,
+                          sacb->aiocb->bs->node_name, ret);
+    }
+
    if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
        /* We try to read next child in FIFO order if we fail to read */
-        if (ret < 0 && ++acb->child_iter < s->num_children) {
+        if (ret < 0 && (acb->child_iter + 1) < s->num_children) {
+            acb->child_iter++;
            read_fifo_child(acb);
            return;
        }
@@ -301,11 +313,6 @@ static void quorum_aio_cb(void *opaque, int ret)

    sacb->ret = ret;
    acb->count++;
-    if (ret == 0) {
-        acb->success_count++;
-    } else {
-        quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret);
-    }
    assert(acb->count <= s->num_children);
    assert(acb->success_count <= s->num_children);
    if (acb->count < s->num_children) {
@@ -337,7 +344,9 @@ static void quorum_report_bad_versions(BDRVQuorumState *s,
            continue;
        }
        QLIST_FOREACH(item, &version->items, next) {
-            quorum_report_bad(acb, s->children[item->index]->bs->node_name, 0);
+            quorum_report_bad(QUORUM_OP_TYPE_READ, acb->sector_num,
+                              acb->nb_sectors,
+                              s->children[item->index]->bs->node_name, 0);
        }
    }
 }
@@ -647,8 +656,9 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
    }

    for (i = 0; i < s->num_children; i++) {
-        bdrv_aio_readv(s->children[i]->bs, acb->sector_num, &acb->qcrs[i].qiov,
-                       acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]);
+        acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num,
+                                            &acb->qcrs[i].qiov, acb->nb_sectors,
+                                            quorum_aio_cb, &acb->qcrs[i]);
    }

    return &acb->common;
@@ -663,9 +673,10 @@ static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb)
    qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov);
    qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
                     acb->qcrs[acb->child_iter].buf);
-    bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num,
-                   &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
-                   quorum_aio_cb, &acb->qcrs[acb->child_iter]);
+    acb->qcrs[acb->child_iter].aiocb =
+        bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num,
+                       &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
+                       quorum_aio_cb, &acb->qcrs[acb->child_iter]);

    return &acb->common;
 }
@@ -759,19 +770,30 @@ static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
    QuorumVoteValue result_value;
    int i;
    int result = 0;
+    int success_count = 0;

    QLIST_INIT(&error_votes.vote_list);
    error_votes.compare = quorum_64bits_compare;

    for (i = 0; i < s->num_children; i++) {
        result = bdrv_co_flush(s->children[i]->bs);
-        result_value.l = result;
-        quorum_count_vote(&error_votes, &result_value, i);
+        if (result) {
+            quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0,
+                              bdrv_nb_sectors(s->children[i]->bs),
+                              s->children[i]->bs->node_name, result);
+            result_value.l = result;
+            quorum_count_vote(&error_votes, &result_value, i);
+        } else {
+            success_count++;
+        }
    }

-    winner = quorum_get_vote_winner(&error_votes);
-    result = winner->value.l;
-
+    if (success_count >= s->threshold) {
+        result = 0;
+    } else {
+        winner = quorum_get_vote_winner(&error_votes);
+        result = winner->value.l;
+    }
    quorum_free_vote_list(&error_votes);

    return result;
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1818,7 +1818,8 @@ static int find_allocation(BlockDriverState *bs, off_t start,
 */
 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                                    int64_t sector_num,
-                                                    int nb_sectors, int *pnum)
+                                                    int nb_sectors, int *pnum,
+                                                    BlockDriverState **file)
 {
    off_t start, data = 0, hole = 0;
    int64_t total_size;
@@ -1860,6 +1861,7 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
        ret = BDRV_BLOCK_ZERO;
    }
+    *file = bs;
    return ret | BDRV_BLOCK_OFFSET_VALID | start;
 }

--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -115,9 +115,11 @@ fail:

 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                            int64_t sector_num,
-                                            int nb_sectors, int *pnum)
+                                            int nb_sectors, int *pnum,
+                                            BlockDriverState **file)
 {
    *pnum = nb_sectors;
+    *file = bs->file->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
           (sector_num << BDRV_SECTOR_BITS);
 }
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -16,6 +16,7 @@
 #include "qemu-common.h"
 #include "qemu/error-report.h"
 #include "block/block_int.h"
+#include "crypto/secret.h"

 #include <rbd/librbd.h>

@@ -228,6 +229,27 @@ static char *qemu_rbd_parse_clientname(const char *conf, char *clientname)
    return NULL;
 }

+
+static int qemu_rbd_set_auth(rados_t cluster, const char *secretid,
+                             Error **errp)
+{
+    if (secretid == 0) {
+        return 0;
+    }
+
+    gchar *secret = qcrypto_secret_lookup_as_base64(secretid,
+                                                    errp);
+    if (!secret) {
+        return -1;
+    }
+
+    rados_conf_set(cluster, "key", secret);
+    g_free(secret);
+
+    return 0;
+}
+
+
 static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
                             bool only_read_conf_file,
                             Error **errp)
@@ -299,10 +321,13 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
+    const char *secretid;
    rados_t cluster;
    rados_ioctx_t io_ctx;
    int ret;

+    secretid = qemu_opt_get(opts, "password-secret");
+
    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           name, sizeof(name),
@@ -350,6 +375,11 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
        return -EIO;
    }

+    if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) {
+        rados_shutdown(cluster);
+        return -EIO;
+    }
+
    if (rados_connect(cluster) < 0) {
        error_setg(errp, "error connecting");
        rados_shutdown(cluster);
@@ -423,6 +453,11 @@ static QemuOptsList runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "Specification of the rbd image",
        },
+        {
+            .name = "password-secret",
+            .type = QEMU_OPT_STRING,
+            .help = "ID of secret providing the password",
+        },
        { /* end of list */ }
    },
 };
@@ -436,6 +471,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
+    const char *secretid;
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;
@@ -450,6 +486,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
    }

    filename = qemu_opt_get(opts, "filename");
+    secretid = qemu_opt_get(opts, "password-secret");

    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
@@ -488,6 +525,11 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

+    if (qemu_rbd_set_auth(s->cluster, secretid, errp) < 0) {
+        r = -EIO;
+        goto failed_shutdown;
+    }
+
    /*
     * Fallback to more conservative semantics if setting cache
     * options fails. Ignore errors from setting rbd_cache because the
@@ -919,6 +961,11 @@ static QemuOptsList qemu_rbd_create_opts = {
            .type = QEMU_OPT_SIZE,
            .help = "RBD object size"
        },
+        {
+            .name = "password-secret",
+            .type = QEMU_OPT_STRING,
+            .help = "ID of secret providing the password",
+        },
        { /* end of list */ }
    }
 };
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -18,6 +18,7 @@
 #include "qemu/error-report.h"
 #include "qemu/sockets.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/bitops.h"

 #define SD_PROTO_VER 0x01
@@ -284,6 +285,12 @@ static inline bool is_snapshot(struct SheepdogInode *inode)
    return !!inode->snap_ctime;
 }

+static inline size_t count_data_objs(const struct SheepdogInode *inode)
+{
+    return DIV_ROUND_UP(inode->vdi_size,
+                        (1UL << inode->block_size_shift));
+}
+
 #undef DPRINTF
 #ifdef DEBUG_SDOG
 #define DPRINTF(fmt, args...)                                       \
@@ -609,14 +616,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
    ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
    if (ret != sizeof(*hdr)) {
        error_report("failed to send a req, %s", strerror(errno));
-        ret = -socket_error();
-        return ret;
+        return -errno;
    }

    ret = qemu_co_send(sockfd, data, *wlen);
    if (ret != *wlen) {
-        ret = -socket_error();
        error_report("failed to send a req, %s", strerror(errno));
+        return -errno;
    }

    return ret;
@@ -1631,7 +1637,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,

 static int sd_prealloc(const char *filename, Error **errp)
 {
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk = NULL;
    BDRVSheepdogState *base = NULL;
    unsigned long buf_size;
    uint32_t idx, max_idx;
@@ -1640,19 +1646,23 @@ static int sd_prealloc(const char *filename, Error **errp)
    void *buf = NULL;
    int ret;

-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    errp);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       errp);
+    if (blk == NULL) {
+        ret = -EIO;
        goto out_with_err_set;
    }

-    vdi_size = bdrv_getlength(bs);
+    blk_set_allow_write_beyond_eof(blk, true);
+
+    vdi_size = blk_getlength(blk);
    if (vdi_size < 0) {
        ret = vdi_size;
        goto out;
    }

-    base = bs->opaque;
+    base = blk_bs(blk)->opaque;
    object_size = (UINT32_C(1) << base->inode.block_size_shift);
    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
    buf = g_malloc0(buf_size);
@@ -1664,23 +1674,24 @@ static int sd_prealloc(const char *filename, Error **errp)
         * The created image can be a cloned image, so we need to read
         * a data from the source image.
         */
-        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
+        ret = blk_pread(blk, idx * buf_size, buf, buf_size);
        if (ret < 0) {
            goto out;
        }
-        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
+        ret = blk_pwrite(blk, idx * buf_size, buf, buf_size);
        if (ret < 0) {
            goto out;
        }
    }

+    ret = 0;
 out:
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Can't pre-allocate");
    }
 out_with_err_set:
-    if (bs) {
-        bdrv_unref(bs);
+    if (blk) {
+        blk_unref(blk);
    }
    g_free(buf);

@@ -1820,7 +1831,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
    }

    if (backing_file) {
-        BlockDriverState *bs;
+        BlockBackend *blk;
        BDRVSheepdogState *base;
        BlockDriver *drv;

@@ -1832,22 +1843,23 @@ static int sd_create(const char *filename, QemuOpts *opts,
            goto out;
        }

-        bs = NULL;
-        ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, errp);
-        if (ret < 0) {
+        blk = blk_new_open(backing_file, NULL, NULL,
+                           BDRV_O_PROTOCOL | BDRV_O_CACHE_WB, errp);
+        if (blk == NULL) {
+            ret = -EIO;
            goto out;
        }

-        base = bs->opaque;
+        base = blk_bs(blk)->opaque;

        if (!is_snapshot(&base->inode)) {
            error_setg(errp, "cannot clone from a non snapshot vdi");
-            bdrv_unref(bs);
+            blk_unref(blk);
            ret = -EINVAL;
            goto out;
        }
        s->inode.vdi_id = base->inode.vdi_id;
-        bdrv_unref(bs);
+        blk_unref(blk);
    }

    s->aio_context = qemu_get_aio_context();
@@ -2478,13 +2490,131 @@ out:
    return ret;
 }

+#define NR_BATCHED_DISCARD 128
+
+static bool remove_objects(BDRVSheepdogState *s)
+{
+    int fd, i = 0, nr_objs = 0;
+    Error *local_err = NULL;
+    int ret = 0;
+    bool result = true;
+    SheepdogInode *inode = &s->inode;
+
+    fd = connect_to_sdog(s, &local_err);
+    if (fd < 0) {
+        error_report_err(local_err);
+        return false;
+    }
+
+    nr_objs = count_data_objs(inode);
+    while (i < nr_objs) {
+        int start_idx, nr_filled_idx;
+
+        while (i < nr_objs && !inode->data_vdi_id[i]) {
+            i++;
+        }
+        start_idx = i;
+
+        nr_filled_idx = 0;
+        while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) {
+            if (inode->data_vdi_id[i]) {
+                inode->data_vdi_id[i] = 0;
+                nr_filled_idx++;
+            }
+
+            i++;
+        }
+
+        ret = write_object(fd, s->aio_context,
+                           (char *)&inode->data_vdi_id[start_idx],
+                           vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
+                           (i - start_idx) * sizeof(uint32_t),
+                           offsetof(struct SheepdogInode,
+                                    data_vdi_id[start_idx]),
+                           false, s->cache_flags);
+        if (ret < 0) {
+            error_report("failed to discard snapshot inode.");
+            result = false;
+            goto out;
+        }
+    }
+
+out:
+    closesocket(fd);
+    return result;
+}
+
 static int sd_snapshot_delete(BlockDriverState *bs,
                              const char *snapshot_id,
                              const char *name,
                              Error **errp)
 {
-    /* FIXME: Delete specified snapshot id.  */
-    return 0;
+    unsigned long snap_id = 0;
+    char snap_tag[SD_MAX_VDI_TAG_LEN];
+    Error *local_err = NULL;
+    int fd, ret;
+    char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
+    BDRVSheepdogState *s = bs->opaque;
+    unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0;
+    uint32_t vid;
+    SheepdogVdiReq hdr = {
+        .opcode = SD_OP_DEL_VDI,
+        .data_length = wlen,
+        .flags = SD_FLAG_CMD_WRITE,
+    };
+    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
+
+    if (!remove_objects(s)) {
+        return -1;
+    }
+
+    memset(buf, 0, sizeof(buf));
+    memset(snap_tag, 0, sizeof(snap_tag));
+    pstrcpy(buf, SD_MAX_VDI_LEN, s->name);
+    ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id);
+    if (ret || snap_id > UINT32_MAX) {
+        error_setg(errp, "Invalid snapshot ID: %s",
+                         snapshot_id ? snapshot_id : "<null>");
+        return -EINVAL;
+    }
+
+    if (snap_id) {
+        hdr.snapid = (uint32_t) snap_id;
+    } else {
+        pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id);
+        pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
+    }
+
+    ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true,
+                        &local_err);
+    if (ret) {
+        return ret;
+    }
+
+    fd = connect_to_sdog(s, &local_err);
+    if (fd < 0) {
+        error_report_err(local_err);
+        return -1;
+    }
+
+    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+                 buf, &wlen, &rlen);
+    closesocket(fd);
+    if (ret) {
+        return ret;
+    }
+
+    switch (rsp->result) {
+    case SD_RES_NO_VDI:
+        error_report("%s was already deleted", s->name);
+    case SD_RES_SUCCESS:
+        break;
+    default:
+        error_report("%s, %s", sd_strerror(rsp->result), s->name);
+        return -1;
+    }
+
+    return ret;
 }

 static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
@@ -2708,7 +2838,7 @@ retry:

 static coroutine_fn int64_t
 sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                       int *pnum)
+                       int *pnum, BlockDriverState **file)
 {
    BDRVSheepdogState *s = bs->opaque;
    SheepdogInode *inode = &s->inode;
@@ -2739,6 +2869,9 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    if (*pnum > nb_sectors) {
        *pnum = nb_sectors;
    }
+    if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
+        *file = bs;
+    }
    return ret;
 }

--- a/block/vdi.c
+++ b/block/vdi.c
@@ -52,6 +52,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
 #include "qemu/coroutine.h"
@@ -527,7 +528,7 @@ static int vdi_reopen_prepare(BDRVReopenState *state,
 }

 static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum)
+        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
 {
    /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */
    BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
@@ -551,6 +552,7 @@ static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs,
    offset = s->header.offset_data +
                              (uint64_t)bmap_entry * s->block_size +
                              sector_in_block * SECTOR_SIZE;
+    *file = bs->file->bs;
    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 }

@@ -732,7 +734,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    size_t bmap_size;
    int64_t offset = 0;
    Error *local_err = NULL;
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk = NULL;
    uint32_t *bmap = NULL;

    logout("\n");
@@ -765,13 +767,18 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
        error_propagate(errp, local_err);
        goto exit;
    }
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto exit;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /* We need enough blocks to store the given disk size,
       so always round up. */
    blocks = DIV_ROUND_UP(bytes, block_size);
@@ -801,7 +808,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    vdi_header_print(&header);
 #endif
    vdi_header_to_le(&header);
-    ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header));
+    ret = blk_pwrite(blk, offset, &header, sizeof(header));
    if (ret < 0) {
        error_setg(errp, "Error writing header to %s", filename);
        goto exit;
@@ -822,7 +829,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
                bmap[i] = VDI_UNALLOCATED;
            }
        }
-        ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size);
+        ret = blk_pwrite(blk, offset, bmap, bmap_size);
        if (ret < 0) {
            error_setg(errp, "Error writing bmap to %s", filename);
            goto exit;
@@ -831,7 +838,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    if (image_type == VDI_TYPE_STATIC) {
-        ret = bdrv_truncate(bs, offset + blocks * block_size);
+        ret = blk_truncate(blk, offset + blocks * block_size);
        if (ret < 0) {
            error_setg(errp, "Failed to statically allocate %s", filename);
            goto exit;
@@ -839,7 +846,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    }

 exit:
-    bdrv_unref(bs);
+    blk_unref(blk);
    g_free(bmap);
    return ret;
 }
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -18,6 +18,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "qemu/crc32c.h"
 #include "block/vhdx.h"
@@ -264,10 +265,10 @@ static void vhdx_region_unregister_all(BDRVVHDXState *s)

 static void vhdx_set_shift_bits(BDRVVHDXState *s)
 {
-    s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size);
-    s->sectors_per_block_bits =   31 - clz32(s->sectors_per_block);
-    s->chunk_ratio_bits =         63 - clz64(s->chunk_ratio);
-    s->block_size_bits =          31 - clz32(s->block_size);
+    s->logical_sector_size_bits = ctz32(s->logical_sector_size);
+    s->sectors_per_block_bits =   ctz32(s->sectors_per_block);
+    s->chunk_ratio_bits =         ctz64(s->chunk_ratio);
+    s->block_size_bits =          ctz32(s->block_size);
 }

 /*
@@ -857,14 +858,8 @@ static void vhdx_calc_bat_entries(BDRVVHDXState *s)
 {
    uint32_t data_blocks_cnt, bitmap_blocks_cnt;

-    data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits;
-    if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) {
-        data_blocks_cnt++;
-    }
-    bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits;
-    if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) {
-        bitmap_blocks_cnt++;
-    }
+    data_blocks_cnt = DIV_ROUND_UP(s->virtual_disk_size, s->block_size);
+    bitmap_blocks_cnt = DIV_ROUND_UP(data_blocks_cnt, s->chunk_ratio);

    if (s->parent_entries) {
        s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1);
@@ -1778,7 +1773,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)

    gunichar2 *creator = NULL;
    glong creator_items;
-    BlockDriverState *bs;
+    BlockBackend *blk;
    char *type = NULL;
    VHDXImageType image_type;
    Error *local_err = NULL;
@@ -1843,14 +1838,17 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
        goto exit;
    }

-    bs = NULL;
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto exit;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /* Create (A) */

    /* The creator field is optional, but may be useful for
@@ -1858,13 +1856,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL,
                              &creator_items, NULL);
    signature = cpu_to_le64(VHDX_FILE_SIGNATURE);
-    ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature));
+    ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature));
    if (ret < 0) {
        goto delete_and_exit;
    }
    if (creator) {
-        ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature),
-                          creator, creator_items * sizeof(gunichar2));
+        ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature),
+                         creator, creator_items * sizeof(gunichar2));
        if (ret < 0) {
            goto delete_and_exit;
        }
@@ -1872,13 +1870,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)


    /* Creates (B),(C) */
-    ret = vhdx_create_new_headers(bs, image_size, log_size);
+    ret = vhdx_create_new_headers(blk_bs(blk), image_size, log_size);
    if (ret < 0) {
        goto delete_and_exit;
    }

    /* Creates (D),(E),(G) explicitly. (F) created as by-product */
-    ret = vhdx_create_new_region_table(bs, image_size, block_size, 512,
+    ret = vhdx_create_new_region_table(blk_bs(blk), image_size, block_size, 512,
                                       log_size, use_zero_blocks, image_type,
                                       &metadata_offset);
    if (ret < 0) {
@@ -1886,7 +1884,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    /* Creates (H) */
-    ret = vhdx_create_new_metadata(bs, image_size, block_size, 512,
+    ret = vhdx_create_new_metadata(blk_bs(blk), image_size, block_size, 512,
                                   metadata_offset, image_type);
    if (ret < 0) {
        goto delete_and_exit;
@@ -1894,7 +1892,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)


 delete_and_exit:
-    bdrv_unref(bs);
+    blk_unref(blk);
 exit:
    g_free(type);
    g_free(creator);
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -26,6 +26,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
@@ -242,15 +243,17 @@ static void vmdk_free_last_extent(BlockDriverState *bs)

 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
 {
-    char desc[DESC_SIZE];
+    char *desc;
    uint32_t cid = 0xffffffff;
    const char *p_name, *cid_str;
    size_t cid_str_size;
    BDRVVmdkState *s = bs->opaque;
    int ret;

+    desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
+        g_free(desc);
        return 0;
    }

@@ -269,41 +272,45 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
        sscanf(p_name, "%" SCNx32, &cid);
    }

+    g_free(desc);
    return cid;
 }

 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
 {
-    char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
+    char *desc, *tmp_desc;
    char *p_name, *tmp_str;
    BDRVVmdkState *s = bs->opaque;
-    int ret;
+    int ret = 0;

+    desc = g_malloc0(DESC_SIZE);
+    tmp_desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
-        return ret;
+        goto out;
    }

    desc[DESC_SIZE - 1] = '\0';
    tmp_str = strstr(desc, "parentCID");
    if (tmp_str == NULL) {
-        return -EINVAL;
+        ret = -EINVAL;
+        goto out;
    }

-    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
+    pstrcpy(tmp_desc, DESC_SIZE, tmp_str);
    p_name = strstr(desc, "CID");
    if (p_name != NULL) {
        p_name += sizeof("CID");
-        snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid);
-        pstrcat(desc, sizeof(desc), tmp_desc);
+        snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid);
+        pstrcat(desc, DESC_SIZE, tmp_desc);
    }

    ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
-    if (ret < 0) {
-        return ret;
-    }

-    return 0;
+out:
+    g_free(desc);
+    g_free(tmp_desc);
+    return ret;
 }

 static int vmdk_is_cid_valid(BlockDriverState *bs)
@@ -337,15 +344,16 @@ static int vmdk_reopen_prepare(BDRVReopenState *state,
 static int vmdk_parent_open(BlockDriverState *bs)
 {
    char *p_name;
-    char desc[DESC_SIZE + 1];
+    char *desc;
    BDRVVmdkState *s = bs->opaque;
    int ret;

-    desc[DESC_SIZE] = '\0';
+    desc = g_malloc0(DESC_SIZE + 1);
    ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
-        return ret;
+        goto out;
    }
+    ret = 0;

    p_name = strstr(desc, "parentFileNameHint");
    if (p_name != NULL) {
@@ -354,16 +362,20 @@ static int vmdk_parent_open(BlockDriverState *bs)
        p_name += sizeof("parentFileNameHint") + 1;
        end_name = strchr(p_name, '\"');
        if (end_name == NULL) {
-            return -EINVAL;
+            ret = -EINVAL;
+            goto out;
        }
        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
-            return -EINVAL;
+            ret = -EINVAL;
+            goto out;
        }

        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
    }

-    return 0;
+out:
+    g_free(desc);
+    return ret;
 }

 /* Create and append extent to the extent array. Return the added VmdkExtent
@@ -571,6 +583,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;
    int64_t l1_backup_offset = 0;
+    bool compressed;

    ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header));
    if (ret < 0) {
@@ -645,14 +658,14 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
        header = footer.header;
    }

+    compressed =
+        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
    if (le32_to_cpu(header.version) > 3) {
-        char buf[64];
-        snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
-                 le32_to_cpu(header.version));
-        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                   bdrv_get_device_or_node_name(bs), "vmdk", buf);
+        error_setg(errp, "Unsupported VMDK version %" PRIu32,
+                   le32_to_cpu(header.version));
        return -ENOTSUP;
-    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
+    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) &&
+               !compressed) {
        /* VMware KB 2064959 explains that version 3 added support for
         * persistent changed block tracking (CBT), and backup software can
         * read it as version=1 if it doesn't care about the changed area
@@ -1257,7 +1270,7 @@ static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
 }

 static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum)
+        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
 {
    BDRVVmdkState *s = bs->opaque;
    int64_t index_in_cluster, n, ret;
@@ -1274,6 +1287,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
                             0, 0);
    qemu_co_mutex_unlock(&s->lock);

+    index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
    switch (ret) {
    case VMDK_ERROR:
        ret = -EIO;
@@ -1286,14 +1300,15 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
        break;
    case VMDK_OK:
        ret = BDRV_BLOCK_DATA;
-        if (extent->file == bs->file && !extent->compressed) {
-            ret |= BDRV_BLOCK_OFFSET_VALID | offset;
+        if (!extent->compressed) {
+            ret |= BDRV_BLOCK_OFFSET_VALID;
+            ret |= (offset + (index_in_cluster << BDRV_SECTOR_BITS))
+                    & BDRV_BLOCK_OFFSET_MASK;
        }
-
+        *file = extent->file->bs;
        break;
    }

-    index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
    n = extent->cluster_sectors - index_in_cluster;
    if (n > nb_sectors) {
        n = nb_sectors;
@@ -1633,7 +1648,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
                              QemuOpts *opts, Error **errp)
 {
    int ret, i;
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk = NULL;
    VMDK4Header header;
    Error *local_err = NULL;
    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
@@ -1646,16 +1661,19 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
        goto exit;
    }

-    assert(bs == NULL);
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto exit;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    if (flat) {
-        ret = bdrv_truncate(bs, filesize);
+        ret = blk_truncate(blk, filesize);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not truncate file");
        }
@@ -1710,18 +1728,18 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
    header.check_bytes[3] = 0xa;

    /* write all the data */
-    ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic));
+    ret = blk_pwrite(blk, 0, &magic, sizeof(magic));
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }
-    ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header));
+    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header));
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }

-    ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9);
+    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not truncate file");
        goto exit;
@@ -1734,8 +1752,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
         i < gt_count; i++, tmp += gt_size) {
        gd_buf[i] = cpu_to_le32(tmp);
    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
+    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
+                     gd_buf, gd_buf_size);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1746,8 +1764,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
         i < gt_count; i++, tmp += gt_size) {
        gd_buf[i] = cpu_to_le32(tmp);
    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
+    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                     gd_buf, gd_buf_size);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1755,8 +1773,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,

    ret = 0;
 exit:
-    if (bs) {
-        bdrv_unref(bs);
+    if (blk) {
+        blk_unref(blk);
    }
    g_free(gd_buf);
    return ret;
@@ -1805,7 +1823,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
 static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
 {
    int idx = 0;
-    BlockDriverState *new_bs = NULL;
+    BlockBackend *new_blk = NULL;
    Error *local_err = NULL;
    char *desc = NULL;
    int64_t total_size = 0, filesize;
@@ -1916,7 +1934,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
        goto exit;
    }
    if (backing_file) {
-        BlockDriverState *bs = NULL;
+        BlockBackend *blk;
        char *full_backing = g_new0(char, PATH_MAX);
        bdrv_get_full_backing_filename_from_filename(filename, backing_file,
                                                     full_backing, PATH_MAX,
@@ -1927,18 +1945,21 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
            ret = -ENOENT;
            goto exit;
        }
-        ret = bdrv_open(&bs, full_backing, NULL, NULL, BDRV_O_NO_BACKING, errp);
+
+        blk = blk_new_open(full_backing, NULL, NULL,
+                           BDRV_O_NO_BACKING | BDRV_O_CACHE_WB, errp);
        g_free(full_backing);
-        if (ret != 0) {
+        if (blk == NULL) {
+            ret = -EIO;
            goto exit;
        }
-        if (strcmp(bs->drv->format_name, "vmdk")) {
-            bdrv_unref(bs);
+        if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) {
+            blk_unref(blk);
            ret = -EINVAL;
            goto exit;
        }
-        parent_cid = vmdk_read_cid(bs, 0);
-        bdrv_unref(bs);
+        parent_cid = vmdk_read_cid(blk_bs(blk), 0);
+        blk_unref(blk);
        snprintf(parent_desc_line, BUF_SIZE,
                "parentFileNameHint=\"%s\"", backing_file);
    }
@@ -1996,14 +2017,19 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
            goto exit;
        }
    }
-    assert(new_bs == NULL);
-    ret = bdrv_open(&new_bs, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
-    if (ret < 0) {
+
+    new_blk = blk_new_open(filename, NULL, NULL,
+                           BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                           &local_err);
+    if (new_blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto exit;
    }
-    ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len);
+
+    blk_set_allow_write_beyond_eof(new_blk, true);
+
+    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write description");
        goto exit;
@@ -2011,14 +2037,14 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
     * for description file */
    if (desc_offset == 0) {
-        ret = bdrv_truncate(new_bs, desc_len);
+        ret = blk_truncate(new_blk, desc_len);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not truncate file");
        }
    }
 exit:
-    if (new_bs) {
-        bdrv_unref(new_bs);
+    if (new_blk) {
+        blk_unref(new_blk);
    }
    g_free(adapter_type);
    g_free(backing_file);
@@ -2177,18 +2203,18 @@ static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)

    *spec_info = (ImageInfoSpecific){
        .type = IMAGE_INFO_SPECIFIC_KIND_VMDK,
-        {
-            .vmdk = g_new0(ImageInfoSpecificVmdk, 1),
+        .u = {
+            .vmdk.data = g_new0(ImageInfoSpecificVmdk, 1),
        },
    };

-    *spec_info->u.vmdk = (ImageInfoSpecificVmdk) {
+    *spec_info->u.vmdk.data = (ImageInfoSpecificVmdk) {
        .create_type = g_strdup(s->create_type),
        .cid = s->cid,
        .parent_cid = s->parent_cid,
    };

-    next = &spec_info->u.vmdk->extents;
+    next = &spec_info->u.vmdk.data->extents;
    for (i = 0; i < s->num_extents; i++) {
        *next = g_new0(ImageInfoList, 1);
        (*next)->value = vmdk_get_extent_info(&s->extents[i]);
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
 #if defined(CONFIG_UUID)
@@ -46,8 +47,14 @@ enum vhd_type {
 // Seconds since Jan 1, 2000 0:00:00 (UTC)
 #define VHD_TIMESTAMP_BASE 946684800

+#define VHD_CHS_MAX_C   65535LL
+#define VHD_CHS_MAX_H   16
+#define VHD_CHS_MAX_S   255
+
 #define VHD_MAX_SECTORS       (65535LL * 255 * 255)
-#define VHD_MAX_GEOMETRY      (65535LL *  16 * 255)
+#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)
+
+#define VPC_OPT_FORCE_SIZE "force_size"

 // always big-endian
 typedef struct vhd_footer {
@@ -128,6 +135,8 @@ typedef struct BDRVVPCState {

    uint32_t block_size;
    uint32_t bitmap_size;
+    bool force_use_chs;
+    bool force_use_sz;

 #ifdef CACHE
    uint8_t *pageentry_u8;
@@ -140,6 +149,22 @@ typedef struct BDRVVPCState {
    Error *migration_blocker;
 } BDRVVPCState;

+#define VPC_OPT_SIZE_CALC "force_size_calc"
+static QemuOptsList vpc_runtime_opts = {
+    .name = "vpc-runtime-opts",
+    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
+    .desc = {
+        {
+            .name = VPC_OPT_SIZE_CALC,
+            .type = QEMU_OPT_STRING,
+            .help = "Force disk size calculation to use either CHS geometry, "
+                    "or use the disk current_size specified in the VHD footer. "
+                    "{chs, current_size}"
+        },
+        { /* end of list */ }
+    }
+};
+
 static uint32_t vpc_checksum(uint8_t* buf, size_t size)
 {
    uint32_t res = 0;
@@ -159,6 +184,25 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
    return 0;
 }

+static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
+                              Error **errp)
+{
+    BDRVVPCState *s = bs->opaque;
+    const char *size_calc;
+
+    size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
+
+    if (!size_calc) {
+       /* no override, use autodetect only */
+    } else if (!strcmp(size_calc, "current_size")) {
+        s->force_use_sz = true;
+    } else if (!strcmp(size_calc, "chs")) {
+        s->force_use_chs = true;
+    } else {
+        error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
+    }
+}
+
 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
@@ -166,6 +210,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    int i;
    VHDFooter *footer;
    VHDDynDiskHeader *dyndisk_header;
+    QemuOpts *opts = NULL;
+    Error *local_err = NULL;
+    bool use_chs;
    uint8_t buf[HEADER_SIZE];
    uint32_t checksum;
    uint64_t computed_size;
@@ -173,6 +220,21 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    int disk_type = VHD_DYNAMIC;
    int ret;

+    opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    vpc_parse_options(bs, opts, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
    ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE);
    if (ret < 0) {
        goto fail;
@@ -218,12 +280,36 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    bs->total_sectors = (int64_t)
        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;

-    /* Images that have exactly the maximum geometry are probably bigger and
-     * would be truncated if we adhered to the geometry for them. Rely on
-     * footer->current_size for them. */
-    if (bs->total_sectors == VHD_MAX_GEOMETRY) {
+    /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
+     * VHD image sizes differently.  VPC will rely on CHS geometry,
+     * while Hyper-V and disk2vhd use the size specified in the footer.
+     *
+     * We use a couple of approaches to try and determine the correct method:
+     * look at the Creator App field, and look for images that have CHS
+     * geometry that is the maximum value.
+     *
+     * If the CHS geometry is the maximum CHS geometry, then we assume that
+     * the size is the footer->current_size to avoid truncation.  Otherwise,
+     * we follow the table based on footer->creator_app:
+     *
+     *  Known creator apps:
+     *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
+     *      'qemu'  :  CHS              QEMU (uses disk geometry)
+     *      'qem2'  :  current_size     QEMU (uses current_size)
+     *      'win '  :  current_size     Hyper-V
+     *      'd2v '  :  current_size     Disk2vhd
+     *
+     *  The user can override the table values via drive options, however
+     *  even with an override we will still use current_size for images
+     *  that have CHS geometry of the maximum size.
+     */
+    use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
+               !!strncmp(footer->creator_app, "qem2", 4) &&
+               !!strncmp(footer->creator_app, "d2v ", 4)) || s->force_use_chs;
+
+    if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
        bs->total_sectors = be64_to_cpu(footer->current_size) /
-                            BDRV_SECTOR_SIZE;
+                                        BDRV_SECTOR_SIZE;
    }

    /* Allow a maximum disk size of approximately 2 TB */
@@ -579,7 +665,7 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
 }

 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum)
+        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
 {
    BDRVVPCState *s = bs->opaque;
    VHDFooter *footer = (VHDFooter*) s->footer_buf;
@@ -589,6 +675,7 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,

    if (be32_to_cpu(footer->type) == VHD_FIXED) {
        *pnum = nb_sectors;
+        *file = bs->file->bs;
        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
               (sector_num << BDRV_SECTOR_BITS);
    }
@@ -610,6 +697,7 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
        /* *pnum can't be greater than one block for allocated
         * sectors since there is always a bitmap in between. */
        if (allocated) {
+            *file = bs->file->bs;
            return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
        }
        if (nb_sectors == 0) {
@@ -671,7 +759,7 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
    return 0;
 }

-static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
+static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
                               int64_t total_sectors)
 {
    VHDDynDiskHeader *dyndisk_header =
@@ -685,13 +773,13 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
    block_size = 0x200000;
    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);

-    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
    if (ret) {
        goto fail;
    }

    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
-    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
    if (ret < 0) {
        goto fail;
    }
@@ -701,7 +789,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,

    memset(buf, 0xFF, 512);
    for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
-        ret = bdrv_pwrite_sync(bs, offset, buf, 512);
+        ret = blk_pwrite(blk, offset, buf, 512);
        if (ret < 0) {
            goto fail;
        }
@@ -728,7 +816,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
    // Write the header
    offset = 512;

-    ret = bdrv_pwrite_sync(bs, offset, buf, 1024);
+    ret = blk_pwrite(blk, offset, buf, 1024);
    if (ret < 0) {
        goto fail;
    }
@@ -737,7 +825,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
    return ret;
 }

-static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf,
+static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
                             int64_t total_size)
 {
    int ret;
@@ -745,12 +833,12 @@ static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf,
    /* Add footer to total size */
    total_size += HEADER_SIZE;

-    ret = bdrv_truncate(bs, total_size);
+    ret = blk_truncate(blk, total_size);
    if (ret < 0) {
        return ret;
    }

-    ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE);
    if (ret < 0) {
        return ret;
    }
@@ -771,8 +859,9 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    int64_t total_size;
    int disk_type;
    int ret = -EIO;
+    bool force_size;
    Error *local_err = NULL;
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk = NULL;

    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
@@ -791,30 +880,44 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
        disk_type = VHD_DYNAMIC;
    }

+    force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
+
    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }
-    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
-                    &local_err);
-    if (ret < 0) {
+
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
        error_propagate(errp, local_err);
+        ret = -EIO;
        goto out;
    }

+    blk_set_allow_write_beyond_eof(blk, true);
+
    /*
     * Calculate matching total_size and geometry. Increase the number of
     * sectors requested until we get enough (or fail). This ensures that
     * qemu-img convert doesn't truncate images, but rather rounds up.
     *
-     * If the image size can't be represented by a spec conform CHS geometry,
+     * If the image size can't be represented by a spec conformant CHS geometry,
     * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
     * the image size from the VHD footer to calculate total_sectors.
     */
-    total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
-    for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
-        calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
+    if (force_size) {
+        /* This will force the use of total_size for sector count, below */
+        cyls         = VHD_CHS_MAX_C;
+        heads        = VHD_CHS_MAX_H;
+        secs_per_cyl = VHD_CHS_MAX_S;
+    } else {
+        total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
+        for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
+            calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
+        }
    }

    if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
@@ -833,8 +936,11 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    memset(buf, 0, 1024);

    memcpy(footer->creator, "conectix", 8);
-    /* TODO Check if "qemu" creator_app is ok for VPC */
-    memcpy(footer->creator_app, "qemu", 4);
+    if (force_size) {
+        memcpy(footer->creator_app, "qem2", 4);
+    } else {
+        memcpy(footer->creator_app, "qemu", 4);
+    }
    memcpy(footer->creator_os, "Wi2k", 4);

    footer->features = cpu_to_be32(0x02);
@@ -864,13 +970,13 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));

    if (disk_type == VHD_DYNAMIC) {
-        ret = create_dynamic_disk(bs, buf, total_sectors);
+        ret = create_dynamic_disk(blk, buf, total_sectors);
    } else {
-        ret = create_fixed_disk(bs, buf, total_size);
+        ret = create_fixed_disk(blk, buf, total_size);
    }

 out:
-    bdrv_unref(bs);
+    blk_unref(blk);
    g_free(disk_type_param);
    return ret;
 }
@@ -915,6 +1021,13 @@ static QemuOptsList vpc_create_opts = {
                "Type of virtual hard disk format. Supported formats are "
                "{dynamic (default) | fixed} "
        },
+        {
+            .name = VPC_OPT_FORCE_SIZE,
+            .type = QEMU_OPT_BOOL,
+            .help = "Force disk size calculation to use the actual size "
+                    "specified, rather than using the nearest CHS-based "
+                    "calculation"
+        },
        { /* end of list */ }
    }
 };
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2884,7 +2884,7 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num,
 }

 static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs,
-	int64_t sector_num, int nb_sectors, int* n)
+	int64_t sector_num, int nb_sectors, int *n, BlockDriverState **file)
 {
    BDRVVVFATState* s = bs->opaque;
    *n = s->sector_count - sector_num;
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -9,6 +9,7 @@
 * later.  See the COPYING file in the top-level directory.
 */

+#include "qemu/osdep.h"
 #include "sysemu/blockdev.h"
 #include "sysemu/block-backend.h"
 #include "hw/block/block.h"
@@ -17,57 +18,128 @@
 #include "qmp-commands.h"
 #include "trace.h"
 #include "block/nbd.h"
-#include "qemu/sockets.h"
+#include "io/channel-socket.h"

-static int server_fd = -1;
+typedef struct NBDServerData {
+    QIOChannelSocket *listen_ioc;
+    int watch;
+    QCryptoTLSCreds *tlscreds;
+} NBDServerData;

-static void nbd_accept(void *opaque)
+static NBDServerData *nbd_server;
+
+
+static gboolean nbd_accept(QIOChannel *ioc, GIOCondition condition,
+                           gpointer opaque)
 {
-    struct sockaddr_in addr;
-    socklen_t addr_len = sizeof(addr);
+    QIOChannelSocket *cioc;

-    int fd = accept(server_fd, (struct sockaddr *)&addr, &addr_len);
-    if (fd >= 0) {
-        nbd_client_new(NULL, fd, nbd_client_put);
+    if (!nbd_server) {
+        return FALSE;
    }
+
+    cioc = qio_channel_socket_accept(QIO_CHANNEL_SOCKET(ioc),
+                                     NULL);
+    if (!cioc) {
+        return TRUE;
+    }
+
+    nbd_client_new(NULL, cioc,
+                   nbd_server->tlscreds, NULL,
+                   nbd_client_put);
+    object_unref(OBJECT(cioc));
+    return TRUE;
 }

-void qmp_nbd_server_start(SocketAddress *addr, Error **errp)
+
+static void nbd_server_free(NBDServerData *server)
 {
-    if (server_fd != -1) {
+    if (!server) {
+        return;
+    }
+
+    if (server->watch != -1) {
+        g_source_remove(server->watch);
+    }
+    object_unref(OBJECT(server->listen_ioc));
+    if (server->tlscreds) {
+        object_unref(OBJECT(server->tlscreds));
+    }
+
+    g_free(server);
+}
+
+static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp)
+{
+    Object *obj;
+    QCryptoTLSCreds *creds;
+
+    obj = object_resolve_path_component(
+        object_get_objects_root(), id);
+    if (!obj) {
+        error_setg(errp, "No TLS credentials with id '%s'",
+                   id);
+        return NULL;
+    }
+    creds = (QCryptoTLSCreds *)
+        object_dynamic_cast(obj, TYPE_QCRYPTO_TLS_CREDS);
+    if (!creds) {
+        error_setg(errp, "Object with id '%s' is not TLS credentials",
+                   id);
+        return NULL;
+    }
+
+    if (creds->endpoint != QCRYPTO_TLS_CREDS_ENDPOINT_SERVER) {
+        error_setg(errp,
+                   "Expecting TLS credentials with a server endpoint");
+        return NULL;
+    }
+    object_ref(obj);
+    return creds;
+}
+
+
+void qmp_nbd_server_start(SocketAddress *addr,
+                          bool has_tls_creds, const char *tls_creds,
+                          Error **errp)
+{
+    if (nbd_server) {
        error_setg(errp, "NBD server already running");
        return;
    }

-    server_fd = socket_listen(addr, errp);
-    if (server_fd != -1) {
-        qemu_set_fd_handler(server_fd, nbd_accept, NULL, NULL);
+    nbd_server = g_new0(NBDServerData, 1);
+    nbd_server->watch = -1;
+    nbd_server->listen_ioc = qio_channel_socket_new();
+    if (qio_channel_socket_listen_sync(
+            nbd_server->listen_ioc, addr, errp) < 0) {
+        goto error;
    }
-}

-/*
- * Hook into the BlockBackend notifiers to close the export when the
- * backend is closed.
- */
-typedef struct NBDCloseNotifier {
-    Notifier n;
-    NBDExport *exp;
-    QTAILQ_ENTRY(NBDCloseNotifier) next;
-} NBDCloseNotifier;
+    if (has_tls_creds) {
+        nbd_server->tlscreds = nbd_get_tls_creds(tls_creds, errp);
+        if (!nbd_server->tlscreds) {
+            goto error;
+        }

-static QTAILQ_HEAD(, NBDCloseNotifier) close_notifiers =
-    QTAILQ_HEAD_INITIALIZER(close_notifiers);
+        if (addr->type != SOCKET_ADDRESS_KIND_INET) {
+            error_setg(errp, "TLS is only supported with IPv4/IPv6");
+            goto error;
+        }
+    }

-static void nbd_close_notifier(Notifier *n, void *data)
-{
-    NBDCloseNotifier *cn = DO_UPCAST(NBDCloseNotifier, n, n);
+    nbd_server->watch = qio_channel_add_watch(
+        QIO_CHANNEL(nbd_server->listen_ioc),
+        G_IO_IN,
+        nbd_accept,
+        NULL,
+        NULL);

-    notifier_remove(&cn->n);
-    QTAILQ_REMOVE(&close_notifiers, cn, next);
+    return;

-    nbd_export_close(cn->exp);
-    nbd_export_put(cn->exp);
-    g_free(cn);
+ error:
+    nbd_server_free(nbd_server);
+    nbd_server = NULL;
 }

 void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
@@ -75,9 +147,8 @@ void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
 {
    BlockBackend *blk;
    NBDExport *exp;
-    NBDCloseNotifier *n;

-    if (server_fd == -1) {
+    if (!nbd_server) {
        error_setg(errp, "NBD server not running");
        return;
    }
@@ -113,23 +184,16 @@ void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,

    nbd_export_set_name(exp, device);

-    n = g_new0(NBDCloseNotifier, 1);
-    n->n.notify = nbd_close_notifier;
-    n->exp = exp;
-    blk_add_close_notifier(blk, &n->n);
-    QTAILQ_INSERT_TAIL(&close_notifiers, n, next);
+    /* The list of named exports has a strong reference to this export now and
+     * our only way of accessing it is through nbd_export_find(), so we can drop
+     * the strong reference that is @exp. */
+    nbd_export_put(exp);
 }

 void qmp_nbd_server_stop(Error **errp)
 {
-    while (!QTAILQ_EMPTY(&close_notifiers)) {
-        NBDCloseNotifier *cn = QTAILQ_FIRST(&close_notifiers);
-        nbd_close_notifier(&cn->n, nbd_export_get_blockdev(cn->exp));
-    }
+    nbd_export_close_all();

-    if (server_fd != -1) {
-        qemu_set_fd_handler(server_fd, NULL, NULL, NULL);
-        close(server_fd);
-        server_fd = -1;
-    }
+    nbd_server_free(nbd_server);
+    nbd_server = NULL;
 }
--- a/blockdev.c
+++ b/blockdev.c
@@ -30,6 +30,7 @@
 * THE SOFTWARE.
 */

+#include "qemu/osdep.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "hw/block/block.h"
@@ -50,6 +51,9 @@
 #include "trace.h"
 #include "sysemu/arch_init.h"

+static QTAILQ_HEAD(, BlockDriverState) monitor_bdrv_states =
+    QTAILQ_HEAD_INITIALIZER(monitor_bdrv_states);
+
 static const char *const if_name[IF_COUNT] = {
    [IF_NONE] = "none",
    [IF_IDE] = "ide",
@@ -143,6 +147,7 @@ void blockdev_auto_del(BlockBackend *blk)
    DriveInfo *dinfo = blk_legacy_dinfo(blk);

    if (dinfo && dinfo->auto_del) {
+        monitor_remove_blk(blk);
        blk_unref(blk);
    }
 }
@@ -339,29 +344,6 @@ static bool parse_stats_intervals(BlockAcctStats *stats, QList *intervals,
    return true;
 }

-static bool check_throttle_config(ThrottleConfig *cfg, Error **errp)
-{
-    if (throttle_conflicting(cfg)) {
-        error_setg(errp, "bps/iops/max total values and read/write values"
-                         " cannot be used at the same time");
-        return false;
-    }
-
-    if (!throttle_is_valid(cfg)) {
-        error_setg(errp, "bps/iops/max values must be within [0, %lld]",
-                   THROTTLE_VALUE_MAX);
-        return false;
-    }
-
-    if (throttle_max_is_missing_limit(cfg)) {
-        error_setg(errp, "bps_max/iops_max require corresponding"
-                         " bps/iops values");
-        return false;
-    }
-
-    return true;
-}
-
 typedef enum { MEDIA_DISK, MEDIA_CDROM } DriveMediaType;

 /* All parameters but @opts are optional and may be set to NULL. */
@@ -406,7 +388,7 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags,
    }

    if (throttle_cfg) {
-        memset(throttle_cfg, 0, sizeof(*throttle_cfg));
+        throttle_config_init(throttle_cfg);
        throttle_cfg->buckets[THROTTLE_BPS_TOTAL].avg =
            qemu_opt_get_number(opts, "throttling.bps-total", 0);
        throttle_cfg->buckets[THROTTLE_BPS_READ].avg  =
@@ -433,10 +415,23 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags,
        throttle_cfg->buckets[THROTTLE_OPS_WRITE].max =
            qemu_opt_get_number(opts, "throttling.iops-write-max", 0);

+        throttle_cfg->buckets[THROTTLE_BPS_TOTAL].burst_length =
+            qemu_opt_get_number(opts, "throttling.bps-total-max-length", 1);
+        throttle_cfg->buckets[THROTTLE_BPS_READ].burst_length  =
+            qemu_opt_get_number(opts, "throttling.bps-read-max-length", 1);
+        throttle_cfg->buckets[THROTTLE_BPS_WRITE].burst_length =
+            qemu_opt_get_number(opts, "throttling.bps-write-max-length", 1);
+        throttle_cfg->buckets[THROTTLE_OPS_TOTAL].burst_length =
+            qemu_opt_get_number(opts, "throttling.iops-total-max-length", 1);
+        throttle_cfg->buckets[THROTTLE_OPS_READ].burst_length =
+            qemu_opt_get_number(opts, "throttling.iops-read-max-length", 1);
+        throttle_cfg->buckets[THROTTLE_OPS_WRITE].burst_length =
+            qemu_opt_get_number(opts, "throttling.iops-write-max-length", 1);
+
        throttle_cfg->op_size =
            qemu_opt_get_number(opts, "throttling.iops-size", 0);

-        if (!check_throttle_config(throttle_cfg, errp)) {
+        if (!throttle_is_valid(throttle_cfg, errp)) {
            return;
        }
    }
@@ -567,7 +562,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
    if ((!file || !*file) && !qdict_size(bs_opts)) {
        BlockBackendRootState *blk_rs;

-        blk = blk_new(qemu_opts_id(opts), errp);
+        blk = blk_new(errp);
        if (!blk) {
            goto early_err;
        }
@@ -599,15 +594,11 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");

-        if (snapshot) {
-            /* always use cache=unsafe with snapshot */
-            qdict_put(bs_opts, BDRV_OPT_CACHE_WB, qstring_from_str("on"));
-            qdict_put(bs_opts, BDRV_OPT_CACHE_DIRECT, qstring_from_str("off"));
-            qdict_put(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, qstring_from_str("on"));
+        if (runstate_check(RUN_STATE_INMIGRATE)) {
+            bdrv_flags |= BDRV_O_INACTIVE;
        }

-        blk = blk_new_open(qemu_opts_id(opts), file, NULL, bs_opts, bdrv_flags,
-                           errp);
+        blk = blk_new_open(file, NULL, bs_opts, bdrv_flags, errp);
        if (!blk) {
            goto err_no_bs_opts;
        }
@@ -639,6 +630,12 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,

    blk_set_on_error(blk, on_read_error, on_write_error);

+    if (!monitor_add_blk(blk, qemu_opts_id(opts), errp)) {
+        blk_unref(blk);
+        blk = NULL;
+        goto err_no_bs_opts;
+    }
+
 err_no_bs_opts:
    qemu_opts_del(opts);
    QDECREF(interval_dict);
@@ -684,6 +681,17 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
        goto fail;
    }

+    /* bdrv_open() defaults to the values in bdrv_flags (for compatibility
+     * with other callers) rather than what we want as the real defaults.
+     * Apply the defaults here instead. */
+    qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_WB, "on");
+    qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
+    qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");
+
+    if (runstate_check(RUN_STATE_INMIGRATE)) {
+        bdrv_flags |= BDRV_O_INACTIVE;
+    }
+
    bs = NULL;
    ret = bdrv_open(&bs, NULL, NULL, bs_opts, bdrv_flags, errp);
    if (ret < 0) {
@@ -702,6 +710,26 @@ fail:
    return NULL;
 }

+void blockdev_close_all_bdrv_states(void)
+{
+    BlockDriverState *bs, *next_bs;
+
+    QTAILQ_FOREACH_SAFE(bs, &monitor_bdrv_states, monitor_list, next_bs) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(ctx);
+        bdrv_unref(bs);
+        aio_context_release(ctx);
+    }
+}
+
+/* Iterates over the list of monitor-owned BlockDriverStates */
+BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs)
+{
+    return bs ? QTAILQ_NEXT(bs, monitor_list)
+              : QTAILQ_FIRST(&monitor_bdrv_states);
+}
+
 static void qemu_opt_rename(QemuOpts *opts, const char *from, const char *to,
                            Error **errp)
 {
@@ -1158,7 +1186,7 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
    int ret;

    if (!strcmp(device, "all")) {
-        ret = bdrv_commit_all();
+        ret = blk_commit_all();
    } else {
        BlockDriverState *bs;
        AioContext *aio_context;
@@ -1187,15 +1215,11 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
    }
 }

-static void blockdev_do_action(TransactionActionKind type, void *data,
-                               Error **errp)
+static void blockdev_do_action(TransactionAction *action, Error **errp)
 {
-    TransactionAction action;
    TransactionActionList list;

-    action.type = type;
-    action.u.data = data;
-    list.value = &action;
+    list.value = action;
    list.next = NULL;
    qmp_transaction(&list, false, NULL, errp);
 }
@@ -1221,8 +1245,11 @@ void qmp_blockdev_snapshot_sync(bool has_device, const char *device,
        .has_mode = has_mode,
        .mode = mode,
    };
-    blockdev_do_action(TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC,
-                       &snapshot, errp);
+    TransactionAction action = {
+        .type = TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC,
+        .u.blockdev_snapshot_sync.data = &snapshot,
+    };
+    blockdev_do_action(&action, errp);
 }

 void qmp_blockdev_snapshot(const char *node, const char *overlay,
@@ -1232,9 +1259,11 @@ void qmp_blockdev_snapshot(const char *node, const char *overlay,
        .node = (char *) node,
        .overlay = (char *) overlay
    };
-
-    blockdev_do_action(TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT,
-                       &snapshot_data, errp);
+    TransactionAction action = {
+        .type = TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT,
+        .u.blockdev_snapshot.data = &snapshot_data,
+    };
+    blockdev_do_action(&action, errp);
 }

 void qmp_blockdev_snapshot_internal_sync(const char *device,
@@ -1245,9 +1274,11 @@ void qmp_blockdev_snapshot_internal_sync(const char *device,
        .device = (char *) device,
        .name = (char *) name
    };
-
-    blockdev_do_action(TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC,
-                       &snapshot, errp);
+    TransactionAction action = {
+        .type = TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC,
+        .u.blockdev_snapshot_internal_sync.data = &snapshot,
+    };
+    blockdev_do_action(&action, errp);
 }

 SnapshotInfo *qmp_blockdev_snapshot_delete_internal_sync(const char *device,
@@ -1484,7 +1515,7 @@ static void internal_snapshot_prepare(BlkActionState *common,

    g_assert(common->action->type ==
             TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC);
-    internal = common->action->u.blockdev_snapshot_internal_sync;
+    internal = common->action->u.blockdev_snapshot_internal_sync.data;
    state = DO_UPCAST(InternalSnapshotState, common, common);

    /* 1. parse input */
@@ -1634,7 +1665,7 @@ static void external_snapshot_prepare(BlkActionState *common,
    switch (action->type) {
    case TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT:
        {
-            BlockdevSnapshot *s = action->u.blockdev_snapshot;
+            BlockdevSnapshot *s = action->u.blockdev_snapshot.data;
            device = s->node;
            node_name = s->node;
            new_image_file = NULL;
@@ -1643,7 +1674,7 @@ static void external_snapshot_prepare(BlkActionState *common,
        break;
    case TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC:
        {
-            BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync;
+            BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync.data;
            device = s->has_device ? s->device : NULL;
            node_name = s->has_node_name ? s->node_name : NULL;
            new_image_file = s->snapshot_file;
@@ -1692,7 +1723,7 @@ static void external_snapshot_prepare(BlkActionState *common,
    }

    if (action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC) {
-        BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync;
+        BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync.data;
        const char *format = s->has_format ? s->format : "qcow2";
        enum NewImageMode mode;
        const char *snapshot_node_name =
@@ -1714,10 +1745,15 @@ static void external_snapshot_prepare(BlkActionState *common,
        /* create new image w/backing file */
        mode = s->has_mode ? s->mode : NEW_IMAGE_MODE_ABSOLUTE_PATHS;
        if (mode != NEW_IMAGE_MODE_EXISTING) {
+            int64_t size = bdrv_getlength(state->old_bs);
+            if (size < 0) {
+                error_setg_errno(errp, -size, "bdrv_getlength failed");
+                return;
+            }
            bdrv_img_create(new_image_file, format,
                            state->old_bs->filename,
                            state->old_bs->drv->format_name,
-                            NULL, -1, flags, &local_err, false);
+                            NULL, size, flags, &local_err, false);
            if (local_err) {
                error_propagate(errp, local_err);
                return;
@@ -1825,7 +1861,7 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp)
    Error *local_err = NULL;

    assert(common->action->type == TRANSACTION_ACTION_KIND_DRIVE_BACKUP);
-    backup = common->action->u.drive_backup;
+    backup = common->action->u.drive_backup.data;

    blk = blk_by_name(backup->device);
    if (!blk) {
@@ -1907,7 +1943,7 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
    Error *local_err = NULL;

    assert(common->action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP);
-    backup = common->action->u.blockdev_backup;
+    backup = common->action->u.blockdev_backup.data;

    blk = blk_by_name(backup->device);
    if (!blk) {
@@ -1993,7 +2029,7 @@ static void block_dirty_bitmap_add_prepare(BlkActionState *common,
        return;
    }

-    action = common->action->u.block_dirty_bitmap_add;
+    action = common->action->u.block_dirty_bitmap_add.data;
    /* AIO context taken and released within qmp_block_dirty_bitmap_add */
    qmp_block_dirty_bitmap_add(action->node, action->name,
                               action->has_granularity, action->granularity,
@@ -2012,7 +2048,7 @@ static void block_dirty_bitmap_add_abort(BlkActionState *common)
    BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState,
                                             common, common);

-    action = common->action->u.block_dirty_bitmap_add;
+    action = common->action->u.block_dirty_bitmap_add.data;
    /* Should not be able to fail: IF the bitmap was added via .prepare(),
     * then the node reference and bitmap name must have been valid.
     */
@@ -2032,7 +2068,7 @@ static void block_dirty_bitmap_clear_prepare(BlkActionState *common,
        return;
    }

-    action = common->action->u.block_dirty_bitmap_clear;
+    action = common->action->u.block_dirty_bitmap_clear.data;
    state->bitmap = block_dirty_bitmap_lookup(action->node,
                                              action->name,
                                              &state->bs,
@@ -2304,6 +2340,11 @@ void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
        return;
    }

+    if (!blk_dev_has_tray(blk)) {
+        /* Ignore this command on tray-less devices */
+        return;
+    }
+
    if (blk_dev_is_tray_open(blk)) {
        return;
    }
@@ -2334,6 +2375,11 @@ void qmp_blockdev_close_tray(const char *device, Error **errp)
        return;
    }

+    if (!blk_dev_has_tray(blk)) {
+        /* Ignore this command on tray-less devices */
+        return;
+    }
+
    if (!blk_dev_is_tray_open(blk)) {
        return;
    }
@@ -2363,7 +2409,7 @@ void qmp_x_blockdev_remove_medium(const char *device, Error **errp)
        return;
    }

-    if (has_device && !blk_dev_is_tray_open(blk)) {
+    if (has_device && blk_dev_has_tray(blk) && !blk_dev_is_tray_open(blk)) {
        error_setg(errp, "Tray of device '%s' is not open", device);
        return;
    }
@@ -2380,14 +2426,16 @@ void qmp_x_blockdev_remove_medium(const char *device, Error **errp)
        goto out;
    }

-    /* This follows the convention established by bdrv_make_anon() */
-    if (bs->device_list.tqe_prev) {
-        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
-        bs->device_list.tqe_prev = NULL;
-    }
-
    blk_remove_bs(blk);

+    if (!blk_dev_has_tray(blk)) {
+        /* For tray-less devices, blockdev-open-tray is a no-op (or may not be
+         * called at all); therefore, the medium needs to be ejected here.
+         * Do it after blk_remove_bs() so blk_is_inserted(blk) returns the @load
+         * value passed here (i.e. false). */
+        blk_dev_change_media_cb(blk, false);
+    }
+
 out:
    aio_context_release(aio_context);
 }
@@ -2413,7 +2461,7 @@ static void qmp_blockdev_insert_anon_medium(const char *device,
        return;
    }

-    if (has_device && !blk_dev_is_tray_open(blk)) {
+    if (has_device && blk_dev_has_tray(blk) && !blk_dev_is_tray_open(blk)) {
        error_setg(errp, "Tray of device '%s' is not open", device);
        return;
    }
@@ -2425,7 +2473,14 @@ static void qmp_blockdev_insert_anon_medium(const char *device,

    blk_insert_bs(blk, bs);

-    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
+    if (!blk_dev_has_tray(blk)) {
+        /* For tray-less devices, blockdev-close-tray is a no-op (or may not be
+         * called at all); therefore, the medium needs to be pushed into the
+         * slot here.
+         * Do it after blk_insert_bs() so blk_is_inserted(blk) returns the @load
+         * value passed here (i.e. true). */
+        blk_dev_change_media_cb(blk, true);
+    }
 }

 void qmp_x_blockdev_insert_medium(const char *device, const char *node_name,
@@ -2472,6 +2527,8 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
    }

    bdrv_flags = blk_get_open_flags_from_root_state(blk);
+    bdrv_flags &= ~(BDRV_O_TEMPORARY | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING |
+        BDRV_O_PROTOCOL);

    if (!has_read_only) {
        read_only = BLOCKDEV_CHANGE_READ_ONLY_MODE_RETAIN;
@@ -2557,6 +2614,18 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
                               int64_t iops_rd_max,
                               bool has_iops_wr_max,
                               int64_t iops_wr_max,
+                               bool has_bps_max_length,
+                               int64_t bps_max_length,
+                               bool has_bps_rd_max_length,
+                               int64_t bps_rd_max_length,
+                               bool has_bps_wr_max_length,
+                               int64_t bps_wr_max_length,
+                               bool has_iops_max_length,
+                               int64_t iops_max_length,
+                               bool has_iops_rd_max_length,
+                               int64_t iops_rd_max_length,
+                               bool has_iops_wr_max_length,
+                               int64_t iops_wr_max_length,
                               bool has_iops_size,
                               int64_t iops_size,
                               bool has_group,
@@ -2583,7 +2652,7 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
        goto out;
    }

-    memset(&cfg, 0, sizeof(cfg));
+    throttle_config_init(&cfg);
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = bps;
    cfg.buckets[THROTTLE_BPS_READ].avg  = bps_rd;
    cfg.buckets[THROTTLE_BPS_WRITE].avg = bps_wr;
@@ -2611,11 +2680,30 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
        cfg.buckets[THROTTLE_OPS_WRITE].max = iops_wr_max;
    }

+    if (has_bps_max_length) {
+        cfg.buckets[THROTTLE_BPS_TOTAL].burst_length = bps_max_length;
+    }
+    if (has_bps_rd_max_length) {
+        cfg.buckets[THROTTLE_BPS_READ].burst_length = bps_rd_max_length;
+    }
+    if (has_bps_wr_max_length) {
+        cfg.buckets[THROTTLE_BPS_WRITE].burst_length = bps_wr_max_length;
+    }
+    if (has_iops_max_length) {
+        cfg.buckets[THROTTLE_OPS_TOTAL].burst_length = iops_max_length;
+    }
+    if (has_iops_rd_max_length) {
+        cfg.buckets[THROTTLE_OPS_READ].burst_length = iops_rd_max_length;
+    }
+    if (has_iops_wr_max_length) {
+        cfg.buckets[THROTTLE_OPS_WRITE].burst_length = iops_wr_max_length;
+    }
+
    if (has_iops_size) {
        cfg.op_size = iops_size;
    }

-    if (!check_throttle_config(&cfg, errp)) {
+    if (!throttle_is_valid(&cfg, errp)) {
        goto out;
    }

@@ -2742,6 +2830,15 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict)
    AioContext *aio_context;
    Error *local_err = NULL;

+    bs = bdrv_find_node(id);
+    if (bs) {
+        qmp_x_blockdev_del(false, NULL, true, id, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+        }
+        return;
+    }
+
    blk = blk_by_name(id);
    if (!blk) {
        error_report("Device '%s' not found", id);
@@ -2765,16 +2862,19 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict)
            return;
        }

-        bdrv_close(bs);
+        blk_remove_bs(blk);
    }

-    /* if we have a device attached to this BlockDriverState
-     * then we need to make the drive anonymous until the device
-     * can be removed.  If this is a drive with no device backing
-     * then we can just get rid of the block driver state right here.
+    /* Make the BlockBackend and the attached BlockDriverState anonymous */
+    monitor_remove_blk(blk);
+    if (blk_bs(blk)) {
+        bdrv_make_anon(blk_bs(blk));
+    }
+
+    /* If this BlockBackend has a device attached to it, its refcount will be
+     * decremented when the device is removed; otherwise we have to do so here.
     */
    if (blk_get_attached_dev(blk)) {
-        blk_hide_on_behalf_of_hmp_drive_del(blk);
        /* Further I/O must not pause the guest */
        blk_set_on_error(blk, BLOCKDEV_ON_ERROR_REPORT,
                         BLOCKDEV_ON_ERROR_REPORT);
@@ -3793,6 +3893,37 @@ out:
    aio_context_release(aio_context);
 }

+void hmp_drive_add_node(Monitor *mon, const char *optstr)
+{
+    QemuOpts *opts;
+    QDict *qdict;
+    Error *local_err = NULL;
+
+    opts = qemu_opts_parse_noisily(&qemu_drive_opts, optstr, false);
+    if (!opts) {
+        return;
+    }
+
+    qdict = qemu_opts_to_qdict(opts, NULL);
+
+    if (!qdict_get_try_str(qdict, "node-name")) {
+        QDECREF(qdict);
+        error_report("'node-name' needs to be specified");
+        goto out;
+    }
+
+    BlockDriverState *bs = bds_tree_init(qdict, &local_err);
+    if (!bs) {
+        error_report_err(local_err);
+        goto out;
+    }
+
+    QTAILQ_INSERT_TAIL(&monitor_bdrv_states, bs, monitor_list);
+
+out:
+    qemu_opts_del(opts);
+}
+
 void qmp_blockdev_add(BlockdevOptions *options, Error **errp)
 {
    QmpOutputVisitor *ov = qmp_output_visitor_new();
@@ -3817,8 +3948,8 @@ void qmp_blockdev_add(BlockdevOptions *options, Error **errp)
        }
    }

-    visit_type_BlockdevOptions(qmp_output_get_visitor(ov),
-                               &options, NULL, &local_err);
+    visit_type_BlockdevOptions(qmp_output_get_visitor(ov), NULL, &options,
+                               &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto fail;
@@ -3848,12 +3979,16 @@ void qmp_blockdev_add(BlockdevOptions *options, Error **errp)
        if (!bs) {
            goto fail;
        }
+
+        QTAILQ_INSERT_TAIL(&monitor_bdrv_states, bs, monitor_list);
    }

    if (bs && bdrv_key_required(bs)) {
        if (blk) {
+            monitor_remove_blk(blk);
            blk_unref(blk);
        } else {
+            QTAILQ_REMOVE(&monitor_bdrv_states, bs, monitor_list);
            bdrv_unref(bs);
        }
        error_setg(errp, "blockdev-add doesn't support encrypted devices");
@@ -3880,6 +4015,7 @@ void qmp_x_blockdev_del(bool has_id, const char *id,
    }

    if (has_id) {
+        /* blk_by_name() never returns a BB that is not owned by the monitor */
        blk = blk_by_name(id);
        if (!blk) {
            error_setg(errp, "Cannot find block backend %s", id);
@@ -3913,7 +4049,13 @@ void qmp_x_blockdev_del(bool has_id, const char *id,
            goto out;
        }

-        if (bs->refcnt > 1 || !QLIST_EMPTY(&bs->parents)) {
+        if (!blk && !bs->monitor_list.tqe_prev) {
+            error_setg(errp, "Node %s is not owned by the monitor",
+                       bs->node_name);
+            goto out;
+        }
+
+        if (bs->refcnt > 1) {
            error_setg(errp, "Block device %s is in use",
                       bdrv_get_device_or_node_name(bs));
            goto out;
@@ -3921,8 +4063,10 @@ void qmp_x_blockdev_del(bool has_id, const char *id,
    }

    if (blk) {
+        monitor_remove_blk(blk);
        blk_unref(blk);
    } else {
+        QTAILQ_REMOVE(&monitor_bdrv_states, bs, monitor_list);
        bdrv_unref(bs);
    }

@@ -4033,6 +4177,30 @@ QemuOptsList qemu_common_drive_opts = {
            .name = "throttling.bps-write-max",
            .type = QEMU_OPT_NUMBER,
            .help = "total bytes write burst",
+        },{
+            .name = "throttling.iops-total-max-length",
+            .type = QEMU_OPT_NUMBER,
+            .help = "length of the iops-total-max burst period, in seconds",
+        },{
+            .name = "throttling.iops-read-max-length",
+            .type = QEMU_OPT_NUMBER,
+            .help = "length of the iops-read-max burst period, in seconds",
+        },{
+            .name = "throttling.iops-write-max-length",
+            .type = QEMU_OPT_NUMBER,
+            .help = "length of the iops-write-max burst period, in seconds",
+        },{
+            .name = "throttling.bps-total-max-length",
+            .type = QEMU_OPT_NUMBER,
+            .help = "length of the bps-total-max burst period, in seconds",
+        },{
+            .name = "throttling.bps-read-max-length",
+            .type = QEMU_OPT_NUMBER,
+            .help = "length of the bps-read-max burst period, in seconds",
+        },{
+            .name = "throttling.bps-write-max-length",
+            .type = QEMU_OPT_NUMBER,
+            .help = "length of the bps-write-max burst period, in seconds",
        },{
            .name = "throttling.iops-size",
            .type = QEMU_OPT_NUMBER,
@@ -4066,7 +4234,7 @@ QemuOptsList qemu_common_drive_opts = {

 static QemuOptsList qemu_root_bds_opts = {
    .name = "root-bds",
-    .head = QTAILQ_HEAD_INITIALIZER(qemu_common_drive_opts.head),
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_root_bds_opts.head),
    .desc = {
        {
            .name = "discard",
--- a/blockjob.c
+++ b/blockjob.c
@@ -23,7 +23,7 @@
 * THE SOFTWARE.
 */

-#include "config-host.h"
+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "trace.h"
 #include "block/block.h"
@@ -278,14 +278,6 @@ void block_job_iostatus_reset(BlockJob *job)
    }
 }

-struct BlockFinishData {
-    BlockJob *job;
-    BlockCompletionFunc *cb;
-    void *opaque;
-    bool cancelled;
-    int ret;
-};
-
 static int block_job_finish_sync(BlockJob *job,
                                 void (*finish)(BlockJob *, Error **errp),
                                 Error **errp)
@@ -304,7 +296,9 @@ static int block_job_finish_sync(BlockJob *job,
        return -EBUSY;
    }
    while (!job->completed) {
-        aio_poll(bdrv_get_aio_context(bs), true);
+        aio_poll(job->deferred_to_main_loop ? qemu_get_aio_context() :
+                                              bdrv_get_aio_context(bs),
+                 true);
    }
    ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
    block_job_unref(job);
@@ -478,6 +472,7 @@ static void block_job_defer_to_main_loop_bh(void *opaque)
    aio_context = bdrv_get_aio_context(data->job->bs);
    aio_context_acquire(aio_context);

+    data->job->deferred_to_main_loop = false;
    data->fn(data->job, data->opaque);

    aio_context_release(aio_context);
@@ -497,6 +492,7 @@ void block_job_defer_to_main_loop(BlockJob *job,
    data->aio_context = bdrv_get_aio_context(job->bs);
    data->fn = fn;
    data->opaque = opaque;
+    job->deferred_to_main_loop = true;

    qemu_bh_schedule(data->bh);
 }
--- a/bootdevice.c
+++ b/bootdevice.c
@@ -22,6 +22,7 @@
 * THE SOFTWARE.
 */

+#include "qemu/osdep.h"
 #include "sysemu/sysemu.h"
 #include "qapi/visitor.h"
 #include "qemu/error-report.h"
@@ -270,21 +271,21 @@ typedef struct {
    DeviceState *dev;
 } BootIndexProperty;

-static void device_get_bootindex(Object *obj, Visitor *v, void *opaque,
-                                 const char *name, Error **errp)
+static void device_get_bootindex(Object *obj, Visitor *v, const char *name,
+                                 void *opaque, Error **errp)
 {
    BootIndexProperty *prop = opaque;
-    visit_type_int32(v, prop->bootindex, name, errp);
+    visit_type_int32(v, name, prop->bootindex, errp);
 }

-static void device_set_bootindex(Object *obj, Visitor *v, void *opaque,
-                                 const char *name, Error **errp)
+static void device_set_bootindex(Object *obj, Visitor *v, const char *name,
+                                 void *opaque, Error **errp)
 {
    BootIndexProperty *prop = opaque;
    int32_t boot_index;
    Error *local_err = NULL;

-    visit_type_int32(v, &boot_index, name, &local_err);
+    visit_type_int32(v, name, &boot_index, &local_err);
    if (local_err) {
        goto out;
    }
--- a/bsd-user/bsdload.c
+++ b/bsd-user/bsdload.c
@@ -1,12 +1,6 @@
 /* Code for loading BSD executables.  Mostly linux kernel code.  */

-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
+#include "qemu/osdep.h"

 #include "qemu.h"

--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@@ -1,13 +1,7 @@
 /* This is the Linux kernel elf-loading code, ported into user space */

-#include <stdio.h>
-#include <sys/types.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <unistd.h>
+#include "qemu/osdep.h"
 #include <sys/mman.h>
-#include <stdlib.h>
-#include <string.h>

 #include "qemu.h"
 #include "disas/disas.h"
--- a/bsd-user/i386/target_syscall.h
+++ b/bsd-user/i386/target_syscall.h
@@ -1,3 +1,6 @@
+#ifndef TARGET_SYSCALL_H
+#define TARGET_SYSCALL_H
+
 /* default linux values for the selectors */
 #define __USER_CS	(0x23)
 #define __USER_DS	(0x2B)
@@ -159,3 +162,4 @@ struct target_vm86plus_struct {

 #define UNAME_MACHINE "i386"

+#endif  /* TARGET_SYSCALL_H */
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -16,14 +16,8 @@
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
+#include "qemu/osdep.h"
 #include <machine/trap.h>
-#include <sys/types.h>
 #include <sys/mman.h>

 #include "qemu.h"
@@ -33,6 +27,7 @@
 #include "tcg.h"
 #include "qemu/timer.h"
 #include "qemu/envlist.h"
+#include "exec/log.h"

 int singlestep;
 unsigned long mmap_min_addr;
--- a/bsd-user/mmap.c
+++ b/bsd-user/mmap.c
@@ -16,12 +16,7 @@
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <string.h>
-#include <unistd.h>
-#include <errno.h>
+#include "qemu/osdep.h"
 #include <sys/mman.h>

 #include "qemu.h"
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -17,15 +17,12 @@
 #ifndef QEMU_H
 #define QEMU_H

-#include <signal.h>
-#include <string.h>

 #include "cpu.h"
 #include "exec/cpu_ldst.h"

 #undef DEBUG_REMAP
 #ifdef DEBUG_REMAP
-#include <stdlib.h>
 #endif /* DEBUG_REMAP */

 #include "exec/user/abitypes.h"
@@ -38,7 +35,7 @@ enum BSDType {
 extern enum BSDType bsd_type;

 #include "syscall_defs.h"
-#include "syscall.h"
+#include "target_syscall.h"
 #include "target_signal.h"
 #include "exec/gdbstub.h"

--- a/bsd-user/signal.c
+++ b/bsd-user/signal.c
@@ -16,12 +16,7 @@
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <errno.h>
+#include "qemu/osdep.h"

 #include "qemu.h"
 #include "target_signal.h"
--- a/bsd-user/sparc/target_syscall.h
+++ b/bsd-user/sparc/target_syscall.h
@@ -1,3 +1,6 @@
+#ifndef TARGET_SYSCALL_H
+#define TARGET_SYSCALL_H
+
 struct target_pt_regs {
 	abi_ulong psr;
 	abi_ulong pc;
@@ -7,3 +10,5 @@ struct target_pt_regs {
 };

 #define UNAME_MACHINE "sun4"
+
+#endif  /* TARGET_SYSCALL_H */
--- a/bsd-user/sparc64/target_syscall.h
+++ b/bsd-user/sparc64/target_syscall.h
@@ -1,3 +1,6 @@
+#ifndef TARGET_SYSCALL_H
+#define TARGET_SYSCALL_H
+
 struct target_pt_regs {
 	abi_ulong u_regs[16];
 	abi_ulong tstate;
@@ -8,3 +11,5 @@ struct target_pt_regs {
 };

 #define UNAME_MACHINE "sun4u"
+
+#endif  /* TARGET_SYSCALL_H */
--- a/bsd-user/strace.c
+++ b/bsd-user/strace.c
@@ -16,14 +16,10 @@
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

-#include <stdio.h>
-#include <errno.h>
+#include "qemu/osdep.h"
 #include <sys/select.h>
-#include <sys/types.h>
-#include <unistd.h>
 #include <sys/syscall.h>
 #include <sys/ioccom.h>
-#include <ctype.h>

 #include "qemu.h"

--- a/bsd-user/syscall.c
+++ b/bsd-user/syscall.c
@@ -16,17 +16,7 @@
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <stdarg.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <time.h>
-#include <limits.h>
-#include <sys/types.h>
+#include "qemu/osdep.h"
 #include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/param.h>
--- a/bsd-user/uaccess.c
+++ b/bsd-user/uaccess.c
@@ -1,6 +1,5 @@
 /* User memory access */
-#include <stdio.h>
-#include <string.h>
+#include "qemu/osdep.h"

 #include "qemu.h"

--- a/bsd-user/x86_64/target_syscall.h
+++ b/bsd-user/x86_64/target_syscall.h
@@ -1,3 +1,6 @@
+#ifndef TARGET_SYSCALL_H
+#define TARGET_SYSCALL_H
+
 #define __USER_CS	(0x33)
 #define __USER_DS	(0x2B)

@@ -114,3 +117,5 @@ struct target_msqid64_ds {
 #define TARGET_ARCH_SET_FS 0x1002
 #define TARGET_ARCH_GET_FS 0x1003
 #define TARGET_ARCH_GET_GS 0x1004
+
+#endif  /* TARGET_SYSCALL_H */
--- a/bt-host.c
+++ b/bt-host.c
@@ -17,12 +17,12 @@
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 */

+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "sysemu/bt.h"
 #include "qemu/main-loop.h"

 #ifndef _WIN32
-# include <errno.h>
 # include <sys/ioctl.h>
 # include <sys/uio.h>
 # ifdef CONFIG_BLUEZ
--- a/bt-vhci.c
+++ b/bt-vhci.c
@@ -17,6 +17,7 @@
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 */

+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "sysemu/bt.h"
 #include "hw/bt.h"
--- a/292
+++ b/292
@@ -116,38 +116,6 @@ compile_prog() {
  do_cc $QEMU_CFLAGS $local_cflags -o $TMPE $TMPC $LDFLAGS $local_ldflags
 }

-do_libtool() {
-    local mode=$1
-    shift
-    # Run the compiler, capturing its output to the log.
-    echo $libtool $mode --tag=CC $cc "$@" >> config.log
-    $libtool $mode --tag=CC $cc "$@" >> config.log 2>&1 || return $?
-    # Test passed. If this is an --enable-werror build, rerun
-    # the test with -Werror and bail out if it fails. This
-    # makes warning-generating-errors in configure test code
-    # obvious to developers.
-    if test "$werror" != "yes"; then
-        return 0
-    fi
-    # Don't bother rerunning the compile if we were already using -Werror
-    case "$*" in
-        *-Werror*)
-           return 0
-        ;;
-    esac
-    echo $libtool $mode --tag=CC $cc -Werror "$@" >> config.log
-    $libtool $mode --tag=CC $cc -Werror "$@" >> config.log 2>&1 && return $?
-    error_exit "configure test passed without -Werror but failed with -Werror." \
-        "This is probably a bug in the configure script. The failing command" \
-        "will be at the bottom of config.log." \
-        "You can run configure with --disable-werror to bypass this check."
-}
-
-libtool_prog() {
-    do_libtool --mode=compile $QEMU_CFLAGS -c -fPIE -DPIE -o $TMPO $TMPC || return $?
-    do_libtool --mode=link $LDFLAGS -o $TMPA $TMPL -rpath /usr/local/lib
-}
-
 # symbolically link $1 to $2.  Portable version of "ln -sf".
 symlink() {
  rm -rf "$2"
@@ -303,7 +271,7 @@ pkgversion=""
 pie=""
 zero_malloc=""
 qom_cast_debug="yes"
-trace_backends="nop"
+trace_backends="log"
 trace_file="trace"
 spice=""
 rbd=""
@@ -311,6 +279,8 @@ smartcard=""
 libusb=""
 usb_redir=""
 opengl=""
+opengl_dmabuf="no"
+avx2_opt="no"
 zlib="yes"
 lzo=""
 snappy=""
@@ -336,8 +306,10 @@ gtkabi=""
 gtk_gl="no"
 gnutls=""
 gnutls_hash=""
+gnutls_rnd=""
 nettle=""
 gcrypt=""
+gcrypt_kdf="no"
 vte=""
 virglrenderer=""
 tpm="yes"
@@ -398,7 +370,6 @@ as="${AS-${cross_prefix}as}"
 cpp="${CPP-$cc -E}"
 objcopy="${OBJCOPY-${cross_prefix}objcopy}"
 ld="${LD-${cross_prefix}ld}"
-libtool="${LIBTOOL-${cross_prefix}libtool}"
 nm="${NM-${cross_prefix}nm}"
 strip="${STRIP-${cross_prefix}strip}"
 windres="${WINDRES-${cross_prefix}windres}"
@@ -1514,7 +1485,6 @@ EOF
    if do_cc $QEMU_CFLAGS -Werror $flag -c -o $TMPO $TMPC &&
       compile_prog "-Werror $flag" ""; then
      QEMU_CFLAGS="$QEMU_CFLAGS $flag"
-      LIBTOOLFLAGS="$LIBTOOLFLAGS -Wc,$flag"
      sp_on=1
      break
    fi
@@ -1609,32 +1579,6 @@ EOF
  fi
 fi

-# check for broken gcc and libtool in RHEL5
-if test -n "$libtool" -a "$pie" != "no" ; then
-  cat > $TMPC <<EOF
-
-void *f(unsigned char *buf, int len);
-void *g(unsigned char *buf, int len);
-
-void *
-f(unsigned char *buf, int len)
-{
-    return (void*)0L;
-}
-
-void *
-g(unsigned char *buf, int len)
-{
-    return f(buf, len);
-}
-
-EOF
-  if ! libtool_prog; then
-    echo "Disabling libtool due to broken toolchain support"
-    libtool=
-  fi
-fi
-
 ##########################################
 # __sync_fetch_and_and requires at least -march=i486. Many toolchains
 # use i686 as default anyway, but for those that don't, an explicit
@@ -1832,6 +1776,21 @@ EOF
 fi

 ##########################################
+# avx2 optimization requirement check
+
+cat > $TMPC << EOF
+static void bar(void) {}
+static void *bar_ifunc(void) {return (void*) bar;}
+static void foo(void) __attribute__((ifunc("bar_ifunc")));
+int main(void) { foo(); return 0; }
+EOF
+if compile_prog "-mavx2" "" ; then
+    if readelf --syms $TMPE |grep "IFUNC.*foo" >/dev/null 2>&1; then
+        avx2_opt="yes"
+    fi
+fi
+
+#########################################
 # zlib check

 if test "$zlib" != "no" ; then
@@ -2111,100 +2070,10 @@ EOF
    xen_ctrl_version=420
    xen=yes

-  elif
-      cat > $TMPC <<EOF &&
-#include <xenctrl.h>
-#include <xs.h>
-#include <stdint.h>
-#include <xen/hvm/hvm_info_table.h>
-#if !defined(HVM_MAX_VCPUS)
-# error HVM_MAX_VCPUS not defined
-#endif
-int main(void) {
-  xs_daemon_open();
-  xc_interface_open(0, 0, 0);
-  xc_hvm_set_mem_type(0, 0, HVMMEM_ram_ro, 0, 0);
-  xc_gnttab_open(NULL, 0);
-  xc_domain_add_to_physmap(0, 0, XENMAPSPACE_gmfn, 0, 0);
-  return 0;
-}
-EOF
-      compile_prog "" "$xen_libs"
-    then
-    xen_ctrl_version=410
-    xen=yes
-
-  # Xen 4.0.0
-  elif
-      cat > $TMPC <<EOF &&
-#include <xenctrl.h>
-#include <xs.h>
-#include <stdint.h>
-#include <xen/hvm/hvm_info_table.h>
-#if !defined(HVM_MAX_VCPUS)
-# error HVM_MAX_VCPUS not defined
-#endif
-int main(void) {
-  struct xen_add_to_physmap xatp = {
-    .domid = 0, .space = XENMAPSPACE_gmfn, .idx = 0, .gpfn = 0,
-  };
-  xs_daemon_open();
-  xc_interface_open();
-  xc_gnttab_open();
-  xc_hvm_set_mem_type(0, 0, HVMMEM_ram_ro, 0, 0);
-  xc_memory_op(0, XENMEM_add_to_physmap, &xatp);
-  return 0;
-}
-EOF
-      compile_prog "" "$xen_libs"
-    then
-    xen_ctrl_version=400
-    xen=yes
-
-  # Xen 3.4.0
-  elif
-      cat > $TMPC <<EOF &&
-#include <xenctrl.h>
-#include <xs.h>
-int main(void) {
-  struct xen_add_to_physmap xatp = {
-    .domid = 0, .space = XENMAPSPACE_gmfn, .idx = 0, .gpfn = 0,
-  };
-  xs_daemon_open();
-  xc_interface_open();
-  xc_gnttab_open();
-  xc_hvm_set_mem_type(0, 0, HVMMEM_ram_ro, 0, 0);
-  xc_memory_op(0, XENMEM_add_to_physmap, &xatp);
-  return 0;
-}
-EOF
-      compile_prog "" "$xen_libs"
-    then
-    xen_ctrl_version=340
-    xen=yes
-
-  # Xen 3.3.0
-  elif
-      cat > $TMPC <<EOF &&
-#include <xenctrl.h>
-#include <xs.h>
-int main(void) {
-  xs_daemon_open();
-  xc_interface_open();
-  xc_gnttab_open();
-  xc_hvm_set_mem_type(0, 0, HVMMEM_ram_ro, 0, 0);
-  return 0;
-}
-EOF
-      compile_prog "" "$xen_libs"
-    then
-    xen_ctrl_version=330
-    xen=yes
-
-  # Xen version unsupported
  else
    if test "$xen" = "yes" ; then
-      feature_not_found "xen (unsupported version)" "Install supported xen (e.g. 4.0, 3.4, 3.3)"
+      feature_not_found "xen (unsupported version)" \
+                        "Install a supported xen (xen 4.2 or newer)"
    fi
    xen=no
  fi
@@ -2218,15 +2087,10 @@ EOF
 fi

 if test "$xen_pci_passthrough" != "no"; then
-  if test "$xen" = "yes" && test "$linux" = "yes" &&
-    test "$xen_ctrl_version" -ge 340; then
+  if test "$xen" = "yes" && test "$linux" = "yes"; then
    xen_pci_passthrough=yes
  else
    if test "$xen_pci_passthrough" = "yes"; then
-      if test "$xen_ctrl_version" -lt 340; then
-        error_exit "User requested feature Xen PCI Passthrough" \
-            "This feature does not work with Xen 3.3"
-      fi
      error_exit "User requested feature Xen PCI Passthrough" \
          " but this feature requires /sys from Linux"
    fi
@@ -2240,21 +2104,6 @@ if test "$xen_pv_domain_build" = "yes" &&
 	       "which requires Xen support."
 fi

-##########################################
-# libtool probe
-
-if ! has $libtool; then
-    libtool=
-fi
-
-# MacOSX ships with a libtool which isn't the GNU one; weed this
-# out by checking whether libtool supports the --version switch
-if test -n "$libtool"; then
-  if ! "$libtool" --version >/dev/null 2>&1; then
-    libtool=
-  fi
-fi
-
 ##########################################
 # Sparse probe
 if test "$sparse" != "no" ; then
@@ -2354,6 +2203,13 @@ if test "$gnutls" != "no"; then
 	    gnutls_hash="no"
 	fi

+	# gnutls_rnd requires >= 2.11.0
+	if $pkg_config --exists "gnutls >= 2.11.0"; then
+	    gnutls_rnd="yes"
+	else
+	    gnutls_rnd="no"
+	fi
+
 	if $pkg_config --exists 'gnutls >= 3.0'; then
 	    gnutls_gcrypt=no
 	    gnutls_nettle=yes
@@ -2381,9 +2237,11 @@ if test "$gnutls" != "no"; then
    else
        gnutls="no"
        gnutls_hash="no"
+        gnutls_rnd="no"
    fi
 else
    gnutls_hash="no"
+    gnutls_rnd="no"
 fi


@@ -2445,6 +2303,19 @@ if test "$gcrypt" != "no"; then
        if test -z "$nettle"; then
           nettle="no"
        fi
+
+        cat > $TMPC << EOF
+#include <gcrypt.h>
+int main(void) {
+  gcry_kdf_derive(NULL, 0, GCRY_KDF_PBKDF2,
+                  GCRY_MD_SHA256,
+                  NULL, 0, 0, 0, NULL);
+ return 0;
+}
+EOF
+        if compile_prog "$gcrypt_cflags" "$gcrypt_libs" ; then
+            gcrypt_kdf=yes
+        fi
    else
        if test "$gcrypt" = "yes"; then
            feature_not_found "gcrypt" "Install gcrypt devel"
@@ -2965,7 +2836,7 @@ fi
 # curses probe
 if test "$curses" != "no" ; then
  if test "$mingw32" = "yes" ; then
-    curses_list="-lpdcurses"
+    curses_list="$($pkg_config --libs ncurses 2>/dev/null):-lpdcurses"
  else
    curses_list="$($pkg_config --libs ncurses 2>/dev/null):-lncurses:-lcurses"
  fi
@@ -3063,6 +2934,30 @@ for i in $glib_modules; do
    fi
 done

+# Sanity check that the current size_t matches the
+# size that glib thinks it should be. This catches
+# problems on multi-arch where people try to build
+# 32-bit QEMU while pointing at 64-bit glib headers
+cat > $TMPC <<EOF
+#include <glib.h>
+#include <unistd.h>
+
+#define QEMU_BUILD_BUG_ON(x) \
+  typedef char qemu_build_bug_on[(x)?-1:1] __attribute__((unused));
+
+int main(void) {
+   QEMU_BUILD_BUG_ON(sizeof(size_t) != GLIB_SIZEOF_SIZE_T);
+   return 0;
+}
+EOF
+
+if ! compile_prog "-Werror $CFLAGS" "$LIBS" ; then
+    error_exit "sizeof(size_t) doesn't match GLIB_SIZEOF_SIZE_T."\
+               "You probably need to set PKG_CONFIG_LIBDIR"\
+	       "to point to the right pkg-config files for your"\
+	       "build target"
+fi
+
 # g_test_trap_subprocess added in 2.38. Used by some tests.
 glib_subprocess=yes
 if ! $pkg_config --atleast-version=2.38 glib-2.0; then
@@ -3420,7 +3315,7 @@ libs_softmmu="$libs_softmmu $fdt_libs"
 # opengl probe (for sdl2, gtk, milkymist-tmu2)

 if test "$opengl" != "no" ; then
-  opengl_pkgs="epoxy"
+  opengl_pkgs="epoxy libdrm gbm"
  if $pkg_config $opengl_pkgs x11; then
    opengl_cflags="$($pkg_config --cflags $opengl_pkgs) $x11_cflags"
    opengl_libs="$($pkg_config --libs $opengl_pkgs) $x11_libs"
@@ -3438,6 +3333,18 @@ if test "$opengl" != "no" ; then
  fi
 fi

+if test "$opengl" = "yes"; then
+  cat > $TMPC << EOF
+#include <epoxy/egl.h>
+#ifndef EGL_MESA_image_dma_buf_export
+# error mesa/epoxy lacks support for dmabufs (mesa 10.6+)
+#endif
+int main(void) { return 0; }
+EOF
+  if compile_prog "" "" ; then
+    opengl_dmabuf=yes
+  fi
+fi

 ##########################################
 # archipelago probe
@@ -4831,7 +4738,9 @@ echo "GTK support       $gtk"
 echo "GTK GL support    $gtk_gl"
 echo "GNUTLS support    $gnutls"
 echo "GNUTLS hash       $gnutls_hash"
+echo "GNUTLS rnd        $gnutls_rnd"
 echo "libgcrypt         $gcrypt"
+echo "libgcrypt kdf     $gcrypt_kdf"
 if test "$nettle" = "yes"; then
    echo "nettle            $nettle ($nettle_version)"
 else
@@ -4898,6 +4807,7 @@ echo "smartcard support $smartcard"
 echo "libusb            $libusb"
 echo "usb net redir     $usb_redir"
 echo "OpenGL support    $opengl"
+echo "OpenGL dmabufs    $opengl_dmabuf"
 echo "libiscsi support  $libiscsi"
 echo "libnfs support    $libnfs"
 echo "build guest agent $guest_agent"
@@ -4922,6 +4832,7 @@ echo "bzip2 support     $bzip2"
 echo "NUMA host support $numa"
 echo "tcmalloc support  $tcmalloc"
 echo "jemalloc support  $jemalloc"
+echo "avx2 optimization $avx2_opt"

 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -5196,6 +5107,7 @@ if test "$gtk" = "yes" ; then
  echo "CONFIG_GTK=y" >> $config_host_mak
  echo "CONFIG_GTKABI=$gtkabi" >> $config_host_mak
  echo "GTK_CFLAGS=$gtk_cflags" >> $config_host_mak
+  echo "GTK_LIBS=$gtk_libs" >> $config_host_mak
  if test "$gtk_gl" = "yes" ; then
    echo "CONFIG_GTK_GL=y" >> $config_host_mak
  fi
@@ -5206,8 +5118,14 @@ fi
 if test "$gnutls_hash" = "yes" ; then
  echo "CONFIG_GNUTLS_HASH=y" >> $config_host_mak
 fi
+if test "$gnutls_rnd" = "yes" ; then
+  echo "CONFIG_GNUTLS_RND=y" >> $config_host_mak
+fi
 if test "$gcrypt" = "yes" ; then
  echo "CONFIG_GCRYPT=y" >> $config_host_mak
+  if test "$gcrypt_kdf" = "yes" ; then
+    echo "CONFIG_GCRYPT_KDF=y" >> $config_host_mak
+  fi
 fi
 if test "$nettle" = "yes" ; then
  echo "CONFIG_NETTLE=y" >> $config_host_mak
@@ -5304,6 +5222,13 @@ if test "$opengl" = "yes" ; then
  echo "CONFIG_OPENGL=y" >> $config_host_mak
  echo "OPENGL_CFLAGS=$opengl_cflags" >> $config_host_mak
  echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak
+  if test "$opengl_dmabuf" = "yes" ; then
+    echo "CONFIG_OPENGL_DMABUF=y" >> $config_host_mak
+  fi
+fi
+
+if test "$avx2_opt" = "yes" ; then
+  echo "CONFIG_AVX2_OPT=y" >> $config_host_mak
 fi

 if test "$lzo" = "yes" ; then
@@ -5445,8 +5370,8 @@ if have_backend "simple"; then
  # Set the appropriate trace file.
  trace_file="\"$trace_file-\" FMT_pid"
 fi
-if have_backend "stderr"; then
-  echo "CONFIG_TRACE_STDERR=y" >> $config_host_mak
+if have_backend "log"; then
+  echo "CONFIG_TRACE_LOG=y" >> $config_host_mak
 fi
 if have_backend "ust"; then
  echo "CONFIG_TRACE_UST=y" >> $config_host_mak
@@ -5501,13 +5426,8 @@ echo "MAKE=$make" >> $config_host_mak
 echo "INSTALL=$install" >> $config_host_mak
 echo "INSTALL_DIR=$install -d -m 0755" >> $config_host_mak
 echo "INSTALL_DATA=$install -c -m 0644" >> $config_host_mak
-if test -n "$libtool"; then
-  echo "INSTALL_PROG=\$(LIBTOOL) --mode=install $install -c -m 0755" >> $config_host_mak
-  echo "INSTALL_LIB=\$(LIBTOOL) --mode=install $install -c -m 0644" >> $config_host_mak
-else
-  echo "INSTALL_PROG=$install -c -m 0755" >> $config_host_mak
-  echo "INSTALL_LIB=$install -c -m 0644" >> $config_host_mak
-fi
+echo "INSTALL_PROG=$install -c -m 0755" >> $config_host_mak
+echo "INSTALL_LIB=$install -c -m 0644" >> $config_host_mak
 echo "PYTHON=$python" >> $config_host_mak
 echo "CC=$cc" >> $config_host_mak
 if $iasl -h > /dev/null 2>&1; then
@@ -5525,7 +5445,6 @@ echo "OBJCOPY=$objcopy" >> $config_host_mak
 echo "LD=$ld" >> $config_host_mak
 echo "NM=$nm" >> $config_host_mak
 echo "WINDRES=$windres" >> $config_host_mak
-echo "LIBTOOL=$libtool" >> $config_host_mak
 echo "CFLAGS=$CFLAGS" >> $config_host_mak
 echo "CFLAGS_NOPIE=$CFLAGS_NOPIE" >> $config_host_mak
 echo "QEMU_CFLAGS=$QEMU_CFLAGS" >> $config_host_mak
@@ -5544,7 +5463,6 @@ else
 fi
 echo "LDFLAGS=$LDFLAGS" >> $config_host_mak
 echo "LDFLAGS_NOPIE=$LDFLAGS_NOPIE" >> $config_host_mak
-echo "LIBTOOLFLAGS=$LIBTOOLFLAGS" >> $config_host_mak
 echo "LIBS+=$LIBS" >> $config_host_mak
 echo "LIBS_TOOLS+=$libs_tools" >> $config_host_mak
 echo "EXESUF=$EXESUF" >> $config_host_mak
--- a/contrib/ivshmem-client/ivshmem-client.c
+++ b/contrib/ivshmem-client/ivshmem-client.c
@@ -6,7 +6,7 @@
 * top-level directory.
 */

-#include <sys/types.h>
+#include "qemu/osdep.h"
 #include <sys/socket.h>
 #include <sys/un.h>

--- a/contrib/ivshmem-client/ivshmem-client.h
+++ b/contrib/ivshmem-client/ivshmem-client.h
@@ -19,7 +19,6 @@
 * purposes.
 */

-#include <limits.h>
 #include <sys/select.h>

 #include "qemu/queue.h"
--- a/contrib/ivshmem-client/main.c
+++ b/contrib/ivshmem-client/main.c
@@ -6,6 +6,7 @@
 * top-level directory.
 */

+#include "qemu/osdep.h"
 #include "qemu-common.h"

 #include "ivshmem-client.h"
--- a/contrib/ivshmem-server/ivshmem-server.c
+++ b/contrib/ivshmem-server/ivshmem-server.c
@@ -5,16 +5,13 @@
 * (at your option) any later version.  See the COPYING file in the
 * top-level directory.
 */
+#include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "qemu/sockets.h"

 #include <sys/mman.h>
-#include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/un.h>
-#ifdef CONFIG_LINUX
-#include <sys/vfs.h>
-#endif

 #include "ivshmem-server.h"

@@ -257,7 +254,8 @@ ivshmem_server_ftruncate(int fd, unsigned shmsize)
 /* Init a new ivshmem server */
 int
 ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
-                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    const char *shm_path, bool use_shm_open,
+                    size_t shm_size, unsigned n_vectors,
                    bool verbose)
 {
    int ret;
@@ -278,6 +276,7 @@ ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
        return -1;
    }

+    server->use_shm_open = use_shm_open;
    server->shm_size = shm_size;
    server->n_vectors = n_vectors;

@@ -286,31 +285,6 @@ ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
    return 0;
 }

-#ifdef CONFIG_LINUX
-
-#define HUGETLBFS_MAGIC       0x958458f6
-
-static long gethugepagesize(const char *path)
-{
-    struct statfs fs;
-    int ret;
-
-    do {
-        ret = statfs(path, &fs);
-    } while (ret != 0 && errno == EINTR);
-
-    if (ret != 0) {
-        return -1;
-    }
-
-    if (fs.f_type != HUGETLBFS_MAGIC) {
-        return -1;
-    }
-
-    return fs.f_bsize;
-}
-#endif
-
 /* open shm, create and bind to the unix socket */
 int
 ivshmem_server_start(IvshmemServer *server)
@@ -319,27 +293,17 @@ ivshmem_server_start(IvshmemServer *server)
    int shm_fd, sock_fd, ret;

    /* open shm file */
-#ifdef CONFIG_LINUX
-    long hpagesize;
-
-    hpagesize = gethugepagesize(server->shm_path);
-    if (hpagesize < 0 && errno != ENOENT) {
-        IVSHMEM_SERVER_DEBUG(server, "cannot stat shm file %s: %s\n",
-                             server->shm_path, strerror(errno));
-    }
-
-    if (hpagesize > 0) {
+    if (server->use_shm_open) {
+        IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n",
+                             server->shm_path);
+        shm_fd = shm_open(server->shm_path, O_CREAT | O_RDWR, S_IRWXU);
+    } else {
        gchar *filename = g_strdup_printf("%s/ivshmem.XXXXXX", server->shm_path);
-        IVSHMEM_SERVER_DEBUG(server, "Using hugepages: %s\n", server->shm_path);
+        IVSHMEM_SERVER_DEBUG(server, "Using file-backed shared memory: %s\n",
+                             server->shm_path);
        shm_fd = mkstemp(filename);
        unlink(filename);
        g_free(filename);
-    } else
-#endif
-    {
-        IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n",
-                             server->shm_path);
-        shm_fd = shm_open(server->shm_path, O_CREAT|O_RDWR, S_IRWXU);
    }

    if (shm_fd < 0) {
--- a/contrib/ivshmem-server/ivshmem-server.h
+++ b/contrib/ivshmem-server/ivshmem-server.h
@@ -26,10 +26,7 @@
 * associated to the ivshmem shared memory.
 */

-#include <limits.h>
 #include <sys/select.h>
-#include <stdint.h>
-#include <stdbool.h>

 #include "qemu/event_notifier.h"
 #include "qemu/queue.h"
@@ -69,6 +66,7 @@ typedef struct IvshmemServer {
    char unix_sock_path[PATH_MAX];   /**< path to unix socket */
    int sock_fd;                     /**< unix sock file descriptor */
    char shm_path[PATH_MAX];         /**< path to shm */
+    bool use_shm_open;
    size_t shm_size;                 /**< size of shm */
    int shm_fd;                      /**< shm file descriptor */
    unsigned n_vectors;              /**< number of vectors */
@@ -92,7 +90,8 @@ typedef struct IvshmemServer {
 */
 int
 ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
-                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    const char *shm_path, bool use_shm_open,
+                    size_t shm_size, unsigned n_vectors,
                    bool verbose);

 /**
--- a/contrib/ivshmem-server/main.c
+++ b/contrib/ivshmem-server/main.c
@@ -6,6 +6,7 @@
 * top-level directory.
 */

+#include "qemu/osdep.h"
 #include "qemu-common.h"

 #include "ivshmem-server.h"
@@ -28,35 +29,38 @@ typedef struct IvshmemServerArgs {
    const char *pid_file;
    const char *unix_socket_path;
    const char *shm_path;
+    bool use_shm_open;
    uint64_t shm_size;
    unsigned n_vectors;
 } IvshmemServerArgs;

-/* show ivshmem_server_usage and exit with given error code */
 static void
-ivshmem_server_usage(const char *name, int code)
+ivshmem_server_usage(const char *progname)
 {
-    fprintf(stderr, "%s [opts]\n", name);
-    fprintf(stderr, "  -h: show this help\n");
-    fprintf(stderr, "  -v: verbose mode\n");
-    fprintf(stderr, "  -F: foreground mode (default is to daemonize)\n");
-    fprintf(stderr, "  -p <pid_file>: path to the PID file (used in daemon\n"
-                    "     mode only).\n"
-                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
-    fprintf(stderr, "  -S <unix_socket_path>: path to the unix socket\n"
-                    "     to listen to.\n"
-                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH);
-    fprintf(stderr, "  -m <shm_path>: path to the shared memory.\n"
-                    "     The path corresponds to a POSIX shm name or a\n"
-                    "     hugetlbfs mount point.\n"
-                    "     default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
-    fprintf(stderr, "  -l <size>: size of shared memory in bytes. The suffix\n"
-                    "     K, M and G can be used (ex: 1K means 1024).\n"
-                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_SHM_SIZE);
-    fprintf(stderr, "  -n <n_vects>: number of vectors.\n"
-                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_N_VECTORS);
+    printf("Usage: %s [OPTION]...\n"
+           "  -h: show this help\n"
+           "  -v: verbose mode\n"
+           "  -F: foreground mode (default is to daemonize)\n"
+           "  -p <pid-file>: path to the PID file (used in daemon mode only)\n"
+           "     default " IVSHMEM_SERVER_DEFAULT_PID_FILE "\n"
+           "  -S <unix-socket-path>: path to the unix socket to listen to\n"
+           "     default " IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH "\n"
+           "  -M <shm-name>: POSIX shared memory object to use\n"
+           "     default " IVSHMEM_SERVER_DEFAULT_SHM_PATH "\n"
+           "  -m <dir-name>: where to create shared memory\n"
+           "  -l <size>: size of shared memory in bytes\n"
+           "     suffixes K, M and G can be used, e.g. 1K means 1024\n"
+           "     default %u\n"
+           "  -n <nvectors>: number of vectors\n"
+           "     default %u\n",
+           progname, IVSHMEM_SERVER_DEFAULT_SHM_SIZE,
+           IVSHMEM_SERVER_DEFAULT_N_VECTORS);
+}

-    exit(code);
+static void
+ivshmem_server_help(const char *progname)
+{
+    fprintf(stderr, "Try '%s -h' for more information.\n", progname);
 }

 /* parse the program arguments, exit on error */
@@ -67,20 +71,12 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
    unsigned long long v;
    Error *err = NULL;

-    while ((c = getopt(argc, argv,
-                       "h"  /* help */
-                       "v"  /* verbose */
-                       "F"  /* foreground */
-                       "p:" /* pid_file */
-                       "S:" /* unix_socket_path */
-                       "m:" /* shm_path */
-                       "l:" /* shm_size */
-                       "n:" /* n_vectors */
-                      )) != -1) {
+    while ((c = getopt(argc, argv, "hvFp:S:m:M:l:n:")) != -1) {

        switch (c) {
        case 'h': /* help */
-            ivshmem_server_usage(argv[0], 0);
+            ivshmem_server_usage(argv[0]);
+            exit(0);
            break;

        case 'v': /* verbose */
@@ -91,36 +87,41 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
            args->foreground = 1;
            break;

-        case 'p': /* pid_file */
+        case 'p': /* pid file */
            args->pid_file = optarg;
            break;

-        case 'S': /* unix_socket_path */
+        case 'S': /* unix socket path */
            args->unix_socket_path = optarg;
            break;

-        case 'm': /* shm_path */
+        case 'M': /* shm name */
+        case 'm': /* dir name */
            args->shm_path = optarg;
+            args->use_shm_open = c == 'M';
            break;

-        case 'l': /* shm_size */
+        case 'l': /* shm size */
            parse_option_size("shm_size", optarg, &args->shm_size, &err);
            if (err) {
                error_report_err(err);
-                ivshmem_server_usage(argv[0], 1);
+                ivshmem_server_help(argv[0]);
+                exit(1);
            }
            break;

-        case 'n': /* n_vectors */
+        case 'n': /* number of vectors */
            if (parse_uint_full(optarg, &v, 0) < 0) {
                fprintf(stderr, "cannot parse n_vectors\n");
-                ivshmem_server_usage(argv[0], 1);
+                ivshmem_server_help(argv[0]);
+                exit(1);
            }
            args->n_vectors = v;
            break;

        default:
-            ivshmem_server_usage(argv[0], 1);
+            ivshmem_server_usage(argv[0]);
+            exit(1);
            break;
        }
    }
@@ -128,12 +129,14 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
    if (args->n_vectors > IVSHMEM_SERVER_MAX_VECTORS) {
        fprintf(stderr, "too many requested vectors (max is %d)\n",
                IVSHMEM_SERVER_MAX_VECTORS);
-        ivshmem_server_usage(argv[0], 1);
+        ivshmem_server_help(argv[0]);
+        exit(1);
    }

    if (args->verbose == 1 && args->foreground == 0) {
        fprintf(stderr, "cannot use verbose in daemon mode\n");
-        ivshmem_server_usage(argv[0], 1);
+        ivshmem_server_help(argv[0]);
+        exit(1);
    }
 }

@@ -191,11 +194,18 @@ main(int argc, char *argv[])
        .pid_file = IVSHMEM_SERVER_DEFAULT_PID_FILE,
        .unix_socket_path = IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH,
        .shm_path = IVSHMEM_SERVER_DEFAULT_SHM_PATH,
+        .use_shm_open = true,
        .shm_size = IVSHMEM_SERVER_DEFAULT_SHM_SIZE,
        .n_vectors = IVSHMEM_SERVER_DEFAULT_N_VECTORS,
    };
    int ret = 1;

+    /*
+     * Do not remove this notice without adding proper error handling!
+     * Start with handling ivshmem_server_send_one_msg() failure.
+     */
+    printf("*** Example code, do not use in production ***\n");
+
    /* parse arguments, will exit on error */
    ivshmem_server_parse_args(&args, argc, argv);

@@ -218,7 +228,8 @@ main(int argc, char *argv[])
    }

    /* init the ivshms structure */
-    if (ivshmem_server_init(&server, args.unix_socket_path, args.shm_path,
+    if (ivshmem_server_init(&server, args.unix_socket_path,
+                            args.shm_path, args.use_shm_open,
                            args.shm_size, args.n_vectors, args.verbose) < 0) {
        fprintf(stderr, "cannot init server\n");
        goto err;
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -27,6 +27,7 @@
 #include "exec/address-spaces.h"
 #include "qemu/rcu.h"
 #include "exec/tb-hash.h"
+#include "exec/log.h"
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
 #include "hw/i386/apic.h"
 #endif
--- a/cpus.c
+++ b/cpus.c
@@ -29,6 +29,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/block-backend.h"
 #include "exec/gdbstub.h"
 #include "sysemu/dma.h"
 #include "sysemu/kvm.h"
@@ -370,9 +371,12 @@ static void icount_warp_rt(void)
    }
 }

-static void icount_dummy_timer(void *opaque)
+static void icount_timer_cb(void *opaque)
 {
-    (void)opaque;
+    /* No need for a checkpoint because the timer already synchronizes
+     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
+     */
+    icount_warp_rt();
 }

 void qtest_clock_warp(int64_t dest)
@@ -396,17 +400,12 @@ void qtest_clock_warp(int64_t dest)
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 }

-void qemu_clock_warp(QEMUClockType type)
+void qemu_start_warp_timer(void)
 {
    int64_t clock;
    int64_t deadline;

-    /*
-     * There are too many global variables to make the "warp" behavior
-     * applicable to other clocks.  But a clock argument removes the
-     * need for if statements all over the place.
-     */
-    if (type != QEMU_CLOCK_VIRTUAL || !use_icount) {
+    if (!use_icount) {
        return;
    }

@@ -418,29 +417,17 @@ void qemu_clock_warp(QEMUClockType type)
    }

    /* warp clock deterministically in record/replay mode */
-    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP)) {
+    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

-    if (icount_sleep) {
-        /*
-         * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
-         * This ensures that the deadline for the timer is computed correctly
-         * below.
-         * This also makes sure that the insn counter is synchronized before
-         * the CPU starts running, in case the CPU is woken by an event other
-         * than the earliest QEMU_CLOCK_VIRTUAL timer.
-         */
-        icount_warp_rt();
-        timer_del(icount_warp_timer);
-    }
    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
-	return;
+        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
@@ -496,6 +483,28 @@ void qemu_clock_warp(QEMUClockType type)
    }
 }

+static void qemu_account_warp_timer(void)
+{
+    if (!use_icount || !icount_sleep) {
+        return;
+    }
+
+    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
+     * do not fire, so computing the deadline does not make sense.
+     */
+    if (!runstate_is_running()) {
+        return;
+    }
+
+    /* warp clock deterministically in record/replay mode */
+    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
+        return;
+    }
+
+    timer_del(icount_warp_timer);
+    icount_warp_rt();
+}
+
 static bool icount_state_needed(void *opaque)
 {
    return use_icount;
@@ -624,13 +633,13 @@ void configure_icount(QemuOpts *opts, Error **errp)
    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
-                                         icount_dummy_timer, NULL);
+                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
-        error_setg(errp, "align=on and sleep=no are incompatible");
+        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
@@ -643,7 +652,7 @@ void configure_icount(QemuOpts *opts, Error **errp)
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
-        error_setg(errp, "shift=auto and sleep=no are incompatible");
+        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;
@@ -726,7 +735,7 @@ static int do_vm_stop(RunState state)
    }

    bdrv_drain_all();
-    ret = bdrv_flush_all();
+    ret = blk_flush_all();

    return ret;
 }
@@ -995,9 +1004,6 @@ static void qemu_wait_io_event_common(CPUState *cpu)
 static void qemu_tcg_wait_io_event(CPUState *cpu)
 {
    while (all_cpu_threads_idle()) {
-       /* Start accounting real time to the virtual clock if the CPUs
-          are idle.  */
-        qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

@@ -1428,7 +1434,7 @@ int vm_stop_force_state(RunState state)
        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
-        return bdrv_flush_all();
+        return blk_flush_all();
    }
 }

@@ -1499,7 +1505,7 @@ static void tcg_exec_all(void)
    int r;

    /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
-    qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
+    qemu_account_warp_timer();

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
@@ -1568,28 +1574,22 @@ CpuInfoList *qmp_query_cpus(Error **errp)
        info->value->thread_id = cpu->thread_id;
 #if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
-        info->value->u.x86 = g_new0(CpuInfoX86, 1);
-        info->value->u.x86->pc = env->eip + env->segs[R_CS].base;
+        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
 #elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
-        info->value->u.ppc = g_new0(CpuInfoPPC, 1);
-        info->value->u.ppc->nip = env->nip;
+        info->value->u.ppc.nip = env->nip;
 #elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
-        info->value->u.sparc = g_new0(CpuInfoSPARC, 1);
-        info->value->u.sparc->pc = env->pc;
-        info->value->u.sparc->npc = env->npc;
+        info->value->u.q_sparc.pc = env->pc;
+        info->value->u.q_sparc.npc = env->npc;
 #elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
-        info->value->u.mips = g_new0(CpuInfoMIPS, 1);
-        info->value->u.mips->PC = env->active_tc.PC;
+        info->value->u.q_mips.PC = env->active_tc.PC;
 #elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
-        info->value->u.tricore = g_new0(CpuInfoTricore, 1);
-        info->value->u.tricore->PC = env->PC;
+        info->value->u.tricore.PC = env->PC;
 #else
        info->value->arch = CPU_INFO_ARCH_OTHER;
-        info->value->u.other = g_new0(CpuInfoOther, 1);
 #endif

        /* XXX: waiting for the qapi to support GSList */
--- a/cputlb.c
+++ b/cputlb.c
@@ -416,8 +416,8 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
            /* Write access calls the I/O callback.  */
            te->addr_write = address | TLB_MMIO;
        } else if (memory_region_is_ram(section->mr)
-                   && cpu_physical_memory_is_clean(section->mr->ram_addr
-                                                   + xlat)) {
+                   && cpu_physical_memory_is_clean(
+                        memory_region_get_ram_addr(section->mr) + xlat)) {
            te->addr_write = address | TLB_NOTDIRTY;
        } else {
            te->addr_write = address;
--- a/crypto/Makefile.objs
+++ b/crypto/Makefile.objs
@@ -8,6 +8,23 @@ crypto-obj-y += tlscredsanon.o
 crypto-obj-y += tlscredsx509.o
 crypto-obj-y += tlssession.o
 crypto-obj-y += secret.o
+crypto-obj-$(CONFIG_GCRYPT) += random-gcrypt.o
+crypto-obj-$(if $(CONFIG_GCRYPT),n,$(CONFIG_GNUTLS_RND)) += random-gnutls.o
+crypto-obj-y += pbkdf.o
+crypto-obj-$(CONFIG_NETTLE) += pbkdf-nettle.o
+crypto-obj-$(if $(CONFIG_NETTLE),n,$(CONFIG_GCRYPT_KDF)) += pbkdf-gcrypt.o
+crypto-obj-y += ivgen.o
+crypto-obj-y += ivgen-essiv.o
+crypto-obj-y += ivgen-plain.o
+crypto-obj-y += ivgen-plain64.o
+crypto-obj-y += afsplit.o
+crypto-obj-y += xts.o
+crypto-obj-y += block.o
+crypto-obj-y += block-qcow.o
+crypto-obj-y += block-luks.o

 # Let the userspace emulators avoid linking gnutls/etc
 crypto-aes-obj-y = aes.o
+
+stub-obj-y += random-stub.o
+stub-obj-y += pbkdf-stub.o
--- a/crypto/afsplit.c
+++ b/crypto/afsplit.c
@@ -0,0 +1,158 @@
+/*
+ * QEMU Crypto anti forensic information splitter
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * Derived from cryptsetup package lib/luks1/af.c
+ *
+ * Copyright (C) 2004, Clemens Fruhwirth <clemens@endorphin.org>
+ * Copyright (C) 2009-2012, Red Hat, Inc. All rights reserved.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/afsplit.h"
+#include "crypto/random.h"
+
+
+static void qcrypto_afsplit_xor(size_t blocklen,
+                                const uint8_t *in1,
+                                const uint8_t *in2,
+                                uint8_t *out)
+{
+    size_t i;
+    for (i = 0; i < blocklen; i++) {
+        out[i] = in1[i] ^ in2[i];
+    }
+}
+
+
+static int qcrypto_afsplit_hash(QCryptoHashAlgorithm hash,
+                                size_t blocklen,
+                                uint8_t *block,
+                                Error **errp)
+{
+    size_t digestlen = qcrypto_hash_digest_len(hash);
+
+    size_t hashcount = blocklen / digestlen;
+    size_t finallen = blocklen % digestlen;
+    uint32_t i;
+
+    if (finallen) {
+        hashcount++;
+    } else {
+        finallen = digestlen;
+    }
+
+    for (i = 0; i < hashcount; i++) {
+        uint8_t *out = NULL;
+        size_t outlen = 0;
+        uint32_t iv = cpu_to_be32(i);
+        struct iovec in[] = {
+            { .iov_base = &iv,
+              .iov_len = sizeof(iv) },
+            { .iov_base = block + (i * digestlen),
+              .iov_len = (i == (hashcount - 1)) ? finallen : digestlen },
+        };
+
+        if (qcrypto_hash_bytesv(hash,
+                                in,
+                                G_N_ELEMENTS(in),
+                                &out, &outlen,
+                                errp) < 0) {
+            return -1;
+        }
+
+        assert(outlen == digestlen);
+        memcpy(block + (i * digestlen), out,
+               (i == (hashcount - 1)) ? finallen : digestlen);
+        g_free(out);
+    }
+
+    return 0;
+}
+
+
+int qcrypto_afsplit_encode(QCryptoHashAlgorithm hash,
+                           size_t blocklen,
+                           uint32_t stripes,
+                           const uint8_t *in,
+                           uint8_t *out,
+                           Error **errp)
+{
+    uint8_t *block = g_new0(uint8_t, blocklen);
+    size_t i;
+    int ret = -1;
+
+    for (i = 0; i < (stripes - 1); i++) {
+        if (qcrypto_random_bytes(out + (i * blocklen), blocklen, errp) < 0) {
+            goto cleanup;
+        }
+
+        qcrypto_afsplit_xor(blocklen,
+                            out + (i * blocklen),
+                            block,
+                            block);
+        if (qcrypto_afsplit_hash(hash, blocklen, block,
+                                 errp) < 0) {
+            goto cleanup;
+        }
+    }
+    qcrypto_afsplit_xor(blocklen,
+                        in,
+                        block,
+                        out + (i * blocklen));
+    ret = 0;
+
+ cleanup:
+    g_free(block);
+    return ret;
+}
+
+
+int qcrypto_afsplit_decode(QCryptoHashAlgorithm hash,
+                           size_t blocklen,
+                           uint32_t stripes,
+                           const uint8_t *in,
+                           uint8_t *out,
+                           Error **errp)
+{
+    uint8_t *block = g_new0(uint8_t, blocklen);
+    size_t i;
+    int ret = -1;
+
+    for (i = 0; i < (stripes - 1); i++) {
+        qcrypto_afsplit_xor(blocklen,
+                            in + (i * blocklen),
+                            block,
+                            block);
+        if (qcrypto_afsplit_hash(hash, blocklen, block,
+                                 errp) < 0) {
+            goto cleanup;
+        }
+    }
+
+    qcrypto_afsplit_xor(blocklen,
+                        in + (i * blocklen),
+                        block,
+                        out);
+
+    ret = 0;
+
+ cleanup:
+    g_free(block);
+    return ret;
+}
--- a/crypto/block-luks.c
+++ b/crypto/block-luks.c
--- a/crypto/block-luks.h
+++ b/crypto/block-luks.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block device encryption LUKS format
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef QCRYPTO_BLOCK_LUKS_H__
+#define QCRYPTO_BLOCK_LUKS_H__
+
+#include "crypto/blockpriv.h"
+
+extern const QCryptoBlockDriver qcrypto_block_driver_luks;
+
+#endif /* QCRYPTO_BLOCK_LUKS_H__ */
--- a/crypto/block-qcow.c
+++ b/crypto/block-qcow.c
@@ -0,0 +1,173 @@
+/*
+ * QEMU Crypto block device encryption QCow/QCow2 AES-CBC format
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * Note that the block encryption implemented in this file is broken
+ * by design. This exists only to allow data to be liberated from
+ * existing qcow[2] images and should not be used in any new areas.
+ */
+
+#include "qemu/osdep.h"
+
+#include "crypto/block-qcow.h"
+#include "crypto/secret.h"
+
+#define QCRYPTO_BLOCK_QCOW_SECTOR_SIZE 512
+
+
+static bool
+qcrypto_block_qcow_has_format(const uint8_t *buf G_GNUC_UNUSED,
+                              size_t buf_size G_GNUC_UNUSED)
+{
+    return false;
+}
+
+
+static int
+qcrypto_block_qcow_init(QCryptoBlock *block,
+                        const char *keysecret,
+                        Error **errp)
+{
+    char *password;
+    int ret;
+    uint8_t keybuf[16];
+    int len;
+
+    memset(keybuf, 0, 16);
+
+    password = qcrypto_secret_lookup_as_utf8(keysecret, errp);
+    if (!password) {
+        return -1;
+    }
+
+    len = strlen(password);
+    memcpy(keybuf, password, MIN(len, sizeof(keybuf)));
+    g_free(password);
+
+    block->niv = qcrypto_cipher_get_iv_len(QCRYPTO_CIPHER_ALG_AES_128,
+                                           QCRYPTO_CIPHER_MODE_CBC);
+    block->ivgen = qcrypto_ivgen_new(QCRYPTO_IVGEN_ALG_PLAIN64,
+                                     0, 0, NULL, 0, errp);
+    if (!block->ivgen) {
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    block->cipher = qcrypto_cipher_new(QCRYPTO_CIPHER_ALG_AES_128,
+                                       QCRYPTO_CIPHER_MODE_CBC,
+                                       keybuf, G_N_ELEMENTS(keybuf),
+                                       errp);
+    if (!block->cipher) {
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    block->payload_offset = 0;
+
+    return 0;
+
+ fail:
+    qcrypto_cipher_free(block->cipher);
+    qcrypto_ivgen_free(block->ivgen);
+    return ret;
+}
+
+
+static int
+qcrypto_block_qcow_open(QCryptoBlock *block,
+                        QCryptoBlockOpenOptions *options,
+                        QCryptoBlockReadFunc readfunc G_GNUC_UNUSED,
+                        void *opaque G_GNUC_UNUSED,
+                        unsigned int flags,
+                        Error **errp)
+{
+    if (flags & QCRYPTO_BLOCK_OPEN_NO_IO) {
+        return 0;
+    } else {
+        if (!options->u.qcow.key_secret) {
+            error_setg(errp,
+                       "Parameter 'key-secret' is required for cipher");
+            return -1;
+        }
+        return qcrypto_block_qcow_init(block,
+                                       options->u.qcow.key_secret, errp);
+    }
+}
+
+
+static int
+qcrypto_block_qcow_create(QCryptoBlock *block,
+                          QCryptoBlockCreateOptions *options,
+                          QCryptoBlockInitFunc initfunc G_GNUC_UNUSED,
+                          QCryptoBlockWriteFunc writefunc G_GNUC_UNUSED,
+                          void *opaque G_GNUC_UNUSED,
+                          Error **errp)
+{
+    if (!options->u.qcow.key_secret) {
+        error_setg(errp, "Parameter 'key-secret' is required for cipher");
+        return -1;
+    }
+    /* QCow2 has no special header, since everything is hardwired */
+    return qcrypto_block_qcow_init(block, options->u.qcow.key_secret, errp);
+}
+
+
+static void
+qcrypto_block_qcow_cleanup(QCryptoBlock *block)
+{
+}
+
+
+static int
+qcrypto_block_qcow_decrypt(QCryptoBlock *block,
+                           uint64_t startsector,
+                           uint8_t *buf,
+                           size_t len,
+                           Error **errp)
+{
+    return qcrypto_block_decrypt_helper(block->cipher,
+                                        block->niv, block->ivgen,
+                                        QCRYPTO_BLOCK_QCOW_SECTOR_SIZE,
+                                        startsector, buf, len, errp);
+}
+
+
+static int
+qcrypto_block_qcow_encrypt(QCryptoBlock *block,
+                           uint64_t startsector,
+                           uint8_t *buf,
+                           size_t len,
+                           Error **errp)
+{
+    return qcrypto_block_encrypt_helper(block->cipher,
+                                        block->niv, block->ivgen,
+                                        QCRYPTO_BLOCK_QCOW_SECTOR_SIZE,
+                                        startsector, buf, len, errp);
+}
+
+
+const QCryptoBlockDriver qcrypto_block_driver_qcow = {
+    .open = qcrypto_block_qcow_open,
+    .create = qcrypto_block_qcow_create,
+    .cleanup = qcrypto_block_qcow_cleanup,
+    .decrypt = qcrypto_block_qcow_decrypt,
+    .encrypt = qcrypto_block_qcow_encrypt,
+    .has_format = qcrypto_block_qcow_has_format,
+};
--- a/crypto/block-qcow.h
+++ b/crypto/block-qcow.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block device encryption QCow/QCow2 AES-CBC format
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef QCRYPTO_BLOCK_QCOW_H__
+#define QCRYPTO_BLOCK_QCOW_H__
+
+#include "crypto/blockpriv.h"
+
+extern const QCryptoBlockDriver qcrypto_block_driver_qcow;
+
+#endif /* QCRYPTO_BLOCK_QCOW_H__ */
--- a/crypto/block.c
+++ b/crypto/block.c
@@ -0,0 +1,260 @@
+/*
+ * QEMU Crypto block device encryption
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/blockpriv.h"
+#include "crypto/block-qcow.h"
+#include "crypto/block-luks.h"
+
+static const QCryptoBlockDriver *qcrypto_block_drivers[] = {
+    [Q_CRYPTO_BLOCK_FORMAT_QCOW] = &qcrypto_block_driver_qcow,
+    [Q_CRYPTO_BLOCK_FORMAT_LUKS] = &qcrypto_block_driver_luks,
+};
+
+
+bool qcrypto_block_has_format(QCryptoBlockFormat format,
+                              const uint8_t *buf,
+                              size_t len)
+{
+    const QCryptoBlockDriver *driver;
+
+    if (format >= G_N_ELEMENTS(qcrypto_block_drivers) ||
+        !qcrypto_block_drivers[format]) {
+        return false;
+    }
+
+    driver = qcrypto_block_drivers[format];
+
+    return driver->has_format(buf, len);
+}
+
+
+QCryptoBlock *qcrypto_block_open(QCryptoBlockOpenOptions *options,
+                                 QCryptoBlockReadFunc readfunc,
+                                 void *opaque,
+                                 unsigned int flags,
+                                 Error **errp)
+{
+    QCryptoBlock *block = g_new0(QCryptoBlock, 1);
+
+    block->format = options->format;
+
+    if (options->format >= G_N_ELEMENTS(qcrypto_block_drivers) ||
+        !qcrypto_block_drivers[options->format]) {
+        error_setg(errp, "Unsupported block driver %d", options->format);
+        g_free(block);
+        return NULL;
+    }
+
+    block->driver = qcrypto_block_drivers[options->format];
+
+    if (block->driver->open(block, options,
+                            readfunc, opaque, flags, errp) < 0) {
+        g_free(block);
+        return NULL;
+    }
+
+    return block;
+}
+
+
+QCryptoBlock *qcrypto_block_create(QCryptoBlockCreateOptions *options,
+                                   QCryptoBlockInitFunc initfunc,
+                                   QCryptoBlockWriteFunc writefunc,
+                                   void *opaque,
+                                   Error **errp)
+{
+    QCryptoBlock *block = g_new0(QCryptoBlock, 1);
+
+    block->format = options->format;
+
+    if (options->format >= G_N_ELEMENTS(qcrypto_block_drivers) ||
+        !qcrypto_block_drivers[options->format]) {
+        error_setg(errp, "Unsupported block driver %d", options->format);
+        g_free(block);
+        return NULL;
+    }
+
+    block->driver = qcrypto_block_drivers[options->format];
+
+    if (block->driver->create(block, options, initfunc,
+                              writefunc, opaque, errp) < 0) {
+        g_free(block);
+        return NULL;
+    }
+
+    return block;
+}
+
+
+int qcrypto_block_decrypt(QCryptoBlock *block,
+                          uint64_t startsector,
+                          uint8_t *buf,
+                          size_t len,
+                          Error **errp)
+{
+    return block->driver->decrypt(block, startsector, buf, len, errp);
+}
+
+
+int qcrypto_block_encrypt(QCryptoBlock *block,
+                          uint64_t startsector,
+                          uint8_t *buf,
+                          size_t len,
+                          Error **errp)
+{
+    return block->driver->encrypt(block, startsector, buf, len, errp);
+}
+
+
+QCryptoCipher *qcrypto_block_get_cipher(QCryptoBlock *block)
+{
+    return block->cipher;
+}
+
+
+QCryptoIVGen *qcrypto_block_get_ivgen(QCryptoBlock *block)
+{
+    return block->ivgen;
+}
+
+
+QCryptoHashAlgorithm qcrypto_block_get_kdf_hash(QCryptoBlock *block)
+{
+    return block->kdfhash;
+}
+
+
+uint64_t qcrypto_block_get_payload_offset(QCryptoBlock *block)
+{
+    return block->payload_offset;
+}
+
+
+void qcrypto_block_free(QCryptoBlock *block)
+{
+    if (!block) {
+        return;
+    }
+
+    block->driver->cleanup(block);
+
+    qcrypto_cipher_free(block->cipher);
+    qcrypto_ivgen_free(block->ivgen);
+    g_free(block);
+}
+
+
+int qcrypto_block_decrypt_helper(QCryptoCipher *cipher,
+                                 size_t niv,
+                                 QCryptoIVGen *ivgen,
+                                 int sectorsize,
+                                 uint64_t startsector,
+                                 uint8_t *buf,
+                                 size_t len,
+                                 Error **errp)
+{
+    uint8_t *iv;
+    int ret = -1;
+
+    iv = niv ? g_new0(uint8_t, niv) : NULL;
+
+    while (len > 0) {
+        size_t nbytes;
+        if (niv) {
+            if (qcrypto_ivgen_calculate(ivgen,
+                                        startsector,
+                                        iv, niv,
+                                        errp) < 0) {
+                goto cleanup;
+            }
+
+            if (qcrypto_cipher_setiv(cipher,
+                                     iv, niv,
+                                     errp) < 0) {
+                goto cleanup;
+            }
+        }
+
+        nbytes = len > sectorsize ? sectorsize : len;
+        if (qcrypto_cipher_decrypt(cipher, buf, buf,
+                                   nbytes, errp) < 0) {
+            goto cleanup;
+        }
+
+        startsector++;
+        buf += nbytes;
+        len -= nbytes;
+    }
+
+    ret = 0;
+ cleanup:
+    g_free(iv);
+    return ret;
+}
+
+
+int qcrypto_block_encrypt_helper(QCryptoCipher *cipher,
+                                 size_t niv,
+                                 QCryptoIVGen *ivgen,
+                                 int sectorsize,
+                                 uint64_t startsector,
+                                 uint8_t *buf,
+                                 size_t len,
+                                 Error **errp)
+{
+    uint8_t *iv;
+    int ret = -1;
+
+    iv = niv ? g_new0(uint8_t, niv) : NULL;
+
+    while (len > 0) {
+        size_t nbytes;
+        if (niv) {
+            if (qcrypto_ivgen_calculate(ivgen,
+                                        startsector,
+                                        iv, niv,
+                                        errp) < 0) {
+                goto cleanup;
+            }
+
+            if (qcrypto_cipher_setiv(cipher,
+                                     iv, niv,
+                                     errp) < 0) {
+                goto cleanup;
+            }
+        }
+
+        nbytes = len > sectorsize ? sectorsize : len;
+        if (qcrypto_cipher_encrypt(cipher, buf, buf,
+                                   nbytes, errp) < 0) {
+            goto cleanup;
+        }
+
+        startsector++;
+        buf += nbytes;
+        len -= nbytes;
+    }
+
+    ret = 0;
+ cleanup:
+    g_free(iv);
+    return ret;
+}
--- a/crypto/blockpriv.h
+++ b/crypto/blockpriv.h
@@ -0,0 +1,92 @@
+/*
+ * QEMU Crypto block device encryption
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef QCRYPTO_BLOCK_PRIV_H__
+#define QCRYPTO_BLOCK_PRIV_H__
+
+#include "crypto/block.h"
+
+typedef struct QCryptoBlockDriver QCryptoBlockDriver;
+
+struct QCryptoBlock {
+    QCryptoBlockFormat format;
+
+    const QCryptoBlockDriver *driver;
+    void *opaque;
+
+    QCryptoCipher *cipher;
+    QCryptoIVGen *ivgen;
+    QCryptoHashAlgorithm kdfhash;
+    size_t niv;
+    uint64_t payload_offset; /* In bytes */
+};
+
+struct QCryptoBlockDriver {
+    int (*open)(QCryptoBlock *block,
+                QCryptoBlockOpenOptions *options,
+                QCryptoBlockReadFunc readfunc,
+                void *opaque,
+                unsigned int flags,
+                Error **errp);
+
+    int (*create)(QCryptoBlock *block,
+                  QCryptoBlockCreateOptions *options,
+                  QCryptoBlockInitFunc initfunc,
+                  QCryptoBlockWriteFunc writefunc,
+                  void *opaque,
+                  Error **errp);
+
+    void (*cleanup)(QCryptoBlock *block);
+
+    int (*encrypt)(QCryptoBlock *block,
+                   uint64_t startsector,
+                   uint8_t *buf,
+                   size_t len,
+                   Error **errp);
+    int (*decrypt)(QCryptoBlock *block,
+                   uint64_t startsector,
+                   uint8_t *buf,
+                   size_t len,
+                   Error **errp);
+
+    bool (*has_format)(const uint8_t *buf,
+                       size_t buflen);
+};
+
+
+int qcrypto_block_decrypt_helper(QCryptoCipher *cipher,
+                                 size_t niv,
+                                 QCryptoIVGen *ivgen,
+                                 int sectorsize,
+                                 uint64_t startsector,
+                                 uint8_t *buf,
+                                 size_t len,
+                                 Error **errp);
+
+int qcrypto_block_encrypt_helper(QCryptoCipher *cipher,
+                                 size_t niv,
+                                 QCryptoIVGen *ivgen,
+                                 int sectorsize,
+                                 uint64_t startsector,
+                                 uint8_t *buf,
+                                 size_t len,
+                                 Error **errp);
+
+#endif /* QCRYPTO_BLOCK_PRIV_H__ */
--- a/crypto/cipher-builtin.c
+++ b/crypto/cipher-builtin.c
@@ -21,11 +21,17 @@
 #include "qemu/osdep.h"
 #include "crypto/aes.h"
 #include "crypto/desrfb.h"
+#include "crypto/xts.h"

+typedef struct QCryptoCipherBuiltinAESContext QCryptoCipherBuiltinAESContext;
+struct QCryptoCipherBuiltinAESContext {
+    AES_KEY enc;
+    AES_KEY dec;
+};
 typedef struct QCryptoCipherBuiltinAES QCryptoCipherBuiltinAES;
 struct QCryptoCipherBuiltinAES {
-    AES_KEY encrypt_key;
-    AES_KEY decrypt_key;
+    QCryptoCipherBuiltinAESContext key;
+    QCryptoCipherBuiltinAESContext key_tweak;
    uint8_t iv[AES_BLOCK_SIZE];
 };
 typedef struct QCryptoCipherBuiltinDESRFB QCryptoCipherBuiltinDESRFB;
@@ -67,6 +73,82 @@ static void qcrypto_cipher_free_aes(QCryptoCipher *cipher)
 }


+static void qcrypto_cipher_aes_ecb_encrypt(AES_KEY *key,
+                                           const void *in,
+                                           void *out,
+                                           size_t len)
+{
+    const uint8_t *inptr = in;
+    uint8_t *outptr = out;
+    while (len) {
+        if (len > AES_BLOCK_SIZE) {
+            AES_encrypt(inptr, outptr, key);
+            inptr += AES_BLOCK_SIZE;
+            outptr += AES_BLOCK_SIZE;
+            len -= AES_BLOCK_SIZE;
+        } else {
+            uint8_t tmp1[AES_BLOCK_SIZE], tmp2[AES_BLOCK_SIZE];
+            memcpy(tmp1, inptr, len);
+            /* Fill with 0 to avoid valgrind uninitialized reads */
+            memset(tmp1 + len, 0, sizeof(tmp1) - len);
+            AES_encrypt(tmp1, tmp2, key);
+            memcpy(outptr, tmp2, len);
+            len = 0;
+        }
+    }
+}
+
+
+static void qcrypto_cipher_aes_ecb_decrypt(AES_KEY *key,
+                                           const void *in,
+                                           void *out,
+                                           size_t len)
+{
+    const uint8_t *inptr = in;
+    uint8_t *outptr = out;
+    while (len) {
+        if (len > AES_BLOCK_SIZE) {
+            AES_decrypt(inptr, outptr, key);
+            inptr += AES_BLOCK_SIZE;
+            outptr += AES_BLOCK_SIZE;
+            len -= AES_BLOCK_SIZE;
+        } else {
+            uint8_t tmp1[AES_BLOCK_SIZE], tmp2[AES_BLOCK_SIZE];
+            memcpy(tmp1, inptr, len);
+            /* Fill with 0 to avoid valgrind uninitialized reads */
+            memset(tmp1 + len, 0, sizeof(tmp1) - len);
+            AES_decrypt(tmp1, tmp2, key);
+            memcpy(outptr, tmp2, len);
+            len = 0;
+        }
+    }
+}
+
+
+static void qcrypto_cipher_aes_xts_encrypt(const void *ctx,
+                                           size_t length,
+                                           uint8_t *dst,
+                                           const uint8_t *src)
+{
+    const QCryptoCipherBuiltinAESContext *aesctx = ctx;
+
+    qcrypto_cipher_aes_ecb_encrypt((AES_KEY *)&aesctx->enc,
+                                   src, dst, length);
+}
+
+
+static void qcrypto_cipher_aes_xts_decrypt(const void *ctx,
+                                           size_t length,
+                                           uint8_t *dst,
+                                           const uint8_t *src)
+{
+    const QCryptoCipherBuiltinAESContext *aesctx = ctx;
+
+    qcrypto_cipher_aes_ecb_decrypt((AES_KEY *)&aesctx->dec,
+                                   src, dst, length);
+}
+
+
 static int qcrypto_cipher_encrypt_aes(QCryptoCipher *cipher,
                                      const void *in,
                                      void *out,
@@ -75,29 +157,26 @@ static int qcrypto_cipher_encrypt_aes(QCryptoCipher *cipher,
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

-    if (cipher->mode == QCRYPTO_CIPHER_MODE_ECB) {
-        const uint8_t *inptr = in;
-        uint8_t *outptr = out;
-        while (len) {
-            if (len > AES_BLOCK_SIZE) {
-                AES_encrypt(inptr, outptr, &ctxt->state.aes.encrypt_key);
-                inptr += AES_BLOCK_SIZE;
-                outptr += AES_BLOCK_SIZE;
-                len -= AES_BLOCK_SIZE;
-            } else {
-                uint8_t tmp1[AES_BLOCK_SIZE], tmp2[AES_BLOCK_SIZE];
-                memcpy(tmp1, inptr, len);
-                /* Fill with 0 to avoid valgrind uninitialized reads */
-                memset(tmp1 + len, 0, sizeof(tmp1) - len);
-                AES_encrypt(tmp1, tmp2, &ctxt->state.aes.encrypt_key);
-                memcpy(outptr, tmp2, len);
-                len = 0;
-            }
-        }
-    } else {
+    switch (cipher->mode) {
+    case QCRYPTO_CIPHER_MODE_ECB:
+        qcrypto_cipher_aes_ecb_encrypt(&ctxt->state.aes.key.enc,
+                                       in, out, len);
+        break;
+    case QCRYPTO_CIPHER_MODE_CBC:
        AES_cbc_encrypt(in, out, len,
-                        &ctxt->state.aes.encrypt_key,
+                        &ctxt->state.aes.key.enc,
                        ctxt->state.aes.iv, 1);
+        break;
+    case QCRYPTO_CIPHER_MODE_XTS:
+        xts_encrypt(&ctxt->state.aes.key,
+                    &ctxt->state.aes.key_tweak,
+                    qcrypto_cipher_aes_xts_encrypt,
+                    qcrypto_cipher_aes_xts_decrypt,
+                    ctxt->state.aes.iv,
+                    len, out, in);
+        break;
+    default:
+        g_assert_not_reached();
    }

    return 0;
@@ -112,29 +191,26 @@ static int qcrypto_cipher_decrypt_aes(QCryptoCipher *cipher,
 {
    QCryptoCipherBuiltin *ctxt = cipher->opaque;

-    if (cipher->mode == QCRYPTO_CIPHER_MODE_ECB) {
-        const uint8_t *inptr = in;
-        uint8_t *outptr = out;
-        while (len) {
-            if (len > AES_BLOCK_SIZE) {
-                AES_decrypt(inptr, outptr, &ctxt->state.aes.decrypt_key);
-                inptr += AES_BLOCK_SIZE;
-                outptr += AES_BLOCK_SIZE;
-                len -= AES_BLOCK_SIZE;
-            } else {
-                uint8_t tmp1[AES_BLOCK_SIZE], tmp2[AES_BLOCK_SIZE];
-                memcpy(tmp1, inptr, len);
-                /* Fill with 0 to avoid valgrind uninitialized reads */
-                memset(tmp1 + len, 0, sizeof(tmp1) - len);
-                AES_decrypt(tmp1, tmp2, &ctxt->state.aes.decrypt_key);
-                memcpy(outptr, tmp2, len);
-                len = 0;
-            }
-        }
-    } else {
+    switch (cipher->mode) {
+    case QCRYPTO_CIPHER_MODE_ECB:
+        qcrypto_cipher_aes_ecb_decrypt(&ctxt->state.aes.key.dec,
+                                       in, out, len);
+        break;
+    case QCRYPTO_CIPHER_MODE_CBC:
        AES_cbc_encrypt(in, out, len,
-                        &ctxt->state.aes.decrypt_key,
+                        &ctxt->state.aes.key.dec,
                        ctxt->state.aes.iv, 0);
+        break;
+    case QCRYPTO_CIPHER_MODE_XTS:
+        xts_decrypt(&ctxt->state.aes.key,
+                    &ctxt->state.aes.key_tweak,
+                    qcrypto_cipher_aes_xts_encrypt,
+                    qcrypto_cipher_aes_xts_decrypt,
+                    ctxt->state.aes.iv,
+                    len, out, in);
+        break;
+    default:
+        g_assert_not_reached();
    }

    return 0;
@@ -166,21 +242,46 @@ static int qcrypto_cipher_init_aes(QCryptoCipher *cipher,
    QCryptoCipherBuiltin *ctxt;

    if (cipher->mode != QCRYPTO_CIPHER_MODE_CBC &&
-        cipher->mode != QCRYPTO_CIPHER_MODE_ECB) {
+        cipher->mode != QCRYPTO_CIPHER_MODE_ECB &&
+        cipher->mode != QCRYPTO_CIPHER_MODE_XTS) {
        error_setg(errp, "Unsupported cipher mode %d", cipher->mode);
        return -1;
    }

    ctxt = g_new0(QCryptoCipherBuiltin, 1);

-    if (AES_set_encrypt_key(key, nkey * 8, &ctxt->state.aes.encrypt_key) != 0) {
-        error_setg(errp, "Failed to set encryption key");
-        goto error;
-    }
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        if (AES_set_encrypt_key(key, nkey * 4, &ctxt->state.aes.key.enc) != 0) {
+            error_setg(errp, "Failed to set encryption key");
+            goto error;
+        }

-    if (AES_set_decrypt_key(key, nkey * 8, &ctxt->state.aes.decrypt_key) != 0) {
-        error_setg(errp, "Failed to set decryption key");
-        goto error;
+        if (AES_set_decrypt_key(key, nkey * 4, &ctxt->state.aes.key.dec) != 0) {
+            error_setg(errp, "Failed to set decryption key");
+            goto error;
+        }
+
+        if (AES_set_encrypt_key(key + (nkey / 2), nkey * 4,
+                                &ctxt->state.aes.key_tweak.enc) != 0) {
+            error_setg(errp, "Failed to set encryption key");
+            goto error;
+        }
+
+        if (AES_set_decrypt_key(key + (nkey / 2), nkey * 4,
+                                &ctxt->state.aes.key_tweak.dec) != 0) {
+            error_setg(errp, "Failed to set decryption key");
+            goto error;
+        }
+    } else {
+        if (AES_set_encrypt_key(key, nkey * 8, &ctxt->state.aes.key.enc) != 0) {
+            error_setg(errp, "Failed to set encryption key");
+            goto error;
+        }
+
+        if (AES_set_decrypt_key(key, nkey * 8, &ctxt->state.aes.key.dec) != 0) {
+            error_setg(errp, "Failed to set decryption key");
+            goto error;
+        }
    }

    ctxt->blocksize = AES_BLOCK_SIZE;
@@ -322,7 +423,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    cipher->alg = alg;
    cipher->mode = mode;

-    if (!qcrypto_cipher_validate_key_length(alg, nkey, errp)) {
+    if (!qcrypto_cipher_validate_key_length(alg, mode, nkey, errp)) {
        goto error;
    }

--- a/crypto/cipher-gcrypt.c
+++ b/crypto/cipher-gcrypt.c
@@ -19,6 +19,8 @@
 */

 #include "qemu/osdep.h"
+#include "crypto/xts.h"
+
 #include <gcrypt.h>


@@ -29,6 +31,12 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
    case QCRYPTO_CIPHER_ALG_AES_128:
    case QCRYPTO_CIPHER_ALG_AES_192:
    case QCRYPTO_CIPHER_ALG_AES_256:
+    case QCRYPTO_CIPHER_ALG_CAST5_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_192:
+    case QCRYPTO_CIPHER_ALG_SERPENT_256:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
        return true;
    default:
        return false;
@@ -38,7 +46,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
 typedef struct QCryptoCipherGcrypt QCryptoCipherGcrypt;
 struct QCryptoCipherGcrypt {
    gcry_cipher_hd_t handle;
+    gcry_cipher_hd_t tweakhandle;
    size_t blocksize;
+    uint8_t *iv;
 };

 QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
@@ -53,6 +63,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,

    switch (mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
+    case QCRYPTO_CIPHER_MODE_XTS:
        gcrymode = GCRY_CIPHER_MODE_ECB;
        break;
    case QCRYPTO_CIPHER_MODE_CBC:
@@ -63,7 +74,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        return NULL;
    }

-    if (!qcrypto_cipher_validate_key_length(alg, nkey, errp)) {
+    if (!qcrypto_cipher_validate_key_length(alg, mode, nkey, errp)) {
        return NULL;
    }

@@ -84,6 +95,30 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        gcryalg = GCRY_CIPHER_AES256;
        break;

+    case QCRYPTO_CIPHER_ALG_CAST5_128:
+        gcryalg = GCRY_CIPHER_CAST5;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_SERPENT_128:
+        gcryalg = GCRY_CIPHER_SERPENT128;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_SERPENT_192:
+        gcryalg = GCRY_CIPHER_SERPENT192;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_SERPENT_256:
+        gcryalg = GCRY_CIPHER_SERPENT256;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+        gcryalg = GCRY_CIPHER_TWOFISH128;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
+        gcryalg = GCRY_CIPHER_TWOFISH;
+        break;
+
    default:
        error_setg(errp, "Unsupported cipher algorithm %d", alg);
        return NULL;
@@ -101,6 +136,14 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
                   gcry_strerror(err));
        goto error;
    }
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        err = gcry_cipher_open(&ctx->tweakhandle, gcryalg, gcrymode, 0);
+        if (err != 0) {
+            error_setg(errp, "Cannot initialize cipher: %s",
+                       gcry_strerror(err));
+            goto error;
+        }
+    }

    if (cipher->alg == QCRYPTO_CIPHER_ALG_DES_RFB) {
        /* We're using standard DES cipher from gcrypt, so we need
@@ -112,13 +155,44 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
        g_free(rfbkey);
        ctx->blocksize = 8;
    } else {
-        err = gcry_cipher_setkey(ctx->handle, key, nkey);
-        ctx->blocksize = 16;
+        if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+            nkey /= 2;
+            err = gcry_cipher_setkey(ctx->handle, key, nkey);
+            if (err != 0) {
+                error_setg(errp, "Cannot set key: %s",
+                           gcry_strerror(err));
+                goto error;
+            }
+            err = gcry_cipher_setkey(ctx->tweakhandle, key + nkey, nkey);
+        } else {
+            err = gcry_cipher_setkey(ctx->handle, key, nkey);
+        }
+        if (err != 0) {
+            error_setg(errp, "Cannot set key: %s",
+                       gcry_strerror(err));
+            goto error;
+        }
+        switch (cipher->alg) {
+        case QCRYPTO_CIPHER_ALG_AES_128:
+        case QCRYPTO_CIPHER_ALG_AES_192:
+        case QCRYPTO_CIPHER_ALG_AES_256:
+        case QCRYPTO_CIPHER_ALG_SERPENT_128:
+        case QCRYPTO_CIPHER_ALG_SERPENT_192:
+        case QCRYPTO_CIPHER_ALG_SERPENT_256:
+        case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+        case QCRYPTO_CIPHER_ALG_TWOFISH_256:
+            ctx->blocksize = 16;
+            break;
+        case QCRYPTO_CIPHER_ALG_CAST5_128:
+            ctx->blocksize = 8;
+            break;
+        default:
+            g_assert_not_reached();
+        }
    }
-    if (err != 0) {
-        error_setg(errp, "Cannot set key: %s",
-                   gcry_strerror(err));
-        goto error;
+
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        ctx->iv = g_new0(uint8_t, ctx->blocksize);
    }

    cipher->opaque = ctx;
@@ -126,6 +200,9 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,

 error:
    gcry_cipher_close(ctx->handle);
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        gcry_cipher_close(ctx->tweakhandle);
+    }
    g_free(ctx);
    g_free(cipher);
    return NULL;
@@ -140,11 +217,35 @@ void qcrypto_cipher_free(QCryptoCipher *cipher)
    }
    ctx = cipher->opaque;
    gcry_cipher_close(ctx->handle);
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        gcry_cipher_close(ctx->tweakhandle);
+    }
+    g_free(ctx->iv);
    g_free(ctx);
    g_free(cipher);
 }


+static void qcrypto_gcrypt_xts_encrypt(const void *ctx,
+                                       size_t length,
+                                       uint8_t *dst,
+                                       const uint8_t *src)
+{
+    gcry_error_t err;
+    err = gcry_cipher_encrypt((gcry_cipher_hd_t)ctx, dst, length, src, length);
+    g_assert(err == 0);
+}
+
+static void qcrypto_gcrypt_xts_decrypt(const void *ctx,
+                                       size_t length,
+                                       uint8_t *dst,
+                                       const uint8_t *src)
+{
+    gcry_error_t err;
+    err = gcry_cipher_decrypt((gcry_cipher_hd_t)ctx, dst, length, src, length);
+    g_assert(err == 0);
+}
+
 int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
                           const void *in,
                           void *out,
@@ -160,13 +261,20 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
        return -1;
    }

-    err = gcry_cipher_encrypt(ctx->handle,
-                              out, len,
-                              in, len);
-    if (err != 0) {
-        error_setg(errp, "Cannot encrypt data: %s",
-                   gcry_strerror(err));
-        return -1;
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        xts_encrypt(ctx->handle, ctx->tweakhandle,
+                    qcrypto_gcrypt_xts_encrypt,
+                    qcrypto_gcrypt_xts_decrypt,
+                    ctx->iv, len, out, in);
+    } else {
+        err = gcry_cipher_encrypt(ctx->handle,
+                                  out, len,
+                                  in, len);
+        if (err != 0) {
+            error_setg(errp, "Cannot encrypt data: %s",
+                       gcry_strerror(err));
+            return -1;
+        }
    }

    return 0;
@@ -188,13 +296,20 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,
        return -1;
    }

-    err = gcry_cipher_decrypt(ctx->handle,
-                              out, len,
-                              in, len);
-    if (err != 0) {
-        error_setg(errp, "Cannot decrypt data: %s",
-                   gcry_strerror(err));
-        return -1;
+    if (cipher->mode == QCRYPTO_CIPHER_MODE_XTS) {
+        xts_decrypt(ctx->handle, ctx->tweakhandle,
+                    qcrypto_gcrypt_xts_encrypt,
+                    qcrypto_gcrypt_xts_decrypt,
+                    ctx->iv, len, out, in);
+    } else {
+        err = gcry_cipher_decrypt(ctx->handle,
+                                  out, len,
+                                  in, len);
+        if (err != 0) {
+            error_setg(errp, "Cannot decrypt data: %s",
+                       gcry_strerror(err));
+            return -1;
+        }
    }

    return 0;
@@ -213,12 +328,16 @@ int qcrypto_cipher_setiv(QCryptoCipher *cipher,
        return -1;
    }

-    gcry_cipher_reset(ctx->handle);
-    err = gcry_cipher_setiv(ctx->handle, iv, niv);
-    if (err != 0) {
-        error_setg(errp, "Cannot set IV: %s",
+    if (ctx->iv) {
+        memcpy(ctx->iv, iv, niv);
+    } else {
+        gcry_cipher_reset(ctx->handle);
+        err = gcry_cipher_setiv(ctx->handle, iv, niv);
+        if (err != 0) {
+            error_setg(errp, "Cannot set IV: %s",
                   gcry_strerror(err));
-        return -1;
+            return -1;
+        }
    }

    return 0;
--- a/crypto/cipher-nettle.c
+++ b/crypto/cipher-nettle.c
@@ -19,56 +19,174 @@
 */

 #include "qemu/osdep.h"
+#include "crypto/xts.h"
+
 #include <nettle/nettle-types.h>
 #include <nettle/aes.h>
 #include <nettle/des.h>
 #include <nettle/cbc.h>
+#include <nettle/cast128.h>
+#include <nettle/serpent.h>
+#include <nettle/twofish.h>
+
+typedef void (*QCryptoCipherNettleFuncWrapper)(const void *ctx,
+                                               size_t length,
+                                               uint8_t *dst,
+                                               const uint8_t *src);

 #if CONFIG_NETTLE_VERSION_MAJOR < 3
-typedef nettle_crypt_func nettle_cipher_func;
-
+typedef nettle_crypt_func * QCryptoCipherNettleFuncNative;
 typedef void *       cipher_ctx_t;
 typedef unsigned     cipher_length_t;
+
+#define cast5_set_key cast128_set_key
 #else
+typedef nettle_cipher_func * QCryptoCipherNettleFuncNative;
 typedef const void * cipher_ctx_t;
 typedef size_t       cipher_length_t;
 #endif

-static nettle_cipher_func aes_encrypt_wrapper;
-static nettle_cipher_func aes_decrypt_wrapper;
-static nettle_cipher_func des_encrypt_wrapper;
-static nettle_cipher_func des_decrypt_wrapper;
+typedef struct QCryptoNettleAES {
+    struct aes_ctx enc;
+    struct aes_ctx dec;
+} QCryptoNettleAES;

-static void aes_encrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
-                                uint8_t *dst, const uint8_t *src)
+static void aes_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                               uint8_t *dst, const uint8_t *src)
 {
-    aes_encrypt(ctx, length, dst, src);
+    const QCryptoNettleAES *aesctx = ctx;
+    aes_encrypt(&aesctx->enc, length, dst, src);
 }

-static void aes_decrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
-                                uint8_t *dst, const uint8_t *src)
+static void aes_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                               uint8_t *dst, const uint8_t *src)
 {
-    aes_decrypt(ctx, length, dst, src);
+    const QCryptoNettleAES *aesctx = ctx;
+    aes_decrypt(&aesctx->dec, length, dst, src);
 }

-static void des_encrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
+static void des_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                               uint8_t *dst, const uint8_t *src)
+{
+    des_encrypt(ctx, length, dst, src);
+}
+
+static void des_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                               uint8_t *dst, const uint8_t *src)
+{
+    des_decrypt(ctx, length, dst, src);
+}
+
+static void cast128_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    cast128_encrypt(ctx, length, dst, src);
+}
+
+static void cast128_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    cast128_decrypt(ctx, length, dst, src);
+}
+
+static void serpent_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    serpent_encrypt(ctx, length, dst, src);
+}
+
+static void serpent_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    serpent_decrypt(ctx, length, dst, src);
+}
+
+static void twofish_encrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    twofish_encrypt(ctx, length, dst, src);
+}
+
+static void twofish_decrypt_native(cipher_ctx_t ctx, cipher_length_t length,
+                                   uint8_t *dst, const uint8_t *src)
+{
+    twofish_decrypt(ctx, length, dst, src);
+}
+
+static void aes_encrypt_wrapper(const void *ctx, size_t length,
+                                uint8_t *dst, const uint8_t *src)
+{
+    const QCryptoNettleAES *aesctx = ctx;
+    aes_encrypt(&aesctx->enc, length, dst, src);
+}
+
+static void aes_decrypt_wrapper(const void *ctx, size_t length,
+                                uint8_t *dst, const uint8_t *src)
+{
+    const QCryptoNettleAES *aesctx = ctx;
+    aes_decrypt(&aesctx->dec, length, dst, src);
+}
+
+static void des_encrypt_wrapper(const void *ctx, size_t length,
                                uint8_t *dst, const uint8_t *src)
 {
    des_encrypt(ctx, length, dst, src);
 }

-static void des_decrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
+static void des_decrypt_wrapper(const void *ctx, size_t length,
                                uint8_t *dst, const uint8_t *src)
 {
    des_decrypt(ctx, length, dst, src);
 }

+static void cast128_encrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    cast128_encrypt(ctx, length, dst, src);
+}
+
+static void cast128_decrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    cast128_decrypt(ctx, length, dst, src);
+}
+
+static void serpent_encrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    serpent_encrypt(ctx, length, dst, src);
+}
+
+static void serpent_decrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    serpent_decrypt(ctx, length, dst, src);
+}
+
+static void twofish_encrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    twofish_encrypt(ctx, length, dst, src);
+}
+
+static void twofish_decrypt_wrapper(const void *ctx, size_t length,
+                                    uint8_t *dst, const uint8_t *src)
+{
+    twofish_decrypt(ctx, length, dst, src);
+}
+
 typedef struct QCryptoCipherNettle QCryptoCipherNettle;
 struct QCryptoCipherNettle {
-    void *ctx_encrypt;
-    void *ctx_decrypt;
-    nettle_cipher_func *alg_encrypt;
-    nettle_cipher_func *alg_decrypt;
+    /* Primary cipher context for all modes */
+    void *ctx;
+    /* Second cipher context for XTS mode only */
+    void *ctx_tweak;
+    /* Cipher callbacks for both contexts */
+    QCryptoCipherNettleFuncNative alg_encrypt_native;
+    QCryptoCipherNettleFuncNative alg_decrypt_native;
+    QCryptoCipherNettleFuncWrapper alg_encrypt_wrapper;
+    QCryptoCipherNettleFuncWrapper alg_decrypt_wrapper;
+
    uint8_t *iv;
    size_t blocksize;
 };
@@ -80,6 +198,13 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
    case QCRYPTO_CIPHER_ALG_AES_128:
    case QCRYPTO_CIPHER_ALG_AES_192:
    case QCRYPTO_CIPHER_ALG_AES_256:
+    case QCRYPTO_CIPHER_ALG_CAST5_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_192:
+    case QCRYPTO_CIPHER_ALG_SERPENT_256:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_192:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
        return true;
    default:
        return false;
@@ -99,13 +224,14 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    switch (mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
    case QCRYPTO_CIPHER_MODE_CBC:
+    case QCRYPTO_CIPHER_MODE_XTS:
        break;
    default:
        error_setg(errp, "Unsupported cipher mode %d", mode);
        return NULL;
    }

-    if (!qcrypto_cipher_validate_key_length(alg, nkey, errp)) {
+    if (!qcrypto_cipher_validate_key_length(alg, mode, nkey, errp)) {
        return NULL;
    }

@@ -117,14 +243,15 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,

    switch (alg) {
    case QCRYPTO_CIPHER_ALG_DES_RFB:
-        ctx->ctx_encrypt = g_new0(struct des_ctx, 1);
-        ctx->ctx_decrypt = NULL; /* 1 ctx can do both */
+        ctx->ctx = g_new0(struct des_ctx, 1);
        rfbkey = qcrypto_cipher_munge_des_rfb_key(key, nkey);
-        des_set_key(ctx->ctx_encrypt, rfbkey);
+        des_set_key(ctx->ctx, rfbkey);
        g_free(rfbkey);

-        ctx->alg_encrypt = des_encrypt_wrapper;
-        ctx->alg_decrypt = des_decrypt_wrapper;
+        ctx->alg_encrypt_native = des_encrypt_native;
+        ctx->alg_decrypt_native = des_decrypt_native;
+        ctx->alg_encrypt_wrapper = des_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = des_decrypt_wrapper;

        ctx->blocksize = DES_BLOCK_SIZE;
        break;
@@ -132,17 +259,103 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    case QCRYPTO_CIPHER_ALG_AES_128:
    case QCRYPTO_CIPHER_ALG_AES_192:
    case QCRYPTO_CIPHER_ALG_AES_256:
-        ctx->ctx_encrypt = g_new0(struct aes_ctx, 1);
-        ctx->ctx_decrypt = g_new0(struct aes_ctx, 1);
+        ctx->ctx = g_new0(QCryptoNettleAES, 1);

-        aes_set_encrypt_key(ctx->ctx_encrypt, nkey, key);
-        aes_set_decrypt_key(ctx->ctx_decrypt, nkey, key);
+        if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+            ctx->ctx_tweak = g_new0(QCryptoNettleAES, 1);

-        ctx->alg_encrypt = aes_encrypt_wrapper;
-        ctx->alg_decrypt = aes_decrypt_wrapper;
+            nkey /= 2;
+            aes_set_encrypt_key(&((QCryptoNettleAES *)ctx->ctx)->enc,
+                                nkey, key);
+            aes_set_decrypt_key(&((QCryptoNettleAES *)ctx->ctx)->dec,
+                                nkey, key);
+
+            aes_set_encrypt_key(&((QCryptoNettleAES *)ctx->ctx_tweak)->enc,
+                                nkey, key + nkey);
+            aes_set_decrypt_key(&((QCryptoNettleAES *)ctx->ctx_tweak)->dec,
+                                nkey, key + nkey);
+        } else {
+            aes_set_encrypt_key(&((QCryptoNettleAES *)ctx->ctx)->enc,
+                                nkey, key);
+            aes_set_decrypt_key(&((QCryptoNettleAES *)ctx->ctx)->dec,
+                                nkey, key);
+        }
+
+        ctx->alg_encrypt_native = aes_encrypt_native;
+        ctx->alg_decrypt_native = aes_decrypt_native;
+        ctx->alg_encrypt_wrapper = aes_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = aes_decrypt_wrapper;

        ctx->blocksize = AES_BLOCK_SIZE;
        break;
+
+    case QCRYPTO_CIPHER_ALG_CAST5_128:
+        ctx->ctx = g_new0(struct cast128_ctx, 1);
+
+        if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+            ctx->ctx_tweak = g_new0(struct cast128_ctx, 1);
+
+            nkey /= 2;
+            cast5_set_key(ctx->ctx, nkey, key);
+            cast5_set_key(ctx->ctx_tweak, nkey, key + nkey);
+        } else {
+            cast5_set_key(ctx->ctx, nkey, key);
+        }
+
+        ctx->alg_encrypt_native = cast128_encrypt_native;
+        ctx->alg_decrypt_native = cast128_decrypt_native;
+        ctx->alg_encrypt_wrapper = cast128_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = cast128_decrypt_wrapper;
+
+        ctx->blocksize = CAST128_BLOCK_SIZE;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_SERPENT_128:
+    case QCRYPTO_CIPHER_ALG_SERPENT_192:
+    case QCRYPTO_CIPHER_ALG_SERPENT_256:
+        ctx->ctx = g_new0(struct serpent_ctx, 1);
+
+        if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+            ctx->ctx_tweak = g_new0(struct serpent_ctx, 1);
+
+            nkey /= 2;
+            serpent_set_key(ctx->ctx, nkey, key);
+            serpent_set_key(ctx->ctx_tweak, nkey, key + nkey);
+        } else {
+            serpent_set_key(ctx->ctx, nkey, key);
+        }
+
+        ctx->alg_encrypt_native = serpent_encrypt_native;
+        ctx->alg_decrypt_native = serpent_decrypt_native;
+        ctx->alg_encrypt_wrapper = serpent_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = serpent_decrypt_wrapper;
+
+        ctx->blocksize = SERPENT_BLOCK_SIZE;
+        break;
+
+    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_192:
+    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
+        ctx->ctx = g_new0(struct twofish_ctx, 1);
+
+        if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+            ctx->ctx_tweak = g_new0(struct twofish_ctx, 1);
+
+            nkey /= 2;
+            twofish_set_key(ctx->ctx, nkey, key);
+            twofish_set_key(ctx->ctx_tweak, nkey, key + nkey);
+        } else {
+            twofish_set_key(ctx->ctx, nkey, key);
+        }
+
+        ctx->alg_encrypt_native = twofish_encrypt_native;
+        ctx->alg_decrypt_native = twofish_decrypt_native;
+        ctx->alg_encrypt_wrapper = twofish_encrypt_wrapper;
+        ctx->alg_decrypt_wrapper = twofish_decrypt_wrapper;
+
+        ctx->blocksize = TWOFISH_BLOCK_SIZE;
+        break;
+
    default:
        error_setg(errp, "Unsupported cipher algorithm %d", alg);
        goto error;
@@ -170,8 +383,8 @@ void qcrypto_cipher_free(QCryptoCipher *cipher)

    ctx = cipher->opaque;
    g_free(ctx->iv);
-    g_free(ctx->ctx_encrypt);
-    g_free(ctx->ctx_decrypt);
+    g_free(ctx->ctx);
+    g_free(ctx->ctx_tweak);
    g_free(ctx);
    g_free(cipher);
 }
@@ -193,14 +406,21 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,

    switch (cipher->mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
-        ctx->alg_encrypt(ctx->ctx_encrypt, len, out, in);
+        ctx->alg_encrypt_wrapper(ctx->ctx, len, out, in);
        break;

    case QCRYPTO_CIPHER_MODE_CBC:
-        cbc_encrypt(ctx->ctx_encrypt, ctx->alg_encrypt,
+        cbc_encrypt(ctx->ctx, ctx->alg_encrypt_native,
                    ctx->blocksize, ctx->iv,
                    len, out, in);
        break;
+
+    case QCRYPTO_CIPHER_MODE_XTS:
+        xts_encrypt(ctx->ctx, ctx->ctx_tweak,
+                    ctx->alg_encrypt_wrapper, ctx->alg_encrypt_wrapper,
+                    ctx->iv, len, out, in);
+        break;
+
    default:
        error_setg(errp, "Unsupported cipher algorithm %d",
                   cipher->alg);
@@ -226,15 +446,26 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,

    switch (cipher->mode) {
    case QCRYPTO_CIPHER_MODE_ECB:
-        ctx->alg_decrypt(ctx->ctx_decrypt ? ctx->ctx_decrypt : ctx->ctx_encrypt,
-                         len, out, in);
+        ctx->alg_decrypt_wrapper(ctx->ctx, len, out, in);
        break;

    case QCRYPTO_CIPHER_MODE_CBC:
-        cbc_decrypt(ctx->ctx_decrypt ? ctx->ctx_decrypt : ctx->ctx_encrypt,
-                    ctx->alg_decrypt, ctx->blocksize, ctx->iv,
+        cbc_decrypt(ctx->ctx, ctx->alg_decrypt_native,
+                    ctx->blocksize, ctx->iv,
                    len, out, in);
        break;
+
+    case QCRYPTO_CIPHER_MODE_XTS:
+        if (ctx->blocksize != XTS_BLOCK_SIZE) {
+            error_setg(errp, "Block size must be %d not %zu",
+                       XTS_BLOCK_SIZE, ctx->blocksize);
+            return -1;
+        }
+        xts_decrypt(ctx->ctx, ctx->ctx_tweak,
+                    ctx->alg_encrypt_wrapper, ctx->alg_decrypt_wrapper,
+                    ctx->iv, len, out, in);
+        break;
+
    default:
        error_setg(errp, "Unsupported cipher algorithm %d",
                   cipher->alg);
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -27,6 +27,13 @@ static size_t alg_key_len[QCRYPTO_CIPHER_ALG__MAX] = {
    [QCRYPTO_CIPHER_ALG_AES_192] = 24,
    [QCRYPTO_CIPHER_ALG_AES_256] = 32,
    [QCRYPTO_CIPHER_ALG_DES_RFB] = 8,
+    [QCRYPTO_CIPHER_ALG_CAST5_128] = 16,
+    [QCRYPTO_CIPHER_ALG_SERPENT_128] = 16,
+    [QCRYPTO_CIPHER_ALG_SERPENT_192] = 24,
+    [QCRYPTO_CIPHER_ALG_SERPENT_256] = 32,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 24,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 32,
 };

 static size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = {
@@ -34,11 +41,19 @@ static size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = {
    [QCRYPTO_CIPHER_ALG_AES_192] = 16,
    [QCRYPTO_CIPHER_ALG_AES_256] = 16,
    [QCRYPTO_CIPHER_ALG_DES_RFB] = 8,
+    [QCRYPTO_CIPHER_ALG_CAST5_128] = 8,
+    [QCRYPTO_CIPHER_ALG_SERPENT_128] = 16,
+    [QCRYPTO_CIPHER_ALG_SERPENT_192] = 16,
+    [QCRYPTO_CIPHER_ALG_SERPENT_256] = 16,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 16,
+    [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 16,
 };

 static bool mode_need_iv[QCRYPTO_CIPHER_MODE__MAX] = {
    [QCRYPTO_CIPHER_MODE_ECB] = false,
    [QCRYPTO_CIPHER_MODE_CBC] = true,
+    [QCRYPTO_CIPHER_MODE_XTS] = true,
 };


@@ -79,6 +94,7 @@ size_t qcrypto_cipher_get_iv_len(QCryptoCipherAlgorithm alg,

 static bool
 qcrypto_cipher_validate_key_length(QCryptoCipherAlgorithm alg,
+                                   QCryptoCipherMode mode,
                                   size_t nkey,
                                   Error **errp)
 {
@@ -88,10 +104,27 @@ qcrypto_cipher_validate_key_length(QCryptoCipherAlgorithm alg,
        return false;
    }

-    if (alg_key_len[alg] != nkey) {
-        error_setg(errp, "Cipher key length %zu should be %zu",
-                   nkey, alg_key_len[alg]);
-        return false;
+    if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+        if (alg == QCRYPTO_CIPHER_ALG_DES_RFB) {
+            error_setg(errp, "XTS mode not compatible with DES-RFB");
+            return false;
+        }
+        if (nkey % 2) {
+            error_setg(errp, "XTS cipher key length should be a multiple of 2");
+            return false;
+        }
+
+        if (alg_key_len[alg] != (nkey / 2)) {
+            error_setg(errp, "Cipher key length %zu should be %zu",
+                       nkey, alg_key_len[alg] * 2);
+            return false;
+        }
+    } else {
+        if (alg_key_len[alg] != nkey) {
+            error_setg(errp, "Cipher key length %zu should be %zu",
+                       nkey, alg_key_len[alg]);
+            return false;
+        }
    }
    return true;
 }
--- a/crypto/hash.c
+++ b/crypto/hash.c
@@ -24,12 +24,8 @@
 #ifdef CONFIG_GNUTLS_HASH
 #include <gnutls/gnutls.h>
 #include <gnutls/crypto.h>
+#endif

-static int qcrypto_hash_alg_map[QCRYPTO_HASH_ALG__MAX] = {
-    [QCRYPTO_HASH_ALG_MD5] = GNUTLS_DIG_MD5,
-    [QCRYPTO_HASH_ALG_SHA1] = GNUTLS_DIG_SHA1,
-    [QCRYPTO_HASH_ALG_SHA256] = GNUTLS_DIG_SHA256,
-};

 static size_t qcrypto_hash_alg_size[QCRYPTO_HASH_ALG__MAX] = {
    [QCRYPTO_HASH_ALG_MD5] = 16,
@@ -37,6 +33,22 @@ static size_t qcrypto_hash_alg_size[QCRYPTO_HASH_ALG__MAX] = {
    [QCRYPTO_HASH_ALG_SHA256] = 32,
 };

+size_t qcrypto_hash_digest_len(QCryptoHashAlgorithm alg)
+{
+    if (alg >= G_N_ELEMENTS(qcrypto_hash_alg_size)) {
+        return 0;
+    }
+    return qcrypto_hash_alg_size[alg];
+}
+
+
+#ifdef CONFIG_GNUTLS_HASH
+static int qcrypto_hash_alg_map[QCRYPTO_HASH_ALG__MAX] = {
+    [QCRYPTO_HASH_ALG_MD5] = GNUTLS_DIG_MD5,
+    [QCRYPTO_HASH_ALG_SHA1] = GNUTLS_DIG_SHA1,
+    [QCRYPTO_HASH_ALG_SHA256] = GNUTLS_DIG_SHA256,
+};
+
 gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg)
 {
    if (alg < G_N_ELEMENTS(qcrypto_hash_alg_map)) {
@@ -45,14 +57,6 @@ gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg)
    return false;
 }

-size_t qcrypto_hash_digest_len(QCryptoHashAlgorithm alg)
-{
-    if (alg >= G_N_ELEMENTS(qcrypto_hash_alg_size)) {
-        return 0;
-    }
-    return qcrypto_hash_alg_size[alg];
-}
-

 int qcrypto_hash_bytesv(QCryptoHashAlgorithm alg,
                        const struct iovec *iov,
--- a/crypto/ivgen-essiv.c
+++ b/crypto/ivgen-essiv.c
@@ -0,0 +1,118 @@
+/*
+ * QEMU Crypto block IV generator - essiv
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/ivgen-essiv.h"
+
+typedef struct QCryptoIVGenESSIV QCryptoIVGenESSIV;
+struct QCryptoIVGenESSIV {
+    QCryptoCipher *cipher;
+};
+
+static int qcrypto_ivgen_essiv_init(QCryptoIVGen *ivgen,
+                                    const uint8_t *key, size_t nkey,
+                                    Error **errp)
+{
+    uint8_t *salt;
+    size_t nhash;
+    size_t nsalt;
+    QCryptoIVGenESSIV *essiv = g_new0(QCryptoIVGenESSIV, 1);
+
+    /* Not necessarily the same as nkey */
+    nsalt = qcrypto_cipher_get_key_len(ivgen->cipher);
+
+    nhash = qcrypto_hash_digest_len(ivgen->hash);
+    /* Salt must be larger of hash size or key size */
+    salt = g_new0(uint8_t, MAX(nhash, nsalt));
+
+    if (qcrypto_hash_bytes(ivgen->hash, (const gchar *)key, nkey,
+                           &salt, &nhash,
+                           errp) < 0) {
+        g_free(essiv);
+        return -1;
+    }
+
+    /* Now potentially truncate salt to match cipher key len */
+    essiv->cipher = qcrypto_cipher_new(ivgen->cipher,
+                                       QCRYPTO_CIPHER_MODE_ECB,
+                                       salt, MIN(nhash, nsalt),
+                                       errp);
+    if (!essiv->cipher) {
+        g_free(essiv);
+        g_free(salt);
+        return -1;
+    }
+
+    g_free(salt);
+    ivgen->private = essiv;
+
+    return 0;
+}
+
+static int qcrypto_ivgen_essiv_calculate(QCryptoIVGen *ivgen,
+                                         uint64_t sector,
+                                         uint8_t *iv, size_t niv,
+                                         Error **errp)
+{
+    QCryptoIVGenESSIV *essiv = ivgen->private;
+    size_t ndata = qcrypto_cipher_get_block_len(ivgen->cipher);
+    uint8_t *data = g_new(uint8_t, ndata);
+
+    sector = cpu_to_le64(sector);
+    memcpy(data, (uint8_t *)&sector, ndata);
+    if (sizeof(sector) < ndata) {
+        memset(data + sizeof(sector), 0, ndata - sizeof(sector));
+    }
+
+    if (qcrypto_cipher_encrypt(essiv->cipher,
+                               data,
+                               data,
+                               ndata,
+                               errp) < 0) {
+        g_free(data);
+        return -1;
+    }
+
+    if (ndata > niv) {
+        ndata = niv;
+    }
+    memcpy(iv, data, ndata);
+    if (ndata < niv) {
+        memset(iv + ndata, 0, niv - ndata);
+    }
+    g_free(data);
+    return 0;
+}
+
+static void qcrypto_ivgen_essiv_cleanup(QCryptoIVGen *ivgen)
+{
+    QCryptoIVGenESSIV *essiv = ivgen->private;
+
+    qcrypto_cipher_free(essiv->cipher);
+    g_free(essiv);
+}
+
+
+struct QCryptoIVGenDriver qcrypto_ivgen_essiv = {
+    .init = qcrypto_ivgen_essiv_init,
+    .calculate = qcrypto_ivgen_essiv_calculate,
+    .cleanup = qcrypto_ivgen_essiv_cleanup,
+};
+
--- a/crypto/ivgen-essiv.h
+++ b/crypto/ivgen-essiv.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block IV generator - essiv
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "crypto/ivgenpriv.h"
+
+#ifndef QCRYPTO_IVGEN_ESSIV_H__
+#define QCRYPTO_IVGEN_ESSIV_H__
+
+extern struct QCryptoIVGenDriver qcrypto_ivgen_essiv;
+
+#endif /* QCRYPTO_IVGEN_ESSIV_H__ */
--- a/crypto/ivgen-plain.c
+++ b/crypto/ivgen-plain.c
@@ -0,0 +1,59 @@
+/*
+ * QEMU Crypto block IV generator - plain
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/ivgen-plain.h"
+
+static int qcrypto_ivgen_plain_init(QCryptoIVGen *ivgen,
+                                    const uint8_t *key, size_t nkey,
+                                    Error **errp)
+{
+    return 0;
+}
+
+static int qcrypto_ivgen_plain_calculate(QCryptoIVGen *ivgen,
+                                         uint64_t sector,
+                                         uint8_t *iv, size_t niv,
+                                         Error **errp)
+{
+    size_t ivprefix;
+    uint32_t shortsector = cpu_to_le32((sector & 0xffffffff));
+    ivprefix = sizeof(shortsector);
+    if (ivprefix > niv) {
+        ivprefix = niv;
+    }
+    memcpy(iv, &shortsector, ivprefix);
+    if (ivprefix < niv) {
+        memset(iv + ivprefix, 0, niv - ivprefix);
+    }
+    return 0;
+}
+
+static void qcrypto_ivgen_plain_cleanup(QCryptoIVGen *ivgen)
+{
+}
+
+
+struct QCryptoIVGenDriver qcrypto_ivgen_plain = {
+    .init = qcrypto_ivgen_plain_init,
+    .calculate = qcrypto_ivgen_plain_calculate,
+    .cleanup = qcrypto_ivgen_plain_cleanup,
+};
+
--- a/crypto/ivgen-plain.h
+++ b/crypto/ivgen-plain.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block IV generator - plain
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "crypto/ivgenpriv.h"
+
+#ifndef QCRYPTO_IVGEN_PLAIN_H__
+#define QCRYPTO_IVGEN_PLAIN_H__
+
+extern struct QCryptoIVGenDriver qcrypto_ivgen_plain;
+
+#endif /* QCRYPTO_IVGEN_PLAIN_H__ */
--- a/crypto/ivgen-plain64.c
+++ b/crypto/ivgen-plain64.c
@@ -0,0 +1,59 @@
+/*
+ * QEMU Crypto block IV generator - plain
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/ivgen-plain.h"
+
+static int qcrypto_ivgen_plain_init(QCryptoIVGen *ivgen,
+                                    const uint8_t *key, size_t nkey,
+                                    Error **errp)
+{
+    return 0;
+}
+
+static int qcrypto_ivgen_plain_calculate(QCryptoIVGen *ivgen,
+                                         uint64_t sector,
+                                         uint8_t *iv, size_t niv,
+                                         Error **errp)
+{
+    size_t ivprefix;
+    ivprefix = sizeof(sector);
+    sector = cpu_to_le64(sector);
+    if (ivprefix > niv) {
+        ivprefix = niv;
+    }
+    memcpy(iv, &sector, ivprefix);
+    if (ivprefix < niv) {
+        memset(iv + ivprefix, 0, niv - ivprefix);
+    }
+    return 0;
+}
+
+static void qcrypto_ivgen_plain_cleanup(QCryptoIVGen *ivgen)
+{
+}
+
+
+struct QCryptoIVGenDriver qcrypto_ivgen_plain64 = {
+    .init = qcrypto_ivgen_plain_init,
+    .calculate = qcrypto_ivgen_plain_calculate,
+    .cleanup = qcrypto_ivgen_plain_cleanup,
+};
+
--- a/crypto/ivgen-plain64.h
+++ b/crypto/ivgen-plain64.h
@@ -0,0 +1,28 @@
+/*
+ * QEMU Crypto block IV generator - plain64
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "crypto/ivgenpriv.h"
+
+#ifndef QCRYPTO_IVGEN_PLAIN64_H__
+#define QCRYPTO_IVGEN_PLAIN64_H__
+
+extern struct QCryptoIVGenDriver qcrypto_ivgen_plain64;
+
+#endif /* QCRYPTO_IVGEN_PLAIN64_H__ */
--- a/crypto/ivgen.c
+++ b/crypto/ivgen.c
@@ -0,0 +1,99 @@
+/*
+ * QEMU Crypto block IV generator
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/ivgenpriv.h"
+#include "crypto/ivgen-plain.h"
+#include "crypto/ivgen-plain64.h"
+#include "crypto/ivgen-essiv.h"
+
+
+QCryptoIVGen *qcrypto_ivgen_new(QCryptoIVGenAlgorithm alg,
+                                QCryptoCipherAlgorithm cipheralg,
+                                QCryptoHashAlgorithm hash,
+                                const uint8_t *key, size_t nkey,
+                                Error **errp)
+{
+    QCryptoIVGen *ivgen = g_new0(QCryptoIVGen, 1);
+
+    ivgen->algorithm = alg;
+    ivgen->cipher = cipheralg;
+    ivgen->hash = hash;
+
+    switch (alg) {
+    case QCRYPTO_IVGEN_ALG_PLAIN:
+        ivgen->driver = &qcrypto_ivgen_plain;
+        break;
+    case QCRYPTO_IVGEN_ALG_PLAIN64:
+        ivgen->driver = &qcrypto_ivgen_plain64;
+        break;
+    case QCRYPTO_IVGEN_ALG_ESSIV:
+        ivgen->driver = &qcrypto_ivgen_essiv;
+        break;
+    default:
+        error_setg(errp, "Unknown block IV generator algorithm %d", alg);
+        g_free(ivgen);
+        return NULL;
+    }
+
+    if (ivgen->driver->init(ivgen, key, nkey, errp) < 0) {
+        g_free(ivgen);
+        return NULL;
+    }
+
+    return ivgen;
+}
+
+
+int qcrypto_ivgen_calculate(QCryptoIVGen *ivgen,
+                            uint64_t sector,
+                            uint8_t *iv, size_t niv,
+                            Error **errp)
+{
+    return ivgen->driver->calculate(ivgen, sector, iv, niv, errp);
+}
+
+
+QCryptoIVGenAlgorithm qcrypto_ivgen_get_algorithm(QCryptoIVGen *ivgen)
+{
+    return ivgen->algorithm;
+}
+
+
+QCryptoCipherAlgorithm qcrypto_ivgen_get_cipher(QCryptoIVGen *ivgen)
+{
+    return ivgen->cipher;
+}
+
+
+QCryptoHashAlgorithm qcrypto_ivgen_get_hash(QCryptoIVGen *ivgen)
+{
+    return ivgen->hash;
+}
+
+
+void qcrypto_ivgen_free(QCryptoIVGen *ivgen)
+{
+    if (!ivgen) {
+        return;
+    }
+    ivgen->driver->cleanup(ivgen);
+    g_free(ivgen);
+}
--- a/crypto/ivgenpriv.h
+++ b/crypto/ivgenpriv.h
@@ -0,0 +1,49 @@
+/*
+ * QEMU Crypto block IV generator
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef QCRYPTO_IVGEN_PRIV_H__
+#define QCRYPTO_IVGEN_PRIV_H__
+
+#include "crypto/ivgen.h"
+
+typedef struct QCryptoIVGenDriver QCryptoIVGenDriver;
+
+struct QCryptoIVGenDriver {
+    int (*init)(QCryptoIVGen *ivgen,
+                const uint8_t *key, size_t nkey,
+                Error **errp);
+    int (*calculate)(QCryptoIVGen *ivgen,
+                     uint64_t sector,
+                     uint8_t *iv, size_t niv,
+                     Error **errp);
+    void (*cleanup)(QCryptoIVGen *ivgen);
+};
+
+struct QCryptoIVGen {
+    QCryptoIVGenDriver *driver;
+    void *private;
+
+    QCryptoIVGenAlgorithm algorithm;
+    QCryptoCipherAlgorithm cipher;
+    QCryptoHashAlgorithm hash;
+};
+
+
+#endif /* QCRYPTO_IVGEN_PRIV_H__ */
--- a/Show More
+++ b/Show More