hw/input/hid.c Fix capslock hid code

When ever USB keyboard is used, e.g. '-usbdevice keyboard' pressing caps lock key send 0x32 hid code, which is treated as backslash. Instead it should be 0x39 code. This affects sending uppercase keys, as they typed whith caps lock active. While on x86 this can be workarounded by using ps/2 protocol. On Power it is crusial as we don't have anything else than USB. This is fixes guest automation tasts over vnc. Signed-off-by: Dinar Valeev <dvaleev@suse.com> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
hid: handle full ptr queues in post_load
2015-01-22 12:19:48 +01:00 · 2015-01-22 12:19:48 +01:00 · 2015-01-22 12:19:48 +01:00 · 2015-01-20 16:19:58 +00:00 · 2015-01-20 15:19:35 +00:00 · 2015-01-20 15:19:35 +00:00
1237 changed files with 81470 additions and 22411 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,10 @@
 /trace/generated-tracers.dtrace
 /trace/generated-events.h
 /trace/generated-events.c
+/trace/generated-helpers-wrappers.h
+/trace/generated-helpers.h
+/trace/generated-helpers.c
+/trace/generated-tcg-tracers.h
 /trace/generated-ust-provider.h
 /trace/generated-ust.c
 /libcacard/trace/generated-tracers.c
@@ -105,3 +109,4 @@ cscope.*
 tags
 TAGS
 *~
+/tests/qemu-iotests/common.env
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,7 @@ notifications:
    on_failure: always
 env:
  global:
-    - TEST_CMD="make check"
+    - TEST_CMD=""
    - EXTRA_CONFIG=""
    # Development packages, EXTRA_PKGS saved for additional builds
    - CORE_PKGS="libusb-1.0-0-dev libiscsi-dev librados-dev libncurses5-dev"
@@ -20,31 +20,51 @@ env:
    - GUI_PKGS="libgtk-3-dev libvte-2.90-dev libsdl1.2-dev libpng12-dev libpixman-1-dev"
    - EXTRA_PKGS=""
  matrix:
+    # Group major targets together with their linux-user counterparts
    - TARGETS=alpha-softmmu,alpha-linux-user
-    - TARGETS=arm-softmmu,arm-linux-user
-    - TARGETS=aarch64-softmmu,aarch64-linux-user
-    - TARGETS=cris-softmmu
-    - TARGETS=i386-softmmu,x86_64-softmmu
-    - TARGETS=lm32-softmmu
-    - TARGETS=m68k-softmmu
-    - TARGETS=microblaze-softmmu,microblazeel-softmmu
+    - TARGETS=arm-softmmu,arm-linux-user,armeb-linux-user,aarch64-softmmu,aarch64-linux-user
+    - TARGETS=cris-softmmu,cris-linux-user
+    - TARGETS=i386-softmmu,i386-linux-user,x86_64-softmmu,x86_64-linux-user
+    - TARGETS=m68k-softmmu,m68k-linux-user
+    - TARGETS=microblaze-softmmu,microblazeel-softmmu,microblaze-linux-user,microblazeel-linux-user
    - TARGETS=mips-softmmu,mips64-softmmu,mips64el-softmmu,mipsel-softmmu
-    - TARGETS=moxie-softmmu
-    - TARGETS=or32-softmmu,
-    - TARGETS=ppc-softmmu,ppc64-softmmu,ppcemb-softmmu
-    - TARGETS=s390x-softmmu
-    - TARGETS=sh4-softmmu,sh4eb-softmmu
-    - TARGETS=sparc-softmmu,sparc64-softmmu
-    - TARGETS=unicore32-softmmu
-    - TARGETS=xtensa-softmmu,xtensaeb-softmmu
+    - TARGETS=mips-linux-user,mips64-linux-user,mips64el-linux-user,mipsel-linux-user,mipsn32-linux-user,mipsn32el-linux-user
+    - TARGETS=or32-softmmu,or32-linux-user
+    - TARGETS=ppc-softmmu,ppc64-softmmu,ppcemb-softmmu,ppc-linux-user,ppc64-linux-user,ppc64abi32-linux-user,ppc64le-linux-user
+    - TARGETS=s390x-softmmu,s390x-linux-user
+    - TARGETS=sh4-softmmu,sh4eb-softmmu,sh4-linux-user sh4eb-linux-user
+    - TARGETS=sparc-softmmu,sparc64-softmmu,sparc-linux-user,sparc32plus-linux-user,sparc64-linux-user
+    - TARGETS=unicore32-softmmu,unicore32-linux-user
+    # Group remaining softmmu only targets into one build
+    - TARGETS=lm32-softmmu,moxie-softmmu,tricore-softmmu,xtensa-softmmu,xtensaeb-softmmu
+git:
+  # we want to do this ourselves
+  submodules: false
 before_install:
+  - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
  - git submodule update --init --recursive
  - sudo apt-get update -qq
  - sudo apt-get install -qq ${CORE_PKGS} ${NET_PKGS} ${GUI_PKGS} ${EXTRA_PKGS}
-script: "./configure --target-list=${TARGETS} ${EXTRA_CONFIG} && make && ${TEST_CMD}"
+before_script:
+  - ./configure --target-list=${TARGETS} --enable-debug-tcg ${EXTRA_CONFIG}
+script:
+  - make -j2 && ${TEST_CMD}
 matrix:
  # We manually include a number of additional build for non-standard bits
  include:
+    # Make check target (we only do this once)
+    - env:
+        - TARGETS=alpha-softmmu,arm-softmmu,aarch64-softmmu,cris-softmmu,
+                  i386-softmmu,x86_64-softmmu,m68k-softmmu,microblaze-softmmu,
+                  microblazeel-softmmu,mips-softmmu,mips64-softmmu,
+                  mips64el-softmmu,mipsel-softmmu,or32-softmmu,ppc-softmmu,
+                  ppc64-softmmu,ppcemb-softmmu,s390x-softmmu,sh4-softmmu,
+                  sh4eb-softmmu,sparc-softmmu,sparc64-softmmu,
+                  unicore32-softmmu,unicore32-linux-user,
+                  lm32-softmmu,moxie-softmmu,tricore-softmmu,xtensa-softmmu,
+                  xtensaeb-softmmu
+          TEST_CMD="make check"
+      compiler: gcc
    # Debug related options
    - env: TARGETS=i386-softmmu,x86_64-softmmu
           EXTRA_CONFIG="--enable-debug"
@@ -73,7 +93,6 @@ matrix:
      compiler: gcc
    - env: TARGETS=i386-softmmu,x86_64-softmmu
           EXTRA_CONFIG="--enable-trace-backends=ftrace"
-           TEST_CMD=""
      compiler: gcc
    - env: TARGETS=i386-softmmu,x86_64-softmmu
          EXTRA_PKGS="liblttng-ust-dev liburcu-dev"
--- a/14
+++ b/14
@@ -91,3 +91,17 @@ Mixed declarations (interleaving statements and declarations within blocks)
 are not allowed; declarations should be at the beginning of blocks.  In other
 words, the code should not generate warnings if using GCC's
 -Wdeclaration-after-statement option.
+
+6. Conditional statements
+
+When comparing a variable for (in)equality with a constant, list the
+constant on the right, as in:
+
+if (a == 1) {
+    /* Reads like: "If a equals 1" */
+    do_something();
+}
+
+Rationale: Yoda conditions (as in 'if (1 == a)') are awkward to read.
+Besides, good compilers already warn users when '==' is mis-typed as '=',
+even when the constant is on the right.
--- a/2
+++ b/2
@@ -11,7 +11,7 @@ option) any later version.

 As of July 2013, contributions under version 2 of the GNU General Public
 License (and no later version) are only accepted for the following files
-or directories: bsd-user/, linux-user/, hw/misc/vfio.c, hw/xen/xen_pt*.
+or directories: bsd-user/, linux-user/, hw/vfio/, hw/xen/xen_pt*.

 3) The Tiny Code Generator (TCG) is released under the BSD license
   (see license headers in files).
--- a/148
+++ b/148
@@ -51,6 +51,7 @@ Descriptions of section entries:
 General Project Administration
 ------------------------------
 M: Anthony Liguori <aliguori@amazon.com>
+M: Peter Maydell <peter.maydell@linaro.org>

 Responsible Disclosure, Reporting Security Issues
 ------------------------------
@@ -61,11 +62,23 @@ L: secalert@redhat.com

 Guest CPU cores (TCG):
 ----------------------
+Overall
+L: qemu-devel@nongnu.org
+S: Odd fixes
+F: cpu-exec.c
+F: cputlb.c
+F: softmmu_template.h
+F: translate-all.c
+F: include/exec/cpu_ldst.h
+F: include/exec/cpu_ldst_template.h
+F: include/exec/helper*.h
+
 Alpha
 M: Richard Henderson <rth@twiddle.net>
 S: Maintained
 F: target-alpha/
 F: hw/alpha/
+F: tests/tcg/alpha/

 ARM
 M: Peter Maydell <peter.maydell@linaro.org>
@@ -79,13 +92,19 @@ M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
 S: Maintained
 F: target-cris/
 F: hw/cris/
+F: tests/tcg/cris/

 LM32
 M: Michael Walle <michael@walle.cc>
 S: Maintained
 F: target-lm32/
+F: disas/lm32.c
 F: hw/lm32/
-F: hw/char/lm32_*
+F: hw/*/lm32_*
+F: hw/*/milkymist-*
+F: include/hw/char/lm32_juart.h
+F: include/hw/lm32/
+F: tests/tcg/lm32/

 M68K
 S: Orphan
@@ -100,9 +119,11 @@ F: hw/microblaze/

 MIPS
 M: Aurelien Jarno <aurelien@aurel32.net>
-S: Odd Fixes
+M: Leon Alrae <leon.alrae@imgtec.com>
+S: Maintained
 F: target-mips/
 F: hw/mips/
+F: tests/tcg/mips/

 Moxie
 M: Anthony Green <green@moxielogic.com>
@@ -114,6 +135,7 @@ M: Jia Liu <proljc@gmail.com>
 S: Maintained
 F: target-openrisc/
 F: hw/openrisc/
+F: tests/tcg/openrisc/

 PowerPC
 M: Alexander Graf <agraf@suse.de>
@@ -149,7 +171,8 @@ F: target-unicore32/
 F: hw/unicore32/

 X86
-M: qemu-devel@nongnu.org
+M: Paolo Bonzini <pbonzini@redhat.com>
+M: Richard Henderson <rth@twiddle.net>
 S: Odd Fixes
 F: target-i386/
 F: hw/i386/
@@ -160,6 +183,13 @@ W: http://wiki.osll.spb.ru/doku.php?id=etc:users:jcmvbkbc:qemu-target-xtensa
 S: Maintained
 F: target-xtensa/
 F: hw/xtensa/
+F: tests/tcg/xtensa/
+
+TriCore
+M: Bastian Koppelmann <kbastian@mail.uni-paderborn.de>
+S: Maintained
+F: target-tricore/
+F: hw/tricore/

 Guest CPU Cores (KVM):
 ----------------------
@@ -192,9 +222,12 @@ M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Alexander Graf <agraf@suse.de>
 S: Maintained
 F: target-s390x/kvm.c
-F: hw/intc/s390_flic.[hc]
+F: hw/intc/s390_flic.c
+F: hw/intc/s390_flic_kvm.c
+F: include/hw/s390x/s390_flic.h

 X86
+M: Paolo Bonzini <pbonzini@redhat.com>
 M: Marcelo Tosatti <mtosatti@redhat.com>
 L: kvm@vger.kernel.org
 S: Supported
@@ -260,7 +293,7 @@ F: include/hw/arm/digic.h
 F: hw/*/digic*

 Gumstix
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Orphan
 F: hw/arm/gumstix.c

@@ -276,7 +309,7 @@ S: Maintained
 F: hw/arm/integratorcp.c

 Mainstone
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Orphan
 F: hw/arm/mainstone.c

@@ -382,7 +415,7 @@ S: Maintained
 F: hw/mips/mips_malta.c

 Mipssim
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Orphan
 F: hw/mips/mips_mipssim.c

@@ -505,6 +538,7 @@ S390 Virtio
 M: Alexander Graf <agraf@suse.de>
 S: Maintained
 F: hw/s390x/s390-*.c
+X: hw/s390x/*pci*.[hc]

 S390 Virtio-ccw
 M: Cornelia Huck <cornelia.huck@de.ibm.com>
@@ -515,6 +549,9 @@ F: hw/s390x/s390-virtio-ccw.c
 F: hw/s390x/css.[hc]
 F: hw/s390x/sclp*.[hc]
 F: hw/s390x/ipl*.[hc]
+F: hw/s390x/*pci*.[hc]
+F: include/hw/s390x/
+F: pc-bios/s390-ccw/
 T: git git://github.com/cohuck/qemu virtio-ccw-upstr

 UniCore32 Machines
@@ -552,12 +589,13 @@ Xtensa Machines
 sim
 M: Max Filippov <jcmvbkbc@gmail.com>
 S: Maintained
-F: hw/xtensa/xtensa_sim.c
+F: hw/xtensa/sim.c

-Avnet LX60
+XTFPGA (LX60, LX200, ML605, KC705)
 M: Max Filippov <jcmvbkbc@gmail.com>
 S: Maintained
-F: hw/xtensa/xtensa_lx60.c
+F: hw/xtensa/xtfpga.c
+F: hw/net/opencores_eth.c

 Devices
 -------
@@ -614,12 +652,18 @@ USB
 M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
 F: hw/usb/*
-F: tests/usb-hcd-ehci-test.c
+F: tests/usb-*-test.c
+
+USB (serial adapter)
+M: Gerd Hoffmann <kraxel@redhat.com>
+M: Samuel Thibault <samuel.thibault@ens-lyon.org>
+S: Maintained
+F: hw/usb/dev-serial.c

 VFIO
 M: Alex Williamson <alex.williamson@redhat.com>
 S: Supported
-F: hw/misc/vfio.c
+F: hw/vfio/*

 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
@@ -658,6 +702,14 @@ M: Amit Shah <amit.shah@redhat.com>
 S: Supported
 F: hw/char/virtio-serial-bus.c
 F: hw/char/virtio-console.c
+F: include/hw/virtio/virtio-serial.h
+
+virtio-rng
+M: Amit Shah <amit.shah@redhat.com>
+S: Supported
+F: hw/virtio/virtio-rng.c
+F: include/hw/virtio/virtio-rng.h
+F: backends/rng*.c

 nvme
 M: Keith Busch <keith.busch@intel.com>
@@ -678,6 +730,12 @@ S: Maintained
 F: hw/*/xilinx_*
 F: include/hw/xilinx.h

+Vmware
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: hw/net/vmxnet*
+F: hw/scsi/vmw_pvscsi*
+
 Subsystems
 ----------
 Audio
@@ -694,18 +752,31 @@ Block
 M: Kevin Wolf <kwolf@redhat.com>
 M: Stefan Hajnoczi <stefanha@redhat.com>
 S: Supported
+F: async.c
+F: aio-*.c
 F: block*
 F: block/
 F: hw/block/
+F: migration/block*
 F: qemu-img*
 F: qemu-io*
+F: tests/image-fuzzer/
+F: tests/qemu-iotests/
 T: git git://repo.or.cz/qemu/kevin.git block
 T: git git://github.com/stefanha/qemu.git block

 Character Devices
 M: Anthony Liguori <aliguori@amazon.com>
+M: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
 F: qemu-char.c
+F: backends/msmouse.c
+F: backends/testdev.c
+
+Character Devices (Braille)
+M: Samuel Thibault <samuel.thibault@ens-lyon.org>
+S: Maintained
+F: backends/baum.c

 CPU
 M: Andreas Färber <afaerber@suse.de>
@@ -727,7 +798,7 @@ S: Maintained
 F: device_tree.[ch]

 GDB stub
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Odd Fixes
 F: gdbstub*
 F: gdb-xml/
@@ -764,7 +835,11 @@ F: ui/cocoa.m

 Main loop
 M: Anthony Liguori <aliguori@amazon.com>
-S: Supported
+M: Paolo Bonzini <pbonzini@redhat.com>
+S: Maintained
+F: cpus.c
+F: main-loop.c
+F: qemu-timer.c
 F: vl.c

 Human Monitor (HMP)
@@ -803,6 +878,7 @@ M: Luiz Capitulino <lcapitulino@redhat.com>
 M: Michael Roth <mdroth@linux.vnet.ibm.com>
 S: Maintained
 F: qapi/
+F: tests/qapi-schema/
 T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp

 QAPI Schema
@@ -813,6 +889,18 @@ S: Supported
 F: qapi-schema.json
 T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp

+QObject
+M: Luiz Capitulino <lcapitulino@redhat.com>
+S: Maintained
+F: qobject/
+T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp
+
+QEMU Guest Agent
+M: Michael Roth <mdroth@linux.vnet.ibm.com>
+S: Maintained
+F: qga/
+T: git git://github.com/mdroth/qemu.git qga
+
 QOM
 M: Anthony Liguori <aliguori@amazon.com>
 M: Andreas Färber <afaerber@suse.de>
@@ -853,6 +941,17 @@ M: Blue Swirl <blauwirbel@gmail.com>
 S: Odd Fixes
 F: scripts/checkpatch.pl

+Migration
+M: Juan Quintela <quintela@redhat.com>
+M: Amit Shah <amit.shah@redhat.com>
+S: Maintained
+F: include/migration/
+F: migration/
+F: savevm.c
+F: arch_init.c
+F: scripts/vmstate-static-checker.py
+F: tests/vmstate-static-checker-data/
+
 Seccomp
 M: Eduardo Otubo <eduardo.otubo@profitbricks.com>
 S: Supported
@@ -861,6 +960,12 @@ F: include/sysemu/seccomp.h

 Usermode Emulation
 ------------------
+Overall
+M: Riku Voipio <riku.voipio@iki.fi>
+S: Maintained
+F: thunk.c
+F: user-exec.c
+
 BSD user
 M: Blue Swirl <blauwirbel@gmail.com>
 S: Maintained
@@ -874,7 +979,6 @@ F: linux-user/
 Tiny Code Generator (TCG)
 -------------------------
 Common code
-M: qemu-devel@nongnu.org
 M: Richard Henderson <rth@twiddle.net>
 S: Maintained
 F: tcg/
@@ -891,7 +995,7 @@ S: Maintained
 F: tcg/arm/

 i386 target
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Maintained
 F: tcg/i386/

@@ -968,7 +1072,7 @@ S: Supported
 F: block/rbd.c

 Sheepdog
-M: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
+M: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
 M: Liu Yuan <namei.unix@gmail.com>
 L: sheepdog@lists.wpkg.org
 S: Supported
@@ -1000,3 +1104,13 @@ SSH
 M: Richard W.M. Jones <rjones@redhat.com>
 S: Supported
 F: block/ssh.c
+
+ARCHIPELAGO
+M: Chrysostomos Nanakos <chris@include.gr>
+S: Maintained
+F: block/archipelago.c
+
+Bootdevice
+M: Gonglei <arei.gonglei@huawei.com>
+S: Maintained
+F: bootdevice.c
--- a/13
+++ b/13
@@ -57,6 +57,12 @@ GENERATED_HEADERS += trace/generated-tracers-dtrace.h
 endif
 GENERATED_SOURCES += trace/generated-tracers.c

+GENERATED_HEADERS += trace/generated-tcg-tracers.h
+
+GENERATED_HEADERS += trace/generated-helpers-wrappers.h
+GENERATED_HEADERS += trace/generated-helpers.h
+GENERATED_SOURCES += trace/generated-helpers.c
+
 ifeq ($(findstring ust,$(TRACE_BACKENDS)),ust)
 GENERATED_HEADERS += trace/generated-ust-provider.h
 GENERATED_SOURCES += trace/generated-ust.c
@@ -202,7 +208,7 @@ Makefile: $(version-obj-y) $(version-lobj-y)
 # Build libraries

 libqemustub.a: $(stub-obj-y)
-libqemuutil.a: $(util-obj-y) qapi-types.o qapi-visit.o qapi-event.o
+libqemuutil.a: $(util-obj-y)

 block-modules = $(foreach o,$(block-obj-m),"$(basename $(subst /,-,$o))",) NULL
 util/module.o-cflags = -D'CONFIG_BLOCK_MODULES=$(block-modules)'
@@ -307,8 +313,8 @@ qemu-%.tar.bz2:

 distclean: clean
 	rm -f config-host.mak config-host.h* config-host.ld $(DOCS) qemu-options.texi qemu-img-cmds.texi qemu-monitor.texi
-	rm -f config-all-devices.mak config-all-disas.mak
-	rm -f po/*.mo
+	rm -f config-all-devices.mak config-all-disas.mak config.status
+	rm -f po/*.mo tests/qemu-iotests/common.env
 	rm -f roms/seabios/config.mak roms/vgabios/config.mak
 	rm -f qemu-doc.info qemu-doc.aux qemu-doc.cp qemu-doc.cps qemu-doc.dvi
 	rm -f qemu-doc.fn qemu-doc.fns qemu-doc.info qemu-doc.ky qemu-doc.kys
@@ -412,6 +418,7 @@ endif
 	set -e; for x in $(KEYMAPS); do \
 		$(INSTALL_DATA) $(SRC_PATH)/pc-bios/keymaps/$$x "$(DESTDIR)$(qemu_datadir)/keymaps"; \
 	done
+	$(INSTALL_DATA) $(SRC_PATH)/trace-events "$(DESTDIR)$(qemu_datadir)/trace-events"
 	for d in $(TARGET_DIRS); do \
 	$(MAKE) $(SUBDIR_MAKEFLAGS) TARGET_DIR=$$d/ -C $$d $@ || exit 1 ; \
        done
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -1,7 +1,7 @@
 #######################################################################
 # Common libraries for tools and emulators
 stub-obj-y = stubs/
-util-obj-y = util/ qobject/ qapi/ trace/
+util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o qapi-event.o

 #######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img
@@ -12,7 +12,6 @@ block-obj-y += main-loop.o iohandler.o qemu-timer.o
 block-obj-$(CONFIG_POSIX) += aio-posix.o
 block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
-block-obj-y += qapi-types.o qapi-visit.o qapi-event.o
 block-obj-y += qemu-io-cmds.o

 block-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
@@ -49,20 +48,15 @@ common-obj-$(CONFIG_POSIX) += os-posix.o

 common-obj-$(CONFIG_LINUX) += fsdev/

-common-obj-y += migration.o migration-tcp.o
-common-obj-y += vmstate.o
-common-obj-y += qemu-file.o
-common-obj-$(CONFIG_RDMA) += migration-rdma.o
+common-obj-y += migration/
 common-obj-y += qemu-char.o #aio.o
-common-obj-y += block-migration.o
-common-obj-y += page_cache.o xbzrle.o
-
-common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o
+common-obj-y += page_cache.o

 common-obj-$(CONFIG_SPICE) += spice-qemu-char.o

 common-obj-y += audio/
 common-obj-y += hw/
+common-obj-y += accel.o

 common-obj-y += ui/
 common-obj-y += bt-host.o bt-vhci.o
@@ -88,11 +82,6 @@ common-obj-y += qmp-marshal.o
 common-obj-y += qmp.o hmp.o
 endif

-######################################################################
-# some qapi visitors are used by both system and user emulation:
-
-common-obj-y += qapi-visit.o qapi-types.o
-
 #######################################################################
 # Target-independent parts used in system and user emulation
 common-obj-y += qemu-log.o
@@ -106,10 +95,15 @@ common-obj-y += disas/
 version-obj-$(CONFIG_WIN32) += $(BUILD_DIR)/version.o
 version-lobj-$(CONFIG_WIN32) += $(BUILD_DIR)/version.lo

+######################################################################
+# tracing
+util-obj-y +=  trace/
+target-obj-y += trace/
+
 ######################################################################
 # guest agent

 # FIXME: a few definitions from qapi-types.o/qapi-visit.o are needed
 # by libqemuutil.a.  These should be moved to a separate .json schema.
-qga-obj-y = qga/ qapi-types.o qapi-visit.o
+qga-obj-y = qga/
 qga-vss-dll-obj-y = qga/
--- a/Makefile.target
+++ b/Makefile.target
@@ -38,7 +38,7 @@ config-target.h: config-target.h-timestamp
 config-target.h-timestamp: config-target.mak

 ifdef CONFIG_TRACE_SYSTEMTAP
-stap: $(QEMU_PROG).stp-installed $(QEMU_PROG).stp
+stap: $(QEMU_PROG).stp-installed $(QEMU_PROG).stp $(QEMU_PROG)-simpletrace.stp

 ifdef CONFIG_USER_ONLY
 TARGET_TYPE=user
@@ -64,6 +64,13 @@ $(QEMU_PROG).stp: $(SRC_PATH)/trace-events
 		--target-type=$(TARGET_TYPE) \
 		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG).stp")

+$(QEMU_PROG)-simpletrace.stp: $(SRC_PATH)/trace-events
+	$(call quiet-command,$(TRACETOOL) \
+		--format=simpletrace-stap \
+		--backends=$(TRACE_BACKENDS) \
+		--probe-prefix=qemu.$(TARGET_TYPE).$(TARGET_NAME) \
+		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG)-simpletrace.stp")
+
 else
 stap:
 endif
@@ -120,7 +127,7 @@ endif #CONFIG_BSD_USER
 # System emulator target
 ifdef CONFIG_SOFTMMU
 obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o
-obj-y += qtest.o
+obj-y += qtest.o bootdevice.o
 obj-y += hw/
 obj-$(CONFIG_FDT) += device_tree.o
 obj-$(CONFIG_KVM) += kvm-all.o
@@ -152,15 +159,20 @@ endif # CONFIG_SOFTMMU
 dummy := $(call unnest-vars,,obj-y)
 all-obj-y := $(obj-y)

+target-obj-y :=
 block-obj-y :=
 common-obj-y :=
 include $(SRC_PATH)/Makefile.objs
+dummy := $(call unnest-vars,,target-obj-y)
+target-obj-y-save := $(target-obj-y)
 dummy := $(call unnest-vars,.., \
               block-obj-y \
               block-obj-m \
               common-obj-y \
               common-obj-m)
+target-obj-y := $(target-obj-y-save)
 all-obj-y += $(common-obj-y)
+all-obj-y += $(target-obj-y)
 all-obj-$(CONFIG_SOFTMMU) += $(block-obj-y)

 # build either PROG or PROGW
@@ -191,6 +203,7 @@ endif
 ifdef CONFIG_TRACE_SYSTEMTAP
 	$(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)/../systemtap/tapset"
 	$(INSTALL_DATA) $(QEMU_PROG).stp-installed "$(DESTDIR)$(qemu_datadir)/../systemtap/tapset/$(QEMU_PROG).stp"
+	$(INSTALL_DATA) $(QEMU_PROG)-simpletrace.stp "$(DESTDIR)$(qemu_datadir)/../systemtap/tapset/$(QEMU_PROG)-simpletrace.stp"
 endif

 GENERATED_HEADERS += config-target.h
--- a/2
+++ b/2
@@ -1 +1 @@
-2.1.0
+2.2.50
--- a/accel.c
+++ b/accel.c
@@ -0,0 +1,157 @@
+/*
+ * QEMU System Emulator, accelerator interfaces
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "sysemu/accel.h"
+#include "hw/boards.h"
+#include "qemu-common.h"
+#include "sysemu/arch_init.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
+#include "sysemu/qtest.h"
+#include "hw/xen/xen.h"
+#include "qom/object.h"
+#include "hw/boards.h"
+
+int tcg_tb_size;
+static bool tcg_allowed = true;
+
+static int tcg_init(MachineState *ms)
+{
+    tcg_exec_init(tcg_tb_size * 1024 * 1024);
+    return 0;
+}
+
+static const TypeInfo accel_type = {
+    .name = TYPE_ACCEL,
+    .parent = TYPE_OBJECT,
+    .class_size = sizeof(AccelClass),
+    .instance_size = sizeof(AccelState),
+};
+
+/* Lookup AccelClass from opt_name. Returns NULL if not found */
+static AccelClass *accel_find(const char *opt_name)
+{
+    char *class_name = g_strdup_printf(ACCEL_CLASS_NAME("%s"), opt_name);
+    AccelClass *ac = ACCEL_CLASS(object_class_by_name(class_name));
+    g_free(class_name);
+    return ac;
+}
+
+static int accel_init_machine(AccelClass *acc, MachineState *ms)
+{
+    ObjectClass *oc = OBJECT_CLASS(acc);
+    const char *cname = object_class_get_name(oc);
+    AccelState *accel = ACCEL(object_new(cname));
+    int ret;
+    ms->accelerator = accel;
+    *(acc->allowed) = true;
+    ret = acc->init_machine(ms);
+    if (ret < 0) {
+        ms->accelerator = NULL;
+        *(acc->allowed) = false;
+        object_unref(OBJECT(accel));
+    }
+    return ret;
+}
+
+int configure_accelerator(MachineState *ms)
+{
+    const char *p;
+    char buf[10];
+    int ret;
+    bool accel_initialised = false;
+    bool init_failed = false;
+    AccelClass *acc = NULL;
+
+    p = qemu_opt_get(qemu_get_machine_opts(), "accel");
+    if (p == NULL) {
+        /* Use the default "accelerator", tcg */
+        p = "tcg";
+    }
+
+    while (!accel_initialised && *p != '\0') {
+        if (*p == ':') {
+            p++;
+        }
+        p = get_opt_name(buf, sizeof(buf), p, ':');
+        acc = accel_find(buf);
+        if (!acc) {
+            fprintf(stderr, "\"%s\" accelerator not found.\n", buf);
+            continue;
+        }
+        if (acc->available && !acc->available()) {
+            printf("%s not supported for this target\n",
+                   acc->name);
+            continue;
+        }
+        ret = accel_init_machine(acc, ms);
+        if (ret < 0) {
+            init_failed = true;
+            fprintf(stderr, "failed to initialize %s: %s\n",
+                    acc->name,
+                    strerror(-ret));
+        } else {
+            accel_initialised = true;
+        }
+    }
+
+    if (!accel_initialised) {
+        if (!init_failed) {
+            fprintf(stderr, "No accelerator found!\n");
+        }
+        exit(1);
+    }
+
+    if (init_failed) {
+        fprintf(stderr, "Back to %s accelerator.\n", acc->name);
+    }
+
+    return !accel_initialised;
+}
+
+
+static void tcg_accel_class_init(ObjectClass *oc, void *data)
+{
+    AccelClass *ac = ACCEL_CLASS(oc);
+    ac->name = "tcg";
+    ac->init_machine = tcg_init;
+    ac->allowed = &tcg_allowed;
+}
+
+#define TYPE_TCG_ACCEL ACCEL_CLASS_NAME("tcg")
+
+static const TypeInfo tcg_accel_type = {
+    .name = TYPE_TCG_ACCEL,
+    .parent = TYPE_ACCEL,
+    .class_init = tcg_accel_class_init,
+};
+
+static void register_accel_types(void)
+{
+    type_register_static(&accel_type);
+    type_register_static(&tcg_accel_type);
+}
+
+type_init(register_accel_types);
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -73,7 +73,7 @@ void aio_set_fd_handler(AioContext *ctx,
    } else {
        if (node == NULL) {
            /* Alloc and insert if it's not already there */
-            node = g_malloc0(sizeof(AioHandler));
+            node = g_new0(AioHandler, 1);
            node->pfd.fd = fd;
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

@@ -100,6 +100,11 @@ void aio_set_event_notifier(AioContext *ctx,
                       (IOHandler *)io_read, NULL, notifier);
 }

+bool aio_prepare(AioContext *ctx)
+{
+    return false;
+}
+
 bool aio_pending(AioContext *ctx)
 {
    AioHandler *node;
@@ -119,11 +124,20 @@ bool aio_pending(AioContext *ctx)
    return false;
 }

-static bool aio_dispatch(AioContext *ctx)
+bool aio_dispatch(AioContext *ctx)
 {
    AioHandler *node;
    bool progress = false;

+    /*
+     * If there are callbacks left that have been queued, we need to call them.
+     * Do not call select in this case, because it is possible that the caller
+     * does not need a complete flush (as is the case for aio_poll loops).
+     */
+    if (aio_bh_poll(ctx)) {
+        progress = true;
+    }
+
    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
@@ -184,22 +198,9 @@ bool aio_poll(AioContext *ctx, bool blocking)

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
-     * be re-evaluated before the next blocking poll().  This happens
-     * in two cases:
-     *
-     * 1) when aio_poll is called with blocking == false
-     *
-     * 2) when we are called after poll().  If we are called before
-     *    poll(), bottom halves will not be re-evaluated and we need
-     *    aio_notify() if blocking == true.
-     *
-     * The first aio_dispatch() only does something when AioContext is
-     * running as a GSource, and in that case aio_poll is used only
-     * with blocking == false, so this optimization is already quite
-     * effective.  However, the code is ugly and should be restructured
-     * to have a single aio_dispatch() call.  To do this, we need to
-     * reorganize aio_poll into a prepare/poll/dispatch model like
-     * glib's.
+     * be re-evaluated before the next blocking poll().  This is
+     * already true when aio_poll is called with blocking == false;
+     * if blocking == true, it is only true after poll() returns.
     *
     * If we're in a nested event loop, ctx->dispatching might be true.
     * In that case we can restore it just before returning, but we
@@ -207,26 +208,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
     */
    aio_set_dispatching(ctx, !blocking);

-    /*
-     * If there are callbacks left that have been queued, we need to call them.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    if (aio_bh_poll(ctx)) {
-        blocking = false;
-        progress = true;
-    }
-
-    /* Re-evaluate condition (1) above.  */
-    aio_set_dispatching(ctx, !blocking);
-    if (aio_dispatch(ctx)) {
-        progress = true;
-    }
-
-    if (progress && !blocking) {
-        goto out;
-    }
-
    ctx->walking_handlers++;

    g_array_set_size(ctx->pollfds, 0);
@@ -249,7 +230,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
    /* wait until next event */
    ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data,
                         ctx->pollfds->len,
-                         blocking ? timerlistgroup_deadline_ns(&ctx->tlg) : 0);
+                         blocking ? aio_compute_timeout(ctx) : 0);

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
@@ -268,7 +249,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
        progress = true;
    }

-out:
    aio_set_dispatching(ctx, was_dispatching);
    return progress;
 }
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -22,12 +22,80 @@

 struct AioHandler {
    EventNotifier *e;
+    IOHandler *io_read;
+    IOHandler *io_write;
    EventNotifierHandler *io_notify;
    GPollFD pfd;
    int deleted;
+    void *opaque;
    QLIST_ENTRY(AioHandler) node;
 };

+void aio_set_fd_handler(AioContext *ctx,
+                        int fd,
+                        IOHandler *io_read,
+                        IOHandler *io_write,
+                        void *opaque)
+{
+    /* fd is a SOCKET in our case */
+    AioHandler *node;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        if (node->pfd.fd == fd && !node->deleted) {
+            break;
+        }
+    }
+
+    /* Are we deleting the fd handler? */
+    if (!io_read && !io_write) {
+        if (node) {
+            /* If the lock is held, just mark the node as deleted */
+            if (ctx->walking_handlers) {
+                node->deleted = 1;
+                node->pfd.revents = 0;
+            } else {
+                /* Otherwise, delete it for real.  We can't just mark it as
+                 * deleted because deleted nodes are only cleaned up after
+                 * releasing the walking_handlers lock.
+                 */
+                QLIST_REMOVE(node, node);
+                g_free(node);
+            }
+        }
+    } else {
+        HANDLE event;
+
+        if (node == NULL) {
+            /* Alloc and insert if it's not already there */
+            node = g_new0(AioHandler, 1);
+            node->pfd.fd = fd;
+            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
+        }
+
+        node->pfd.events = 0;
+        if (node->io_read) {
+            node->pfd.events |= G_IO_IN;
+        }
+        if (node->io_write) {
+            node->pfd.events |= G_IO_OUT;
+        }
+
+        node->e = &ctx->notifier;
+
+        /* Update handler with latest information */
+        node->opaque = opaque;
+        node->io_read = io_read;
+        node->io_write = io_write;
+
+        event = event_notifier_get_handle(&ctx->notifier);
+        WSAEventSelect(node->pfd.fd, event,
+                       FD_READ | FD_ACCEPT | FD_CLOSE |
+                       FD_CONNECT | FD_WRITE | FD_OOB);
+    }
+
+    aio_notify(ctx);
+}
+
 void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *e,
                            EventNotifierHandler *io_notify)
@@ -61,7 +129,7 @@ void aio_set_event_notifier(AioContext *ctx,
    } else {
        if (node == NULL) {
            /* Alloc and insert if it's not already there */
-            node = g_malloc0(sizeof(AioHandler));
+            node = g_new0(AioHandler, 1);
            node->e = e;
            node->pfd.fd = (uintptr_t)event_notifier_get_handle(e);
            node->pfd.events = G_IO_IN;
@@ -76,6 +144,43 @@ void aio_set_event_notifier(AioContext *ctx,
    aio_notify(ctx);
 }

+bool aio_prepare(AioContext *ctx)
+{
+    static struct timeval tv0;
+    AioHandler *node;
+    bool have_select_revents = false;
+    fd_set rfds, wfds;
+
+    /* fill fd sets */
+    FD_ZERO(&rfds);
+    FD_ZERO(&wfds);
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        if (node->io_read) {
+            FD_SET ((SOCKET)node->pfd.fd, &rfds);
+        }
+        if (node->io_write) {
+            FD_SET ((SOCKET)node->pfd.fd, &wfds);
+        }
+    }
+
+    if (select(0, &rfds, &wfds, NULL, &tv0) > 0) {
+        QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+            node->pfd.revents = 0;
+            if (FD_ISSET(node->pfd.fd, &rfds)) {
+                node->pfd.revents |= G_IO_IN;
+                have_select_revents = true;
+            }
+
+            if (FD_ISSET(node->pfd.fd, &wfds)) {
+                node->pfd.revents |= G_IO_OUT;
+                have_select_revents = true;
+            }
+        }
+    }
+
+    return have_select_revents;
+}
+
 bool aio_pending(AioContext *ctx)
 {
    AioHandler *node;
@@ -84,47 +189,37 @@ bool aio_pending(AioContext *ctx)
        if (node->pfd.revents && node->io_notify) {
            return true;
        }
+
+        if ((node->pfd.revents & G_IO_IN) && node->io_read) {
+            return true;
+        }
+        if ((node->pfd.revents & G_IO_OUT) && node->io_write) {
+            return true;
+        }
    }

    return false;
 }

-bool aio_poll(AioContext *ctx, bool blocking)
+static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
 {
    AioHandler *node;
-    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
-    bool progress;
-    int count;
-    int timeout;
-
-    progress = false;
+    bool progress = false;

    /*
-     * If there are callbacks left that have been queued, we need to call then.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    if (aio_bh_poll(ctx)) {
-        blocking = false;
-        progress = true;
-    }
-
-    /* Run timers */
-    progress |= timerlistgroup_run_timers(&ctx->tlg);
-
-    /*
-     * Then dispatch any pending callbacks from the GSource.
-     *
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
    while (node) {
        AioHandler *tmp;
+        int revents = node->pfd.revents;

        ctx->walking_handlers++;

-        if (node->pfd.revents && node->io_notify) {
+        if (!node->deleted &&
+            (revents || event_notifier_get_handle(node->e) == event) &&
+            node->io_notify) {
            node->pfd.revents = 0;
            node->io_notify(node->e);

@@ -134,6 +229,28 @@ bool aio_poll(AioContext *ctx, bool blocking)
            }
        }

+        if (!node->deleted &&
+            (node->io_read || node->io_write)) {
+            node->pfd.revents = 0;
+            if ((revents & G_IO_IN) && node->io_read) {
+                node->io_read(node->opaque);
+                progress = true;
+            }
+            if ((revents & G_IO_OUT) && node->io_write) {
+                node->io_write(node->opaque);
+                progress = true;
+            }
+
+            /* if the next select() will return an event, we have progressed */
+            if (event == event_notifier_get_handle(&ctx->notifier)) {
+                WSANETWORKEVENTS ev;
+                WSAEnumNetworkEvents(node->pfd.fd, event, &ev);
+                if (ev.lNetworkEvents) {
+                    progress = true;
+                }
+            }
+        }
+
        tmp = node;
        node = QLIST_NEXT(node, node);

@@ -145,10 +262,47 @@ bool aio_poll(AioContext *ctx, bool blocking)
        }
    }

-    if (progress && !blocking) {
-        return true;
+    return progress;
+}
+
+bool aio_dispatch(AioContext *ctx)
+{
+    bool progress;
+
+    progress = aio_bh_poll(ctx);
+    progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+    progress |= timerlistgroup_run_timers(&ctx->tlg);
+    return progress;
+}
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+    AioHandler *node;
+    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+    bool was_dispatching, progress, have_select_revents, first;
+    int count;
+    int timeout;
+
+    have_select_revents = aio_prepare(ctx);
+    if (have_select_revents) {
+        blocking = false;
    }

+    was_dispatching = ctx->dispatching;
+    progress = false;
+
+    /* aio_notify can avoid the expensive event_notifier_set if
+     * everything (file descriptors, bottom halves, timers) will
+     * be re-evaluated before the next blocking poll().  This is
+     * already true when aio_poll is called with blocking == false;
+     * if blocking == true, it is only true after poll() returns.
+     *
+     * If we're in a nested event loop, ctx->dispatching might be true.
+     * In that case we can restore it just before returning, but we
+     * have to clear it now.
+     */
+    aio_set_dispatching(ctx, !blocking);
+
    ctx->walking_handlers++;

    /* fill fd sets */
@@ -160,64 +314,40 @@ bool aio_poll(AioContext *ctx, bool blocking)
    }

    ctx->walking_handlers--;
+    first = true;

    /* wait until next event */
    while (count > 0) {
+        HANDLE event;
        int ret;

-        timeout = blocking ?
-            qemu_timeout_ns_to_ms(timerlistgroup_deadline_ns(&ctx->tlg)) : 0;
+        timeout = blocking
+            ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
        ret = WaitForMultipleObjects(count, events, FALSE, timeout);
+        aio_set_dispatching(ctx, true);
+
+        if (first && aio_bh_poll(ctx)) {
+            progress = true;
+        }
+        first = false;

        /* if we have any signaled events, dispatch event */
-        if ((DWORD) (ret - WAIT_OBJECT_0) >= count) {
+        event = NULL;
+        if ((DWORD) (ret - WAIT_OBJECT_0) < count) {
+            event = events[ret - WAIT_OBJECT_0];
+            events[ret - WAIT_OBJECT_0] = events[--count];
+        } else if (!have_select_revents) {
            break;
        }

+        have_select_revents = false;
        blocking = false;

-        /* we have to walk very carefully in case
-         * aio_set_fd_handler is called while we're walking */
-        node = QLIST_FIRST(&ctx->aio_handlers);
-        while (node) {
-            AioHandler *tmp;
-
-            ctx->walking_handlers++;
-
-            if (!node->deleted &&
-                event_notifier_get_handle(node->e) == events[ret - WAIT_OBJECT_0] &&
-                node->io_notify) {
-                node->io_notify(node->e);
-
-                /* aio_notify() does not count as progress */
-                if (node->e != &ctx->notifier) {
-                    progress = true;
-                }
-            }
-
-            tmp = node;
-            node = QLIST_NEXT(node, node);
-
-            ctx->walking_handlers--;
-
-            if (!ctx->walking_handlers && tmp->deleted) {
-                QLIST_REMOVE(tmp, node);
-                g_free(tmp);
-            }
-        }
-
-        /* Try again, but only call each handler once.  */
-        events[ret - WAIT_OBJECT_0] = events[--count];
+        progress |= aio_dispatch_handlers(ctx, event);
    }

-    if (blocking) {
-        /* Run the timers a second time. We do this because otherwise aio_wait
-         * will not note progress - and will stop a drain early - if we have
-         * a timer that was not ready to run entering g_poll but is ready
-         * after g_poll. This will only do anything if a timer has expired.
-         */
-        progress |= timerlistgroup_run_timers(&ctx->tlg);
-    }
+    progress |= timerlistgroup_run_timers(&ctx->tlg);

+    aio_set_dispatching(ctx, was_dispatching);
    return progress;
 }
--- a/arch_init.c
+++ b/arch_init.c
@@ -104,6 +104,8 @@ int graphic_depth = 32;
 #define QEMU_ARCH QEMU_ARCH_XTENSA
 #elif defined(TARGET_UNICORE32)
 #define QEMU_ARCH QEMU_ARCH_UNICORE32
+#elif defined(TARGET_TRICORE)
+#define QEMU_ARCH QEMU_ARCH_TRICORE
 #endif

 const uint32_t arch_type = QEMU_ARCH;
@@ -344,7 +346,8 @@ static void xbzrle_cache_zero_page(ram_addr_t current_addr)

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
-    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE);
+    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
+                 bitmap_sync_count);
 }

 #define ENCODING_FLAG_XBZRLE 0x1
@@ -356,10 +359,11 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
    int encoded_len = 0, bytes_sent = -1;
    uint8_t *prev_cached_page;

-    if (!cache_is_cached(XBZRLE.cache, current_addr)) {
+    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
        acct_info.xbzrle_cache_miss++;
        if (!last_stage) {
-            if (cache_insert(XBZRLE.cache, current_addr, *current_data) == -1) {
+            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
+                             bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
@@ -484,15 +488,23 @@ static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)


 /* Needs iothread lock! */
+/* Fix me: there are too many global variables used in migration process. */
+static int64_t start_time;
+static int64_t bytes_xfer_prev;
+static int64_t num_dirty_pages_period;
+
+static void migration_bitmap_sync_init(void)
+{
+    start_time = 0;
+    bytes_xfer_prev = 0;
+    num_dirty_pages_period = 0;
+}

 static void migration_bitmap_sync(void)
 {
    RAMBlock *block;
    uint64_t num_dirty_pages_init = migration_dirty_pages;
    MigrationState *s = migrate_get_current();
-    static int64_t start_time;
-    static int64_t bytes_xfer_prev;
-    static int64_t num_dirty_pages_period;
    int64_t end_time;
    int64_t bytes_xfer_now;
    static uint64_t xbzrle_cache_miss_prev;
@@ -512,7 +524,7 @@ static void migration_bitmap_sync(void)
    address_space_sync_dirty_bitmap(&address_space_memory);

    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        migration_bitmap_sync_range(block->mr->ram_addr, block->length);
+        migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
    }
    trace_migration_bitmap_sync_end(migration_dirty_pages
                                    - num_dirty_pages_init);
@@ -658,7 +670,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
            offset >= last_offset) {
            break;
        }
-        if (offset >= block->length) {
+        if (offset >= block->used_length) {
            offset = 0;
            block = QTAILQ_NEXT(block, next);
            if (!block) {
@@ -717,7 +729,7 @@ uint64_t ram_bytes_total(void)
    uint64_t total = 0;

    QTAILQ_FOREACH(block, &ram_list.blocks, next)
-        total += block->length;
+        total += block->used_length;

    return total;
 }
@@ -772,6 +784,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
    mig_throttle_on = false;
    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
+    migration_bitmap_sync_init();

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
@@ -820,7 +833,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
        uint64_t block_pages;

-        block_pages = block->length >> TARGET_PAGE_BITS;
+        block_pages = block->used_length >> TARGET_PAGE_BITS;
        migration_dirty_pages += block_pages;
    }

@@ -833,7 +846,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
-        qemu_put_be64(f, block->length);
+        qemu_put_be64(f, block->used_length);
    }

    qemu_mutex_unlock_ramlist();
@@ -1004,7 +1017,7 @@ static inline void *host_from_stream_offset(QEMUFile *f,
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
-        if (!block) {
+        if (!block || block->max_length <= offset) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
@@ -1017,8 +1030,10 @@ static inline void *host_from_stream_offset(QEMUFile *f,
    id[len] = 0;

    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (!strncmp(id, block->idstr, sizeof(id)))
+        if (!strncmp(id, block->idstr, sizeof(id)) &&
+            block->max_length > offset) {
            return memory_region_get_ram_ptr(block->mr) + offset;
+        }
    }

    error_report("Can't find block %s!", id);
@@ -1038,8 +1053,7 @@ void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)

 static int ram_load(QEMUFile *f, void *opaque, int version_id)
 {
-    ram_addr_t addr;
-    int flags, ret = 0;
+    int flags = 0, ret = 0;
    static uint64_t seq_iter;

    seq_iter++;
@@ -1048,21 +1062,24 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
        ret = -EINVAL;
    }

-    while (!ret) {
-        addr = qemu_get_be64(f);
+    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+        ram_addr_t addr, total_ram_bytes;
+        void *host;
+        uint8_t ch;

+        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

-        if (flags & RAM_SAVE_FLAG_MEM_SIZE) {
+        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
+        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
-            char id[256];
-            ram_addr_t length;
-            ram_addr_t total_ram_bytes = addr;
-
-            while (total_ram_bytes) {
+            total_ram_bytes = addr;
+            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                uint8_t len;
+                char id[256];
+                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
@@ -1071,11 +1088,14 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)

                QTAILQ_FOREACH(block, &ram_list.blocks, next) {
                    if (!strncmp(id, block->idstr, sizeof(id))) {
-                        if (block->length != length) {
-                            error_report("Length mismatch: %s: " RAM_ADDR_FMT
-                                         " in != " RAM_ADDR_FMT, id, length,
-                                         block->length);
-                            ret =  -EINVAL;
+                        if (length != block->used_length) {
+                            Error *local_err = NULL;
+
+                            ret = qemu_ram_resize(block->offset, length, &local_err);
+                            if (local_err) {
+                                error_report("%s", error_get_pretty(local_err));
+                                error_free(local_err);
+                            }
                        }
                        break;
                    }
@@ -1086,16 +1106,11 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                                 "accept migration", id);
                    ret = -EINVAL;
                }
-                if (ret) {
-                    break;
-                }

                total_ram_bytes -= length;
            }
-        } else if (flags & RAM_SAVE_FLAG_COMPRESS) {
-            void *host;
-            uint8_t ch;
-
+            break;
+        case RAM_SAVE_FLAG_COMPRESS:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
@@ -1105,9 +1120,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)

            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
-        } else if (flags & RAM_SAVE_FLAG_PAGE) {
-            void *host;
-
+            break;
+        case RAM_SAVE_FLAG_PAGE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
@@ -1116,8 +1130,9 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
            }

            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
-        } else if (flags & RAM_SAVE_FLAG_XBZRLE) {
-            void *host = host_from_stream_offset(f, addr, flags);
+            break;
+        case RAM_SAVE_FLAG_XBZRLE:
+            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
@@ -1130,17 +1145,22 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                ret = -EINVAL;
                break;
            }
-        } else if (flags & RAM_SAVE_FLAG_HOOK) {
-            ram_control_load_hook(f, flags);
-        } else if (flags & RAM_SAVE_FLAG_EOS) {
+            break;
+        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
-        } else {
-            error_report("Unknown migration flags: %#x", flags);
-            ret = -EINVAL;
-            break;
+        default:
+            if (flags & RAM_SAVE_FLAG_HOOK) {
+                ram_control_load_hook(f, flags);
+            } else {
+                error_report("Unknown combination of migration flags: %#x",
+                             flags);
+                ret = -EINVAL;
+            }
+        }
+        if (!ret) {
+            ret = qemu_file_get_error(f);
        }
-        ret = qemu_file_get_error(f);
    }

    DPRINTF("Completed load of VM with exit code %d seq iteration "
@@ -1335,11 +1355,6 @@ void cpudef_init(void)
 #endif
 }

-int tcg_available(void)
-{
-    return 1;
-}
-
 int kvm_available(void)
 {
 #ifdef CONFIG_KVM
--- a/async.c
+++ b/async.c
@@ -44,10 +44,12 @@ struct QEMUBH {
 QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
 {
    QEMUBH *bh;
-    bh = g_malloc0(sizeof(QEMUBH));
-    bh->ctx = ctx;
-    bh->cb = cb;
-    bh->opaque = opaque;
+    bh = g_new(QEMUBH, 1);
+    *bh = (QEMUBH){
+        .ctx = ctx,
+        .cb = cb,
+        .opaque = opaque,
+    };
    qemu_mutex_lock(&ctx->bh_lock);
    bh->next = ctx->first_bh;
    /* Make sure that the members are ready before putting bh into list */
@@ -152,39 +154,48 @@ void qemu_bh_delete(QEMUBH *bh)
    bh->deleted = 1;
 }

-static gboolean
-aio_ctx_prepare(GSource *source, gint    *timeout)
+int64_t
+aio_compute_timeout(AioContext *ctx)
 {
-    AioContext *ctx = (AioContext *) source;
+    int64_t deadline;
+    int timeout = -1;
    QEMUBH *bh;
-    int deadline;

-    /* We assume there is no timeout already supplied */
-    *timeout = -1;
    for (bh = ctx->first_bh; bh; bh = bh->next) {
        if (!bh->deleted && bh->scheduled) {
            if (bh->idle) {
                /* idle bottom halves will be polled at least
                 * every 10ms */
-                *timeout = 10;
+                timeout = 10000000;
            } else {
                /* non-idle bottom halves will be executed
                 * immediately */
-                *timeout = 0;
-                return true;
+                return 0;
            }
        }
    }

-    deadline = qemu_timeout_ns_to_ms(timerlistgroup_deadline_ns(&ctx->tlg));
+    deadline = timerlistgroup_deadline_ns(&ctx->tlg);
    if (deadline == 0) {
-        *timeout = 0;
-        return true;
+        return 0;
    } else {
-        *timeout = qemu_soonest_timeout(*timeout, deadline);
+        return qemu_soonest_timeout(timeout, deadline);
+    }
+}
+
+static gboolean
+aio_ctx_prepare(GSource *source, gint    *timeout)
+{
+    AioContext *ctx = (AioContext *) source;
+
+    /* We assume there is no timeout already supplied */
+    *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
+
+    if (aio_prepare(ctx)) {
+        *timeout = 0;
    }

-    return false;
+    return *timeout == 0;
 }

 static gboolean
@@ -209,7 +220,7 @@ aio_ctx_dispatch(GSource     *source,
    AioContext *ctx = (AioContext *) source;

    assert(callback == NULL);
-    aio_poll(ctx, false);
+    aio_dispatch(ctx);
    return true;
 }

@@ -280,18 +291,25 @@ static void aio_rfifolock_cb(void *opaque)
    aio_notify(opaque);
 }

-AioContext *aio_context_new(void)
+AioContext *aio_context_new(Error **errp)
 {
+    int ret;
    AioContext *ctx;
    ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
+    ret = event_notifier_init(&ctx->notifier, false);
+    if (ret < 0) {
+        g_source_destroy(&ctx->source);
+        error_setg_errno(errp, -ret, "Failed to initialize event notifier");
+        return NULL;
+    }
+    g_source_set_can_recurse(&ctx->source, true);
+    aio_set_event_notifier(ctx, &ctx->notifier,
+                           (EventNotifierHandler *)
+                           event_notifier_test_and_clear);
    ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD));
    ctx->thread_pool = NULL;
    qemu_mutex_init(&ctx->bh_lock);
    rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
-    event_notifier_init(&ctx->notifier, false);
-    aio_set_event_notifier(ctx, &ctx->notifier, 
-                           (EventNotifierHandler *)
-                           event_notifier_test_and_clear);
    timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);

    return ctx;
--- a/audio/audio_template.h
+++ b/audio/audio_template.h
@@ -191,9 +191,9 @@ static void glue (audio_pcm_hw_gc_, TYPE) (HW **hwp)
        audio_detach_capture (hw);
 #endif
        QLIST_REMOVE (hw, entries);
+        glue (hw->pcm_ops->fini_, TYPE) (hw);
        glue (s->nb_hw_voices_, TYPE) += 1;
        glue (audio_pcm_hw_free_resources_ ,TYPE) (hw);
-        glue (hw->pcm_ops->fini_, TYPE) (hw);
        g_free (hw);
        *hwp = NULL;
    }
--- a/backends/Makefile.objs
+++ b/backends/Makefile.objs
@@ -1,7 +1,7 @@
 common-obj-y += rng.o rng-egd.o
 common-obj-$(CONFIG_POSIX) += rng-random.o

-common-obj-y += msmouse.o
+common-obj-y += msmouse.o testdev.o
 common-obj-$(CONFIG_BRLAPI) += baum.o
 baum.o-cflags := $(SDL_CFLAGS)

--- a/backends/baum.c
+++ b/backends/baum.c
@@ -629,7 +629,7 @@ fail_handle:

 static void register_types(void)
 {
-    register_char_driver_qapi("braille", CHARDEV_BACKEND_KIND_BRAILLE, NULL);
+    register_char_driver("braille", CHARDEV_BACKEND_KIND_BRAILLE, NULL);
 }

 type_init(register_types);
--- a/backends/hostmem-ram.c
+++ b/backends/hostmem-ram.c
@@ -27,7 +27,7 @@ ram_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)

    path = object_get_canonical_path_component(OBJECT(backend));
    memory_region_init_ram(&backend->mr, OBJECT(backend), path,
-                           backend->size);
+                           backend->size, errp);
    g_free(path);
 }

--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -257,15 +257,6 @@ static void host_memory_backend_init(Object *obj)
                        host_memory_backend_set_policy, NULL, NULL, NULL);
 }

-static void host_memory_backend_finalize(Object *obj)
-{
-    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
-
-    if (memory_region_size(&backend->mr)) {
-        memory_region_destroy(&backend->mr);
-    }
-}
-
 MemoryRegion *
 host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp)
 {
@@ -304,7 +295,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
        /* ensure policy won't be ignored in case memory is preallocated
         * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
         * this doesn't catch hugepage case. */
-        unsigned flags = MPOL_MF_STRICT;
+        unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

        /* check for invalid host-nodes and policies and give more verbose
         * error messages than mbind(). */
@@ -360,7 +351,6 @@ static const TypeInfo host_memory_backend_info = {
    .class_init = host_memory_backend_class_init,
    .instance_size = sizeof(HostMemoryBackend),
    .instance_init = host_memory_backend_init,
-    .instance_finalize = host_memory_backend_finalize,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
--- a/backends/msmouse.c
+++ b/backends/msmouse.c
@@ -79,7 +79,7 @@ CharDriverState *qemu_chr_open_msmouse(void)

 static void register_types(void)
 {
-    register_char_driver_qapi("msmouse", CHARDEV_BACKEND_KIND_MSMOUSE, NULL);
+    register_char_driver("msmouse", CHARDEV_BACKEND_KIND_MSMOUSE, NULL);
 }

 type_init(register_types);
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@@ -169,6 +169,7 @@ static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp)
    if (b->opened) {
        error_set(errp, QERR_PERMISSION_DENIED);
    } else {
+        g_free(s->chr_name);
        s->chr_name = g_strdup(value);
    }
 }
--- a/backends/rng-random.c
+++ b/backends/rng-random.c
@@ -88,11 +88,7 @@ static char *rng_random_get_filename(Object *obj, Error **errp)
 {
    RndRandom *s = RNG_RANDOM(obj);

-    if (s->filename) {
-        return g_strdup(s->filename);
-    }
-
-    return NULL;
+    return g_strdup(s->filename);
 }

 static void rng_random_set_filename(Object *obj, const char *filename,
--- a/backends/testdev.c
+++ b/backends/testdev.c
@@ -0,0 +1,131 @@
+/*
+ * QEMU Char Device for testsuite control
+ *
+ * Copyright (c) 2014 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "sysemu/char.h"
+
+#define BUF_SIZE 32
+
+typedef struct {
+    CharDriverState *chr;
+    uint8_t in_buf[32];
+    int in_buf_used;
+} TestdevCharState;
+
+/* Try to interpret a whole incoming packet */
+static int testdev_eat_packet(TestdevCharState *testdev)
+{
+    const uint8_t *cur = testdev->in_buf;
+    int len = testdev->in_buf_used;
+    uint8_t c;
+    int arg;
+
+#define EAT(c) do { \
+    if (!len--) {   \
+        return 0;   \
+    }               \
+    c = *cur++;     \
+} while (0)
+
+    EAT(c);
+
+    while (isspace(c)) {
+        EAT(c);
+    }
+
+    arg = 0;
+    while (isdigit(c)) {
+        arg = arg * 10 + c - '0';
+        EAT(c);
+    }
+
+    while (isspace(c)) {
+        EAT(c);
+    }
+
+    switch (c) {
+    case 'q':
+        exit((arg << 1) | 1);
+        break;
+    default:
+        break;
+    }
+    return cur - testdev->in_buf;
+}
+
+/* The other end is writing some data.  Store it and try to interpret */
+static int testdev_write(CharDriverState *chr, const uint8_t *buf, int len)
+{
+    TestdevCharState *testdev = chr->opaque;
+    int tocopy, eaten, orig_len = len;
+
+    while (len) {
+        /* Complete our buffer as much as possible */
+        tocopy = MIN(len, BUF_SIZE - testdev->in_buf_used);
+
+        memcpy(testdev->in_buf + testdev->in_buf_used, buf, tocopy);
+        testdev->in_buf_used += tocopy;
+        buf += tocopy;
+        len -= tocopy;
+
+        /* Interpret it as much as possible */
+        while (testdev->in_buf_used > 0 &&
+               (eaten = testdev_eat_packet(testdev)) > 0) {
+            memmove(testdev->in_buf, testdev->in_buf + eaten,
+                    testdev->in_buf_used - eaten);
+            testdev->in_buf_used -= eaten;
+        }
+    }
+    return orig_len;
+}
+
+static void testdev_close(struct CharDriverState *chr)
+{
+    TestdevCharState *testdev = chr->opaque;
+
+    g_free(testdev);
+}
+
+CharDriverState *chr_testdev_init(void)
+{
+    TestdevCharState *testdev;
+    CharDriverState *chr;
+
+    testdev = g_malloc0(sizeof(TestdevCharState));
+    testdev->chr = chr = g_malloc0(sizeof(CharDriverState));
+
+    chr->opaque = testdev;
+    chr->chr_write = testdev_write;
+    chr->chr_close = testdev_close;
+
+    return chr;
+}
+
+static void register_types(void)
+{
+    register_char_driver("testdev", CHARDEV_BACKEND_KIND_TESTDEV, NULL);
+}
+
+type_init(register_types);
--- a/block.c
+++ b/block.c
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,28 +1,28 @@
-block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
+block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
 block-obj-$(CONFIG_QUORUM) += quorum.o
 block-obj-y += parallels.o blkdebug.o blkverify.o
-block-obj-y += snapshot.o qapi.o
+block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
+block-obj-y += null.o mirror.o

-ifeq ($(CONFIG_POSIX),y)
 block-obj-y += nbd.o nbd-client.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_LIBNFS) += nfs.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
-endif
+block-obj-y += accounting.o

 common-obj-y += stream.o
 common-obj-y += commit.o
-common-obj-y += mirror.o
 common-obj-y += backup.o

 iscsi.o-cflags     := $(LIBISCSI_CFLAGS)
@@ -35,5 +35,6 @@ gluster.o-cflags   := $(GLUSTERFS_CFLAGS)
 gluster.o-libs     := $(GLUSTERFS_LIBS)
 ssh.o-cflags       := $(LIBSSH2_CFLAGS)
 ssh.o-libs         := $(LIBSSH2_LIBS)
+archipelago.o-libs := $(ARCHIPELAGO_LIBS)
 qcow.o-libs        := -lz
 linux-aio.o-libs   := -laio
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -0,0 +1,56 @@
+/*
+ * QEMU System Emulator block accounting
+ *
+ * Copyright (c) 2011 Christoph Hellwig
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/accounting.h"
+#include "block/block_int.h"
+#include "qemu/timer.h"
+
+void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
+                      int64_t bytes, enum BlockAcctType type)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    cookie->bytes = bytes;
+    cookie->start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    cookie->type = type;
+}
+
+void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie)
+{
+    assert(cookie->type < BLOCK_MAX_IOTYPE);
+
+    stats->nr_bytes[cookie->type] += cookie->bytes;
+    stats->nr_ops[cookie->type]++;
+    stats->total_time_ns[cookie->type] +=
+        qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - cookie->start_time_ns;
+}
+
+
+void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
+                               unsigned int nb_sectors)
+{
+    if (stats->wr_highest_sector < sector_num + nb_sectors - 1) {
+        stats->wr_highest_sector = sector_num + nb_sectors - 1;
+    }
+}
--- a/block/archipelago.c
+++ b/block/archipelago.c
--- a/block/backup.c
+++ b/block/backup.c
@@ -227,9 +227,25 @@ static BlockErrorAction backup_error_action(BackupBlockJob *job,
    }
 }

+typedef struct {
+    int ret;
+} BackupCompleteData;
+
+static void backup_complete(BlockJob *job, void *opaque)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    BackupCompleteData *data = opaque;
+
+    bdrv_unref(s->target);
+
+    block_job_completed(job, data->ret);
+    g_free(data);
+}
+
 static void coroutine_fn backup_run(void *opaque)
 {
    BackupBlockJob *job = opaque;
+    BackupCompleteData *data;
    BlockDriverState *bs = job->common.bs;
    BlockDriverState *target = job->target;
    BlockdevOnError on_target_error = job->on_target_error;
@@ -344,16 +360,18 @@ static void coroutine_fn backup_run(void *opaque)
    hbitmap_free(job->bitmap);

    bdrv_iostatus_disable(target);
-    bdrv_unref(target);
+    bdrv_op_unblock_all(target, job->common.blocker);

-    block_job_completed(&job->common, ret);
+    data = g_malloc(sizeof(*data));
+    data->ret = ret;
+    block_job_defer_to_main_loop(&job->common, backup_complete, data);
 }

 void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, MirrorSyncMode sync_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
-                  BlockDriverCompletionFunc *cb, void *opaque,
+                  BlockCompletionFunc *cb, void *opaque,
                  Error **errp)
 {
    int64_t len;
@@ -362,6 +380,11 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
    assert(target);
    assert(cb);

+    if (bs == target) {
+        error_setg(errp, "Source and target cannot be the same");
+        return;
+    }
+
    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
@@ -369,6 +392,26 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
        return;
    }

+    if (!bdrv_is_inserted(bs)) {
+        error_setg(errp, "Device is not inserted: %s",
+                   bdrv_get_device_name(bs));
+        return;
+    }
+
+    if (!bdrv_is_inserted(target)) {
+        error_setg(errp, "Device is not inserted: %s",
+                   bdrv_get_device_name(target));
+        return;
+    }
+
+    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
+        return;
+    }
+
+    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
+        return;
+    }
+
    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
@@ -382,6 +425,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
        return;
    }

+    bdrv_op_block_all(target, job->common.blocker);
+
    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -26,6 +26,10 @@
 #include "qemu/config-file.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qint.h"
+#include "qapi/qmp/qstring.h"

 typedef struct BDRVBlkdebugState {
    int state;
@@ -37,7 +41,7 @@ typedef struct BDRVBlkdebugState {
 } BDRVBlkdebugState;

 typedef struct BlkdebugAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
 } BlkdebugAIOCB;
@@ -48,11 +52,8 @@ typedef struct BlkdebugSuspendedReq {
    QLIST_ENTRY(BlkdebugSuspendedReq) next;
 } BlkdebugSuspendedReq;

-static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb);
-
 static const AIOCBInfo blkdebug_aiocb_info = {
-    .aiocb_size = sizeof(BlkdebugAIOCB),
-    .cancel     = blkdebug_aio_cancel,
+    .aiocb_size    = sizeof(BlkdebugAIOCB),
 };

 enum {
@@ -194,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
    [BLKDBG_PWRITEV]                        = "pwritev",
    [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
    [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
+
+    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
 };

 static int get_event_by_name(const char *name, BlkDebugEvent *event)
@@ -213,6 +216,7 @@ static int get_event_by_name(const char *name, BlkDebugEvent *event)
 struct add_rule_data {
    BDRVBlkdebugState *s;
    int action;
+    Error **errp;
 };

 static int add_rule(QemuOpts *opts, void *opaque)
@@ -225,7 +229,11 @@ static int add_rule(QemuOpts *opts, void *opaque)

    /* Find the right event for the rule */
    event_name = qemu_opt_get(opts, "event");
-    if (!event_name || get_event_by_name(event_name, &event) < 0) {
+    if (!event_name) {
+        error_setg(d->errp, "Missing event name for rule");
+        return -1;
+    } else if (get_event_by_name(event_name, &event) < 0) {
+        error_setg(d->errp, "Invalid event name \"%s\"", event_name);
        return -1;
    }

@@ -311,10 +319,21 @@ static int read_config(BDRVBlkdebugState *s, const char *filename,

    d.s = s;
    d.action = ACTION_INJECT_ERROR;
-    qemu_opts_foreach(&inject_error_opts, add_rule, &d, 0);
+    d.errp = &local_err;
+    qemu_opts_foreach(&inject_error_opts, add_rule, &d, 1);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }

    d.action = ACTION_SET_STATE;
-    qemu_opts_foreach(&set_state_opts, add_rule, &d, 0);
+    qemu_opts_foreach(&set_state_opts, add_rule, &d, 1);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }

    ret = 0;
 fail:
@@ -443,17 +462,11 @@ static void error_callback_bh(void *opaque)
    struct BlkdebugAIOCB *acb = opaque;
    qemu_bh_delete(acb->bh);
    acb->common.cb(acb->common.opaque, acb->ret);
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

-static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    BlkdebugAIOCB *acb = container_of(blockacb, BlkdebugAIOCB, common);
-    qemu_aio_release(acb);
-}
-
-static BlockDriverAIOCB *inject_error(BlockDriverState *bs,
-    BlockDriverCompletionFunc *cb, void *opaque, BlkdebugRule *rule)
+static BlockAIOCB *inject_error(BlockDriverState *bs,
+    BlockCompletionFunc *cb, void *opaque, BlkdebugRule *rule)
 {
    BDRVBlkdebugState *s = bs->opaque;
    int error = rule->options.inject.error;
@@ -478,9 +491,9 @@ static BlockDriverAIOCB *inject_error(BlockDriverState *bs,
    return &acb->common;
 }

-static BlockDriverAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-    BlockDriverCompletionFunc *cb, void *opaque)
+    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVBlkdebugState *s = bs->opaque;
    BlkdebugRule *rule = NULL;
@@ -500,9 +513,9 @@ static BlockDriverAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
    return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
 }

-static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
+static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-    BlockDriverCompletionFunc *cb, void *opaque)
+    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVBlkdebugState *s = bs->opaque;
    BlkdebugRule *rule = NULL;
@@ -522,6 +535,25 @@ static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
 }

+static BlockAIOCB *blkdebug_aio_flush(BlockDriverState *bs,
+    BlockCompletionFunc *cb, void *opaque)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugRule *rule = NULL;
+
+    QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
+        if (rule->options.inject.sector == -1) {
+            break;
+        }
+    }
+
+    if (rule && rule->options.inject.error) {
+        return inject_error(bs, cb, opaque, rule);
+    }
+
+    return bdrv_aio_flush(bs->file, cb, opaque);
+}
+

 static void blkdebug_close(BlockDriverState *bs)
 {
@@ -687,6 +719,55 @@ static int64_t blkdebug_getlength(BlockDriverState *bs)
    return bdrv_getlength(bs->file);
 }

+static void blkdebug_refresh_filename(BlockDriverState *bs)
+{
+    QDict *opts;
+    const QDictEntry *e;
+    bool force_json = false;
+
+    for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
+        if (strcmp(qdict_entry_key(e), "config") &&
+            strcmp(qdict_entry_key(e), "x-image") &&
+            strcmp(qdict_entry_key(e), "image") &&
+            strncmp(qdict_entry_key(e), "image.", strlen("image.")))
+        {
+            force_json = true;
+            break;
+        }
+    }
+
+    if (force_json && !bs->file->full_open_options) {
+        /* The config file cannot be recreated, so creating a plain filename
+         * is impossible */
+        return;
+    }
+
+    if (!force_json && bs->file->exact_filename[0]) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "blkdebug:%s:%s",
+                 qdict_get_try_str(bs->options, "config") ?: "",
+                 bs->file->exact_filename);
+    }
+
+    opts = qdict_new();
+    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkdebug")));
+
+    QINCREF(bs->file->full_open_options);
+    qdict_put_obj(opts, "image", QOBJECT(bs->file->full_open_options));
+
+    for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
+        if (strcmp(qdict_entry_key(e), "x-image") &&
+            strcmp(qdict_entry_key(e), "image") &&
+            strncmp(qdict_entry_key(e), "image.", strlen("image.")))
+        {
+            qobject_incref(qdict_entry_value(e));
+            qdict_put_obj(opts, qdict_entry_key(e), qdict_entry_value(e));
+        }
+    }
+
+    bs->full_open_options = opts;
+}
+
 static BlockDriver bdrv_blkdebug = {
    .format_name            = "blkdebug",
    .protocol_name          = "blkdebug",
@@ -696,9 +777,11 @@ static BlockDriver bdrv_blkdebug = {
    .bdrv_file_open         = blkdebug_open,
    .bdrv_close             = blkdebug_close,
    .bdrv_getlength         = blkdebug_getlength,
+    .bdrv_refresh_filename  = blkdebug_refresh_filename,

    .bdrv_aio_readv         = blkdebug_aio_readv,
    .bdrv_aio_writev        = blkdebug_aio_writev,
+    .bdrv_aio_flush         = blkdebug_aio_flush,

    .bdrv_debug_event           = blkdebug_debug_event,
    .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint,
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -10,6 +10,8 @@
 #include <stdarg.h>
 #include "qemu/sockets.h" /* for EINPROGRESS on Windows */
 #include "block/block_int.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"

 typedef struct {
    BlockDriverState *test_file;
@@ -17,7 +19,7 @@ typedef struct {

 typedef struct BlkverifyAIOCB BlkverifyAIOCB;
 struct BlkverifyAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;

    /* Request metadata */
@@ -27,7 +29,6 @@ struct BlkverifyAIOCB {

    int ret;                    /* first completed request's result */
    unsigned int done;          /* completion counter */
-    bool *finished;             /* completion signal for cancel */

    QEMUIOVector *qiov;         /* user I/O vector */
    QEMUIOVector raw_qiov;      /* cloned I/O vector for raw file */
@@ -36,22 +37,8 @@ struct BlkverifyAIOCB {
    void (*verify)(BlkverifyAIOCB *acb);
 };

-static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    BlkverifyAIOCB *acb = (BlkverifyAIOCB *)blockacb;
-    AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
-    bool finished = false;
-
-    /* Wait until request completes, invokes its callback, and frees itself */
-    acb->finished = &finished;
-    while (!finished) {
-        aio_poll(aio_context, true);
-    }
-}
-
 static const AIOCBInfo blkverify_aiocb_info = {
    .aiocb_size         = sizeof(BlkverifyAIOCB),
-    .cancel             = blkverify_aio_cancel,
 };

 static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb,
@@ -156,6 +143,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,

    ret = 0;
 fail:
+    qemu_opts_del(opts);
    return ret;
 }

@@ -177,7 +165,7 @@ static int64_t blkverify_getlength(BlockDriverState *bs)
 static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
                                         int64_t sector_num, QEMUIOVector *qiov,
                                         int nb_sectors,
-                                         BlockDriverCompletionFunc *cb,
+                                         BlockCompletionFunc *cb,
                                         void *opaque)
 {
    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque);
@@ -191,7 +179,6 @@ static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
    acb->qiov = qiov;
    acb->buf = NULL;
    acb->verify = NULL;
-    acb->finished = NULL;
    return acb;
 }

@@ -205,10 +192,7 @@ static void blkverify_aio_bh(void *opaque)
        qemu_vfree(acb->buf);
    }
    acb->common.cb(acb->common.opaque, acb->ret);
-    if (acb->finished) {
-        *acb->finished = true;
-    }
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

 static void blkverify_aio_cb(void *opaque, int ret)
@@ -245,9 +229,9 @@ static void blkverify_verify_readv(BlkverifyAIOCB *acb)
    }
 }

-static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    BDRVBlkverifyState *s = bs->opaque;
    BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov,
@@ -265,9 +249,9 @@ static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs,
    return &acb->common;
 }

-static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs,
+static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    BDRVBlkverifyState *s = bs->opaque;
    BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov,
@@ -280,9 +264,9 @@ static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs,
    return &acb->common;
 }

-static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs,
-                                             BlockDriverCompletionFunc *cb,
-                                             void *opaque)
+static BlockAIOCB *blkverify_aio_flush(BlockDriverState *bs,
+                                       BlockCompletionFunc *cb,
+                                       void *opaque)
 {
    BDRVBlkverifyState *s = bs->opaque;

@@ -320,6 +304,32 @@ static void blkverify_attach_aio_context(BlockDriverState *bs,
    bdrv_attach_aio_context(s->test_file, new_context);
 }

+static void blkverify_refresh_filename(BlockDriverState *bs)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    /* bs->file has already been refreshed */
+    bdrv_refresh_filename(s->test_file);
+
+    if (bs->file->full_open_options && s->test_file->full_open_options) {
+        QDict *opts = qdict_new();
+        qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkverify")));
+
+        QINCREF(bs->file->full_open_options);
+        qdict_put_obj(opts, "raw", QOBJECT(bs->file->full_open_options));
+        QINCREF(s->test_file->full_open_options);
+        qdict_put_obj(opts, "test", QOBJECT(s->test_file->full_open_options));
+
+        bs->full_open_options = opts;
+    }
+
+    if (bs->file->exact_filename[0] && s->test_file->exact_filename[0]) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "blkverify:%s:%s",
+                 bs->file->exact_filename, s->test_file->exact_filename);
+    }
+}
+
 static BlockDriver bdrv_blkverify = {
    .format_name                      = "blkverify",
    .protocol_name                    = "blkverify",
@@ -329,6 +339,7 @@ static BlockDriver bdrv_blkverify = {
    .bdrv_file_open                   = blkverify_open,
    .bdrv_close                       = blkverify_close,
    .bdrv_getlength                   = blkverify_getlength,
+    .bdrv_refresh_filename            = blkverify_refresh_filename,

    .bdrv_aio_readv                   = blkverify_aio_readv,
    .bdrv_aio_writev                  = blkverify_aio_writev,
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -0,0 +1,665 @@
+/*
+ * QEMU Block backends
+ *
+ * Copyright (C) 2014 Red Hat, Inc.
+ *
+ * Authors:
+ *  Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1
+ * or later.  See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "sysemu/block-backend.h"
+#include "block/block_int.h"
+#include "sysemu/blockdev.h"
+#include "qapi-event.h"
+
+/* Number of coroutines to reserve per attached device model */
+#define COROUTINE_POOL_RESERVATION 64
+
+struct BlockBackend {
+    char *name;
+    int refcnt;
+    BlockDriverState *bs;
+    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
+    QTAILQ_ENTRY(BlockBackend) link; /* for blk_backends */
+
+    void *dev;                  /* attached device model, if any */
+    /* TODO change to DeviceState when all users are qdevified */
+    const BlockDevOps *dev_ops;
+    void *dev_opaque;
+};
+
+static void drive_info_del(DriveInfo *dinfo);
+
+/* All the BlockBackends (except for hidden ones) */
+static QTAILQ_HEAD(, BlockBackend) blk_backends =
+    QTAILQ_HEAD_INITIALIZER(blk_backends);
+
+/*
+ * Create a new BlockBackend with @name, with a reference count of one.
+ * @name must not be null or empty.
+ * Fail if a BlockBackend with this name already exists.
+ * Store an error through @errp on failure, unless it's null.
+ * Return the new BlockBackend on success, null on failure.
+ */
+BlockBackend *blk_new(const char *name, Error **errp)
+{
+    BlockBackend *blk;
+
+    assert(name && name[0]);
+    if (!id_wellformed(name)) {
+        error_setg(errp, "Invalid device name");
+        return NULL;
+    }
+    if (blk_by_name(name)) {
+        error_setg(errp, "Device with id '%s' already exists", name);
+        return NULL;
+    }
+    if (bdrv_find_node(name)) {
+        error_setg(errp,
+                   "Device name '%s' conflicts with an existing node name",
+                   name);
+        return NULL;
+    }
+
+    blk = g_new0(BlockBackend, 1);
+    blk->name = g_strdup(name);
+    blk->refcnt = 1;
+    QTAILQ_INSERT_TAIL(&blk_backends, blk, link);
+    return blk;
+}
+
+/*
+ * Create a new BlockBackend with a new BlockDriverState attached.
+ * Otherwise just like blk_new(), which see.
+ */
+BlockBackend *blk_new_with_bs(const char *name, Error **errp)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs;
+
+    blk = blk_new(name, errp);
+    if (!blk) {
+        return NULL;
+    }
+
+    bs = bdrv_new_root();
+    blk->bs = bs;
+    bs->blk = blk;
+    return blk;
+}
+
+static void blk_delete(BlockBackend *blk)
+{
+    assert(!blk->refcnt);
+    assert(!blk->dev);
+    if (blk->bs) {
+        assert(blk->bs->blk == blk);
+        blk->bs->blk = NULL;
+        bdrv_unref(blk->bs);
+        blk->bs = NULL;
+    }
+    /* Avoid double-remove after blk_hide_on_behalf_of_do_drive_del() */
+    if (blk->name[0]) {
+        QTAILQ_REMOVE(&blk_backends, blk, link);
+    }
+    g_free(blk->name);
+    drive_info_del(blk->legacy_dinfo);
+    g_free(blk);
+}
+
+static void drive_info_del(DriveInfo *dinfo)
+{
+    if (!dinfo) {
+        return;
+    }
+    qemu_opts_del(dinfo->opts);
+    g_free(dinfo->serial);
+    g_free(dinfo);
+}
+
+/*
+ * Increment @blk's reference count.
+ * @blk must not be null.
+ */
+void blk_ref(BlockBackend *blk)
+{
+    blk->refcnt++;
+}
+
+/*
+ * Decrement @blk's reference count.
+ * If this drops it to zero, destroy @blk.
+ * For convenience, do nothing if @blk is null.
+ */
+void blk_unref(BlockBackend *blk)
+{
+    if (blk) {
+        assert(blk->refcnt > 0);
+        if (!--blk->refcnt) {
+            blk_delete(blk);
+        }
+    }
+}
+
+/*
+ * Return the BlockBackend after @blk.
+ * If @blk is null, return the first one.
+ * Else, return @blk's next sibling, which may be null.
+ *
+ * To iterate over all BlockBackends, do
+ * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
+ *     ...
+ * }
+ */
+BlockBackend *blk_next(BlockBackend *blk)
+{
+    return blk ? QTAILQ_NEXT(blk, link) : QTAILQ_FIRST(&blk_backends);
+}
+
+/*
+ * Return @blk's name, a non-null string.
+ * Wart: the name is empty iff @blk has been hidden with
+ * blk_hide_on_behalf_of_do_drive_del().
+ */
+const char *blk_name(BlockBackend *blk)
+{
+    return blk->name;
+}
+
+/*
+ * Return the BlockBackend with name @name if it exists, else null.
+ * @name must not be null.
+ */
+BlockBackend *blk_by_name(const char *name)
+{
+    BlockBackend *blk;
+
+    assert(name);
+    QTAILQ_FOREACH(blk, &blk_backends, link) {
+        if (!strcmp(name, blk->name)) {
+            return blk;
+        }
+    }
+    return NULL;
+}
+
+/*
+ * Return the BlockDriverState attached to @blk if any, else null.
+ */
+BlockDriverState *blk_bs(BlockBackend *blk)
+{
+    return blk->bs;
+}
+
+/*
+ * Return @blk's DriveInfo if any, else null.
+ */
+DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
+{
+    return blk->legacy_dinfo;
+}
+
+/*
+ * Set @blk's DriveInfo to @dinfo, and return it.
+ * @blk must not have a DriveInfo set already.
+ * No other BlockBackend may have the same DriveInfo set.
+ */
+DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
+{
+    assert(!blk->legacy_dinfo);
+    return blk->legacy_dinfo = dinfo;
+}
+
+/*
+ * Return the BlockBackend with DriveInfo @dinfo.
+ * It must exist.
+ */
+BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
+{
+    BlockBackend *blk;
+
+    QTAILQ_FOREACH(blk, &blk_backends, link) {
+        if (blk->legacy_dinfo == dinfo) {
+            return blk;
+        }
+    }
+    abort();
+}
+
+/*
+ * Hide @blk.
+ * @blk must not have been hidden already.
+ * Make attached BlockDriverState, if any, anonymous.
+ * Once hidden, @blk is invisible to all functions that don't receive
+ * it as argument.  For example, blk_by_name() won't return it.
+ * Strictly for use by do_drive_del().
+ * TODO get rid of it!
+ */
+void blk_hide_on_behalf_of_do_drive_del(BlockBackend *blk)
+{
+    QTAILQ_REMOVE(&blk_backends, blk, link);
+    blk->name[0] = 0;
+    if (blk->bs) {
+        bdrv_make_anon(blk->bs);
+    }
+}
+
+/*
+ * Attach device model @dev to @blk.
+ * Return 0 on success, -EBUSY when a device model is attached already.
+ */
+int blk_attach_dev(BlockBackend *blk, void *dev)
+/* TODO change to DeviceState *dev when all users are qdevified */
+{
+    if (blk->dev) {
+        return -EBUSY;
+    }
+    blk_ref(blk);
+    blk->dev = dev;
+    bdrv_iostatus_reset(blk->bs);
+    return 0;
+}
+
+/*
+ * Attach device model @dev to @blk.
+ * @blk must not have a device model attached already.
+ * TODO qdevified devices don't use this, remove when devices are qdevified
+ */
+void blk_attach_dev_nofail(BlockBackend *blk, void *dev)
+{
+    if (blk_attach_dev(blk, dev) < 0) {
+        abort();
+    }
+}
+
+/*
+ * Detach device model @dev from @blk.
+ * @dev must be currently attached to @blk.
+ */
+void blk_detach_dev(BlockBackend *blk, void *dev)
+/* TODO change to DeviceState *dev when all users are qdevified */
+{
+    assert(blk->dev == dev);
+    blk->dev = NULL;
+    blk->dev_ops = NULL;
+    blk->dev_opaque = NULL;
+    bdrv_set_guest_block_size(blk->bs, 512);
+    blk_unref(blk);
+}
+
+/*
+ * Return the device model attached to @blk if any, else null.
+ */
+void *blk_get_attached_dev(BlockBackend *blk)
+/* TODO change to return DeviceState * when all users are qdevified */
+{
+    return blk->dev;
+}
+
+/*
+ * Set @blk's device model callbacks to @ops.
+ * @opaque is the opaque argument to pass to the callbacks.
+ * This is for use by device models.
+ */
+void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
+                     void *opaque)
+{
+    blk->dev_ops = ops;
+    blk->dev_opaque = opaque;
+}
+
+/*
+ * Notify @blk's attached device model of media change.
+ * If @load is true, notify of media load.
+ * Else, notify of media eject.
+ * Also send DEVICE_TRAY_MOVED events as appropriate.
+ */
+void blk_dev_change_media_cb(BlockBackend *blk, bool load)
+{
+    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
+        bool tray_was_closed = !blk_dev_is_tray_open(blk);
+
+        blk->dev_ops->change_media_cb(blk->dev_opaque, load);
+        if (tray_was_closed) {
+            /* tray open */
+            qapi_event_send_device_tray_moved(blk_name(blk),
+                                              true, &error_abort);
+        }
+        if (load) {
+            /* tray close */
+            qapi_event_send_device_tray_moved(blk_name(blk),
+                                              false, &error_abort);
+        }
+    }
+}
+
+/*
+ * Does @blk's attached device model have removable media?
+ * %true if no device model is attached.
+ */
+bool blk_dev_has_removable_media(BlockBackend *blk)
+{
+    return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
+}
+
+/*
+ * Notify @blk's attached device model of a media eject request.
+ * If @force is true, the medium is about to be yanked out forcefully.
+ */
+void blk_dev_eject_request(BlockBackend *blk, bool force)
+{
+    if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
+        blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
+    }
+}
+
+/*
+ * Does @blk's attached device model have a tray, and is it open?
+ */
+bool blk_dev_is_tray_open(BlockBackend *blk)
+{
+    if (blk->dev_ops && blk->dev_ops->is_tray_open) {
+        return blk->dev_ops->is_tray_open(blk->dev_opaque);
+    }
+    return false;
+}
+
+/*
+ * Does @blk's attached device model have the medium locked?
+ * %false if the device model has no such lock.
+ */
+bool blk_dev_is_medium_locked(BlockBackend *blk)
+{
+    if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
+        return blk->dev_ops->is_medium_locked(blk->dev_opaque);
+    }
+    return false;
+}
+
+/*
+ * Notify @blk's attached device model of a backend size change.
+ */
+void blk_dev_resize_cb(BlockBackend *blk)
+{
+    if (blk->dev_ops && blk->dev_ops->resize_cb) {
+        blk->dev_ops->resize_cb(blk->dev_opaque);
+    }
+}
+
+void blk_iostatus_enable(BlockBackend *blk)
+{
+    bdrv_iostatus_enable(blk->bs);
+}
+
+int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
+             int nb_sectors)
+{
+    return bdrv_read(blk->bs, sector_num, buf, nb_sectors);
+}
+
+int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
+                         int nb_sectors)
+{
+    return bdrv_read_unthrottled(blk->bs, sector_num, buf, nb_sectors);
+}
+
+int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
+              int nb_sectors)
+{
+    return bdrv_write(blk->bs, sector_num, buf, nb_sectors);
+}
+
+BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
+                                 int nb_sectors, BdrvRequestFlags flags,
+                                 BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_write_zeroes(blk->bs, sector_num, nb_sectors, flags,
+                                 cb, opaque);
+}
+
+int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
+{
+    return bdrv_pread(blk->bs, offset, buf, count);
+}
+
+int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count)
+{
+    return bdrv_pwrite(blk->bs, offset, buf, count);
+}
+
+int64_t blk_getlength(BlockBackend *blk)
+{
+    return bdrv_getlength(blk->bs);
+}
+
+void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
+{
+    bdrv_get_geometry(blk->bs, nb_sectors_ptr);
+}
+
+BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
+                          QEMUIOVector *iov, int nb_sectors,
+                          BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_readv(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
+}
+
+BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
+                           QEMUIOVector *iov, int nb_sectors,
+                           BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_writev(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
+}
+
+BlockAIOCB *blk_aio_flush(BlockBackend *blk,
+                          BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_flush(blk->bs, cb, opaque);
+}
+
+BlockAIOCB *blk_aio_discard(BlockBackend *blk,
+                            int64_t sector_num, int nb_sectors,
+                            BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_discard(blk->bs, sector_num, nb_sectors, cb, opaque);
+}
+
+void blk_aio_cancel(BlockAIOCB *acb)
+{
+    bdrv_aio_cancel(acb);
+}
+
+void blk_aio_cancel_async(BlockAIOCB *acb)
+{
+    bdrv_aio_cancel_async(acb);
+}
+
+int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)
+{
+    return bdrv_aio_multiwrite(blk->bs, reqs, num_reqs);
+}
+
+int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
+{
+    return bdrv_ioctl(blk->bs, req, buf);
+}
+
+BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
+                          BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_ioctl(blk->bs, req, buf, cb, opaque);
+}
+
+int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+{
+    return bdrv_co_discard(blk->bs, sector_num, nb_sectors);
+}
+
+int blk_co_flush(BlockBackend *blk)
+{
+    return bdrv_co_flush(blk->bs);
+}
+
+int blk_flush(BlockBackend *blk)
+{
+    return bdrv_flush(blk->bs);
+}
+
+int blk_flush_all(void)
+{
+    return bdrv_flush_all();
+}
+
+void blk_drain_all(void)
+{
+    bdrv_drain_all();
+}
+
+BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
+{
+    return bdrv_get_on_error(blk->bs, is_read);
+}
+
+BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
+                                      int error)
+{
+    return bdrv_get_error_action(blk->bs, is_read, error);
+}
+
+void blk_error_action(BlockBackend *blk, BlockErrorAction action,
+                      bool is_read, int error)
+{
+    bdrv_error_action(blk->bs, action, is_read, error);
+}
+
+int blk_is_read_only(BlockBackend *blk)
+{
+    return bdrv_is_read_only(blk->bs);
+}
+
+int blk_is_sg(BlockBackend *blk)
+{
+    return bdrv_is_sg(blk->bs);
+}
+
+int blk_enable_write_cache(BlockBackend *blk)
+{
+    return bdrv_enable_write_cache(blk->bs);
+}
+
+void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
+{
+    bdrv_set_enable_write_cache(blk->bs, wce);
+}
+
+void blk_invalidate_cache(BlockBackend *blk, Error **errp)
+{
+    bdrv_invalidate_cache(blk->bs, errp);
+}
+
+int blk_is_inserted(BlockBackend *blk)
+{
+    return bdrv_is_inserted(blk->bs);
+}
+
+void blk_lock_medium(BlockBackend *blk, bool locked)
+{
+    bdrv_lock_medium(blk->bs, locked);
+}
+
+void blk_eject(BlockBackend *blk, bool eject_flag)
+{
+    bdrv_eject(blk->bs, eject_flag);
+}
+
+int blk_get_flags(BlockBackend *blk)
+{
+    return bdrv_get_flags(blk->bs);
+}
+
+void blk_set_guest_block_size(BlockBackend *blk, int align)
+{
+    bdrv_set_guest_block_size(blk->bs, align);
+}
+
+void *blk_blockalign(BlockBackend *blk, size_t size)
+{
+    return qemu_blockalign(blk ? blk->bs : NULL, size);
+}
+
+bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
+{
+    return bdrv_op_is_blocked(blk->bs, op, errp);
+}
+
+void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
+{
+    bdrv_op_unblock(blk->bs, op, reason);
+}
+
+void blk_op_block_all(BlockBackend *blk, Error *reason)
+{
+    bdrv_op_block_all(blk->bs, reason);
+}
+
+void blk_op_unblock_all(BlockBackend *blk, Error *reason)
+{
+    bdrv_op_unblock_all(blk->bs, reason);
+}
+
+AioContext *blk_get_aio_context(BlockBackend *blk)
+{
+    return bdrv_get_aio_context(blk->bs);
+}
+
+void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
+{
+    bdrv_set_aio_context(blk->bs, new_context);
+}
+
+void blk_add_aio_context_notifier(BlockBackend *blk,
+        void (*attached_aio_context)(AioContext *new_context, void *opaque),
+        void (*detach_aio_context)(void *opaque), void *opaque)
+{
+    bdrv_add_aio_context_notifier(blk->bs, attached_aio_context,
+                                  detach_aio_context, opaque);
+}
+
+void blk_remove_aio_context_notifier(BlockBackend *blk,
+                                     void (*attached_aio_context)(AioContext *,
+                                                                  void *),
+                                     void (*detach_aio_context)(void *),
+                                     void *opaque)
+{
+    bdrv_remove_aio_context_notifier(blk->bs, attached_aio_context,
+                                     detach_aio_context, opaque);
+}
+
+void blk_add_close_notifier(BlockBackend *blk, Notifier *notify)
+{
+    bdrv_add_close_notifier(blk->bs, notify);
+}
+
+void blk_io_plug(BlockBackend *blk)
+{
+    bdrv_io_plug(blk->bs);
+}
+
+void blk_io_unplug(BlockBackend *blk)
+{
+    bdrv_io_unplug(blk->bs);
+}
+
+BlockAcctStats *blk_get_stats(BlockBackend *blk)
+{
+    return bdrv_get_stats(blk->bs);
+}
+
+void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
+                  BlockCompletionFunc *cb, void *opaque)
+{
+    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
+}
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -131,7 +131,11 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
        return -EFBIG;
    }

-    s->catalog_bitmap = g_malloc(s->catalog_size * 4);
+    s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size);
+    if (s->catalog_size && s->catalog_bitmap == NULL) {
+        error_setg(errp, "Could not allocate memory for catalog");
+        return -ENOMEM;
+    }

    ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
                     s->catalog_size * 4);
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -116,7 +116,12 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
                   "try increasing block size");
        return -EINVAL;
    }
-    s->offsets = g_malloc(offsets_size);
+
+    s->offsets = g_try_malloc(offsets_size);
+    if (s->offsets == NULL) {
+        error_setg(errp, "Could not allocate offsets table");
+        return -ENOMEM;
+    }

    ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
    if (ret < 0) {
@@ -158,8 +163,20 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* initialize zlib engine */
-    s->compressed_block = g_malloc(max_compressed_block_size + 1);
-    s->uncompressed_block = g_malloc(s->block_size);
+    s->compressed_block = g_try_malloc(max_compressed_block_size + 1);
+    if (s->compressed_block == NULL) {
+        error_setg(errp, "Could not allocate compressed_block");
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    s->uncompressed_block = g_try_malloc(s->block_size);
+    if (s->uncompressed_block == NULL) {
+        error_setg(errp, "Could not allocate uncompressed_block");
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    if (inflateInit(&s->zstream) != Z_OK) {
        ret = -EINVAL;
        goto fail;
--- a/block/commit.c
+++ b/block/commit.c
@@ -60,17 +60,50 @@ static int coroutine_fn commit_populate(BlockDriverState *bs,
    return 0;
 }

-static void coroutine_fn commit_run(void *opaque)
+typedef struct {
+    int ret;
+} CommitCompleteData;
+
+static void commit_complete(BlockJob *job, void *opaque)
 {
-    CommitBlockJob *s = opaque;
+    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
+    CommitCompleteData *data = opaque;
    BlockDriverState *active = s->active;
    BlockDriverState *top = s->top;
    BlockDriverState *base = s->base;
    BlockDriverState *overlay_bs;
+    int ret = data->ret;
+
+    if (!block_job_is_cancelled(&s->common) && ret == 0) {
+        /* success */
+        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
+    }
+
+    /* restore base open flags here if appropriate (e.g., change the base back
+     * to r/o). These reopens do not need to be atomic, since we won't abort
+     * even on failure here */
+    if (s->base_flags != bdrv_get_flags(base)) {
+        bdrv_reopen(base, s->base_flags, NULL);
+    }
+    overlay_bs = bdrv_find_overlay(active, top);
+    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
+    }
+    g_free(s->backing_file_str);
+    block_job_completed(&s->common, ret);
+    g_free(data);
+}
+
+static void coroutine_fn commit_run(void *opaque)
+{
+    CommitBlockJob *s = opaque;
+    CommitCompleteData *data;
+    BlockDriverState *top = s->top;
+    BlockDriverState *base = s->base;
    int64_t sector_num, end;
    int ret = 0;
    int n = 0;
-    void *buf;
+    void *buf = NULL;
    int bytes_written = 0;
    int64_t base_len;

@@ -78,18 +111,18 @@ static void coroutine_fn commit_run(void *opaque)


    if (s->common.len < 0) {
-        goto exit_restore_reopen;
+        goto out;
    }

    ret = base_len = bdrv_getlength(base);
    if (base_len < 0) {
-        goto exit_restore_reopen;
+        goto out;
    }

    if (base_len < s->common.len) {
        ret = bdrv_truncate(base, s->common.len);
        if (ret) {
-            goto exit_restore_reopen;
+            goto out;
        }
    }

@@ -128,7 +161,7 @@ wait:
            if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
                s->on_error == BLOCKDEV_ON_ERROR_REPORT||
                (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
-                goto exit_free_buf;
+                goto out;
            } else {
                n = 0;
                continue;
@@ -140,27 +173,12 @@ wait:

    ret = 0;

-    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
-        /* success */
-        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
-    }
-
-exit_free_buf:
+out:
    qemu_vfree(buf);

-exit_restore_reopen:
-    /* restore base open flags here if appropriate (e.g., change the base back
-     * to r/o). These reopens do not need to be atomic, since we won't abort
-     * even on failure here */
-    if (s->base_flags != bdrv_get_flags(base)) {
-        bdrv_reopen(base, s->base_flags, NULL);
-    }
-    overlay_bs = bdrv_find_overlay(active, top);
-    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
-        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
-    }
-    g_free(s->backing_file_str);
-    block_job_completed(&s->common, ret);
+    data = g_malloc(sizeof(*data));
+    data->ret = ret;
+    block_job_defer_to_main_loop(&s->common, commit_complete, data);
 }

 static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -182,7 +200,7 @@ static const BlockJobDriver commit_job_driver = {

 void commit_start(BlockDriverState *bs, BlockDriverState *base,
                  BlockDriverState *top, int64_t speed,
-                  BlockdevOnError on_error, BlockDriverCompletionFunc *cb,
+                  BlockdevOnError on_error, BlockCompletionFunc *cb,
                  void *opaque, const char *backing_file_str, Error **errp)
 {
    CommitBlockJob *s;
--- a/block/cow.c
+++ b/block/cow.c
@@ -1,432 +0,0 @@
-/*
- * Block driver for the COW format
- *
- * Copyright (c) 2004 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include "qemu-common.h"
-#include "block/block_int.h"
-#include "qemu/module.h"
-
-/**************************************************************/
-/* COW block driver using file system holes */
-
-/* user mode linux compatible COW file */
-#define COW_MAGIC 0x4f4f4f4d  /* MOOO */
-#define COW_VERSION 2
-
-struct cow_header_v2 {
-    uint32_t magic;
-    uint32_t version;
-    char backing_file[1024];
-    int32_t mtime;
-    uint64_t size;
-    uint32_t sectorsize;
-};
-
-typedef struct BDRVCowState {
-    CoMutex lock;
-    int64_t cow_sectors_offset;
-} BDRVCowState;
-
-static int cow_probe(const uint8_t *buf, int buf_size, const char *filename)
-{
-    const struct cow_header_v2 *cow_header = (const void *)buf;
-
-    if (buf_size >= sizeof(struct cow_header_v2) &&
-        be32_to_cpu(cow_header->magic) == COW_MAGIC &&
-        be32_to_cpu(cow_header->version) == COW_VERSION)
-        return 100;
-    else
-        return 0;
-}
-
-static int cow_open(BlockDriverState *bs, QDict *options, int flags,
-                    Error **errp)
-{
-    BDRVCowState *s = bs->opaque;
-    struct cow_header_v2 cow_header;
-    int bitmap_size;
-    int64_t size;
-    int ret;
-
-    /* see if it is a cow image */
-    ret = bdrv_pread(bs->file, 0, &cow_header, sizeof(cow_header));
-    if (ret < 0) {
-        goto fail;
-    }
-
-    if (be32_to_cpu(cow_header.magic) != COW_MAGIC) {
-        error_setg(errp, "Image not in COW format");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    if (be32_to_cpu(cow_header.version) != COW_VERSION) {
-        char version[64];
-        snprintf(version, sizeof(version),
-               "COW version %" PRIu32, cow_header.version);
-        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-            bs->device_name, "cow", version);
-        ret = -ENOTSUP;
-        goto fail;
-    }
-
-    /* cow image found */
-    size = be64_to_cpu(cow_header.size);
-    bs->total_sectors = size / 512;
-
-    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
-            cow_header.backing_file);
-
-    bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header);
-    s->cow_sectors_offset = (bitmap_size + 511) & ~511;
-    qemu_co_mutex_init(&s->lock);
-    return 0;
- fail:
-    return ret;
-}
-
-static inline void cow_set_bits(uint8_t *bitmap, int start, int64_t nb_sectors)
-{
-    int64_t bitnum = start, last = start + nb_sectors;
-    while (bitnum < last) {
-        if ((bitnum & 7) == 0 && bitnum + 8 <= last) {
-            bitmap[bitnum / 8] = 0xFF;
-            bitnum += 8;
-            continue;
-        }
-        bitmap[bitnum/8] |= (1 << (bitnum % 8));
-        bitnum++;
-    }
-}
-
-#define BITS_PER_BITMAP_SECTOR (512 * 8)
-
-/* Cannot use bitmap.c on big-endian machines.  */
-static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap)
-{
-    return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0;
-}
-
-static int cow_find_streak(const uint8_t *bitmap, int value, int start, int nb_sectors)
-{
-    int streak_value = value ? 0xFF : 0;
-    int last = MIN(start + nb_sectors, BITS_PER_BITMAP_SECTOR);
-    int bitnum = start;
-    while (bitnum < last) {
-        if ((bitnum & 7) == 0 && bitmap[bitnum / 8] == streak_value) {
-            bitnum += 8;
-            continue;
-        }
-        if (cow_test_bit(bitnum, bitmap) == value) {
-            bitnum++;
-            continue;
-        }
-        break;
-    }
-    return MIN(bitnum, last) - start;
-}
-
-/* Return true if first block has been changed (ie. current version is
- * in COW file).  Set the number of continuous blocks for which that
- * is true. */
-static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *num_same)
-{
-    int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8;
-    uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE;
-    bool first = true;
-    int changed = 0, same = 0;
-
-    do {
-        int ret;
-        uint8_t bitmap[BDRV_SECTOR_SIZE];
-
-        bitnum &= BITS_PER_BITMAP_SECTOR - 1;
-        int sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum);
-
-        ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
-        if (ret < 0) {
-            return ret;
-        }
-
-        if (first) {
-            changed = cow_test_bit(bitnum, bitmap);
-            first = false;
-        }
-
-        same += cow_find_streak(bitmap, changed, bitnum, nb_sectors);
-
-        bitnum += sector_bits;
-        nb_sectors -= sector_bits;
-        offset += BDRV_SECTOR_SIZE;
-    } while (nb_sectors);
-
-    *num_same = same;
-    return changed;
-}
-
-static int64_t coroutine_fn cow_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *num_same)
-{
-    BDRVCowState *s = bs->opaque;
-    int ret = cow_co_is_allocated(bs, sector_num, nb_sectors, num_same);
-    int64_t offset = s->cow_sectors_offset + (sector_num << BDRV_SECTOR_BITS);
-    if (ret < 0) {
-        return ret;
-    }
-    return (ret ? BDRV_BLOCK_DATA : 0) | offset | BDRV_BLOCK_OFFSET_VALID;
-}
-
-static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
-        int nb_sectors)
-{
-    int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8;
-    uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE;
-    bool first = true;
-    int sector_bits;
-
-    for ( ; nb_sectors;
-            bitnum += sector_bits,
-            nb_sectors -= sector_bits,
-            offset += BDRV_SECTOR_SIZE) {
-        int ret, set;
-        uint8_t bitmap[BDRV_SECTOR_SIZE];
-
-        bitnum &= BITS_PER_BITMAP_SECTOR - 1;
-        sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum);
-
-        ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
-        if (ret < 0) {
-            return ret;
-        }
-
-        /* Skip over any already set bits */
-        set = cow_find_streak(bitmap, 1, bitnum, sector_bits);
-        bitnum += set;
-        sector_bits -= set;
-        nb_sectors -= set;
-        if (!sector_bits) {
-            continue;
-        }
-
-        if (first) {
-            ret = bdrv_flush(bs->file);
-            if (ret < 0) {
-                return ret;
-            }
-            first = false;
-        }
-
-        cow_set_bits(bitmap, bitnum, sector_bits);
-
-        ret = bdrv_pwrite(bs->file, offset, &bitmap, sizeof(bitmap));
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
-    return 0;
-}
-
-static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
-                                 uint8_t *buf, int nb_sectors)
-{
-    BDRVCowState *s = bs->opaque;
-    int ret, n;
-
-    while (nb_sectors > 0) {
-        ret = cow_co_is_allocated(bs, sector_num, nb_sectors, &n);
-        if (ret < 0) {
-            return ret;
-        }
-        if (ret) {
-            ret = bdrv_pread(bs->file,
-                        s->cow_sectors_offset + sector_num * 512,
-                        buf, n * 512);
-            if (ret < 0) {
-                return ret;
-            }
-        } else {
-            if (bs->backing_hd) {
-                /* read from the base image */
-                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
-                if (ret < 0) {
-                    return ret;
-                }
-            } else {
-                memset(buf, 0, n * 512);
-            }
-        }
-        nb_sectors -= n;
-        sector_num += n;
-        buf += n * 512;
-    }
-    return 0;
-}
-
-static coroutine_fn int cow_co_read(BlockDriverState *bs, int64_t sector_num,
-                                    uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVCowState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = cow_read(bs, sector_num, buf, nb_sectors);
-    qemu_co_mutex_unlock(&s->lock);
-    return ret;
-}
-
-static int cow_write(BlockDriverState *bs, int64_t sector_num,
-                     const uint8_t *buf, int nb_sectors)
-{
-    BDRVCowState *s = bs->opaque;
-    int ret;
-
-    ret = bdrv_pwrite(bs->file, s->cow_sectors_offset + sector_num * 512,
-                      buf, nb_sectors * 512);
-    if (ret < 0) {
-        return ret;
-    }
-
-    return cow_update_bitmap(bs, sector_num, nb_sectors);
-}
-
-static coroutine_fn int cow_co_write(BlockDriverState *bs, int64_t sector_num,
-                                     const uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVCowState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = cow_write(bs, sector_num, buf, nb_sectors);
-    qemu_co_mutex_unlock(&s->lock);
-    return ret;
-}
-
-static void cow_close(BlockDriverState *bs)
-{
-}
-
-static int cow_create(const char *filename, QemuOpts *opts, Error **errp)
-{
-    struct cow_header_v2 cow_header;
-    struct stat st;
-    int64_t image_sectors = 0;
-    char *image_filename = NULL;
-    Error *local_err = NULL;
-    int ret;
-    BlockDriverState *cow_bs = NULL;
-
-    /* Read out options */
-    image_sectors = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512;
-    image_filename = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
-
-    ret = bdrv_create_file(filename, opts, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
-        goto exit;
-    }
-
-    ret = bdrv_open(&cow_bs, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
-        goto exit;
-    }
-
-    memset(&cow_header, 0, sizeof(cow_header));
-    cow_header.magic = cpu_to_be32(COW_MAGIC);
-    cow_header.version = cpu_to_be32(COW_VERSION);
-    if (image_filename) {
-        /* Note: if no file, we put a dummy mtime */
-        cow_header.mtime = cpu_to_be32(0);
-
-        if (stat(image_filename, &st) != 0) {
-            goto mtime_fail;
-        }
-        cow_header.mtime = cpu_to_be32(st.st_mtime);
-    mtime_fail:
-        pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file),
-                image_filename);
-    }
-    cow_header.sectorsize = cpu_to_be32(512);
-    cow_header.size = cpu_to_be64(image_sectors * 512);
-    ret = bdrv_pwrite(cow_bs, 0, &cow_header, sizeof(cow_header));
-    if (ret < 0) {
-        goto exit;
-    }
-
-    /* resize to include at least all the bitmap */
-    ret = bdrv_truncate(cow_bs,
-        sizeof(cow_header) + ((image_sectors + 7) >> 3));
-    if (ret < 0) {
-        goto exit;
-    }
-
-exit:
-    g_free(image_filename);
-    if (cow_bs) {
-        bdrv_unref(cow_bs);
-    }
-    return ret;
-}
-
-static QemuOptsList cow_create_opts = {
-    .name = "cow-create-opts",
-    .head = QTAILQ_HEAD_INITIALIZER(cow_create_opts.head),
-    .desc = {
-        {
-            .name = BLOCK_OPT_SIZE,
-            .type = QEMU_OPT_SIZE,
-            .help = "Virtual disk size"
-        },
-        {
-            .name = BLOCK_OPT_BACKING_FILE,
-            .type = QEMU_OPT_STRING,
-            .help = "File name of a base image"
-        },
-        { /* end of list */ }
-    }
-};
-
-static BlockDriver bdrv_cow = {
-    .format_name    = "cow",
-    .instance_size  = sizeof(BDRVCowState),
-
-    .bdrv_probe     = cow_probe,
-    .bdrv_open      = cow_open,
-    .bdrv_close     = cow_close,
-    .bdrv_create    = cow_create,
-    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
-    .supports_backing       = true,
-
-    .bdrv_read              = cow_co_read,
-    .bdrv_write             = cow_co_write,
-    .bdrv_co_get_block_status   = cow_co_get_block_status,
-
-    .create_opts    = &cow_create_opts,
-};
-
-static void bdrv_cow_init(void)
-{
-    bdrv_register(&bdrv_cow);
-}
-
-block_init(bdrv_cow_init);
--- a/block/curl.c
+++ b/block/curl.c
@@ -26,7 +26,7 @@
 #include "qapi/qmp/qbool.h"
 #include <curl/curl.h>

-// #define DEBUG
+// #define DEBUG_CURL
 // #define DEBUG_VERBOSE

 #ifdef DEBUG_CURL
@@ -63,6 +63,8 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_NUM_ACB    8
 #define SECTOR_SIZE     512
 #define READ_AHEAD_DEFAULT (256 * 1024)
+#define CURL_TIMEOUT_DEFAULT 5
+#define CURL_TIMEOUT_MAX 10000

 #define FIND_RET_NONE   0
 #define FIND_RET_OK     1
@@ -71,11 +73,13 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_BLOCK_OPT_URL       "url"
 #define CURL_BLOCK_OPT_READAHEAD "readahead"
 #define CURL_BLOCK_OPT_SSLVERIFY "sslverify"
+#define CURL_BLOCK_OPT_TIMEOUT "timeout"
+#define CURL_BLOCK_OPT_COOKIE    "cookie"

 struct BDRVCURLState;

 typedef struct CURLAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;
    QEMUIOVector *qiov;

@@ -109,6 +113,8 @@ typedef struct BDRVCURLState {
    char *url;
    size_t readahead_size;
    bool sslverify;
+    uint64_t timeout;
+    char *cookie;
    bool accept_range;
    AioContext *aio_context;
 } BDRVCURLState;
@@ -207,7 +213,7 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
            qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start,
                                acb->end - acb->start);
            acb->common.cb(acb->common.opaque, 0);
-            qemu_aio_release(acb);
+            qemu_aio_unref(acb);
            s->acb[i] = NULL;
        }
    }
@@ -299,7 +305,7 @@ static void curl_multi_check_completion(BDRVCURLState *s)
                    }

                    acb->common.cb(acb->common.opaque, -EIO);
-                    qemu_aio_release(acb);
+                    qemu_aio_unref(acb);
                    state->acb[i] = NULL;
                }
            }
@@ -352,7 +358,7 @@ static void curl_multi_timeout_do(void *arg)
 #endif
 }

-static CURLState *curl_init_state(BDRVCURLState *s)
+static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s)
 {
    CURLState *state = NULL;
    int i, j;
@@ -370,7 +376,7 @@ static CURLState *curl_init_state(BDRVCURLState *s)
            break;
        }
        if (!state) {
-            aio_poll(state->s->aio_context, true);
+            aio_poll(bdrv_get_aio_context(bs), true);
        }
    } while(!state);

@@ -382,7 +388,10 @@ static CURLState *curl_init_state(BDRVCURLState *s)
        curl_easy_setopt(state->curl, CURLOPT_URL, s->url);
        curl_easy_setopt(state->curl, CURLOPT_SSL_VERIFYPEER,
                         (long) s->sslverify);
-        curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, 5);
+        if (s->cookie) {
+            curl_easy_setopt(state->curl, CURLOPT_COOKIE, s->cookie);
+        }
+        curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, (long)s->timeout);
        curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION,
                         (void *)curl_read_cb);
        curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state);
@@ -489,6 +498,16 @@ static QemuOptsList runtime_opts = {
            .type = QEMU_OPT_BOOL,
            .help = "Verify SSL certificate"
        },
+        {
+            .name = CURL_BLOCK_OPT_TIMEOUT,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Curl timeout"
+        },
+        {
+            .name = CURL_BLOCK_OPT_COOKIE,
+            .type = QEMU_OPT_STRING,
+            .help = "Pass the cookie or list of cookies with each request"
+        },
        { /* end of list */ }
    },
 };
@@ -501,6 +520,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *file;
+    const char *cookie;
    double d;

    static int inited = 0;
@@ -525,8 +545,18 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
        goto out_noclean;
    }

+    s->timeout = qemu_opt_get_number(opts, CURL_BLOCK_OPT_TIMEOUT,
+                                     CURL_TIMEOUT_DEFAULT);
+    if (s->timeout > CURL_TIMEOUT_MAX) {
+        error_setg(errp, "timeout parameter is too large or negative");
+        goto out_noclean;
+    }
+
    s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true);

+    cookie = qemu_opt_get(opts, CURL_BLOCK_OPT_COOKIE);
+    s->cookie = g_strdup(cookie);
+
    file = qemu_opt_get(opts, CURL_BLOCK_OPT_URL);
    if (file == NULL) {
        error_setg(errp, "curl block driver requires an 'url' option");
@@ -541,7 +571,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    DPRINTF("CURL: Opening %s\n", file);
    s->aio_context = bdrv_get_aio_context(bs);
    s->url = g_strdup(file);
-    state = curl_init_state(s);
+    state = curl_init_state(bs, s);
    if (!state)
        goto out_noclean;

@@ -582,19 +612,14 @@ out:
    curl_easy_cleanup(state->curl);
    state->curl = NULL;
 out_noclean:
+    g_free(s->cookie);
    g_free(s->url);
    qemu_opts_del(opts);
    return -EINVAL;
 }

-static void curl_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    // Do we have to implement canceling? Seems to work without...
-}
-
 static const AIOCBInfo curl_aiocb_info = {
    .aiocb_size         = sizeof(CURLAIOCB),
-    .cancel             = curl_aio_cancel,
 };


@@ -616,7 +641,7 @@ static void curl_readv_bh_cb(void *p)
    // we can just call the callback and be done.
    switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) {
        case FIND_RET_OK:
-            qemu_aio_release(acb);
+            qemu_aio_unref(acb);
            // fall through
        case FIND_RET_WAIT:
            return;
@@ -625,10 +650,10 @@ static void curl_readv_bh_cb(void *p)
    }

    // No cache found, so let's start a new request
-    state = curl_init_state(s);
+    state = curl_init_state(acb->common.bs, s);
    if (!state) {
        acb->common.cb(acb->common.opaque, -EIO);
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return;
    }

@@ -640,7 +665,13 @@ static void curl_readv_bh_cb(void *p)
    state->buf_start = start;
    state->buf_len = acb->end + s->readahead_size;
    end = MIN(start + state->buf_len, s->len) - 1;
-    state->orig_buf = g_malloc(state->buf_len);
+    state->orig_buf = g_try_malloc(state->buf_len);
+    if (state->buf_len && state->orig_buf == NULL) {
+        curl_clean_state(state);
+        acb->common.cb(acb->common.opaque, -ENOMEM);
+        qemu_aio_unref(acb);
+        return;
+    }
    state->acb[0] = acb;

    snprintf(state->range, 127, "%zd-%zd", start, end);
@@ -654,9 +685,9 @@ static void curl_readv_bh_cb(void *p)
    curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
 }

-static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    CURLAIOCB *acb;

@@ -678,6 +709,7 @@ static void curl_close(BlockDriverState *bs)
    DPRINTF("CURL: Close\n");
    curl_detach_aio_context(bs);

+    g_free(s->cookie);
    g_free(s->url);
 }

--- a/block/dmg.c
+++ b/block/dmg.c
@@ -284,8 +284,15 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* initialize zlib engine */
-    s->compressed_chunk = g_malloc(max_compressed_size + 1);
-    s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk);
+    s->compressed_chunk = qemu_try_blockalign(bs->file,
+                                              max_compressed_size + 1);
+    s->uncompressed_chunk = qemu_try_blockalign(bs->file,
+                                                512 * max_sectors_per_chunk);
+    if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    if (inflateInit(&s->zstream) != Z_OK) {
        ret = -EINVAL;
        goto fail;
@@ -302,8 +309,8 @@ fail:
    g_free(s->lengths);
    g_free(s->sectors);
    g_free(s->sectorcounts);
-    g_free(s->compressed_chunk);
-    g_free(s->uncompressed_chunk);
+    qemu_vfree(s->compressed_chunk);
+    qemu_vfree(s->uncompressed_chunk);
    return ret;
 }

@@ -426,8 +433,8 @@ static void dmg_close(BlockDriverState *bs)
    g_free(s->lengths);
    g_free(s->sectors);
    g_free(s->sectorcounts);
-    g_free(s->compressed_chunk);
-    g_free(s->uncompressed_chunk);
+    qemu_vfree(s->compressed_chunk);
+    qemu_vfree(s->uncompressed_chunk);

    inflateEnd(&s->zstream);
 }
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -291,7 +291,7 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
    BDRVGlusterState *s = bs->opaque;
    int open_flags = 0;
    int ret = 0;
-    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+    GlusterConf *gconf = g_new0(GlusterConf, 1);
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;
@@ -351,12 +351,12 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
    assert(state != NULL);
    assert(state->bs != NULL);

-    state->opaque = g_malloc0(sizeof(BDRVGlusterReopenState));
+    state->opaque = g_new0(BDRVGlusterReopenState, 1);
    reop_s = state->opaque;

    qemu_gluster_parse_flags(state->flags, &open_flags);

-    gconf = g_malloc0(sizeof(GlusterConf));
+    gconf = g_new0(GlusterConf, 1);

    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
    if (reop_s->glfs == NULL) {
@@ -486,7 +486,7 @@ static int qemu_gluster_create(const char *filename,
    int prealloc = 0;
    int64_t total_size = 0;
    char *tmp = NULL;
-    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+    GlusterConf *gconf = g_new0(GlusterConf, 1);

    glfs = qemu_gluster_init(gconf, filename, errp);
    if (!glfs) {
@@ -494,8 +494,8 @@ static int qemu_gluster_create(const char *filename,
        goto out;
    }

-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);

    tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    if (!tmp || !strcmp(tmp, "off")) {
@@ -516,9 +516,8 @@ static int qemu_gluster_create(const char *filename,
    if (!fd) {
        ret = -errno;
    } else {
-        if (!glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE)) {
-            if (prealloc && qemu_gluster_zerofill(fd, 0,
-                    total_size * BDRV_SECTOR_SIZE)) {
+        if (!glfs_ftruncate(fd, total_size)) {
+            if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) {
                ret = -errno;
            }
        } else {
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -34,7 +34,6 @@
 #include "qemu/bitops.h"
 #include "qemu/bitmap.h"
 #include "block/block_int.h"
-#include "trace.h"
 #include "block/scsi.h"
 #include "qemu/iov.h"
 #include "sysemu/sysemu.h"
@@ -81,14 +80,13 @@ typedef struct IscsiTask {
 } IscsiTask;

 typedef struct IscsiAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUIOVector *qiov;
    QEMUBH *bh;
    IscsiLun *iscsilun;
    struct scsi_task *task;
    uint8_t *buf;
    int status;
-    int canceled;
    int64_t sector_num;
    int nb_sectors;
 #ifdef __linux__
@@ -120,16 +118,14 @@ iscsi_bh_cb(void *p)
    g_free(acb->buf);
    acb->buf = NULL;

-    if (acb->canceled == 0) {
-        acb->common.cb(acb->common.opaque, acb->status);
-    }
+    acb->common.cb(acb->common.opaque, acb->status);

    if (acb->task != NULL) {
        scsi_free_scsi_task(acb->task);
        acb->task = NULL;
    }

-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

 static void
@@ -231,7 +227,7 @@ iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
 }

 static void
-iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
+iscsi_aio_cancel(BlockAIOCB *blockacb)
 {
    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
    IscsiLun *iscsilun = acb->iscsilun;
@@ -240,20 +236,15 @@ iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
        return;
    }

-    acb->canceled = 1;
-
    /* send a task mgmt call to the target to cancel the task on the target */
    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
                                     iscsi_abort_task_cb, acb);

-    while (acb->status == -EINPROGRESS) {
-        aio_poll(iscsilun->aio_context, true);
-    }
 }

 static const AIOCBInfo iscsi_aiocb_info = {
    .aiocb_size         = sizeof(IscsiAIOCB),
-    .cancel             = iscsi_aio_cancel,
+    .cancel_async       = iscsi_aio_cancel,
 };


@@ -325,6 +316,13 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
    return 1;
 }

+static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
+{
+    return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks,
+                                                       iscsilun),
+                                       iscsilun->cluster_sectors));
+}
+
 static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num,
                                    int nb_sectors)
 {
@@ -364,6 +362,12 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
        return -EINVAL;
    }

+    if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) {
+        error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len "
+                     "of %d sectors", nb_sectors, bs->bl.max_transfer_length);
+        return -EINVAL;
+    }
+
    lba = sector_qemu2lun(sector_num, iscsilun);
    num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
    iscsi_co_init_iscsitask(iscsilun, &iTask);
@@ -531,6 +535,12 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
        return -EINVAL;
    }

+    if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) {
+        error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len "
+                     "of %d sectors", nb_sectors, bs->bl.max_transfer_length);
+        return -EINVAL;
+    }
+
    if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES &&
        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
        int64_t ret;
@@ -638,10 +648,6 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    g_free(acb->buf);
    acb->buf = NULL;

-    if (acb->canceled != 0) {
-        return;
-    }
-
    acb->status = 0;
    if (status < 0) {
        error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s",
@@ -669,9 +675,9 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    iscsi_schedule_bh(acb);
 }

-static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
+static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct iscsi_context *iscsi = iscsilun->iscsi;
@@ -683,7 +689,6 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);

    acb->iscsilun = iscsilun;
-    acb->canceled    = 0;
    acb->bh          = NULL;
    acb->status      = -EINPROGRESS;
    acb->buf         = NULL;
@@ -693,7 +698,7 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    if (acb->task == NULL) {
        error_report("iSCSI: Failed to allocate task for scsi command. %s",
                     iscsi_get_error(iscsi));
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return NULL;
    }
    memset(acb->task, 0, sizeof(struct scsi_task));
@@ -731,7 +736,7 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
                                 (data.size > 0) ? &data : NULL,
                                 acb) != 0) {
        scsi_free_scsi_task(acb->task);
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return NULL;
    }

@@ -893,7 +898,10 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
    nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);

    if (iscsilun->zeroblock == NULL) {
-        iscsilun->zeroblock = g_malloc0(iscsilun->block_size);
+        iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size);
+        if (iscsilun->zeroblock == NULL) {
+            return -ENOMEM;
+        }
    }

    iscsi_co_init_iscsitask(iscsilun, &iTask);
@@ -1223,6 +1231,40 @@ static void iscsi_attach_aio_context(BlockDriverState *bs,
              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
 }

+static bool iscsi_is_write_protected(IscsiLun *iscsilun)
+{
+    struct scsi_task *task;
+    struct scsi_mode_sense *ms = NULL;
+    bool wrprotected = false;
+
+    task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun,
+                                 1, SCSI_MODESENSE_PC_CURRENT,
+                                 0x3F, 0, 255);
+    if (task == NULL) {
+        error_report("iSCSI: Failed to send MODE_SENSE(6) command: %s",
+                     iscsi_get_error(iscsilun->iscsi));
+        goto out;
+    }
+
+    if (task->status != SCSI_STATUS_GOOD) {
+        error_report("iSCSI: Failed MODE_SENSE(6), LUN assumed writable");
+        goto out;
+    }
+    ms = scsi_datain_unmarshall(task);
+    if (!ms) {
+        error_report("iSCSI: Failed to unmarshall MODE_SENSE(6) data: %s",
+                     iscsi_get_error(iscsilun->iscsi));
+        goto out;
+    }
+    wrprotected = ms->device_specific_parameter & 0x80;
+
+out:
+    if (task) {
+        scsi_free_scsi_task(task);
+    }
+    return wrprotected;
+}
+
 /*
 * We support iscsi url's on the form
 * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
@@ -1244,7 +1286,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;
-    int i, ret;
+    int i, ret = 0;

    if ((BDRV_SECTOR_SIZE % 512) != 0) {
        error_setg(errp, "iSCSI: Invalid BDRV_SECTOR_SIZE. "
@@ -1343,6 +1385,14 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    scsi_free_scsi_task(task);
    task = NULL;

+    /* Check the write protect flag of the LUN if we want to write */
+    if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
+        iscsi_is_write_protected(iscsilun)) {
+        error_setg(errp, "Cannot open a write protected LUN as read-write");
+        ret = -EACCES;
+        goto out;
+    }
+
    iscsi_readcapacity_sync(iscsilun, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
@@ -1413,9 +1463,10 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
        iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
                                     iscsilun->block_size) >> BDRV_SECTOR_BITS;
        if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
-            iscsilun->allocationmap =
-                bitmap_new(DIV_ROUND_UP(bs->total_sectors,
-                                        iscsilun->cluster_sectors));
+            iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
+            if (iscsilun->allocationmap == NULL) {
+                ret = -ENOMEM;
+            }
        }
    }

@@ -1450,31 +1501,44 @@ static void iscsi_close(BlockDriverState *bs)
    memset(iscsilun, 0, sizeof(IscsiLun));
 }

+static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun)
+{
+    return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1);
+}
+
 static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp)
 {
-    IscsiLun *iscsilun = bs->opaque;
-
    /* We don't actually refresh here, but just return data queried in
     * iscsi_open(): iscsi targets don't change their limits. */
+
+    IscsiLun *iscsilun = bs->opaque;
+    uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff;
+
+    if (iscsilun->bl.max_xfer_len) {
+        max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len);
+    }
+
+    bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun);
+
    if (iscsilun->lbp.lbpu) {
        if (iscsilun->bl.max_unmap < 0xffffffff) {
-            bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap,
-                                                 iscsilun);
+            bs->bl.max_discard =
+                sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun);
        }
-        bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
-                                                   iscsilun);
+        bs->bl.discard_alignment =
+            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
    }

    if (iscsilun->bl.max_ws_len < 0xffffffff) {
-        bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len,
-                                                  iscsilun);
+        bs->bl.max_write_zeroes =
+            sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun);
    }
    if (iscsilun->lbp.lbpws) {
-        bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
-                                                        iscsilun);
+        bs->bl.write_zeroes_alignment =
+            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
    }
-    bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len,
-                                                 iscsilun);
+    bs->bl.opt_transfer_length =
+        sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun);
 }

 /* Since iscsi_open() ignores bdrv_flags, there is nothing to do here in
@@ -1508,9 +1572,7 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset)

    if (iscsilun->allocationmap != NULL) {
        g_free(iscsilun->allocationmap);
-        iscsilun->allocationmap =
-            bitmap_new(DIV_ROUND_UP(bs->total_sectors,
-                                    iscsilun->cluster_sectors));
+        iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
    }

    return 0;
@@ -1524,12 +1586,12 @@ static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp)
    IscsiLun *iscsilun = NULL;
    QDict *bs_options;

-    bs = bdrv_new("", &error_abort);
+    bs = bdrv_new();

    /* Read out options */
-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE;
-    bs->opaque = g_malloc0(sizeof(struct IscsiLun));
+    total_size = DIV_ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                              BDRV_SECTOR_SIZE);
+    bs->opaque = g_new0(struct IscsiLun, 1);
    iscsilun = bs->opaque;

    bs_options = qdict_new();
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -28,21 +28,21 @@
 #define MAX_QUEUED_IO  128

 struct qemu_laiocb {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    struct qemu_laio_state *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
    QEMUIOVector *qiov;
    bool is_read;
-    QLIST_ENTRY(qemu_laiocb) node;
+    QSIMPLEQ_ENTRY(qemu_laiocb) next;
 };

 typedef struct {
-    struct iocb *iocbs[MAX_QUEUED_IO];
    int plugged;
-    unsigned int size;
-    unsigned int idx;
+    unsigned int n;
+    bool blocked;
+    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
 } LaioQueue;

 struct qemu_laio_state {
@@ -51,8 +51,16 @@ struct qemu_laio_state {

    /* io queue for submit at batch */
    LaioQueue io_q;
+
+    /* I/O completion processing */
+    QEMUBH *completion_bh;
+    struct io_event events[MAX_EVENTS];
+    int event_idx;
+    int event_max;
 };

+static void ioq_submit(struct qemu_laio_state *s);
+
 static inline ssize_t io_event_ret(struct io_event *ev)
 {
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
@@ -79,120 +87,132 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
                ret = -EINVAL;
            }
        }
+    }
+    laiocb->common.cb(laiocb->common.opaque, ret);

-        laiocb->common.cb(laiocb->common.opaque, ret);
+    qemu_aio_unref(laiocb);
+}
+
+/* The completion BH fetches completed I/O requests and invokes their
+ * callbacks.
+ *
+ * The function is somewhat tricky because it supports nested event loops, for
+ * example when a request callback invokes aio_poll().  In order to do this,
+ * the completion events array and index are kept in qemu_laio_state.  The BH
+ * reschedules itself as long as there are completions pending so it will
+ * either be called again in a nested event loop or will be called after all
+ * events have been completed.  When there are no events left to complete, the
+ * BH returns without rescheduling.
+ */
+static void qemu_laio_completion_bh(void *opaque)
+{
+    struct qemu_laio_state *s = opaque;
+
+    /* Fetch more completion events when empty */
+    if (s->event_idx == s->event_max) {
+        do {
+            struct timespec ts = { 0 };
+            s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS,
+                                        s->events, &ts);
+        } while (s->event_max == -EINTR);
+
+        s->event_idx = 0;
+        if (s->event_max <= 0) {
+            s->event_max = 0;
+            return; /* no more events */
+        }
    }

-    qemu_aio_release(laiocb);
+    /* Reschedule so nested event loops see currently pending completions */
+    qemu_bh_schedule(s->completion_bh);
+
+    /* Process completion events */
+    while (s->event_idx < s->event_max) {
+        struct iocb *iocb = s->events[s->event_idx].obj;
+        struct qemu_laiocb *laiocb =
+                container_of(iocb, struct qemu_laiocb, iocb);
+
+        laiocb->ret = io_event_ret(&s->events[s->event_idx]);
+        s->event_idx++;
+
+        qemu_laio_process_completion(s, laiocb);
+    }
+
+    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+        ioq_submit(s);
+    }
 }

 static void qemu_laio_completion_cb(EventNotifier *e)
 {
    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);

-    while (event_notifier_test_and_clear(&s->e)) {
-        struct io_event events[MAX_EVENTS];
-        struct timespec ts = { 0 };
-        int nevents, i;
-
-        do {
-            nevents = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, events, &ts);
-        } while (nevents == -EINTR);
-
-        for (i = 0; i < nevents; i++) {
-            struct iocb *iocb = events[i].obj;
-            struct qemu_laiocb *laiocb =
-                    container_of(iocb, struct qemu_laiocb, iocb);
-
-            laiocb->ret = io_event_ret(&events[i]);
-            qemu_laio_process_completion(s, laiocb);
-        }
+    if (event_notifier_test_and_clear(&s->e)) {
+        qemu_bh_schedule(s->completion_bh);
    }
 }

-static void laio_cancel(BlockDriverAIOCB *blockacb)
+static void laio_cancel(BlockAIOCB *blockacb)
 {
    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
    struct io_event event;
    int ret;

-    if (laiocb->ret != -EINPROGRESS)
+    if (laiocb->ret != -EINPROGRESS) {
        return;
-
-    /*
-     * Note that as of Linux 2.6.31 neither the block device code nor any
-     * filesystem implements cancellation of AIO request.
-     * Thus the polling loop below is the normal code path.
-     */
+    }
    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
-    if (ret == 0) {
-        laiocb->ret = -ECANCELED;
+    laiocb->ret = -ECANCELED;
+    if (ret != 0) {
+        /* iocb is not cancelled, cb will be called by the event loop later */
        return;
    }

-    /*
-     * We have to wait for the iocb to finish.
-     *
-     * The only way to get the iocb status update is by polling the io context.
-     * We might be able to do this slightly more optimal by removing the
-     * O_NONBLOCK flag.
-     */
-    while (laiocb->ret == -EINPROGRESS) {
-        qemu_laio_completion_cb(&laiocb->ctx->e);
-    }
+    laiocb->common.cb(laiocb->common.opaque, laiocb->ret);
 }

 static const AIOCBInfo laio_aiocb_info = {
    .aiocb_size         = sizeof(struct qemu_laiocb),
-    .cancel             = laio_cancel,
+    .cancel_async       = laio_cancel,
 };

 static void ioq_init(LaioQueue *io_q)
 {
-    io_q->size = MAX_QUEUED_IO;
-    io_q->idx = 0;
+    QSIMPLEQ_INIT(&io_q->pending);
    io_q->plugged = 0;
+    io_q->n = 0;
+    io_q->blocked = false;
 }

-static int ioq_submit(struct qemu_laio_state *s)
+static void ioq_submit(struct qemu_laio_state *s)
 {
-    int ret, i = 0;
-    int len = s->io_q.idx;
+    int ret, len;
+    struct qemu_laiocb *aiocb;
+    struct iocb *iocbs[MAX_QUEUED_IO];
+    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
-        ret = io_submit(s->ctx, len, s->io_q.iocbs);
-    } while (i++ < 3 && ret == -EAGAIN);
+        len = 0;
+        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
+            iocbs[len++] = &aiocb->iocb;
+            if (len == MAX_QUEUED_IO) {
+                break;
+            }
+        }

-    /* empty io queue */
-    s->io_q.idx = 0;
+        ret = io_submit(s->ctx, len, iocbs);
+        if (ret == -EAGAIN) {
+            break;
+        }
+        if (ret < 0) {
+            abort();
+        }

-    if (ret < 0) {
-        i = 0;
-    } else {
-        i = ret;
-    }
-
-    for (; i < len; i++) {
-        struct qemu_laiocb *laiocb =
-            container_of(s->io_q.iocbs[i], struct qemu_laiocb, iocb);
-
-        laiocb->ret = (ret < 0) ? ret : -EIO;
-        qemu_laio_process_completion(s, laiocb);
-    }
-    return ret;
-}
-
-static void ioq_enqueue(struct qemu_laio_state *s, struct iocb *iocb)
-{
-    unsigned int idx = s->io_q.idx;
-
-    s->io_q.iocbs[idx++] = iocb;
-    s->io_q.idx = idx;
-
-    /* submit immediately if queue is full */
-    if (idx == s->io_q.size) {
-        ioq_submit(s);
-    }
+        s->io_q.n -= ret;
+        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
+        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
+    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
+    s->io_q.blocked = (s->io_q.n > 0);
 }

 void laio_io_plug(BlockDriverState *bs, void *aio_ctx)
@@ -202,27 +222,24 @@ void laio_io_plug(BlockDriverState *bs, void *aio_ctx)
    s->io_q.plugged++;
 }

-int laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug)
+void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug)
 {
    struct qemu_laio_state *s = aio_ctx;
-    int ret = 0;

    assert(s->io_q.plugged > 0 || !unplug);

    if (unplug && --s->io_q.plugged > 0) {
-        return 0;
+        return;
    }

-    if (s->io_q.idx > 0) {
-        ret = ioq_submit(s);
+    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+        ioq_submit(s);
    }
-
-    return ret;
 }

-BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    struct qemu_laio_state *s = aio_ctx;
    struct qemu_laiocb *laiocb;
@@ -253,17 +270,16 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

-    if (!s->io_q.plugged) {
-        if (io_submit(s->ctx, 1, &iocbs) < 0) {
-            goto out_free_aiocb;
-        }
-    } else {
-        ioq_enqueue(s, iocbs);
+    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
+    s->io_q.n++;
+    if (!s->io_q.blocked &&
+        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
+        ioq_submit(s);
    }
    return &laiocb->common;

 out_free_aiocb:
-    qemu_aio_release(laiocb);
+    qemu_aio_unref(laiocb);
    return NULL;
 }

@@ -272,12 +288,14 @@ void laio_detach_aio_context(void *s_, AioContext *old_context)
    struct qemu_laio_state *s = s_;

    aio_set_event_notifier(old_context, &s->e, NULL);
+    qemu_bh_delete(s->completion_bh);
 }

 void laio_attach_aio_context(void *s_, AioContext *new_context)
 {
    struct qemu_laio_state *s = s_;

+    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb);
 }

--- a/block/mirror.c
+++ b/block/mirror.c
@@ -45,6 +45,7 @@ typedef struct MirrorBlockJob {
    int64_t sector_num;
    int64_t granularity;
    size_t buf_size;
+    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
@@ -54,6 +55,7 @@ typedef struct MirrorBlockJob {

    unsigned long *in_flight_bitmap;
    int in_flight;
+    int sectors_in_flight;
    int ret;
 } MirrorBlockJob;

@@ -87,6 +89,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
+    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
@@ -98,8 +101,11 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = op->nb_sectors / sectors_per_chunk;
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
-    if (s->cow_bitmap && ret >= 0) {
-        bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
+    if (ret >= 0) {
+        if (s->cow_bitmap) {
+            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
+        }
+        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
@@ -122,7 +128,8 @@ static void mirror_write_complete(void *opaque, int ret)
        BlockDriverState *source = s->common.bs;
        BlockErrorAction action;

-        bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
+        bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num,
+                              op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
@@ -139,7 +146,8 @@ static void mirror_read_complete(void *opaque, int ret)
        BlockDriverState *source = s->common.bs;
        BlockErrorAction action;

-        bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
+        bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num,
+                              op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
@@ -157,7 +165,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    BlockDriverState *source = s->common.bs;
    int nb_sectors, sectors_per_chunk, nb_chunks;
    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
-    uint64_t delay_ns;
+    uint64_t delay_ns = 0;
    MirrorOp *op;

    s->sector_num = hbitmap_iter_next(&s->hbi);
@@ -172,7 +180,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    hbitmap_next_sector = s->sector_num;
    sector_num = s->sector_num;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
-    end = s->common.len >> BDRV_SECTOR_BITS;
+    end = s->bdev_length / BDRV_SECTOR_SIZE;

    /* Extend the QEMUIOVector to include all adjacent blocks that will
     * be copied in this operation.
@@ -247,8 +255,6 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        next_chunk += added_chunks;
        if (!s->synced && s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors);
-        } else {
-            delay_ns = 0;
        }
    } while (delay_ns == 0 && next_sector < end);

@@ -282,10 +288,12 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        next_sector += sectors_per_chunk;
    }

-    bdrv_reset_dirty(source, sector_num, nb_sectors);
+    bdrv_reset_dirty_bitmap(source, s->dirty_bitmap, sector_num,
+                            nb_sectors);

    /* Copy the dirty cluster.  */
    s->in_flight++;
+    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);
    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                   mirror_read_complete, op);
@@ -316,9 +324,56 @@ static void mirror_drain(MirrorBlockJob *s)
    }
 }

+typedef struct {
+    int ret;
+} MirrorExitData;
+
+static void mirror_exit(BlockJob *job, void *opaque)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+    MirrorExitData *data = opaque;
+    AioContext *replace_aio_context = NULL;
+
+    if (s->to_replace) {
+        replace_aio_context = bdrv_get_aio_context(s->to_replace);
+        aio_context_acquire(replace_aio_context);
+    }
+
+    if (s->should_complete && data->ret == 0) {
+        BlockDriverState *to_replace = s->common.bs;
+        if (s->to_replace) {
+            to_replace = s->to_replace;
+        }
+        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
+            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
+        }
+        bdrv_swap(s->target, to_replace);
+        if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
+            /* drop the bs loop chain formed by the swap: break the loop then
+             * trigger the unref from the top one */
+            BlockDriverState *p = s->base->backing_hd;
+            bdrv_set_backing_hd(s->base, NULL);
+            bdrv_unref(p);
+        }
+    }
+    if (s->to_replace) {
+        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
+        error_free(s->replace_blocker);
+        bdrv_unref(s->to_replace);
+    }
+    if (replace_aio_context) {
+        aio_context_release(replace_aio_context);
+    }
+    g_free(s->replaces);
+    bdrv_unref(s->target);
+    block_job_completed(&s->common, data->ret);
+    g_free(data);
+}
+
 static void coroutine_fn mirror_run(void *opaque)
 {
    MirrorBlockJob *s = opaque;
+    MirrorExitData *data;
    BlockDriverState *bs = s->common.bs;
    int64_t sector_num, end, sectors_per_chunk, length;
    uint64_t last_pause_ns;
@@ -331,11 +386,11 @@ static void coroutine_fn mirror_run(void *opaque)
        goto immediate_exit;
    }

-    s->common.len = bdrv_getlength(bs);
-    if (s->common.len < 0) {
-        ret = s->common.len;
+    s->bdev_length = bdrv_getlength(bs);
+    if (s->bdev_length < 0) {
+        ret = s->bdev_length;
        goto immediate_exit;
-    } else if (s->common.len == 0) {
+    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
@@ -346,7 +401,7 @@ static void coroutine_fn mirror_run(void *opaque)
        goto immediate_exit;
    }

-    length = DIV_ROUND_UP(s->common.len, s->granularity);
+    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
@@ -366,8 +421,13 @@ static void coroutine_fn mirror_run(void *opaque)
        }
    }

-    end = s->common.len >> BDRV_SECTOR_BITS;
-    s->buf = qemu_blockalign(bs, s->buf_size);
+    end = s->bdev_length / BDRV_SECTOR_SIZE;
+    s->buf = qemu_try_blockalign(bs, s->buf_size);
+    if (s->buf == NULL) {
+        ret = -ENOMEM;
+        goto immediate_exit;
+    }
+
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    mirror_free_init(s);

@@ -385,7 +445,7 @@ static void coroutine_fn mirror_run(void *opaque)

            assert(n > 0);
            if (ret == 1) {
-                bdrv_set_dirty(bs, sector_num, n);
+                bdrv_set_dirty_bitmap(bs, s->dirty_bitmap, sector_num, n);
                sector_num = next;
            } else {
                sector_num += n;
@@ -406,6 +466,12 @@ static void coroutine_fn mirror_run(void *opaque)
        }

        cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+        /* s->common.offset contains the number of bytes already processed so
+         * far, cnt is the number of dirty sectors remaining and
+         * s->sectors_in_flight is the number of sectors currently being
+         * processed; together those are the current total operation length */
+        s->common.len = s->common.offset +
+                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that qemu_aio_flush() returns.
@@ -442,7 +508,6 @@ static void coroutine_fn mirror_run(void *opaque)
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
-                s->common.offset = end * BDRV_SECTOR_SIZE;
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
@@ -464,15 +529,13 @@ static void coroutine_fn mirror_run(void *opaque)
             * mirror_populate runs.
             */
            trace_mirror_before_drain(s, cnt);
-            bdrv_drain_all();
+            bdrv_drain(bs);
            cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
-            /* Publish progress */
-            s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
@@ -507,31 +570,10 @@ immediate_exit:
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
    bdrv_iostatus_disable(s->target);
-    if (s->should_complete && ret == 0) {
-        BlockDriverState *to_replace = s->common.bs;
-        if (s->to_replace) {
-            to_replace = s->to_replace;
-        }
-        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
-            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
-        }
-        bdrv_swap(s->target, to_replace);
-        if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
-            /* drop the bs loop chain formed by the swap: break the loop then
-             * trigger the unref from the top one */
-            BlockDriverState *p = s->base->backing_hd;
-            bdrv_set_backing_hd(s->base, NULL);
-            bdrv_unref(p);
-        }
-    }
-    if (s->to_replace) {
-        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
-        error_free(s->replace_blocker);
-        bdrv_unref(s->to_replace);
-    }
-    g_free(s->replaces);
-    bdrv_unref(s->target);
-    block_job_completed(&s->common, ret);
+
+    data = g_malloc(sizeof(*data));
+    data->ret = ret;
+    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
 }

 static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -564,22 +606,30 @@ static void mirror_complete(BlockJob *job, Error **errp)
        return;
    }
    if (!s->synced) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
+        error_set(errp, QERR_BLOCK_JOB_NOT_READY,
+                  bdrv_get_device_name(job->bs));
        return;
    }

    /* check the target bs is not blocked and block all operations on it */
    if (s->replaces) {
+        AioContext *replace_aio_context;
+
        s->to_replace = check_to_replace_node(s->replaces, &local_err);
        if (!s->to_replace) {
            error_propagate(errp, local_err);
            return;
        }

+        replace_aio_context = bdrv_get_aio_context(s->to_replace);
+        aio_context_acquire(replace_aio_context);
+
        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);
+
+        aio_context_release(replace_aio_context);
    }

    s->should_complete = true;
@@ -609,7 +659,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             int64_t buf_size,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
-                             BlockDriverCompletionFunc *cb,
+                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
@@ -669,7 +719,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, int64_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
-                  BlockDriverCompletionFunc *cb,
+                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
 {
    bool is_none_mode;
@@ -686,7 +736,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
 void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
                         int64_t speed,
                         BlockdevOnError on_error,
-                         BlockDriverCompletionFunc *cb,
+                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
 {
    int64_t length, base_length;
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -31,8 +31,10 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "qemu/sockets.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qjson.h"
 #include "qapi/qmp/qint.h"
+#include "qapi/qmp/qstring.h"

 #include <sys/types.h>
 #include <unistd.h>
@@ -338,6 +340,51 @@ static void nbd_attach_aio_context(BlockDriverState *bs,
    nbd_client_session_attach_aio_context(&s->client, new_context);
 }

+static void nbd_refresh_filename(BlockDriverState *bs)
+{
+    QDict *opts = qdict_new();
+    const char *path   = qdict_get_try_str(bs->options, "path");
+    const char *host   = qdict_get_try_str(bs->options, "host");
+    const char *port   = qdict_get_try_str(bs->options, "port");
+    const char *export = qdict_get_try_str(bs->options, "export");
+
+    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd")));
+
+    if (path && export) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd+unix:///%s?socket=%s", export, path);
+    } else if (path && !export) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd+unix://?socket=%s", path);
+    } else if (!path && export && port) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s:%s/%s", host, port, export);
+    } else if (!path && export && !port) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s/%s", host, export);
+    } else if (!path && !export && port) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s:%s", host, port);
+    } else if (!path && !export && !port) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s", host);
+    }
+
+    if (path) {
+        qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(path)));
+    } else if (port) {
+        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host)));
+        qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(port)));
+    } else {
+        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host)));
+    }
+    if (export) {
+        qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export)));
+    }
+
+    bs->full_open_options = opts;
+}
+
 static BlockDriver bdrv_nbd = {
    .format_name                = "nbd",
    .protocol_name              = "nbd",
@@ -352,6 +399,7 @@ static BlockDriver bdrv_nbd = {
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_attach_aio_context,
+    .bdrv_refresh_filename      = nbd_refresh_filename,
 };

 static BlockDriver bdrv_nbd_tcp = {
@@ -368,6 +416,7 @@ static BlockDriver bdrv_nbd_tcp = {
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_attach_aio_context,
+    .bdrv_refresh_filename      = nbd_refresh_filename,
 };

 static BlockDriver bdrv_nbd_unix = {
@@ -384,6 +433,7 @@ static BlockDriver bdrv_nbd_unix = {
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_attach_aio_context,
+    .bdrv_refresh_filename      = nbd_refresh_filename,
 };

 static void bdrv_nbd_init(void)
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -172,7 +172,11 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,

    nfs_co_init_task(client, &task);

-    buf = g_malloc(nb_sectors * BDRV_SECTOR_SIZE);
+    buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE);
+    if (nb_sectors && buf == NULL) {
+        return -ENOMEM;
+    }
+
    qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE);

    if (nfs_pwrite_async(client->context, client->fh,
@@ -389,28 +393,46 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags,
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
-        return -EINVAL;
+        ret = -EINVAL;
+        goto out;
    }
    ret = nfs_client_open(client, qemu_opt_get(opts, "filename"),
                          (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY,
                          errp);
    if (ret < 0) {
-        return ret;
+        goto out;
    }
    bs->total_sectors = ret;
-    return 0;
+    ret = 0;
+out:
+    qemu_opts_del(opts);
+    return ret;
 }

+static QemuOptsList nfs_create_opts = {
+    .name = "nfs-create-opts",
+    .head = QTAILQ_HEAD_INITIALIZER(nfs_create_opts.head),
+    .desc = {
+        {
+            .name = BLOCK_OPT_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Virtual disk size"
+        },
+        { /* end of list */ }
+    }
+};
+
 static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
 {
    int ret = 0;
    int64_t total_size = 0;
-    NFSClient *client = g_malloc0(sizeof(NFSClient));
+    NFSClient *client = g_new0(NFSClient, 1);

    client->aio_context = qemu_get_aio_context();

    /* Read out options */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);

    ret = nfs_client_open(client, url, O_CREAT, errp);
    if (ret < 0) {
@@ -461,6 +483,8 @@ static BlockDriver bdrv_nfs = {

    .instance_size                  = sizeof(NFSClient),
    .bdrv_needs_filename            = true,
+    .create_opts                    = &nfs_create_opts,
+
    .bdrv_has_zero_init             = nfs_has_zero_init,
    .bdrv_get_allocated_file_size   = nfs_get_allocated_file_size,
    .bdrv_truncate                  = nfs_file_truncate,
--- a/block/null.c
+++ b/block/null.c
@@ -0,0 +1,168 @@
+/*
+ * Null block driver
+ *
+ * Authors:
+ *  Fam Zheng <famz@redhat.com>
+ *
+ * Copyright (C) 2014 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "block/block_int.h"
+
+typedef struct {
+    int64_t length;
+} BDRVNullState;
+
+static QemuOptsList runtime_opts = {
+    .name = "null",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",
+            .type = QEMU_OPT_STRING,
+            .help = "",
+        },
+        {
+            .name = BLOCK_OPT_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "size of the null block",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int null_file_open(BlockDriverState *bs, QDict *options, int flags,
+                          Error **errp)
+{
+    QemuOpts *opts;
+    BDRVNullState *s = bs->opaque;
+
+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &error_abort);
+    s->length =
+        qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30);
+    qemu_opts_del(opts);
+    return 0;
+}
+
+static void null_close(BlockDriverState *bs)
+{
+}
+
+static int64_t null_getlength(BlockDriverState *bs)
+{
+    BDRVNullState *s = bs->opaque;
+    return s->length;
+}
+
+static coroutine_fn int null_co_readv(BlockDriverState *bs,
+                                      int64_t sector_num, int nb_sectors,
+                                      QEMUIOVector *qiov)
+{
+    return 0;
+}
+
+static coroutine_fn int null_co_writev(BlockDriverState *bs,
+                                       int64_t sector_num, int nb_sectors,
+                                       QEMUIOVector *qiov)
+{
+    return 0;
+}
+
+static coroutine_fn int null_co_flush(BlockDriverState *bs)
+{
+    return 0;
+}
+
+typedef struct {
+    BlockAIOCB common;
+    QEMUBH *bh;
+} NullAIOCB;
+
+static const AIOCBInfo null_aiocb_info = {
+    .aiocb_size = sizeof(NullAIOCB),
+};
+
+static void null_bh_cb(void *opaque)
+{
+    NullAIOCB *acb = opaque;
+    acb->common.cb(acb->common.opaque, 0);
+    qemu_bh_delete(acb->bh);
+    qemu_aio_unref(acb);
+}
+
+static inline BlockAIOCB *null_aio_common(BlockDriverState *bs,
+                                          BlockCompletionFunc *cb,
+                                          void *opaque)
+{
+    NullAIOCB *acb;
+
+    acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb);
+    qemu_bh_schedule(acb->bh);
+    return &acb->common;
+}
+
+static BlockAIOCB *null_aio_readv(BlockDriverState *bs,
+                                  int64_t sector_num, QEMUIOVector *qiov,
+                                  int nb_sectors,
+                                  BlockCompletionFunc *cb,
+                                  void *opaque)
+{
+    return null_aio_common(bs, cb, opaque);
+}
+
+static BlockAIOCB *null_aio_writev(BlockDriverState *bs,
+                                   int64_t sector_num, QEMUIOVector *qiov,
+                                   int nb_sectors,
+                                   BlockCompletionFunc *cb,
+                                   void *opaque)
+{
+    return null_aio_common(bs, cb, opaque);
+}
+
+static BlockAIOCB *null_aio_flush(BlockDriverState *bs,
+                                  BlockCompletionFunc *cb,
+                                  void *opaque)
+{
+    return null_aio_common(bs, cb, opaque);
+}
+
+static BlockDriver bdrv_null_co = {
+    .format_name            = "null-co",
+    .protocol_name          = "null-co",
+    .instance_size          = sizeof(BDRVNullState),
+
+    .bdrv_file_open         = null_file_open,
+    .bdrv_close             = null_close,
+    .bdrv_getlength         = null_getlength,
+
+    .bdrv_co_readv          = null_co_readv,
+    .bdrv_co_writev         = null_co_writev,
+    .bdrv_co_flush_to_disk  = null_co_flush,
+};
+
+static BlockDriver bdrv_null_aio = {
+    .format_name            = "null-aio",
+    .protocol_name          = "null-aio",
+    .instance_size          = sizeof(BDRVNullState),
+
+    .bdrv_file_open         = null_file_open,
+    .bdrv_close             = null_close,
+    .bdrv_getlength         = null_getlength,
+
+    .bdrv_aio_readv         = null_aio_readv,
+    .bdrv_aio_writev        = null_aio_writev,
+    .bdrv_aio_flush         = null_aio_flush,
+};
+
+static void bdrv_null_init(void)
+{
+    bdrv_register(&bdrv_null_co);
+    bdrv_register(&bdrv_null_aio);
+}
+
+block_init(bdrv_null_init);
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -30,6 +30,7 @@
 /**************************************************************/

 #define HEADER_MAGIC "WithoutFreeSpace"
+#define HEADER_MAGIC2 "WithouFreSpacExt"
 #define HEADER_VERSION 2
 #define HEADER_SIZE 64

@@ -41,8 +42,10 @@ struct parallels_header {
    uint32_t cylinders;
    uint32_t tracks;
    uint32_t catalog_entries;
-    uint32_t nb_sectors;
-    char padding[24];
+    uint64_t nb_sectors;
+    uint32_t inuse;
+    uint32_t data_off;
+    char padding[12];
 } QEMU_PACKED;

 typedef struct BDRVParallelsState {
@@ -52,6 +55,8 @@ typedef struct BDRVParallelsState {
    unsigned int catalog_size;

    unsigned int tracks;
+
+    unsigned int off_multiplier;
 } BDRVParallelsState;

 static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename)
@@ -59,11 +64,12 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam
    const struct parallels_header *ph = (const void *)buf;

    if (buf_size < HEADER_SIZE)
-	return 0;
+        return 0;

-    if (!memcmp(ph->magic, HEADER_MAGIC, 16) &&
-	(le32_to_cpu(ph->version) == HEADER_VERSION))
-	return 100;
+    if ((!memcmp(ph->magic, HEADER_MAGIC, 16) ||
+        !memcmp(ph->magic, HEADER_MAGIC2, 16)) &&
+        (le32_to_cpu(ph->version) == HEADER_VERSION))
+        return 100;

    return 0;
 }
@@ -83,14 +89,19 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    if (memcmp(ph.magic, HEADER_MAGIC, 16) ||
-        (le32_to_cpu(ph.version) != HEADER_VERSION)) {
-        error_setg(errp, "Image not in Parallels format");
-        ret = -EINVAL;
-        goto fail;
-    }
+    bs->total_sectors = le64_to_cpu(ph.nb_sectors);

-    bs->total_sectors = le32_to_cpu(ph.nb_sectors);
+    if (le32_to_cpu(ph.version) != HEADER_VERSION) {
+        goto fail_format;
+    }
+    if (!memcmp(ph.magic, HEADER_MAGIC, 16)) {
+        s->off_multiplier = 1;
+        bs->total_sectors = 0xffffffff & bs->total_sectors;
+    } else if (!memcmp(ph.magic, HEADER_MAGIC2, 16)) {
+        s->off_multiplier = le32_to_cpu(ph.tracks);
+    } else {
+        goto fail_format;
+    }

    s->tracks = le32_to_cpu(ph.tracks);
    if (s->tracks == 0) {
@@ -98,6 +109,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EINVAL;
        goto fail;
    }
+    if (s->tracks > INT32_MAX/513) {
+        error_setg(errp, "Invalid image: Too big cluster");
+        ret = -EFBIG;
+        goto fail;
+    }

    s->catalog_size = le32_to_cpu(ph.catalog_entries);
    if (s->catalog_size > INT_MAX / 4) {
@@ -105,7 +121,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EFBIG;
        goto fail;
    }
-    s->catalog_bitmap = g_malloc(s->catalog_size * 4);
+    s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size);
+    if (s->catalog_size && s->catalog_bitmap == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }

    ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4);
    if (ret < 0) {
@@ -113,11 +133,14 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
    }

    for (i = 0; i < s->catalog_size; i++)
-	le32_to_cpus(&s->catalog_bitmap[i]);
+        le32_to_cpus(&s->catalog_bitmap[i]);

    qemu_co_mutex_init(&s->lock);
    return 0;

+fail_format:
+    error_setg(errp, "Image not in Parallels format");
+    ret = -EINVAL;
 fail:
    g_free(s->catalog_bitmap);
    return ret;
@@ -132,9 +155,10 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
    offset = sector_num % s->tracks;

    /* not allocated */
-    if ((index > s->catalog_size) || (s->catalog_bitmap[index] == 0))
-	return -1;
-    return (uint64_t)(s->catalog_bitmap[index] + offset) * 512;
+    if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0))
+        return -1;
+    return
+        ((uint64_t)s->catalog_bitmap[index] * s->off_multiplier + offset) * 512;
 }

 static int parallels_read(BlockDriverState *bs, int64_t sector_num,
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -28,6 +28,7 @@
 #include "qapi-visit.h"
 #include "qapi/qmp-output-visitor.h"
 #include "qapi/qmp/types.h"
+#include "sysemu/block-backend.h"

 BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
 {
@@ -39,6 +40,13 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
    info->encrypted              = bs->encrypted;
    info->encryption_key_missing = bdrv_key_required(bs);

+    info->cache = g_new(BlockdevCacheInfo, 1);
+    *info->cache = (BlockdevCacheInfo) {
+        .writeback      = bdrv_enable_write_cache(bs),
+        .direct         = !!(bs->open_flags & BDRV_O_NOCACHE),
+        .no_flush       = !!(bs->open_flags & BDRV_O_NO_FLUSH),
+    };
+
    if (bs->node_name[0]) {
        info->has_node_name = true;
        info->node_name = g_strdup(bs->node_name);
@@ -165,19 +173,25 @@ void bdrv_query_image_info(BlockDriverState *bs,
                           ImageInfo **p_info,
                           Error **errp)
 {
-    uint64_t total_sectors;
+    int64_t size;
    const char *backing_filename;
    char backing_filename2[1024];
    BlockDriverInfo bdi;
    int ret;
    Error *err = NULL;
-    ImageInfo *info = g_new0(ImageInfo, 1);
+    ImageInfo *info;

-    bdrv_get_geometry(bs, &total_sectors);
+    size = bdrv_getlength(bs);
+    if (size < 0) {
+        error_setg_errno(errp, -size, "Can't get size of device '%s'",
+                         bdrv_get_device_name(bs));
+        return;
+    }

+    info = g_new0(ImageInfo, 1);
    info->filename        = g_strdup(bs->filename);
    info->format          = g_strdup(bdrv_get_format_name(bs));
-    info->virtual_size    = total_sectors * 512;
+    info->virtual_size    = size;
    info->actual_size     = bdrv_get_allocated_file_size(bs);
    info->has_actual_size = info->actual_size >= 0;
    if (bdrv_is_encrypted(bs)) {
@@ -200,7 +214,12 @@ void bdrv_query_image_info(BlockDriverState *bs,
        info->backing_filename = g_strdup(backing_filename);
        info->has_backing_filename = true;
        bdrv_get_full_backing_filename(bs, backing_filename2,
-                                       sizeof(backing_filename2));
+                                       sizeof(backing_filename2), &err);
+        if (err) {
+            error_propagate(errp, err);
+            qapi_free_ImageInfo(info);
+            return;
+        }

        if (strcmp(backing_filename, backing_filename2) != 0) {
            info->full_backing_filename =
@@ -236,22 +255,22 @@ void bdrv_query_image_info(BlockDriverState *bs,
 }

 /* @p_info will be set only on success. */
-void bdrv_query_info(BlockDriverState *bs,
-                     BlockInfo **p_info,
-                     Error **errp)
+static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
+                            Error **errp)
 {
    BlockInfo *info = g_malloc0(sizeof(*info));
+    BlockDriverState *bs = blk_bs(blk);
    BlockDriverState *bs0;
    ImageInfo **p_image_info;
    Error *local_err = NULL;
-    info->device = g_strdup(bs->device_name);
+    info->device = g_strdup(blk_name(blk));
    info->type = g_strdup("unknown");
-    info->locked = bdrv_dev_is_medium_locked(bs);
-    info->removable = bdrv_dev_has_removable_media(bs);
+    info->locked = blk_dev_is_medium_locked(blk);
+    info->removable = blk_dev_has_removable_media(blk);

-    if (bdrv_dev_has_removable_media(bs)) {
+    if (blk_dev_has_removable_media(blk)) {
        info->has_tray_open = true;
-        info->tray_open = bdrv_dev_is_tray_open(bs);
+        info->tray_open = blk_dev_is_tray_open(blk);
    }

    if (bdrv_iostatus_is_enabled(bs)) {
@@ -293,36 +312,43 @@ void bdrv_query_info(BlockDriverState *bs,
    qapi_free_BlockInfo(info);
 }

-static BlockStats *bdrv_query_stats(const BlockDriverState *bs)
+static BlockStats *bdrv_query_stats(const BlockDriverState *bs,
+                                    bool query_backing)
 {
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

-    if (bs->device_name[0]) {
+    if (bdrv_get_device_name(bs)[0]) {
        s->has_device = true;
-        s->device = g_strdup(bs->device_name);
+        s->device = g_strdup(bdrv_get_device_name(bs));
+    }
+
+    if (bdrv_get_node_name(bs)[0]) {
+        s->has_node_name = true;
+        s->node_name = g_strdup(bdrv_get_node_name(bs));
    }

    s->stats = g_malloc0(sizeof(*s->stats));
-    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
-    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
-    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
-    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
-    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
-    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
-    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
-    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
-    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
+    s->stats->rd_bytes = bs->stats.nr_bytes[BLOCK_ACCT_READ];
+    s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE];
+    s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ];
+    s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE];
+    s->stats->wr_highest_offset =
+        bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE;
+    s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH];
+    s->stats->wr_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_WRITE];
+    s->stats->rd_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_READ];
+    s->stats->flush_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
-        s->parent = bdrv_query_stats(bs->file);
+        s->parent = bdrv_query_stats(bs->file, query_backing);
    }

-    if (bs->backing_hd) {
+    if (query_backing && bs->backing_hd) {
        s->has_backing = true;
-        s->backing = bdrv_query_stats(bs->backing_hd);
+        s->backing = bdrv_query_stats(bs->backing_hd, query_backing);
    }

    return s;
@@ -331,12 +357,12 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs)
 BlockInfoList *qmp_query_block(Error **errp)
 {
    BlockInfoList *head = NULL, **p_next = &head;
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk;
    Error *local_err = NULL;

-     while ((bs = bdrv_next(bs))) {
+    for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
        BlockInfoList *info = g_malloc0(sizeof(*info));
-        bdrv_query_info(bs, &info->value, &local_err);
+        bdrv_query_info(blk, &info->value, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto err;
@@ -353,17 +379,22 @@ BlockInfoList *qmp_query_block(Error **errp)
    return NULL;
 }

-BlockStatsList *qmp_query_blockstats(Error **errp)
+BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
+                                     bool query_nodes,
+                                     Error **errp)
 {
    BlockStatsList *head = NULL, **p_next = &head;
    BlockDriverState *bs = NULL;

-     while ((bs = bdrv_next(bs))) {
+    /* Just to be safe if query_nodes is not always initialized */
+    query_nodes = has_query_nodes && query_nodes;
+
+    while ((bs = query_nodes ? bdrv_next_node(bs) : bdrv_next(bs))) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
-        info->value = bdrv_query_stats(bs);
+        info->value = bdrv_query_stats(bs, !query_nodes);
        aio_context_release(ctx);

        *p_next = info;
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -124,7 +124,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
        snprintf(version, sizeof(version), "QCOW version %" PRIu32,
                 header.version);
        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                  bs->device_name, "qcow", version);
+                  bdrv_get_device_name(bs), "qcow", version);
        ret = -ENOTSUP;
        goto fail;
    }
@@ -182,7 +182,12 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    }

    s->l1_table_offset = header.l1_table_offset;
-    s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+    s->l1_table = g_try_new(uint64_t, s->l1_size);
+    if (s->l1_table == NULL) {
+        error_setg(errp, "Could not allocate memory for L1 table");
+        ret = -ENOMEM;
+        goto fail;
+    }

    ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
               s->l1_size * sizeof(uint64_t));
@@ -193,8 +198,16 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
-    /* alloc L2 cache */
-    s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+
+    /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */
+    s->l2_cache =
+        qemu_try_blockalign(bs->file,
+                            s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    if (s->l2_cache == NULL) {
+        error_setg(errp, "Could not allocate L2 table cache");
+        ret = -ENOMEM;
+        goto fail;
+    }
    s->cluster_cache = g_malloc(s->cluster_size);
    s->cluster_data = g_malloc(s->cluster_size);
    s->cluster_cache_offset = -1;
@@ -218,7 +231,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when qcow images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "qcow", bs->device_name, "live migration");
+              "qcow", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);

    qemu_co_mutex_init(&s->lock);
@@ -226,7 +239,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,

 fail:
    g_free(s->l1_table);
-    g_free(s->l2_cache);
+    qemu_vfree(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);
    return ret;
@@ -517,7 +530,10 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
    void *orig_buf;

    if (qiov->niov > 1) {
-        buf = orig_buf = qemu_blockalign(bs, qiov->size);
+        buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
+        if (buf == NULL) {
+            return -ENOMEM;
+        }
    } else {
        orig_buf = NULL;
        buf = (uint8_t *)qiov->iov->iov_base;
@@ -619,7 +635,10 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
    s->cluster_cache_offset = -1; /* disable compressed cache */

    if (qiov->niov > 1) {
-        buf = orig_buf = qemu_blockalign(bs, qiov->size);
+        buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
+        if (buf == NULL) {
+            return -ENOMEM;
+        }
        qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
    } else {
        orig_buf = NULL;
@@ -685,7 +704,7 @@ static void qcow_close(BlockDriverState *bs)
    BDRVQcowState *s = bs->opaque;

    g_free(s->l1_table);
-    g_free(s->l2_cache);
+    qemu_vfree(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);

@@ -706,7 +725,8 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    BlockDriverState *qcow_bs;

    /* Read out options */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
        flags |= BLOCK_FLAG_ENCRYPT;
@@ -734,7 +754,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    memset(&header, 0, sizeof(header));
    header.magic = cpu_to_be32(QCOW_MAGIC);
    header.version = cpu_to_be32(QCOW_VERSION);
-    header.size = cpu_to_be64(total_size * 512);
+    header.size = cpu_to_be64(total_size);
    header_size = sizeof(header);
    backing_filename_len = 0;
    if (backing_file) {
@@ -756,7 +776,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    }
    header_size = (header_size + 7) & ~7;
    shift = header.cluster_bits + header.l2_bits;
-    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
+    l1_size = (total_size + (1LL << shift) - 1) >> shift;

    header.l1_table_offset = cpu_to_be64(header_size);
    if (flags & BLOCK_FLAG_ENCRYPT) {
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -48,15 +48,31 @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
    Qcow2Cache *c;
    int i;

-    c = g_malloc0(sizeof(*c));
+    c = g_new0(Qcow2Cache, 1);
    c->size = num_tables;
-    c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
+    c->entries = g_try_new0(Qcow2CachedTable, num_tables);
+    if (!c->entries) {
+        goto fail;
+    }

    for (i = 0; i < c->size; i++) {
-        c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+        c->entries[i].table = qemu_try_blockalign(bs->file, s->cluster_size);
+        if (c->entries[i].table == NULL) {
+            goto fail;
+        }
    }

    return c;
+
+fail:
+    if (c->entries) {
+        for (i = 0; i < c->size; i++) {
+            qemu_vfree(c->entries[i].table);
+        }
+    }
+    g_free(c->entries);
+    g_free(c);
+    return NULL;
 }

 int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c)
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -72,14 +72,20 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
 #endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
-    new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
+    new_l1_table = qemu_try_blockalign(bs->file,
+                                       align_offset(new_l1_size2, 512));
+    if (new_l1_table == NULL) {
+        return -ENOMEM;
+    }
+    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
+
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
-        g_free(new_l1_table);
+        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

@@ -113,7 +119,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
    if (ret < 0) {
        goto fail;
    }
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
@@ -123,7 +129,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        QCOW2_DISCARD_OTHER);
    return 0;
 fail:
-    g_free(new_l1_table);
+    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
@@ -158,12 +164,14 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
 {
    BDRVQcowState *s = bs->opaque;
-    uint64_t buf[L1_ENTRIES_PER_SECTOR];
+    uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 };
    int l1_start_index;
    int i, ret;

    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
-    for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
+    for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size;
+         i++)
+    {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

@@ -372,7 +380,10 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
    }

    iov.iov_len = n * BDRV_SECTOR_SIZE;
-    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
+    if (iov.iov_base == NULL) {
+        return -ENOMEM;
+    }

    qemu_iovec_init_external(&qiov, &iov, 1);

@@ -477,6 +488,13 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
        goto out;
    }

+    if (offset_into_cluster(s, l2_offset)) {
+        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
+                                " unaligned (L1 index: %#" PRIx64 ")",
+                                l2_offset, l1_index);
+        return -EIO;
+    }
+
    /* load the l2 table in memory */

    ret = l2_load(bs, l2_offset, &l2_table);
@@ -499,8 +517,11 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
        break;
    case QCOW2_CLUSTER_ZERO:
        if (s->qcow_version < 3) {
-            qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
-            return -EIO;
+            qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
+                                    " in pre-v3 image (L2 offset: %#" PRIx64
+                                    ", L2 index: %#x)", l2_offset, l2_index);
+            ret = -EIO;
+            goto fail;
        }
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], QCOW_OFLAG_ZERO);
@@ -516,6 +537,14 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], QCOW_OFLAG_ZERO);
        *cluster_offset &= L2E_OFFSET_MASK;
+        if (offset_into_cluster(s, *cluster_offset)) {
+            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#"
+                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
+                                    ", L2 index: %#x)", *cluster_offset,
+                                    l2_offset, l2_index);
+            ret = -EIO;
+            goto fail;
+        }
        break;
    default:
        abort();
@@ -532,6 +561,10 @@ out:
    *num = nb_available - index_in_cluster;

    return ret;
+
+fail:
+    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
+    return ret;
 }

 /*
@@ -567,6 +600,12 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+    if (offset_into_cluster(s, l2_offset)) {
+        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
+                                " unaligned (L1 index: %#" PRIx64 ")",
+                                l2_offset, l1_index);
+        return -EIO;
+    }

    /* seek the l2 table of the given l2 offset */

@@ -702,7 +741,11 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

-    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+    old_cluster = g_try_new(uint64_t, m->nb_clusters);
+    if (old_cluster == NULL) {
+        ret = -ENOMEM;
+        goto err;
+    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m, &m->cow_start);
@@ -935,6 +978,15 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
        bool offset_matches =
            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;

+        if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
+            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
+                                    "%#llx unaligned (guest offset: %#" PRIx64
+                                    ")", cluster_offset & L2E_OFFSET_MASK,
+                                    guest_offset);
+            ret = -EIO;
+            goto out;
+        }
+
        if (*host_offset != 0 && !offset_matches) {
            *bytes = 0;
            ret = 0;
@@ -966,7 +1018,7 @@ out:

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
-    if (ret) {
+    if (ret > 0) {
        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
                     + offset_into_cluster(s, guest_offset);
    }
@@ -1106,6 +1158,17 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
        return 0;
    }

+    /* !*host_offset would overwrite the image header and is reserved for "no
+     * host offset preferred". If 0 was a valid host offset, it'd trigger the
+     * following overlap check; do that now to avoid having an invalid value in
+     * *host_offset. */
+    if (!alloc_cluster_offset) {
+        ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
+                                            nb_clusters * s->cluster_size);
+        assert(ret < 0);
+        goto fail;
+    }
+
    /*
     * Save info needed for meta data update.
     *
@@ -1200,7 +1263,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,

 again:
    start = offset;
-    remaining = *num << BDRV_SECTOR_BITS;
+    remaining = (uint64_t)*num << BDRV_SECTOR_BITS;
    cluster_offset = 0;
    *host_offset = 0;
    cur_bytes = 0;
@@ -1351,7 +1414,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
 * clusters.
 */
 static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
-    unsigned int nb_clusters, enum qcow2_discard_type type)
+    unsigned int nb_clusters, enum qcow2_discard_type type, bool full_discard)
 {
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table;
@@ -1373,23 +1436,30 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);

        /*
-         * Make sure that a discarded area reads back as zeroes for v3 images
-         * (we cannot do it for v2 without actually writing a zero-filled
-         * buffer). We can skip the operation if the cluster is already marked
-         * as zero, or if it's unallocated and we don't have a backing file.
+         * If full_discard is false, make sure that a discarded area reads back
+         * as zeroes for v3 images (we cannot do it for v2 without actually
+         * writing a zero-filled buffer). We can skip the operation if the
+         * cluster is already marked as zero, or if it's unallocated and we
+         * don't have a backing file.
         *
         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
+         *
+         * If full_discard is true, the sector should not read back as zeroes,
+         * but rather fall through to the backing file.
         */
        switch (qcow2_get_cluster_type(old_l2_entry)) {
            case QCOW2_CLUSTER_UNALLOCATED:
-                if (!bs->backing_hd) {
+                if (full_discard || !bs->backing_hd) {
                    continue;
                }
                break;

            case QCOW2_CLUSTER_ZERO:
-                continue;
+                if (!full_discard) {
+                    continue;
+                }
+                break;

            case QCOW2_CLUSTER_NORMAL:
            case QCOW2_CLUSTER_COMPRESSED:
@@ -1401,7 +1471,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
-        if (s->qcow_version >= 3) {
+        if (!full_discard && s->qcow_version >= 3) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
        } else {
            l2_table[l2_index + i] = cpu_to_be64(0);
@@ -1420,7 +1490,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
 }

 int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
-    int nb_sectors, enum qcow2_discard_type type)
+    int nb_sectors, enum qcow2_discard_type type, bool full_discard)
 {
    BDRVQcowState *s = bs->opaque;
    uint64_t end_offset;
@@ -1443,7 +1513,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
-        ret = discard_single_l2(bs, offset, nb_clusters, type);
+        ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
        if (ret < 0) {
            goto fail;
        }
@@ -1543,15 +1613,14 @@ fail:
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
- * expanded_clusters is a bitmap where every bit corresponds to one cluster in
- * the image file; a bit gets set if the corresponding cluster has been used for
- * zero expansion (i.e., has been filled with zeroes and is referenced from an
- * L2 table). nb_clusters contains the total cluster count of the image file,
- * i.e., the number of bits in expanded_clusters.
+ * l1_entries and *visited_l1_entries are used to keep track of progress for
+ * status_cb(). l1_entries contains the total number of L1 entries and
+ * *visited_l1_entries counts all visited L1 entries.
 */
 static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
-                                      int l1_size, uint8_t **expanded_clusters,
-                                      uint64_t *nb_clusters)
+                                      int l1_size, int64_t *visited_l1_entries,
+                                      int64_t l1_entries,
+                                      BlockDriverAmendStatusCB *status_cb)
 {
    BDRVQcowState *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
@@ -1562,15 +1631,23 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
-        l2_table = qemu_blockalign(bs, s->cluster_size);
+        l2_table = qemu_try_blockalign(bs->file, s->cluster_size);
+        if (l2_table == NULL) {
+            return -ENOMEM;
+        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        bool l2_dirty = false;
+        int l2_refcount;

        if (!l2_offset) {
            /* unallocated */
+            (*visited_l1_entries)++;
+            if (status_cb) {
+                status_cb(bs, *visited_l1_entries, l1_entries);
+            }
            continue;
        }

@@ -1587,33 +1664,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
            goto fail;
        }

+        l2_refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
+        if (l2_refcount < 0) {
+            ret = l2_refcount;
+            goto fail;
+        }
+
        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
-            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
+            int64_t offset = l2_entry & L2E_OFFSET_MASK;
            int cluster_type = qcow2_get_cluster_type(l2_entry);
            bool preallocated = offset != 0;

-            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
-                cluster_index = offset >> s->cluster_bits;
-                assert((cluster_index >= 0) && (cluster_index < *nb_clusters));
-                if ((*expanded_clusters)[cluster_index / 8] &
-                    (1 << (cluster_index % 8))) {
-                    /* Probably a shared L2 table; this cluster was a zero
-                     * cluster which has been expanded, its refcount
-                     * therefore most likely requires an update. */
-                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
-                                                        QCOW2_DISCARD_NEVER);
-                    if (ret < 0) {
-                        goto fail;
-                    }
-                    /* Since we just increased the refcount, the COPIED flag may
-                     * no longer be set. */
-                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
-                    l2_dirty = true;
-                }
-                continue;
-            }
-            else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
+            if (cluster_type != QCOW2_CLUSTER_ZERO) {
                continue;
            }

@@ -1631,6 +1694,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                    ret = offset;
                    goto fail;
                }
+
+                if (l2_refcount > 1) {
+                    /* For shared L2 tables, set the refcount accordingly (it is
+                     * already 1 and needs to be l2_refcount) */
+                    ret = qcow2_update_cluster_refcount(bs,
+                            offset >> s->cluster_bits, l2_refcount - 1,
+                            QCOW2_DISCARD_OTHER);
+                    if (ret < 0) {
+                        qcow2_free_clusters(bs, offset, s->cluster_size,
+                                            QCOW2_DISCARD_OTHER);
+                        goto fail;
+                    }
+                }
            }

            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
@@ -1652,29 +1728,12 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                goto fail;
            }

-            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
-            l2_dirty = true;
-
-            cluster_index = offset >> s->cluster_bits;
-
-            if (cluster_index >= *nb_clusters) {
-                uint64_t old_bitmap_size = (*nb_clusters + 7) / 8;
-                uint64_t new_bitmap_size;
-                /* The offset may lie beyond the old end of the underlying image
-                 * file for growable files only */
-                assert(bs->file->growable);
-                *nb_clusters = size_to_clusters(s, bs->file->total_sectors *
-                                                BDRV_SECTOR_SIZE);
-                new_bitmap_size = (*nb_clusters + 7) / 8;
-                *expanded_clusters = g_realloc(*expanded_clusters,
-                                               new_bitmap_size);
-                /* clear the newly allocated space */
-                memset(&(*expanded_clusters)[old_bitmap_size], 0,
-                       new_bitmap_size - old_bitmap_size);
+            if (l2_refcount == 1) {
+                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
+            } else {
+                l2_table[j] = cpu_to_be64(offset);
            }
-
-            assert((cluster_index >= 0) && (cluster_index < *nb_clusters));
-            (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8);
+            l2_dirty = true;
        }

        if (is_active_l1) {
@@ -1703,6 +1762,11 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                }
            }
        }
+
+        (*visited_l1_entries)++;
+        if (status_cb) {
+            status_cb(bs, *visited_l1_entries, l1_entries);
+        }
    }

    ret = 0;
@@ -1729,21 +1793,25 @@ fail:
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 */
-int qcow2_expand_zero_clusters(BlockDriverState *bs)
+int qcow2_expand_zero_clusters(BlockDriverState *bs,
+                               BlockDriverAmendStatusCB *status_cb)
 {
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table = NULL;
-    uint64_t nb_clusters;
-    uint8_t *expanded_clusters;
+    int64_t l1_entries = 0, visited_l1_entries = 0;
    int ret;
    int i, j;

-    nb_clusters = size_to_clusters(s, bs->file->total_sectors *
-                                   BDRV_SECTOR_SIZE);
-    expanded_clusters = g_malloc0((nb_clusters + 7) / 8);
+    if (status_cb) {
+        l1_entries = s->l1_size;
+        for (i = 0; i < s->nb_snapshots; i++) {
+            l1_entries += s->snapshots[i].l1_size;
+        }
+    }

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
-                                     &expanded_clusters, &nb_clusters);
+                                     &visited_l1_entries, l1_entries,
+                                     status_cb);
    if (ret < 0) {
        goto fail;
    }
@@ -1777,7 +1845,8 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs)
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
-                                         &expanded_clusters, &nb_clusters);
+                                         &visited_l1_entries, l1_entries,
+                                         status_cb);
        if (ret < 0) {
            goto fail;
        }
@@ -1786,7 +1855,6 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs)
    ret = 0;

 fail:
-    g_free(expanded_clusters);
    g_free(l1_table);
    return ret;
 }
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -58,7 +58,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
    }

    offset = s->snapshots_offset;
-    s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot));
+    s->snapshots = g_new0(QCowSnapshot, s->nb_snapshots);

    for(i = 0; i < s->nb_snapshots; i++) {
        /* Read statically sized part of the snapshot header */
@@ -381,7 +381,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    sn->l1_table_offset = l1_table_offset;
    sn->l1_size = s->l1_size;

-    l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+    l1_table = g_try_new(uint64_t, s->l1_size);
+    if (s->l1_size && l1_table == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    for(i = 0; i < s->l1_size; i++) {
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
    }
@@ -412,7 +417,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    }

    /* Append the new snapshot to the snapshot list */
-    new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+    new_snapshot_list = g_new(QCowSnapshot, s->nb_snapshots + 1);
    if (s->snapshots) {
        memcpy(new_snapshot_list, s->snapshots,
               s->nb_snapshots * sizeof(QCowSnapshot));
@@ -436,7 +441,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    qcow2_discard_clusters(bs, qcow2_vm_state_offset(s),
                           align_offset(sn->vm_state_size, s->cluster_size)
                                >> BDRV_SECTOR_BITS,
-                           QCOW2_DISCARD_NEVER);
+                           QCOW2_DISCARD_NEVER, false);

 #ifdef DEBUG_ALLOC
    {
@@ -499,7 +504,11 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
     * Decrease the refcount referenced by the old one only when the L1
     * table is overwritten.
     */
-    sn_l1_table = g_malloc0(cur_l1_bytes);
+    sn_l1_table = g_try_malloc0(cur_l1_bytes);
+    if (cur_l1_bytes && sn_l1_table == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }

    ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes);
    if (ret < 0) {
@@ -652,7 +661,7 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
        return s->nb_snapshots;
    }

-    sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
+    sn_tab = g_new0(QEMUSnapshotInfo, s->nb_snapshots);
    for(i = 0; i < s->nb_snapshots; i++) {
        sn_info = sn_tab + i;
        sn = s->snapshots + i;
@@ -698,17 +707,21 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
        return -EFBIG;
    }
    new_l1_bytes = sn->l1_size * sizeof(uint64_t);
-    new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));
+    new_l1_table = qemu_try_blockalign(bs->file,
+                                       align_offset(new_l1_bytes, 512));
+    if (new_l1_table == NULL) {
+        return -ENOMEM;
+    }

    ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
    if (ret < 0) {
        error_setg(errp, "Failed to read l1 table for snapshot");
-        g_free(new_l1_table);
+        qemu_vfree(new_l1_table);
        return ret;
    }

    /* Switch the L1 table */
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);

    s->l1_size = sn->l1_size;
    s->l1_table_offset = sn->l1_table_offset;
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -30,6 +30,9 @@
 #include "qemu/error-report.h"
 #include "qapi/qmp/qerror.h"
 #include "qapi/qmp/qbool.h"
+#include "qapi/util.h"
+#include "qapi/qmp/types.h"
+#include "qapi-event.h"
 #include "trace.h"
 #include "qemu/option_int.h"

@@ -114,7 +117,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
 #ifdef DEBUG_EXT
        printf("ext.magic = 0x%x\n", ext.magic);
 #endif
-        if (ext.len > end_offset - offset) {
+        if (offset > end_offset || ext.len > end_offset - offset) {
            error_setg(errp, "Header extension too large");
            return -EINVAL;
        }
@@ -203,8 +206,8 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
    vsnprintf(msg, sizeof(msg), fmt, ap);
    va_end(ap);

-    error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "qcow2",
-              msg);
+    error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+              bdrv_get_device_name(bs), "qcow2", msg);
 }

 static void report_unsupported_feature(BlockDriverState *bs,
@@ -402,6 +405,12 @@ static QemuOptsList qcow2_runtime_opts = {
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
+        {
+            .name = QCOW2_OPT_OVERLAP_TEMPLATE,
+            .type = QEMU_OPT_STRING,
+            .help = "Selects which overlap checks to perform from a range of "
+                    "templates (none, constant, cached, all)",
+        },
        {
            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
            .type = QEMU_OPT_BOOL,
@@ -442,6 +451,22 @@ static QemuOptsList qcow2_runtime_opts = {
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L2 table",
        },
+        {
+            .name = QCOW2_OPT_CACHE_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Maximum combined metadata (L2 tables and refcount blocks) "
+                    "cache size",
+        },
+        {
+            .name = QCOW2_OPT_L2_CACHE_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Maximum L2 table cache size",
+        },
+        {
+            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Maximum refcount block cache size",
+        },
        { /* end of list */ }
    },
 };
@@ -457,6 +482,61 @@ static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
    [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 };

+static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size,
+                             uint64_t *refcount_cache_size, Error **errp)
+{
+    uint64_t combined_cache_size;
+    bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
+
+    combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
+    l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
+    refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
+
+    combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
+    *l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0);
+    *refcount_cache_size = qemu_opt_get_size(opts,
+                                             QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
+
+    if (combined_cache_size_set) {
+        if (l2_cache_size_set && refcount_cache_size_set) {
+            error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
+                       " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
+                       "the same time");
+            return;
+        } else if (*l2_cache_size > combined_cache_size) {
+            error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
+                       QCOW2_OPT_CACHE_SIZE);
+            return;
+        } else if (*refcount_cache_size > combined_cache_size) {
+            error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
+                       QCOW2_OPT_CACHE_SIZE);
+            return;
+        }
+
+        if (l2_cache_size_set) {
+            *refcount_cache_size = combined_cache_size - *l2_cache_size;
+        } else if (refcount_cache_size_set) {
+            *l2_cache_size = combined_cache_size - *refcount_cache_size;
+        } else {
+            *refcount_cache_size = combined_cache_size
+                                 / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1);
+            *l2_cache_size = combined_cache_size - *refcount_cache_size;
+        }
+    } else {
+        if (!l2_cache_size_set && !refcount_cache_size_set) {
+            *l2_cache_size = DEFAULT_L2_CACHE_BYTE_SIZE;
+            *refcount_cache_size = *l2_cache_size
+                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
+        } else if (!l2_cache_size_set) {
+            *l2_cache_size = *refcount_cache_size
+                           * DEFAULT_L2_REFCOUNT_SIZE_RATIO;
+        } else if (!refcount_cache_size_set) {
+            *refcount_cache_size = *l2_cache_size
+                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
+        }
+    }
+}
+
 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
 {
@@ -464,12 +544,13 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    unsigned int len, i;
    int ret = 0;
    QCowHeader header;
-    QemuOpts *opts;
+    QemuOpts *opts = NULL;
    Error *local_err = NULL;
    uint64_t ext_end;
    uint64_t l1_vm_state_index;
-    const char *opt_overlap_check;
+    const char *opt_overlap_check, *opt_overlap_check_template;
    int overlap_check_template = 0;
+    uint64_t l2_cache_size, refcount_cache_size;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
@@ -617,6 +698,9 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,

    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
+    /* 2^(s->refcount_order - 3) is the refcount width in bytes */
+    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
+    s->refcount_block_size = 1 << s->refcount_block_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
@@ -688,8 +772,13 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,


    if (s->l1_size > 0) {
-        s->l1_table = g_malloc0(
+        s->l1_table = qemu_try_blockalign(bs->file,
            align_offset(s->l1_size * sizeof(uint64_t), 512));
+        if (s->l1_table == NULL) {
+            error_setg(errp, "Could not allocate L1 table");
+            ret = -ENOMEM;
+            goto fail;
+        }
        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
                         s->l1_size * sizeof(uint64_t));
        if (ret < 0) {
@@ -701,14 +790,61 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

+    /* get L2 table/refcount block cache size from command line options */
+    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    read_cache_sizes(opts, &l2_cache_size, &refcount_cache_size, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    l2_cache_size /= s->cluster_size;
+    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
+        l2_cache_size = MIN_L2_CACHE_SIZE;
+    }
+    if (l2_cache_size > INT_MAX) {
+        error_setg(errp, "L2 cache size too big");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    refcount_cache_size /= s->cluster_size;
+    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
+        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
+    }
+    if (refcount_cache_size > INT_MAX) {
+        error_setg(errp, "Refcount cache size too big");
+        ret = -EINVAL;
+        goto fail;
+    }
+
    /* alloc L2 table/refcount block cache */
-    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
-    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
+    s->l2_table_cache = qcow2_cache_create(bs, l2_cache_size);
+    s->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size);
+    if (s->l2_table_cache == NULL || s->refcount_block_cache == NULL) {
+        error_setg(errp, "Could not allocate metadata caches");
+        ret = -ENOMEM;
+        goto fail;
+    }

    s->cluster_cache = g_malloc(s->cluster_size);
    /* one more sector for decompressed data alignment */
-    s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
-                                  + 512);
+    s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
+                                                    * s->cluster_size + 512);
+    if (s->cluster_data == NULL) {
+        error_setg(errp, "Could not allocate temporary cluster buffer");
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    s->cluster_cache_offset = -1;
    s->flags = flags;

@@ -774,7 +910,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
        BdrvCheckResult result = {0};

-        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
+        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not repair dirty image");
            goto fail;
@@ -782,14 +918,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* Enable lazy_refcounts according to image and command line options */
-    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
-    qemu_opts_absorb_qdict(opts, options, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        ret = -EINVAL;
-        goto fail;
-    }
-
    s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));

@@ -803,7 +931,21 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    s->discard_passthrough[QCOW2_DISCARD_OTHER] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);

-    opt_overlap_check = qemu_opt_get(opts, "overlap-check") ?: "cached";
+    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
+    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
+    if (opt_overlap_check_template && opt_overlap_check &&
+        strcmp(opt_overlap_check_template, opt_overlap_check))
+    {
+        error_setg(errp, "Conflicting values for qcow2 options '"
+                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
+                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
+        ret = -EINVAL;
+        goto fail;
+    }
+    if (!opt_overlap_check) {
+        opt_overlap_check = opt_overlap_check_template ?: "cached";
+    }
+
    if (!strcmp(opt_overlap_check, "none")) {
        overlap_check_template = 0;
    } else if (!strcmp(opt_overlap_check, "constant")) {
@@ -816,7 +958,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
                   "'overlap-check'. Allowed are either of the following: "
                   "none, constant, cached, all", opt_overlap_check);
-        qemu_opts_del(opts);
        ret = -EINVAL;
        goto fail;
    }
@@ -831,6 +972,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    }

    qemu_opts_del(opts);
+    opts = NULL;

    if (s->use_lazy_refcounts && s->qcow_version < 3) {
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
@@ -848,11 +990,12 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    return ret;

 fail:
+    qemu_opts_del(opts);
    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);
    qcow2_free_snapshots(bs);
    qcow2_refcount_close(bs);
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;
    if (s->l2_table_cache) {
@@ -1082,7 +1225,12 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                 */
                if (!cluster_data) {
                    cluster_data =
-                        qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+                        qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
+                                                      * s->cluster_size);
+                    if (cluster_data == NULL) {
+                        ret = -ENOMEM;
+                        goto fail;
+                    }
                }

                assert(cur_nr_sectors <=
@@ -1182,8 +1330,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,

        if (s->crypt_method) {
            if (!cluster_data) {
-                cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
-                                                 s->cluster_size);
+                cluster_data = qemu_try_blockalign(bs->file,
+                                                   QCOW_MAX_CRYPT_CLUSTERS
+                                                   * s->cluster_size);
+                if (cluster_data == NULL) {
+                    ret = -ENOMEM;
+                    goto fail;
+                }
            }

            assert(hd_qiov.size <=
@@ -1270,15 +1423,28 @@ fail:
 static void qcow2_close(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;

    if (!(bs->open_flags & BDRV_O_INCOMING)) {
-        qcow2_cache_flush(bs, s->l2_table_cache);
-        qcow2_cache_flush(bs, s->refcount_block_cache);
+        int ret1, ret2;

-        qcow2_mark_clean(bs);
+        ret1 = qcow2_cache_flush(bs, s->l2_table_cache);
+        ret2 = qcow2_cache_flush(bs, s->refcount_block_cache);
+
+        if (ret1) {
+            error_report("Failed to flush the L2 table cache: %s",
+                         strerror(-ret1));
+        }
+        if (ret2) {
+            error_report("Failed to flush the refcount block cache: %s",
+                         strerror(-ret2));
+        }
+
+        if (!ret1 && !ret2) {
+            qcow2_mark_clean(bs);
+        }
    }

    qcow2_cache_destroy(bs, s->l2_table_cache);
@@ -1557,7 +1723,7 @@ static int preallocate(BlockDriverState *bs)
    int ret;
    QCowL2Meta *meta;

-    nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+    nb_sectors = bdrv_nb_sectors(bs);
    offset = 0;

    while (nb_sectors) {
@@ -1612,7 +1778,7 @@ static int preallocate(BlockDriverState *bs)

 static int qcow2_create2(const char *filename, int64_t total_size,
                         const char *backing_file, const char *backing_format,
-                         int flags, size_t cluster_size, int prealloc,
+                         int flags, size_t cluster_size, PreallocMode prealloc,
                         QemuOpts *opts, int version,
                         Error **errp)
 {
@@ -1645,6 +1811,56 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    Error *local_err = NULL;
    int ret;

+    if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
+        int64_t meta_size = 0;
+        uint64_t nreftablee, nrefblocke, nl1e, nl2e;
+        int64_t aligned_total_size = align_offset(total_size, cluster_size);
+
+        /* header: 1 cluster */
+        meta_size += cluster_size;
+
+        /* total size of L2 tables */
+        nl2e = aligned_total_size / cluster_size;
+        nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t));
+        meta_size += nl2e * sizeof(uint64_t);
+
+        /* total size of L1 tables */
+        nl1e = nl2e * sizeof(uint64_t) / cluster_size;
+        nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t));
+        meta_size += nl1e * sizeof(uint64_t);
+
+        /* total size of refcount blocks
+         *
+         * note: every host cluster is reference-counted, including metadata
+         * (even refcount blocks are recursively included).
+         * Let:
+         *   a = total_size (this is the guest disk size)
+         *   m = meta size not including refcount blocks and refcount tables
+         *   c = cluster size
+         *   y1 = number of refcount blocks entries
+         *   y2 = meta size including everything
+         * then,
+         *   y1 = (y2 + a)/c
+         *   y2 = y1 * sizeof(u16) + y1 * sizeof(u16) * sizeof(u64) / c + m
+         * we can get y1:
+         *   y1 = (a + m) / (c - sizeof(u16) - sizeof(u16) * sizeof(u64) / c)
+         */
+        nrefblocke = (aligned_total_size + meta_size + cluster_size) /
+            (cluster_size - sizeof(uint16_t) -
+             1.0 * sizeof(uint16_t) * sizeof(uint64_t) / cluster_size);
+        nrefblocke = align_offset(nrefblocke, cluster_size / sizeof(uint16_t));
+        meta_size += nrefblocke * sizeof(uint16_t);
+
+        /* total size of refcount tables */
+        nreftablee = nrefblocke * sizeof(uint16_t) / cluster_size;
+        nreftablee = align_offset(nreftablee, cluster_size / sizeof(uint64_t));
+        meta_size += nreftablee * sizeof(uint64_t);
+
+        qemu_opt_set_number(opts, BLOCK_OPT_SIZE,
+                            aligned_total_size + meta_size);
+        qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_lookup[prealloc]);
+    }
+
    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
@@ -1671,7 +1887,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        .l1_size                    = cpu_to_be32(0),
        .refcount_table_offset      = cpu_to_be64(cluster_size),
        .refcount_table_clusters    = cpu_to_be32(1),
-        .refcount_order             = cpu_to_be32(3 + REFCOUNT_SHIFT),
+        .refcount_order             = cpu_to_be32(4),
        .header_length              = cpu_to_be32(sizeof(*header)),
    };

@@ -1712,10 +1928,9 @@ static int qcow2_create2(const char *filename, int64_t total_size,
     * refcount of the cluster that is occupied by the header and the refcount
     * table)
     */
-    BlockDriver* drv = bdrv_find_format("qcow2");
-    assert(drv != NULL);
    ret = bdrv_open(&bs, filename, NULL, NULL,
-        BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv, &local_err);
+                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH,
+                    &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
@@ -1733,7 +1948,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    }

    /* Okay, now that we have a valid image, let's give it the right size */
-    ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
+    ret = bdrv_truncate(bs, total_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not resize image");
        goto out;
@@ -1750,7 +1965,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    }

    /* And if we're supposed to preallocate metadata, do that now */
-    if (prealloc) {
+    if (prealloc != PREALLOC_MODE_OFF) {
        BDRVQcowState *s = bs->opaque;
        qemu_co_mutex_lock(&s->lock);
        ret = preallocate(bs);
@@ -1767,7 +1982,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */
    ret = bdrv_open(&bs, filename, NULL, NULL,
                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING,
-                    drv, &local_err);
+                    &bdrv_qcow2, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto out;
@@ -1786,16 +2001,17 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
    char *backing_file = NULL;
    char *backing_fmt = NULL;
    char *buf = NULL;
-    uint64_t sectors = 0;
+    uint64_t size = 0;
    int flags = 0;
    size_t cluster_size = DEFAULT_CLUSTER_SIZE;
-    int prealloc = 0;
+    PreallocMode prealloc;
    int version = 3;
    Error *local_err = NULL;
    int ret;

    /* Read out options */
-    sectors = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512;
+    size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                    BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
@@ -1804,12 +2020,11 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
    cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
                                         DEFAULT_CLUSTER_SIZE);
    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
-    if (!buf || !strcmp(buf, "off")) {
-        prealloc = 0;
-    } else if (!strcmp(buf, "metadata")) {
-        prealloc = 1;
-    } else {
-        error_setg(errp, "Invalid preallocation mode: '%s'", buf);
+    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
+                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
+                               &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto finish;
    }
@@ -1831,7 +2046,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
        flags |= BLOCK_FLAG_LAZY_REFCOUNTS;
    }

-    if (backing_file && prealloc) {
+    if (backing_file && prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Backing file and preallocation cannot be used at "
                   "the same time");
        ret = -EINVAL;
@@ -1845,7 +2060,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
        goto finish;
    }

-    ret = qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
+    ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
                        cluster_size, prealloc, opts, version, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
@@ -1886,7 +2101,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,

    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
-        nb_sectors, QCOW2_DISCARD_REQUEST);
+        nb_sectors, QCOW2_DISCARD_REQUEST, false);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
 }
@@ -1947,9 +2162,7 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
        cluster_offset = bdrv_getlength(bs->file);
-        cluster_offset = (cluster_offset + 511) & ~511;
-        bdrv_truncate(bs->file, cluster_offset);
-        return 0;
+        return bdrv_truncate(bs->file, cluster_offset);
    }

    if (nb_sectors != s->cluster_sectors) {
@@ -2028,6 +2241,195 @@ fail:
    return ret;
 }

+static int make_completely_empty(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret, l1_clusters;
+    int64_t offset;
+    uint64_t *new_reftable = NULL;
+    uint64_t rt_entry, l1_size2;
+    struct {
+        uint64_t l1_offset;
+        uint64_t reftable_offset;
+        uint32_t reftable_clusters;
+    } QEMU_PACKED l1_ofs_rt_ofs_cls;
+
+    ret = qcow2_cache_empty(bs, s->l2_table_cache);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = qcow2_cache_empty(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Refcounts will be broken utterly */
+    ret = qcow2_mark_dirty(bs);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
+
+    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
+    l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t);
+
+    /* After this call, neither the in-memory nor the on-disk refcount
+     * information accurately describe the actual references */
+
+    ret = bdrv_write_zeroes(bs->file, s->l1_table_offset / BDRV_SECTOR_SIZE,
+                            l1_clusters * s->cluster_sectors, 0);
+    if (ret < 0) {
+        goto fail_broken_refcounts;
+    }
+    memset(s->l1_table, 0, l1_size2);
+
+    BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
+
+    /* Overwrite enough clusters at the beginning of the sectors to place
+     * the refcount table, a refcount block and the L1 table in; this may
+     * overwrite parts of the existing refcount and L1 table, which is not
+     * an issue because the dirty flag is set, complete data loss is in fact
+     * desired and partial data loss is consequently fine as well */
+    ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
+                            (2 + l1_clusters) * s->cluster_size /
+                            BDRV_SECTOR_SIZE, 0);
+    /* This call (even if it failed overall) may have overwritten on-disk
+     * refcount structures; in that case, the in-memory refcount information
+     * will probably differ from the on-disk information which makes the BDS
+     * unusable */
+    if (ret < 0) {
+        goto fail_broken_refcounts;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
+    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
+
+    /* "Create" an empty reftable (one cluster) directly after the image
+     * header and an empty L1 table three clusters after the image header;
+     * the cluster between those two will be used as the first refblock */
+    cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size);
+    cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size);
+    cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1);
+    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
+                           &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
+    if (ret < 0) {
+        goto fail_broken_refcounts;
+    }
+
+    s->l1_table_offset = 3 * s->cluster_size;
+
+    new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
+    if (!new_reftable) {
+        ret = -ENOMEM;
+        goto fail_broken_refcounts;
+    }
+
+    s->refcount_table_offset = s->cluster_size;
+    s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
+
+    g_free(s->refcount_table);
+    s->refcount_table = new_reftable;
+    new_reftable = NULL;
+
+    /* Now the in-memory refcount information again corresponds to the on-disk
+     * information (reftable is empty and no refblocks (the refblock cache is
+     * empty)); however, this means some clusters (e.g. the image header) are
+     * referenced, but not refcounted, but the normal qcow2 code assumes that
+     * the in-memory information is always correct */
+
+    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
+
+    /* Enter the first refblock into the reftable */
+    rt_entry = cpu_to_be64(2 * s->cluster_size);
+    ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
+                           &rt_entry, sizeof(rt_entry));
+    if (ret < 0) {
+        goto fail_broken_refcounts;
+    }
+    s->refcount_table[0] = 2 * s->cluster_size;
+
+    s->free_cluster_index = 0;
+    assert(3 + l1_clusters <= s->refcount_block_size);
+    offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
+    if (offset < 0) {
+        ret = offset;
+        goto fail_broken_refcounts;
+    } else if (offset > 0) {
+        error_report("First cluster in emptied image is in use");
+        abort();
+    }
+
+    /* Now finally the in-memory information corresponds to the on-disk
+     * structures and is correct */
+    ret = qcow2_mark_clean(bs);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    return 0;
+
+fail_broken_refcounts:
+    /* The BDS is unusable at this point. If we wanted to make it usable, we
+     * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
+     * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
+     * again. However, because the functions which could have caused this error
+     * path to be taken are used by those functions as well, it's very likely
+     * that that sequence will fail as well. Therefore, just eject the BDS. */
+    bs->drv = NULL;
+
+fail:
+    g_free(new_reftable);
+    return ret;
+}
+
+static int qcow2_make_empty(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t start_sector;
+    int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
+    int l1_clusters, ret = 0;
+
+    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
+
+    if (s->qcow_version >= 3 && !s->snapshots &&
+        3 + l1_clusters <= s->refcount_block_size) {
+        /* The following function only works for qcow2 v3 images (it requires
+         * the dirty flag) and only as long as there are no snapshots (because
+         * it completely empties the image). Furthermore, the L1 table and three
+         * additional clusters (image header, refcount table, one refcount
+         * block) have to fit inside one refcount block. */
+        return make_completely_empty(bs);
+    }
+
+    /* This fallback code simply discards every active cluster; this is slow,
+     * but works in all cases */
+    for (start_sector = 0; start_sector < bs->total_sectors;
+         start_sector += sector_step)
+    {
+        /* As this function is generally used after committing an external
+         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
+         * default action for this kind of discard is to pass the discard,
+         * which will ideally result in an actually smaller image file, as
+         * is probably desired. */
+        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
+                                     MIN(sector_step,
+                                         bs->total_sectors - start_sector),
+                                     QCOW2_DISCARD_SNAPSHOT, true);
+        if (ret < 0) {
+            break;
+        }
+    }
+
+    return ret;
+}
+
 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
@@ -2083,6 +2485,9 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
            .has_lazy_refcounts = true,
+            .corrupt            = s->incompatible_features &
+                                  QCOW2_INCOMPAT_CORRUPT,
+            .has_corrupt        = true,
        };
    }

@@ -2156,7 +2561,8 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
 * Downgrades an image's version. To achieve this, any incompatible features
 * have to be removed.
 */
-static int qcow2_downgrade(BlockDriverState *bs, int target_version)
+static int qcow2_downgrade(BlockDriverState *bs, int target_version,
+                           BlockDriverAmendStatusCB *status_cb)
 {
    BDRVQcowState *s = bs->opaque;
    int current_version = s->qcow_version;
@@ -2205,7 +2611,7 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version)
    /* clearing autoclear features is trivial */
    s->autoclear_features = 0;

-    ret = qcow2_expand_zero_clusters(bs);
+    ret = qcow2_expand_zero_clusters(bs, status_cb);
    if (ret < 0) {
        return ret;
    }
@@ -2219,7 +2625,8 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version)
    return 0;
 }

-static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts)
+static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
+                               BlockDriverAmendStatusCB *status_cb)
 {
    BDRVQcowState *s = bs->opaque;
    int old_version = s->qcow_version, new_version = old_version;
@@ -2297,7 +2704,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts)
                return ret;
            }
        } else {
-            ret = qcow2_downgrade(bs, new_version);
+            ret = qcow2_downgrade(bs, new_version, status_cb);
            if (ret < 0) {
                return ret;
            }
@@ -2353,6 +2760,52 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts)
    return 0;
 }

+/*
+ * If offset or size are negative, respectively, they will not be included in
+ * the BLOCK_IMAGE_CORRUPTED event emitted.
+ * fatal will be ignored for read-only BDS; corruptions found there will always
+ * be considered non-fatal.
+ */
+void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
+                             int64_t size, const char *message_format, ...)
+{
+    BDRVQcowState *s = bs->opaque;
+    char *message;
+    va_list ap;
+
+    fatal = fatal && !bs->read_only;
+
+    if (s->signaled_corruption &&
+        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
+    {
+        return;
+    }
+
+    va_start(ap, message_format);
+    message = g_strdup_vprintf(message_format, ap);
+    va_end(ap);
+
+    if (fatal) {
+        fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
+                "corruption events will be suppressed\n", message);
+    } else {
+        fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
+                "corruption events will be suppressed\n", message);
+    }
+
+    qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message,
+                                          offset >= 0, offset, size >= 0, size,
+                                          fatal, &error_abort);
+    g_free(message);
+
+    if (fatal) {
+        qcow2_mark_corrupt(bs);
+        bs->drv = NULL; /* make BDS unusable */
+    }
+
+    s->signaled_corruption = true;
+}
+
 static QemuOptsList qcow2_create_opts = {
    .name = "qcow2-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
@@ -2392,7 +2845,8 @@ static QemuOptsList qcow2_create_opts = {
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
-            .help = "Preallocation mode (allowed values: off, metadata)"
+            .help = "Preallocation mode (allowed values: off, metadata, "
+                    "falloc, full)"
        },
        {
            .name = BLOCK_OPT_LAZY_REFCOUNTS,
@@ -2404,7 +2858,7 @@ static QemuOptsList qcow2_create_opts = {
    }
 };

-static BlockDriver bdrv_qcow2 = {
+BlockDriver bdrv_qcow2 = {
    .format_name        = "qcow2",
    .instance_size      = sizeof(BDRVQcowState),
    .bdrv_probe         = qcow2_probe,
@@ -2424,6 +2878,7 @@ static BlockDriver bdrv_qcow2 = {
    .bdrv_co_discard        = qcow2_co_discard,
    .bdrv_truncate          = qcow2_truncate,
    .bdrv_write_compressed  = qcow2_write_compressed,
+    .bdrv_make_empty        = qcow2_make_empty,

    .bdrv_snapshot_create   = qcow2_snapshot_create,
    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -59,15 +59,19 @@
 /* The cluster reads as all zeros */
 #define QCOW_OFLAG_ZERO (1ULL << 0)

-#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
-
 #define MIN_CLUSTER_BITS 9
 #define MAX_CLUSTER_BITS 21

-#define L2_CACHE_SIZE 16
+#define MIN_L2_CACHE_SIZE 1 /* cluster */

 /* Must be at least 4 to cover all cases of refcount table growth */
-#define REFCOUNT_CACHE_SIZE 4
+#define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
+
+#define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */
+
+/* The refblock cache needs only a fourth of the L2 cache size to cover as many
+ * clusters */
+#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4

 #define DEFAULT_CLUSTER_SIZE 65536

@@ -77,6 +81,7 @@
 #define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
 #define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
 #define QCOW2_OPT_OVERLAP "overlap-check"
+#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template"
 #define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header"
 #define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1"
 #define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2"
@@ -85,6 +90,9 @@
 #define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table"
 #define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1"
 #define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2"
+#define QCOW2_OPT_CACHE_SIZE "cache-size"
+#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
+#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"

 typedef struct QCowHeader {
    uint32_t magic;
@@ -213,6 +221,8 @@ typedef struct BDRVQcowState {
    int l2_size;
    int l1_size;
    int l1_vm_state_index;
+    int refcount_block_bits;
+    int refcount_block_size;
    int csize_shift;
    int csize_mask;
    uint64_t cluster_offset_mask;
@@ -252,6 +262,7 @@ typedef struct BDRVQcowState {
    bool discard_passthrough[QCOW2_DISCARD_MAX];

    int overlap_check; /* bitmask of Qcow2MetadataOverlap values */
+    bool signaled_corruption;

    uint64_t incompatible_features;
    uint64_t compatible_features;
@@ -468,10 +479,16 @@ int qcow2_mark_corrupt(BlockDriverState *bs);
 int qcow2_mark_consistent(BlockDriverState *bs);
 int qcow2_update_header(BlockDriverState *bs);

+void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
+                             int64_t size, const char *message_format, ...)
+                             GCC_FMT_ATTR(5, 6);
+
 /* qcow2-refcount.c functions */
 int qcow2_refcount_init(BlockDriverState *bs);
 void qcow2_refcount_close(BlockDriverState *bs);

+int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index);
+
 int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
                                  int addend, enum qcow2_discard_type type);

@@ -519,10 +536,11 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,

 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
 int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
-    int nb_sectors, enum qcow2_discard_type type);
+    int nb_sectors, enum qcow2_discard_type type, bool full_discard);
 int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);

-int qcow2_expand_zero_clusters(BlockDriverState *bs);
+int qcow2_expand_zero_clusters(BlockDriverState *bs,
+                               BlockDriverAmendStatusCB *status_cb);

 /* qcow2-snapshot.c functions */
 int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
--- a/block/qed-check.c
+++ b/block/qed-check.c
@@ -227,8 +227,10 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
    };
    int ret;

-    check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) *
-                                       sizeof(check.used_clusters[0]));
+    check.used_clusters = g_try_new0(uint32_t, (check.nclusters + 31) / 32);
+    if (check.nclusters && check.used_clusters == NULL) {
+        return -ENOMEM;
+    }

    check.result->bfi.total_clusters =
        (s->header.image_size + s->header.cluster_size - 1) /
--- a/block/qed-gencb.c
+++ b/block/qed-gencb.c
@@ -13,7 +13,7 @@

 #include "qed.h"

-void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
+void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque)
 {
    GenericCB *gencb = g_malloc(len);
    gencb->cb = cb;
@@ -24,7 +24,7 @@ void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
 void gencb_complete(void *opaque, int ret)
 {
    GenericCB *gencb = opaque;
-    BlockDriverCompletionFunc *cb = gencb->cb;
+    BlockCompletionFunc *cb = gencb->cb;
    void *user_opaque = gencb->opaque;

    g_free(gencb);
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -49,7 +49,7 @@ out:
 }

 static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
-                           BlockDriverCompletionFunc *cb, void *opaque)
+                           BlockCompletionFunc *cb, void *opaque)
 {
    QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb),
                                                cb, opaque);
@@ -119,7 +119,7 @@ out:
 */
 static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
                            unsigned int index, unsigned int n, bool flush,
-                            BlockDriverCompletionFunc *cb, void *opaque)
+                            BlockCompletionFunc *cb, void *opaque)
 {
    QEDWriteTableCB *write_table_cb;
    unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
@@ -180,7 +180,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s)
 }

 void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
-                        BlockDriverCompletionFunc *cb, void *opaque)
+                        BlockCompletionFunc *cb, void *opaque)
 {
    BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
    qed_write_table(s, s->header.l1_table_offset,
@@ -235,7 +235,7 @@ static void qed_read_l2_table_cb(void *opaque, int ret)
 }

 void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
-                       BlockDriverCompletionFunc *cb, void *opaque)
+                       BlockCompletionFunc *cb, void *opaque)
 {
    QEDReadL2TableCB *read_l2_table_cb;

@@ -275,7 +275,7 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset

 void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
                        unsigned int index, unsigned int n, bool flush,
-                        BlockDriverCompletionFunc *cb, void *opaque)
+                        BlockCompletionFunc *cb, void *opaque)
 {
    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
    qed_write_table(s, request->l2_table->offset,
--- a/block/qed.c
+++ b/block/qed.c
@@ -18,22 +18,8 @@
 #include "qapi/qmp/qerror.h"
 #include "migration/migration.h"

-static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    QEDAIOCB *acb = (QEDAIOCB *)blockacb;
-    AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
-    bool finished = false;
-
-    /* Wait for the request to finish */
-    acb->finished = &finished;
-    while (!finished) {
-        aio_poll(aio_context, true);
-    }
-}
-
 static const AIOCBInfo qed_aiocb_info = {
    .aiocb_size         = sizeof(QEDAIOCB),
-    .cancel             = qed_aio_cancel,
 };

 static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
@@ -144,7 +130,7 @@ static void qed_write_header_read_cb(void *opaque, int ret)
 * This function only updates known header fields in-place and does not affect
 * extra data after the QED header.
 */
-static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb,
+static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
                             void *opaque)
 {
    /* We must write full sectors for O_DIRECT but cannot necessarily generate
@@ -422,7 +408,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
        snprintf(buf, sizeof(buf), "%" PRIx64,
            s->header.features & ~QED_FEATURE_MASK);
        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-            bs->device_name, "QED", buf);
+            bdrv_get_device_name(bs), "QED", buf);
        return -ENOTSUP;
    }
    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
@@ -648,7 +634,8 @@ static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp)
    char *backing_fmt = NULL;
    int ret;

-    image_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
    cluster_size = qemu_opt_get_size_del(opts,
@@ -772,7 +759,7 @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
 static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
                                  QEMUIOVector *qiov,
                                  QEMUIOVector **backing_qiov,
-                                  BlockDriverCompletionFunc *cb, void *opaque)
+                                  BlockCompletionFunc *cb, void *opaque)
 {
    uint64_t backing_length = 0;
    size_t size;
@@ -864,7 +851,7 @@ static void qed_copy_from_backing_file_write(void *opaque, int ret)
 */
 static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
                                       uint64_t len, uint64_t offset,
-                                       BlockDriverCompletionFunc *cb,
+                                       BlockCompletionFunc *cb,
                                       void *opaque)
 {
    CopyFromBackingFileCB *copy_cb;
@@ -915,21 +902,15 @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
 static void qed_aio_complete_bh(void *opaque)
 {
    QEDAIOCB *acb = opaque;
-    BlockDriverCompletionFunc *cb = acb->common.cb;
+    BlockCompletionFunc *cb = acb->common.cb;
    void *user_opaque = acb->common.opaque;
    int ret = acb->bh_ret;
-    bool *finished = acb->finished;

    qemu_bh_delete(acb->bh);
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);

    /* Invoke callback */
    cb(user_opaque, ret);
-
-    /* Signal cancel completion */
-    if (finished) {
-        *finished = true;
-    }
 }

 static void qed_aio_complete(QEDAIOCB *acb, int ret)
@@ -1083,7 +1064,7 @@ static void qed_aio_write_main(void *opaque, int ret)
    BDRVQEDState *s = acb_to_s(acb);
    uint64_t offset = acb->cur_cluster +
                      qed_offset_into_cluster(s, acb->cur_pos);
-    BlockDriverCompletionFunc *next_fn;
+    BlockCompletionFunc *next_fn;

    trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);

@@ -1183,7 +1164,7 @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
 static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 {
    BDRVQEDState *s = acb_to_s(acb);
-    BlockDriverCompletionFunc *cb;
+    BlockCompletionFunc *cb;

    /* Cancel timer when the first allocating request comes in */
    if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
@@ -1240,7 +1221,11 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
        struct iovec *iov = acb->qiov->iov;

        if (!iov->iov_base) {
-            iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
+            iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
+            if (iov->iov_base == NULL) {
+                qed_aio_complete(acb, -ENOMEM);
+                return;
+            }
            memset(iov->iov_base, 0, iov->iov_len);
        }
    }
@@ -1380,11 +1365,11 @@ static void qed_aio_next_io(void *opaque, int ret)
                      io_fn, acb);
 }

-static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
-                                       int64_t sector_num,
-                                       QEMUIOVector *qiov, int nb_sectors,
-                                       BlockDriverCompletionFunc *cb,
-                                       void *opaque, int flags)
+static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
+                                 int64_t sector_num,
+                                 QEMUIOVector *qiov, int nb_sectors,
+                                 BlockCompletionFunc *cb,
+                                 void *opaque, int flags)
 {
    QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);

@@ -1392,7 +1377,6 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
                        opaque, flags);

    acb->flags = flags;
-    acb->finished = NULL;
    acb->qiov = qiov;
    acb->qiov_offset = 0;
    acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
@@ -1406,20 +1390,20 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
    return &acb->common;
 }

-static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
-                                            int64_t sector_num,
-                                            QEMUIOVector *qiov, int nb_sectors,
-                                            BlockDriverCompletionFunc *cb,
-                                            void *opaque)
+static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
+                                      int64_t sector_num,
+                                      QEMUIOVector *qiov, int nb_sectors,
+                                      BlockCompletionFunc *cb,
+                                      void *opaque)
 {
    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
 }

-static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
-                                             int64_t sector_num,
-                                             QEMUIOVector *qiov, int nb_sectors,
-                                             BlockDriverCompletionFunc *cb,
-                                             void *opaque)
+static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
+                                       int64_t sector_num,
+                                       QEMUIOVector *qiov, int nb_sectors,
+                                       BlockCompletionFunc *cb,
+                                       void *opaque)
 {
    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
                         opaque, QED_AIOCB_WRITE);
@@ -1447,7 +1431,7 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
                                                 int nb_sectors,
                                                 BdrvRequestFlags flags)
 {
-    BlockDriverAIOCB *blockacb;
+    BlockAIOCB *blockacb;
    BDRVQEDState *s = bs->opaque;
    QEDWriteZeroesCB cb = { .done = false };
    QEMUIOVector qiov;
--- a/block/qed.h
+++ b/block/qed.h
@@ -128,7 +128,7 @@ enum {
 };

 typedef struct QEDAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;
    int bh_ret;                     /* final return status for completion bh */
    QSIMPLEQ_ENTRY(QEDAIOCB) next;  /* next request */
@@ -203,11 +203,11 @@ typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t l
 * Generic callback for chaining async callbacks
 */
 typedef struct {
-    BlockDriverCompletionFunc *cb;
+    BlockCompletionFunc *cb;
    void *opaque;
 } GenericCB;

-void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque);
+void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque);
 void gencb_complete(void *opaque, int ret);

 /**
@@ -230,16 +230,16 @@ void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table);
 */
 int qed_read_l1_table_sync(BDRVQEDState *s);
 void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
-                        BlockDriverCompletionFunc *cb, void *opaque);
+                        BlockCompletionFunc *cb, void *opaque);
 int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
                            unsigned int n);
 int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
                           uint64_t offset);
 void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
-                       BlockDriverCompletionFunc *cb, void *opaque);
+                       BlockCompletionFunc *cb, void *opaque);
 void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
                        unsigned int index, unsigned int n, bool flush,
-                        BlockDriverCompletionFunc *cb, void *opaque);
+                        BlockCompletionFunc *cb, void *opaque);
 int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
                            unsigned int index, unsigned int n, bool flush);

--- a/block/quorum.c
+++ b/block/quorum.c
@@ -16,7 +16,12 @@
 #include <gnutls/gnutls.h>
 #include <gnutls/crypto.h>
 #include "block/block_int.h"
+#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qint.h"
 #include "qapi/qmp/qjson.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qstring.h"
 #include "qapi-event.h"

 #define HASH_LENGTH 32
@@ -24,6 +29,7 @@
 #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
 #define QUORUM_OPT_BLKVERIFY      "blkverify"
 #define QUORUM_OPT_REWRITE        "rewrite-corrupted"
+#define QUORUM_OPT_READ_PATTERN   "read-pattern"

 /* This union holds a vote hash value */
 typedef union QuorumVoteValue {
@@ -74,6 +80,8 @@ typedef struct BDRVQuorumState {
    bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted
                            * block if Quorum is reached.
                            */
+
+    QuorumReadPattern read_pattern;
 } BDRVQuorumState;

 typedef struct QuorumAIOCB QuorumAIOCB;
@@ -84,7 +92,7 @@ typedef struct QuorumAIOCB QuorumAIOCB;
 * $children_count QuorumChildRequest.
 */
 typedef struct QuorumChildRequest {
-    BlockDriverAIOCB *aiocb;
+    BlockAIOCB *aiocb;
    QEMUIOVector qiov;
    uint8_t *buf;
    int ret;
@@ -97,7 +105,7 @@ typedef struct QuorumChildRequest {
 * used to do operations on each children and track overall progress.
 */
 struct QuorumAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;

    /* Request metadata */
    uint64_t sector_num;
@@ -117,11 +125,12 @@ struct QuorumAIOCB {

    bool is_read;
    int vote_ret;
+    int child_iter;             /* which child to read in fifo pattern */
 };

 static bool quorum_vote(QuorumAIOCB *acb);

-static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
+static void quorum_aio_cancel(BlockAIOCB *blockacb)
 {
    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
    BDRVQuorumState *s = acb->common.bs->opaque;
@@ -129,21 +138,19 @@ static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)

    /* cancel all callbacks */
    for (i = 0; i < s->num_children; i++) {
-        bdrv_aio_cancel(acb->qcrs[i].aiocb);
+        if (acb->qcrs[i].aiocb) {
+            bdrv_aio_cancel_async(acb->qcrs[i].aiocb);
+        }
    }
-
-    g_free(acb->qcrs);
-    qemu_aio_release(acb);
 }

 static AIOCBInfo quorum_aiocb_info = {
    .aiocb_size         = sizeof(QuorumAIOCB),
-    .cancel             = quorum_aio_cancel,
+    .cancel_async       = quorum_aio_cancel,
 };

 static void quorum_aio_finalize(QuorumAIOCB *acb)
 {
-    BDRVQuorumState *s = acb->common.bs->opaque;
    int i, ret = 0;

    if (acb->vote_ret) {
@@ -153,14 +160,15 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
    acb->common.cb(acb->common.opaque, ret);

    if (acb->is_read) {
-        for (i = 0; i < s->num_children; i++) {
+        /* on the quorum case acb->child_iter == s->num_children - 1 */
+        for (i = 0; i <= acb->child_iter; i++) {
            qemu_vfree(acb->qcrs[i].buf);
            qemu_iovec_destroy(&acb->qcrs[i].qiov);
        }
    }

    g_free(acb->qcrs);
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

 static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
@@ -178,7 +186,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
                                   QEMUIOVector *qiov,
                                   uint64_t sector_num,
                                   int nb_sectors,
-                                   BlockDriverCompletionFunc *cb,
+                                   BlockCompletionFunc *cb,
                                   void *opaque)
 {
    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque);
@@ -218,8 +226,8 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret)

 static void quorum_report_failure(QuorumAIOCB *acb)
 {
-    const char *reference = acb->common.bs->device_name[0] ?
-                            acb->common.bs->device_name :
+    const char *reference = bdrv_get_device_name(acb->common.bs)[0] ?
+                            bdrv_get_device_name(acb->common.bs) :
                            acb->common.bs->node_name;

    qapi_event_send_quorum_failure(reference, acb->sector_num,
@@ -256,6 +264,21 @@ static void quorum_rewrite_aio_cb(void *opaque, int ret)
    quorum_aio_finalize(acb);
 }

+static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb);
+
+static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
+{
+    int i;
+    assert(dest->niov == source->niov);
+    assert(dest->size == source->size);
+    for (i = 0; i < source->niov; i++) {
+        assert(dest->iov[i].iov_len == source->iov[i].iov_len);
+        memcpy(dest->iov[i].iov_base,
+               source->iov[i].iov_base,
+               source->iov[i].iov_len);
+    }
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
    QuorumChildRequest *sacb = opaque;
@@ -263,6 +286,21 @@ static void quorum_aio_cb(void *opaque, int ret)
    BDRVQuorumState *s = acb->common.bs->opaque;
    bool rewrite = false;

+    if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
+        /* We try to read next child in FIFO order if we fail to read */
+        if (ret < 0 && ++acb->child_iter < s->num_children) {
+            read_fifo_child(acb);
+            return;
+        }
+
+        if (ret == 0) {
+            quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov);
+        }
+        acb->vote_ret = ret;
+        quorum_aio_finalize(acb);
+        return;
+    }
+
    sacb->ret = ret;
    acb->count++;
    if (ret == 0) {
@@ -343,19 +381,6 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
    return count;
 }

-static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
-{
-    int i;
-    assert(dest->niov == source->niov);
-    assert(dest->size == source->size);
-    for (i = 0; i < source->niov; i++) {
-        assert(dest->iov[i].iov_len == source->iov[i].iov_len);
-        memcpy(dest->iov[i].iov_base,
-               source->iov[i].iov_base,
-               source->iov[i].iov_len);
-    }
-}
-
 static void quorum_count_vote(QuorumVotes *votes,
                              QuorumVoteValue *value,
                              int index)
@@ -615,40 +640,68 @@ free_exit:
    return rewrite;
 }

-static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
-                                         int64_t sector_num,
-                                         QEMUIOVector *qiov,
-                                         int nb_sectors,
-                                         BlockDriverCompletionFunc *cb,
-                                         void *opaque)
+static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
 {
-    BDRVQuorumState *s = bs->opaque;
-    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
-                                      nb_sectors, cb, opaque);
+    BDRVQuorumState *s = acb->common.bs->opaque;
    int i;

-    acb->is_read = true;
-
    for (i = 0; i < s->num_children; i++) {
-        acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size);
-        qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov);
-        qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf);
+        acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size);
+        qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
+        qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
    }

    for (i = 0; i < s->num_children; i++) {
-        bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors,
-                       quorum_aio_cb, &acb->qcrs[i]);
+        bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov,
+                       acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]);
    }

    return &acb->common;
 }

-static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
-                                          int64_t sector_num,
-                                          QEMUIOVector *qiov,
-                                          int nb_sectors,
-                                          BlockDriverCompletionFunc *cb,
-                                          void *opaque)
+static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb)
+{
+    BDRVQuorumState *s = acb->common.bs->opaque;
+
+    acb->qcrs[acb->child_iter].buf = qemu_blockalign(s->bs[acb->child_iter],
+                                                     acb->qiov->size);
+    qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov);
+    qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
+                     acb->qcrs[acb->child_iter].buf);
+    bdrv_aio_readv(s->bs[acb->child_iter], acb->sector_num,
+                   &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
+                   quorum_aio_cb, &acb->qcrs[acb->child_iter]);
+
+    return &acb->common;
+}
+
+static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs,
+                                    int64_t sector_num,
+                                    QEMUIOVector *qiov,
+                                    int nb_sectors,
+                                    BlockCompletionFunc *cb,
+                                    void *opaque)
+{
+    BDRVQuorumState *s = bs->opaque;
+    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
+                                      nb_sectors, cb, opaque);
+    acb->is_read = true;
+
+    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
+        acb->child_iter = s->num_children - 1;
+        return read_quorum_children(acb);
+    }
+
+    acb->child_iter = 0;
+    return read_fifo_child(acb);
+}
+
+static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs,
+                                     int64_t sector_num,
+                                     QEMUIOVector *qiov,
+                                     int nb_sectors,
+                                     BlockCompletionFunc *cb,
+                                     void *opaque)
 {
    BDRVQuorumState *s = bs->opaque;
    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors,
@@ -782,16 +835,39 @@ static QemuOptsList quorum_runtime_opts = {
            .type = QEMU_OPT_BOOL,
            .help = "Rewrite corrupted block on read quorum",
        },
+        {
+            .name = QUORUM_OPT_READ_PATTERN,
+            .type = QEMU_OPT_STRING,
+            .help = "Allowed pattern: quorum, fifo. Quorum is default",
+        },
        { /* end of list */ }
    },
 };

+static int parse_read_pattern(const char *opt)
+{
+    int i;
+
+    if (!opt) {
+        /* Set quorum as default */
+        return QUORUM_READ_PATTERN_QUORUM;
+    }
+
+    for (i = 0; i < QUORUM_READ_PATTERN_MAX; i++) {
+        if (!strcmp(opt, QuorumReadPattern_lookup[i])) {
+            return i;
+        }
+    }
+
+    return -EINVAL;
+}
+
 static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
                       Error **errp)
 {
    BDRVQuorumState *s = bs->opaque;
    Error *local_err = NULL;
-    QemuOpts *opts;
+    QemuOpts *opts = NULL;
    bool *opened;
    QDict *sub = NULL;
    QList *list = NULL;
@@ -827,28 +903,37 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
    }

    s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
-
-    /* and validate it against s->num_children */
-    ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+    ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
    if (ret < 0) {
+        error_setg(&local_err, "Please set read-pattern as fifo or quorum");
        goto exit;
    }
+    s->read_pattern = ret;

-    /* is the driver in blkverify mode */
-    if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
-        s->num_children == 2 && s->threshold == 2) {
-        s->is_blkverify = true;
-    } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) {
-        fprintf(stderr, "blkverify mode is set by setting blkverify=on "
-                "and using two files with vote_threshold=2\n");
-    }
+    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
+        /* and validate it against s->num_children */
+        ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+        if (ret < 0) {
+            goto exit;
+        }

-    s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false);
-    if (s->rewrite_corrupted && s->is_blkverify) {
-        error_setg(&local_err,
-                   "rewrite-corrupted=on cannot be used with blkverify=on");
-        ret = -EINVAL;
-        goto exit;
+        /* is the driver in blkverify mode */
+        if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
+            s->num_children == 2 && s->threshold == 2) {
+            s->is_blkverify = true;
+        } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) {
+            fprintf(stderr, "blkverify mode is set by setting blkverify=on "
+                    "and using two files with vote_threshold=2\n");
+        }
+
+        s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE,
+                                                 false);
+        if (s->rewrite_corrupted && s->is_blkverify) {
+            error_setg(&local_err,
+                       "rewrite-corrupted=on cannot be used with blkverify=on");
+            ret = -EINVAL;
+            goto exit;
+        }
    }

    /* allocate the children BlockDriverState array */
@@ -903,6 +988,7 @@ close_exit:
    g_free(s->bs);
    g_free(opened);
 exit:
+    qemu_opts_del(opts);
    /* propagate error */
    if (local_err) {
        error_propagate(errp, local_err);
@@ -945,6 +1031,39 @@ static void quorum_attach_aio_context(BlockDriverState *bs,
    }
 }

+static void quorum_refresh_filename(BlockDriverState *bs)
+{
+    BDRVQuorumState *s = bs->opaque;
+    QDict *opts;
+    QList *children;
+    int i;
+
+    for (i = 0; i < s->num_children; i++) {
+        bdrv_refresh_filename(s->bs[i]);
+        if (!s->bs[i]->full_open_options) {
+            return;
+        }
+    }
+
+    children = qlist_new();
+    for (i = 0; i < s->num_children; i++) {
+        QINCREF(s->bs[i]->full_open_options);
+        qlist_append_obj(children, QOBJECT(s->bs[i]->full_open_options));
+    }
+
+    opts = qdict_new();
+    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("quorum")));
+    qdict_put_obj(opts, QUORUM_OPT_VOTE_THRESHOLD,
+                  QOBJECT(qint_from_int(s->threshold)));
+    qdict_put_obj(opts, QUORUM_OPT_BLKVERIFY,
+                  QOBJECT(qbool_from_int(s->is_blkverify)));
+    qdict_put_obj(opts, QUORUM_OPT_REWRITE,
+                  QOBJECT(qbool_from_int(s->rewrite_corrupted)));
+    qdict_put_obj(opts, "children", QOBJECT(children));
+
+    bs->full_open_options = opts;
+}
+
 static BlockDriver bdrv_quorum = {
    .format_name                        = "quorum",
    .protocol_name                      = "quorum",
@@ -953,6 +1072,7 @@ static BlockDriver bdrv_quorum = {

    .bdrv_file_open                     = quorum_open,
    .bdrv_close                         = quorum_close,
+    .bdrv_refresh_filename              = quorum_refresh_filename,

    .bdrv_co_flush_to_disk              = quorum_co_flush,

--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -35,13 +35,13 @@
 #ifdef CONFIG_LINUX_AIO
 void *laio_init(void);
 void laio_cleanup(void *s);
-BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type);
+        BlockCompletionFunc *cb, void *opaque, int type);
 void laio_detach_aio_context(void *s, AioContext *old_context);
 void laio_attach_aio_context(void *s, AioContext *new_context);
 void laio_io_plug(BlockDriverState *bs, void *aio_ctx);
-int laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug);
+void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug);
 #endif

 #ifdef _WIN32
@@ -49,10 +49,10 @@ typedef struct QEMUWin32AIOState QEMUWin32AIOState;
 QEMUWin32AIOState *win32_aio_init(void);
 void win32_aio_cleanup(QEMUWin32AIOState *aio);
 int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
-BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
        QEMUWin32AIOState *aio, HANDLE hfile,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type);
+        BlockCompletionFunc *cb, void *opaque, int type);
 void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *old_context);
 void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -30,6 +30,7 @@
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
 #include "raw-aio.h"
+#include "qapi/util.h"

 #if defined(__APPLE__) && (__MACH__)
 #include <paths.h>
@@ -59,9 +60,6 @@
 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
 #endif
 #endif
-#ifdef CONFIG_FIEMAP
-#include <linux/fiemap.h>
-#endif
 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
 #include <linux/falloc.h>
 #endif
@@ -149,9 +147,7 @@ typedef struct BDRVRawState {
    bool has_discard:1;
    bool has_write_zeroes:1;
    bool discard_zeroes:1;
-#ifdef CONFIG_FIEMAP
-    bool skip_fiemap;
-#endif
+    bool needs_alignment;
 } BDRVRawState;

 typedef struct BDRVRawReopenState {
@@ -229,7 +225,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)

    /* For /dev/sg devices the alignment is not really used.
       With buffered I/O, we don't have any restrictions. */
-    if (bs->sg || !(s->open_flags & O_DIRECT)) {
+    if (bs->sg || !s->needs_alignment) {
        bs->request_alignment = 1;
        s->buf_align = 1;
        return;
@@ -445,8 +441,12 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,

    s->has_discard = true;
    s->has_write_zeroes = true;
+    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
+        s->needs_alignment = true;
+    }

    if (fstat(s->fd, &st) < 0) {
+        ret = -errno;
        error_setg_errno(errp, errno, "Could not stat file");
        goto fail;
    }
@@ -471,6 +471,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
        }
 #endif
    }
+#ifdef __FreeBSD__
+    if (S_ISCHR(st.st_mode)) {
+        /*
+         * The file is a char device (disk), which on FreeBSD isn't behind
+         * a pager, so force all requests to be aligned. This is needed
+         * so QEMU makes sure all IO operations on the device are aligned
+         * to sector size, or else FreeBSD will reject them with EINVAL.
+         */
+        s->needs_alignment = true;
+    }
+#endif

 #ifdef CONFIG_XFS
    if (platform_test_xfs_fd(s->fd)) {
@@ -517,7 +528,7 @@ static int raw_reopen_prepare(BDRVReopenState *state,

    s = state->bs->opaque;

-    state->opaque = g_malloc0(sizeof(BDRVRawReopenState));
+    state->opaque = g_new0(BDRVRawReopenState, 1);
    raw_s = state->opaque;

 #ifdef CONFIG_LINUX_AIO
@@ -747,6 +758,15 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
        }
        if (len == -1 && errno == EINTR) {
            continue;
+        } else if (len == -1 && errno == EINVAL &&
+                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
+                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
+                   offset > 0) {
+            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
+             * after a short read.  Assume that O_DIRECT short reads only occur
+             * at EOF.  Therefore this is a short read, not an I/O error.
+             */
+            break;
        } else if (len == -1) {
            offset = -errno;
            break;
@@ -798,7 +818,11 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
     * Ok, we have to do it the hard way, copy all segments into
     * a single aligned buffer.
     */
-    buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes);
+    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
+    if (buf == NULL) {
+        return -ENOMEM;
+    }
+
    if (aiocb->aio_type & QEMU_AIO_WRITE) {
        char *p = buf;
        int i;
@@ -1027,9 +1051,9 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
    return thread_pool_submit_co(pool, aio_worker, acb);
 }

-static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
+static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
    ThreadPool *pool;
@@ -1052,9 +1076,9 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }

-static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    BDRVRawState *s = bs->opaque;

@@ -1062,11 +1086,12 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
        return NULL;

    /*
-     * If O_DIRECT is used the buffer needs to be aligned on a sector
-     * boundary.  Check if this is the case or tell the low-level
-     * driver that it needs to copy the buffer.
+     * Check if the underlying device requires requests to be aligned,
+     * and if the request we are trying to submit is aligned or not.
+     * If this is the case tell the low-level driver that it needs
+     * to copy the buffer.
     */
-    if ((bs->open_flags & BDRV_O_NOCACHE)) {
+    if (s->needs_alignment) {
        if (!bdrv_qiov_is_aligned(bs, qiov)) {
            type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
@@ -1111,24 +1136,24 @@ static void raw_aio_flush_io_queue(BlockDriverState *bs)
 #endif
 }

-static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
                          cb, opaque, QEMU_AIO_READ);
 }

-static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
                          cb, opaque, QEMU_AIO_WRITE);
 }

-static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
+        BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;

@@ -1352,128 +1377,199 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
    int result = 0;
    int64_t total_size = 0;
    bool nocow = false;
+    PreallocMode prealloc;
+    char *buf = NULL;
+    Error *local_err = NULL;

    strstart(filename, "file:", &filename);

    /* Read out options */
-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
+    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
+    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
+                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
+                               &local_err);
+    g_free(buf);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        result = -EINVAL;
+        goto out;
+    }

    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
                   0644);
    if (fd < 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not create file");
-    } else {
-        if (nocow) {
-#ifdef __linux__
-            /* Set NOCOW flag to solve performance issue on fs like btrfs.
-             * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
-             * will be ignored since any failure of this operation should not
-             * block the left work.
-             */
-            int attr;
-            if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
-                attr |= FS_NOCOW_FL;
-                ioctl(fd, FS_IOC_SETFLAGS, &attr);
-            }
-#endif
-        }
-
-        if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
-            result = -errno;
-            error_setg_errno(errp, -result, "Could not resize file");
-        }
-        if (qemu_close(fd) != 0) {
-            result = -errno;
-            error_setg_errno(errp, -result, "Could not close the new file");
-        }
+        goto out;
    }
+
+    if (nocow) {
+#ifdef __linux__
+        /* Set NOCOW flag to solve performance issue on fs like btrfs.
+         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
+         * will be ignored since any failure of this operation should not
+         * block the left work.
+         */
+        int attr;
+        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
+            attr |= FS_NOCOW_FL;
+            ioctl(fd, FS_IOC_SETFLAGS, &attr);
+        }
+#endif
+    }
+
+    if (ftruncate(fd, total_size) != 0) {
+        result = -errno;
+        error_setg_errno(errp, -result, "Could not resize file");
+        goto out_close;
+    }
+
+    switch (prealloc) {
+#ifdef CONFIG_POSIX_FALLOCATE
+    case PREALLOC_MODE_FALLOC:
+        /* posix_fallocate() doesn't set errno. */
+        result = -posix_fallocate(fd, 0, total_size);
+        if (result != 0) {
+            error_setg_errno(errp, -result,
+                             "Could not preallocate data for the new file");
+        }
+        break;
+#endif
+    case PREALLOC_MODE_FULL:
+    {
+        int64_t num = 0, left = total_size;
+        buf = g_malloc0(65536);
+
+        while (left > 0) {
+            num = MIN(left, 65536);
+            result = write(fd, buf, num);
+            if (result < 0) {
+                result = -errno;
+                error_setg_errno(errp, -result,
+                                 "Could not write to the new file");
+                break;
+            }
+            left -= result;
+        }
+        if (result >= 0) {
+            result = fsync(fd);
+            if (result < 0) {
+                result = -errno;
+                error_setg_errno(errp, -result,
+                                 "Could not flush new file to disk");
+            }
+        }
+        g_free(buf);
+        break;
+    }
+    case PREALLOC_MODE_OFF:
+        break;
+    default:
+        result = -EINVAL;
+        error_setg(errp, "Unsupported preallocation mode: %s",
+                   PreallocMode_lookup[prealloc]);
+        break;
+    }
+
+out_close:
+    if (qemu_close(fd) != 0 && result == 0) {
+        result = -errno;
+        error_setg_errno(errp, -result, "Could not close the new file");
+    }
+out:
    return result;
 }

-static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data,
-                          off_t *hole, int nb_sectors, int *pnum)
-{
-#ifdef CONFIG_FIEMAP
-    BDRVRawState *s = bs->opaque;
-    int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
-    struct {
-        struct fiemap fm;
-        struct fiemap_extent fe;
-    } f;
-
-    if (s->skip_fiemap) {
-        return -ENOTSUP;
-    }
-
-    f.fm.fm_start = start;
-    f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
-    f.fm.fm_flags = 0;
-    f.fm.fm_extent_count = 1;
-    f.fm.fm_reserved = 0;
-    if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
-        s->skip_fiemap = true;
-        return -errno;
-    }
-
-    if (f.fm.fm_mapped_extents == 0) {
-        /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
-         * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
-         */
-        off_t length = lseek(s->fd, 0, SEEK_END);
-        *hole = f.fm.fm_start;
-        *data = MIN(f.fm.fm_start + f.fm.fm_length, length);
-    } else {
-        *data = f.fe.fe_logical;
-        *hole = f.fe.fe_logical + f.fe.fe_length;
-        if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
-            ret |= BDRV_BLOCK_ZERO;
-        }
-    }
-
-    return ret;
-#else
-    return -ENOTSUP;
-#endif
-}
-
-static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
-                             off_t *hole, int *pnum)
+/*
+ * Find allocation range in @bs around offset @start.
+ * May change underlying file descriptor's file offset.
+ * If @start is not in a hole, store @start in @data, and the
+ * beginning of the next hole in @hole, and return 0.
+ * If @start is in a non-trailing hole, store @start in @hole and the
+ * beginning of the next non-hole in @data, and return 0.
+ * If @start is in a trailing hole or beyond EOF, return -ENXIO.
+ * If we can't find out, return a negative errno other than -ENXIO.
+ */
+static int find_allocation(BlockDriverState *bs, off_t start,
+                           off_t *data, off_t *hole)
 {
 #if defined SEEK_HOLE && defined SEEK_DATA
    BDRVRawState *s = bs->opaque;
+    off_t offs;

-    *hole = lseek(s->fd, start, SEEK_HOLE);
-    if (*hole == -1) {
-        /* -ENXIO indicates that sector_num was past the end of the file.
-         * There is a virtual hole there.  */
-        assert(errno != -ENXIO);
+    /*
+     * SEEK_DATA cases:
+     * D1. offs == start: start is in data
+     * D2. offs > start: start is in a hole, next data at offs
+     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
+     *                              or start is beyond EOF
+     *     If the latter happens, the file has been truncated behind
+     *     our back since we opened it.  All bets are off then.
+     *     Treating like a trailing hole is simplest.
+     * D4. offs < 0, errno != ENXIO: we learned nothing
+     */
+    offs = lseek(s->fd, start, SEEK_DATA);
+    if (offs < 0) {
+        return -errno;          /* D3 or D4 */
+    }
+    assert(offs >= start);

-        return -errno;
+    if (offs > start) {
+        /* D2: in hole, next data at offs */
+        *hole = start;
+        *data = offs;
+        return 0;
    }

-    if (*hole > start) {
+    /* D1: in data, end not yet known */
+
+    /*
+     * SEEK_HOLE cases:
+     * H1. offs == start: start is in a hole
+     *     If this happens here, a hole has been dug behind our back
+     *     since the previous lseek().
+     * H2. offs > start: either start is in data, next hole at offs,
+     *                   or start is in trailing hole, EOF at offs
+     *     Linux treats trailing holes like any other hole: offs ==
+     *     start.  Solaris seeks to EOF instead: offs > start (blech).
+     *     If that happens here, a hole has been dug behind our back
+     *     since the previous lseek().
+     * H3. offs < 0, errno = ENXIO: start is beyond EOF
+     *     If this happens, the file has been truncated behind our
+     *     back since we opened it.  Treat it like a trailing hole.
+     * H4. offs < 0, errno != ENXIO: we learned nothing
+     *     Pretend we know nothing at all, i.e. "forget" about D1.
+     */
+    offs = lseek(s->fd, start, SEEK_HOLE);
+    if (offs < 0) {
+        return -errno;          /* D1 and (H3 or H4) */
+    }
+    assert(offs >= start);
+
+    if (offs > start) {
+        /*
+         * D1 and H2: either in data, next hole at offs, or it was in
+         * data but is now in a trailing hole.  In the latter case,
+         * all bets are off.  Treating it as if it there was data all
+         * the way to EOF is safe, so simply do that.
+         */
        *data = start;
-    } else {
-        /* On a hole.  We need another syscall to find its end.  */
-        *data = lseek(s->fd, start, SEEK_DATA);
-        if (*data == -1) {
-            *data = lseek(s->fd, 0, SEEK_END);
-        }
+        *hole = offs;
+        return 0;
    }

-    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
+    /* D1 and H1 */
+    return -EBUSY;
 #else
    return -ENOTSUP;
 #endif
 }

 /*
- * Returns true iff the specified sector is present in the disk image. Drivers
- * not implementing the functionality are assumed to not support backing files,
- * hence all their sectors are reported as allocated.
+ * Returns the allocation status of the specified sectors.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
@@ -1490,7 +1586,8 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                                    int nb_sectors, int *pnum)
 {
    off_t start, data = 0, hole = 0;
-    int64_t ret;
+    int64_t total_size;
+    int ret;

    ret = fd_open(bs);
    if (ret < 0) {
@@ -1498,34 +1595,41 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
    }

    start = sector_num * BDRV_SECTOR_SIZE;
-
-    ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum);
-    if (ret < 0) {
-        ret = try_seek_hole(bs, start, &data, &hole, pnum);
-        if (ret < 0) {
-            /* Assume everything is allocated. */
-            data = 0;
-            hole = start + nb_sectors * BDRV_SECTOR_SIZE;
-            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
-        }
+    total_size = bdrv_getlength(bs);
+    if (total_size < 0) {
+        return total_size;
+    } else if (start >= total_size) {
+        *pnum = 0;
+        return 0;
+    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
+        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
    }

-    if (data <= start) {
+    ret = find_allocation(bs, start, &data, &hole);
+    if (ret == -ENXIO) {
+        /* Trailing hole */
+        *pnum = nb_sectors;
+        ret = BDRV_BLOCK_ZERO;
+    } else if (ret < 0) {
+        /* No info available, so pretend there are no holes */
+        *pnum = nb_sectors;
+        ret = BDRV_BLOCK_DATA;
+    } else if (data == start) {
        /* On a data extent, compute sectors to the end of the extent.  */
        *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
+        ret = BDRV_BLOCK_DATA;
    } else {
        /* On a hole, compute sectors to the beginning of the next extent.  */
+        assert(hole == start);
        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
-        ret &= ~BDRV_BLOCK_DATA;
-        ret |= BDRV_BLOCK_ZERO;
+        ret = BDRV_BLOCK_ZERO;
    }
-
-    return ret;
+    return ret | BDRV_BLOCK_OFFSET_VALID | start;
 }

-static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
+static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors,
-    BlockDriverCompletionFunc *cb, void *opaque)
+    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;

@@ -1572,11 +1676,16 @@ static QemuOptsList raw_create_opts = {
            .type = QEMU_OPT_BOOL,
            .help = "Turn off copy-on-write (valid only on btrfs)"
        },
+        {
+            .name = BLOCK_OPT_PREALLOC,
+            .type = QEMU_OPT_STRING,
+            .help = "Preallocation mode (allowed values: off, falloc, full)"
+        },
        { /* end of list */ }
    }
 };

-static BlockDriver bdrv_file = {
+BlockDriver bdrv_file = {
    .format_name = "file",
    .protocol_name = "file",
    .instance_size = sizeof(BDRVRawState),
@@ -1814,7 +1923,7 @@ static int fd_open(BlockDriverState *bs)
        return 0;
    last_media_present = (s->fd >= 0);
    if (s->fd >= 0 &&
-        (get_clock() - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
+        (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
        qemu_close(s->fd);
        s->fd = -1;
 #ifdef DEBUG_FLOPPY
@@ -1823,7 +1932,7 @@ static int fd_open(BlockDriverState *bs)
    }
    if (s->fd < 0) {
        if (s->fd_got_error &&
-            (get_clock() - s->fd_error_time) < FD_OPEN_TIMEOUT) {
+            (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_error_time) < FD_OPEN_TIMEOUT) {
 #ifdef DEBUG_FLOPPY
            printf("No floppy (open delayed)\n");
 #endif
@@ -1831,7 +1940,7 @@ static int fd_open(BlockDriverState *bs)
        }
        s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK);
        if (s->fd < 0) {
-            s->fd_error_time = get_clock();
+            s->fd_error_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
            s->fd_got_error = 1;
            if (last_media_present)
                s->fd_media_changed = 1;
@@ -1846,7 +1955,7 @@ static int fd_open(BlockDriverState *bs)
    }
    if (!last_media_present)
        s->fd_media_changed = 1;
-    s->fd_open_time = get_clock();
+    s->fd_open_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    s->fd_got_error = 0;
    return 0;
 }
@@ -1858,9 +1967,9 @@ static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
    return ioctl(s->fd, req, buf);
 }

-static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
+static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData *acb;
@@ -1899,9 +2008,9 @@ static int fd_open(BlockDriverState *bs)

 #endif /* !linux && !FreeBSD */

-static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
+static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors,
-    BlockDriverCompletionFunc *cb, void *opaque)
+    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;

@@ -1953,8 +2062,8 @@ static int hdev_create(const char *filename, QemuOpts *opts,
    (void)has_prefix;

    /* Read out options */
-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);

    fd = qemu_open(filename, O_WRONLY | O_BINARY);
    if (fd < 0) {
@@ -1970,7 +2079,7 @@ static int hdev_create(const char *filename, QemuOpts *opts,
        error_setg(errp,
                   "The given file is neither a block nor a character device");
        ret = -ENODEV;
-    } else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) {
+    } else if (lseek(fd, 0, SEEK_END) < total_size) {
        error_setg(errp, "Device is too small");
        ret = -ENOSPC;
    }
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -138,9 +138,9 @@ static int aio_worker(void *arg)
    return ret;
 }

-static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
+static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    RawWin32AIOData *acb = g_slice_new(RawWin32AIOData);
    ThreadPool *pool;
@@ -369,9 +369,9 @@ fail:
    return ret;
 }

-static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
                         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-                         BlockDriverCompletionFunc *cb, void *opaque)
+                         BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
    if (s->aio) {
@@ -383,9 +383,9 @@ static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
    }
 }

-static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
                          int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-                          BlockDriverCompletionFunc *cb, void *opaque)
+                          BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
    if (s->aio) {
@@ -397,8 +397,8 @@ static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
    }
 }

-static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
-                         BlockDriverCompletionFunc *cb, void *opaque)
+static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
+                         BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
    return paio_submit(bs, s->hfile, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
@@ -511,8 +511,8 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
    strstart(filename, "file:", &filename);

    /* Read out options */
-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);

    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
                   0644);
@@ -521,7 +521,7 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
        return -EIO;
    }
    set_sparse(fd);
-    ftruncate(fd, total_size * 512);
+    ftruncate(fd, total_size);
    qemu_close(fd);
    return 0;
 }
@@ -540,7 +540,7 @@ static QemuOptsList raw_create_opts = {
    }
 };

-static BlockDriver bdrv_file = {
+BlockDriver bdrv_file = {
    .format_name	= "file",
    .protocol_name	= "file",
    .instance_size	= sizeof(BDRVRawState),
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -58,8 +58,58 @@ static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
 static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *qiov)
 {
+    void *buf = NULL;
+    BlockDriver *drv;
+    QEMUIOVector local_qiov;
+    int ret;
+
+    if (bs->probed && sector_num == 0) {
+        /* As long as these conditions are true, we can't get partial writes to
+         * the probe buffer and can just directly check the request. */
+        QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512);
+        QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512);
+
+        if (nb_sectors == 0) {
+            /* qemu_iovec_to_buf() would fail, but we want to return success
+             * instead of -EINVAL in this case. */
+            return 0;
+        }
+
+        buf = qemu_try_blockalign(bs->file, 512);
+        if (!buf) {
+            ret = -ENOMEM;
+            goto fail;
+        }
+
+        ret = qemu_iovec_to_buf(qiov, 0, buf, 512);
+        if (ret != 512) {
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        drv = bdrv_probe_all(buf, 512, NULL);
+        if (drv != bs->drv) {
+            ret = -EPERM;
+            goto fail;
+        }
+
+        /* Use the checked buffer, a malicious guest might be overwriting its
+         * original buffer in the background. */
+        qemu_iovec_init(&local_qiov, qiov->niov + 1);
+        qemu_iovec_add(&local_qiov, buf, 512);
+        qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512);
+        qiov = &local_qiov;
+    }
+
    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-    return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov);
+    ret = bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov);
+
+fail:
+    if (qiov == &local_qiov) {
+        qemu_iovec_destroy(&local_qiov);
+    }
+    qemu_vfree(buf);
+    return ret;
 }

 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
@@ -129,10 +179,10 @@ static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
    return bdrv_ioctl(bs->file, req, buf);
 }

-static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs,
-                                       unsigned long int req, void *buf,
-                                       BlockDriverCompletionFunc *cb,
-                                       void *opaque)
+static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs,
+                                 unsigned long int req, void *buf,
+                                 BlockCompletionFunc *cb,
+                                 void *opaque)
 {
    return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque);
 }
@@ -158,6 +208,18 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    bs->sg = bs->file->sg;
+
+    if (bs->probed && !bdrv_is_read_only(bs)) {
+        fprintf(stderr,
+                "WARNING: Image format was not specified for '%s' and probing "
+                "guessed raw.\n"
+                "         Automatically detecting the format is dangerous for "
+                "raw images, write operations on block 0 will be restricted.\n"
+                "         Specify the 'raw' format explicitly to remove the "
+                "restrictions.\n",
+                bs->file->filename);
+    }
+
    return 0;
 }

@@ -173,7 +235,7 @@ static int raw_probe(const uint8_t *buf, int buf_size, const char *filename)
    return 1;
 }

-static BlockDriver bdrv_raw = {
+BlockDriver bdrv_raw = {
    .format_name          = "raw",
    .bdrv_probe           = &raw_probe,
    .bdrv_reopen_prepare  = &raw_reopen_prepare,
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -68,7 +68,7 @@ typedef enum {
 } RBDAIOCmd;

 typedef struct RBDAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;
    int64_t ret;
    QEMUIOVector *qiov;
@@ -77,7 +77,6 @@ typedef struct RBDAIOCB {
    int64_t sector_num;
    int error;
    struct BDRVRBDState *s;
-    int cancelled;
    int status;
 } RBDAIOCB;

@@ -314,7 +313,8 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    /* Read out options */
-    bytes = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                     BDRV_SECTOR_SIZE);
    objsize = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, 0);
    if (objsize) {
        if ((objsize - 1) & objsize) {    /* not a power of 2? */
@@ -407,9 +407,7 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
    acb->status = 0;

-    if (!acb->cancelled) {
-        qemu_aio_release(acb);
-    }
+    qemu_aio_unref(acb);
 }

 /* TODO Convert to fine grained options */
@@ -461,7 +459,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
    r = rados_create(&s->cluster, clientname);
    if (r < 0) {
-        error_setg(&local_err, "error initializing");
+        error_setg(errp, "error initializing");
        goto failed_opts;
    }

@@ -497,19 +495,19 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,

    r = rados_connect(s->cluster);
    if (r < 0) {
-        error_setg(&local_err, "error connecting");
+        error_setg(errp, "error connecting");
        goto failed_shutdown;
    }

    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);
    if (r < 0) {
-        error_setg(&local_err, "error opening pool %s", pool);
+        error_setg(errp, "error opening pool %s", pool);
        goto failed_shutdown;
    }

    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
    if (r < 0) {
-        error_setg(&local_err, "error reading header from %s", s->name);
+        error_setg(errp, "error reading header from %s", s->name);
        goto failed_open;
    }

@@ -538,25 +536,8 @@ static void qemu_rbd_close(BlockDriverState *bs)
    rados_shutdown(s->cluster);
 }

-/*
- * Cancel aio. Since we don't reference acb in a non qemu threads,
- * it is safe to access it here.
- */
-static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    RBDAIOCB *acb = (RBDAIOCB *) blockacb;
-    acb->cancelled = 1;
-
-    while (acb->status == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(acb->common.bs), true);
-    }
-
-    qemu_aio_release(acb);
-}
-
 static const AIOCBInfo rbd_aiocb_info = {
    .aiocb_size = sizeof(RBDAIOCB),
-    .cancel = qemu_rbd_aio_cancel,
 };

 static void rbd_finish_bh(void *opaque)
@@ -608,16 +589,16 @@ static int rbd_aio_flush_wrapper(rbd_image_t image,
 #endif
 }

-static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
-                                       int64_t sector_num,
-                                       QEMUIOVector *qiov,
-                                       int nb_sectors,
-                                       BlockDriverCompletionFunc *cb,
-                                       void *opaque,
-                                       RBDAIOCmd cmd)
+static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
+                                 int64_t sector_num,
+                                 QEMUIOVector *qiov,
+                                 int nb_sectors,
+                                 BlockCompletionFunc *cb,
+                                 void *opaque,
+                                 RBDAIOCmd cmd)
 {
    RBDAIOCB *acb;
-    RADOSCB *rcb;
+    RADOSCB *rcb = NULL;
    rbd_completion_t c;
    int64_t off, size;
    char *buf;
@@ -631,12 +612,14 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
    if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
        acb->bounce = NULL;
    } else {
-        acb->bounce = qemu_blockalign(bs, qiov->size);
+        acb->bounce = qemu_try_blockalign(bs, qiov->size);
+        if (acb->bounce == NULL) {
+            goto failed;
+        }
    }
    acb->ret = 0;
    acb->error = 0;
    acb->s = s;
-    acb->cancelled = 0;
    acb->bh = NULL;
    acb->status = -EINPROGRESS;

@@ -649,7 +632,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
    off = sector_num * BDRV_SECTOR_SIZE;
    size = nb_sectors * BDRV_SECTOR_SIZE;

-    rcb = g_malloc(sizeof(RADOSCB));
+    rcb = g_new(RADOSCB, 1);
    rcb->done = 0;
    rcb->acb = acb;
    rcb->buf = buf;
@@ -688,36 +671,36 @@ failed_completion:
 failed:
    g_free(rcb);
    qemu_vfree(acb->bounce);
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
    return NULL;
 }

-static BlockDriverAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
-                                            int64_t sector_num,
-                                            QEMUIOVector *qiov,
-                                            int nb_sectors,
-                                            BlockDriverCompletionFunc *cb,
-                                            void *opaque)
+static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
+                                      int64_t sector_num,
+                                      QEMUIOVector *qiov,
+                                      int nb_sectors,
+                                      BlockCompletionFunc *cb,
+                                      void *opaque)
 {
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
                         RBD_AIO_READ);
 }

-static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
-                                             int64_t sector_num,
-                                             QEMUIOVector *qiov,
-                                             int nb_sectors,
-                                             BlockDriverCompletionFunc *cb,
-                                             void *opaque)
+static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
+                                       int64_t sector_num,
+                                       QEMUIOVector *qiov,
+                                       int nb_sectors,
+                                       BlockCompletionFunc *cb,
+                                       void *opaque)
 {
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
                         RBD_AIO_WRITE);
 }

 #ifdef LIBRBD_SUPPORTS_AIO_FLUSH
-static BlockDriverAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
-                                            BlockDriverCompletionFunc *cb,
-                                            void *opaque)
+static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
+                                      BlockCompletionFunc *cb,
+                                      void *opaque)
 {
    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
 }
@@ -859,7 +842,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
    int max_snaps = RBD_MAX_SNAPS;

    do {
-        snaps = g_malloc(sizeof(*snaps) * max_snaps);
+        snaps = g_new(rbd_snap_info_t, max_snaps);
        snap_count = rbd_snap_list(s->image, snaps, &max_snaps);
        if (snap_count <= 0) {
            g_free(snaps);
@@ -870,7 +853,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
        goto done;
    }

-    sn_tab = g_malloc0(snap_count * sizeof(QEMUSnapshotInfo));
+    sn_tab = g_new0(QEMUSnapshotInfo, snap_count);

    for (i = 0; i < snap_count; i++) {
        const char *snap_name = snaps[i].name;
@@ -893,17 +876,29 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
 }

 #ifdef LIBRBD_SUPPORTS_DISCARD
-static BlockDriverAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
-                                              int64_t sector_num,
-                                              int nb_sectors,
-                                              BlockDriverCompletionFunc *cb,
-                                              void *opaque)
+static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
+                                        int64_t sector_num,
+                                        int nb_sectors,
+                                        BlockCompletionFunc *cb,
+                                        void *opaque)
 {
    return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque,
                         RBD_AIO_DISCARD);
 }
 #endif

+#ifdef LIBRBD_SUPPORTS_INVALIDATE
+static void qemu_rbd_invalidate_cache(BlockDriverState *bs,
+                                      Error **errp)
+{
+    BDRVRBDState *s = bs->opaque;
+    int r = rbd_invalidate_cache(s->image);
+    if (r < 0) {
+        error_setg_errno(errp, -r, "Failed to invalidate the cache");
+    }
+}
+#endif
+
 static QemuOptsList qemu_rbd_create_opts = {
    .name = "rbd-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head),
@@ -953,6 +948,9 @@ static BlockDriver bdrv_rbd = {
    .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
    .bdrv_snapshot_list     = qemu_rbd_snap_list,
    .bdrv_snapshot_goto     = qemu_rbd_snap_rollback,
+#ifdef LIBRBD_SUPPORTS_INVALIDATE
+    .bdrv_invalidate_cache  = qemu_rbd_invalidate_cache,
+#endif
 };

 static void bdrv_rbd_init(void)
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -103,6 +103,9 @@
 #define SD_INODE_SIZE (sizeof(SheepdogInode))
 #define CURRENT_VDI_ID 0

+#define LOCK_TYPE_NORMAL 0
+#define LOCK_TYPE_SHARED 1      /* for iSCSI multipath */
+
 typedef struct SheepdogReq {
    uint8_t proto_ver;
    uint8_t opcode;
@@ -166,7 +169,8 @@ typedef struct SheepdogVdiReq {
    uint8_t copy_policy;
    uint8_t reserved[2];
    uint32_t snapid;
-    uint32_t pad[3];
+    uint32_t type;
+    uint32_t pad[2];
 } SheepdogVdiReq;

 typedef struct SheepdogVdiRsp {
@@ -297,7 +301,7 @@ enum AIOCBState {
 };

 struct SheepdogAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;

    QEMUIOVector *qiov;

@@ -311,7 +315,6 @@ struct SheepdogAIOCB {
    void (*aio_done_func)(SheepdogAIOCB *);

    bool cancelable;
-    bool *finished;
    int nr_pending;
 };

@@ -442,10 +445,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
 static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
 {
    qemu_coroutine_enter(acb->coroutine, NULL);
-    if (acb->finished) {
-        *acb->finished = true;
-    }
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

 /*
@@ -473,41 +473,38 @@ static bool sd_acb_cancelable(const SheepdogAIOCB *acb)
    return true;
 }

-static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
+static void sd_aio_cancel(BlockAIOCB *blockacb)
 {
    SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
    BDRVSheepdogState *s = acb->common.bs->opaque;
    AIOReq *aioreq, *next;
-    bool finished = false;

-    acb->finished = &finished;
-    while (!finished) {
-        if (sd_acb_cancelable(acb)) {
-            /* Remove outstanding requests from pending and failed queues.  */
-            QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
-                               next) {
-                if (aioreq->aiocb == acb) {
-                    free_aio_req(s, aioreq);
-                }
+    if (sd_acb_cancelable(acb)) {
+        /* Remove outstanding requests from pending and failed queues.  */
+        QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
+                           next) {
+            if (aioreq->aiocb == acb) {
+                free_aio_req(s, aioreq);
            }
-            QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
-                               next) {
-                if (aioreq->aiocb == acb) {
-                    free_aio_req(s, aioreq);
-                }
-            }
-
-            assert(acb->nr_pending == 0);
-            sd_finish_aiocb(acb);
-            return;
        }
-        aio_poll(s->aio_context, true);
+        QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
+                           next) {
+            if (aioreq->aiocb == acb) {
+                free_aio_req(s, aioreq);
+            }
+        }
+
+        assert(acb->nr_pending == 0);
+        if (acb->common.cb) {
+            acb->common.cb(acb->common.opaque, -ECANCELED);
+        }
+        sd_finish_aiocb(acb);
    }
 }

 static const AIOCBInfo sd_aiocb_info = {
-    .aiocb_size = sizeof(SheepdogAIOCB),
-    .cancel = sd_aio_cancel,
+    .aiocb_size     = sizeof(SheepdogAIOCB),
+    .cancel_async   = sd_aio_cancel,
 };

 static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
@@ -524,7 +521,6 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,

    acb->aio_done_func = NULL;
    acb->cancelable = true;
-    acb->finished = NULL;
    acb->coroutine = qemu_coroutine_self();
    acb->ret = 0;
    acb->nr_pending = 0;
@@ -712,7 +708,6 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)

 static coroutine_fn void reconnect_to_sdog(void *opaque)
 {
-    Error *local_err = NULL;
    BDRVSheepdogState *s = opaque;
    AIOReq *aio_req, *next;

@@ -727,6 +722,7 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)

    /* Try to reconnect the sheepdog server every one second. */
    while (s->fd < 0) {
+        Error *local_err = NULL;
        s->fd = get_sheep_fd(s, &local_err);
        if (s->fd < 0) {
            DPRINTF("Wait for connection to be established\n");
@@ -1090,6 +1086,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
    memset(&hdr, 0, sizeof(hdr));
    if (lock) {
        hdr.opcode = SD_OP_LOCK_VDI;
+        hdr.type = LOCK_TYPE_NORMAL;
    } else {
        hdr.opcode = SD_OP_GET_VDI_INFO;
    }
@@ -1110,6 +1107,8 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
                   sd_strerror(rsp->result), filename, snapid, tag);
        if (rsp->result == SD_RES_NO_VDI) {
            ret = -ENOENT;
+        } else if (rsp->result == SD_RES_VDI_LOCKED) {
+            ret = -EBUSY;
        } else {
            ret = -EIO;
        }
@@ -1682,7 +1681,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
    uint32_t snapid;
    bool prealloc = false;

-    s = g_malloc0(sizeof(BDRVSheepdogState));
+    s = g_new0(BDRVSheepdogState, 1);

    memset(tag, 0, sizeof(tag));
    if (strstr(filename, "://")) {
@@ -1695,7 +1694,8 @@ static int sd_create(const char *filename, QemuOpts *opts,
        goto out;
    }

-    s->inode.vdi_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                                 BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    if (!buf || !strcmp(buf, "off")) {
@@ -1793,6 +1793,7 @@ static void sd_close(BlockDriverState *bs)
    memset(&hdr, 0, sizeof(hdr));

    hdr.opcode = SD_OP_RELEASE_VDI;
+    hdr.type = LOCK_TYPE_NORMAL;
    hdr.base_vdi_id = s->inode.vdi_id;
    wlen = strlen(s->name) + 1;
    hdr.data_length = wlen;
@@ -2129,7 +2130,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,

    ret = sd_co_rw_vector(acb);
    if (ret <= 0) {
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return ret;
    }

@@ -2150,7 +2151,7 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,

    ret = sd_co_rw_vector(acb);
    if (ret <= 0) {
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return ret;
    }

@@ -2273,7 +2274,7 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
    uint32_t snapid = 0;
    int ret = 0;

-    old_s = g_malloc(sizeof(BDRVSheepdogState));
+    old_s = g_new(BDRVSheepdogState, 1);

    memcpy(old_s, s, sizeof(BDRVSheepdogState));

@@ -2357,7 +2358,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
        goto out;
    }

-    sn_tab = g_malloc0(nr * sizeof(*sn_tab));
+    sn_tab = g_new0(QEMUSnapshotInfo, nr);

    /* calculate a vdi id with hash function */
    hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
@@ -2509,7 +2510,7 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,

    ret = sd_co_rw_vector(acb);
    if (ret <= 0) {
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return ret;
    }

--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -236,6 +236,10 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
        error_setg(errp, "snapshot_id and name are both NULL");
        return -EINVAL;
    }
+
+    /* drain all pending i/o before deleting snapshot */
+    bdrv_drain_all();
+
    if (drv->bdrv_snapshot_delete) {
        return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp);
    }
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -517,6 +517,11 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
    const char *host, *user, *path, *host_key_check;
    int port;

+    if (!qdict_haskey(options, "host")) {
+        ret = -EINVAL;
+        error_setg(errp, "No hostname was specified");
+        goto err;
+    }
    host = qdict_get_str(options, "host");

    if (qdict_haskey(options, "port")) {
@@ -525,6 +530,11 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
        port = 22;
    }

+    if (!qdict_haskey(options, "path")) {
+        ret = -EINVAL;
+        error_setg(errp, "No path was specified");
+        goto err;
+    }
    path = qdict_get_str(options, "path");

    if (qdict_haskey(options, "user")) {
@@ -700,7 +710,8 @@ static int ssh_create(const char *filename, QemuOpts *opts, Error **errp)
    ssh_state_init(&s);

    /* Get desired file size. */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    DPRINTF("total_size=%" PRIi64, total_size);

    uri_options = qdict_new();
--- a/block/stream.c
+++ b/block/stream.c
@@ -79,9 +79,39 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
    bdrv_refresh_limits(top, NULL);
 }

+typedef struct {
+    int ret;
+    bool reached_end;
+} StreamCompleteData;
+
+static void stream_complete(BlockJob *job, void *opaque)
+{
+    StreamBlockJob *s = container_of(job, StreamBlockJob, common);
+    StreamCompleteData *data = opaque;
+    BlockDriverState *base = s->base;
+
+    if (!block_job_is_cancelled(&s->common) && data->reached_end &&
+        data->ret == 0) {
+        const char *base_id = NULL, *base_fmt = NULL;
+        if (base) {
+            base_id = s->backing_file_str;
+            if (base->drv) {
+                base_fmt = base->drv->format_name;
+            }
+        }
+        data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt);
+        close_unused_images(job->bs, base, base_id);
+    }
+
+    g_free(s->backing_file_str);
+    block_job_completed(&s->common, data->ret);
+    g_free(data);
+}
+
 static void coroutine_fn stream_run(void *opaque)
 {
    StreamBlockJob *s = opaque;
+    StreamCompleteData *data;
    BlockDriverState *bs = s->common.bs;
    BlockDriverState *base = s->base;
    int64_t sector_num, end;
@@ -183,21 +213,13 @@ wait:
    /* Do not remove the backing file if an error was there but ignored.  */
    ret = error;

-    if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
-        const char *base_id = NULL, *base_fmt = NULL;
-        if (base) {
-            base_id = s->backing_file_str;
-            if (base->drv) {
-                base_fmt = base->drv->format_name;
-            }
-        }
-        ret = bdrv_change_backing_file(bs, base_id, base_fmt);
-        close_unused_images(bs, base, base_id);
-    }
-
    qemu_vfree(buf);
-    g_free(s->backing_file_str);
-    block_job_completed(&s->common, ret);
+
+    /* Modify backing chain and close BDSes in main loop */
+    data = g_malloc(sizeof(*data));
+    data->ret = ret;
+    data->reached_end = sector_num == end;
+    block_job_defer_to_main_loop(&s->common, stream_complete, data);
 }

 static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -220,7 +242,7 @@ static const BlockJobDriver stream_job_driver = {
 void stream_start(BlockDriverState *bs, BlockDriverState *base,
                  const char *backing_file_str, int64_t speed,
                  BlockdevOnError on_error,
-                  BlockDriverCompletionFunc *cb,
+                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
 {
    StreamBlockJob *s;
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -53,13 +53,6 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
-#ifdef __linux__
-#include <linux/fs.h>
-#include <sys/ioctl.h>
-#ifndef FS_NOCOW_FL
-#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
-#endif
-#endif

 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
@@ -127,8 +120,18 @@ typedef unsigned char uuid_t[16];

 #define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)

-/* max blocks in image is (0xffffffff / 4) */
-#define VDI_BLOCKS_IN_IMAGE_MAX  0x3fffffff
+/* The bmap will take up VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) bytes; since
+ * the bmap is read and written in a single operation, its size needs to be
+ * limited to INT_MAX; furthermore, when opening an image, the bmap size is
+ * rounded up to be aligned on BDRV_SECTOR_SIZE.
+ * Therefore this should satisfy the following:
+ * VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) + BDRV_SECTOR_SIZE == INT_MAX + 1
+ * (INT_MAX + 1 is the first value not representable as an int)
+ * This guarantees that any value below or equal to the constant will, when
+ * multiplied by sizeof(uint32_t) and rounded up to a BDRV_SECTOR_SIZE boundary,
+ * still be below or equal to INT_MAX. */
+#define VDI_BLOCKS_IN_IMAGE_MAX \
+    ((unsigned)((INT_MAX + 1u - BDRV_SECTOR_SIZE) / sizeof(uint32_t)))
 #define VDI_DISK_SIZE_MAX        ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \
                                  (uint64_t)DEFAULT_CLUSTER_SIZE)

@@ -144,12 +147,14 @@ static inline int uuid_is_null(const uuid_t uu)
    return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0;
 }

+# if defined(CONFIG_VDI_DEBUG)
 static inline void uuid_unparse(const uuid_t uu, char *out)
 {
    snprintf(out, 37, UUID_FMT,
            uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
            uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]);
 }
+# endif
 #endif

 typedef struct {
@@ -299,7 +304,12 @@ static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res,
        return -ENOTSUP;
    }

-    bmap = g_malloc(s->header.blocks_in_image * sizeof(uint32_t));
+    bmap = g_try_new(uint32_t, s->header.blocks_in_image);
+    if (s->header.blocks_in_image && bmap == NULL) {
+        res->check_errors++;
+        return -ENOMEM;
+    }
+
    memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t));

    /* Check block map and value of blocks_allocated. */
@@ -357,23 +367,23 @@ static int vdi_make_empty(BlockDriverState *bs)
 static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
    const VdiHeader *header = (const VdiHeader *)buf;
-    int result = 0;
+    int ret = 0;

    logout("\n");

    if (buf_size < sizeof(*header)) {
        /* Header too small, no VDI. */
    } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) {
-        result = 100;
+        ret = 100;
    }

-    if (result == 0) {
+    if (ret == 0) {
        logout("no vdi image\n");
    } else {
        logout("%s", header->text);
    }

-    return result;
+    return ret;
 }

 static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
@@ -409,8 +419,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
           We accept them but round the disk size to the next multiple of
           SECTOR_SIZE. */
        logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size);
-        header.disk_size += SECTOR_SIZE - 1;
-        header.disk_size &= ~(SECTOR_SIZE - 1);
+        header.disk_size = ROUND_UP(header.disk_size, SECTOR_SIZE);
    }

    if (header.signature != VDI_SIGNATURE) {
@@ -477,8 +486,13 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    s->header = header;

    bmap_size = header.blocks_in_image * sizeof(uint32_t);
-    bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE;
-    s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
+    bmap_size = DIV_ROUND_UP(bmap_size, SECTOR_SIZE);
+    s->bmap = qemu_try_blockalign(bs->file, bmap_size * SECTOR_SIZE);
+    if (s->bmap == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size);
    if (ret < 0) {
        goto fail_free_bmap;
@@ -487,13 +501,13 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when vdi images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vdi", bs->device_name, "live migration");
+              "vdi", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);

    return 0;

 fail_free_bmap:
-    g_free(s->bmap);
+    qemu_vfree(s->bmap);

 fail:
    return ret;
@@ -681,8 +695,7 @@ static int vdi_co_write(BlockDriverState *bs,

 static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
 {
-    int fd;
-    int result = 0;
+    int ret = 0;
    uint64_t bytes = 0;
    uint32_t blocks;
    size_t block_size = DEFAULT_CLUSTER_SIZE;
@@ -690,12 +703,16 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    VdiHeader header;
    size_t i;
    size_t bmap_size;
-    bool nocow = false;
+    int64_t offset = 0;
+    Error *local_err = NULL;
+    BlockDriverState *bs = NULL;
+    uint32_t *bmap = NULL;

    logout("\n");

    /* Read out options. */
-    bytes = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                     BDRV_SECTOR_SIZE);
 #if defined(CONFIG_VDI_BLOCK_SIZE)
    /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). */
    block_size = qemu_opt_get_size_del(opts,
@@ -707,45 +724,33 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
        image_type = VDI_TYPE_STATIC;
    }
 #endif
-    nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false);

    if (bytes > VDI_DISK_SIZE_MAX) {
-        result = -ENOTSUP;
+        ret = -ENOTSUP;
        error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64
                          ", max supported is 0x%" PRIx64 ")",
                          bytes, VDI_DISK_SIZE_MAX);
        goto exit;
    }

-    fd = qemu_open(filename,
-                   O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
-                   0644);
-    if (fd < 0) {
-        result = -errno;
+    ret = bdrv_create_file(filename, opts, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
        goto exit;
    }
-
-    if (nocow) {
-#ifdef __linux__
-        /* Set NOCOW flag to solve performance issue on fs like btrfs.
-         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will
-         * be ignored since any failure of this operation should not block the
-         * left work.
-         */
-        int attr;
-        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
-            attr |= FS_NOCOW_FL;
-            ioctl(fd, FS_IOC_SETFLAGS, &attr);
-        }
-#endif
+    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                    NULL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
    }

    /* We need enough blocks to store the given disk size,
       so always round up. */
-    blocks = (bytes + block_size - 1) / block_size;
+    blocks = DIV_ROUND_UP(bytes, block_size);

    bmap_size = blocks * sizeof(uint32_t);
-    bmap_size = ((bmap_size + SECTOR_SIZE - 1) & ~(SECTOR_SIZE -1));
+    bmap_size = ROUND_UP(bmap_size, SECTOR_SIZE);

    memset(&header, 0, sizeof(header));
    pstrcpy(header.text, sizeof(header.text), VDI_TEXT);
@@ -769,13 +774,20 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    vdi_header_print(&header);
 #endif
    vdi_header_to_le(&header);
-    if (write(fd, &header, sizeof(header)) < 0) {
-        result = -errno;
-        goto close_and_exit;
+    ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header));
+    if (ret < 0) {
+        error_setg(errp, "Error writing header to %s", filename);
+        goto exit;
    }
+    offset += sizeof(header);

    if (bmap_size > 0) {
-        uint32_t *bmap = g_malloc0(bmap_size);
+        bmap = g_try_malloc0(bmap_size);
+        if (bmap == NULL) {
+            ret = -ENOMEM;
+            error_setg(errp, "Could not allocate bmap");
+            goto exit;
+        }
        for (i = 0; i < blocks; i++) {
            if (image_type == VDI_TYPE_STATIC) {
                bmap[i] = i;
@@ -783,35 +795,33 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
                bmap[i] = VDI_UNALLOCATED;
            }
        }
-        if (write(fd, bmap, bmap_size) < 0) {
-            result = -errno;
-            g_free(bmap);
-            goto close_and_exit;
+        ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size);
+        if (ret < 0) {
+            error_setg(errp, "Error writing bmap to %s", filename);
+            goto exit;
        }
-        g_free(bmap);
+        offset += bmap_size;
    }

    if (image_type == VDI_TYPE_STATIC) {
-        if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) {
-            result = -errno;
-            goto close_and_exit;
+        ret = bdrv_truncate(bs, offset + blocks * block_size);
+        if (ret < 0) {
+            error_setg(errp, "Failed to statically allocate %s", filename);
+            goto exit;
        }
    }

-close_and_exit:
-    if ((close(fd) < 0) && !result) {
-        result = -errno;
-    }
-
 exit:
-    return result;
+    bdrv_unref(bs);
+    g_free(bmap);
+    return ret;
 }

 static void vdi_close(BlockDriverState *bs)
 {
    BDRVVdiState *s = bs->opaque;

-    g_free(s->bmap);
+    qemu_vfree(s->bmap);

    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
@@ -842,11 +852,6 @@ static QemuOptsList vdi_create_opts = {
            .def_value_str = "off"
        },
 #endif
-        {
-            .name = BLOCK_OPT_NOCOW,
-            .type = QEMU_OPT_BOOL,
-            .help = "Turn off copy-on-write (valid only on btrfs)"
-        },
        /* TODO: An additional option to set UUID values might be useful. */
        { /* end of list */ }
    }
--- a/block/vhdx-endian.c
+++ b/block/vhdx-endian.c
@@ -82,8 +82,6 @@ void vhdx_log_desc_le_import(VHDXLogDescriptor *d)
    assert(d != NULL);

    le32_to_cpus(&d->signature);
-    le32_to_cpus(&d->trailing_bytes);
-    le64_to_cpus(&d->leading_bytes);
    le64_to_cpus(&d->file_offset);
    le64_to_cpus(&d->sequence_number);
 }
@@ -99,6 +97,15 @@ void vhdx_log_desc_le_export(VHDXLogDescriptor *d)
    cpu_to_le64s(&d->sequence_number);
 }

+void vhdx_log_data_le_import(VHDXLogDataSector *d)
+{
+    assert(d != NULL);
+
+    le32_to_cpus(&d->data_signature);
+    le32_to_cpus(&d->sequence_high);
+    le32_to_cpus(&d->sequence_low);
+}
+
 void vhdx_log_data_le_export(VHDXLogDataSector *d)
 {
    assert(d != NULL);
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -84,6 +84,7 @@ static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
    if (ret < 0) {
        goto exit;
    }
+    vhdx_log_entry_hdr_le_import(hdr);

 exit:
    return ret;
@@ -211,7 +212,7 @@ static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr,
 {
    int valid = false;

-    if (memcmp(&hdr->signature, "loge", 4)) {
+    if (hdr->signature != VHDX_LOG_SIGNATURE) {
        goto exit;
    }

@@ -275,12 +276,12 @@ static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc,
        goto exit;
    }

-    if (!memcmp(&desc->signature, "zero", 4)) {
+    if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
        if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) {
            /* valid */
            ret = true;
        }
-    } else if (!memcmp(&desc->signature, "desc", 4)) {
+    } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
            /* valid */
            ret = true;
    }
@@ -327,13 +328,15 @@ static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
 * passed into this function. Each descriptor will also be validated,
 * and error returned if any are invalid. */
 static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
-                              VHDXLogEntries *log, VHDXLogDescEntries **buffer)
+                              VHDXLogEntries *log, VHDXLogDescEntries **buffer,
+                              bool convert_endian)
 {
    int ret = 0;
    uint32_t desc_sectors;
    uint32_t sectors_read;
    VHDXLogEntryHeader hdr;
    VHDXLogDescEntries *desc_entries = NULL;
+    VHDXLogDescriptor desc;
    int i;

    assert(*buffer == NULL);
@@ -342,14 +345,19 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
    if (ret < 0) {
        goto exit;
    }
-    vhdx_log_entry_hdr_le_import(&hdr);
+
    if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
        ret = -EINVAL;
        goto exit;
    }

    desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
-    desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE);
+    desc_entries = qemu_try_blockalign(bs->file,
+                                       desc_sectors * VHDX_LOG_SECTOR_SIZE);
+    if (desc_entries == NULL) {
+        ret = -ENOMEM;
+        goto exit;
+    }

    ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries,
                                desc_sectors, false);
@@ -363,12 +371,19 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,

    /* put in proper endianness, and validate each desc */
    for (i = 0; i < hdr.descriptor_count; i++) {
-        vhdx_log_desc_le_import(&desc_entries->desc[i]);
-        if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) {
+        desc = desc_entries->desc[i];
+        vhdx_log_desc_le_import(&desc);
+        if (convert_endian) {
+            desc_entries->desc[i] = desc;
+        }
+        if (vhdx_log_desc_is_valid(&desc, &hdr) == false) {
            ret = -EINVAL;
            goto free_and_exit;
        }
    }
+    if (convert_endian) {
+        desc_entries->hdr = hdr;
+    }

    *buffer = desc_entries;
    goto exit;
@@ -403,7 +418,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,

    buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);

-    if (!memcmp(&desc->signature, "desc", 4)) {
+    if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
        /* data sector */
        if (data == NULL) {
            ret = -EFAULT;
@@ -431,10 +446,15 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,

        memcpy(buffer+offset, &desc->trailing_bytes, 4);

-    } else if (!memcmp(&desc->signature, "zero", 4)) {
+    } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
        /* write 'count' sectors of sector */
        memset(buffer, 0, VHDX_LOG_SECTOR_SIZE);
        count = desc->zero_length / VHDX_LOG_SECTOR_SIZE;
+    } else {
+        error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32,
+                      desc->signature);
+        ret = -EINVAL;
+        goto exit;
    }

    file_offset = desc->file_offset;
@@ -493,13 +513,13 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
            goto exit;
        }

-        ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries);
+        ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true);
        if (ret < 0) {
            goto exit;
        }

        for (i = 0; i < desc_entries->hdr.descriptor_count; i++) {
-            if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) {
+            if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) {
                /* data sector, so read a sector to flush */
                ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read,
                                            data, 1, false);
@@ -510,6 +530,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
                    ret = -EINVAL;
                    goto exit;
                }
+                vhdx_log_data_le_import(data);
            }

            ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data);
@@ -558,9 +579,6 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
        goto inc_and_exit;
    }

-    vhdx_log_entry_hdr_le_import(&hdr);
-
-
    if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
        goto inc_and_exit;
    }
@@ -573,13 +591,13 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,

    desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);

-    /* Read desc sectors, and calculate log checksum */
+    /* Read all log sectors, and calculate log checksum */

    total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE;


    /* read_desc() will increment the read idx */
-    ret = vhdx_log_read_desc(bs, s, log, &desc_buffer);
+    ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false);
    if (ret < 0) {
        goto free_and_exit;
    }
@@ -602,7 +620,7 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
        }
    }
    crc ^= 0xffffffff;
-    if (crc != desc_buffer->hdr.checksum) {
+    if (crc != hdr.checksum) {
        goto free_and_exit;
    }

@@ -905,7 +923,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
    buffer = qemu_blockalign(bs, total_length);
    memcpy(buffer, &new_hdr, sizeof(new_hdr));

-    new_desc = (VHDXLogDescriptor *) (buffer + sizeof(new_hdr));
+    new_desc = buffer + sizeof(new_hdr);
    data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE);
    data_tmp = data;

@@ -962,7 +980,6 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
     * last data sector */
    vhdx_update_checksum(buffer, total_length,
                         offsetof(VHDXLogEntryHeader, checksum));
-    cpu_to_le32s((uint32_t *)(buffer + 4));

    /* now write to the log */
    ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer,
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -99,7 +99,8 @@ static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d,
 /* Each parent type must have a valid GUID; this is for parent images
 * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
 * need to make up our own QCOW2 GUID type */
-static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7,
+static const MSGUID parent_vhdx_guid __attribute__((unused))
+                                     = { .data1 = 0xb04aefb7,
                                         .data2 = 0xd19e,
                                         .data3 = 0x4a81,
                                         .data4 = { 0xb7, 0x89, 0x25, 0xb8,
@@ -135,10 +136,8 @@ typedef struct VHDXSectorInfo {
 * buf: buffer pointer
 * size: size of buffer (must be > crc_offset+4)
 *
- * Note: The resulting checksum is in the CPU endianness, not necessarily
- *       in the file format endianness (LE).  Any header export to disk should
- *       make sure that vhdx_header_le_export() is used to convert to the
- *       correct endianness
+ * Note: The buffer should have all multi-byte data in little-endian format,
+ *       and the resulting checksum is in little endian format.
 */
 uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset)
 {
@@ -149,6 +148,7 @@ uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset)

    memset(buf + crc_offset, 0, sizeof(crc));
    crc =  crc32c(0xffffffff, buf, size);
+    cpu_to_le32s(&crc);
    memcpy(buf + crc_offset, &crc, sizeof(crc));

    return crc;
@@ -300,7 +300,7 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr,
 {
    uint8_t *buffer = NULL;
    int ret;
-    VHDXHeader header_le;
+    VHDXHeader *header_le;

    assert(bs_file != NULL);
    assert(hdr != NULL);
@@ -321,11 +321,12 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr,
    }

    /* overwrite the actual VHDXHeader portion */
-    memcpy(buffer, hdr, sizeof(VHDXHeader));
-    hdr->checksum = vhdx_update_checksum(buffer, VHDX_HEADER_SIZE,
-                                         offsetof(VHDXHeader, checksum));
-    vhdx_header_le_export(hdr, &header_le);
-    ret = bdrv_pwrite_sync(bs_file, offset, &header_le, sizeof(VHDXHeader));
+    header_le = (VHDXHeader *)buffer;
+    memcpy(header_le, hdr, sizeof(VHDXHeader));
+    vhdx_header_le_export(hdr, header_le);
+    vhdx_update_checksum(buffer, VHDX_HEADER_SIZE,
+                         offsetof(VHDXHeader, checksum));
+    ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader));

 exit:
    qemu_vfree(buffer);
@@ -432,13 +433,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
    }
    /* copy over just the relevant portion that we need */
    memcpy(header1, buffer, sizeof(VHDXHeader));
-    vhdx_header_le_import(header1);

-    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
-        !memcmp(&header1->signature, "head", 4)             &&
-        header1->version == 1) {
-        h1_seq = header1->sequence_number;
-        h1_valid = true;
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) {
+        vhdx_header_le_import(header1);
+        if (header1->signature == VHDX_HEADER_SIGNATURE &&
+            header1->version == 1) {
+            h1_seq = header1->sequence_number;
+            h1_valid = true;
+        }
    }

    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
@@ -447,13 +449,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
    }
    /* copy over just the relevant portion that we need */
    memcpy(header2, buffer, sizeof(VHDXHeader));
-    vhdx_header_le_import(header2);

-    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
-        !memcmp(&header2->signature, "head", 4)             &&
-        header2->version == 1) {
-        h2_seq = header2->sequence_number;
-        h2_valid = true;
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) {
+        vhdx_header_le_import(header2);
+        if (header2->signature == VHDX_HEADER_SIGNATURE &&
+            header2->version == 1) {
+            h2_seq = header2->sequence_number;
+            h2_valid = true;
+        }
    }

    /* If there is only 1 valid header (or no valid headers), we
@@ -519,15 +522,21 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
        goto fail;
    }
    memcpy(&s->rt, buffer, sizeof(s->rt));
-    vhdx_region_header_le_import(&s->rt);
    offset += sizeof(s->rt);

-    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
-        memcmp(&s->rt.signature, "regi", 4)) {
+    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4)) {
        ret = -EINVAL;
        goto fail;
    }

+    vhdx_region_header_le_import(&s->rt);
+
+    if (s->rt.signature != VHDX_REGION_SIGNATURE) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+
    /* Per spec, maximum region table entry count is 2047 */
    if (s->rt.entry_count > 2047) {
        ret = -EINVAL;
@@ -630,7 +639,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)

    vhdx_metadata_header_le_import(&s->metadata_hdr);

-    if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) {
+    if (s->metadata_hdr.signature != VHDX_METADATA_SIGNATURE) {
        ret = -EINVAL;
        goto exit;
    }
@@ -950,7 +959,11 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* s->bat is freed in vhdx_close() */
-    s->bat = qemu_blockalign(bs, s->bat_rt.length);
+    s->bat = qemu_try_blockalign(bs->file, s->bat_rt.length);
+    if (s->bat == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }

    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
    if (ret < 0) {
@@ -991,7 +1004,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when VHDX images are used */
    error_set(&s->migration_blocker,
            QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-            "vhdx", bs->device_name, "live migration");
+            "vhdx", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);

    return 0;
@@ -1096,8 +1109,9 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
            /* check the payload block state */
            switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) {
            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
-            case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
-            case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
+            case PAYLOAD_BLOCK_UNDEFINED:
+            case PAYLOAD_BLOCK_UNMAPPED:
+            case PAYLOAD_BLOCK_UNMAPPED_v095:
            case PAYLOAD_BLOCK_ZERO:
                /* return zero */
                qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail);
@@ -1264,11 +1278,11 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
                        sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS;
                    }
                }
-
                /* fall through */
            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
-            case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
-            case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
+            case PAYLOAD_BLOCK_UNMAPPED:
+            case PAYLOAD_BLOCK_UNMAPPED_v095:
+            case PAYLOAD_BLOCK_UNDEFINED:
                bat_prior_offset = sinfo.file_offset;
                ret = vhdx_allocate_block(bs, s, &sinfo.file_offset);
                if (ret < 0) {
@@ -1369,7 +1383,7 @@ static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size,
    int ret = 0;
    VHDXHeader *hdr = NULL;

-    hdr = g_malloc0(sizeof(VHDXHeader));
+    hdr = g_new0(VHDXHeader, 1);

    hdr->signature       = VHDX_HEADER_SIGNATURE;
    hdr->sequence_number = g_random_int();
@@ -1395,6 +1409,12 @@ exit:
    return ret;
 }

+#define VHDX_METADATA_ENTRY_BUFFER_SIZE \
+                                    (sizeof(VHDXFileParameters)               +\
+                                     sizeof(VHDXVirtualDiskSize)              +\
+                                     sizeof(VHDXPage83Data)                   +\
+                                     sizeof(VHDXVirtualDiskLogicalSectorSize) +\
+                                     sizeof(VHDXVirtualDiskPhysicalSectorSize))

 /*
 * Create the Metadata entries.
@@ -1433,11 +1453,7 @@ static int vhdx_create_new_metadata(BlockDriverState *bs,
    VHDXVirtualDiskLogicalSectorSize  *mt_log_sector_size;
    VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size;

-    entry_buffer = g_malloc0(sizeof(VHDXFileParameters)               +
-                             sizeof(VHDXVirtualDiskSize)              +
-                             sizeof(VHDXPage83Data)                   +
-                             sizeof(VHDXVirtualDiskLogicalSectorSize) +
-                             sizeof(VHDXVirtualDiskPhysicalSectorSize));
+    entry_buffer = g_malloc0(VHDX_METADATA_ENTRY_BUFFER_SIZE);

    mt_file_params = entry_buffer;
    offset += sizeof(VHDXFileParameters);
@@ -1518,7 +1534,7 @@ static int vhdx_create_new_metadata(BlockDriverState *bs,
    }

    ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer,
-                      VHDX_HEADER_BLOCK_SIZE);
+                      VHDX_METADATA_ENTRY_BUFFER_SIZE);
    if (ret < 0) {
        goto exit;
    }
@@ -1540,7 +1556,8 @@ exit:
 */
 static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
                           uint64_t image_size, VHDXImageType type,
-                           bool use_zero_blocks, VHDXRegionTableEntry *rt_bat)
+                           bool use_zero_blocks, uint64_t file_offset,
+                           uint32_t length)
 {
    int ret = 0;
    uint64_t data_file_offset;
@@ -1555,7 +1572,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
    /* this gives a data start after BAT/bitmap entries, and well
     * past any metadata entries (with a 4 MB buffer for future
     * expansion */
-    data_file_offset = rt_bat->file_offset + rt_bat->length + 5 * MiB;
+    data_file_offset = file_offset + length + 5 * MiB;
    total_sectors = image_size >> s->logical_sector_size_bits;

    if (type == VHDX_TYPE_DYNAMIC) {
@@ -1579,7 +1596,11 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
                use_zero_blocks ||
                bdrv_has_zero_init(bs) == 0) {
        /* for a fixed file, the default BAT entry is not zero */
-        s->bat = g_malloc0(rt_bat->length);
+        s->bat = g_try_malloc0(length);
+        if (length && s->bat == NULL) {
+            ret = -ENOMEM;
+            goto exit;
+        }
        block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT :
                                                PAYLOAD_BLOCK_NOT_PRESENT;
        block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state;
@@ -1594,7 +1615,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
            cpu_to_le64s(&s->bat[sinfo.bat_idx]);
            sector_num += s->sectors_per_block;
        }
-        ret = bdrv_pwrite(bs, rt_bat->file_offset, s->bat, rt_bat->length);
+        ret = bdrv_pwrite(bs, file_offset, s->bat, length);
        if (ret < 0) {
            goto exit;
        }
@@ -1626,6 +1647,8 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,
    int ret = 0;
    uint32_t offset = 0;
    void *buffer = NULL;
+    uint64_t bat_file_offset;
+    uint32_t bat_length;
    BDRVVHDXState *s = NULL;
    VHDXRegionTableHeader *region_table;
    VHDXRegionTableEntry *rt_bat;
@@ -1635,7 +1658,7 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,

    /* Populate enough of the BDRVVHDXState to be able to use the
     * pre-existing BAT calculation, translation, and update functions */
-    s = g_malloc0(sizeof(BDRVVHDXState));
+    s = g_new0(BDRVVHDXState, 1);

    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
                     (uint64_t) sector_size / (uint64_t) block_size;
@@ -1674,19 +1697,26 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,
    rt_metadata->length      = 1 * MiB; /* min size, and more than enough */
    *metadata_offset = rt_metadata->file_offset;

+    bat_file_offset = rt_bat->file_offset;
+    bat_length = rt_bat->length;
+
+    vhdx_region_header_le_export(region_table);
+    vhdx_region_entry_le_export(rt_bat);
+    vhdx_region_entry_le_export(rt_metadata);
+
    vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE,
                         offsetof(VHDXRegionTableHeader, checksum));


    /* The region table gives us the data we need to create the BAT,
     * so do that now */
-    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, rt_bat);
+    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks,
+                          bat_file_offset, bat_length);
+    if (ret < 0) {
+        goto exit;
+    }

    /* Now write out the region headers to disk */
-    vhdx_region_header_le_export(region_table);
-    vhdx_region_entry_le_export(rt_bat);
-    vhdx_region_entry_le_export(rt_metadata);
-
    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer,
                      VHDX_HEADER_BLOCK_SIZE);
    if (ret < 0) {
@@ -1699,7 +1729,6 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,
        goto exit;
    }

-
 exit:
    g_free(s);
    g_free(buffer);
@@ -1740,11 +1769,12 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    VHDXImageType image_type;
    Error *local_err = NULL;

-    image_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    log_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_LOG_SIZE, 0);
    block_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_BLOCK_SIZE, 0);
    type = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
-    use_zero_blocks = qemu_opt_get_bool_del(opts, VHDX_BLOCK_OPT_ZERO, false);
+    use_zero_blocks = qemu_opt_get_bool_del(opts, VHDX_BLOCK_OPT_ZERO, true);

    if (image_size > VHDX_MAX_IMAGE_SIZE) {
        error_setg_errno(errp, EINVAL, "Image size too large; max of 64TB");
@@ -1849,7 +1879,6 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    }


-
 delete_and_exit:
    bdrv_unref(bs);
 exit:
@@ -1907,7 +1936,9 @@ static QemuOptsList vhdx_create_opts = {
       {
           .name = VHDX_BLOCK_OPT_ZERO,
           .type = QEMU_OPT_BOOL,
-           .help = "Force use of payload blocks of type 'ZERO'.  Non-standard."
+           .help = "Force use of payload blocks of type 'ZERO'. "\
+                   "Non-standard, but default.  Do not set to 'off' when "\
+                   "using 'qemu-img convert' with subformat=dynamic."
       },
       { NULL }
    }
@@ -1925,6 +1956,7 @@ static BlockDriver bdrv_vhdx = {
    .bdrv_create            = vhdx_create,
    .bdrv_get_info          = vhdx_get_info,
    .bdrv_check             = vhdx_check,
+    .bdrv_has_zero_init     = bdrv_has_zero_init_1,

    .create_opts            = &vhdx_create_opts,
 };
--- a/block/vhdx.h
+++ b/block/vhdx.h
@@ -226,7 +226,8 @@ typedef struct QEMU_PACKED VHDXLogDataSector {
 #define PAYLOAD_BLOCK_NOT_PRESENT       0
 #define PAYLOAD_BLOCK_UNDEFINED         1
 #define PAYLOAD_BLOCK_ZERO              2
-#define PAYLOAD_BLOCK_UNMAPPED          5
+#define PAYLOAD_BLOCK_UNMAPPED          3
+#define PAYLOAD_BLOCK_UNMAPPED_v095     5
 #define PAYLOAD_BLOCK_FULLY_PRESENT     6
 #define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7

@@ -435,6 +436,7 @@ void vhdx_header_le_import(VHDXHeader *h);
 void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h);
 void vhdx_log_desc_le_import(VHDXLogDescriptor *d);
 void vhdx_log_desc_le_export(VHDXLogDescriptor *d);
+void vhdx_log_data_le_import(VHDXLogDataSector *d);
 void vhdx_log_data_le_export(VHDXLogDataSector *d);
 void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr);
 void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr);
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -28,6 +28,7 @@
 #include "qemu/module.h"
 #include "migration/migration.h"
 #include <zlib.h>
+#include <glib.h>

 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
 #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
@@ -106,6 +107,7 @@ typedef struct VmdkExtent {
    uint32_t l2_cache_counts[L2_CACHE_SIZE];

    int64_t cluster_sectors;
+    int64_t next_cluster_sector;
    char *type;
 } VmdkExtent;

@@ -124,7 +126,6 @@ typedef struct BDRVVmdkState {
 } BDRVVmdkState;

 typedef struct VmdkMetaData {
-    uint32_t offset;
    unsigned int l1_index;
    unsigned int l2_index;
    unsigned int l2_offset;
@@ -233,7 +234,7 @@ static void vmdk_free_last_extent(BlockDriverState *bs)
        return;
    }
    s->num_extents--;
-    s->extents = g_realloc(s->extents, s->num_extents * sizeof(VmdkExtent));
+    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents);
 }

 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
@@ -397,6 +398,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
 {
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;
+    int64_t nb_sectors;

    if (cluster_sectors > 0x200000) {
        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
@@ -412,8 +414,12 @@ static int vmdk_add_extent(BlockDriverState *bs,
        return -EFBIG;
    }

-    s->extents = g_realloc(s->extents,
-                              (s->num_extents + 1) * sizeof(VmdkExtent));
+    nb_sectors = bdrv_nb_sectors(file);
+    if (nb_sectors < 0) {
+        return nb_sectors;
+    }
+
+    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1);
    extent = &s->extents[s->num_extents];
    s->num_extents++;

@@ -427,6 +433,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
    extent->l1_entry_sectors = l2_size * cluster_sectors;
    extent->l2_size = l2_size;
    extent->cluster_sectors = flat ? sectors : cluster_sectors;
+    extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);

    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
@@ -448,7 +455,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,

    /* read the L1 table */
    l1_size = extent->l1_size * sizeof(uint32_t);
-    extent->l1_table = g_malloc(l1_size);
+    extent->l1_table = g_try_malloc(l1_size);
+    if (l1_size && extent->l1_table == NULL) {
+        return -ENOMEM;
+    }
+
    ret = bdrv_pread(extent->file,
                     extent->l1_table_offset,
                     extent->l1_table,
@@ -464,7 +475,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
    }

    if (extent->l1_backup_table_offset) {
-        extent->l1_backup_table = g_malloc(l1_size);
+        extent->l1_backup_table = g_try_malloc(l1_size);
+        if (l1_size && extent->l1_backup_table == NULL) {
+            ret = -ENOMEM;
+            goto fail_l1;
+        }
        ret = bdrv_pread(extent->file,
                         extent->l1_backup_table_offset,
                         extent->l1_backup_table,
@@ -481,7 +496,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
    }

    extent->l2_cache =
-        g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
+        g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE);
    return 0;
 fail_l1b:
    g_free(extent->l1_backup_table);
@@ -542,8 +557,16 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
        return NULL;
    }

-    size = MIN(size, 1 << 20);  /* avoid unbounded allocation */
-    buf = g_malloc0(size + 1);
+    if (size < 4) {
+        /* Both descriptor file and sparse image must be much larger than 4
+         * bytes, also callers of vmdk_read_desc want to compare the first 4
+         * bytes with VMDK4_MAGIC, let's error out if less is read. */
+        error_setg(errp, "File is too small, not a valid image");
+        return NULL;
+    }
+
+    size = MIN(size, (1 << 20) - 1);  /* avoid unbounded allocation */
+    buf = g_malloc(size + 1);

    ret = bdrv_pread(file, desc_offset, buf, size);
    if (ret < 0) {
@@ -551,6 +574,7 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
        g_free(buf);
        return NULL;
    }
+    buf[ret] = 0;

    return buf;
 }
@@ -621,6 +645,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
            bs->file->total_sectors * 512 - 1536,
            &footer, sizeof(footer));
        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to read footer");
            return ret;
        }

@@ -632,6 +657,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
            le32_to_cpu(footer.eos_marker.size) != 0  ||
            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
        {
+            error_setg(errp, "Invalid footer");
            return -EINVAL;
        }

@@ -643,7 +669,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
        snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
                 le32_to_cpu(header.version));
        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                  bs->device_name, "vmdk", buf);
+                  bdrv_get_device_name(bs), "vmdk", buf);
        return -ENOTSUP;
    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
        /* VMware KB 2064959 explains that version 3 added support for
@@ -662,6 +688,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
                        * le64_to_cpu(header.granularity);
    if (l1_entry_sectors == 0) {
+        error_setg(errp, "L1 entry size is invalid");
        return -EINVAL;
    }
    l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
@@ -669,8 +696,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
    }
-    if (bdrv_getlength(file) <
-            le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) {
+    if (bdrv_nb_sectors(file) < le64_to_cpu(header.grain_offset)) {
        error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
                   (int64_t)(le64_to_cpu(header.grain_offset)
                             * BDRV_SECTOR_SIZE));
@@ -771,10 +797,12 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
    VmdkExtent *extent;

    while (*p) {
-        /* parse extent line:
+        /* parse extent line in one of below formats:
+         *
         * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
-         * or
         * RW [size in sectors] SPARSE "file-name.vmdk"
+         * RW [size in sectors] VMFS "file-name.vmdk"
+         * RW [size in sectors] VMFSSPARSE "file-name.vmdk"
         */
        flat_offset = -1;
        ret = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
@@ -805,6 +833,14 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
            goto next_line;
        }

+        if (!path_is_absolute(fname) && !path_has_protocol(fname) &&
+            !desc_file_path[0])
+        {
+            error_setg(errp, "Cannot use relative extent paths with VMDK "
+                       "descriptor file '%s'", bs->file->filename);
+            return -EINVAL;
+        }
+
        path_combine(extent_path, sizeof(extent_path),
                desc_file_path, fname);
        extent_file = NULL;
@@ -821,6 +857,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
            ret = vmdk_add_extent(bs, extent_file, true, sectors,
                            0, 0, 0, 0, 0, &extent, errp);
            if (ret < 0) {
+                bdrv_unref(extent_file);
                return ret;
            }
            extent->flat_start_offset = flat_offset << 9;
@@ -832,14 +869,15 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
            } else {
                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp);
            }
+            g_free(buf);
            if (ret) {
-                g_free(buf);
                bdrv_unref(extent_file);
                return ret;
            }
            extent = &s->extents[s->num_extents - 1];
        } else {
            error_setg(errp, "Unsupported extent type '%s'", type);
+            bdrv_unref(extent_file);
            return -ENOTSUP;
        }
        extent->type = g_strdup(type);
@@ -879,7 +917,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
    }
    s->create_type = g_strdup(ct);
    s->desc_offset = 0;
-    ret = vmdk_parse_extents(buf, bs, bs->file->filename, errp);
+    ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, errp);
 exit:
    return ret;
 }
@@ -887,7 +925,7 @@ exit:
 static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
-    char *buf = NULL;
+    char *buf;
    int ret;
    BDRVVmdkState *s = bs->opaque;
    uint32_t magic;
@@ -924,7 +962,7 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when VMDK images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vmdk", bs->device_name, "live migration");
+              "vmdk", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);
    g_free(buf);
    return 0;
@@ -952,57 +990,97 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
    }
 }

+/**
+ * get_whole_cluster
+ *
+ * Copy backing file's cluster that covers @sector_num, otherwise write zero,
+ * to the cluster at @cluster_sector_num.
+ *
+ * If @skip_start_sector < @skip_end_sector, the relative range
+ * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
+ * it for call to write user data in the request.
+ */
 static int get_whole_cluster(BlockDriverState *bs,
-                VmdkExtent *extent,
-                uint64_t cluster_offset,
-                uint64_t offset,
-                bool allocate)
+                             VmdkExtent *extent,
+                             uint64_t cluster_sector_num,
+                             uint64_t sector_num,
+                             uint64_t skip_start_sector,
+                             uint64_t skip_end_sector)
 {
    int ret = VMDK_OK;
-    uint8_t *whole_grain = NULL;
+    int64_t cluster_bytes;
+    uint8_t *whole_grain;

+    /* For COW, align request sector_num to cluster start */
+    sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
+    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
+    whole_grain = qemu_blockalign(bs, cluster_bytes);
+
+    if (!bs->backing_hd) {
+        memset(whole_grain, 0,  skip_start_sector << BDRV_SECTOR_BITS);
+        memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
+               cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
+    }
+
+    assert(skip_end_sector <= extent->cluster_sectors);
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
-    if (bs->backing_hd) {
-        whole_grain =
-            qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
-        if (!vmdk_is_cid_valid(bs)) {
-            ret = VMDK_ERROR;
-            goto exit;
-        }
+    if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
+        ret = VMDK_ERROR;
+        goto exit;
+    }

-        /* floor offset to cluster */
-        offset -= offset % (extent->cluster_sectors * 512);
-        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
-                extent->cluster_sectors);
-        if (ret < 0) {
-            ret = VMDK_ERROR;
-            goto exit;
+    /* Read backing data before skip range */
+    if (skip_start_sector > 0) {
+        if (bs->backing_hd) {
+            ret = bdrv_read(bs->backing_hd, sector_num,
+                            whole_grain, skip_start_sector);
+            if (ret < 0) {
+                ret = VMDK_ERROR;
+                goto exit;
+            }
        }
-
-        /* Write grain only into the active image */
-        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
-                extent->cluster_sectors);
+        ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
+                         skip_start_sector);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }
+    /* Read backing data after skip range */
+    if (skip_end_sector < extent->cluster_sectors) {
+        if (bs->backing_hd) {
+            ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector,
+                            whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
+                            extent->cluster_sectors - skip_end_sector);
+            if (ret < 0) {
+                ret = VMDK_ERROR;
+                goto exit;
+            }
+        }
+        ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector,
+                         whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
+                         extent->cluster_sectors - skip_end_sector);
+        if (ret < 0) {
+            ret = VMDK_ERROR;
+            goto exit;
+        }
+    }
+
 exit:
    qemu_vfree(whole_grain);
    return ret;
 }

-static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
+static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
+                         uint32_t offset)
 {
-    uint32_t offset;
-    QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
-    offset = cpu_to_le32(m_data->offset);
+    offset = cpu_to_le32(offset);
    /* update L2 table */
    if (bdrv_pwrite_sync(
                extent->file,
                ((int64_t)m_data->l2_offset * 512)
-                    + (m_data->l2_index * sizeof(m_data->offset)),
+                    + (m_data->l2_index * sizeof(offset)),
                &offset, sizeof(offset)) < 0) {
        return VMDK_ERROR;
    }
@@ -1012,7 +1090,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
        if (bdrv_pwrite_sync(
                    extent->file,
                    ((int64_t)m_data->l2_offset * 512)
-                        + (m_data->l2_index * sizeof(m_data->offset)),
+                        + (m_data->l2_index * sizeof(offset)),
                    &offset, sizeof(offset)) < 0) {
            return VMDK_ERROR;
        }
@@ -1024,17 +1102,41 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
    return VMDK_OK;
 }

+/**
+ * get_cluster_offset
+ *
+ * Look up cluster offset in extent file by sector number, and store in
+ * @cluster_offset.
+ *
+ * For flat extents, the start offset as parsed from the description file is
+ * returned.
+ *
+ * For sparse extents, look up in L1, L2 table. If allocate is true, return an
+ * offset for a new cluster and update L2 cache. If there is a backing file,
+ * COW is done before returning; otherwise, zeroes are written to the allocated
+ * cluster. Both COW and zero writing skips the sector range
+ * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
+ * has new data to write there.
+ *
+ * Returns: VMDK_OK if cluster exists and mapped in the image.
+ *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
+ *          VMDK_ERROR if failed.
+ */
 static int get_cluster_offset(BlockDriverState *bs,
-                                    VmdkExtent *extent,
-                                    VmdkMetaData *m_data,
-                                    uint64_t offset,
-                                    int allocate,
-                                    uint64_t *cluster_offset)
+                              VmdkExtent *extent,
+                              VmdkMetaData *m_data,
+                              uint64_t offset,
+                              bool allocate,
+                              uint64_t *cluster_offset,
+                              uint64_t skip_start_sector,
+                              uint64_t skip_end_sector)
 {
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
    uint32_t min_count, *l2_table;
    bool zeroed = false;
+    int64_t ret;
+    int64_t cluster_sector;

    if (m_data) {
        m_data->valid = 0;
@@ -1088,52 +1190,41 @@ static int get_cluster_offset(BlockDriverState *bs,
    extent->l2_cache_counts[min_index] = 1;
 found:
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
-    *cluster_offset = le32_to_cpu(l2_table[l2_index]);
+    cluster_sector = le32_to_cpu(l2_table[l2_index]);

    if (m_data) {
        m_data->valid = 1;
        m_data->l1_index = l1_index;
        m_data->l2_index = l2_index;
-        m_data->offset = *cluster_offset;
        m_data->l2_offset = l2_offset;
        m_data->l2_cache_entry = &l2_table[l2_index];
    }
-    if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
+    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
        zeroed = true;
    }

-    if (!*cluster_offset || zeroed) {
+    if (!cluster_sector || zeroed) {
        if (!allocate) {
            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
        }

-        /* Avoid the L2 tables update for the images that have snapshots. */
-        *cluster_offset = bdrv_getlength(extent->file);
-        if (!extent->compressed) {
-            bdrv_truncate(
-                extent->file,
-                *cluster_offset + (extent->cluster_sectors << 9)
-            );
-        }
-
-        *cluster_offset >>= 9;
-        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
+        cluster_sector = extent->next_cluster_sector;
+        extent->next_cluster_sector += extent->cluster_sectors;

        /* First of all we write grain itself, to avoid race condition
         * that may to corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
-        if (get_whole_cluster(
-                bs, extent, *cluster_offset, offset, allocate) == -1) {
-            return VMDK_ERROR;
-        }
-
-        if (m_data) {
-            m_data->offset = *cluster_offset;
+        ret = get_whole_cluster(bs, extent,
+                                cluster_sector,
+                                offset >> BDRV_SECTOR_BITS,
+                                skip_start_sector, skip_end_sector);
+        if (ret) {
+            return ret;
        }
    }
-    *cluster_offset <<= 9;
+    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
    return VMDK_OK;
 }

@@ -1168,7 +1259,8 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
    }
    qemu_co_mutex_lock(&s->lock);
    ret = get_cluster_offset(bs, extent, NULL,
-                            sector_num * 512, 0, &offset);
+                             sector_num * 512, false, &offset,
+                             0, 0);
    qemu_co_mutex_unlock(&s->lock);

    switch (ret) {
@@ -1321,9 +1413,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
        if (!extent) {
            return -EIO;
        }
-        ret = get_cluster_offset(
-                            bs, extent, NULL,
-                            sector_num << 9, 0, &cluster_offset);
+        ret = get_cluster_offset(bs, extent, NULL,
+                                 sector_num << 9, false, &cluster_offset,
+                                 0, 0);
        extent_begin_sector = extent->end_sector - extent->sectors;
        extent_relative_sector_num = sector_num - extent_begin_sector;
        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
@@ -1404,12 +1496,17 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
        if (!extent) {
            return -EIO;
        }
-        ret = get_cluster_offset(
-                                bs,
-                                extent,
-                                &m_data,
-                                sector_num << 9, !extent->compressed,
-                                &cluster_offset);
+        extent_begin_sector = extent->end_sector - extent->sectors;
+        extent_relative_sector_num = sector_num - extent_begin_sector;
+        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
+        n = extent->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors) {
+            n = nb_sectors;
+        }
+        ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                                 !(extent->compressed || zeroed),
+                                 &cluster_offset,
+                                 index_in_cluster, index_in_cluster + n);
        if (extent->compressed) {
            if (ret == VMDK_OK) {
                /* Refuse write to allocated cluster for streamOptimized */
@@ -1418,24 +1515,13 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                return -EIO;
            } else {
                /* allocate */
-                ret = get_cluster_offset(
-                                        bs,
-                                        extent,
-                                        &m_data,
-                                        sector_num << 9, 1,
-                                        &cluster_offset);
+                ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                                         true, &cluster_offset, 0, 0);
            }
        }
        if (ret == VMDK_ERROR) {
            return -EINVAL;
        }
-        extent_begin_sector = extent->end_sector - extent->sectors;
-        extent_relative_sector_num = sector_num - extent_begin_sector;
-        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
-        n = extent->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors) {
-            n = nb_sectors;
-        }
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
@@ -1443,9 +1529,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                    n >= extent->cluster_sectors) {
                n = extent->cluster_sectors;
                if (!zero_dry_run) {
-                    m_data.offset = VMDK_GTE_ZEROED;
                    /* update L2 tables */
-                    if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
+                            != VMDK_OK) {
                        return -EIO;
                    }
                }
@@ -1461,7 +1547,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
            }
            if (m_data.valid) {
                /* update L2 tables */
-                if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                if (vmdk_L2update(extent, &m_data,
+                                  cluster_offset >> BDRV_SECTOR_BITS)
+                        != VMDK_OK) {
                    return -EIO;
                }
            }
@@ -1473,7 +1561,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
        /* update CID on the first write every time the virtual disk is
         * opened */
        if (!s->cid_updated) {
-            ret = vmdk_write_cid(bs, time(NULL));
+            ret = vmdk_write_cid(bs, g_random_int());
            if (ret < 0) {
                return ret;
            }
@@ -1742,7 +1830,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
        goto exit;
    }
    /* Read out options */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
@@ -1802,8 +1891,19 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
    }
    if (backing_file) {
        BlockDriverState *bs = NULL;
-        ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_NO_BACKING, NULL,
+        char *full_backing = g_new0(char, PATH_MAX);
+        bdrv_get_full_backing_filename_from_filename(filename, backing_file,
+                                                     full_backing, PATH_MAX,
+                                                     &local_err);
+        if (local_err) {
+            g_free(full_backing);
+            error_propagate(errp, local_err);
+            ret = -ENOENT;
+            goto exit;
+        }
+        ret = bdrv_open(&bs, full_backing, NULL, NULL, BDRV_O_NO_BACKING, NULL,
                        errp);
+        g_free(full_backing);
        if (ret != 0) {
            goto exit;
        }
@@ -1856,7 +1956,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
    }
    /* generate descriptor file */
    desc = g_strdup_printf(desc_template,
-                           (uint32_t)time(NULL),
+                           g_random_int(),
                           parent_cid,
                           fmt,
                           parent_desc_line,
@@ -1999,7 +2099,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int64_t sector_num = 0;
-    int64_t total_sectors = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
+    int64_t total_sectors = bdrv_nb_sectors(bs);
    int ret;
    uint64_t cluster_offset;

@@ -2020,7 +2120,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
        }
        ret = get_cluster_offset(bs, extent, NULL,
                                 sector_num << BDRV_SECTOR_BITS,
-                                 0, &cluster_offset);
+                                 false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            fprintf(stderr,
                    "ERROR: could not get cluster_offset for sector %"
@@ -2071,23 +2171,29 @@ static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
    return spec_info;
 }

+static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b)
+{
+    return a->flat == b->flat &&
+           a->compressed == b->compressed &&
+           (a->flat || a->cluster_sectors == b->cluster_sectors);
+}
+
 static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
    int i;
    BDRVVmdkState *s = bs->opaque;
    assert(s->num_extents);
+
+    /* See if we have multiple extents but they have different cases */
+    for (i = 1; i < s->num_extents; i++) {
+        if (!vmdk_extents_type_eq(&s->extents[0], &s->extents[i])) {
+            return -ENOTSUP;
+        }
+    }
    bdi->needs_compressed_writes = s->extents[0].compressed;
    if (!s->extents[0].flat) {
        bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS;
    }
-    /* See if we have multiple extents but they have different cases */
-    for (i = 1; i < s->num_extents; i++) {
-        if (bdi->needs_compressed_writes != s->extents[i].compressed ||
-            (bdi->cluster_size && bdi->cluster_size !=
-                s->extents[i].cluster_sectors << BDRV_SECTOR_BITS)) {
-            return -ENOTSUP;
-        }
-    }
    return 0;
 }

--- a/block/vpc.c
+++ b/block/vpc.c
@@ -29,13 +29,6 @@
 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
 #endif
-#ifdef __linux__
-#include <linux/fs.h>
-#include <sys/ioctl.h>
-#ifndef FS_NOCOW_FL
-#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
-#endif
-#endif

 /**************************************************************/

@@ -214,7 +207,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
            "incorrect.\n", bs->filename);

    /* Write 'checksum' back to footer, or else will leave it with zero. */
-    footer->checksum = be32_to_cpu(checksum);
+    footer->checksum = cpu_to_be32(checksum);

    // The visible size of a image in Virtual PC depends on the geometry
    // rather than on the size stored in the footer (the size in the footer
@@ -276,7 +269,11 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
            goto fail;
        }

-        s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4);
+        s->pagetable = qemu_try_blockalign(bs->file, s->max_table_entries * 4);
+        if (s->pagetable == NULL) {
+            ret = -ENOMEM;
+            goto fail;
+        }

        s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);

@@ -323,7 +320,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when VHD images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vpc", bs->device_name, "live migration");
+              "vpc", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);

    return 0;
@@ -475,7 +472,7 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)

    // Write BAT entry to disk
    bat_offset = s->bat_offset + (4 * index);
-    bat_value = be32_to_cpu(s->pagetable[index]);
+    bat_value = cpu_to_be32(s->pagetable[index]);
    ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
    if (ret < 0)
        goto fail;
@@ -492,7 +489,7 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
    VHDFooter *footer = (VHDFooter *) s->footer_buf;

-    if (cpu_to_be32(footer->type) != VHD_FIXED) {
+    if (be32_to_cpu(footer->type) != VHD_FIXED) {
        bdi->cluster_size = s->block_size;
    }

@@ -509,7 +506,7 @@ static int vpc_read(BlockDriverState *bs, int64_t sector_num,
    int64_t sectors, sectors_per_block;
    VHDFooter *footer = (VHDFooter *) s->footer_buf;

-    if (cpu_to_be32(footer->type) == VHD_FIXED) {
+    if (be32_to_cpu(footer->type) == VHD_FIXED) {
        return bdrv_read(bs->file, sector_num, buf, nb_sectors);
    }
    while (nb_sectors > 0) {
@@ -558,7 +555,7 @@ static int vpc_write(BlockDriverState *bs, int64_t sector_num,
    int ret;
    VHDFooter *footer =  (VHDFooter *) s->footer_buf;

-    if (cpu_to_be32(footer->type) == VHD_FIXED) {
+    if (be32_to_cpu(footer->type) == VHD_FIXED) {
        return bdrv_write(bs->file, sector_num, buf, nb_sectors);
    }
    while (nb_sectors > 0) {
@@ -656,39 +653,41 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
    return 0;
 }

-static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors)
+static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
+                               int64_t total_sectors)
 {
    VHDDynDiskHeader *dyndisk_header =
        (VHDDynDiskHeader *) buf;
    size_t block_size, num_bat_entries;
    int i;
-    int ret = -EIO;
+    int ret;
+    int64_t offset = 0;

    // Write the footer (twice: at the beginning and at the end)
    block_size = 0x200000;
    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);

-    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
+    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    if (ret) {
        goto fail;
    }

-    if (lseek(fd, 1536 + ((num_bat_entries * 4 + 511) & ~511), SEEK_SET) < 0) {
-        goto fail;
-    }
-    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
+    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
+    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    if (ret < 0) {
        goto fail;
    }

    // Write the initial BAT
-    if (lseek(fd, 3 * 512, SEEK_SET) < 0) {
-        goto fail;
-    }
+    offset = 3 * 512;

    memset(buf, 0xFF, 512);
    for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
-        if (write(fd, buf, 512) != 512) {
+        ret = bdrv_pwrite_sync(bs, offset, buf, 512);
+        if (ret < 0) {
            goto fail;
        }
+        offset += 512;
    }

    // Prepare the Dynamic Disk Header
@@ -700,48 +699,44 @@ static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors)
     * Note: The spec is actually wrong here for data_offset, it says
     * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
     */
-    dyndisk_header->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL);
-    dyndisk_header->table_offset = be64_to_cpu(3 * 512);
-    dyndisk_header->version = be32_to_cpu(0x00010000);
-    dyndisk_header->block_size = be32_to_cpu(block_size);
-    dyndisk_header->max_table_entries = be32_to_cpu(num_bat_entries);
+    dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
+    dyndisk_header->table_offset = cpu_to_be64(3 * 512);
+    dyndisk_header->version = cpu_to_be32(0x00010000);
+    dyndisk_header->block_size = cpu_to_be32(block_size);
+    dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);

-    dyndisk_header->checksum = be32_to_cpu(vpc_checksum(buf, 1024));
+    dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));

    // Write the header
-    if (lseek(fd, 512, SEEK_SET) < 0) {
-        goto fail;
-    }
+    offset = 512;

-    if (write(fd, buf, 1024) != 1024) {
+    ret = bdrv_pwrite_sync(bs, offset, buf, 1024);
+    if (ret < 0) {
        goto fail;
    }
-    ret = 0;

 fail:
    return ret;
 }

-static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size)
+static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf,
+                             int64_t total_size)
 {
-    int ret = -EIO;
+    int ret;

    /* Add footer to total size */
-    total_size += 512;
-    if (ftruncate(fd, total_size) != 0) {
-        ret = -errno;
-        goto fail;
-    }
-    if (lseek(fd, -512, SEEK_END) < 0) {
-        goto fail;
-    }
-    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
-        goto fail;
+    total_size += HEADER_SIZE;
+
+    ret = bdrv_truncate(bs, total_size);
+    if (ret < 0) {
+        return ret;
    }

-    ret = 0;
+    ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE);
+    if (ret < 0) {
+        return ret;
+    }

- fail:
    return ret;
 }

@@ -750,7 +745,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    uint8_t buf[1024];
    VHDFooter *footer = (VHDFooter *) buf;
    char *disk_type_param;
-    int fd, i;
+    int i;
    uint16_t cyls = 0;
    uint8_t heads = 0;
    uint8_t secs_per_cyl = 0;
@@ -758,10 +753,12 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    int64_t total_size;
    int disk_type;
    int ret = -EIO;
-    bool nocow = false;
+    Error *local_err = NULL;
+    BlockDriverState *bs = NULL;

    /* Read out options */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    if (disk_type_param) {
        if (!strcmp(disk_type_param, "dynamic")) {
@@ -775,28 +772,17 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    } else {
        disk_type = VHD_DYNAMIC;
    }
-    nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false);

-    /* Create the file */
-    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
-    if (fd < 0) {
-        ret = -EIO;
+    ret = bdrv_create_file(filename, opts, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
        goto out;
    }
-
-    if (nocow) {
-#ifdef __linux__
-        /* Set NOCOW flag to solve performance issue on fs like btrfs.
-         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will
-         * be ignored since any failure of this operation should not block the
-         * left work.
-         */
-        int attr;
-        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
-            attr |= FS_NOCOW_FL;
-            ioctl(fd, FS_IOC_SETFLAGS, &attr);
-        }
-#endif
+    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                    NULL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto out;
    }

    /*
@@ -810,7 +796,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
                               &secs_per_cyl))
        {
            ret = -EFBIG;
-            goto fail;
+            goto out;
        }
    }

@@ -824,46 +810,45 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    memcpy(footer->creator_app, "qemu", 4);
    memcpy(footer->creator_os, "Wi2k", 4);

-    footer->features = be32_to_cpu(0x02);
-    footer->version = be32_to_cpu(0x00010000);
+    footer->features = cpu_to_be32(0x02);
+    footer->version = cpu_to_be32(0x00010000);
    if (disk_type == VHD_DYNAMIC) {
-        footer->data_offset = be64_to_cpu(HEADER_SIZE);
+        footer->data_offset = cpu_to_be64(HEADER_SIZE);
    } else {
-        footer->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL);
+        footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
    }
-    footer->timestamp = be32_to_cpu(time(NULL) - VHD_TIMESTAMP_BASE);
+    footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);

    /* Version of Virtual PC 2007 */
-    footer->major = be16_to_cpu(0x0005);
-    footer->minor = be16_to_cpu(0x0003);
+    footer->major = cpu_to_be16(0x0005);
+    footer->minor = cpu_to_be16(0x0003);
    if (disk_type == VHD_DYNAMIC) {
-        footer->orig_size = be64_to_cpu(total_sectors * 512);
-        footer->size = be64_to_cpu(total_sectors * 512);
+        footer->orig_size = cpu_to_be64(total_sectors * 512);
+        footer->size = cpu_to_be64(total_sectors * 512);
    } else {
-        footer->orig_size = be64_to_cpu(total_size);
-        footer->size = be64_to_cpu(total_size);
+        footer->orig_size = cpu_to_be64(total_size);
+        footer->size = cpu_to_be64(total_size);
    }
-    footer->cyls = be16_to_cpu(cyls);
+    footer->cyls = cpu_to_be16(cyls);
    footer->heads = heads;
    footer->secs_per_cyl = secs_per_cyl;

-    footer->type = be32_to_cpu(disk_type);
+    footer->type = cpu_to_be32(disk_type);

 #if defined(CONFIG_UUID)
    uuid_generate(footer->uuid);
 #endif

-    footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE));
+    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));

    if (disk_type == VHD_DYNAMIC) {
-        ret = create_dynamic_disk(fd, buf, total_sectors);
+        ret = create_dynamic_disk(bs, buf, total_sectors);
    } else {
-        ret = create_fixed_disk(fd, buf, total_size);
+        ret = create_fixed_disk(bs, buf, total_size);
    }

-fail:
-    qemu_close(fd);
 out:
+    bdrv_unref(bs);
    g_free(disk_type_param);
    return ret;
 }
@@ -873,7 +858,7 @@ static int vpc_has_zero_init(BlockDriverState *bs)
    BDRVVPCState *s = bs->opaque;
    VHDFooter *footer =  (VHDFooter *) s->footer_buf;

-    if (cpu_to_be32(footer->type) == VHD_FIXED) {
+    if (be32_to_cpu(footer->type) == VHD_FIXED) {
        return bdrv_has_zero_init(bs->file);
    } else {
        return 1;
@@ -908,11 +893,6 @@ static QemuOptsList vpc_create_opts = {
                "Type of virtual hard disk format. Supported formats are "
                "{dynamic (default) | fixed} "
        },
-        {
-            .name = BLOCK_OPT_NOCOW,
-            .type = QEMU_OPT_BOOL,
-            .help = "Turn off copy-on-write (valid only on btrfs)"
-        },
        { /* end of list */ }
    }
 };
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -52,10 +52,6 @@

 #define DLOG(a) a

-#undef stderr
-#define stderr STDERR
-FILE* stderr = NULL;
-
 static void checkpoint(void);

 #ifdef __MINGW32__
@@ -732,7 +728,7 @@ static int read_directory(BDRVVVFATState* s, int mapping_index)
 	if(first_cluster == 0 && (is_dotdot || is_dot))
 	    continue;

-	buffer=(char*)g_malloc(length);
+	buffer = g_malloc(length);
 	snprintf(buffer,length,"%s/%s",dirname,entry->d_name);

 	if(stat(buffer,&st)<0) {
@@ -767,7 +763,7 @@ static int read_directory(BDRVVVFATState* s, int mapping_index)

 	/* create mapping for this file */
 	if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) {
-	    s->current_mapping=(mapping_t*)array_get_next(&(s->mapping));
+	    s->current_mapping = array_get_next(&(s->mapping));
 	    s->current_mapping->begin=0;
 	    s->current_mapping->end=st.st_size;
 	    /*
@@ -811,12 +807,12 @@ static int read_directory(BDRVVVFATState* s, int mapping_index)
    }

     /* reget the mapping, since s->mapping was possibly realloc()ed */
-    mapping = (mapping_t*)array_get(&(s->mapping), mapping_index);
+    mapping = array_get(&(s->mapping), mapping_index);
    first_cluster += (s->directory.next - mapping->info.dir.first_dir_index)
 	* 0x20 / s->cluster_size;
    mapping->end = first_cluster;

-    direntry = (direntry_t*)array_get(&(s->directory), mapping->dir_index);
+    direntry = array_get(&(s->directory), mapping->dir_index);
    set_begin_of_direntry(direntry, mapping->begin);

    return 0;
@@ -1082,11 +1078,6 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
    vvv = s;
 #endif

-DLOG(if (stderr == NULL) {
-    stderr = fopen("vvfat.log", "a");
-    setbuf(stderr, NULL);
-})
-
    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
@@ -1191,7 +1182,7 @@ DLOG(if (stderr == NULL) {
    if (s->qcow) {
        error_set(&s->migration_blocker,
                  QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-                  "vvfat (rw)", bs->device_name, "live migration");
+                  "vvfat (rw)", bdrv_get_device_name(bs), "live migration");
        migrate_add_blocker(s->migration_blocker);
    }

@@ -2926,6 +2917,12 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp)
    }

    bdrv_qcow = bdrv_find_format("qcow");
+    if (!bdrv_qcow) {
+        error_setg(errp, "Failed to locate qcow driver");
+        ret = -ENOENT;
+        goto err;
+    }
+
    opts = qemu_opts_create(bdrv_qcow->create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, s->sector_count * 512);
    qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, "fat:");
@@ -2948,9 +2945,9 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp)
    unlink(s->qcow_filename);
 #endif

-    bdrv_set_backing_hd(s->bs, bdrv_new("", &error_abort));
+    bdrv_set_backing_hd(s->bs, bdrv_new());
    s->bs->backing_hd->drv = &vvfat_write_target;
-    s->bs->backing_hd->opaque = g_malloc(sizeof(void*));
+    s->bs->backing_hd->opaque = g_new(void *, 1);
    *(void**)s->bs->backing_hd->opaque = s;

    return 0;
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -44,7 +44,7 @@ struct QEMUWin32AIOState {
 };

 typedef struct QEMUWin32AIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    struct QEMUWin32AIOState *ctx;
    int nbytes;
    OVERLAPPED ov;
@@ -88,7 +88,7 @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,


    waiocb->common.cb(waiocb->common.opaque, ret);
-    qemu_aio_release(waiocb);
+    qemu_aio_unref(waiocb);
 }

 static void win32_aio_completion_cb(EventNotifier *e)
@@ -106,28 +106,14 @@ static void win32_aio_completion_cb(EventNotifier *e)
    }
 }

-static void win32_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    QEMUWin32AIOCB *waiocb = (QEMUWin32AIOCB *)blockacb;
-
-    /*
-     * CancelIoEx is only supported in Vista and newer.  For now, just
-     * wait for completion.
-     */
-    while (!HasOverlappedIoCompleted(&waiocb->ov)) {
-        aio_poll(bdrv_get_aio_context(blockacb->bs), true);
-    }
-}
-
 static const AIOCBInfo win32_aiocb_info = {
    .aiocb_size         = sizeof(QEMUWin32AIOCB),
-    .cancel             = win32_aio_cancel,
 };

-BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
        QEMUWin32AIOState *aio, HANDLE hfile,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    struct QEMUWin32AIOCB *waiocb;
    uint64_t offset = sector_num * 512;
@@ -139,7 +125,10 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
    waiocb->is_read = (type == QEMU_AIO_READ);

    if (qiov->niov > 1) {
-        waiocb->buf = qemu_blockalign(bs, qiov->size);
+        waiocb->buf = qemu_try_blockalign(bs, qiov->size);
+        if (waiocb->buf == NULL) {
+            goto out;
+        }
        if (type & QEMU_AIO_WRITE) {
            iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size);
        }
@@ -168,7 +157,8 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,

 out_dec_count:
    aio->count--;
-    qemu_aio_release(waiocb);
+out:
+    qemu_aio_unref(waiocb);
    return NULL;
 }

--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -10,6 +10,7 @@
 */

 #include "sysemu/blockdev.h"
+#include "sysemu/block-backend.h"
 #include "hw/block/block.h"
 #include "monitor/monitor.h"
 #include "qapi/qmp/qerror.h"
@@ -73,7 +74,7 @@ static void nbd_close_notifier(Notifier *n, void *data)
 void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
                        Error **errp)
 {
-    BlockDriverState *bs;
+    BlockBackend *blk;
    NBDExport *exp;
    NBDCloseNotifier *n;

@@ -87,12 +88,12 @@ void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
        return;
    }

-    bs = bdrv_find(device);
-    if (!bs) {
+    blk = blk_by_name(device);
+    if (!blk) {
        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
        return;
    }
-    if (!bdrv_is_inserted(bs)) {
+    if (!blk_is_inserted(blk)) {
        error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, device);
        return;
    }
@@ -100,18 +101,18 @@ void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
    if (!has_writable) {
        writable = false;
    }
-    if (bdrv_is_read_only(bs)) {
+    if (blk_is_read_only(blk)) {
        writable = false;
    }

-    exp = nbd_export_new(bs, 0, -1, writable ? 0 : NBD_FLAG_READ_ONLY, NULL);
+    exp = nbd_export_new(blk, 0, -1, writable ? 0 : NBD_FLAG_READ_ONLY, NULL);

    nbd_export_set_name(exp, device);

-    n = g_malloc0(sizeof(NBDCloseNotifier));
+    n = g_new0(NBDCloseNotifier, 1);
    n->n.notify = nbd_close_notifier;
    n->exp = exp;
-    bdrv_add_close_notifier(bs, &n->n);
+    blk_add_close_notifier(blk, &n->n);
    QTAILQ_INSERT_TAIL(&close_notifiers, n, next);
 }

--- a/blockdev.c
+++ b/blockdev.c
--- a/blockjob.c
+++ b/blockjob.c
@@ -36,7 +36,7 @@
 #include "qapi-event.h"

 void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
-                       int64_t speed, BlockDriverCompletionFunc *cb,
+                       int64_t speed, BlockCompletionFunc *cb,
                       void *opaque, Error **errp)
 {
    BlockJob *job;
@@ -50,6 +50,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
    error_setg(&job->blocker, "block device is in use by block job: %s",
               BlockJobType_lookup[driver->job_type]);
    bdrv_op_block_all(bs, job->blocker);
+    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);

    job->driver        = driver;
    job->bs            = bs;
@@ -107,7 +108,8 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
 void block_job_complete(BlockJob *job, Error **errp)
 {
    if (job->paused || job->cancelled || !job->driver->complete) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
+        error_set(errp, QERR_BLOCK_JOB_NOT_READY,
+                  bdrv_get_device_name(job->bs));
        return;
    }

@@ -152,27 +154,30 @@ void block_job_iostatus_reset(BlockJob *job)
    }
 }

-struct BlockCancelData {
+struct BlockFinishData {
    BlockJob *job;
-    BlockDriverCompletionFunc *cb;
+    BlockCompletionFunc *cb;
    void *opaque;
    bool cancelled;
    int ret;
 };

-static void block_job_cancel_cb(void *opaque, int ret)
+static void block_job_finish_cb(void *opaque, int ret)
 {
-    struct BlockCancelData *data = opaque;
+    struct BlockFinishData *data = opaque;

    data->cancelled = block_job_is_cancelled(data->job);
    data->ret = ret;
    data->cb(data->opaque, ret);
 }

-int block_job_cancel_sync(BlockJob *job)
+static int block_job_finish_sync(BlockJob *job,
+                                 void (*finish)(BlockJob *, Error **errp),
+                                 Error **errp)
 {
-    struct BlockCancelData data;
+    struct BlockFinishData data;
    BlockDriverState *bs = job->bs;
+    Error *local_err = NULL;

    assert(bs->job == job);

@@ -183,15 +188,37 @@ int block_job_cancel_sync(BlockJob *job)
    data.cb = job->cb;
    data.opaque = job->opaque;
    data.ret = -EINPROGRESS;
-    job->cb = block_job_cancel_cb;
+    job->cb = block_job_finish_cb;
    job->opaque = &data;
-    block_job_cancel(job);
+    finish(job, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return -EBUSY;
+    }
    while (data.ret == -EINPROGRESS) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }
    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
 }

+/* A wrapper around block_job_cancel() taking an Error ** parameter so it may be
+ * used with block_job_finish_sync() without the need for (rather nasty)
+ * function pointer casts there. */
+static void block_job_cancel_err(BlockJob *job, Error **errp)
+{
+    block_job_cancel(job);
+}
+
+int block_job_cancel_sync(BlockJob *job)
+{
+    return block_job_finish_sync(job, &block_job_cancel_err, NULL);
+}
+
+int block_job_complete_sync(BlockJob *job, Error **errp)
+{
+    return block_job_finish_sync(job, &block_job_complete, errp);
+}
+
 void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
 {
    assert(job->busy);
@@ -205,7 +232,7 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
    if (block_job_is_paused(job)) {
        qemu_coroutine_yield();
    } else {
-        co_sleep_ns(type, ns);
+        co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
    }
    job->busy = true;
 }
@@ -235,6 +262,7 @@ BlockJobInfo *block_job_query(BlockJob *job)
    info->offset    = job->offset;
    info->speed     = job->speed;
    info->io_status = job->iostatus;
+    info->ready     = job->ready;
    return info;
 }

@@ -270,6 +298,8 @@ void block_job_event_completed(BlockJob *job, const char *msg)

 void block_job_event_ready(BlockJob *job)
 {
+    job->ready = true;
+
    qapi_event_send_block_job_ready(job->driver->job_type,
                                    bdrv_get_device_name(job->bs),
                                    job->len,
@@ -313,3 +343,48 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
    }
    return action;
 }
+
+typedef struct {
+    BlockJob *job;
+    QEMUBH *bh;
+    AioContext *aio_context;
+    BlockJobDeferToMainLoopFn *fn;
+    void *opaque;
+} BlockJobDeferToMainLoopData;
+
+static void block_job_defer_to_main_loop_bh(void *opaque)
+{
+    BlockJobDeferToMainLoopData *data = opaque;
+    AioContext *aio_context;
+
+    qemu_bh_delete(data->bh);
+
+    /* Prevent race with block_job_defer_to_main_loop() */
+    aio_context_acquire(data->aio_context);
+
+    /* Fetch BDS AioContext again, in case it has changed */
+    aio_context = bdrv_get_aio_context(data->job->bs);
+    aio_context_acquire(aio_context);
+
+    data->fn(data->job, data->opaque);
+
+    aio_context_release(aio_context);
+
+    aio_context_release(data->aio_context);
+
+    g_free(data);
+}
+
+void block_job_defer_to_main_loop(BlockJob *job,
+                                  BlockJobDeferToMainLoopFn *fn,
+                                  void *opaque)
+{
+    BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data));
+    data->job = job;
+    data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data);
+    data->aio_context = bdrv_get_aio_context(job->bs);
+    data->fn = fn;
+    data->opaque = opaque;
+
+    qemu_bh_schedule(data->bh);
+}
--- a/bootdevice.c
+++ b/bootdevice.c
@@ -0,0 +1,331 @@
+/*
+ * QEMU Boot Device Implement
+ *
+ * Copyright (c) 2014 HUAWEI TECHNOLOGIES CO.,LTD.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "sysemu/sysemu.h"
+#include "qapi/visitor.h"
+#include "qemu/error-report.h"
+#include "hw/hw.h"
+
+typedef struct FWBootEntry FWBootEntry;
+
+struct FWBootEntry {
+    QTAILQ_ENTRY(FWBootEntry) link;
+    int32_t bootindex;
+    DeviceState *dev;
+    char *suffix;
+};
+
+static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
+    QTAILQ_HEAD_INITIALIZER(fw_boot_order);
+static QEMUBootSetHandler *boot_set_handler;
+static void *boot_set_opaque;
+
+void qemu_register_boot_set(QEMUBootSetHandler *func, void *opaque)
+{
+    boot_set_handler = func;
+    boot_set_opaque = opaque;
+}
+
+void qemu_boot_set(const char *boot_order, Error **errp)
+{
+    Error *local_err = NULL;
+
+    if (!boot_set_handler) {
+        error_setg(errp, "no function defined to set boot device list for"
+                         " this architecture");
+        return;
+    }
+
+    validate_bootdevices(boot_order, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    boot_set_handler(boot_set_opaque, boot_order, errp);
+}
+
+void validate_bootdevices(const char *devices, Error **errp)
+{
+    /* We just do some generic consistency checks */
+    const char *p;
+    int bitmap = 0;
+
+    for (p = devices; *p != '\0'; p++) {
+        /* Allowed boot devices are:
+         * a-b: floppy disk drives
+         * c-f: IDE disk drives
+         * g-m: machine implementation dependent drives
+         * n-p: network devices
+         * It's up to each machine implementation to check if the given boot
+         * devices match the actual hardware implementation and firmware
+         * features.
+         */
+        if (*p < 'a' || *p > 'p') {
+            error_setg(errp, "Invalid boot device '%c'", *p);
+            return;
+        }
+        if (bitmap & (1 << (*p - 'a'))) {
+            error_setg(errp, "Boot device '%c' was given twice", *p);
+            return;
+        }
+        bitmap |= 1 << (*p - 'a');
+    }
+}
+
+void restore_boot_order(void *opaque)
+{
+    char *normal_boot_order = opaque;
+    static int first = 1;
+
+    /* Restore boot order and remove ourselves after the first boot */
+    if (first) {
+        first = 0;
+        return;
+    }
+
+    qemu_boot_set(normal_boot_order, NULL);
+
+    qemu_unregister_reset(restore_boot_order, normal_boot_order);
+    g_free(normal_boot_order);
+}
+
+void check_boot_index(int32_t bootindex, Error **errp)
+{
+    FWBootEntry *i;
+
+    if (bootindex >= 0) {
+        QTAILQ_FOREACH(i, &fw_boot_order, link) {
+            if (i->bootindex == bootindex) {
+                error_setg(errp, "The bootindex %d has already been used",
+                           bootindex);
+                return;
+            }
+        }
+    }
+}
+
+void del_boot_device_path(DeviceState *dev, const char *suffix)
+{
+    FWBootEntry *i;
+
+    if (dev == NULL) {
+        return;
+    }
+
+    QTAILQ_FOREACH(i, &fw_boot_order, link) {
+        if ((!suffix || !g_strcmp0(i->suffix, suffix)) &&
+             i->dev == dev) {
+            QTAILQ_REMOVE(&fw_boot_order, i, link);
+            g_free(i->suffix);
+            g_free(i);
+
+            break;
+        }
+    }
+}
+
+void add_boot_device_path(int32_t bootindex, DeviceState *dev,
+                          const char *suffix)
+{
+    FWBootEntry *node, *i;
+
+    if (bootindex < 0) {
+        del_boot_device_path(dev, suffix);
+        return;
+    }
+
+    assert(dev != NULL || suffix != NULL);
+
+    del_boot_device_path(dev, suffix);
+
+    node = g_malloc0(sizeof(FWBootEntry));
+    node->bootindex = bootindex;
+    node->suffix = g_strdup(suffix);
+    node->dev = dev;
+
+    QTAILQ_FOREACH(i, &fw_boot_order, link) {
+        if (i->bootindex == bootindex) {
+            error_report("Two devices with same boot index %d", bootindex);
+            exit(1);
+        } else if (i->bootindex < bootindex) {
+            continue;
+        }
+        QTAILQ_INSERT_BEFORE(i, node, link);
+        return;
+    }
+    QTAILQ_INSERT_TAIL(&fw_boot_order, node, link);
+}
+
+DeviceState *get_boot_device(uint32_t position)
+{
+    uint32_t counter = 0;
+    FWBootEntry *i = NULL;
+    DeviceState *res = NULL;
+
+    if (!QTAILQ_EMPTY(&fw_boot_order)) {
+        QTAILQ_FOREACH(i, &fw_boot_order, link) {
+            if (counter == position) {
+                res = i->dev;
+                break;
+            }
+            counter++;
+        }
+    }
+    return res;
+}
+
+/*
+ * This function returns null terminated string that consist of new line
+ * separated device paths.
+ *
+ * memory pointed by "size" is assigned total length of the array in bytes
+ *
+ */
+char *get_boot_devices_list(size_t *size, bool ignore_suffixes)
+{
+    FWBootEntry *i;
+    size_t total = 0;
+    char *list = NULL;
+
+    QTAILQ_FOREACH(i, &fw_boot_order, link) {
+        char *devpath = NULL, *bootpath;
+        size_t len;
+
+        if (i->dev) {
+            devpath = qdev_get_fw_dev_path(i->dev);
+            assert(devpath);
+        }
+
+        if (i->suffix && !ignore_suffixes && devpath) {
+            size_t bootpathlen = strlen(devpath) + strlen(i->suffix) + 1;
+
+            bootpath = g_malloc(bootpathlen);
+            snprintf(bootpath, bootpathlen, "%s%s", devpath, i->suffix);
+            g_free(devpath);
+        } else if (devpath) {
+            bootpath = devpath;
+        } else if (!ignore_suffixes) {
+            assert(i->suffix);
+            bootpath = g_strdup(i->suffix);
+        } else {
+            bootpath = g_strdup("");
+        }
+
+        if (total) {
+            list[total-1] = '\n';
+        }
+        len = strlen(bootpath) + 1;
+        list = g_realloc(list, total + len);
+        memcpy(&list[total], bootpath, len);
+        total += len;
+        g_free(bootpath);
+    }
+
+    *size = total;
+
+    if (boot_strict && *size > 0) {
+        list[total-1] = '\n';
+        list = g_realloc(list, total + 5);
+        memcpy(&list[total], "HALT", 5);
+        *size = total + 5;
+    }
+    return list;
+}
+
+typedef struct {
+    int32_t *bootindex;
+    const char *suffix;
+    DeviceState *dev;
+} BootIndexProperty;
+
+static void device_get_bootindex(Object *obj, Visitor *v, void *opaque,
+                                 const char *name, Error **errp)
+{
+    BootIndexProperty *prop = opaque;
+    visit_type_int32(v, prop->bootindex, name, errp);
+}
+
+static void device_set_bootindex(Object *obj, Visitor *v, void *opaque,
+                                 const char *name, Error **errp)
+{
+    BootIndexProperty *prop = opaque;
+    int32_t boot_index;
+    Error *local_err = NULL;
+
+    visit_type_int32(v, &boot_index, name, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    /* check whether bootindex is present in fw_boot_order list  */
+    check_boot_index(boot_index, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    /* change bootindex to a new one */
+    *prop->bootindex = boot_index;
+
+    add_boot_device_path(*prop->bootindex, prop->dev, prop->suffix);
+
+out:
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }
+}
+
+static void property_release_bootindex(Object *obj, const char *name,
+                                       void *opaque)
+
+{
+    BootIndexProperty *prop = opaque;
+
+    del_boot_device_path(prop->dev, prop->suffix);
+    g_free(prop);
+}
+
+void device_add_bootindex_property(Object *obj, int32_t *bootindex,
+                                   const char *name, const char *suffix,
+                                   DeviceState *dev, Error **errp)
+{
+    Error *local_err = NULL;
+    BootIndexProperty *prop = g_malloc0(sizeof(*prop));
+
+    prop->bootindex = bootindex;
+    prop->suffix = suffix;
+    prop->dev = dev;
+
+    object_property_add(obj, name, "int32",
+                        device_get_bootindex,
+                        device_set_bootindex,
+                        property_release_bootindex,
+                        prop, &local_err);
+
+    if (local_err) {
+        error_propagate(errp, local_err);
+        g_free(prop);
+        return;
+    }
+    /* initialize devices' bootindex property to -1 */
+    object_property_set_int(obj, -1, name, NULL);
+}
--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@@ -351,8 +351,10 @@ static inline void init_thread(struct target_pt_regs *_regs, struct image_info *

    _regs->gpr[1] = infop->start_stack;
 #if defined(TARGET_PPC64) && !defined(TARGET_ABI32)
-    entry = ldq_raw(infop->entry) + infop->load_addr;
-    toc = ldq_raw(infop->entry + 8) + infop->load_addr;
+    get_user_u64(entry, infop->entry);
+    entry += infop->load_addr;
+    get_user_u64(toc, infop->entry + 8);
+    toc += infop->load_addr;
    _regs->gpr[2] = toc;
    infop->entry = entry;
 #endif
@@ -365,8 +367,9 @@ static inline void init_thread(struct target_pt_regs *_regs, struct image_info *
    get_user_ual(_regs->gpr[3], pos);
    pos += sizeof(abi_ulong);
    _regs->gpr[4] = pos;
-    for (tmp = 1; tmp != 0; pos += sizeof(abi_ulong))
-        tmp = ldl(pos);
+    for (tmp = 1; tmp != 0; pos += sizeof(abi_ulong)) {
+        get_user_ual(tmp, pos);
+    }
    _regs->gpr[5] = pos;
 }

--- a/204
+++ b/204
@@ -326,7 +326,7 @@ seccomp=""
 glusterfs=""
 glusterfs_discard="no"
 glusterfs_zerofill="no"
-virtio_blk_data_plane=""
+archipelago=""
 gtk=""
 gtkabi=""
 vte=""
@@ -388,6 +388,7 @@ cpp="${CPP-$cc -E}"
 objcopy="${OBJCOPY-${cross_prefix}objcopy}"
 ld="${LD-${cross_prefix}ld}"
 libtool="${LIBTOOL-${cross_prefix}libtool}"
+nm="${NM-${cross_prefix}nm}"
 strip="${STRIP-${cross_prefix}strip}"
 windres="${WINDRES-${cross_prefix}windres}"
 pkg_config_exe="${PKG_CONFIG-${cross_prefix}pkg-config}"
@@ -1087,9 +1088,12 @@ for opt do
  ;;
  --enable-glusterfs) glusterfs="yes"
  ;;
-  --disable-virtio-blk-data-plane) virtio_blk_data_plane="no"
+  --disable-archipelago) archipelago="no"
  ;;
-  --enable-virtio-blk-data-plane) virtio_blk_data_plane="yes"
+  --enable-archipelago) archipelago="yes"
+  ;;
+  --disable-virtio-blk-data-plane|--enable-virtio-blk-data-plane)
+      echo "$0: $opt is obsolete, virtio-blk data-plane is always on" >&2
  ;;
  --disable-gtk) gtk="no"
  ;;
@@ -1344,7 +1348,7 @@ Advanced options (experts only):
  --enable-linux-aio       enable Linux AIO support
  --disable-cap-ng         disable libcap-ng support
  --enable-cap-ng          enable libcap-ng support
-  --disable-attr           disables attr and xattr support
+  --disable-attr           disable attr and xattr support
  --enable-attr            enable attr and xattr support
  --disable-blobs          disable installing provided firmware blobs
  --enable-docs            enable documentation build
@@ -1375,20 +1379,22 @@ Advanced options (experts only):
  --with-vss-sdk=SDK-path  enable Windows VSS support in QEMU Guest Agent
  --with-win-sdk=SDK-path  path to Windows Platform SDK (to build VSS .tlb)
  --disable-seccomp        disable seccomp support
-  --enable-seccomp         enables seccomp support
+  --enable-seccomp         enable seccomp support
  --with-coroutine=BACKEND coroutine backend. Supported options:
                           gthread, ucontext, sigaltstack, windows
  --disable-coroutine-pool disable coroutine freelist (worse performance)
  --enable-coroutine-pool  enable coroutine freelist (better performance)
  --enable-glusterfs       enable GlusterFS backend
  --disable-glusterfs      disable GlusterFS backend
+  --enable-archipelago     enable Archipelago backend
+  --disable-archipelago    disable Archipelago backend
  --enable-gcov            enable test coverage analysis with gcov
  --gcov=GCOV              use specified gcov [$gcov_tool]
  --disable-tpm            disable TPM support
  --enable-tpm             enable TPM support
  --disable-libssh2        disable ssh block device support
  --enable-libssh2         enable ssh block device support
-  --disable-vhdx           disables support for the Microsoft VHDX image format
+  --disable-vhdx           disable support for the Microsoft VHDX image format
  --enable-vhdx            enable support for the Microsoft VHDX image format
  --disable-quorum         disable quorum block filter support
  --enable-quorum          enable quorum block filter support
@@ -1723,6 +1729,7 @@ fi

 cat > $TMPC <<EOF
 #include <sys/socket.h>
+#include <linux/ip.h>
 int main(void) { return sizeof(struct mmsghdr); }
 EOF
 if compile_prog "" "" ; then
@@ -1816,13 +1823,14 @@ fi
 # libseccomp check

 if test "$seccomp" != "no" ; then
-    if $pkg_config --atleast-version=2.1.0 libseccomp; then
+    if test "$cpu" = "i386" || test "$cpu" = "x86_64" &&
+        $pkg_config --atleast-version=2.1.1 libseccomp; then
        libs_softmmu="$libs_softmmu `$pkg_config --libs libseccomp`"
        QEMU_CFLAGS="$QEMU_CFLAGS `$pkg_config --cflags libseccomp`"
 	seccomp="yes"
    else
 	if test "$seccomp" = "yes"; then
-            feature_not_found "libseccomp" "Install libseccomp devel >= 2.1.0"
+            feature_not_found "libseccomp" "Install libseccomp devel >= 2.1.1"
 	fi
 	seccomp="no"
    fi
@@ -1861,6 +1869,32 @@ EOF
 #if !defined(HVM_MAX_VCPUS)
 # error HVM_MAX_VCPUS not defined
 #endif
+int main(void) {
+  xc_interface *xc;
+  xs_daemon_open();
+  xc = xc_interface_open(0, 0, 0);
+  xc_hvm_set_mem_type(0, 0, HVMMEM_ram_ro, 0, 0);
+  xc_gnttab_open(NULL, 0);
+  xc_domain_add_to_physmap(0, 0, XENMAPSPACE_gmfn, 0, 0);
+  xc_hvm_inject_msi(xc, 0, 0xf0000000, 0x00000000);
+  xc_hvm_create_ioreq_server(xc, 0, 0, NULL);
+  return 0;
+}
+EOF
+      compile_prog "" "$xen_libs"
+    then
+    xen_ctrl_version=450
+    xen=yes
+
+  elif
+      cat > $TMPC <<EOF &&
+#include <xenctrl.h>
+#include <xenstore.h>
+#include <stdint.h>
+#include <xen/hvm/hvm_info_table.h>
+#if !defined(HVM_MAX_VCPUS)
+# error HVM_MAX_VCPUS not defined
+#endif
 int main(void) {
  xc_interface *xc;
  xs_daemon_open();
@@ -2708,12 +2742,18 @@ for i in $glib_modules; do
    fi
 done

+# g_test_trap_subprocess added in 2.38. Used by some tests.
+glib_subprocess=yes
+if ! $pkg_config --atleast-version=2.38 glib-2.0; then
+    glib_subprocess=no
+fi
+
 ##########################################
 # SHA command probe for modules
 if test "$modules" = yes; then
    shacmd_probe="sha1sum sha1 shasum"
    for c in $shacmd_probe; do
-        if which $c >/dev/null 2>&1; then
+        if has $c; then
            shacmd="$c"
            break
        fi
@@ -2729,7 +2769,7 @@ fi
 if test "$pixman" = ""; then
  if test "$want_tools" = "no" -a "$softmmu" = "no"; then
    pixman="none"
-  elif $pkg_config pixman-1 > /dev/null 2>&1; then
+  elif $pkg_config --atleast-version=0.21.8 pixman-1 > /dev/null 2>&1; then
    pixman="system"
  else
    pixman="internal"
@@ -2745,11 +2785,12 @@ if test "$pixman" = "none"; then
  pixman_cflags=
  pixman_libs=
 elif test "$pixman" = "system"; then
+  # pixman version has been checked above
  pixman_cflags=`$pkg_config --cflags pixman-1`
  pixman_libs=`$pkg_config --libs pixman-1`
 else
  if test ! -d ${source_path}/pixman/pixman; then
-    error_exit "pixman not present. Your options:" \
+    error_exit "pixman >= 0.21.8 not present. Your options:" \
        "  (1) Preferred: Install the pixman devel package (any recent" \
        "      distro should have packages as Xorg needs pixman too)." \
        "  (2) Fetch the pixman submodule, using:" \
@@ -2927,16 +2968,6 @@ else
  tpm_passthrough=no
 fi

-##########################################
-# adjust virtio-blk-data-plane based on linux-aio
-
-if test "$virtio_blk_data_plane" = "yes" -a \
-	"$linux_aio" != "yes" ; then
-  error_exit "virtio-blk-data-plane requires Linux AIO, please try --enable-linux-aio"
-elif test -z "$virtio_blk_data_plane" ; then
-  virtio_blk_data_plane=$linux_aio
-fi
-
 ##########################################
 # attr probe

@@ -3071,6 +3102,33 @@ EOF
  fi
 fi

+
+##########################################
+# archipelago probe
+if test "$archipelago" != "no" ; then
+    cat > $TMPC <<EOF
+#include <stdio.h>
+#include <xseg/xseg.h>
+#include <xseg/protocol.h>
+int main(void) {
+    xseg_initialize();
+    return 0;
+}
+EOF
+    archipelago_libs=-lxseg
+    if compile_prog "" "$archipelago_libs"; then
+        archipelago="yes"
+        libs_tools="$archipelago_libs $libs_tools"
+        libs_softmmu="$archipelago_libs $libs_softmmu"
+    else
+      if test "$archipelago" = "yes" ; then
+        feature_not_found "Archipelago backend support" "Install libxseg devel"
+      fi
+      archipelago="no"
+    fi
+fi
+
+
 ##########################################
 # glusterfs probe
 if test "$glusterfs" != "no" ; then
@@ -3086,7 +3144,8 @@ if test "$glusterfs" != "no" ; then
    fi
  else
    if test "$glusterfs" = "yes" ; then
-      feature_not_found "GlusterFS backend support" "Install glusterfs-api devel"
+      feature_not_found "GlusterFS backend support" \
+          "Install glusterfs-api devel >= 3"
    fi
    glusterfs="no"
  fi
@@ -3276,6 +3335,21 @@ if compile_prog "" "" ; then
  fallocate_punch_hole=yes
 fi

+# check for posix_fallocate
+posix_fallocate=no
+cat > $TMPC << EOF
+#include <fcntl.h>
+
+int main(void)
+{
+    posix_fallocate(0, 0, 0);
+    return 0;
+}
+EOF
+if compile_prog "" "" ; then
+    posix_fallocate=yes
+fi
+
 # check for sync_file_range
 sync_file_range=no
 cat > $TMPC << EOF
@@ -3420,6 +3494,37 @@ if compile_prog "" "" ; then
  sendfile=yes
 fi

+# check for timerfd support (glibc 2.8 and newer)
+timerfd=no
+cat > $TMPC << EOF
+#include <sys/timerfd.h>
+
+int main(void)
+{
+    return(timerfd_create(CLOCK_REALTIME, 0));
+}
+EOF
+if compile_prog "" "" ; then
+  timerfd=yes
+fi
+
+# check for setns and unshare support
+setns=no
+cat > $TMPC << EOF
+#include <sched.h>
+
+int main(void)
+{
+    int ret;
+    ret = setns(0, 0);
+    ret = unshare(0);
+    return ret;
+}
+EOF
+if compile_prog "" "" ; then
+  setns=yes
+fi
+
 # Check if tools are available to build documentation.
 if test "$docs" != "no" ; then
  if has makeinfo && has pod2man; then
@@ -3531,7 +3636,8 @@ EOF
    spice_server_version=$($pkg_config --modversion spice-server)
  else
    if test "$spice" = "yes" ; then
-      feature_not_found "spice" "Install spice-server and spice-protocol devel"
+      feature_not_found "spice" \
+          "Install spice-server(>=0.12.0) and spice-protocol(>=0.12.3) devel"
    fi
    spice="no"
  fi
@@ -3562,7 +3668,7 @@ EOF
        smartcard_nss="yes"
    else
        if test "$smartcard_nss" = "yes"; then
-            feature_not_found "nss"
+            feature_not_found "nss" "Install nss devel >= 3.12.8"
        fi
        smartcard_nss="no"
    fi
@@ -3578,7 +3684,7 @@ if test "$libusb" != "no" ; then
        libs_softmmu="$libs_softmmu $libusb_libs"
    else
        if test "$libusb" = "yes"; then
-            feature_not_found "libusb" "Install libusb devel"
+            feature_not_found "libusb" "Install libusb devel >= 1.0.13"
        fi
        libusb="no"
    fi
@@ -3892,12 +3998,11 @@ else
 fi

 ########################################
-# check if we have valgrind/valgrind.h and valgrind/memcheck.h
+# check if we have valgrind/valgrind.h

 valgrind_h=no
 cat > $TMPC << EOF
 #include <valgrind/valgrind.h>
-#include <valgrind/memcheck.h>
 int main(void) {
  return 0;
 }
@@ -4003,7 +4108,7 @@ if test "$libnfs" != "no" ; then
    LIBS="$LIBS $libnfs_libs"
  else
    if test "$libnfs" = "yes" ; then
-      feature_not_found "libnfs"
+      feature_not_found "libnfs" "Install libnfs devel >= 1.9.3"
    fi
    libnfs="no"
  fi
@@ -4133,9 +4238,9 @@ EOF
  fi
 fi

-# add pixman flags after all config tests are done
-QEMU_CFLAGS="$QEMU_CFLAGS $pixman_cflags $fdt_cflags"
-libs_softmmu="$libs_softmmu $pixman_libs"
+# prepend pixman and ftd flags after all config tests are done
+QEMU_CFLAGS="$pixman_cflags $fdt_cflags $QEMU_CFLAGS"
+libs_softmmu="$pixman_libs $libs_softmmu"

 echo "Install prefix    $prefix"
 echo "BIOS directory    `eval echo $qemu_datadir`"
@@ -4204,6 +4309,9 @@ if test -n "$sparc_cpu"; then
    echo "Target Sparc Arch $sparc_cpu"
 fi
 echo "xen support       $xen"
+if test "$xen" = "yes" ; then
+  echo "xen ctrl version  $xen_ctrl_version"
+fi
 echo "brlapi support    $brlapi"
 echo "bluez  support    $bluez"
 echo "Documentation     $docs"
@@ -4250,7 +4358,7 @@ echo "seccomp support   $seccomp"
 echo "coroutine backend $coroutine"
 echo "coroutine pool    $coroutine_pool"
 echo "GlusterFS support $glusterfs"
-echo "virtio-blk-data-plane $virtio_blk_data_plane"
+echo "Archipelago support $archipelago"
 echo "gcov              $gcov_tool"
 echo "gcov enabled      $gcov"
 echo "TPM support       $tpm"
@@ -4459,6 +4567,9 @@ fi
 if test "$fallocate_punch_hole" = "yes" ; then
  echo "CONFIG_FALLOCATE_PUNCH_HOLE=y" >> $config_host_mak
 fi
+if test "$posix_fallocate" = "yes" ; then
+  echo "CONFIG_POSIX_FALLOCATE=y" >> $config_host_mak
+fi
 if test "$sync_file_range" = "yes" ; then
  echo "CONFIG_SYNC_FILE_RANGE=y" >> $config_host_mak
 fi
@@ -4486,6 +4597,12 @@ fi
 if test "$sendfile" = "yes" ; then
  echo "CONFIG_SENDFILE=y" >> $config_host_mak
 fi
+if test "$timerfd" = "yes" ; then
+  echo "CONFIG_TIMERFD=y" >> $config_host_mak
+fi
+if test "$setns" = "yes" ; then
+  echo "CONFIG_SETNS=y" >> $config_host_mak
+fi
 if test "$inotify" = "yes" ; then
  echo "CONFIG_INOTIFY=y" >> $config_host_mak
 fi
@@ -4510,6 +4627,9 @@ if test "$bluez" = "yes" ; then
  echo "CONFIG_BLUEZ=y" >> $config_host_mak
  echo "BLUEZ_CFLAGS=$bluez_cflags" >> $config_host_mak
 fi
+if test "glib_subprocess" = "yes" ; then
+  echo "CONFIG_HAS_GLIB_SUBPROCESS_TESTS=y" >> $config_host_mak
+fi
 echo "GLIB_CFLAGS=$glib_cflags" >> $config_host_mak
 if test "$gtk" = "yes" ; then
  echo "CONFIG_GTK=y" >> $config_host_mak
@@ -4688,6 +4808,11 @@ if test "$glusterfs_zerofill" = "yes" ; then
  echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak
 fi

+if test "$archipelago" = "yes" ; then
+  echo "CONFIG_ARCHIPELAGO=m" >> $config_host_mak
+  echo "ARCHIPELAGO_LIBS=$archipelago_libs" >> $config_host_mak
+fi
+
 if test "$libssh2" = "yes" ; then
  echo "CONFIG_LIBSSH2=m" >> $config_host_mak
  echo "LIBSSH2_CFLAGS=$libssh2_cflags" >> $config_host_mak
@@ -4698,10 +4823,6 @@ if test "$quorum" = "yes" ; then
  echo "CONFIG_QUORUM=y" >> $config_host_mak
 fi

-if test "$virtio_blk_data_plane" = "yes" ; then
-  echo 'CONFIG_VIRTIO_BLK_DATA_PLANE=$(CONFIG_VIRTIO)' >> $config_host_mak
-fi
-
 if test "$vhdx" = "yes" ; then
  echo "CONFIG_VHDX=y" >> $config_host_mak
 fi
@@ -4808,6 +4929,7 @@ echo "AS=$as" >> $config_host_mak
 echo "CPP=$cpp" >> $config_host_mak
 echo "OBJCOPY=$objcopy" >> $config_host_mak
 echo "LD=$ld" >> $config_host_mak
+echo "NM=$nm" >> $config_host_mak
 echo "WINDRES=$windres" >> $config_host_mak
 echo "LIBTOOL=$libtool" >> $config_host_mak
 echo "CFLAGS=$CFLAGS" >> $config_host_mak
@@ -4816,6 +4938,7 @@ echo "QEMU_CFLAGS=$QEMU_CFLAGS" >> $config_host_mak
 echo "QEMU_INCLUDES=$QEMU_INCLUDES" >> $config_host_mak
 if test "$sparse" = "yes" ; then
  echo "CC           := REAL_CC=\"\$(CC)\" cgcc"       >> $config_host_mak
+  echo "CXX          := REAL_CC=\"\$(CXX)\" cgcc"      >> $config_host_mak
  echo "HOST_CC      := REAL_CC=\"\$(HOST_CC)\" cgcc"  >> $config_host_mak
  echo "QEMU_CFLAGS  += -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-non-pointer-null" >> $config_host_mak
 fi
@@ -4936,7 +5059,7 @@ case "$target_name" in
  aarch64)
    TARGET_BASE_ARCH=arm
    bflt="yes"
-    gdb_xml_files="aarch64-core.xml aarch64-fpu.xml"
+    gdb_xml_files="aarch64-core.xml aarch64-fpu.xml arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml"
  ;;
  cris)
  ;;
@@ -4965,6 +5088,8 @@ case "$target_name" in
    TARGET_BASE_ARCH=mips
    echo "TARGET_ABI_MIPSN64=y" >> $config_target_mak
  ;;
+  tricore)
+  ;;
  moxie)
  ;;
  or32)
@@ -5013,6 +5138,7 @@ case "$target_name" in
    echo "TARGET_ABI32=y" >> $config_target_mak
  ;;
  s390x)
+    gdb_xml_files="s390x-core64.xml s390-acr.xml s390-fpr.xml"
  ;;
  unicore32)
  ;;
@@ -5292,10 +5418,6 @@ for rom in seabios vgabios ; do
    echo "LD=$ld" >> $config_mak
 done

-if test "$docs" = "yes" ; then
-  mkdir -p QMP
-fi
-
 # set up qemu-iotests in this build directory
 iotests_common_env="tests/qemu-iotests/common.env"
 iotests_check="tests/qemu-iotests/check"
--- a/coroutine-sigaltstack.c
+++ b/coroutine-sigaltstack.c
@@ -155,7 +155,7 @@ Coroutine *qemu_coroutine_new(void)
    stack_t oss;
    sigset_t sigs;
    sigset_t osigs;
-    jmp_buf old_env;
+    sigjmp_buf old_env;

    /* The way to manipulate stack is with the sigaltstack function. We
     * prepare a stack, with it delivering a signal to ourselves and then
--- a/coroutine-ucontext.c
+++ b/coroutine-ucontext.c
@@ -25,7 +25,6 @@
 #include <stdlib.h>
 #include <setjmp.h>
 #include <stdint.h>
-#include <pthread.h>
 #include <ucontext.h>
 #include "qemu-common.h"
 #include "block/coroutine_int.h"
@@ -48,15 +47,8 @@ typedef struct {
 /**
 * Per-thread coroutine bookkeeping
 */
-typedef struct {
-    /** Currently executing coroutine */
-    Coroutine *current;
-
-    /** The default coroutine */
-    CoroutineUContext leader;
-} CoroutineThreadState;
-
-static pthread_key_t thread_state_key;
+static __thread CoroutineUContext leader;
+static __thread Coroutine *current;

 /*
 * va_args to makecontext() must be type 'int', so passing
@@ -68,36 +60,6 @@ union cc_arg {
    int i[2];
 };

-static CoroutineThreadState *coroutine_get_thread_state(void)
-{
-    CoroutineThreadState *s = pthread_getspecific(thread_state_key);
-
-    if (!s) {
-        s = g_malloc0(sizeof(*s));
-        s->current = &s->leader.base;
-        pthread_setspecific(thread_state_key, s);
-    }
-    return s;
-}
-
-static void qemu_coroutine_thread_cleanup(void *opaque)
-{
-    CoroutineThreadState *s = opaque;
-
-    g_free(s);
-}
-
-static void __attribute__((constructor)) coroutine_init(void)
-{
-    int ret;
-
-    ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup);
-    if (ret != 0) {
-        fprintf(stderr, "unable to create leader key: %s\n", strerror(errno));
-        abort();
-    }
-}
-
 static void coroutine_trampoline(int i0, int i1)
 {
    union cc_arg arg;
@@ -193,15 +155,23 @@ void qemu_coroutine_delete(Coroutine *co_)
    g_free(co);
 }

-CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
-                                      CoroutineAction action)
+/* This function is marked noinline to prevent GCC from inlining it
+ * into coroutine_trampoline(). If we allow it to do that then it
+ * hoists the code to get the address of the TLS variable "current"
+ * out of the while() loop. This is an invalid transformation because
+ * the sigsetjmp() call may be called when running thread A but
+ * return in thread B, and so we might be in a different thread
+ * context each time round the loop.
+ */
+CoroutineAction __attribute__((noinline))
+qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+                      CoroutineAction action)
 {
    CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_);
    CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_);
-    CoroutineThreadState *s = coroutine_get_thread_state();
    int ret;

-    s->current = to_;
+    current = to_;

    ret = sigsetjmp(from->env, 0);
    if (ret == 0) {
@@ -212,14 +182,13 @@ CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,

 Coroutine *qemu_coroutine_self(void)
 {
-    CoroutineThreadState *s = coroutine_get_thread_state();
-
-    return s->current;
+    if (!current) {
+        current = &leader.base;
+    }
+    return current;
 }

 bool qemu_in_coroutine(void)
 {
-    CoroutineThreadState *s = pthread_getspecific(thread_state_key);
-
-    return s && s->current->caller;
+    return current && current->caller;
 }
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -18,10 +18,114 @@
 */
 #include "config.h"
 #include "cpu.h"
+#include "trace.h"
 #include "disas/disas.h"
 #include "tcg.h"
 #include "qemu/atomic.h"
 #include "sysemu/qtest.h"
+#include "qemu/timer.h"
+
+/* -icount align implementation. */
+
+typedef struct SyncClocks {
+    int64_t diff_clk;
+    int64_t last_cpu_icount;
+    int64_t realtime_clock;
+} SyncClocks;
+
+#if !defined(CONFIG_USER_ONLY)
+/* Allow the guest to have a max 3ms advance.
+ * The difference between the 2 clocks could therefore
+ * oscillate around 0.
+ */
+#define VM_CLOCK_ADVANCE 3000000
+#define THRESHOLD_REDUCE 1.5
+#define MAX_DELAY_PRINT_RATE 2000000000LL
+#define MAX_NB_PRINTS 100
+
+static void align_clocks(SyncClocks *sc, const CPUState *cpu)
+{
+    int64_t cpu_icount;
+
+    if (!icount_align_option) {
+        return;
+    }
+
+    cpu_icount = cpu->icount_extra + cpu->icount_decr.u16.low;
+    sc->diff_clk += cpu_icount_to_ns(sc->last_cpu_icount - cpu_icount);
+    sc->last_cpu_icount = cpu_icount;
+
+    if (sc->diff_clk > VM_CLOCK_ADVANCE) {
+#ifndef _WIN32
+        struct timespec sleep_delay, rem_delay;
+        sleep_delay.tv_sec = sc->diff_clk / 1000000000LL;
+        sleep_delay.tv_nsec = sc->diff_clk % 1000000000LL;
+        if (nanosleep(&sleep_delay, &rem_delay) < 0) {
+            sc->diff_clk -= (sleep_delay.tv_sec - rem_delay.tv_sec) * 1000000000LL;
+            sc->diff_clk -= sleep_delay.tv_nsec - rem_delay.tv_nsec;
+        } else {
+            sc->diff_clk = 0;
+        }
+#else
+        Sleep(sc->diff_clk / SCALE_MS);
+        sc->diff_clk = 0;
+#endif
+    }
+}
+
+static void print_delay(const SyncClocks *sc)
+{
+    static float threshold_delay;
+    static int64_t last_realtime_clock;
+    static int nb_prints;
+
+    if (icount_align_option &&
+        sc->realtime_clock - last_realtime_clock >= MAX_DELAY_PRINT_RATE &&
+        nb_prints < MAX_NB_PRINTS) {
+        if ((-sc->diff_clk / (float)1000000000LL > threshold_delay) ||
+            (-sc->diff_clk / (float)1000000000LL <
+             (threshold_delay - THRESHOLD_REDUCE))) {
+            threshold_delay = (-sc->diff_clk / 1000000000LL) + 1;
+            printf("Warning: The guest is now late by %.1f to %.1f seconds\n",
+                   threshold_delay - 1,
+                   threshold_delay);
+            nb_prints++;
+            last_realtime_clock = sc->realtime_clock;
+        }
+    }
+}
+
+static void init_delay_params(SyncClocks *sc,
+                              const CPUState *cpu)
+{
+    if (!icount_align_option) {
+        return;
+    }
+    sc->realtime_clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    sc->diff_clk = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) -
+                   sc->realtime_clock +
+                   cpu_get_clock_offset();
+    sc->last_cpu_icount = cpu->icount_extra + cpu->icount_decr.u16.low;
+    if (sc->diff_clk < max_delay) {
+        max_delay = sc->diff_clk;
+    }
+    if (sc->diff_clk > max_advance) {
+        max_advance = sc->diff_clk;
+    }
+
+    /* Print every 2s max if the guest is late. We limit the number
+       of printed messages to NB_PRINT_MAX(currently 100) */
+    print_delay(sc);
+}
+#else
+static void align_clocks(SyncClocks *sc, const CPUState *cpu)
+{
+}
+
+static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
+{
+}
+#endif /* CONFIG USER ONLY */

 void cpu_loop_exit(CPUState *cpu)
 {
@@ -64,7 +168,12 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, uint8_t *tb_ptr)
    }
 #endif /* DEBUG_DISAS */

+    cpu->can_do_io = 0;
    next_tb = tcg_qemu_tb_exec(env, tb_ptr);
+    cpu->can_do_io = 1;
+    trace_exec_tb_exit((void *) (next_tb & ~TB_EXIT_MASK),
+                       next_tb & TB_EXIT_MASK);
+
    if ((next_tb & TB_EXIT_MASK) > TB_EXIT_IDX1) {
        /* We didn't start executing this TB (eg because the instruction
         * counter hit zero); we must restore the guest PC to the address
@@ -95,16 +204,22 @@ static void cpu_exec_nocache(CPUArchState *env, int max_cycles,
 {
    CPUState *cpu = ENV_GET_CPU(env);
    TranslationBlock *tb;
+    target_ulong pc = orig_tb->pc;
+    target_ulong cs_base = orig_tb->cs_base;
+    uint64_t flags = orig_tb->flags;

    /* Should never happen.
       We only end up here when an existing TB is too long.  */
    if (max_cycles > CF_COUNT_MASK)
        max_cycles = CF_COUNT_MASK;

-    tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
-                     max_cycles);
+    /* tb_gen_code can flush our orig_tb, invalidate it now */
+    tb_phys_invalidate(orig_tb, -1);
+    tb = tb_gen_code(cpu, pc, cs_base, flags,
+                     max_cycles | CF_NOCACHE);
    cpu->current_tb = tb;
    /* execute the generated code */
+    trace_exec_tb_nocache(tb, tb->pc);
    cpu_tb_exec(cpu, tb->tc_ptr);
    cpu->current_tb = NULL;
    tb_phys_invalidate(tb, -1);
@@ -187,16 +302,10 @@ static inline TranslationBlock *tb_find_fast(CPUArchState *env)
    return tb;
 }

-static CPUDebugExcpHandler *debug_excp_handler;
-
-void cpu_set_debug_excp_handler(CPUDebugExcpHandler *handler)
-{
-    debug_excp_handler = handler;
-}
-
 static void cpu_handle_debug_exception(CPUArchState *env)
 {
    CPUState *cpu = ENV_GET_CPU(env);
+    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUWatchpoint *wp;

    if (!cpu->watchpoint_hit) {
@@ -204,9 +313,8 @@ static void cpu_handle_debug_exception(CPUArchState *env)
            wp->flags &= ~BP_WATCHPOINT_HIT;
        }
    }
-    if (debug_excp_handler) {
-        debug_excp_handler(env);
-    }
+
+    cc->debug_excp_handler(cpu);
 }

 /* main execution loop */
@@ -216,10 +324,7 @@ volatile sig_atomic_t exit_request;
 int cpu_exec(CPUArchState *env)
 {
    CPUState *cpu = ENV_GET_CPU(env);
-#if !(defined(CONFIG_USER_ONLY) && \
-      (defined(TARGET_M68K) || defined(TARGET_PPC) || defined(TARGET_S390X)))
    CPUClass *cc = CPU_GET_CLASS(cpu);
-#endif
 #ifdef TARGET_I386
    X86CPU *x86_cpu = X86_CPU(cpu);
 #endif
@@ -227,6 +332,8 @@ int cpu_exec(CPUArchState *env)
    TranslationBlock *tb;
    uint8_t *tc_ptr;
    uintptr_t next_tb;
+    SyncClocks sc;
+
    /* This must be volatile so it is not trashed by longjmp() */
    volatile bool have_tb_lock = false;

@@ -252,36 +359,14 @@ int cpu_exec(CPUArchState *env)
        cpu->exit_request = 1;
    }

-#if defined(TARGET_I386)
-    /* put eflags in CPU temporary format */
-    CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
-    env->df = 1 - (2 * ((env->eflags >> 10) & 1));
-    CC_OP = CC_OP_EFLAGS;
-    env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
-#elif defined(TARGET_SPARC)
-#elif defined(TARGET_M68K)
-    env->cc_op = CC_OP_FLAGS;
-    env->cc_dest = env->sr & 0xf;
-    env->cc_x = (env->sr >> 4) & 1;
-#elif defined(TARGET_ALPHA)
-#elif defined(TARGET_ARM)
-#elif defined(TARGET_UNICORE32)
-#elif defined(TARGET_PPC)
-    env->reserve_addr = -1;
-#elif defined(TARGET_LM32)
-#elif defined(TARGET_MICROBLAZE)
-#elif defined(TARGET_MIPS)
-#elif defined(TARGET_MOXIE)
-#elif defined(TARGET_OPENRISC)
-#elif defined(TARGET_SH4)
-#elif defined(TARGET_CRIS)
-#elif defined(TARGET_S390X)
-#elif defined(TARGET_XTENSA)
-    /* XXXXX */
-#else
-#error unsupported target CPU
-#endif
-    cpu->exception_index = -1;
+    cc->cpu_exec_enter(cpu);
+
+    /* Calculate difference between guest clock and host clock.
+     * This delay includes the delay of the last cycle, so
+     * what we have to do is sleep until it is 0. As for the
+     * advance/delay we gain here, we try to fix it next time.
+     */
+    init_delay_params(&sc, cpu);

    /* prepare setjmp context for exception handling */
    for(;;) {
@@ -294,6 +379,7 @@ int cpu_exec(CPUArchState *env)
                    if (ret == EXCP_DEBUG) {
                        cpu_handle_debug_exception(env);
                    }
+                    cpu->exception_index = -1;
                    break;
                } else {
 #if defined(CONFIG_USER_ONLY)
@@ -304,6 +390,7 @@ int cpu_exec(CPUArchState *env)
                    cc->do_interrupt(cpu);
 #endif
                    ret = cpu->exception_index;
+                    cpu->exception_index = -1;
                    break;
 #else
                    cc->do_interrupt(cpu);
@@ -325,16 +412,12 @@ int cpu_exec(CPUArchState *env)
                        cpu->exception_index = EXCP_DEBUG;
                        cpu_loop_exit(cpu);
                    }
-#if defined(TARGET_ARM) || defined(TARGET_SPARC) || defined(TARGET_MIPS) || \
-    defined(TARGET_PPC) || defined(TARGET_ALPHA) || defined(TARGET_CRIS) || \
-    defined(TARGET_MICROBLAZE) || defined(TARGET_LM32) || defined(TARGET_UNICORE32)
                    if (interrupt_request & CPU_INTERRUPT_HALT) {
                        cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
                        cpu->halted = 1;
                        cpu->exception_index = EXCP_HLT;
                        cpu_loop_exit(cpu);
                    }
-#endif
 #if defined(TARGET_I386)
                    if (interrupt_request & CPU_INTERRUPT_INIT) {
                        cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
@@ -347,251 +430,15 @@ int cpu_exec(CPUArchState *env)
                        cpu_reset(cpu);
                    }
 #endif
-#if defined(TARGET_I386)
-#if !defined(CONFIG_USER_ONLY)
-                    if (interrupt_request & CPU_INTERRUPT_POLL) {
-                        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
-                        apic_poll_irq(x86_cpu->apic_state);
-                    }
-#endif
-                    if (interrupt_request & CPU_INTERRUPT_SIPI) {
-                            do_cpu_sipi(x86_cpu);
-                    } else if (env->hflags2 & HF2_GIF_MASK) {
-                        if ((interrupt_request & CPU_INTERRUPT_SMI) &&
-                            !(env->hflags & HF_SMM_MASK)) {
-                            cpu_svm_check_intercept_param(env, SVM_EXIT_SMI,
-                                                          0);
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
-                            do_smm_enter(x86_cpu);
-                            next_tb = 0;
-                        } else if ((interrupt_request & CPU_INTERRUPT_NMI) &&
-                                   !(env->hflags2 & HF2_NMI_MASK)) {
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
-                            env->hflags2 |= HF2_NMI_MASK;
-                            do_interrupt_x86_hardirq(env, EXCP02_NMI, 1);
-                            next_tb = 0;
-                        } else if (interrupt_request & CPU_INTERRUPT_MCE) {
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_MCE;
-                            do_interrupt_x86_hardirq(env, EXCP12_MCHK, 0);
-                            next_tb = 0;
-                        } else if ((interrupt_request & CPU_INTERRUPT_HARD) &&
-                                   (((env->hflags2 & HF2_VINTR_MASK) && 
-                                     (env->hflags2 & HF2_HIF_MASK)) ||
-                                    (!(env->hflags2 & HF2_VINTR_MASK) && 
-                                     (env->eflags & IF_MASK && 
-                                      !(env->hflags & HF_INHIBIT_IRQ_MASK))))) {
-                            int intno;
-                            cpu_svm_check_intercept_param(env, SVM_EXIT_INTR,
-                                                          0);
-                            cpu->interrupt_request &= ~(CPU_INTERRUPT_HARD |
-                                                        CPU_INTERRUPT_VIRQ);
-                            intno = cpu_get_pic_interrupt(env);
-                            qemu_log_mask(CPU_LOG_TB_IN_ASM, "Servicing hardware INT=0x%02x\n", intno);
-                            do_interrupt_x86_hardirq(env, intno, 1);
-                            /* ensure that no TB jump will be modified as
-                               the program flow was changed */
-                            next_tb = 0;
-#if !defined(CONFIG_USER_ONLY)
-                        } else if ((interrupt_request & CPU_INTERRUPT_VIRQ) &&
-                                   (env->eflags & IF_MASK) && 
-                                   !(env->hflags & HF_INHIBIT_IRQ_MASK)) {
-                            int intno;
-                            /* FIXME: this should respect TPR */
-                            cpu_svm_check_intercept_param(env, SVM_EXIT_VINTR,
-                                                          0);
-                            intno = ldl_phys(cpu->as,
-                                             env->vm_vmcb
-                                             + offsetof(struct vmcb,
-                                                        control.int_vector));
-                            qemu_log_mask(CPU_LOG_TB_IN_ASM, "Servicing virtual hardware INT=0x%02x\n", intno);
-                            do_interrupt_x86_hardirq(env, intno, 1);
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_VIRQ;
-                            next_tb = 0;
-#endif
-                        }
-                    }
-#elif defined(TARGET_PPC)
-                    if (interrupt_request & CPU_INTERRUPT_HARD) {
-                        ppc_hw_interrupt(env);
-                        if (env->pending_interrupts == 0) {
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
-                        }
+                    /* The target hook has 3 exit conditions:
+                       False when the interrupt isn't processed,
+                       True when it is, and we should restart on a new TB,
+                       and via longjmp via cpu_loop_exit.  */
+                    if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
                        next_tb = 0;
                    }
-#elif defined(TARGET_LM32)
-                    if ((interrupt_request & CPU_INTERRUPT_HARD)
-                        && (env->ie & IE_IE)) {
-                        cpu->exception_index = EXCP_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_MICROBLAZE)
-                    if ((interrupt_request & CPU_INTERRUPT_HARD)
-                        && (env->sregs[SR_MSR] & MSR_IE)
-                        && !(env->sregs[SR_MSR] & (MSR_EIP | MSR_BIP))
-                        && !(env->iflags & (D_FLAG | IMM_FLAG))) {
-                        cpu->exception_index = EXCP_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_MIPS)
-                    if ((interrupt_request & CPU_INTERRUPT_HARD) &&
-                        cpu_mips_hw_interrupts_pending(env)) {
-                        /* Raise it */
-                        cpu->exception_index = EXCP_EXT_INTERRUPT;
-                        env->error_code = 0;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_OPENRISC)
-                    {
-                        int idx = -1;
-                        if ((interrupt_request & CPU_INTERRUPT_HARD)
-                            && (env->sr & SR_IEE)) {
-                            idx = EXCP_INT;
-                        }
-                        if ((interrupt_request & CPU_INTERRUPT_TIMER)
-                            && (env->sr & SR_TEE)) {
-                            idx = EXCP_TICK;
-                        }
-                        if (idx >= 0) {
-                            cpu->exception_index = idx;
-                            cc->do_interrupt(cpu);
-                            next_tb = 0;
-                        }
-                    }
-#elif defined(TARGET_SPARC)
-                    if (interrupt_request & CPU_INTERRUPT_HARD) {
-                        if (cpu_interrupts_enabled(env) &&
-                            env->interrupt_index > 0) {
-                            int pil = env->interrupt_index & 0xf;
-                            int type = env->interrupt_index & 0xf0;
-
-                            if (((type == TT_EXTINT) &&
-                                  cpu_pil_allowed(env, pil)) ||
-                                  type != TT_EXTINT) {
-                                cpu->exception_index = env->interrupt_index;
-                                cc->do_interrupt(cpu);
-                                next_tb = 0;
-                            }
-                        }
-                    }
-#elif defined(TARGET_ARM)
-                    if (interrupt_request & CPU_INTERRUPT_FIQ
-                        && !(env->daif & PSTATE_F)) {
-                        cpu->exception_index = EXCP_FIQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-                    /* ARMv7-M interrupt return works by loading a magic value
-                       into the PC.  On real hardware the load causes the
-                       return to occur.  The qemu implementation performs the
-                       jump normally, then does the exception return when the
-                       CPU tries to execute code at the magic address.
-                       This will cause the magic PC value to be pushed to
-                       the stack if an interrupt occurred at the wrong time.
-                       We avoid this by disabling interrupts when
-                       pc contains a magic address.  */
-                    if (interrupt_request & CPU_INTERRUPT_HARD
-                        && ((IS_M(env) && env->regs[15] < 0xfffffff0)
-                            || !(env->daif & PSTATE_I))) {
-                        cpu->exception_index = EXCP_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_UNICORE32)
-                    if (interrupt_request & CPU_INTERRUPT_HARD
-                        && !(env->uncached_asr & ASR_I)) {
-                        cpu->exception_index = UC32_EXCP_INTR;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_SH4)
-                    if (interrupt_request & CPU_INTERRUPT_HARD) {
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_ALPHA)
-                    {
-                        int idx = -1;
-                        /* ??? This hard-codes the OSF/1 interrupt levels.  */
-                        switch (env->pal_mode ? 7 : env->ps & PS_INT_MASK) {
-                        case 0 ... 3:
-                            if (interrupt_request & CPU_INTERRUPT_HARD) {
-                                idx = EXCP_DEV_INTERRUPT;
-                            }
-                            /* FALLTHRU */
-                        case 4:
-                            if (interrupt_request & CPU_INTERRUPT_TIMER) {
-                                idx = EXCP_CLK_INTERRUPT;
-                            }
-                            /* FALLTHRU */
-                        case 5:
-                            if (interrupt_request & CPU_INTERRUPT_SMP) {
-                                idx = EXCP_SMP_INTERRUPT;
-                            }
-                            /* FALLTHRU */
-                        case 6:
-                            if (interrupt_request & CPU_INTERRUPT_MCHK) {
-                                idx = EXCP_MCHK;
-                            }
-                        }
-                        if (idx >= 0) {
-                            cpu->exception_index = idx;
-                            env->error_code = 0;
-                            cc->do_interrupt(cpu);
-                            next_tb = 0;
-                        }
-                    }
-#elif defined(TARGET_CRIS)
-                    if (interrupt_request & CPU_INTERRUPT_HARD
-                        && (env->pregs[PR_CCS] & I_FLAG)
-                        && !env->locked_irq) {
-                        cpu->exception_index = EXCP_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-                    if (interrupt_request & CPU_INTERRUPT_NMI) {
-                        unsigned int m_flag_archval;
-                        if (env->pregs[PR_VR] < 32) {
-                            m_flag_archval = M_FLAG_V10;
-                        } else {
-                            m_flag_archval = M_FLAG_V32;
-                        }
-                        if ((env->pregs[PR_CCS] & m_flag_archval)) {
-                            cpu->exception_index = EXCP_NMI;
-                            cc->do_interrupt(cpu);
-                            next_tb = 0;
-                        }
-                    }
-#elif defined(TARGET_M68K)
-                    if (interrupt_request & CPU_INTERRUPT_HARD
-                        && ((env->sr & SR_I) >> SR_I_SHIFT)
-                            < env->pending_level) {
-                        /* Real hardware gets the interrupt vector via an
-                           IACK cycle at this point.  Current emulated
-                           hardware doesn't rely on this, so we
-                           provide/save the vector when the interrupt is
-                           first signalled.  */
-                        cpu->exception_index = env->pending_vector;
-                        do_interrupt_m68k_hardirq(env);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_S390X) && !defined(CONFIG_USER_ONLY)
-                    if ((interrupt_request & CPU_INTERRUPT_HARD) &&
-                        (env->psw.mask & PSW_MASK_EXT)) {
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_XTENSA)
-                    if (interrupt_request & CPU_INTERRUPT_HARD) {
-                        cpu->exception_index = EXC_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#endif
-                   /* Don't use the cached interrupt_request value,
-                      do_interrupt may have updated the EXITTB flag. */
+                    /* Don't use the cached interrupt_request value,
+                       do_interrupt may have updated the EXITTB flag. */
                    if (cpu->interrupt_request & CPU_INTERRUPT_EXITTB) {
                        cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
                        /* ensure that no TB jump will be modified as
@@ -637,6 +484,7 @@ int cpu_exec(CPUArchState *env)
                cpu->current_tb = tb;
                barrier();
                if (likely(!cpu->exit_request)) {
+                    trace_exec_tb(tb, tb->pc);
                    tc_ptr = tb->tc_ptr;
                    /* execute the generated code */
                    next_tb = cpu_tb_exec(cpu, tc_ptr);
@@ -672,6 +520,7 @@ int cpu_exec(CPUArchState *env)
                            if (insns_left > 0) {
                                /* Execute remaining instructions.  */
                                cpu_exec_nocache(env, insns_left, tb);
+                                align_clocks(&sc, cpu);
                            }
                            cpu->exception_index = EXCP_INTERRUPT;
                            next_tb = 0;
@@ -684,6 +533,9 @@ int cpu_exec(CPUArchState *env)
                    }
                }
                cpu->current_tb = NULL;
+                /* Try to align the host and virtual clocks
+                   if the guest is in advance */
+                align_clocks(&sc, cpu);
                /* reset soft MMU for next block (it can currently
                   only be set by a memory fault) */
            } /* for(;;) */
@@ -692,10 +544,8 @@ int cpu_exec(CPUArchState *env)
             * local variables as longjmp is marked 'noreturn'. */
            cpu = current_cpu;
            env = cpu->env_ptr;
-#if !(defined(CONFIG_USER_ONLY) && \
-      (defined(TARGET_M68K) || defined(TARGET_PPC) || defined(TARGET_S390X)))
            cc = CPU_GET_CLASS(cpu);
-#endif
+            cpu->can_do_io = 1;
 #ifdef TARGET_I386
            x86_cpu = X86_CPU(cpu);
 #endif
@@ -706,35 +556,7 @@ int cpu_exec(CPUArchState *env)
        }
    } /* for(;;) */

-
-#if defined(TARGET_I386)
-    /* restore flags in standard format */
-    env->eflags = env->eflags | cpu_cc_compute_all(env, CC_OP)
-        | (env->df & DF_MASK);
-#elif defined(TARGET_ARM)
-    /* XXX: Save/restore host fpu exception state?.  */
-#elif defined(TARGET_UNICORE32)
-#elif defined(TARGET_SPARC)
-#elif defined(TARGET_PPC)
-#elif defined(TARGET_LM32)
-#elif defined(TARGET_M68K)
-    cpu_m68k_flush_flags(env, env->cc_op);
-    env->cc_op = CC_OP_FLAGS;
-    env->sr = (env->sr & 0xffe0)
-              | env->cc_dest | (env->cc_x << 4);
-#elif defined(TARGET_MICROBLAZE)
-#elif defined(TARGET_MIPS)
-#elif defined(TARGET_MOXIE)
-#elif defined(TARGET_OPENRISC)
-#elif defined(TARGET_SH4)
-#elif defined(TARGET_ALPHA)
-#elif defined(TARGET_CRIS)
-#elif defined(TARGET_S390X)
-#elif defined(TARGET_XTENSA)
-    /* XXXXX */
-#else
-#error unsupported target CPU
-#endif
+    cc->cpu_exec_exit(cpu);

    /* fail safe : never use current_cpu outside cpu_exec() */
    current_cpu = NULL;
--- a/cpus.c
+++ b/cpus.c
@@ -40,6 +40,7 @@
 #include "qemu/bitmap.h"
 #include "qemu/seqlock.h"
 #include "qapi-event.h"
+#include "hw/nmi.h"

 #ifndef _WIN32
 #include "qemu/compatfd.h"
@@ -64,6 +65,8 @@
 #endif /* CONFIG_LINUX */

 static CPUState *next_cpu;
+int64_t max_delay;
+int64_t max_advance;

 bool cpu_is_stopped(CPUState *cpu)
 {
@@ -102,17 +105,12 @@ static bool all_cpu_threads_idle(void)

 /* Protected by TimersState seqlock */

-/* Compensate for varying guest execution speed.  */
-static int64_t qemu_icount_bias;
-static int64_t vm_clock_warp_start;
+static int64_t vm_clock_warp_start = -1;
 /* Conversion factor from emulated instructions to virtual clock ticks.  */
 static int icount_time_shift;
 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 #define MAX_ICOUNT_SHIFT 10

-/* Only written by TCG thread */
-static int64_t qemu_icount;
-
 static QEMUTimer *icount_rt_timer;
 static QEMUTimer *icount_vm_timer;
 static QEMUTimer *icount_warp_timer;
@@ -129,24 +127,36 @@ typedef struct TimersState {
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;
+
+    /* Compensate for varying guest execution speed.  */
+    int64_t qemu_icount_bias;
+    /* Only written by TCG thread */
+    int64_t qemu_icount;
 } TimersState;

 static TimersState timers_state;

-/* Return the virtual CPU time, based on the instruction counter.  */
-static int64_t cpu_get_icount_locked(void)
+int64_t cpu_get_icount_raw(void)
 {
    int64_t icount;
    CPUState *cpu = current_cpu;

-    icount = qemu_icount;
+    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu_can_do_io(cpu)) {
-            fprintf(stderr, "Bad clock read\n");
+            fprintf(stderr, "Bad icount read\n");
+            exit(1);
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
-    return qemu_icount_bias + (icount << icount_time_shift);
+    return icount;
+}
+
+/* Return the virtual CPU time, based on the instruction counter.  */
+static int64_t cpu_get_icount_locked(void)
+{
+    int64_t icount = cpu_get_icount_raw();
+    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 }

 int64_t cpu_get_icount(void)
@@ -162,6 +172,11 @@ int64_t cpu_get_icount(void)
    return icount;
 }

+int64_t cpu_icount_to_ns(int64_t icount)
+{
+    return icount << icount_time_shift;
+}
+
 /* return the host CPU cycle counter and handle stop/restart */
 /* Caller must hold the BQL */
 int64_t cpu_get_ticks(void)
@@ -214,6 +229,23 @@ int64_t cpu_get_clock(void)
    return ti;
 }

+/* return the offset between the host clock and virtual CPU clock */
+int64_t cpu_get_clock_offset(void)
+{
+    int64_t ti;
+    unsigned start;
+
+    do {
+        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
+        ti = timers_state.cpu_clock_offset;
+        if (!timers_state.cpu_ticks_enabled) {
+            ti -= get_clock();
+        }
+    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
+
+    return -ti;
+}
+
 /* enable cpu_get_ticks()
 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
 */
@@ -284,14 +316,15 @@ static void icount_adjust(void)
        icount_time_shift++;
    }
    last_delta = delta;
-    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
+    timers_state.qemu_icount_bias = cur_icount
+                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 }

 static void icount_adjust_rt(void *opaque)
 {
    timer_mod(icount_rt_timer,
-                   qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
 }

@@ -319,7 +352,7 @@ static void icount_warp_rt(void *opaque)

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
-        int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+        int64_t clock = cpu_get_clock_locked();
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
@@ -328,12 +361,11 @@ static void icount_warp_rt(void *opaque)
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
-            int64_t cur_time = cpu_get_clock_locked();
            int64_t cur_icount = cpu_get_icount_locked();
-            int64_t delta = cur_time - cur_icount;
+            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
-        qemu_icount_bias += warp_delta;
+        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
@@ -351,7 +383,7 @@ void qtest_clock_warp(int64_t dest)
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
        seqlock_write_lock(&timers_state.vm_clock_seqlock);
-        qemu_icount_bias += warp;
+        timers_state.qemu_icount_bias += warp;
        seqlock_write_unlock(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
@@ -393,7 +425,7 @@ void qemu_clock_warp(QEMUClockType type)
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
-    clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        return;
@@ -411,8 +443,8 @@ void qemu_clock_warp(QEMUClockType type)
         * sleep in icount mode if there is a pending QEMU_CLOCK_VIRTUAL
         * timer; rather time could just advance to the next QEMU_CLOCK_VIRTUAL
         * event.  Instead, we do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL
-         * after some e"real" time, (related to the time left until the next
-         * event) has passed. The QEMU_CLOCK_REALTIME timer will do this.
+         * after some "real" time, (related to the time left until the next
+         * event) has passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
         * This avoids that the warps are visible externally; for example,
         * you will not be sending network packets continuously instead of
         * every 100ms.
@@ -428,6 +460,25 @@ void qemu_clock_warp(QEMUClockType type)
    }
 }

+static bool icount_state_needed(void *opaque)
+{
+    return use_icount;
+}
+
+/*
+ * This is a subsection for icount migration.
+ */
+static const VMStateDescription icount_vmstate_timers = {
+    .name = "timer/icount",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_INT64(qemu_icount_bias, TimersState),
+        VMSTATE_INT64(qemu_icount, TimersState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
@@ -437,23 +488,48 @@ static const VMStateDescription vmstate_timers = {
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
+    },
+    .subsections = (VMStateSubsection[]) {
+        {
+            .vmsd = &icount_vmstate_timers,
+            .needed = icount_state_needed,
+        }, {
+            /* empty */
+        }
    }
 };

-void configure_icount(const char *option)
+void cpu_ticks_init(void)
 {
    seqlock_init(&timers_state.vm_clock_seqlock, NULL);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
+}
+
+void configure_icount(QemuOpts *opts, Error **errp)
+{
+    const char *option;
+    char *rem_str = NULL;
+
+    option = qemu_opt_get(opts, "shift");
    if (!option) {
+        if (qemu_opt_get(opts, "align") != NULL) {
+            error_setg(errp, "Please specify shift option when using align");
+        }
        return;
    }
-
-    icount_warp_timer = timer_new_ns(QEMU_CLOCK_REALTIME,
-                                          icount_warp_rt, NULL);
+    icount_align_option = qemu_opt_get_bool(opts, "align", false);
+    icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
+                                     icount_warp_rt, NULL);
    if (strcmp(option, "auto") != 0) {
-        icount_time_shift = strtol(option, NULL, 0);
+        errno = 0;
+        icount_time_shift = strtol(option, &rem_str, 0);
+        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
+            error_setg(errp, "icount: Invalid shift value");
+        }
        use_icount = 1;
        return;
+    } else if (icount_align_option) {
+        error_setg(errp, "shift=auto and align=on are incompatible");
    }

    use_icount = 2;
@@ -467,10 +543,10 @@ void configure_icount(const char *option)
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
-    icount_rt_timer = timer_new_ms(QEMU_CLOCK_REALTIME,
-                                        icount_adjust_rt, NULL);
+    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
+                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
-                   qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
+                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                        icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
@@ -523,6 +599,15 @@ void cpu_synchronize_all_post_init(void)
    }
 }

+void cpu_clean_all_dirty(void)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        cpu_clean_state(cpu);
+    }
+}
+
 static int do_vm_stop(RunState state)
 {
    int ret = 0;
@@ -855,6 +940,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
    qemu_mutex_lock(&qemu_global_mutex);
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
+    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
@@ -895,6 +981,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
+    cpu->can_do_io = 1;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
@@ -937,6 +1024,7 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
+        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

@@ -1250,7 +1338,8 @@ static int tcg_cpu_exec(CPUArchState *env)
        int64_t count;
        int64_t deadline;
        int decr;
-        qemu_icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
+        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
@@ -1265,7 +1354,7 @@ static int tcg_cpu_exec(CPUArchState *env)
        }

        count = qemu_icount_round(deadline);
-        qemu_icount += count;
+        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
@@ -1278,7 +1367,8 @@ static int tcg_cpu_exec(CPUArchState *env)
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
-        qemu_icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
+        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+                        + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
    }
@@ -1342,6 +1432,9 @@ CpuInfoList *qmp_query_cpus(Error **errp)
 #elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
+#elif defined(TARGET_TRICORE)
+        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
+        CPUTriCoreState *env = &tricore_cpu->env;
 #endif

        cpu_synchronize_state(cpu);
@@ -1366,6 +1459,9 @@ CpuInfoList *qmp_query_cpus(Error **errp)
 #elif defined(TARGET_MIPS)
        info->value->has_PC = true;
        info->value->PC = env->active_tc.PC;
+#elif defined(TARGET_TRICORE)
+        info->value->has_PC = true;
+        info->value->PC = env->PC;
 #endif

        /* XXX: waiting for the qapi to support GSList */
@@ -1469,21 +1565,24 @@ void qmp_inject_nmi(Error **errp)
            apic_deliver_nmi(cpu->apic_state);
        }
    }
-#elif defined(TARGET_S390X)
-    CPUState *cs;
-    S390CPU *cpu;
-
-    CPU_FOREACH(cs) {
-        cpu = S390_CPU(cs);
-        if (cpu->env.cpu_num == monitor_get_cpu_index()) {
-            if (s390_cpu_restart(S390_CPU(cs)) == -1) {
-                error_set(errp, QERR_UNSUPPORTED);
-                return;
-            }
-            break;
-        }
-    }
 #else
-    error_set(errp, QERR_UNSUPPORTED);
+    nmi_monitor_handle(monitor_get_cpu_index(), errp);
 #endif
 }
+
+void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
+{
+    if (!use_icount) {
+        return;
+    }
+
+    cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
+                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
+    if (icount_align_option) {
+        cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
+        cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
+    } else {
+        cpu_fprintf(f, "Max guest delay     NA\n");
+        cpu_fprintf(f, "Max guest advance   NA\n");
+    }
+}
--- a/cputlb.c
+++ b/cputlb.c
@@ -60,8 +60,10 @@ void tlb_flush(CPUState *cpu, int flush_global)
    cpu->current_tb = NULL;

    memset(env->tlb_table, -1, sizeof(env->tlb_table));
+    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));

+    env->vtlb_index = 0;
    env->tlb_flush_addr = -1;
    env->tlb_flush_mask = 0;
    tlb_flush_count++;
@@ -108,6 +110,14 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
    }

+    /* check whether there are entries that need to be flushed in the vtlb */
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        int k;
+        for (k = 0; k < CPU_VTLB_SIZE; k++) {
+            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
+        }
+    }
+
    tb_flush_jmp_cache(cpu, addr);
 }

@@ -172,6 +182,11 @@ void cpu_tlb_reset_dirty_all(ram_addr_t start1, ram_addr_t length)
                tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
                                      start1, length);
            }
+
+            for (i = 0; i < CPU_VTLB_SIZE; i++) {
+                tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
+                                      start1, length);
+            }
        }
    }
 }
@@ -195,6 +210,13 @@ void tlb_set_dirty(CPUArchState *env, target_ulong vaddr)
    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
        tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
    }
+
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        int k;
+        for (k = 0; k < CPU_VTLB_SIZE; k++) {
+            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
+        }
+    }
 }

 /* Our TLB does not support large pages, so remember the area covered by
@@ -235,6 +257,7 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
    uintptr_t addend;
    CPUTLBEntry *te;
    hwaddr iotlb, xlat, sz;
+    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;

    assert(size >= TARGET_PAGE_SIZE);
    if (size != TARGET_PAGE_SIZE) {
@@ -247,7 +270,8 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
    assert(sz >= TARGET_PAGE_SIZE);

 #if defined(DEBUG_TLB)
-    printf("tlb_set_page: vaddr=" TARGET_FMT_lx " paddr=0x" TARGET_FMT_plx
+    qemu_log_mask(CPU_LOG_MMU,
+           "tlb_set_page: vaddr=" TARGET_FMT_lx " paddr=0x" TARGET_FMT_plx
           " prot=%x idx=%d\n",
           vaddr, paddr, prot, mmu_idx);
 #endif
@@ -267,8 +291,14 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
                                            prot, &address);

    index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    env->iotlb[mmu_idx][index] = iotlb - vaddr;
    te = &env->tlb_table[mmu_idx][index];
+
+    /* do not discard the translation in te, evict it into a victim tlb */
+    env->tlb_v_table[mmu_idx][vidx] = *te;
+    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
+
+    /* refill the tlb */
+    env->iotlb[mmu_idx][index] = iotlb - vaddr;
    te->addend = addend - vaddr;
    if (prot & PAGE_READ) {
        te->addr_read = address;
--- a/default-configs/mips-softmmu.mak
+++ b/default-configs/mips-softmmu.mak
@@ -32,6 +32,5 @@ CONFIG_G364FB=y
 CONFIG_I8259=y
 CONFIG_JAZZ_LED=y
 CONFIG_MC146818RTC=y
-CONFIG_VT82C686=y
 CONFIG_ISA_TESTDEV=y
 CONFIG_EMPTY_SLOT=y
--- a/default-configs/mips64-softmmu.mak
+++ b/default-configs/mips64-softmmu.mak
@@ -32,6 +32,5 @@ CONFIG_G364FB=y
 CONFIG_I8259=y
 CONFIG_JAZZ_LED=y
 CONFIG_MC146818RTC=y
-CONFIG_VT82C686=y
 CONFIG_ISA_TESTDEV=y
 CONFIG_EMPTY_SLOT=y
--- a/default-configs/mipsel-softmmu.mak
+++ b/default-configs/mipsel-softmmu.mak
@@ -32,6 +32,5 @@ CONFIG_G364FB=y
 CONFIG_I8259=y
 CONFIG_JAZZ_LED=y
 CONFIG_MC146818RTC=y
-CONFIG_VT82C686=y
 CONFIG_ISA_TESTDEV=y
 CONFIG_EMPTY_SLOT=y
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -30,3 +30,5 @@ CONFIG_IPACK=y
 CONFIG_WDT_IB6300ESB=y
 CONFIG_PCI_TESTDEV=y
 CONFIG_NVME_PCI=y
+CONFIG_SD=y
+CONFIG_SDHCI=y
--- a/default-configs/ppc-softmmu.mak
+++ b/default-configs/ppc-softmmu.mak
@@ -45,8 +45,8 @@ CONFIG_PREP=y
 CONFIG_MAC=y
 CONFIG_E500=y
 CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
+CONFIG_ETSEC=y
+CONFIG_LIBDECNUMBER=y
 # For PReP
 CONFIG_MC146818RTC=y
-CONFIG_ETSEC=y
 CONFIG_ISA_TESTDEV=y
-CONFIG_LIBDECNUMBER=y
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -46,6 +46,8 @@ CONFIG_PREP=y
 CONFIG_MAC=y
 CONFIG_E500=y
 CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
+CONFIG_ETSEC=y
+CONFIG_LIBDECNUMBER=y
 # For pSeries
 CONFIG_XICS=$(CONFIG_PSERIES)
 CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
@@ -58,4 +60,3 @@ CONFIG_I82374=y
 CONFIG_I8257=y
 CONFIG_MC146818RTC=y
 CONFIG_ISA_TESTDEV=y
-CONFIG_LIBDECNUMBER=y
--- a/default-configs/s390x-softmmu.mak
+++ b/default-configs/s390x-softmmu.mak
@@ -1,3 +1,4 @@
+include pci.mak
 CONFIG_VIRTIO=y
 CONFIG_SCLPCONSOLE=y
 CONFIG_S390_FLIC=y
--- a/default-configs/tricore-softmmu.mak
+++ b/default-configs/tricore-softmmu.mak
--- a/device-hotplug.c
+++ b/device-hotplug.c
@@ -24,6 +24,7 @@

 #include "hw/hw.h"
 #include "hw/boards.h"
+#include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "qemu/config-file.h"
 #include "sysemu/sysemu.h"
@@ -76,6 +77,6 @@ void drive_hot_add(Monitor *mon, const QDict *qdict)

 err:
    if (dinfo) {
-        drive_del(dinfo);
+        blk_unref(blk_by_legacy_dinfo(dinfo));
    }
 }
--- a/device_tree.c
+++ b/device_tree.c
@@ -20,6 +20,7 @@

 #include "config.h"
 #include "qemu-common.h"
+#include "qemu/error-report.h"
 #include "sysemu/device_tree.h"
 #include "sysemu/sysemu.h"
 #include "hw/loader.h"
@@ -59,13 +60,13 @@ void *create_device_tree(int *sizep)
    }
    ret = fdt_open_into(fdt, fdt, *sizep);
    if (ret) {
-        fprintf(stderr, "Unable to copy device tree in memory\n");
+        error_report("Unable to copy device tree in memory");
        exit(1);
    }

    return fdt;
 fail:
-    fprintf(stderr, "%s Couldn't create dt: %s\n", __func__, fdt_strerror(ret));
+    error_report("%s Couldn't create dt: %s", __func__, fdt_strerror(ret));
    exit(1);
 }

@@ -79,8 +80,8 @@ void *load_device_tree(const char *filename_path, int *sizep)
    *sizep = 0;
    dt_size = get_image_size(filename_path);
    if (dt_size < 0) {
-        printf("Unable to get size of device tree file '%s'\n",
-            filename_path);
+        error_report("Unable to get size of device tree file '%s'",
+                     filename_path);
        goto fail;
    }

@@ -92,21 +93,21 @@ void *load_device_tree(const char *filename_path, int *sizep)

    dt_file_load_size = load_image(filename_path, fdt);
    if (dt_file_load_size < 0) {
-        printf("Unable to open device tree file '%s'\n",
-               filename_path);
+        error_report("Unable to open device tree file '%s'",
+                     filename_path);
        goto fail;
    }

    ret = fdt_open_into(fdt, fdt, dt_size);
    if (ret) {
-        printf("Unable to copy device tree in memory\n");
+        error_report("Unable to copy device tree in memory");
        goto fail;
    }

    /* Check sanity of device tree */
    if (fdt_check_header(fdt)) {
-        printf ("Device tree file loaded into memory is invalid: %s\n",
-            filename_path);
+        error_report("Device tree file loaded into memory is invalid: %s",
+                     filename_path);
        goto fail;
    }
    *sizep = dt_size;
@@ -123,8 +124,8 @@ static int findnode_nofail(void *fdt, const char *node_path)

    offset = fdt_path_offset(fdt, node_path);
    if (offset < 0) {
-        fprintf(stderr, "%s Couldn't find node %s: %s\n", __func__, node_path,
-                fdt_strerror(offset));
+        error_report("%s Couldn't find node %s: %s", __func__, node_path,
+                     fdt_strerror(offset));
        exit(1);
    }

@@ -138,8 +139,8 @@ int qemu_fdt_setprop(void *fdt, const char *node_path,

    r = fdt_setprop(fdt, findnode_nofail(fdt, node_path), property, val, size);
    if (r < 0) {
-        fprintf(stderr, "%s: Couldn't set %s/%s: %s\n", __func__, node_path,
-                property, fdt_strerror(r));
+        error_report("%s: Couldn't set %s/%s: %s", __func__, node_path,
+                     property, fdt_strerror(r));
        exit(1);
    }

@@ -153,8 +154,8 @@ int qemu_fdt_setprop_cell(void *fdt, const char *node_path,

    r = fdt_setprop_cell(fdt, findnode_nofail(fdt, node_path), property, val);
    if (r < 0) {
-        fprintf(stderr, "%s: Couldn't set %s/%s = %#08x: %s\n", __func__,
-                node_path, property, val, fdt_strerror(r));
+        error_report("%s: Couldn't set %s/%s = %#08x: %s", __func__,
+                     node_path, property, val, fdt_strerror(r));
        exit(1);
    }

@@ -175,8 +176,8 @@ int qemu_fdt_setprop_string(void *fdt, const char *node_path,

    r = fdt_setprop_string(fdt, findnode_nofail(fdt, node_path), property, string);
    if (r < 0) {
-        fprintf(stderr, "%s: Couldn't set %s/%s = %s: %s\n", __func__,
-                node_path, property, string, fdt_strerror(r));
+        error_report("%s: Couldn't set %s/%s = %s: %s", __func__,
+                     node_path, property, string, fdt_strerror(r));
        exit(1);
    }

@@ -193,8 +194,8 @@ const void *qemu_fdt_getprop(void *fdt, const char *node_path,
    }
    r = fdt_getprop(fdt, findnode_nofail(fdt, node_path), property, lenp);
    if (!r) {
-        fprintf(stderr, "%s: Couldn't get %s/%s: %s\n", __func__,
-                node_path, property, fdt_strerror(*lenp));
+        error_report("%s: Couldn't get %s/%s: %s", __func__,
+                     node_path, property, fdt_strerror(*lenp));
        exit(1);
    }
    return r;
@@ -206,8 +207,8 @@ uint32_t qemu_fdt_getprop_cell(void *fdt, const char *node_path,
    int len;
    const uint32_t *p = qemu_fdt_getprop(fdt, node_path, property, &len);
    if (len != 4) {
-        fprintf(stderr, "%s: %s/%s not 4 bytes long (not a cell?)\n",
-                __func__, node_path, property);
+        error_report("%s: %s/%s not 4 bytes long (not a cell?)",
+                     __func__, node_path, property);
        exit(1);
    }
    return be32_to_cpu(*p);
@@ -219,8 +220,8 @@ uint32_t qemu_fdt_get_phandle(void *fdt, const char *path)

    r = fdt_get_phandle(fdt, findnode_nofail(fdt, path));
    if (r == 0) {
-        fprintf(stderr, "%s: Couldn't get phandle for %s: %s\n", __func__,
-                path, fdt_strerror(r));
+        error_report("%s: Couldn't get phandle for %s: %s", __func__,
+                     path, fdt_strerror(r));
        exit(1);
    }

@@ -265,8 +266,8 @@ int qemu_fdt_nop_node(void *fdt, const char *node_path)

    r = fdt_nop_node(fdt, findnode_nofail(fdt, node_path));
    if (r < 0) {
-        fprintf(stderr, "%s: Couldn't nop node %s: %s\n", __func__, node_path,
-                fdt_strerror(r));
+        error_report("%s: Couldn't nop node %s: %s", __func__, node_path,
+                     fdt_strerror(r));
        exit(1);
    }

@@ -294,8 +295,8 @@ int qemu_fdt_add_subnode(void *fdt, const char *name)

    retval = fdt_add_subnode(fdt, parent, basename);
    if (retval < 0) {
-        fprintf(stderr, "FDT: Failed to create subnode %s: %s\n", name,
-                fdt_strerror(retval));
+        error_report("FDT: Failed to create subnode %s: %s", name,
+                     fdt_strerror(retval));
        exit(1);
    }

@@ -323,6 +324,7 @@ int qemu_fdt_setprop_sized_cells_from_array(void *fdt,
    uint64_t value;
    int cellnum, vnum, ncells;
    uint32_t hival;
+    int ret;

    propcells = g_new0(uint32_t, numvalues * 2);

@@ -330,18 +332,23 @@ int qemu_fdt_setprop_sized_cells_from_array(void *fdt,
    for (vnum = 0; vnum < numvalues; vnum++) {
        ncells = values[vnum * 2];
        if (ncells != 1 && ncells != 2) {
-            return -1;
+            ret = -1;
+            goto out;
        }
        value = values[vnum * 2 + 1];
        hival = cpu_to_be32(value >> 32);
        if (ncells > 1) {
            propcells[cellnum++] = hival;
        } else if (hival != 0) {
-            return -1;
+            ret = -1;
+            goto out;
        }
        propcells[cellnum++] = cpu_to_be32(value);
    }

-    return qemu_fdt_setprop(fdt, node_path, property, propcells,
-                            cellnum * sizeof(uint32_t));
+    ret = qemu_fdt_setprop(fdt, node_path, property, propcells,
+                           cellnum * sizeof(uint32_t));
+out:
+    g_free(propcells);
+    return ret;
 }
--- a/disas/arm-a64.cc
+++ b/disas/arm-a64.cc
@@ -39,7 +39,7 @@ public:
    ~QEMUDisassembler() { }

 protected:
-    void ProcessOutput(Instruction *instr) {
+    virtual void ProcessOutput(const Instruction *instr) {
        fprintf(stream_, "%08" PRIx32 "      %s",
                instr->InstructionBits(), GetOutput());
    }
--- a/disas/libvixl/README
+++ b/disas/libvixl/README
@@ -2,7 +2,7 @@
 The code in this directory is a subset of libvixl:
 https://github.com/armvixl/vixl
 (specifically, it is the set of files needed for disassembly only,
-taken from libvixl 1.4).
+taken from libvixl 1.6).
 Bugfixes should preferably be sent upstream initially.

 The disassembler does not currently support the entire A64 instruction
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .1.0
 .2.50