Update version for v2.2.0 release

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Update version for v2.2.0-rc5 release
2014-12-09 12:13:37 +00:00 · 2014-12-04 15:51:22 +00:00 · 2014-12-04 12:22:46 +00:00 · 2014-12-01 13:35:26 +00:00 · 2014-12-01 12:29:35 +00:00 · 2014-12-01 10:25:46 +01:00
1017 changed files with 62257 additions and 16134 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,10 @@
 /trace/generated-tracers.dtrace
 /trace/generated-events.h
 /trace/generated-events.c
+/trace/generated-helpers-wrappers.h
+/trace/generated-helpers.h
+/trace/generated-helpers.c
+/trace/generated-tcg-tracers.h
 /trace/generated-ust-provider.h
 /trace/generated-ust.c
 /libcacard/trace/generated-tracers.c
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,7 @@ notifications:
    on_failure: always
 env:
  global:
-    - TEST_CMD="make check"
+    - TEST_CMD=""
    - EXTRA_CONFIG=""
    # Development packages, EXTRA_PKGS saved for additional builds
    - CORE_PKGS="libusb-1.0-0-dev libiscsi-dev librados-dev libncurses5-dev"
@@ -20,31 +20,51 @@ env:
    - GUI_PKGS="libgtk-3-dev libvte-2.90-dev libsdl1.2-dev libpng12-dev libpixman-1-dev"
    - EXTRA_PKGS=""
  matrix:
+    # Group major targets together with their linux-user counterparts
    - TARGETS=alpha-softmmu,alpha-linux-user
-    - TARGETS=arm-softmmu,arm-linux-user
-    - TARGETS=aarch64-softmmu,aarch64-linux-user
-    - TARGETS=cris-softmmu
-    - TARGETS=i386-softmmu,x86_64-softmmu
-    - TARGETS=lm32-softmmu
-    - TARGETS=m68k-softmmu
-    - TARGETS=microblaze-softmmu,microblazeel-softmmu
+    - TARGETS=arm-softmmu,arm-linux-user,armeb-linux-user,aarch64-softmmu,aarch64-linux-user
+    - TARGETS=cris-softmmu,cris-linux-user
+    - TARGETS=i386-softmmu,i386-linux-user,x86_64-softmmu,x86_64-linux-user
+    - TARGETS=m68k-softmmu,m68k-linux-user
+    - TARGETS=microblaze-softmmu,microblazeel-softmmu,microblaze-linux-user,microblazeel-linux-user
    - TARGETS=mips-softmmu,mips64-softmmu,mips64el-softmmu,mipsel-softmmu
-    - TARGETS=moxie-softmmu
-    - TARGETS=or32-softmmu,
-    - TARGETS=ppc-softmmu,ppc64-softmmu,ppcemb-softmmu
-    - TARGETS=s390x-softmmu
-    - TARGETS=sh4-softmmu,sh4eb-softmmu
-    - TARGETS=sparc-softmmu,sparc64-softmmu
-    - TARGETS=unicore32-softmmu
-    - TARGETS=xtensa-softmmu,xtensaeb-softmmu
+    - TARGETS=mips-linux-user,mips64-linux-user,mips64el-linux-user,mipsel-linux-user,mipsn32-linux-user,mipsn32el-linux-user
+    - TARGETS=or32-softmmu,or32-linux-user
+    - TARGETS=ppc-softmmu,ppc64-softmmu,ppcemb-softmmu,ppc-linux-user,ppc64-linux-user,ppc64abi32-linux-user,ppc64le-linux-user
+    - TARGETS=s390x-softmmu,s390x-linux-user
+    - TARGETS=sh4-softmmu,sh4eb-softmmu,sh4-linux-user,sh4eb-linux-user
+    - TARGETS=sparc-softmmu,sparc64-softmmu,sparc-linux-user,sparc32plus-linux-user,sparc64-linux-user
+    - TARGETS=unicore32-softmmu,unicore32-linux-user
+    # Group remaining softmmu only targets into one build
+    - TARGETS=lm32-softmmu,moxie-softmmu,tricore-softmmu,xtensa-softmmu,xtensaeb-softmmu
+git:
+  # we want to do this ourselves
+  submodules: false
 before_install:
+  - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
  - git submodule update --init --recursive
  - sudo apt-get update -qq
  - sudo apt-get install -qq ${CORE_PKGS} ${NET_PKGS} ${GUI_PKGS} ${EXTRA_PKGS}
-script: "./configure --target-list=${TARGETS} ${EXTRA_CONFIG} && make && ${TEST_CMD}"
+before_script:
+  - ./configure --target-list=${TARGETS} --enable-debug-tcg ${EXTRA_CONFIG}
+script:
+  - make -j2 && ${TEST_CMD}
 matrix:
  # We manually include a number of additional build for non-standard bits
  include:
+    # Make check target (we only do this once); "\" joins lines without spaces
+    - env:
+        - "TARGETS=alpha-softmmu,arm-softmmu,aarch64-softmmu,cris-softmmu,\
+                  i386-softmmu,x86_64-softmmu,m68k-softmmu,microblaze-softmmu,\
+                  microblazeel-softmmu,mips-softmmu,mips64-softmmu,\
+                  mips64el-softmmu,mipsel-softmmu,or32-softmmu,ppc-softmmu,\
+                  ppc64-softmmu,ppcemb-softmmu,s390x-softmmu,sh4-softmmu,\
+                  sh4eb-softmmu,sparc-softmmu,sparc64-softmmu,\
+                  unicore32-softmmu,unicore32-linux-user,\
+                  lm32-softmmu,moxie-softmmu,tricore-softmmu,xtensa-softmmu,\
+                  xtensaeb-softmmu
+          TEST_CMD='make check'"
+      compiler: gcc
    # Debug related options
    - env: TARGETS=i386-softmmu,x86_64-softmmu
           EXTRA_CONFIG="--enable-debug"
@@ -73,7 +93,6 @@ matrix:
      compiler: gcc
    - env: TARGETS=i386-softmmu,x86_64-softmmu
           EXTRA_CONFIG="--enable-trace-backends=ftrace"
-           TEST_CMD=""
      compiler: gcc
    - env: TARGETS=i386-softmmu,x86_64-softmmu
          EXTRA_PKGS="liblttng-ust-dev liburcu-dev"
--- a/CODING_STYLE
+++ b/CODING_STYLE
@@ -91,3 +91,17 @@ Mixed declarations (interleaving statements and declarations within blocks)
 are not allowed; declarations should be at the beginning of blocks.  In other
 words, the code should not generate warnings if using GCC's
 -Wdeclaration-after-statement option.
+
+6. Conditional statements
+
+When comparing a variable for (in)equality with a constant, list the
+constant on the right, as in:
+
+if (a == 1) {
+    /* Reads like: "If a equals 1" */
+    do_something();
+}
+
+Rationale: Yoda conditions (as in 'if (1 == a)') are awkward to read.
+Besides, good compilers already warn users when '==' is mis-typed as '=',
+even when the constant is on the right.
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -51,6 +51,7 @@ Descriptions of section entries:
 General Project Administration
 ------------------------------
 M: Anthony Liguori <aliguori@amazon.com>
+M: Peter Maydell <peter.maydell@linaro.org>

 Responsible Disclosure, Reporting Security Issues
 ------------------------------
@@ -61,11 +62,23 @@ L: secalert@redhat.com

 Guest CPU cores (TCG):
 ----------------------
+Overall
+L: qemu-devel@nongnu.org
+S: Odd fixes
+F: cpu-exec.c
+F: cputlb.c
+F: softmmu_template.h
+F: translate-all.c
+F: include/exec/cpu_ldst.h
+F: include/exec/cpu_ldst_template.h
+F: include/exec/helper*.h
+
 Alpha
 M: Richard Henderson <rth@twiddle.net>
 S: Maintained
 F: target-alpha/
 F: hw/alpha/
+F: tests/tcg/alpha/

 ARM
 M: Peter Maydell <peter.maydell@linaro.org>
@@ -79,6 +92,7 @@ M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
 S: Maintained
 F: target-cris/
 F: hw/cris/
+F: tests/tcg/cris/

 LM32
 M: Michael Walle <michael@walle.cc>
@@ -86,6 +100,7 @@ S: Maintained
 F: target-lm32/
 F: hw/lm32/
 F: hw/char/lm32_*
+F: tests/tcg/lm32/

 M68K
 S: Orphan
@@ -100,9 +115,11 @@ F: hw/microblaze/

 MIPS
 M: Aurelien Jarno <aurelien@aurel32.net>
-S: Odd Fixes
+M: Leon Alrae <leon.alrae@imgtec.com>
+S: Maintained
 F: target-mips/
 F: hw/mips/
+F: tests/tcg/mips/

 Moxie
 M: Anthony Green <green@moxielogic.com>
@@ -114,6 +131,7 @@ M: Jia Liu <proljc@gmail.com>
 S: Maintained
 F: target-openrisc/
 F: hw/openrisc/
+F: tests/tcg/openrisc/

 PowerPC
 M: Alexander Graf <agraf@suse.de>
@@ -149,7 +167,8 @@ F: target-unicore32/
 F: hw/unicore32/

 X86
-M: qemu-devel@nongnu.org
+M: Paolo Bonzini <pbonzini@redhat.com>
+M: Richard Henderson <rth@twiddle.net>
 S: Odd Fixes
 F: target-i386/
 F: hw/i386/
@@ -160,6 +179,13 @@ W: http://wiki.osll.spb.ru/doku.php?id=etc:users:jcmvbkbc:qemu-target-xtensa
 S: Maintained
 F: target-xtensa/
 F: hw/xtensa/
+F: tests/tcg/xtensa/
+
+TriCore
+M: Bastian Koppelmann <kbastian@mail.uni-paderborn.de>
+S: Maintained
+F: target-tricore/
+F: hw/tricore/

 Guest CPU Cores (KVM):
 ----------------------
@@ -192,9 +218,12 @@ M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Alexander Graf <agraf@suse.de>
 S: Maintained
 F: target-s390x/kvm.c
-F: hw/intc/s390_flic.[hc]
+F: hw/intc/s390_flic.c
+F: hw/intc/s390_flic_kvm.c
+F: include/hw/s390x/s390_flic.h

 X86
+M: Paolo Bonzini <pbonzini@redhat.com>
 M: Marcelo Tosatti <mtosatti@redhat.com>
 L: kvm@vger.kernel.org
 S: Supported
@@ -260,7 +289,7 @@ F: include/hw/arm/digic.h
 F: hw/*/digic*

 Gumstix
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Orphan
 F: hw/arm/gumstix.c

@@ -276,7 +305,7 @@ S: Maintained
 F: hw/arm/integratorcp.c

 Mainstone
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Orphan
 F: hw/arm/mainstone.c

@@ -382,7 +411,7 @@ S: Maintained
 F: hw/mips/mips_malta.c

 Mipssim
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Orphan
 F: hw/mips/mips_mipssim.c

@@ -515,6 +544,8 @@ F: hw/s390x/s390-virtio-ccw.c
 F: hw/s390x/css.[hc]
 F: hw/s390x/sclp*.[hc]
 F: hw/s390x/ipl*.[hc]
+F: include/hw/s390x/
+F: pc-bios/s390-ccw/
 T: git git://github.com/cohuck/qemu virtio-ccw-upstr

 UniCore32 Machines
@@ -552,12 +583,13 @@ Xtensa Machines
 sim
 M: Max Filippov <jcmvbkbc@gmail.com>
 S: Maintained
-F: hw/xtensa/xtensa_sim.c
+F: hw/xtensa/sim.c

-Avnet LX60
+XTFPGA (LX60, LX200, ML605, KC705)
 M: Max Filippov <jcmvbkbc@gmail.com>
 S: Maintained
-F: hw/xtensa/xtensa_lx60.c
+F: hw/xtensa/xtfpga.c
+F: hw/net/opencores_eth.c

 Devices
 -------
@@ -614,7 +646,13 @@ USB
 M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
 F: hw/usb/*
-F: tests/usb-hcd-ehci-test.c
+F: tests/usb-*-test.c
+
+USB (serial adapter)
+M: Gerd Hoffmann <kraxel@redhat.com>
+M: Samuel Thibault <samuel.thibault@ens-lyon.org>
+S: Maintained
+F: hw/usb/dev-serial.c

 VFIO
 M: Alex Williamson <alex.williamson@redhat.com>
@@ -678,6 +716,12 @@ S: Maintained
 F: hw/*/xilinx_*
 F: include/hw/xilinx.h

+Vmware
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: hw/net/vmxnet*
+F: hw/scsi/vmw_pvscsi*
+
 Subsystems
 ----------
 Audio
@@ -694,18 +738,30 @@ Block
 M: Kevin Wolf <kwolf@redhat.com>
 M: Stefan Hajnoczi <stefanha@redhat.com>
 S: Supported
+F: async.c
+F: aio-*.c
 F: block*
 F: block/
 F: hw/block/
 F: qemu-img*
 F: qemu-io*
+F: tests/image-fuzzer/
+F: tests/qemu-iotests/
 T: git git://repo.or.cz/qemu/kevin.git block
 T: git git://github.com/stefanha/qemu.git block

 Character Devices
 M: Anthony Liguori <aliguori@amazon.com>
+M: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
 F: qemu-char.c
+F: backends/msmouse.c
+F: backends/testdev.c
+
+Character Devices (Braille)
+M: Samuel Thibault <samuel.thibault@ens-lyon.org>
+S: Maintained
+F: backends/baum.c

 CPU
 M: Andreas Färber <afaerber@suse.de>
@@ -727,7 +783,7 @@ S: Maintained
 F: device_tree.[ch]

 GDB stub
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Odd Fixes
 F: gdbstub*
 F: gdb-xml/
@@ -764,7 +820,11 @@ F: ui/cocoa.m

 Main loop
 M: Anthony Liguori <aliguori@amazon.com>
-S: Supported
+M: Paolo Bonzini <pbonzini@redhat.com>
+S: Maintained
+F: cpus.c
+F: main-loop.c
+F: qemu-timer.c
 F: vl.c

 Human Monitor (HMP)
@@ -803,6 +863,7 @@ M: Luiz Capitulino <lcapitulino@redhat.com>
 M: Michael Roth <mdroth@linux.vnet.ibm.com>
 S: Maintained
 F: qapi/
+F: tests/qapi-schema/
 T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp

 QAPI Schema
@@ -813,6 +874,18 @@ S: Supported
 F: qapi-schema.json
 T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp

+QObject
+M: Luiz Capitulino <lcapitulino@redhat.com>
+S: Maintained
+F: qobject/
+T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp
+
+QEMU Guest Agent
+M: Michael Roth <mdroth@linux.vnet.ibm.com>
+S: Maintained
+F: qga/
+T: git git://github.com/mdroth/qemu.git qga
+
 QOM
 M: Anthony Liguori <aliguori@amazon.com>
 M: Andreas Färber <afaerber@suse.de>
@@ -853,6 +926,15 @@ M: Blue Swirl <blauwirbel@gmail.com>
 S: Odd Fixes
 F: scripts/checkpatch.pl

+Migration
+M: Juan Quintela <quintela@redhat.com>
+S: Maintained
+F: include/migration/
+F: migration*
+F: savevm.c
+F: arch_init.c
+F: vmstate.c
+
 Seccomp
 M: Eduardo Otubo <eduardo.otubo@profitbricks.com>
 S: Supported
@@ -861,6 +943,12 @@ F: include/sysemu/seccomp.h

 Usermode Emulation
 ------------------
+Overall
+M: Riku Voipio <riku.voipio@iki.fi>
+S: Maintained
+F: thunk.c
+F: user-exec.c
+
 BSD user
 M: Blue Swirl <blauwirbel@gmail.com>
 S: Maintained
@@ -874,7 +962,6 @@ F: linux-user/
 Tiny Code Generator (TCG)
 -------------------------
 Common code
-M: qemu-devel@nongnu.org
 M: Richard Henderson <rth@twiddle.net>
 S: Maintained
 F: tcg/
@@ -891,7 +978,7 @@ S: Maintained
 F: tcg/arm/

 i386 target
-M: qemu-devel@nongnu.org
+L: qemu-devel@nongnu.org
 S: Maintained
 F: tcg/i386/

@@ -968,7 +1055,7 @@ S: Supported
 F: block/rbd.c

 Sheepdog
-M: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
+M: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
 M: Liu Yuan <namei.unix@gmail.com>
 L: sheepdog@lists.wpkg.org
 S: Supported
@@ -1000,3 +1087,14 @@ SSH
 M: Richard W.M. Jones <rjones@redhat.com>
 S: Supported
 F: block/ssh.c
+
+ARCHIPELAGO
+M: Chrysostomos Nanakos <cnanakos@grnet.gr>
+M: Chrysostomos Nanakos <chris@include.gr>
+S: Maintained
+F: block/archipelago.c
+
+Bootdevice
+M: Gonglei <arei.gonglei@huawei.com>
+S: Maintained
+F: bootdevice.c
--- a/Makefile
+++ b/Makefile
@@ -57,6 +57,12 @@ GENERATED_HEADERS += trace/generated-tracers-dtrace.h
 endif
 GENERATED_SOURCES += trace/generated-tracers.c

+GENERATED_HEADERS += trace/generated-tcg-tracers.h
+
+GENERATED_HEADERS += trace/generated-helpers-wrappers.h
+GENERATED_HEADERS += trace/generated-helpers.h
+GENERATED_SOURCES += trace/generated-helpers.c
+
 ifeq ($(findstring ust,$(TRACE_BACKENDS)),ust)
 GENERATED_HEADERS += trace/generated-ust-provider.h
 GENERATED_SOURCES += trace/generated-ust.c
@@ -202,7 +208,7 @@ Makefile: $(version-obj-y) $(version-lobj-y)
 # Build libraries

 libqemustub.a: $(stub-obj-y)
-libqemuutil.a: $(util-obj-y) qapi-types.o qapi-visit.o qapi-event.o
+libqemuutil.a: $(util-obj-y)

 block-modules = $(foreach o,$(block-obj-m),"$(basename $(subst /,-,$o))",) NULL
 util/module.o-cflags = -D'CONFIG_BLOCK_MODULES=$(block-modules)'
@@ -412,6 +418,7 @@ endif
 	set -e; for x in $(KEYMAPS); do \
 		$(INSTALL_DATA) $(SRC_PATH)/pc-bios/keymaps/$$x "$(DESTDIR)$(qemu_datadir)/keymaps"; \
 	done
+	$(INSTALL_DATA) $(SRC_PATH)/trace-events "$(DESTDIR)$(qemu_datadir)/trace-events"
 	for d in $(TARGET_DIRS); do \
 	$(MAKE) $(SUBDIR_MAKEFLAGS) TARGET_DIR=$$d/ -C $$d $@ || exit 1 ; \
        done
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -1,7 +1,7 @@
 #######################################################################
 # Common libraries for tools and emulators
 stub-obj-y = stubs/
-util-obj-y = util/ qobject/ qapi/ trace/
+util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o qapi-event.o

 #######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img
@@ -12,7 +12,6 @@ block-obj-y += main-loop.o iohandler.o qemu-timer.o
 block-obj-$(CONFIG_POSIX) += aio-posix.o
 block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
-block-obj-y += qapi-types.o qapi-visit.o qapi-event.o
 block-obj-y += qemu-io-cmds.o

 block-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
@@ -51,7 +50,7 @@ common-obj-$(CONFIG_LINUX) += fsdev/

 common-obj-y += migration.o migration-tcp.o
 common-obj-y += vmstate.o
-common-obj-y += qemu-file.o
+common-obj-y += qemu-file.o qemu-file-unix.o qemu-file-stdio.o
 common-obj-$(CONFIG_RDMA) += migration-rdma.o
 common-obj-y += qemu-char.o #aio.o
 common-obj-y += block-migration.o
@@ -63,6 +62,7 @@ common-obj-$(CONFIG_SPICE) += spice-qemu-char.o

 common-obj-y += audio/
 common-obj-y += hw/
+common-obj-y += accel.o

 common-obj-y += ui/
 common-obj-y += bt-host.o bt-vhci.o
@@ -88,11 +88,6 @@ common-obj-y += qmp-marshal.o
 common-obj-y += qmp.o hmp.o
 endif

-######################################################################
-# some qapi visitors are used by both system and user emulation:
-
-common-obj-y += qapi-visit.o qapi-types.o
-
 #######################################################################
 # Target-independent parts used in system and user emulation
 common-obj-y += qemu-log.o
@@ -106,10 +101,15 @@ common-obj-y += disas/
 version-obj-$(CONFIG_WIN32) += $(BUILD_DIR)/version.o
 version-lobj-$(CONFIG_WIN32) += $(BUILD_DIR)/version.lo

+######################################################################
+# tracing
+util-obj-y +=  trace/
+target-obj-y += trace/
+
 ######################################################################
 # guest agent

 # FIXME: a few definitions from qapi-types.o/qapi-visit.o are needed
 # by libqemuutil.a.  These should be moved to a separate .json schema.
-qga-obj-y = qga/ qapi-types.o qapi-visit.o
+qga-obj-y = qga/
 qga-vss-dll-obj-y = qga/
--- a/Makefile.target
+++ b/Makefile.target
@@ -38,7 +38,7 @@ config-target.h: config-target.h-timestamp
 config-target.h-timestamp: config-target.mak

 ifdef CONFIG_TRACE_SYSTEMTAP
-stap: $(QEMU_PROG).stp-installed $(QEMU_PROG).stp
+stap: $(QEMU_PROG).stp-installed $(QEMU_PROG).stp $(QEMU_PROG)-simpletrace.stp

 ifdef CONFIG_USER_ONLY
 TARGET_TYPE=user
@@ -64,6 +64,13 @@ $(QEMU_PROG).stp: $(SRC_PATH)/trace-events
 		--target-type=$(TARGET_TYPE) \
 		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG).stp")

+$(QEMU_PROG)-simpletrace.stp: $(SRC_PATH)/trace-events
+	$(call quiet-command,$(TRACETOOL) \
+		--format=simpletrace-stap \
+		--backends=$(TRACE_BACKENDS) \
+		--probe-prefix=qemu.$(TARGET_TYPE).$(TARGET_NAME) \
+		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG)-simpletrace.stp")
+
 else
 stap:
 endif
@@ -120,7 +127,7 @@ endif #CONFIG_BSD_USER
 # System emulator target
 ifdef CONFIG_SOFTMMU
 obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o
-obj-y += qtest.o
+obj-y += qtest.o bootdevice.o
 obj-y += hw/
 obj-$(CONFIG_FDT) += device_tree.o
 obj-$(CONFIG_KVM) += kvm-all.o
@@ -152,15 +159,20 @@ endif # CONFIG_SOFTMMU
 dummy := $(call unnest-vars,,obj-y)
 all-obj-y := $(obj-y)

+target-obj-y :=
 block-obj-y :=
 common-obj-y :=
 include $(SRC_PATH)/Makefile.objs
+dummy := $(call unnest-vars,,target-obj-y)
+target-obj-y-save := $(target-obj-y)
 dummy := $(call unnest-vars,.., \
               block-obj-y \
               block-obj-m \
               common-obj-y \
               common-obj-m)
+target-obj-y := $(target-obj-y-save)
 all-obj-y += $(common-obj-y)
+all-obj-y += $(target-obj-y)
 all-obj-$(CONFIG_SOFTMMU) += $(block-obj-y)

 # build either PROG or PROGW
@@ -191,6 +203,7 @@ endif
 ifdef CONFIG_TRACE_SYSTEMTAP
 	$(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)/../systemtap/tapset"
 	$(INSTALL_DATA) $(QEMU_PROG).stp-installed "$(DESTDIR)$(qemu_datadir)/../systemtap/tapset/$(QEMU_PROG).stp"
+	$(INSTALL_DATA) $(QEMU_PROG)-simpletrace.stp "$(DESTDIR)$(qemu_datadir)/../systemtap/tapset/$(QEMU_PROG)-simpletrace.stp"
 endif

 GENERATED_HEADERS += config-target.h
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.1.2
+2.2.0
--- a/accel.c
+++ b/accel.c
@@ -0,0 +1,157 @@
+/*
+ * QEMU System Emulator, accelerator interfaces
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "sysemu/accel.h"
+#include "hw/boards.h"
+#include "qemu-common.h"
+#include "sysemu/arch_init.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
+#include "sysemu/qtest.h"
+#include "hw/xen/xen.h"
+#include "qom/object.h"
+#include "hw/boards.h"
+
+int tcg_tb_size;
+static bool tcg_allowed = true;
+
+static int tcg_init(MachineState *ms)
+{
+    tcg_exec_init(tcg_tb_size * 1024 * 1024);
+    return 0;
+}
+
+static const TypeInfo accel_type = {
+    .name = TYPE_ACCEL,
+    .parent = TYPE_OBJECT,
+    .class_size = sizeof(AccelClass),
+    .instance_size = sizeof(AccelState),
+};
+
+/* Lookup AccelClass from opt_name. Returns NULL if not found */
+static AccelClass *accel_find(const char *opt_name)
+{
+    char *class_name = g_strdup_printf(ACCEL_CLASS_NAME("%s"), opt_name);
+    AccelClass *ac = ACCEL_CLASS(object_class_by_name(class_name));
+    g_free(class_name);
+    return ac;
+}
+
+static int accel_init_machine(AccelClass *acc, MachineState *ms)
+{
+    ObjectClass *oc = OBJECT_CLASS(acc);
+    const char *cname = object_class_get_name(oc);
+    AccelState *accel = ACCEL(object_new(cname));
+    int ret;
+    ms->accelerator = accel;
+    *(acc->allowed) = true;
+    ret = acc->init_machine(ms);
+    if (ret < 0) {
+        ms->accelerator = NULL;
+        *(acc->allowed) = false;
+        object_unref(OBJECT(accel));
+    }
+    return ret;
+}
+
+/* Try each name in the colon-separated "accel" machine option (default "tcg")
+ * until one accelerator initializes; exits the process if none does. */
+int configure_accelerator(MachineState *ms)
+{
+    const char *p;
+    char buf[10];
+    int ret;
+    bool accel_initialised = false;
+    bool init_failed = false;
+    AccelClass *acc = NULL;
+
+    p = qemu_opt_get(qemu_get_machine_opts(), "accel");
+    if (p == NULL) {
+        /* Use the default "accelerator", tcg */
+        p = "tcg";
+    }
+
+    while (!accel_initialised && *p != '\0') {
+        if (*p == ':') {
+            p++;
+        }
+        p = get_opt_name(buf, sizeof(buf), p, ':');
+        acc = accel_find(buf);
+        if (!acc) {
+            fprintf(stderr, "\"%s\" accelerator not found.\n", buf);
+            continue;
+        }
+        if (acc->available && !acc->available()) {
+            fprintf(stderr, "%s not supported for this target\n", acc->name);
+            continue;
+        }
+        ret = accel_init_machine(acc, ms);
+        if (ret < 0) {
+            init_failed = true;
+            fprintf(stderr, "failed to initialize %s: %s\n",
+                    acc->name, strerror(-ret));
+        } else {
+            accel_initialised = true;
+        }
+    }
+
+    if (!accel_initialised) {
+        if (!init_failed) {
+            fprintf(stderr, "No accelerator found!\n");
+        }
+        exit(1);
+    }
+
+    if (init_failed) {
+        fprintf(stderr, "Back to %s accelerator.\n", acc->name);
+    }
+
+    return !accel_initialised; /* always 0 here: the failure path exits above */
+}
+
+
+static void tcg_accel_class_init(ObjectClass *oc, void *data)
+{
+    AccelClass *ac = ACCEL_CLASS(oc);
+    ac->name = "tcg";
+    ac->init_machine = tcg_init;
+    ac->allowed = &tcg_allowed;
+}
+
+#define TYPE_TCG_ACCEL ACCEL_CLASS_NAME("tcg")
+
+static const TypeInfo tcg_accel_type = {
+    .name = TYPE_TCG_ACCEL,
+    .parent = TYPE_ACCEL,
+    .class_init = tcg_accel_class_init,
+};
+
+static void register_accel_types(void)
+{
+    type_register_static(&accel_type);
+    type_register_static(&tcg_accel_type);
+}
+
+type_init(register_accel_types);
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -100,6 +100,11 @@ void aio_set_event_notifier(AioContext *ctx,
                       (IOHandler *)io_read, NULL, notifier);
 }

+bool aio_prepare(AioContext *ctx)
+{
+    return false;
+}
+
 bool aio_pending(AioContext *ctx)
 {
    AioHandler *node;
@@ -119,11 +124,20 @@ bool aio_pending(AioContext *ctx)
    return false;
 }

-static bool aio_dispatch(AioContext *ctx)
+bool aio_dispatch(AioContext *ctx)
 {
    AioHandler *node;
    bool progress = false;

+    /*
+     * If there are callbacks left that have been queued, we need to call them.
+     * Do not call select in this case, because it is possible that the caller
+     * does not need a complete flush (as is the case for aio_poll loops).
+     */
+    if (aio_bh_poll(ctx)) {
+        progress = true;
+    }
+
    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
@@ -184,22 +198,9 @@ bool aio_poll(AioContext *ctx, bool blocking)

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
-     * be re-evaluated before the next blocking poll().  This happens
-     * in two cases:
-     *
-     * 1) when aio_poll is called with blocking == false
-     *
-     * 2) when we are called after poll().  If we are called before
-     *    poll(), bottom halves will not be re-evaluated and we need
-     *    aio_notify() if blocking == true.
-     *
-     * The first aio_dispatch() only does something when AioContext is
-     * running as a GSource, and in that case aio_poll is used only
-     * with blocking == false, so this optimization is already quite
-     * effective.  However, the code is ugly and should be restructured
-     * to have a single aio_dispatch() call.  To do this, we need to
-     * reorganize aio_poll into a prepare/poll/dispatch model like
-     * glib's.
+     * be re-evaluated before the next blocking poll().  This is
+     * already true when aio_poll is called with blocking == false;
+     * if blocking == true, it is only true after poll() returns.
     *
     * If we're in a nested event loop, ctx->dispatching might be true.
     * In that case we can restore it just before returning, but we
@@ -207,26 +208,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
     */
    aio_set_dispatching(ctx, !blocking);

-    /*
-     * If there are callbacks left that have been queued, we need to call them.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    if (aio_bh_poll(ctx)) {
-        blocking = false;
-        progress = true;
-    }
-
-    /* Re-evaluate condition (1) above.  */
-    aio_set_dispatching(ctx, !blocking);
-    if (aio_dispatch(ctx)) {
-        progress = true;
-    }
-
-    if (progress && !blocking) {
-        goto out;
-    }
-
    ctx->walking_handlers++;

    g_array_set_size(ctx->pollfds, 0);
@@ -249,7 +230,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
    /* wait until next event */
    ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data,
                         ctx->pollfds->len,
-                         blocking ? timerlistgroup_deadline_ns(&ctx->tlg) : 0);
+                         blocking ? aio_compute_timeout(ctx) : 0);

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
@@ -268,7 +249,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
        progress = true;
    }

-out:
    aio_set_dispatching(ctx, was_dispatching);
    return progress;
 }
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -22,12 +22,80 @@

 struct AioHandler {
    EventNotifier *e;
+    IOHandler *io_read;
+    IOHandler *io_write;
    EventNotifierHandler *io_notify;
    GPollFD pfd;
    int deleted;
+    void *opaque;
    QLIST_ENTRY(AioHandler) node;
 };

+/* Add, update or (both callbacks NULL) remove the handlers for fd in ctx. */
+void aio_set_fd_handler(AioContext *ctx,
+                        int fd,
+                        IOHandler *io_read,
+                        IOHandler *io_write,
+                        void *opaque)
+{
+    /* fd is a SOCKET in our case */
+    AioHandler *node;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        if (node->pfd.fd == fd && !node->deleted) {
+            break;
+        }
+    }
+
+    /* Are we deleting the fd handler? */
+    if (!io_read && !io_write) {
+        if (node) {
+            /* If the lock is held, just mark the node as deleted */
+            if (ctx->walking_handlers) {
+                node->deleted = 1;
+                node->pfd.revents = 0;
+            } else {
+                /* Otherwise, delete it for real.  We can't just mark it as
+                 * deleted because deleted nodes are only cleaned up after
+                 * releasing the walking_handlers lock. */
+                QLIST_REMOVE(node, node);
+                g_free(node);
+            }
+        }
+    } else {
+        HANDLE event;
+
+        if (node == NULL) {
+            /* Alloc and insert if it's not already there */
+            node = g_malloc0(sizeof(AioHandler));
+            node->pfd.fd = fd;
+            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
+        }
+
+        node->pfd.events = 0; /* rebuilt from the handlers being installed */
+        if (io_read) {
+            node->pfd.events |= G_IO_IN;
+        }
+        if (io_write) {
+            node->pfd.events |= G_IO_OUT;
+        }
+
+        node->e = &ctx->notifier;
+
+        /* Update handler with latest information */
+        node->opaque = opaque;
+        node->io_read = io_read;
+        node->io_write = io_write;
+
+        event = event_notifier_get_handle(&ctx->notifier);
+        WSAEventSelect(node->pfd.fd, event,
+                       FD_READ | FD_ACCEPT | FD_CLOSE |
+                       FD_CONNECT | FD_WRITE | FD_OOB);
+    }
+
+    aio_notify(ctx);
+}
+
 void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *e,
                            EventNotifierHandler *io_notify)
@@ -76,6 +144,43 @@ void aio_set_event_notifier(AioContext *ctx,
    aio_notify(ctx);
 }

+bool aio_prepare(AioContext *ctx)
+{
+    static struct timeval tv0;
+    AioHandler *node;
+    bool have_select_revents = false;
+    fd_set rfds, wfds;
+
+    /* fill fd sets */
+    FD_ZERO(&rfds);
+    FD_ZERO(&wfds);
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        if (node->io_read) {
+            FD_SET ((SOCKET)node->pfd.fd, &rfds);
+        }
+        if (node->io_write) {
+            FD_SET ((SOCKET)node->pfd.fd, &wfds);
+        }
+    }
+
+    if (select(0, &rfds, &wfds, NULL, &tv0) > 0) {
+        QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+            node->pfd.revents = 0;
+            if (FD_ISSET(node->pfd.fd, &rfds)) {
+                node->pfd.revents |= G_IO_IN;
+                have_select_revents = true;
+            }
+
+            if (FD_ISSET(node->pfd.fd, &wfds)) {
+                node->pfd.revents |= G_IO_OUT;
+                have_select_revents = true;
+            }
+        }
+    }
+
+    return have_select_revents;
+}
+
 bool aio_pending(AioContext *ctx)
 {
    AioHandler *node;
@@ -84,47 +189,37 @@ bool aio_pending(AioContext *ctx)
        if (node->pfd.revents && node->io_notify) {
            return true;
        }
+
+        if ((node->pfd.revents & G_IO_IN) && node->io_read) {
+            return true;
+        }
+        if ((node->pfd.revents & G_IO_OUT) && node->io_write) {
+            return true;
+        }
    }

    return false;
 }

-bool aio_poll(AioContext *ctx, bool blocking)
+static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
 {
    AioHandler *node;
-    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
-    bool progress;
-    int count;
-    int timeout;
-
-    progress = false;
+    bool progress = false;

    /*
-     * If there are callbacks left that have been queued, we need to call then.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    if (aio_bh_poll(ctx)) {
-        blocking = false;
-        progress = true;
-    }
-
-    /* Run timers */
-    progress |= timerlistgroup_run_timers(&ctx->tlg);
-
-    /*
-     * Then dispatch any pending callbacks from the GSource.
-     *
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
    while (node) {
        AioHandler *tmp;
+        int revents = node->pfd.revents;

        ctx->walking_handlers++;

-        if (node->pfd.revents && node->io_notify) {
+        if (!node->deleted &&
+            (revents || event_notifier_get_handle(node->e) == event) &&
+            node->io_notify) {
            node->pfd.revents = 0;
            node->io_notify(node->e);

@@ -134,6 +229,28 @@ bool aio_poll(AioContext *ctx, bool blocking)
            }
        }

+        if (!node->deleted &&
+            (node->io_read || node->io_write)) {
+            node->pfd.revents = 0;
+            if ((revents & G_IO_IN) && node->io_read) {
+                node->io_read(node->opaque);
+                progress = true;
+            }
+            if ((revents & G_IO_OUT) && node->io_write) {
+                node->io_write(node->opaque);
+                progress = true;
+            }
+
+            /* if the next select() will return an event, we have progressed */
+            if (event == event_notifier_get_handle(&ctx->notifier)) {
+                WSANETWORKEVENTS ev;
+                WSAEnumNetworkEvents(node->pfd.fd, event, &ev);
+                if (ev.lNetworkEvents) {
+                    progress = true;
+                }
+            }
+        }
+
        tmp = node;
        node = QLIST_NEXT(node, node);

@@ -145,10 +262,47 @@ bool aio_poll(AioContext *ctx, bool blocking)
        }
    }

-    if (progress && !blocking) {
-        return true;
+    return progress;
+}
+
+/*
+ * Non-blocking dispatch pass over @ctx.  Calls, in this order,
+ * aio_bh_poll(), aio_dispatch_handlers() with INVALID_HANDLE_VALUE as the
+ * event, and timerlistgroup_run_timers() on the context's timer list group.
+ * All three are always called; the result is true if any of them
+ * reported progress.
+ */
+bool aio_dispatch(AioContext *ctx)
+{
+    bool bh_progress = aio_bh_poll(ctx);
+    bool handler_progress = aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+    bool timer_progress = timerlistgroup_run_timers(&ctx->tlg);
+
+    return bh_progress || handler_progress || timer_progress;
+}
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+    AioHandler *node;
+    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+    bool was_dispatching, progress, have_select_revents, first;
+    int count;
+    int timeout;
+
+    have_select_revents = aio_prepare(ctx);
+    if (have_select_revents) {
+        blocking = false;
    }

+    was_dispatching = ctx->dispatching;
+    progress = false;
+
+    /* aio_notify can avoid the expensive event_notifier_set if
+     * everything (file descriptors, bottom halves, timers) will
+     * be re-evaluated before the next blocking poll().  This is
+     * already true when aio_poll is called with blocking == false;
+     * if blocking == true, it is only true after poll() returns.
+     *
+     * If we're in a nested event loop, ctx->dispatching might be true.
+     * In that case we can restore it just before returning, but we
+     * have to clear it now.
+     */
+    aio_set_dispatching(ctx, !blocking);
+
    ctx->walking_handlers++;

    /* fill fd sets */
@@ -160,64 +314,40 @@ bool aio_poll(AioContext *ctx, bool blocking)
    }

    ctx->walking_handlers--;
+    first = true;

    /* wait until next event */
    while (count > 0) {
+        HANDLE event;
        int ret;

-        timeout = blocking ?
-            qemu_timeout_ns_to_ms(timerlistgroup_deadline_ns(&ctx->tlg)) : 0;
+        timeout = blocking
+            ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
        ret = WaitForMultipleObjects(count, events, FALSE, timeout);
+        aio_set_dispatching(ctx, true);
+
+        if (first && aio_bh_poll(ctx)) {
+            progress = true;
+        }
+        first = false;

        /* if we have any signaled events, dispatch event */
-        if ((DWORD) (ret - WAIT_OBJECT_0) >= count) {
+        event = NULL;
+        if ((DWORD) (ret - WAIT_OBJECT_0) < count) {
+            event = events[ret - WAIT_OBJECT_0];
+            events[ret - WAIT_OBJECT_0] = events[--count];
+        } else if (!have_select_revents) {
            break;
        }

+        have_select_revents = false;
        blocking = false;

-        /* we have to walk very carefully in case
-         * aio_set_fd_handler is called while we're walking */
-        node = QLIST_FIRST(&ctx->aio_handlers);
-        while (node) {
-            AioHandler *tmp;
-
-            ctx->walking_handlers++;
-
-            if (!node->deleted &&
-                event_notifier_get_handle(node->e) == events[ret - WAIT_OBJECT_0] &&
-                node->io_notify) {
-                node->io_notify(node->e);
-
-                /* aio_notify() does not count as progress */
-                if (node->e != &ctx->notifier) {
-                    progress = true;
-                }
-            }
-
-            tmp = node;
-            node = QLIST_NEXT(node, node);
-
-            ctx->walking_handlers--;
-
-            if (!ctx->walking_handlers && tmp->deleted) {
-                QLIST_REMOVE(tmp, node);
-                g_free(tmp);
-            }
-        }
-
-        /* Try again, but only call each handler once.  */
-        events[ret - WAIT_OBJECT_0] = events[--count];
+        progress |= aio_dispatch_handlers(ctx, event);
    }

-    if (blocking) {
-        /* Run the timers a second time. We do this because otherwise aio_wait
-         * will not note progress - and will stop a drain early - if we have
-         * a timer that was not ready to run entering g_poll but is ready
-         * after g_poll. This will only do anything if a timer has expired.
-         */
-        progress |= timerlistgroup_run_timers(&ctx->tlg);
-    }
+    progress |= timerlistgroup_run_timers(&ctx->tlg);

+    aio_set_dispatching(ctx, was_dispatching);
    return progress;
 }
--- a/arch_init.c
+++ b/arch_init.c
@@ -104,6 +104,8 @@ int graphic_depth = 32;
 #define QEMU_ARCH QEMU_ARCH_XTENSA
 #elif defined(TARGET_UNICORE32)
 #define QEMU_ARCH QEMU_ARCH_UNICORE32
+#elif defined(TARGET_TRICORE)
+#define QEMU_ARCH QEMU_ARCH_TRICORE
 #endif

 const uint32_t arch_type = QEMU_ARCH;
@@ -484,15 +486,23 @@ static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)


 /* Needs iothread lock! */
+/* Fix me: there are too many global variables used in migration process. */
+static int64_t start_time;
+static int64_t bytes_xfer_prev;
+static int64_t num_dirty_pages_period;
+
+/* Reset the three file-scope migration_bitmap_sync() counters to zero. */
+static void migration_bitmap_sync_init(void)
+{
+    num_dirty_pages_period = 0;
+    bytes_xfer_prev = 0;
+    start_time = 0;
+}

 static void migration_bitmap_sync(void)
 {
    RAMBlock *block;
    uint64_t num_dirty_pages_init = migration_dirty_pages;
    MigrationState *s = migrate_get_current();
-    static int64_t start_time;
-    static int64_t bytes_xfer_prev;
-    static int64_t num_dirty_pages_period;
    int64_t end_time;
    int64_t bytes_xfer_now;
    static uint64_t xbzrle_cache_miss_prev;
@@ -772,6 +782,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
    mig_throttle_on = false;
    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
+    migration_bitmap_sync_init();

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
@@ -1004,7 +1015,7 @@ static inline void *host_from_stream_offset(QEMUFile *f,
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
-        if (!block) {
+        if (!block || block->length <= offset) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
@@ -1017,8 +1028,9 @@ static inline void *host_from_stream_offset(QEMUFile *f,
    id[len] = 0;

    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (!strncmp(id, block->idstr, sizeof(id)))
+        if (!strncmp(id, block->idstr, sizeof(id)) && block->length > offset) {
            return memory_region_get_ram_ptr(block->mr) + offset;
+        }
    }

    error_report("Can't find block %s!", id);
@@ -1038,8 +1050,7 @@ void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)

 static int ram_load(QEMUFile *f, void *opaque, int version_id)
 {
-    ram_addr_t addr;
-    int flags, ret = 0;
+    int flags = 0, ret = 0;
    static uint64_t seq_iter;

    seq_iter++;
@@ -1048,21 +1059,24 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
        ret = -EINVAL;
    }

-    while (!ret) {
-        addr = qemu_get_be64(f);
+    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+        ram_addr_t addr, total_ram_bytes;
+        void *host;
+        uint8_t ch;

+        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

-        if (flags & RAM_SAVE_FLAG_MEM_SIZE) {
+        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
+        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
-            char id[256];
-            ram_addr_t length;
-            ram_addr_t total_ram_bytes = addr;
-
-            while (total_ram_bytes) {
+            total_ram_bytes = addr;
+            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                uint8_t len;
+                char id[256];
+                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
@@ -1072,8 +1086,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                QTAILQ_FOREACH(block, &ram_list.blocks, next) {
                    if (!strncmp(id, block->idstr, sizeof(id))) {
                        if (block->length != length) {
-                            error_report("Length mismatch: %s: " RAM_ADDR_FMT
-                                         " in != " RAM_ADDR_FMT, id, length,
+                            error_report("Length mismatch: %s: 0x" RAM_ADDR_FMT
+                                         " in != 0x" RAM_ADDR_FMT, id, length,
                                         block->length);
                            ret =  -EINVAL;
                        }
@@ -1086,16 +1100,11 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                                 "accept migration", id);
                    ret = -EINVAL;
                }
-                if (ret) {
-                    break;
-                }

                total_ram_bytes -= length;
            }
-        } else if (flags & RAM_SAVE_FLAG_COMPRESS) {
-            void *host;
-            uint8_t ch;
-
+            break;
+        case RAM_SAVE_FLAG_COMPRESS:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
@@ -1105,9 +1114,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)

            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
-        } else if (flags & RAM_SAVE_FLAG_PAGE) {
-            void *host;
-
+            break;
+        case RAM_SAVE_FLAG_PAGE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
@@ -1116,8 +1124,9 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
            }

            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
-        } else if (flags & RAM_SAVE_FLAG_XBZRLE) {
-            void *host = host_from_stream_offset(f, addr, flags);
+            break;
+        case RAM_SAVE_FLAG_XBZRLE:
+            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
@@ -1130,17 +1139,22 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                ret = -EINVAL;
                break;
            }
-        } else if (flags & RAM_SAVE_FLAG_HOOK) {
-            ram_control_load_hook(f, flags);
-        } else if (flags & RAM_SAVE_FLAG_EOS) {
+            break;
+        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
-        } else {
-            error_report("Unknown migration flags: %#x", flags);
-            ret = -EINVAL;
-            break;
+        default:
+            if (flags & RAM_SAVE_FLAG_HOOK) {
+                ram_control_load_hook(f, flags);
+            } else {
+                error_report("Unknown combination of migration flags: %#x",
+                             flags);
+                ret = -EINVAL;
+            }
+        }
+        if (!ret) {
+            ret = qemu_file_get_error(f);
        }
-        ret = qemu_file_get_error(f);
    }

    DPRINTF("Completed load of VM with exit code %d seq iteration "
@@ -1335,11 +1349,6 @@ void cpudef_init(void)
 #endif
 }

-int tcg_available(void)
-{
-    return 1;
-}
-
 int kvm_available(void)
 {
 #ifdef CONFIG_KVM
--- a/async.c
+++ b/async.c
@@ -152,39 +152,48 @@ void qemu_bh_delete(QEMUBH *bh)
    bh->deleted = 1;
 }

-static gboolean
-aio_ctx_prepare(GSource *source, gint    *timeout)
+int64_t
+aio_compute_timeout(AioContext *ctx)
 {
-    AioContext *ctx = (AioContext *) source;
+    int64_t deadline;
+    int timeout = -1;
    QEMUBH *bh;
-    int deadline;

-    /* We assume there is no timeout already supplied */
-    *timeout = -1;
    for (bh = ctx->first_bh; bh; bh = bh->next) {
        if (!bh->deleted && bh->scheduled) {
            if (bh->idle) {
                /* idle bottom halves will be polled at least
                 * every 10ms */
-                *timeout = 10;
+                timeout = 10000000;
            } else {
                /* non-idle bottom halves will be executed
                 * immediately */
-                *timeout = 0;
-                return true;
+                return 0;
            }
        }
    }

-    deadline = qemu_timeout_ns_to_ms(timerlistgroup_deadline_ns(&ctx->tlg));
+    deadline = timerlistgroup_deadline_ns(&ctx->tlg);
    if (deadline == 0) {
-        *timeout = 0;
-        return true;
+        return 0;
    } else {
-        *timeout = qemu_soonest_timeout(*timeout, deadline);
+        return qemu_soonest_timeout(timeout, deadline);
+    }
+}
+
+static gboolean
+aio_ctx_prepare(GSource *source, gint    *timeout)
+{
+    AioContext *ctx = (AioContext *) source;
+
+    /* We assume there is no timeout already supplied */
+    *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
+
+    if (aio_prepare(ctx)) {
+        *timeout = 0;
    }

-    return false;
+    return *timeout == 0;
 }

 static gboolean
@@ -209,7 +218,7 @@ aio_ctx_dispatch(GSource     *source,
    AioContext *ctx = (AioContext *) source;

    assert(callback == NULL);
-    aio_poll(ctx, false);
+    aio_dispatch(ctx);
    return true;
 }

@@ -280,18 +289,24 @@ static void aio_rfifolock_cb(void *opaque)
    aio_notify(opaque);
 }

-AioContext *aio_context_new(void)
+AioContext *aio_context_new(Error **errp)
 {
+    int ret;
    AioContext *ctx;
    ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
+    ret = event_notifier_init(&ctx->notifier, false);
+    if (ret < 0) {
+        g_source_destroy(&ctx->source);
+        error_setg_errno(errp, -ret, "Failed to initialize event notifier");
+        return NULL;
+    }
+    aio_set_event_notifier(ctx, &ctx->notifier,
+                           (EventNotifierHandler *)
+                           event_notifier_test_and_clear);
    ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD));
    ctx->thread_pool = NULL;
    qemu_mutex_init(&ctx->bh_lock);
    rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
-    event_notifier_init(&ctx->notifier, false);
-    aio_set_event_notifier(ctx, &ctx->notifier, 
-                           (EventNotifierHandler *)
-                           event_notifier_test_and_clear);
    timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);

    return ctx;
--- a/backends/Makefile.objs
+++ b/backends/Makefile.objs
@@ -1,7 +1,7 @@
 common-obj-y += rng.o rng-egd.o
 common-obj-$(CONFIG_POSIX) += rng-random.o

-common-obj-y += msmouse.o
+common-obj-y += msmouse.o testdev.o
 common-obj-$(CONFIG_BRLAPI) += baum.o
 baum.o-cflags := $(SDL_CFLAGS)

--- a/backends/baum.c
+++ b/backends/baum.c
@@ -629,7 +629,7 @@ fail_handle:

 static void register_types(void)
 {
-    register_char_driver_qapi("braille", CHARDEV_BACKEND_KIND_BRAILLE, NULL);
+    register_char_driver("braille", CHARDEV_BACKEND_KIND_BRAILLE, NULL);
 }

 type_init(register_types);
--- a/backends/hostmem-ram.c
+++ b/backends/hostmem-ram.c
@@ -27,7 +27,7 @@ ram_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)

    path = object_get_canonical_path_component(OBJECT(backend));
    memory_region_init_ram(&backend->mr, OBJECT(backend), path,
-                           backend->size);
+                           backend->size, errp);
    g_free(path);
 }

--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -257,15 +257,6 @@ static void host_memory_backend_init(Object *obj)
                        host_memory_backend_set_policy, NULL, NULL, NULL);
 }

-static void host_memory_backend_finalize(Object *obj)
-{
-    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
-
-    if (memory_region_size(&backend->mr)) {
-        memory_region_destroy(&backend->mr);
-    }
-}
-
 MemoryRegion *
 host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp)
 {
@@ -360,7 +351,6 @@ static const TypeInfo host_memory_backend_info = {
    .class_init = host_memory_backend_class_init,
    .instance_size = sizeof(HostMemoryBackend),
    .instance_init = host_memory_backend_init,
-    .instance_finalize = host_memory_backend_finalize,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
--- a/backends/msmouse.c
+++ b/backends/msmouse.c
@@ -79,7 +79,7 @@ CharDriverState *qemu_chr_open_msmouse(void)

 static void register_types(void)
 {
-    register_char_driver_qapi("msmouse", CHARDEV_BACKEND_KIND_MSMOUSE, NULL);
+    register_char_driver("msmouse", CHARDEV_BACKEND_KIND_MSMOUSE, NULL);
 }

 type_init(register_types);
--- a/backends/testdev.c
+++ b/backends/testdev.c
@@ -0,0 +1,131 @@
+/*
+ * QEMU Char Device for testsuite control
+ *
+ * Copyright (c) 2014 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "sysemu/char.h"
+
+#define BUF_SIZE 32
+
+/* State of one testdev chardev: bytes written by the other end, not yet parsed */
+typedef struct {
+    CharDriverState *chr;
+    /* Sized with BUF_SIZE so it cannot drift from the bound testdev_write uses */
+    uint8_t in_buf[BUF_SIZE];
+    /* Number of valid bytes at the start of in_buf */
+    int in_buf_used;
+} TestdevCharState;
+
+/*
+ * Try to interpret a whole incoming packet.
+ *
+ * Reads from the start of testdev->in_buf: any leading whitespace, a run of
+ * decimal digits accumulated into arg (0 if there are none), any further
+ * whitespace, then one command byte.  The only command acted on is 'q',
+ * which exits the process with status (arg << 1) | 1; every other command
+ * byte is consumed and ignored.
+ *
+ * Returns the number of bytes consumed, or 0 if the buffered data ran out
+ * before the command byte was reached (nothing is consumed in that case).
+ *
+ * NOTE(review): arg is a plain int with no overflow check; a long digit
+ * run overflows it -- confirm whether that matters for callers.
+ */
+static int testdev_eat_packet(TestdevCharState *testdev)
+{
+    const uint8_t *cur = testdev->in_buf;
+    int len = testdev->in_buf_used;
+    uint8_t c;
+    int arg;
+
+/* Fetch the next byte into c; return 0 from this function if none is left */
+#define EAT(c) do { \
+    if (!len--) {   \
+        return 0;   \
+    }               \
+    c = *cur++;     \
+} while (0)
+
+    EAT(c);
+
+    /* Skip leading whitespace */
+    while (isspace(c)) {
+        EAT(c);
+    }
+
+    /* Optional decimal argument */
+    arg = 0;
+    while (isdigit(c)) {
+        arg = arg * 10 + c - '0';
+        EAT(c);
+    }
+
+    /* Skip whitespace between the argument and the command byte */
+    while (isspace(c)) {
+        EAT(c);
+    }
+
+    /* c now holds the command byte */
+    switch (c) {
+    case 'q':
+        exit((arg << 1) | 1);
+        break;
+    default:
+        break;
+    }
+    return cur - testdev->in_buf;
+}
+
+/*
+ * The other end is writing some data.  Store it and try to interpret.
+ *
+ * Data is appended to in_buf in chunks no larger than the free space, and
+ * after each chunk every complete packet at the front of the buffer is
+ * consumed.  Always returns the original @len, i.e. reports everything as
+ * written.
+ */
+static int testdev_write(CharDriverState *chr, const uint8_t *buf, int len)
+{
+    TestdevCharState *testdev = chr->opaque;
+    int tocopy, eaten, orig_len = len;
+
+    while (len) {
+        /* Complete our buffer as much as possible */
+        tocopy = MIN(len, BUF_SIZE - testdev->in_buf_used);
+
+        memcpy(testdev->in_buf + testdev->in_buf_used, buf, tocopy);
+        testdev->in_buf_used += tocopy;
+        buf += tocopy;
+        len -= tocopy;
+
+        /* Interpret it as much as possible */
+        while (testdev->in_buf_used > 0 &&
+               (eaten = testdev_eat_packet(testdev)) > 0) {
+            memmove(testdev->in_buf, testdev->in_buf + eaten,
+                    testdev->in_buf_used - eaten);
+            testdev->in_buf_used -= eaten;
+        }
+
+        /*
+         * A buffer that is still completely full holds BUF_SIZE bytes with
+         * no complete packet, and there is no room to append more: tocopy
+         * would be 0 on every later pass, so this loop (or the next call)
+         * would never terminate.  Drop the unparseable bytes instead.
+         */
+        if (testdev->in_buf_used == BUF_SIZE) {
+            testdev->in_buf_used = 0;
+        }
+    }
+    return orig_len;
+}
+
+/* Close callback: free the TestdevCharState stored in chr->opaque */
+static void testdev_close(struct CharDriverState *chr)
+{
+    g_free(chr->opaque);
+}
+
+/*
+ * Allocate a zero-filled TestdevCharState and CharDriverState, point each
+ * at the other, install the write and close callbacks, and return the
+ * CharDriverState.
+ */
+CharDriverState *chr_testdev_init(void)
+{
+    TestdevCharState *state = g_malloc0(sizeof(*state));
+    CharDriverState *driver = g_malloc0(sizeof(*driver));
+
+    state->chr = driver;
+    driver->opaque = state;
+
+    driver->chr_write = testdev_write;
+    driver->chr_close = testdev_close;
+
+    return driver;
+}
+
+/*
+ * Register the "testdev" char driver for CHARDEV_BACKEND_KIND_TESTDEV;
+ * NULL is passed as the third argument to register_char_driver().
+ */
+static void register_types(void)
+{
+    register_char_driver("testdev", CHARDEV_BACKEND_KIND_TESTDEV, NULL);
+}
+
+type_init(register_types);
--- a/block-migration.c
+++ b/block-migration.c
@@ -14,7 +14,9 @@
 */

 #include "qemu-common.h"
-#include "block/block_int.h"
+#include "block/block.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
 #include "hw/hw.h"
 #include "qemu/queue.h"
 #include "qemu/timer.h"
@@ -70,7 +72,7 @@ typedef struct BlkMigBlock {
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
-    BlockDriverAIOCB *aiocb;
+    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
@@ -130,9 +132,9 @@ static void blk_send(QEMUFile *f, BlkMigBlock * blk)
                     | flags);

    /* device name */
-    len = strlen(blk->bmds->bs->device_name);
+    len = strlen(bdrv_get_device_name(blk->bmds->bs));
    qemu_put_byte(f, len);
-    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);
+    qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len);

    /* if a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
@@ -186,7 +188,7 @@ static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
 {
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

-    if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
+    if (sector < bdrv_nb_sectors(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
@@ -223,8 +225,7 @@ static void alloc_aio_bitmap(BlkMigDevState *bmds)
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

-    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
-            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
+    bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
@@ -284,7 +285,7 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
        nr_sectors = total_sectors - cur_sector;
    }

-    blk = g_malloc(sizeof(BlkMigBlock));
+    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
@@ -344,18 +345,31 @@ static void unset_dirty_tracking(void)
    }
 }

-static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
+static void init_blk_migration(QEMUFile *f)
 {
+    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;

-    if (!bdrv_is_read_only(bs)) {
-        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+    block_mig_state.submitted = 0;
+    block_mig_state.read_done = 0;
+    block_mig_state.transferred = 0;
+    block_mig_state.total_sector_sum = 0;
+    block_mig_state.prev_progress = -1;
+    block_mig_state.bulk_completed = 0;
+    block_mig_state.zero_blocks = migrate_zero_blocks();
+
+    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
+        if (bdrv_is_read_only(bs)) {
+            continue;
+        }
+
+        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            return;
        }

-        bmds = g_malloc0(sizeof(BlkMigDevState));
+        bmds = g_new0(BlkMigDevState, 1);
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
@@ -370,28 +384,15 @@ static void init_blk_migration_it(void *opaque, BlockDriverState *bs)

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
-                    bs->device_name);
+                    bdrv_get_device_name(bs));
        } else {
-            DPRINTF("Start full migration for %s\n", bs->device_name);
+            DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
 }

-static void init_blk_migration(QEMUFile *f)
-{
-    block_mig_state.submitted = 0;
-    block_mig_state.read_done = 0;
-    block_mig_state.transferred = 0;
-    block_mig_state.total_sector_sum = 0;
-    block_mig_state.prev_progress = -1;
-    block_mig_state.bulk_completed = 0;
-    block_mig_state.zero_blocks = migrate_zero_blocks();
-
-    bdrv_iterate(init_blk_migration_it, NULL);
-}
-
 /* Called with no lock taken.  */

 static int blk_mig_save_bulked_block(QEMUFile *f)
@@ -466,7 +467,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
-            blk = g_malloc(sizeof(BlkMigBlock));
+            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
@@ -799,7 +800,7 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)

            if (bs != bs_prev) {
                bs_prev = bs;
-                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+                total_sectors = bdrv_nb_sectors(bs);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
--- a/block.c
+++ b/block.c
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,28 +1,28 @@
-block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
+block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
 block-obj-$(CONFIG_QUORUM) += quorum.o
 block-obj-y += parallels.o blkdebug.o blkverify.o
-block-obj-y += snapshot.o qapi.o
+block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
+block-obj-y += null.o mirror.o

-ifeq ($(CONFIG_POSIX),y)
 block-obj-y += nbd.o nbd-client.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_LIBNFS) += nfs.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
-endif
+block-obj-y += accounting.o

 common-obj-y += stream.o
 common-obj-y += commit.o
-common-obj-y += mirror.o
 common-obj-y += backup.o

 iscsi.o-cflags     := $(LIBISCSI_CFLAGS)
@@ -35,5 +35,6 @@ gluster.o-cflags   := $(GLUSTERFS_CFLAGS)
 gluster.o-libs     := $(GLUSTERFS_LIBS)
 ssh.o-cflags       := $(LIBSSH2_CFLAGS)
 ssh.o-libs         := $(LIBSSH2_LIBS)
+archipelago.o-libs := $(ARCHIPELAGO_LIBS)
 qcow.o-libs        := -lz
 linux-aio.o-libs   := -laio
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -0,0 +1,54 @@
+/*
+ * QEMU System Emulator block accounting
+ *
+ * Copyright (c) 2011 Christoph Hellwig
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/accounting.h"
+#include "block/block_int.h"
+
+/*
+ * Begin accounting for one request: store its type, byte count and the
+ * current get_clock() value in @cookie.  @stats is not accessed here.
+ * @type must be below BLOCK_MAX_IOTYPE (asserted).
+ */
+void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
+                      int64_t bytes, enum BlockAcctType type)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    cookie->type = type;
+    cookie->bytes = bytes;
+    cookie->start_time_ns = get_clock();
+}
+
+/*
+ * Finish accounting for the request described by @cookie: add its byte
+ * count, one operation, and the time elapsed since cookie->start_time_ns
+ * (measured with get_clock()) to the @stats slots indexed by cookie->type.
+ * cookie->type must be below BLOCK_MAX_IOTYPE (asserted).
+ */
+void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie)
+{
+    assert(cookie->type < BLOCK_MAX_IOTYPE);
+
+    stats->nr_bytes[cookie->type] += cookie->bytes;
+    stats->nr_ops[cookie->type]++;
+    stats->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
+}
+
+
+/*
+ * Raise stats->wr_highest_sector to the last sector of the given range,
+ * sector_num + nb_sectors - 1, when that lies beyond the stored value.
+ */
+void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
+                               unsigned int nb_sectors)
+{
+    int64_t last_sector = sector_num + nb_sectors - 1;
+
+    if (last_sector > stats->wr_highest_sector) {
+        stats->wr_highest_sector = last_sector;
+    }
+}
--- a/block/archipelago.c
+++ b/block/archipelago.c
--- a/block/backup.c
+++ b/block/backup.c
@@ -227,9 +227,25 @@ static BlockErrorAction backup_error_action(BackupBlockJob *job,
    }
 }

+/* Carries the job's result code from backup_run() to backup_complete() */
+typedef struct {
+    int ret;
+} BackupCompleteData;
+
+/*
+ * Completion callback that backup_run() hands to
+ * block_job_defer_to_main_loop().  Drops the reference on the job's target,
+ * completes the job with the result code carried in @opaque, then frees
+ * @opaque (a BackupCompleteData).
+ */
+static void backup_complete(BlockJob *job, void *opaque)
+{
+    BackupCompleteData *result = opaque;
+    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
+
+    bdrv_unref(backup_job->target);
+
+    block_job_completed(job, result->ret);
+    g_free(result);
+}
+
 static void coroutine_fn backup_run(void *opaque)
 {
    BackupBlockJob *job = opaque;
+    BackupCompleteData *data;
    BlockDriverState *bs = job->common.bs;
    BlockDriverState *target = job->target;
    BlockdevOnError on_target_error = job->on_target_error;
@@ -344,16 +360,17 @@ static void coroutine_fn backup_run(void *opaque)
    hbitmap_free(job->bitmap);

    bdrv_iostatus_disable(target);
-    bdrv_unref(target);

-    block_job_completed(&job->common, ret);
+    data = g_malloc(sizeof(*data));
+    data->ret = ret;
+    block_job_defer_to_main_loop(&job->common, backup_complete, data);
 }

 void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, MirrorSyncMode sync_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
-                  BlockDriverCompletionFunc *cb, void *opaque,
+                  BlockCompletionFunc *cb, void *opaque,
                  Error **errp)
 {
    int64_t len;
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -26,6 +26,10 @@
 #include "qemu/config-file.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qint.h"
+#include "qapi/qmp/qstring.h"

 typedef struct BDRVBlkdebugState {
    int state;
@@ -37,7 +41,7 @@ typedef struct BDRVBlkdebugState {
 } BDRVBlkdebugState;

 typedef struct BlkdebugAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
 } BlkdebugAIOCB;
@@ -48,11 +52,8 @@ typedef struct BlkdebugSuspendedReq {
    QLIST_ENTRY(BlkdebugSuspendedReq) next;
 } BlkdebugSuspendedReq;

-static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb);
-
 static const AIOCBInfo blkdebug_aiocb_info = {
-    .aiocb_size = sizeof(BlkdebugAIOCB),
-    .cancel     = blkdebug_aio_cancel,
+    .aiocb_size    = sizeof(BlkdebugAIOCB),
 };

 enum {
@@ -194,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
    [BLKDBG_PWRITEV]                        = "pwritev",
    [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
    [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
+
+    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
 };

 static int get_event_by_name(const char *name, BlkDebugEvent *event)
@@ -213,6 +216,7 @@ static int get_event_by_name(const char *name, BlkDebugEvent *event)
 struct add_rule_data {
    BDRVBlkdebugState *s;
    int action;
+    Error **errp;
 };

 static int add_rule(QemuOpts *opts, void *opaque)
@@ -225,7 +229,11 @@ static int add_rule(QemuOpts *opts, void *opaque)

    /* Find the right event for the rule */
    event_name = qemu_opt_get(opts, "event");
-    if (!event_name || get_event_by_name(event_name, &event) < 0) {
+    if (!event_name) {
+        error_setg(d->errp, "Missing event name for rule");
+        return -1;
+    } else if (get_event_by_name(event_name, &event) < 0) {
+        error_setg(d->errp, "Invalid event name \"%s\"", event_name);
        return -1;
    }

@@ -311,10 +319,21 @@ static int read_config(BDRVBlkdebugState *s, const char *filename,

    d.s = s;
    d.action = ACTION_INJECT_ERROR;
-    qemu_opts_foreach(&inject_error_opts, add_rule, &d, 0);
+    d.errp = &local_err;
+    qemu_opts_foreach(&inject_error_opts, add_rule, &d, 1);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }

    d.action = ACTION_SET_STATE;
-    qemu_opts_foreach(&set_state_opts, add_rule, &d, 0);
+    qemu_opts_foreach(&set_state_opts, add_rule, &d, 1);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }

    ret = 0;
 fail:
@@ -443,21 +462,11 @@ static void error_callback_bh(void *opaque)
    struct BlkdebugAIOCB *acb = opaque;
    qemu_bh_delete(acb->bh);
    acb->common.cb(acb->common.opaque, acb->ret);
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

-static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    BlkdebugAIOCB *acb = container_of(blockacb, BlkdebugAIOCB, common);
-    if (acb->bh) {
-        qemu_bh_delete(acb->bh);
-        acb->bh = NULL;
-    }
-    qemu_aio_release(acb);
-}
-
-static BlockDriverAIOCB *inject_error(BlockDriverState *bs,
-    BlockDriverCompletionFunc *cb, void *opaque, BlkdebugRule *rule)
+static BlockAIOCB *inject_error(BlockDriverState *bs,
+    BlockCompletionFunc *cb, void *opaque, BlkdebugRule *rule)
 {
    BDRVBlkdebugState *s = bs->opaque;
    int error = rule->options.inject.error;
@@ -482,9 +491,9 @@ static BlockDriverAIOCB *inject_error(BlockDriverState *bs,
    return &acb->common;
 }

-static BlockDriverAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-    BlockDriverCompletionFunc *cb, void *opaque)
+    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVBlkdebugState *s = bs->opaque;
    BlkdebugRule *rule = NULL;
@@ -504,9 +513,9 @@ static BlockDriverAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
    return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
 }

-static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
+static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-    BlockDriverCompletionFunc *cb, void *opaque)
+    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVBlkdebugState *s = bs->opaque;
    BlkdebugRule *rule = NULL;
@@ -526,6 +535,25 @@ static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
 }

+/*
+ * Flush request for a blkdebug node.
+ * Looks at the first active rule whose inject.sector is -1; if that rule
+ * carries a non-zero error, the error is injected instead of flushing.
+ * In every other case the flush is forwarded to bs->file.
+ */
+static BlockAIOCB *blkdebug_aio_flush(BlockDriverState *bs,
+    BlockCompletionFunc *cb, void *opaque)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugRule *rule = NULL;
+
+    QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
+        if (rule->options.inject.sector != -1) {
+            continue;
+        }
+        /* Only the first sector-independent rule is considered */
+        if (rule->options.inject.error) {
+            return inject_error(bs, cb, opaque, rule);
+        }
+        break;
+    }
+
+    return bdrv_aio_flush(bs->file, cb, opaque);
+}
+

 static void blkdebug_close(BlockDriverState *bs)
 {
@@ -691,6 +719,98 @@ static int64_t blkdebug_getlength(BlockDriverState *bs)
    return bdrv_getlength(bs->file);
 }

+/*
+ * Build bs->full_open_options for a blkdebug node.
+ *
+ * The result is a dictionary with "driver" set to "blkdebug", "image" set
+ * to the options of bs->file, and one list per rule kind ("inject-error",
+ * "set-state", "suspend") holding a dictionary for every configured rule.
+ * A list is only added when at least one rule of that kind exists.
+ * Nothing is done when bs->file has no full_open_options.
+ */
+static void blkdebug_refresh_filename(BlockDriverState *bs)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    struct BlkdebugRule *rule;
+    QList *inject_error_list = NULL;
+    QList *set_state_list = NULL;
+    QList *suspend_list = NULL;
+    QDict *opts;
+    int event;
+
+    if (!bs->file->full_open_options) {
+        /* The config file cannot be recreated, so creating a plain filename
+         * is impossible */
+        return;
+    }
+
+    opts = qdict_new();
+    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkdebug")));
+
+    QINCREF(bs->file->full_open_options);
+    qdict_put_obj(opts, "image", QOBJECT(bs->file->full_open_options));
+
+    for (event = 0; event < BLKDBG_EVENT_MAX; event++) {
+        QLIST_FOREACH(rule, &s->rules[event], next) {
+            QDict *entry;       /* description of the current rule */
+            QList **list;       /* list the description belongs to */
+
+            switch (rule->action) {
+            case ACTION_INJECT_ERROR:
+                entry = qdict_new();
+                qdict_put_obj(entry, "event", QOBJECT(qstring_from_str(
+                              BlkdebugEvent_lookup[rule->event])));
+                qdict_put_obj(entry, "state",
+                              QOBJECT(qint_from_int(rule->state)));
+                qdict_put_obj(entry, "errno", QOBJECT(qint_from_int(
+                              rule->options.inject.error)));
+                qdict_put_obj(entry, "sector", QOBJECT(qint_from_int(
+                              rule->options.inject.sector)));
+                qdict_put_obj(entry, "once", QOBJECT(qbool_from_int(
+                              rule->options.inject.once)));
+                qdict_put_obj(entry, "immediately",
+                              QOBJECT(qbool_from_int(
+                              rule->options.inject.immediately)));
+                list = &inject_error_list;
+                break;
+
+            case ACTION_SET_STATE:
+                entry = qdict_new();
+                qdict_put_obj(entry, "event", QOBJECT(qstring_from_str(
+                              BlkdebugEvent_lookup[rule->event])));
+                qdict_put_obj(entry, "state",
+                              QOBJECT(qint_from_int(rule->state)));
+                qdict_put_obj(entry, "new_state", QOBJECT(qint_from_int(
+                              rule->options.set_state.new_state)));
+                list = &set_state_list;
+                break;
+
+            case ACTION_SUSPEND:
+                entry = qdict_new();
+                qdict_put_obj(entry, "event", QOBJECT(qstring_from_str(
+                              BlkdebugEvent_lookup[rule->event])));
+                qdict_put_obj(entry, "state",
+                              QOBJECT(qint_from_int(rule->state)));
+                qdict_put_obj(entry, "tag", QOBJECT(qstring_from_str(
+                              rule->options.suspend.tag)));
+                list = &suspend_list;
+                break;
+
+            default:
+                /* Rules with any other action are not described */
+                continue;
+            }
+
+            /* Lists are created lazily, on the first rule of their kind */
+            if (!*list) {
+                *list = qlist_new();
+            }
+            qlist_append_obj(*list, QOBJECT(entry));
+        }
+    }
+
+    if (inject_error_list) {
+        qdict_put_obj(opts, "inject-error", QOBJECT(inject_error_list));
+    }
+    if (set_state_list) {
+        qdict_put_obj(opts, "set-state", QOBJECT(set_state_list));
+    }
+    if (suspend_list) {
+        qdict_put_obj(opts, "suspend", QOBJECT(suspend_list));
+    }
+
+    bs->full_open_options = opts;
+}
+
 static BlockDriver bdrv_blkdebug = {
    .format_name            = "blkdebug",
    .protocol_name          = "blkdebug",
@@ -700,9 +820,11 @@ static BlockDriver bdrv_blkdebug = {
    .bdrv_file_open         = blkdebug_open,
    .bdrv_close             = blkdebug_close,
    .bdrv_getlength         = blkdebug_getlength,
+    .bdrv_refresh_filename  = blkdebug_refresh_filename,

    .bdrv_aio_readv         = blkdebug_aio_readv,
    .bdrv_aio_writev        = blkdebug_aio_writev,
+    .bdrv_aio_flush         = blkdebug_aio_flush,

    .bdrv_debug_event           = blkdebug_debug_event,
    .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint,
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -10,6 +10,8 @@
 #include <stdarg.h>
 #include "qemu/sockets.h" /* for EINPROGRESS on Windows */
 #include "block/block_int.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"

 typedef struct {
    BlockDriverState *test_file;
@@ -17,7 +19,7 @@ typedef struct {

 typedef struct BlkverifyAIOCB BlkverifyAIOCB;
 struct BlkverifyAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;

    /* Request metadata */
@@ -27,7 +29,6 @@ struct BlkverifyAIOCB {

    int ret;                    /* first completed request's result */
    unsigned int done;          /* completion counter */
-    bool *finished;             /* completion signal for cancel */

    QEMUIOVector *qiov;         /* user I/O vector */
    QEMUIOVector raw_qiov;      /* cloned I/O vector for raw file */
@@ -36,22 +37,8 @@ struct BlkverifyAIOCB {
    void (*verify)(BlkverifyAIOCB *acb);
 };

-static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    BlkverifyAIOCB *acb = (BlkverifyAIOCB *)blockacb;
-    AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
-    bool finished = false;
-
-    /* Wait until request completes, invokes its callback, and frees itself */
-    acb->finished = &finished;
-    while (!finished) {
-        aio_poll(aio_context, true);
-    }
-}
-
 static const AIOCBInfo blkverify_aiocb_info = {
    .aiocb_size         = sizeof(BlkverifyAIOCB),
-    .cancel             = blkverify_aio_cancel,
 };

 static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb,
@@ -156,6 +143,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,

    ret = 0;
 fail:
+    qemu_opts_del(opts);
    return ret;
 }

@@ -177,7 +165,7 @@ static int64_t blkverify_getlength(BlockDriverState *bs)
 static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
                                         int64_t sector_num, QEMUIOVector *qiov,
                                         int nb_sectors,
-                                         BlockDriverCompletionFunc *cb,
+                                         BlockCompletionFunc *cb,
                                         void *opaque)
 {
    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque);
@@ -191,7 +179,6 @@ static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
    acb->qiov = qiov;
    acb->buf = NULL;
    acb->verify = NULL;
-    acb->finished = NULL;
    return acb;
 }

@@ -205,10 +192,7 @@ static void blkverify_aio_bh(void *opaque)
        qemu_vfree(acb->buf);
    }
    acb->common.cb(acb->common.opaque, acb->ret);
-    if (acb->finished) {
-        *acb->finished = true;
-    }
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

 static void blkverify_aio_cb(void *opaque, int ret)
@@ -245,9 +229,9 @@ static void blkverify_verify_readv(BlkverifyAIOCB *acb)
    }
 }

-static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    BDRVBlkverifyState *s = bs->opaque;
    BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov,
@@ -265,9 +249,9 @@ static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs,
    return &acb->common;
 }

-static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs,
+static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    BDRVBlkverifyState *s = bs->opaque;
    BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov,
@@ -280,9 +264,9 @@ static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs,
    return &acb->common;
 }

-static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs,
-                                             BlockDriverCompletionFunc *cb,
-                                             void *opaque)
+static BlockAIOCB *blkverify_aio_flush(BlockDriverState *bs,
+                                       BlockCompletionFunc *cb,
+                                       void *opaque)
 {
    BDRVBlkverifyState *s = bs->opaque;

@@ -320,6 +304,32 @@ static void blkverify_attach_aio_context(BlockDriverState *bs,
    bdrv_attach_aio_context(s->test_file, new_context);
 }

+/*
+ * Rebuild bs->full_open_options and bs->exact_filename for a blkverify
+ * node from its raw image (bs->file) and its test image (s->test_file).
+ *
+ * Each of the two is only produced when both children provide the
+ * corresponding information.  The plain filename has the form
+ * "blkverify:<raw>:<test>"; if that does not fit into bs->exact_filename,
+ * no plain filename is reported at all.
+ *
+ * NOTE(review): presumably the caller has cleared bs->full_open_options
+ * and bs->exact_filename before invoking this hook -- confirm against
+ * bdrv_refresh_filename().
+ */
+static void blkverify_refresh_filename(BlockDriverState *bs)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    /* bs->file has already been refreshed */
+    bdrv_refresh_filename(s->test_file);
+
+    if (bs->file->full_open_options && s->test_file->full_open_options) {
+        QDict *opts = qdict_new();
+        qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkverify")));
+
+        /* Take an extra reference on each child's options before storing
+         * them in opts */
+        QINCREF(bs->file->full_open_options);
+        qdict_put_obj(opts, "raw", QOBJECT(bs->file->full_open_options));
+        QINCREF(s->test_file->full_open_options);
+        qdict_put_obj(opts, "test", QOBJECT(s->test_file->full_open_options));
+
+        bs->full_open_options = opts;
+    }
+
+    if (bs->file->exact_filename[0] && s->test_file->exact_filename[0]) {
+        int len = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                           "blkverify:%s:%s",
+                           bs->file->exact_filename,
+                           s->test_file->exact_filename);
+
+        if (len < 0 || (size_t)len >= sizeof(bs->exact_filename)) {
+            /* A truncated name would refer to the wrong images, which is
+             * worse than reporting no plain filename */
+            bs->exact_filename[0] = '\0';
+        }
+    }
+}
+
 static BlockDriver bdrv_blkverify = {
    .format_name                      = "blkverify",
    .protocol_name                    = "blkverify",
@@ -329,6 +339,7 @@ static BlockDriver bdrv_blkverify = {
    .bdrv_file_open                   = blkverify_open,
    .bdrv_close                       = blkverify_close,
    .bdrv_getlength                   = blkverify_getlength,
+    .bdrv_refresh_filename            = blkverify_refresh_filename,

    .bdrv_aio_readv                   = blkverify_aio_readv,
    .bdrv_aio_writev                  = blkverify_aio_writev,
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -0,0 +1,631 @@
+/*
+ * QEMU Block backends
+ *
+ * Copyright (C) 2014 Red Hat, Inc.
+ *
+ * Authors:
+ *  Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1
+ * or later.  See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "sysemu/block-backend.h"
+#include "block/block_int.h"
+#include "sysemu/blockdev.h"
+#include "qapi-event.h"
+
+/* Number of coroutines to reserve per attached device model */
+#define COROUTINE_POOL_RESERVATION 64
+
+/*
+ * A named, reference-counted handle that ties a BlockDriverState to the
+ * device model using it.
+ */
+struct BlockBackend {
+    /* Copied by blk_new(); emptied (not freed) when the backend is hidden */
+    char *name;
+    /* Starts at one in blk_new(); blk_unref() destroys @blk at zero */
+    int refcnt;
+    /* Attached BlockDriverState, or null; its ->blk points back here */
+    BlockDriverState *bs;
+    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
+    QTAILQ_ENTRY(BlockBackend) link; /* for blk_backends */
+
+    void *dev;                  /* attached device model, if any */
+    /* TODO change to DeviceState when all users are qdevified */
+    /* Device model callbacks and their argument, see blk_set_dev_ops() */
+    const BlockDevOps *dev_ops;
+    void *dev_opaque;
+};
+
+static void drive_info_del(DriveInfo *dinfo);
+
+/* All the BlockBackends (except for hidden ones) */
+static QTAILQ_HEAD(, BlockBackend) blk_backends =
+    QTAILQ_HEAD_INITIALIZER(blk_backends);
+
+/*
+ * Allocate a BlockBackend called @name and append it to blk_backends.
+ * The new backend has a reference count of one and no BlockDriverState.
+ * @name must be a non-empty string (asserted).
+ * Fails if @name is not a well-formed ID, or if a BlockBackend or a block
+ * node with this name already exists.
+ * On failure, store an error through @errp (unless it's null) and return
+ * null.  On success, return the new BlockBackend.
+ */
+BlockBackend *blk_new(const char *name, Error **errp)
+{
+    BlockBackend *backend;
+
+    assert(name != NULL && name[0] != '\0');
+
+    if (!id_wellformed(name)) {
+        error_setg(errp, "Invalid device name");
+        return NULL;
+    }
+
+    if (blk_by_name(name) != NULL) {
+        error_setg(errp, "Device with id '%s' already exists", name);
+        return NULL;
+    }
+
+    /* Backend names and node names must not collide either */
+    if (bdrv_find_node(name) != NULL) {
+        error_setg(errp,
+                   "Device name '%s' conflicts with an existing node name",
+                   name);
+        return NULL;
+    }
+
+    backend = g_new0(BlockBackend, 1);
+    backend->refcnt = 1;
+    backend->name = g_strdup(name);
+    QTAILQ_INSERT_TAIL(&blk_backends, backend, link);
+
+    return backend;
+}
+
+/*
+ * Like blk_new(), but also create a BlockDriverState with bdrv_new_root()
+ * and link it with the new BlockBackend in both directions.
+ * Return null, with the error stored by blk_new(), if blk_new() fails.
+ */
+BlockBackend *blk_new_with_bs(const char *name, Error **errp)
+{
+    BlockBackend *backend = blk_new(name, errp);
+
+    if (backend == NULL) {
+        return NULL;
+    }
+
+    backend->bs = bdrv_new_root();
+    backend->bs->blk = backend;
+
+    return backend;
+}
+
+/*
+ * Destroy @blk.
+ * Its reference count must be zero and no device model may be attached.
+ * Drops the reference to the attached BlockDriverState (if any), unlinks
+ * @blk from blk_backends unless it has been hidden, and frees the name,
+ * the legacy DriveInfo and @blk itself.
+ */
+static void blk_delete(BlockBackend *blk)
+{
+    BlockDriverState *bs;
+
+    assert(blk->refcnt == 0);
+    assert(blk->dev == NULL);
+
+    bs = blk->bs;
+    if (bs != NULL) {
+        assert(bs->blk == blk);
+        bs->blk = NULL;
+        bdrv_unref(bs);
+        blk->bs = NULL;
+    }
+
+    /* Avoid double-remove after blk_hide_on_behalf_of_do_drive_del() */
+    if (blk->name[0] != '\0') {
+        QTAILQ_REMOVE(&blk_backends, blk, link);
+    }
+
+    g_free(blk->name);
+    drive_info_del(blk->legacy_dinfo);
+    g_free(blk);
+}
+
+/*
+ * Free @dinfo together with its options and serial string.
+ * Do nothing if @dinfo is null.
+ */
+static void drive_info_del(DriveInfo *dinfo)
+{
+    if (dinfo != NULL) {
+        qemu_opts_del(dinfo->opts);
+        g_free(dinfo->serial);
+        g_free(dinfo);
+    }
+}
+
+/*
+ * Increment @blk's reference count.
+ * @blk must not be null.
+ * Every call has to be balanced by a call to blk_unref().
+ */
+void blk_ref(BlockBackend *blk)
+{
+    blk->refcnt++;
+}
+
+/*
+ * Drop one reference to @blk and destroy it when that was the last one.
+ * The reference count must be positive on entry (asserted).
+ * A null @blk is accepted and ignored for the caller's convenience.
+ */
+void blk_unref(BlockBackend *blk)
+{
+    if (blk == NULL) {
+        return;
+    }
+
+    assert(blk->refcnt > 0);
+    blk->refcnt--;
+    if (blk->refcnt == 0) {
+        blk_delete(blk);
+    }
+}
+
+/*
+ * Step through the list of BlockBackends.
+ * With a null @blk, return the first BlockBackend (null if there is none).
+ * Otherwise return the one following @blk, or null at the end of the list.
+ *
+ * Typical use:
+ * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
+ *     ...
+ * }
+ */
+BlockBackend *blk_next(BlockBackend *blk)
+{
+    if (blk == NULL) {
+        return QTAILQ_FIRST(&blk_backends);
+    }
+    return QTAILQ_NEXT(blk, link);
+}
+
+/*
+ * Return @blk's name, a non-null string.
+ * The returned pointer is @blk's own storage, not a copy; the caller must
+ * not free it.
+ * Wart: the name is empty iff @blk has been hidden with
+ * blk_hide_on_behalf_of_do_drive_del().
+ */
+const char *blk_name(BlockBackend *blk)
+{
+    return blk->name;
+}
+
+/*
+ * Look up a BlockBackend by its exact name.
+ * @name must not be null (asserted).
+ * Return the first BlockBackend in blk_backends named @name, or null if
+ * there is none.
+ */
+BlockBackend *blk_by_name(const char *name)
+{
+    BlockBackend *candidate;
+
+    assert(name != NULL);
+
+    for (candidate = QTAILQ_FIRST(&blk_backends);
+         candidate != NULL;
+         candidate = QTAILQ_NEXT(candidate, link)) {
+        if (strcmp(name, candidate->name) == 0) {
+            return candidate;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Return the BlockDriverState attached to @blk if any, else null.
+ * No reference is taken on behalf of the caller.
+ */
+BlockDriverState *blk_bs(BlockBackend *blk)
+{
+    return blk->bs;
+}
+
+/*
+ * Return @blk's DriveInfo if any, else null.
+ * The DriveInfo stays owned by @blk; blk_delete() frees it.
+ */
+DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
+{
+    return blk->legacy_dinfo;
+}
+
+/*
+ * Install @dinfo as @blk's DriveInfo and hand it back to the caller.
+ * @blk must not have a DriveInfo yet (asserted).
+ * The same DriveInfo must not be set on any other BlockBackend.
+ */
+DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
+{
+    assert(blk->legacy_dinfo == NULL);
+
+    blk->legacy_dinfo = dinfo;
+    return dinfo;
+}
+
+/*
+ * Find the BlockBackend whose DriveInfo is @dinfo.
+ * Such a BlockBackend must exist in blk_backends; the function aborts
+ * otherwise.
+ */
+BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
+{
+    BlockBackend *candidate;
+
+    for (candidate = QTAILQ_FIRST(&blk_backends);
+         candidate != NULL;
+         candidate = QTAILQ_NEXT(candidate, link)) {
+        if (candidate->legacy_dinfo == dinfo) {
+            return candidate;
+        }
+    }
+
+    /* The caller broke the contract: nobody owns @dinfo */
+    abort();
+}
+
+/*
+ * Hide @blk: unlink it from blk_backends and empty its name.
+ * @blk must not have been hidden already.
+ * The attached BlockDriverState, if any, is made anonymous.
+ * A hidden @blk can no longer be found by functions that do not get it
+ * passed as an argument; blk_by_name(), for one, won't return it.
+ * Strictly for use by do_drive_del().
+ * TODO get rid of it!
+ */
+void blk_hide_on_behalf_of_do_drive_del(BlockBackend *blk)
+{
+    BlockDriverState *bs = blk->bs;
+
+    QTAILQ_REMOVE(&blk_backends, blk, link);
+    /* The empty name tells blk_delete() not to unlink @blk again */
+    blk->name[0] = '\0';
+
+    if (bs != NULL) {
+        bdrv_make_anon(bs);
+    }
+}
+
+/*
+ * Attach device model @dev to @blk.
+ * On success, @blk gains a reference that blk_detach_dev() gives back,
+ * the I/O status of its BlockDriverState is reset, and the coroutine pool
+ * grows by COROUTINE_POOL_RESERVATION.
+ * Return 0 on success, -EBUSY when a device model is attached already.
+ */
+int blk_attach_dev(BlockBackend *blk, void *dev)
+/* TODO change to DeviceState *dev when all users are qdevified */
+{
+    if (blk->dev != NULL) {
+        return -EBUSY;
+    }
+
+    blk->dev = dev;
+    blk_ref(blk);
+    bdrv_iostatus_reset(blk->bs);
+
+    /* We're expecting I/O from the device so bump up coroutine pool size */
+    qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION);
+
+    return 0;
+}
+
+/*
+ * Attach device model @dev to @blk, aborting if that fails.
+ * @blk must not have a device model attached already.
+ * TODO qdevified devices don't use this, remove when devices are qdevified
+ */
+void blk_attach_dev_nofail(BlockBackend *blk, void *dev)
+{
+    int rc = blk_attach_dev(blk, dev);
+
+    if (rc < 0) {
+        abort();
+    }
+}
+
+/*
+ * Detach device model @dev from @blk.
+ * @dev must be currently attached to @blk (asserted).
+ * Clears the device callbacks, sets the guest block size back to 512, and
+ * gives back the coroutine pool reservation and the reference taken by
+ * blk_attach_dev().
+ */
+void blk_detach_dev(BlockBackend *blk, void *dev)
+/* TODO change to DeviceState *dev when all users are qdevified */
+{
+    assert(blk->dev == dev);
+    blk->dev = NULL;
+    blk->dev_ops = NULL;
+    blk->dev_opaque = NULL;
+    bdrv_set_guest_block_size(blk->bs, 512);
+    qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION);
+    /* Must stay last: this may drop the final reference and free @blk */
+    blk_unref(blk);
+}
+
+/*
+ * Return the device model attached to @blk if any, else null.
+ * This is the pointer that was passed to blk_attach_dev().
+ */
+void *blk_get_attached_dev(BlockBackend *blk)
+/* TODO change to return DeviceState * when all users are qdevified */
+{
+    return blk->dev;
+}
+
+/*
+ * Set @blk's device model callbacks to @ops.
+ * @opaque is the opaque argument to pass to the callbacks.
+ * This is for use by device models.
+ * Both values replace whatever was set before; blk_detach_dev() resets
+ * them to null.
+ */
+void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
+                     void *opaque)
+{
+    blk->dev_ops = ops;
+    blk->dev_opaque = opaque;
+}
+
+/*
+ * Tell @blk's attached device model that the medium changed.
+ * @load is true for a media load, false for a media eject.
+ * Does nothing unless the device model has a change_media_cb callback.
+ * Also sends DEVICE_TRAY_MOVED events: a "tray open" event if the tray
+ * was closed before the callback ran, then a "tray close" event if @load
+ * is true.
+ */
+void blk_dev_change_media_cb(BlockBackend *blk, bool load)
+{
+    bool tray_was_closed;
+
+    if (!blk->dev_ops || !blk->dev_ops->change_media_cb) {
+        return;
+    }
+
+    /* Sample the tray state before the callback can change it */
+    tray_was_closed = !blk_dev_is_tray_open(blk);
+    blk->dev_ops->change_media_cb(blk->dev_opaque, load);
+
+    if (tray_was_closed) {
+        /* tray open */
+        qapi_event_send_device_tray_moved(blk_name(blk), true, &error_abort);
+    }
+    if (load) {
+        /* tray close */
+        qapi_event_send_device_tray_moved(blk_name(blk), false, &error_abort);
+    }
+}
+
+/*
+ * Tell whether @blk's attached device model has removable media, i.e.
+ * whether it provides a change_media_cb callback.
+ * %true if no device model is attached.
+ */
+bool blk_dev_has_removable_media(BlockBackend *blk)
+{
+    if (!blk->dev) {
+        return true;
+    }
+    return blk->dev_ops && blk->dev_ops->change_media_cb;
+}
+
+/*
+ * Pass a media eject request on to @blk's attached device model.
+ * If @force is true, the medium is about to be yanked out forcefully.
+ * Does nothing unless the device model has an eject_request_cb callback.
+ */
+void blk_dev_eject_request(BlockBackend *blk, bool force)
+{
+    if (!blk->dev_ops || !blk->dev_ops->eject_request_cb) {
+        return;
+    }
+    blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
+}
+
+/*
+ * Ask @blk's attached device model whether its tray is open.
+ * %false if the device model has no is_tray_open callback.
+ */
+bool blk_dev_is_tray_open(BlockBackend *blk)
+{
+    if (!blk->dev_ops || !blk->dev_ops->is_tray_open) {
+        return false;
+    }
+    return blk->dev_ops->is_tray_open(blk->dev_opaque);
+}
+
+/*
+ * Ask @blk's attached device model whether it has the medium locked.
+ * %false if the device model has no is_medium_locked callback.
+ */
+bool blk_dev_is_medium_locked(BlockBackend *blk)
+{
+    if (!blk->dev_ops || !blk->dev_ops->is_medium_locked) {
+        return false;
+    }
+    return blk->dev_ops->is_medium_locked(blk->dev_opaque);
+}
+
+/*
+ * Tell @blk's attached device model that the backend changed size.
+ * Does nothing unless the device model has a resize_cb callback.
+ */
+void blk_dev_resize_cb(BlockBackend *blk)
+{
+    if (!blk->dev_ops || !blk->dev_ops->resize_cb) {
+        return;
+    }
+    blk->dev_ops->resize_cb(blk->dev_opaque);
+}
+
+/*
+ * Call bdrv_iostatus_enable() on the BlockDriverState attached to @blk.
+ */
+void blk_iostatus_enable(BlockBackend *blk)
+{
+    bdrv_iostatus_enable(blk->bs);
+}
+
+/*
+ * Forward to bdrv_read() on the BlockDriverState attached to @blk.
+ * All other arguments and the return value are passed through unchanged.
+ */
+int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
+             int nb_sectors)
+{
+    return bdrv_read(blk->bs, sector_num, buf, nb_sectors);
+}
+
+/*
+ * Forward to bdrv_read_unthrottled() on the BlockDriverState attached to
+ * @blk.  All other arguments and the return value are passed through
+ * unchanged.
+ */
+int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
+                         int nb_sectors)
+{
+    return bdrv_read_unthrottled(blk->bs, sector_num, buf, nb_sectors);
+}
+
+/*
+ * Forward to bdrv_write() on the BlockDriverState attached to @blk.
+ * All other arguments and the return value are passed through unchanged.
+ */
+int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
+              int nb_sectors)
+{
+    return bdrv_write(blk->bs, sector_num, buf, nb_sectors);
+}
+
+/*
+ * Forward to bdrv_aio_write_zeroes() on the BlockDriverState attached to
+ * @blk.  @cb and @opaque are handed on as the completion callback and its
+ * argument; the AIOCB returned by the callee is returned as is.
+ */
+BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
+                                 int nb_sectors, BdrvRequestFlags flags,
+                                 BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_write_zeroes(blk->bs, sector_num, nb_sectors, flags,
+                                 cb, opaque);
+}
+
+/*
+ * Forward to bdrv_pread() on the BlockDriverState attached to @blk.
+ * All other arguments and the return value are passed through unchanged.
+ */
+int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
+{
+    return bdrv_pread(blk->bs, offset, buf, count);
+}
+
+/*
+ * Forward to bdrv_pwrite() on the BlockDriverState attached to @blk.
+ * All other arguments and the return value are passed through unchanged.
+ */
+int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count)
+{
+    return bdrv_pwrite(blk->bs, offset, buf, count);
+}
+
+/*
+ * Return the result of bdrv_getlength() for the BlockDriverState attached
+ * to @blk.
+ */
+int64_t blk_getlength(BlockBackend *blk)
+{
+    return bdrv_getlength(blk->bs);
+}
+
+/*
+ * Forward to bdrv_get_geometry() on the BlockDriverState attached to
+ * @blk; the result is stored through @nb_sectors_ptr by the callee.
+ */
+void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
+{
+    bdrv_get_geometry(blk->bs, nb_sectors_ptr);
+}
+
+/*
+ * Forward to bdrv_aio_readv() on the BlockDriverState attached to @blk.
+ * @cb and @opaque are handed on as the completion callback and its
+ * argument; the AIOCB returned by the callee is returned as is.
+ */
+BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
+                          QEMUIOVector *iov, int nb_sectors,
+                          BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_readv(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
+}
+
+/*
+ * Forward to bdrv_aio_writev() on the BlockDriverState attached to @blk.
+ * @cb and @opaque are handed on as the completion callback and its
+ * argument; the AIOCB returned by the callee is returned as is.
+ */
+BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
+                           QEMUIOVector *iov, int nb_sectors,
+                           BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_writev(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
+}
+
+/*
+ * Forward to bdrv_aio_flush() on the BlockDriverState attached to @blk.
+ * @cb and @opaque are handed on as the completion callback and its
+ * argument; the AIOCB returned by the callee is returned as is.
+ */
+BlockAIOCB *blk_aio_flush(BlockBackend *blk,
+                          BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_flush(blk->bs, cb, opaque);
+}
+
+/*
+ * Forward to bdrv_aio_discard() on the BlockDriverState attached to @blk.
+ * @cb and @opaque are handed on as the completion callback and its
+ * argument; the AIOCB returned by the callee is returned as is.
+ */
+BlockAIOCB *blk_aio_discard(BlockBackend *blk,
+                            int64_t sector_num, int nb_sectors,
+                            BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_discard(blk->bs, sector_num, nb_sectors, cb, opaque);
+}
+
+/*
+ * Forward @acb to bdrv_aio_cancel().
+ * Operates on the request alone; no BlockBackend is involved.
+ */
+void blk_aio_cancel(BlockAIOCB *acb)
+{
+    bdrv_aio_cancel(acb);
+}
+
+/*
+ * Forward @acb to bdrv_aio_cancel_async().
+ * Operates on the request alone; no BlockBackend is involved.
+ */
+void blk_aio_cancel_async(BlockAIOCB *acb)
+{
+    bdrv_aio_cancel_async(acb);
+}
+
+/*
+ * Forward to bdrv_aio_multiwrite() on the BlockDriverState attached to
+ * @blk.  @reqs, @num_reqs and the return value are passed through
+ * unchanged.
+ */
+int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)
+{
+    return bdrv_aio_multiwrite(blk->bs, reqs, num_reqs);
+}
+
+/*
+ * Forward to bdrv_ioctl() on the BlockDriverState attached to @blk.
+ * @req, @buf and the return value are passed through unchanged.
+ */
+int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
+{
+    return bdrv_ioctl(blk->bs, req, buf);
+}
+
+/*
+ * Forward to bdrv_aio_ioctl() on the BlockDriverState attached to @blk.
+ * @cb and @opaque are handed on as the completion callback and its
+ * argument; the AIOCB returned by the callee is returned as is.
+ */
+BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
+                          BlockCompletionFunc *cb, void *opaque)
+{
+    return bdrv_aio_ioctl(blk->bs, req, buf, cb, opaque);
+}
+
+/*
+ * Return the result of bdrv_flush() for the BlockDriverState attached to
+ * @blk.
+ */
+int blk_flush(BlockBackend *blk)
+{
+    return bdrv_flush(blk->bs);
+}
+
+/*
+ * Return the result of bdrv_flush_all().
+ * Takes no BlockBackend: the call is not limited to a single backend.
+ */
+int blk_flush_all(void)
+{
+    return bdrv_flush_all();
+}
+
+/*
+ * Call bdrv_drain_all().
+ * Takes no BlockBackend: the call is not limited to a single backend.
+ */
+void blk_drain_all(void)
+{
+    bdrv_drain_all();
+}
+
+/*
+ * Return the result of bdrv_get_on_error() for the BlockDriverState
+ * attached to @blk; @is_read is passed through unchanged.
+ */
+BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
+{
+    return bdrv_get_on_error(blk->bs, is_read);
+}
+
+/*
+ * Return the result of bdrv_get_error_action() for the BlockDriverState
+ * attached to @blk; @is_read and @error are passed through unchanged.
+ */
+BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
+                                      int error)
+{
+    return bdrv_get_error_action(blk->bs, is_read, error);
+}
+
+/*
+ * Forward to bdrv_error_action() on the BlockDriverState attached to
+ * @blk; @action, @is_read and @error are passed through unchanged.
+ */
+void blk_error_action(BlockBackend *blk, BlockErrorAction action,
+                      bool is_read, int error)
+{
+    bdrv_error_action(blk->bs, action, is_read, error);
+}
+
+/*
+ * Return the result of bdrv_is_read_only() for the BlockDriverState
+ * attached to @blk.
+ */
+int blk_is_read_only(BlockBackend *blk)
+{
+    return bdrv_is_read_only(blk->bs);
+}
+
+/*
+ * Return the result of bdrv_is_sg() for the BlockDriverState attached to
+ * @blk.
+ */
+int blk_is_sg(BlockBackend *blk)
+{
+    return bdrv_is_sg(blk->bs);
+}
+
+/*
+ * Return the result of bdrv_enable_write_cache() for the BlockDriverState
+ * attached to @blk.
+ */
+int blk_enable_write_cache(BlockBackend *blk)
+{
+    return bdrv_enable_write_cache(blk->bs);
+}
+
+/*
+ * Forward to bdrv_set_enable_write_cache() on the BlockDriverState
+ * attached to @blk; @wce is passed through unchanged.
+ */
+void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
+{
+    bdrv_set_enable_write_cache(blk->bs, wce);
+}
+
+/*
+ * Return the result of bdrv_is_inserted() for the BlockDriverState
+ * attached to @blk.
+ */
+int blk_is_inserted(BlockBackend *blk)
+{
+    return bdrv_is_inserted(blk->bs);
+}
+
+/*
+ * Forward to bdrv_lock_medium() on the BlockDriverState attached to @blk;
+ * @locked is passed through unchanged.
+ */
+void blk_lock_medium(BlockBackend *blk, bool locked)
+{
+    bdrv_lock_medium(blk->bs, locked);
+}
+
+/*
+ * Forward to bdrv_eject() on the BlockDriverState attached to @blk;
+ * @eject_flag is passed through unchanged.
+ */
+void blk_eject(BlockBackend *blk, bool eject_flag)
+{
+    bdrv_eject(blk->bs, eject_flag);
+}
+
+/*
+ * Return the result of bdrv_get_flags() for the BlockDriverState attached
+ * to @blk.
+ */
+int blk_get_flags(BlockBackend *blk)
+{
+    return bdrv_get_flags(blk->bs);
+}
+
+/*
+ * Forward to bdrv_set_guest_block_size() on the BlockDriverState attached
+ * to @blk; @align is passed through unchanged.
+ */
+void blk_set_guest_block_size(BlockBackend *blk, int align)
+{
+    bdrv_set_guest_block_size(blk->bs, align);
+}
+
+/*
+ * Return the result of qemu_blockalign() for @size bytes and the
+ * BlockDriverState attached to @blk.
+ * Unlike the other wrappers in this file, @blk may be null;
+ * qemu_blockalign() is then called with a null BlockDriverState.
+ */
+void *blk_blockalign(BlockBackend *blk, size_t size)
+{
+    return qemu_blockalign(blk ? blk->bs : NULL, size);
+}
+
+/*
+ * Return the result of bdrv_op_is_blocked() for the BlockDriverState
+ * attached to @blk; @op and @errp are passed through unchanged.
+ */
+bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
+{
+    return bdrv_op_is_blocked(blk->bs, op, errp);
+}
+
+/*
+ * Forward to bdrv_op_unblock() on the BlockDriverState attached to @blk;
+ * @op and @reason are passed through unchanged.
+ */
+void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
+{
+    bdrv_op_unblock(blk->bs, op, reason);
+}
+
+/*
+ * Forward to bdrv_op_block_all() on the BlockDriverState attached to
+ * @blk; @reason is passed through unchanged.
+ */
+void blk_op_block_all(BlockBackend *blk, Error *reason)
+{
+    bdrv_op_block_all(blk->bs, reason);
+}
+
+/*
+ * Forward to bdrv_op_unblock_all() on the BlockDriverState attached to
+ * @blk; @reason is passed through unchanged.
+ */
+void blk_op_unblock_all(BlockBackend *blk, Error *reason)
+{
+    bdrv_op_unblock_all(blk->bs, reason);
+}
+
+/*
+ * Return the result of bdrv_get_aio_context() for the BlockDriverState
+ * attached to @blk.
+ */
+AioContext *blk_get_aio_context(BlockBackend *blk)
+{
+    return bdrv_get_aio_context(blk->bs);
+}
+
+/*
+ * Forward to bdrv_set_aio_context() on the BlockDriverState attached to
+ * @blk; @new_context is passed through unchanged.
+ */
+void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
+{
+    bdrv_set_aio_context(blk->bs, new_context);
+}
+
+/*
+ * Call bdrv_io_plug() on the BlockDriverState attached to @blk.
+ */
+void blk_io_plug(BlockBackend *blk)
+{
+    bdrv_io_plug(blk->bs);
+}
+
+/*
+ * Call bdrv_io_unplug() on the BlockDriverState attached to @blk.
+ */
+void blk_io_unplug(BlockBackend *blk)
+{
+    bdrv_io_unplug(blk->bs);
+}
+
+/*
+ * Return the result of bdrv_get_stats() for the BlockDriverState attached
+ * to @blk.
+ */
+BlockAcctStats *blk_get_stats(BlockBackend *blk)
+{
+    return bdrv_get_stats(blk->bs);
+}
+
+/*
+ * Return the result of qemu_aio_get() for the BlockDriverState attached
+ * to @blk; @aiocb_info, @cb and @opaque are passed through unchanged.
+ */
+void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
+                  BlockCompletionFunc *cb, void *opaque)
+{
+    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
+}
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -131,7 +131,11 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
        return -EFBIG;
    }

-    s->catalog_bitmap = g_malloc(s->catalog_size * 4);
+    s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size);
+    if (s->catalog_size && s->catalog_bitmap == NULL) {
+        error_setg(errp, "Could not allocate memory for catalog");
+        return -ENOMEM;
+    }

    ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
                     s->catalog_size * 4);
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -116,7 +116,12 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
                   "try increasing block size");
        return -EINVAL;
    }
-    s->offsets = g_malloc(offsets_size);
+
+    s->offsets = g_try_malloc(offsets_size);
+    if (s->offsets == NULL) {
+        error_setg(errp, "Could not allocate offsets table");
+        return -ENOMEM;
+    }

    ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
    if (ret < 0) {
@@ -158,8 +163,20 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* initialize zlib engine */
-    s->compressed_block = g_malloc(max_compressed_block_size + 1);
-    s->uncompressed_block = g_malloc(s->block_size);
+    s->compressed_block = g_try_malloc(max_compressed_block_size + 1);
+    if (s->compressed_block == NULL) {
+        error_setg(errp, "Could not allocate compressed_block");
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    s->uncompressed_block = g_try_malloc(s->block_size);
+    if (s->uncompressed_block == NULL) {
+        error_setg(errp, "Could not allocate uncompressed_block");
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    if (inflateInit(&s->zstream) != Z_OK) {
        ret = -EINVAL;
        goto fail;
--- a/block/commit.c
+++ b/block/commit.c
@@ -60,17 +60,50 @@ static int coroutine_fn commit_populate(BlockDriverState *bs,
    return 0;
 }

-static void coroutine_fn commit_run(void *opaque)
+/*
+ * Payload handed from commit_run() to commit_complete(): carries the
+ * coroutine's final return value.
+ */
+typedef struct {
+    int ret;
+} CommitCompleteData;
+
+/*
+ * Completion callback of the commit job.  commit_run() passes it to
+ * block_job_defer_to_main_loop() together with a g_malloc()ed
+ * CommitCompleteData holding the coroutine's result.
+ *
+ * If the job was not cancelled and data->ret is 0, the intermediate images
+ * are dropped with bdrv_drop_intermediate() and its return value becomes the
+ * job result.  Afterwards the saved open flags of the base and of the
+ * overlay are restored where they differ, s->backing_file_str is freed, the
+ * job is reported through block_job_completed(), and @opaque is freed.
+ */
+static void commit_complete(BlockJob *job, void *opaque)
 {
-    CommitBlockJob *s = opaque;
+    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
+    CommitCompleteData *data = opaque;
    BlockDriverState *active = s->active;
    BlockDriverState *top = s->top;
    BlockDriverState *base = s->base;
    BlockDriverState *overlay_bs;
+    int ret = data->ret;
+
+    if (!block_job_is_cancelled(&s->common) && ret == 0) {
+        /* success */
+        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
+    }
+
+    /* restore base open flags here if appropriate (e.g., change the base back
+     * to r/o). These reopens do not need to be atomic, since we won't abort
+     * even on failure here */
+    if (s->base_flags != bdrv_get_flags(base)) {
+        bdrv_reopen(base, s->base_flags, NULL);
+    }
+    overlay_bs = bdrv_find_overlay(active, top);
+    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
+    }
+    g_free(s->backing_file_str);
+    block_job_completed(&s->common, ret);
+    g_free(data);
+}
+
+static void coroutine_fn commit_run(void *opaque)
+{
+    CommitBlockJob *s = opaque;
+    CommitCompleteData *data;
+    BlockDriverState *top = s->top;
+    BlockDriverState *base = s->base;
    int64_t sector_num, end;
    int ret = 0;
    int n = 0;
-    void *buf;
+    void *buf = NULL;
    int bytes_written = 0;
    int64_t base_len;

@@ -78,18 +111,18 @@ static void coroutine_fn commit_run(void *opaque)


    if (s->common.len < 0) {
-        goto exit_restore_reopen;
+        goto out;
    }

    ret = base_len = bdrv_getlength(base);
    if (base_len < 0) {
-        goto exit_restore_reopen;
+        goto out;
    }

    if (base_len < s->common.len) {
        ret = bdrv_truncate(base, s->common.len);
        if (ret) {
-            goto exit_restore_reopen;
+            goto out;
        }
    }

@@ -128,7 +161,7 @@ wait:
            if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
                s->on_error == BLOCKDEV_ON_ERROR_REPORT||
                (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
-                goto exit_free_buf;
+                goto out;
            } else {
                n = 0;
                continue;
@@ -140,27 +173,12 @@ wait:

    ret = 0;

-    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
-        /* success */
-        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
-    }
-
-exit_free_buf:
+out:
    qemu_vfree(buf);

-exit_restore_reopen:
-    /* restore base open flags here if appropriate (e.g., change the base back
-     * to r/o). These reopens do not need to be atomic, since we won't abort
-     * even on failure here */
-    if (s->base_flags != bdrv_get_flags(base)) {
-        bdrv_reopen(base, s->base_flags, NULL);
-    }
-    overlay_bs = bdrv_find_overlay(active, top);
-    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
-        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
-    }
-    g_free(s->backing_file_str);
-    block_job_completed(&s->common, ret);
+    data = g_malloc(sizeof(*data));
+    data->ret = ret;
+    block_job_defer_to_main_loop(&s->common, commit_complete, data);
 }

 static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -182,7 +200,7 @@ static const BlockJobDriver commit_job_driver = {

 void commit_start(BlockDriverState *bs, BlockDriverState *base,
                  BlockDriverState *top, int64_t speed,
-                  BlockdevOnError on_error, BlockDriverCompletionFunc *cb,
+                  BlockdevOnError on_error, BlockCompletionFunc *cb,
                  void *opaque, const char *backing_file_str, Error **errp)
 {
    CommitBlockJob *s;
--- a/block/cow.c
+++ b/block/cow.c
@@ -1,432 +0,0 @@
-/*
- * Block driver for the COW format
- *
- * Copyright (c) 2004 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include "qemu-common.h"
-#include "block/block_int.h"
-#include "qemu/module.h"
-
-/**************************************************************/
-/* COW block driver using file system holes */
-
-/* user mode linux compatible COW file */
-#define COW_MAGIC 0x4f4f4f4d  /* MOOO */
-#define COW_VERSION 2
-
-struct cow_header_v2 {
-    uint32_t magic;
-    uint32_t version;
-    char backing_file[1024];
-    int32_t mtime;
-    uint64_t size;
-    uint32_t sectorsize;
-};
-
-typedef struct BDRVCowState {
-    CoMutex lock;
-    int64_t cow_sectors_offset;
-} BDRVCowState;
-
-static int cow_probe(const uint8_t *buf, int buf_size, const char *filename)
-{
-    const struct cow_header_v2 *cow_header = (const void *)buf;
-
-    if (buf_size >= sizeof(struct cow_header_v2) &&
-        be32_to_cpu(cow_header->magic) == COW_MAGIC &&
-        be32_to_cpu(cow_header->version) == COW_VERSION)
-        return 100;
-    else
-        return 0;
-}
-
-static int cow_open(BlockDriverState *bs, QDict *options, int flags,
-                    Error **errp)
-{
-    BDRVCowState *s = bs->opaque;
-    struct cow_header_v2 cow_header;
-    int bitmap_size;
-    int64_t size;
-    int ret;
-
-    /* see if it is a cow image */
-    ret = bdrv_pread(bs->file, 0, &cow_header, sizeof(cow_header));
-    if (ret < 0) {
-        goto fail;
-    }
-
-    if (be32_to_cpu(cow_header.magic) != COW_MAGIC) {
-        error_setg(errp, "Image not in COW format");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    if (be32_to_cpu(cow_header.version) != COW_VERSION) {
-        char version[64];
-        snprintf(version, sizeof(version),
-               "COW version %" PRIu32, cow_header.version);
-        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-            bs->device_name, "cow", version);
-        ret = -ENOTSUP;
-        goto fail;
-    }
-
-    /* cow image found */
-    size = be64_to_cpu(cow_header.size);
-    bs->total_sectors = size / 512;
-
-    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
-            cow_header.backing_file);
-
-    bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header);
-    s->cow_sectors_offset = (bitmap_size + 511) & ~511;
-    qemu_co_mutex_init(&s->lock);
-    return 0;
- fail:
-    return ret;
-}
-
-static inline void cow_set_bits(uint8_t *bitmap, int start, int64_t nb_sectors)
-{
-    int64_t bitnum = start, last = start + nb_sectors;
-    while (bitnum < last) {
-        if ((bitnum & 7) == 0 && bitnum + 8 <= last) {
-            bitmap[bitnum / 8] = 0xFF;
-            bitnum += 8;
-            continue;
-        }
-        bitmap[bitnum/8] |= (1 << (bitnum % 8));
-        bitnum++;
-    }
-}
-
-#define BITS_PER_BITMAP_SECTOR (512 * 8)
-
-/* Cannot use bitmap.c on big-endian machines.  */
-static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap)
-{
-    return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0;
-}
-
-static int cow_find_streak(const uint8_t *bitmap, int value, int start, int nb_sectors)
-{
-    int streak_value = value ? 0xFF : 0;
-    int last = MIN(start + nb_sectors, BITS_PER_BITMAP_SECTOR);
-    int bitnum = start;
-    while (bitnum < last) {
-        if ((bitnum & 7) == 0 && bitmap[bitnum / 8] == streak_value) {
-            bitnum += 8;
-            continue;
-        }
-        if (cow_test_bit(bitnum, bitmap) == value) {
-            bitnum++;
-            continue;
-        }
-        break;
-    }
-    return MIN(bitnum, last) - start;
-}
-
-/* Return true if first block has been changed (ie. current version is
- * in COW file).  Set the number of continuous blocks for which that
- * is true. */
-static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *num_same)
-{
-    int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8;
-    uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE;
-    bool first = true;
-    int changed = 0, same = 0;
-
-    do {
-        int ret;
-        uint8_t bitmap[BDRV_SECTOR_SIZE];
-
-        bitnum &= BITS_PER_BITMAP_SECTOR - 1;
-        int sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum);
-
-        ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
-        if (ret < 0) {
-            return ret;
-        }
-
-        if (first) {
-            changed = cow_test_bit(bitnum, bitmap);
-            first = false;
-        }
-
-        same += cow_find_streak(bitmap, changed, bitnum, nb_sectors);
-
-        bitnum += sector_bits;
-        nb_sectors -= sector_bits;
-        offset += BDRV_SECTOR_SIZE;
-    } while (nb_sectors);
-
-    *num_same = same;
-    return changed;
-}
-
-static int64_t coroutine_fn cow_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *num_same)
-{
-    BDRVCowState *s = bs->opaque;
-    int ret = cow_co_is_allocated(bs, sector_num, nb_sectors, num_same);
-    int64_t offset = s->cow_sectors_offset + (sector_num << BDRV_SECTOR_BITS);
-    if (ret < 0) {
-        return ret;
-    }
-    return (ret ? BDRV_BLOCK_DATA : 0) | offset | BDRV_BLOCK_OFFSET_VALID;
-}
-
-static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
-        int nb_sectors)
-{
-    int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8;
-    uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE;
-    bool first = true;
-    int sector_bits;
-
-    for ( ; nb_sectors;
-            bitnum += sector_bits,
-            nb_sectors -= sector_bits,
-            offset += BDRV_SECTOR_SIZE) {
-        int ret, set;
-        uint8_t bitmap[BDRV_SECTOR_SIZE];
-
-        bitnum &= BITS_PER_BITMAP_SECTOR - 1;
-        sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum);
-
-        ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
-        if (ret < 0) {
-            return ret;
-        }
-
-        /* Skip over any already set bits */
-        set = cow_find_streak(bitmap, 1, bitnum, sector_bits);
-        bitnum += set;
-        sector_bits -= set;
-        nb_sectors -= set;
-        if (!sector_bits) {
-            continue;
-        }
-
-        if (first) {
-            ret = bdrv_flush(bs->file);
-            if (ret < 0) {
-                return ret;
-            }
-            first = false;
-        }
-
-        cow_set_bits(bitmap, bitnum, sector_bits);
-
-        ret = bdrv_pwrite(bs->file, offset, &bitmap, sizeof(bitmap));
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
-    return 0;
-}
-
-static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
-                                 uint8_t *buf, int nb_sectors)
-{
-    BDRVCowState *s = bs->opaque;
-    int ret, n;
-
-    while (nb_sectors > 0) {
-        ret = cow_co_is_allocated(bs, sector_num, nb_sectors, &n);
-        if (ret < 0) {
-            return ret;
-        }
-        if (ret) {
-            ret = bdrv_pread(bs->file,
-                        s->cow_sectors_offset + sector_num * 512,
-                        buf, n * 512);
-            if (ret < 0) {
-                return ret;
-            }
-        } else {
-            if (bs->backing_hd) {
-                /* read from the base image */
-                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
-                if (ret < 0) {
-                    return ret;
-                }
-            } else {
-                memset(buf, 0, n * 512);
-            }
-        }
-        nb_sectors -= n;
-        sector_num += n;
-        buf += n * 512;
-    }
-    return 0;
-}
-
-static coroutine_fn int cow_co_read(BlockDriverState *bs, int64_t sector_num,
-                                    uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVCowState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = cow_read(bs, sector_num, buf, nb_sectors);
-    qemu_co_mutex_unlock(&s->lock);
-    return ret;
-}
-
-static int cow_write(BlockDriverState *bs, int64_t sector_num,
-                     const uint8_t *buf, int nb_sectors)
-{
-    BDRVCowState *s = bs->opaque;
-    int ret;
-
-    ret = bdrv_pwrite(bs->file, s->cow_sectors_offset + sector_num * 512,
-                      buf, nb_sectors * 512);
-    if (ret < 0) {
-        return ret;
-    }
-
-    return cow_update_bitmap(bs, sector_num, nb_sectors);
-}
-
-static coroutine_fn int cow_co_write(BlockDriverState *bs, int64_t sector_num,
-                                     const uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVCowState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = cow_write(bs, sector_num, buf, nb_sectors);
-    qemu_co_mutex_unlock(&s->lock);
-    return ret;
-}
-
-static void cow_close(BlockDriverState *bs)
-{
-}
-
-static int cow_create(const char *filename, QemuOpts *opts, Error **errp)
-{
-    struct cow_header_v2 cow_header;
-    struct stat st;
-    int64_t image_sectors = 0;
-    char *image_filename = NULL;
-    Error *local_err = NULL;
-    int ret;
-    BlockDriverState *cow_bs = NULL;
-
-    /* Read out options */
-    image_sectors = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512;
-    image_filename = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
-
-    ret = bdrv_create_file(filename, opts, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
-        goto exit;
-    }
-
-    ret = bdrv_open(&cow_bs, filename, NULL, NULL,
-                    BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
-        goto exit;
-    }
-
-    memset(&cow_header, 0, sizeof(cow_header));
-    cow_header.magic = cpu_to_be32(COW_MAGIC);
-    cow_header.version = cpu_to_be32(COW_VERSION);
-    if (image_filename) {
-        /* Note: if no file, we put a dummy mtime */
-        cow_header.mtime = cpu_to_be32(0);
-
-        if (stat(image_filename, &st) != 0) {
-            goto mtime_fail;
-        }
-        cow_header.mtime = cpu_to_be32(st.st_mtime);
-    mtime_fail:
-        pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file),
-                image_filename);
-    }
-    cow_header.sectorsize = cpu_to_be32(512);
-    cow_header.size = cpu_to_be64(image_sectors * 512);
-    ret = bdrv_pwrite(cow_bs, 0, &cow_header, sizeof(cow_header));
-    if (ret < 0) {
-        goto exit;
-    }
-
-    /* resize to include at least all the bitmap */
-    ret = bdrv_truncate(cow_bs,
-        sizeof(cow_header) + ((image_sectors + 7) >> 3));
-    if (ret < 0) {
-        goto exit;
-    }
-
-exit:
-    g_free(image_filename);
-    if (cow_bs) {
-        bdrv_unref(cow_bs);
-    }
-    return ret;
-}
-
-static QemuOptsList cow_create_opts = {
-    .name = "cow-create-opts",
-    .head = QTAILQ_HEAD_INITIALIZER(cow_create_opts.head),
-    .desc = {
-        {
-            .name = BLOCK_OPT_SIZE,
-            .type = QEMU_OPT_SIZE,
-            .help = "Virtual disk size"
-        },
-        {
-            .name = BLOCK_OPT_BACKING_FILE,
-            .type = QEMU_OPT_STRING,
-            .help = "File name of a base image"
-        },
-        { /* end of list */ }
-    }
-};
-
-static BlockDriver bdrv_cow = {
-    .format_name    = "cow",
-    .instance_size  = sizeof(BDRVCowState),
-
-    .bdrv_probe     = cow_probe,
-    .bdrv_open      = cow_open,
-    .bdrv_close     = cow_close,
-    .bdrv_create    = cow_create,
-    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
-    .supports_backing       = true,
-
-    .bdrv_read              = cow_co_read,
-    .bdrv_write             = cow_co_write,
-    .bdrv_co_get_block_status   = cow_co_get_block_status,
-
-    .create_opts    = &cow_create_opts,
-};
-
-static void bdrv_cow_init(void)
-{
-    bdrv_register(&bdrv_cow);
-}
-
-block_init(bdrv_cow_init);
--- a/block/curl.c
+++ b/block/curl.c
@@ -26,7 +26,7 @@
 #include "qapi/qmp/qbool.h"
 #include <curl/curl.h>

-// #define DEBUG
+// #define DEBUG_CURL
 // #define DEBUG_VERBOSE

 #ifdef DEBUG_CURL
@@ -63,6 +63,8 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_NUM_ACB    8
 #define SECTOR_SIZE     512
 #define READ_AHEAD_DEFAULT (256 * 1024)
+#define CURL_TIMEOUT_DEFAULT 5
+#define CURL_TIMEOUT_MAX 10000

 #define FIND_RET_NONE   0
 #define FIND_RET_OK     1
@@ -71,11 +73,13 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_BLOCK_OPT_URL       "url"
 #define CURL_BLOCK_OPT_READAHEAD "readahead"
 #define CURL_BLOCK_OPT_SSLVERIFY "sslverify"
+#define CURL_BLOCK_OPT_TIMEOUT "timeout"
+#define CURL_BLOCK_OPT_COOKIE    "cookie"

 struct BDRVCURLState;

 typedef struct CURLAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;
    QEMUIOVector *qiov;

@@ -109,6 +113,8 @@ typedef struct BDRVCURLState {
    char *url;
    size_t readahead_size;
    bool sslverify;
+    uint64_t timeout;
+    char *cookie;
    bool accept_range;
    AioContext *aio_context;
 } BDRVCURLState;
@@ -207,7 +213,7 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
            qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start,
                                acb->end - acb->start);
            acb->common.cb(acb->common.opaque, 0);
-            qemu_aio_release(acb);
+            qemu_aio_unref(acb);
            s->acb[i] = NULL;
        }
    }
@@ -299,7 +305,7 @@ static void curl_multi_check_completion(BDRVCURLState *s)
                    }

                    acb->common.cb(acb->common.opaque, -EIO);
-                    qemu_aio_release(acb);
+                    qemu_aio_unref(acb);
                    state->acb[i] = NULL;
                }
            }
@@ -352,7 +358,7 @@ static void curl_multi_timeout_do(void *arg)
 #endif
 }

-static CURLState *curl_init_state(BDRVCURLState *s)
+static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s)
 {
    CURLState *state = NULL;
    int i, j;
@@ -370,7 +376,7 @@ static CURLState *curl_init_state(BDRVCURLState *s)
            break;
        }
        if (!state) {
-            aio_poll(state->s->aio_context, true);
+            aio_poll(bdrv_get_aio_context(bs), true);
        }
    } while(!state);

@@ -382,7 +388,10 @@ static CURLState *curl_init_state(BDRVCURLState *s)
        curl_easy_setopt(state->curl, CURLOPT_URL, s->url);
        curl_easy_setopt(state->curl, CURLOPT_SSL_VERIFYPEER,
                         (long) s->sslverify);
-        curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, 5);
+        if (s->cookie) {
+            curl_easy_setopt(state->curl, CURLOPT_COOKIE, s->cookie);
+        }
+        curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, (long)s->timeout);
        curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION,
                         (void *)curl_read_cb);
        curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state);
@@ -489,6 +498,16 @@ static QemuOptsList runtime_opts = {
            .type = QEMU_OPT_BOOL,
            .help = "Verify SSL certificate"
        },
+        {
+            .name = CURL_BLOCK_OPT_TIMEOUT,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Curl timeout"
+        },
+        {
+            .name = CURL_BLOCK_OPT_COOKIE,
+            .type = QEMU_OPT_STRING,
+            .help = "Pass the cookie or list of cookies with each request"
+        },
        { /* end of list */ }
    },
 };
@@ -501,6 +520,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *file;
+    const char *cookie;
    double d;

    static int inited = 0;
@@ -525,8 +545,18 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
        goto out_noclean;
    }

+    s->timeout = qemu_opt_get_number(opts, CURL_BLOCK_OPT_TIMEOUT,
+                                     CURL_TIMEOUT_DEFAULT);
+    if (s->timeout > CURL_TIMEOUT_MAX) {
+        error_setg(errp, "timeout parameter is too large or negative");
+        goto out_noclean;
+    }
+
    s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true);

+    cookie = qemu_opt_get(opts, CURL_BLOCK_OPT_COOKIE);
+    s->cookie = g_strdup(cookie);
+
    file = qemu_opt_get(opts, CURL_BLOCK_OPT_URL);
    if (file == NULL) {
        error_setg(errp, "curl block driver requires an 'url' option");
@@ -541,7 +571,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    DPRINTF("CURL: Opening %s\n", file);
    s->aio_context = bdrv_get_aio_context(bs);
    s->url = g_strdup(file);
-    state = curl_init_state(s);
+    state = curl_init_state(bs, s);
    if (!state)
        goto out_noclean;

@@ -582,19 +612,14 @@ out:
    curl_easy_cleanup(state->curl);
    state->curl = NULL;
 out_noclean:
+    g_free(s->cookie);
    g_free(s->url);
    qemu_opts_del(opts);
    return -EINVAL;
 }

-static void curl_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    // Do we have to implement canceling? Seems to work without...
-}
-
 static const AIOCBInfo curl_aiocb_info = {
    .aiocb_size         = sizeof(CURLAIOCB),
-    .cancel             = curl_aio_cancel,
 };


@@ -616,7 +641,7 @@ static void curl_readv_bh_cb(void *p)
    // we can just call the callback and be done.
    switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) {
        case FIND_RET_OK:
-            qemu_aio_release(acb);
+            qemu_aio_unref(acb);
            // fall through
        case FIND_RET_WAIT:
            return;
@@ -625,10 +650,10 @@ static void curl_readv_bh_cb(void *p)
    }

    // No cache found, so let's start a new request
-    state = curl_init_state(s);
+    state = curl_init_state(acb->common.bs, s);
    if (!state) {
        acb->common.cb(acb->common.opaque, -EIO);
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return;
    }

@@ -640,7 +665,13 @@ static void curl_readv_bh_cb(void *p)
    state->buf_start = start;
    state->buf_len = acb->end + s->readahead_size;
    end = MIN(start + state->buf_len, s->len) - 1;
-    state->orig_buf = g_malloc(state->buf_len);
+    state->orig_buf = g_try_malloc(state->buf_len);
+    if (state->buf_len && state->orig_buf == NULL) {
+        curl_clean_state(state);
+        acb->common.cb(acb->common.opaque, -ENOMEM);
+        qemu_aio_unref(acb);
+        return;
+    }
    state->acb[0] = acb;

    snprintf(state->range, 127, "%zd-%zd", start, end);
@@ -654,9 +685,9 @@ static void curl_readv_bh_cb(void *p)
    curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
 }

-static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    CURLAIOCB *acb;

@@ -678,6 +709,7 @@ static void curl_close(BlockDriverState *bs)
    DPRINTF("CURL: Close\n");
    curl_detach_aio_context(bs);

+    g_free(s->cookie);
    g_free(s->url);
 }

--- a/block/dmg.c
+++ b/block/dmg.c
@@ -284,8 +284,15 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* initialize zlib engine */
-    s->compressed_chunk = g_malloc(max_compressed_size + 1);
-    s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk);
+    s->compressed_chunk = qemu_try_blockalign(bs->file,
+                                              max_compressed_size + 1);
+    s->uncompressed_chunk = qemu_try_blockalign(bs->file,
+                                                512 * max_sectors_per_chunk);
+    if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    if (inflateInit(&s->zstream) != Z_OK) {
        ret = -EINVAL;
        goto fail;
@@ -302,8 +309,8 @@ fail:
    g_free(s->lengths);
    g_free(s->sectors);
    g_free(s->sectorcounts);
-    g_free(s->compressed_chunk);
-    g_free(s->uncompressed_chunk);
+    qemu_vfree(s->compressed_chunk);
+    qemu_vfree(s->uncompressed_chunk);
    return ret;
 }

@@ -426,8 +433,8 @@ static void dmg_close(BlockDriverState *bs)
    g_free(s->lengths);
    g_free(s->sectors);
    g_free(s->sectorcounts);
-    g_free(s->compressed_chunk);
-    g_free(s->uncompressed_chunk);
+    qemu_vfree(s->compressed_chunk);
+    qemu_vfree(s->uncompressed_chunk);

    inflateEnd(&s->zstream);
 }
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -291,7 +291,7 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
    BDRVGlusterState *s = bs->opaque;
    int open_flags = 0;
    int ret = 0;
-    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+    GlusterConf *gconf = g_new0(GlusterConf, 1);
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;
@@ -351,12 +351,12 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
    assert(state != NULL);
    assert(state->bs != NULL);

-    state->opaque = g_malloc0(sizeof(BDRVGlusterReopenState));
+    state->opaque = g_new0(BDRVGlusterReopenState, 1);
    reop_s = state->opaque;

    qemu_gluster_parse_flags(state->flags, &open_flags);

-    gconf = g_malloc0(sizeof(GlusterConf));
+    gconf = g_new0(GlusterConf, 1);

    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
    if (reop_s->glfs == NULL) {
@@ -486,7 +486,7 @@ static int qemu_gluster_create(const char *filename,
    int prealloc = 0;
    int64_t total_size = 0;
    char *tmp = NULL;
-    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+    GlusterConf *gconf = g_new0(GlusterConf, 1);

    glfs = qemu_gluster_init(gconf, filename, errp);
    if (!glfs) {
@@ -494,8 +494,8 @@ static int qemu_gluster_create(const char *filename,
        goto out;
    }

-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);

    tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    if (!tmp || !strcmp(tmp, "off")) {
@@ -516,9 +516,8 @@ static int qemu_gluster_create(const char *filename,
    if (!fd) {
        ret = -errno;
    } else {
-        if (!glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE)) {
-            if (prealloc && qemu_gluster_zerofill(fd, 0,
-                    total_size * BDRV_SECTOR_SIZE)) {
+        if (!glfs_ftruncate(fd, total_size)) {
+            if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) {
                ret = -errno;
            }
        } else {
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -34,7 +34,6 @@
 #include "qemu/bitops.h"
 #include "qemu/bitmap.h"
 #include "block/block_int.h"
-#include "trace.h"
 #include "block/scsi.h"
 #include "qemu/iov.h"
 #include "sysemu/sysemu.h"
@@ -81,14 +80,13 @@ typedef struct IscsiTask {
 } IscsiTask;

 typedef struct IscsiAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUIOVector *qiov;
    QEMUBH *bh;
    IscsiLun *iscsilun;
    struct scsi_task *task;
    uint8_t *buf;
    int status;
-    int canceled;
    int64_t sector_num;
    int nb_sectors;
 #ifdef __linux__
@@ -120,16 +118,14 @@ iscsi_bh_cb(void *p)
    g_free(acb->buf);
    acb->buf = NULL;

-    if (acb->canceled == 0) {
-        acb->common.cb(acb->common.opaque, acb->status);
-    }
+    acb->common.cb(acb->common.opaque, acb->status);

    if (acb->task != NULL) {
        scsi_free_scsi_task(acb->task);
        acb->task = NULL;
    }

-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

 static void
@@ -231,7 +227,7 @@ iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
 }

 static void
-iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
+iscsi_aio_cancel(BlockAIOCB *blockacb)
 {
    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
    IscsiLun *iscsilun = acb->iscsilun;
@@ -240,20 +236,15 @@ iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
        return;
    }

-    acb->canceled = 1;
-
    /* send a task mgmt call to the target to cancel the task on the target */
    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
                                     iscsi_abort_task_cb, acb);

-    while (acb->status == -EINPROGRESS) {
-        aio_poll(iscsilun->aio_context, true);
-    }
 }

 static const AIOCBInfo iscsi_aiocb_info = {
    .aiocb_size         = sizeof(IscsiAIOCB),
-    .cancel             = iscsi_aio_cancel,
+    .cancel_async       = iscsi_aio_cancel,
 };


@@ -325,6 +316,13 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
    return 1;
 }

+/*
+ * Returns the result of bitmap_try_new() for a bitmap sized as the LUN's
+ * block count, converted with sector_lun2qemu(), divided by
+ * iscsilun->cluster_sectors and rounded up.
+ * NOTE(review): presumably NULL is returned when the allocation fails, as
+ * the "try" name suggests; confirm against bitmap_try_new().
+ */
+static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
+{
+    return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks,
+                                                       iscsilun),
+                                       iscsilun->cluster_sectors));
+}
+
 static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num,
                                    int nb_sectors)
 {
@@ -364,6 +362,12 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
        return -EINVAL;
    }

+    if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) {
+        error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len "
+                     "of %d sectors", nb_sectors, bs->bl.max_transfer_length);
+        return -EINVAL;
+    }
+
    lba = sector_qemu2lun(sector_num, iscsilun);
    num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
    iscsi_co_init_iscsitask(iscsilun, &iTask);
@@ -531,6 +535,12 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
        return -EINVAL;
    }

+    if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) {
+        error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len "
+                     "of %d sectors", nb_sectors, bs->bl.max_transfer_length);
+        return -EINVAL;
+    }
+
    if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES &&
        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
        int64_t ret;
@@ -638,10 +648,6 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    g_free(acb->buf);
    acb->buf = NULL;

-    if (acb->canceled != 0) {
-        return;
-    }
-
    acb->status = 0;
    if (status < 0) {
        error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s",
@@ -669,9 +675,9 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    iscsi_schedule_bh(acb);
 }

-static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
+static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct iscsi_context *iscsi = iscsilun->iscsi;
@@ -683,7 +689,6 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);

    acb->iscsilun = iscsilun;
-    acb->canceled    = 0;
    acb->bh          = NULL;
    acb->status      = -EINPROGRESS;
    acb->buf         = NULL;
@@ -693,7 +698,7 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    if (acb->task == NULL) {
        error_report("iSCSI: Failed to allocate task for scsi command. %s",
                     iscsi_get_error(iscsi));
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return NULL;
    }
    memset(acb->task, 0, sizeof(struct scsi_task));
@@ -731,7 +736,7 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
                                 (data.size > 0) ? &data : NULL,
                                 acb) != 0) {
        scsi_free_scsi_task(acb->task);
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return NULL;
    }

@@ -893,7 +898,10 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
    nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);

    if (iscsilun->zeroblock == NULL) {
-        iscsilun->zeroblock = g_malloc0(iscsilun->block_size);
+        iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size);
+        if (iscsilun->zeroblock == NULL) {
+            return -ENOMEM;
+        }
    }

    iscsi_co_init_iscsitask(iscsilun, &iTask);
@@ -1223,6 +1231,40 @@ static void iscsi_attach_aio_context(BlockDriverState *bs,
              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
 }

+static bool iscsi_is_write_protected(IscsiLun *iscsilun)
+{
+    struct scsi_task *task;
+    struct scsi_mode_sense *ms = NULL;
+    bool wrprotected = false;
+
+    task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun,
+                                 1, SCSI_MODESENSE_PC_CURRENT,
+                                 0x3F, 0, 255);
+    if (task == NULL) {
+        error_report("iSCSI: Failed to send MODE_SENSE(6) command: %s",
+                     iscsi_get_error(iscsilun->iscsi));
+        goto out;
+    }
+
+    if (task->status != SCSI_STATUS_GOOD) {
+        error_report("iSCSI: Failed MODE_SENSE(6), LUN assumed writable");
+        goto out;
+    }
+    ms = scsi_datain_unmarshall(task);
+    if (!ms) {
+        error_report("iSCSI: Failed to unmarshall MODE_SENSE(6) data: %s",
+                     iscsi_get_error(iscsilun->iscsi));
+        goto out;
+    }
+    wrprotected = ms->device_specific_parameter & 0x80;
+
+out:
+    if (task) {
+        scsi_free_scsi_task(task);
+    }
+    return wrprotected;
+}
+
 /*
 * We support iscsi url's on the form
 * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
@@ -1343,6 +1385,14 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    scsi_free_scsi_task(task);
    task = NULL;

+    /* Check the write protect flag of the LUN if we want to write */
+    if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
+        iscsi_is_write_protected(iscsilun)) {
+        error_setg(errp, "Cannot open a write protected LUN as read-write");
+        ret = -EACCES;
+        goto out;
+    }
+
    iscsi_readcapacity_sync(iscsilun, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
@@ -1413,9 +1463,10 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
        iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
                                     iscsilun->block_size) >> BDRV_SECTOR_BITS;
        if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
-            iscsilun->allocationmap =
-                bitmap_new(DIV_ROUND_UP(bs->total_sectors,
-                                        iscsilun->cluster_sectors));
+            iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
+            if (iscsilun->allocationmap == NULL) {
+                ret = -ENOMEM;
+            }
        }
    }

@@ -1450,31 +1501,44 @@ static void iscsi_close(BlockDriverState *bs)
    memset(iscsilun, 0, sizeof(IscsiLun));
 }

+static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun)
+{
+    return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1);
+}
+
 static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp)
 {
-    IscsiLun *iscsilun = bs->opaque;
-
    /* We don't actually refresh here, but just return data queried in
     * iscsi_open(): iscsi targets don't change their limits. */
+
+    IscsiLun *iscsilun = bs->opaque;
+    uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff;
+
+    if (iscsilun->bl.max_xfer_len) {
+        max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len);
+    }
+
+    bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun);
+
    if (iscsilun->lbp.lbpu) {
        if (iscsilun->bl.max_unmap < 0xffffffff) {
-            bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap,
-                                                 iscsilun);
+            bs->bl.max_discard =
+                sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun);
        }
-        bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
-                                                   iscsilun);
+        bs->bl.discard_alignment =
+            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
    }

    if (iscsilun->bl.max_ws_len < 0xffffffff) {
-        bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len,
-                                                  iscsilun);
+        bs->bl.max_write_zeroes =
+            sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun);
    }
    if (iscsilun->lbp.lbpws) {
-        bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
-                                                        iscsilun);
+        bs->bl.write_zeroes_alignment =
+            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
    }
-    bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len,
-                                                 iscsilun);
+    bs->bl.opt_transfer_length =
+        sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun);
 }

 /* Since iscsi_open() ignores bdrv_flags, there is nothing to do here in
@@ -1508,10 +1572,7 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset)

    if (iscsilun->allocationmap != NULL) {
        g_free(iscsilun->allocationmap);
-        iscsilun->allocationmap =
-            bitmap_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks,
-                                                    iscsilun),
-                                    iscsilun->cluster_sectors));
+        iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
    }

    return 0;
@@ -1525,12 +1586,12 @@ static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp)
    IscsiLun *iscsilun = NULL;
    QDict *bs_options;

-    bs = bdrv_new("", &error_abort);
+    bs = bdrv_new();

    /* Read out options */
-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE;
-    bs->opaque = g_malloc0(sizeof(struct IscsiLun));
+    total_size = DIV_ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                              BDRV_SECTOR_SIZE);
+    bs->opaque = g_new0(struct IscsiLun, 1);
    iscsilun = bs->opaque;

    bs_options = qdict_new();
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -28,7 +28,7 @@
 #define MAX_QUEUED_IO  128

 struct qemu_laiocb {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    struct qemu_laio_state *ctx;
    struct iocb iocb;
    ssize_t ret;
@@ -51,6 +51,12 @@ struct qemu_laio_state {

    /* io queue for submit at batch */
    LaioQueue io_q;
+
+    /* I/O completion processing */
+    QEMUBH *completion_bh;
+    struct io_event events[MAX_EVENTS];
+    int event_idx;
+    int event_max;
 };

 static inline ssize_t io_event_ret(struct io_event *ev)
@@ -79,72 +85,89 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
                ret = -EINVAL;
            }
        }
+    }
+    laiocb->common.cb(laiocb->common.opaque, ret);

-        laiocb->common.cb(laiocb->common.opaque, ret);
+    qemu_aio_unref(laiocb);
+}
+
+/* The completion BH fetches completed I/O requests and invokes their
+ * callbacks.
+ *
+ * The function is somewhat tricky because it supports nested event loops, for
+ * example when a request callback invokes aio_poll().  In order to do this,
+ * the completion events array and index are kept in qemu_laio_state.  The BH
+ * reschedules itself as long as there are completions pending so it will
+ * either be called again in a nested event loop or will be called after all
+ * events have been completed.  When there are no events left to complete, the
+ * BH returns without rescheduling.
+ */
+static void qemu_laio_completion_bh(void *opaque)
+{
+    struct qemu_laio_state *s = opaque;
+
+    /* Fetch more completion events when empty */
+    if (s->event_idx == s->event_max) {
+        do {
+            struct timespec ts = { 0 };
+            s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS,
+                                        s->events, &ts);
+        } while (s->event_max == -EINTR);
+
+        s->event_idx = 0;
+        if (s->event_max <= 0) {
+            s->event_max = 0;
+            return; /* no more events */
+        }
    }

-    qemu_aio_release(laiocb);
+    /* Reschedule so nested event loops see currently pending completions */
+    qemu_bh_schedule(s->completion_bh);
+
+    /* Process completion events */
+    while (s->event_idx < s->event_max) {
+        struct iocb *iocb = s->events[s->event_idx].obj;
+        struct qemu_laiocb *laiocb =
+                container_of(iocb, struct qemu_laiocb, iocb);
+
+        laiocb->ret = io_event_ret(&s->events[s->event_idx]);
+        s->event_idx++;
+
+        qemu_laio_process_completion(s, laiocb);
+    }
 }

 static void qemu_laio_completion_cb(EventNotifier *e)
 {
    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);

-    while (event_notifier_test_and_clear(&s->e)) {
-        struct io_event events[MAX_EVENTS];
-        struct timespec ts = { 0 };
-        int nevents, i;
-
-        do {
-            nevents = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, events, &ts);
-        } while (nevents == -EINTR);
-
-        for (i = 0; i < nevents; i++) {
-            struct iocb *iocb = events[i].obj;
-            struct qemu_laiocb *laiocb =
-                    container_of(iocb, struct qemu_laiocb, iocb);
-
-            laiocb->ret = io_event_ret(&events[i]);
-            qemu_laio_process_completion(s, laiocb);
-        }
+    if (event_notifier_test_and_clear(&s->e)) {
+        qemu_bh_schedule(s->completion_bh);
    }
 }

-static void laio_cancel(BlockDriverAIOCB *blockacb)
+static void laio_cancel(BlockAIOCB *blockacb)
 {
    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
    struct io_event event;
    int ret;

-    if (laiocb->ret != -EINPROGRESS)
+    if (laiocb->ret != -EINPROGRESS) {
        return;
-
-    /*
-     * Note that as of Linux 2.6.31 neither the block device code nor any
-     * filesystem implements cancellation of AIO request.
-     * Thus the polling loop below is the normal code path.
-     */
+    }
    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
-    if (ret == 0) {
-        laiocb->ret = -ECANCELED;
+    laiocb->ret = -ECANCELED;
+    if (ret != 0) {
+        /* iocb is not cancelled, cb will be called by the event loop later */
        return;
    }

-    /*
-     * We have to wait for the iocb to finish.
-     *
-     * The only way to get the iocb status update is by polling the io context.
-     * We might be able to do this slightly more optimal by removing the
-     * O_NONBLOCK flag.
-     */
-    while (laiocb->ret == -EINPROGRESS) {
-        qemu_laio_completion_cb(&laiocb->ctx->e);
-    }
+    laiocb->common.cb(laiocb->common.opaque, laiocb->ret);
 }

 static const AIOCBInfo laio_aiocb_info = {
    .aiocb_size         = sizeof(struct qemu_laiocb),
-    .cancel             = laio_cancel,
+    .cancel_async       = laio_cancel,
 };

 static void ioq_init(LaioQueue *io_q)
@@ -220,9 +243,9 @@ int laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug)
    return ret;
 }

-BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    struct qemu_laio_state *s = aio_ctx;
    struct qemu_laiocb *laiocb;
@@ -263,7 +286,7 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
    return &laiocb->common;

 out_free_aiocb:
-    qemu_aio_release(laiocb);
+    qemu_aio_unref(laiocb);
    return NULL;
 }

@@ -272,12 +295,14 @@ void laio_detach_aio_context(void *s_, AioContext *old_context)
    struct qemu_laio_state *s = s_;

    aio_set_event_notifier(old_context, &s->e, NULL);
+    qemu_bh_delete(s->completion_bh);
 }

 void laio_attach_aio_context(void *s_, AioContext *new_context)
 {
    struct qemu_laio_state *s = s_;

+    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb);
 }

--- a/block/mirror.c
+++ b/block/mirror.c
@@ -45,6 +45,7 @@ typedef struct MirrorBlockJob {
    int64_t sector_num;
    int64_t granularity;
    size_t buf_size;
+    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
@@ -54,6 +55,7 @@ typedef struct MirrorBlockJob {

    unsigned long *in_flight_bitmap;
    int in_flight;
+    int sectors_in_flight;
    int ret;
 } MirrorBlockJob;

@@ -87,6 +89,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
+    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
@@ -98,8 +101,11 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = op->nb_sectors / sectors_per_chunk;
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
-    if (s->cow_bitmap && ret >= 0) {
-        bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
+    if (ret >= 0) {
+        if (s->cow_bitmap) {
+            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
+        }
+        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
@@ -157,7 +163,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    BlockDriverState *source = s->common.bs;
    int nb_sectors, sectors_per_chunk, nb_chunks;
    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
-    uint64_t delay_ns;
+    uint64_t delay_ns = 0;
    MirrorOp *op;

    s->sector_num = hbitmap_iter_next(&s->hbi);
@@ -172,7 +178,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    hbitmap_next_sector = s->sector_num;
    sector_num = s->sector_num;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
-    end = s->common.len >> BDRV_SECTOR_BITS;
+    end = s->bdev_length / BDRV_SECTOR_SIZE;

    /* Extend the QEMUIOVector to include all adjacent blocks that will
     * be copied in this operation.
@@ -247,8 +253,6 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        next_chunk += added_chunks;
        if (!s->synced && s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors);
-        } else {
-            delay_ns = 0;
        }
    } while (delay_ns == 0 && next_sector < end);

@@ -286,6 +290,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)

    /* Copy the dirty cluster.  */
    s->in_flight++;
+    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);
    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                   mirror_read_complete, op);
@@ -316,9 +321,56 @@ static void mirror_drain(MirrorBlockJob *s)
    }
 }

+typedef struct {
+    int ret;
+} MirrorExitData;
+
+static void mirror_exit(BlockJob *job, void *opaque)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+    MirrorExitData *data = opaque;
+    AioContext *replace_aio_context = NULL;
+
+    if (s->to_replace) {
+        replace_aio_context = bdrv_get_aio_context(s->to_replace);
+        aio_context_acquire(replace_aio_context);
+    }
+
+    if (s->should_complete && data->ret == 0) {
+        BlockDriverState *to_replace = s->common.bs;
+        if (s->to_replace) {
+            to_replace = s->to_replace;
+        }
+        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
+            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
+        }
+        bdrv_swap(s->target, to_replace);
+        if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
+            /* drop the bs loop chain formed by the swap: break the loop then
+             * trigger the unref from the top one */
+            BlockDriverState *p = s->base->backing_hd;
+            bdrv_set_backing_hd(s->base, NULL);
+            bdrv_unref(p);
+        }
+    }
+    if (s->to_replace) {
+        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
+        error_free(s->replace_blocker);
+        bdrv_unref(s->to_replace);
+    }
+    if (replace_aio_context) {
+        aio_context_release(replace_aio_context);
+    }
+    g_free(s->replaces);
+    bdrv_unref(s->target);
+    block_job_completed(&s->common, data->ret);
+    g_free(data);
+}
+
 static void coroutine_fn mirror_run(void *opaque)
 {
    MirrorBlockJob *s = opaque;
+    MirrorExitData *data;
    BlockDriverState *bs = s->common.bs;
    int64_t sector_num, end, sectors_per_chunk, length;
    uint64_t last_pause_ns;
@@ -331,11 +383,11 @@ static void coroutine_fn mirror_run(void *opaque)
        goto immediate_exit;
    }

-    s->common.len = bdrv_getlength(bs);
-    if (s->common.len < 0) {
-        ret = s->common.len;
+    s->bdev_length = bdrv_getlength(bs);
+    if (s->bdev_length < 0) {
+        ret = s->bdev_length;
        goto immediate_exit;
-    } else if (s->common.len == 0) {
+    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for complete. */
        block_job_event_ready(&s->common);
        s->synced = true;
@@ -346,7 +398,7 @@ static void coroutine_fn mirror_run(void *opaque)
        goto immediate_exit;
    }

-    length = DIV_ROUND_UP(s->common.len, s->granularity);
+    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
@@ -366,8 +418,13 @@ static void coroutine_fn mirror_run(void *opaque)
        }
    }

-    end = s->common.len >> BDRV_SECTOR_BITS;
-    s->buf = qemu_blockalign(bs, s->buf_size);
+    end = s->bdev_length / BDRV_SECTOR_SIZE;
+    s->buf = qemu_try_blockalign(bs, s->buf_size);
+    if (s->buf == NULL) {
+        ret = -ENOMEM;
+        goto immediate_exit;
+    }
+
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    mirror_free_init(s);

@@ -406,6 +463,12 @@ static void coroutine_fn mirror_run(void *opaque)
        }

        cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+        /* s->common.offset contains the number of bytes already processed so
+         * far, cnt is the number of dirty sectors remaining and
+         * s->sectors_in_flight is the number of sectors currently being
+         * processed; together those are the current total operation length */
+        s->common.len = s->common.offset +
+                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that qemu_aio_flush() returns.
@@ -442,7 +505,6 @@ static void coroutine_fn mirror_run(void *opaque)
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
-                s->common.offset = end * BDRV_SECTOR_SIZE;
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
@@ -464,15 +526,13 @@ static void coroutine_fn mirror_run(void *opaque)
             * mirror_populate runs.
             */
            trace_mirror_before_drain(s, cnt);
-            bdrv_drain_all();
+            bdrv_drain(bs);
            cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
-            /* Publish progress */
-            s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
@@ -507,31 +567,10 @@ immediate_exit:
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
    bdrv_iostatus_disable(s->target);
-    if (s->should_complete && ret == 0) {
-        BlockDriverState *to_replace = s->common.bs;
-        if (s->to_replace) {
-            to_replace = s->to_replace;
-        }
-        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
-            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
-        }
-        bdrv_swap(s->target, to_replace);
-        if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
-            /* drop the bs loop chain formed by the swap: break the loop then
-             * trigger the unref from the top one */
-            BlockDriverState *p = s->base->backing_hd;
-            bdrv_set_backing_hd(s->base, NULL);
-            bdrv_unref(p);
-        }
-    }
-    if (s->to_replace) {
-        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
-        error_free(s->replace_blocker);
-        bdrv_unref(s->to_replace);
-    }
-    g_free(s->replaces);
-    bdrv_unref(s->target);
-    block_job_completed(&s->common, ret);
+
+    data = g_malloc(sizeof(*data));
+    data->ret = ret;
+    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
 }

 static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -564,22 +603,30 @@ static void mirror_complete(BlockJob *job, Error **errp)
        return;
    }
    if (!s->synced) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
+        error_set(errp, QERR_BLOCK_JOB_NOT_READY,
+                  bdrv_get_device_name(job->bs));
        return;
    }

    /* check the target bs is not blocked and block all operations on it */
    if (s->replaces) {
+        AioContext *replace_aio_context;
+
        s->to_replace = check_to_replace_node(s->replaces, &local_err);
        if (!s->to_replace) {
            error_propagate(errp, local_err);
            return;
        }

+        replace_aio_context = bdrv_get_aio_context(s->to_replace);
+        aio_context_acquire(replace_aio_context);
+
        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);
+
+        aio_context_release(replace_aio_context);
    }

    s->should_complete = true;
@@ -609,7 +656,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             int64_t buf_size,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
-                             BlockDriverCompletionFunc *cb,
+                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
@@ -669,7 +716,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, int64_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
-                  BlockDriverCompletionFunc *cb,
+                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
 {
    bool is_none_mode;
@@ -686,7 +733,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
 void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
                         int64_t speed,
                         BlockdevOnError on_error,
-                         BlockDriverCompletionFunc *cb,
+                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
 {
    int64_t length, base_length;
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -31,8 +31,10 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "qemu/sockets.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qjson.h"
 #include "qapi/qmp/qint.h"
+#include "qapi/qmp/qstring.h"

 #include <sys/types.h>
 #include <unistd.h>
@@ -338,6 +340,51 @@ static void nbd_attach_aio_context(BlockDriverState *bs,
    nbd_client_session_attach_aio_context(&s->client, new_context);
 }

+static void nbd_refresh_filename(BlockDriverState *bs)
+{
+    QDict *opts = qdict_new();
+    const char *path   = qdict_get_try_str(bs->options, "path");
+    const char *host   = qdict_get_try_str(bs->options, "host");
+    const char *port   = qdict_get_try_str(bs->options, "port");
+    const char *export = qdict_get_try_str(bs->options, "export");
+
+    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd")));
+
+    if (path && export) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd+unix:///%s?socket=%s", export, path);
+    } else if (path && !export) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd+unix://?socket=%s", path);
+    } else if (!path && export && port) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s:%s/%s", host, port, export);
+    } else if (!path && export && !port) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s/%s", host, export);
+    } else if (!path && !export && port) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s:%s", host, port);
+    } else if (!path && !export && !port) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s", host);
+    }
+
+    if (path) {
+        qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(path)));
+    } else if (port) {
+        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host)));
+        qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(port)));
+    } else {
+        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host)));
+    }
+    if (export) {
+        qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export)));
+    }
+
+    bs->full_open_options = opts;
+}
+
 static BlockDriver bdrv_nbd = {
    .format_name                = "nbd",
    .protocol_name              = "nbd",
@@ -352,6 +399,7 @@ static BlockDriver bdrv_nbd = {
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_attach_aio_context,
+    .bdrv_refresh_filename      = nbd_refresh_filename,
 };

 static BlockDriver bdrv_nbd_tcp = {
@@ -368,6 +416,7 @@ static BlockDriver bdrv_nbd_tcp = {
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_attach_aio_context,
+    .bdrv_refresh_filename      = nbd_refresh_filename,
 };

 static BlockDriver bdrv_nbd_unix = {
@@ -384,6 +433,7 @@ static BlockDriver bdrv_nbd_unix = {
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_attach_aio_context,
+    .bdrv_refresh_filename      = nbd_refresh_filename,
 };

 static void bdrv_nbd_init(void)
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -172,7 +172,11 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,

    nfs_co_init_task(client, &task);

-    buf = g_malloc(nb_sectors * BDRV_SECTOR_SIZE);
+    buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE);
+    if (nb_sectors && buf == NULL) {
+        return -ENOMEM;
+    }
+
    qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE);

    if (nfs_pwrite_async(client->context, client->fh,
@@ -389,28 +393,33 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags,
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
-        return -EINVAL;
+        ret = -EINVAL;
+        goto out;
    }
    ret = nfs_client_open(client, qemu_opt_get(opts, "filename"),
                          (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY,
                          errp);
    if (ret < 0) {
-        return ret;
+        goto out;
    }
    bs->total_sectors = ret;
-    return 0;
+    ret = 0;
+out:
+    qemu_opts_del(opts);
+    return ret;
 }

 static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
 {
    int ret = 0;
    int64_t total_size = 0;
-    NFSClient *client = g_malloc0(sizeof(NFSClient));
+    NFSClient *client = g_new0(NFSClient, 1);

    client->aio_context = qemu_get_aio_context();

    /* Read out options */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);

    ret = nfs_client_open(client, url, O_CREAT, errp);
    if (ret < 0) {
--- a/block/null.c
+++ b/block/null.c
@@ -0,0 +1,205 @@
+/*
+ * Null block driver
+ *
+ * Authors:
+ *  Fam Zheng <famz@redhat.com>
+ *
+ * Copyright (C) 2014 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "block/block_int.h"
+
+/* Per-device state: just the length that null_getlength() reports. */
+typedef struct {
+    int64_t length;
+} BDRVNullState;
+
+/* Options recognised when a null device is opened. */
+static QemuOptsList runtime_opts = {
+    .name = "null",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",
+            .type = QEMU_OPT_STRING,
+            .help = "",
+        },
+        {
+            .name = BLOCK_OPT_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "size of the null block",
+        },
+        { /* end of list */ }
+    },
+};
+
+/*
+ * Open a null device by recording the requested size in the driver state.
+ *
+ * The size is taken from the BLOCK_OPT_SIZE option and defaults to 1 << 30.
+ * Returns 0 on success, or -EINVAL with @errp set when an option value in
+ * @options cannot be absorbed.
+ */
+static int null_file_open(BlockDriverState *bs, QDict *options, int flags,
+                          Error **errp)
+{
+    QemuOpts *opts;
+    BDRVNullState *s = bs->opaque;
+    Error *local_err = NULL;
+    int ret = 0;
+
+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    /* Report unusable option values to the caller instead of aborting. */
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto out;
+    }
+    s->length =
+        qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30);
+out:
+    qemu_opts_del(opts);
+    return ret;
+}
+
+/* Nothing to release: opening only stores an integer in the state. */
+static void null_close(BlockDriverState *bs)
+{
+}
+
+/* Return the length recorded by null_file_open(). */
+static int64_t null_getlength(BlockDriverState *bs)
+{
+    BDRVNullState *s = bs->opaque;
+    return s->length;
+}
+
+/* Coroutine read: reports success at once and leaves @qiov untouched. */
+static coroutine_fn int null_co_readv(BlockDriverState *bs,
+                                      int64_t sector_num, int nb_sectors,
+                                      QEMUIOVector *qiov)
+{
+    return 0;
+}
+
+/* Coroutine write: reports success at once and ignores the data in @qiov. */
+static coroutine_fn int null_co_writev(BlockDriverState *bs,
+                                       int64_t sector_num, int nb_sectors,
+                                       QEMUIOVector *qiov)
+{
+    return 0;
+}
+
+/* Coroutine flush: there is nothing to flush, so always succeed. */
+static coroutine_fn int null_co_flush(BlockDriverState *bs)
+{
+    return 0;
+}
+
+/* AIO request: the generic AIOCB plus the bottom half that completes it. */
+typedef struct {
+    BlockAIOCB common;
+    QEMUBH *bh;
+} NullAIOCB;
+
+/* Size descriptor handed to qemu_aio_get() for NullAIOCB requests. */
+static const AIOCBInfo null_aiocb_info = {
+    .aiocb_size = sizeof(NullAIOCB),
+};
+
+/* Bottom half: report status 0, then delete the BH and unref the request. */
+static void null_bh_cb(void *opaque)
+{
+    NullAIOCB *acb = opaque;
+    acb->common.cb(acb->common.opaque, 0);
+    qemu_bh_delete(acb->bh);
+    qemu_aio_unref(acb);
+}
+
+/*
+ * Shared body of the AIO read, write and flush callbacks: allocate a request
+ * and schedule a bottom half in @bs's AioContext that completes it with
+ * status 0.
+ */
+static inline BlockAIOCB *null_aio_common(BlockDriverState *bs,
+                                          BlockCompletionFunc *cb,
+                                          void *opaque)
+{
+    NullAIOCB *acb;
+
+    acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb);
+    qemu_bh_schedule(acb->bh);
+    return &acb->common;
+}
+
+/* AIO read: completes with status 0 and leaves @qiov untouched. */
+static BlockAIOCB *null_aio_readv(BlockDriverState *bs,
+                                  int64_t sector_num, QEMUIOVector *qiov,
+                                  int nb_sectors,
+                                  BlockCompletionFunc *cb,
+                                  void *opaque)
+{
+    return null_aio_common(bs, cb, opaque);
+}
+
+/* AIO write: completes with status 0 and ignores the data in @qiov. */
+static BlockAIOCB *null_aio_writev(BlockDriverState *bs,
+                                   int64_t sector_num, QEMUIOVector *qiov,
+                                   int nb_sectors,
+                                   BlockCompletionFunc *cb,
+                                   void *opaque)
+{
+    return null_aio_common(bs, cb, opaque);
+}
+
+/* AIO flush: completes with status 0. */
+static BlockAIOCB *null_aio_flush(BlockDriverState *bs,
+                                  BlockCompletionFunc *cb,
+                                  void *opaque)
+{
+    return null_aio_common(bs, cb, opaque);
+}
+
+/* "null-co": the null device served through the coroutine callbacks. */
+static BlockDriver bdrv_null_co = {
+    .format_name            = "null-co",
+    .protocol_name          = "null-co",
+    .instance_size          = sizeof(BDRVNullState),
+
+    .bdrv_file_open         = null_file_open,
+    .bdrv_close             = null_close,
+    .bdrv_getlength         = null_getlength,
+
+    .bdrv_co_readv          = null_co_readv,
+    .bdrv_co_writev         = null_co_writev,
+    .bdrv_co_flush_to_disk  = null_co_flush,
+};
+
+/* "null-aio": the null device served through the AIO callbacks. */
+static BlockDriver bdrv_null_aio = {
+    .format_name            = "null-aio",
+    .protocol_name          = "null-aio",
+    .instance_size          = sizeof(BDRVNullState),
+
+    .bdrv_file_open         = null_file_open,
+    .bdrv_close             = null_close,
+    .bdrv_getlength         = null_getlength,
+
+    .bdrv_aio_readv         = null_aio_readv,
+    .bdrv_aio_writev        = null_aio_writev,
+    .bdrv_aio_flush         = null_aio_flush,
+};
+
+/* Register both null drivers with the block layer. */
+static void bdrv_null_init(void)
+{
+    bdrv_register(&bdrv_null_co);
+    bdrv_register(&bdrv_null_aio);
+}
+
+block_init(bdrv_null_init);
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -30,6 +30,7 @@
 /**************************************************************/

 #define HEADER_MAGIC "WithoutFreeSpace"
+#define HEADER_MAGIC2 "WithouFreSpacExt"
 #define HEADER_VERSION 2
 #define HEADER_SIZE 64

@@ -41,8 +42,10 @@ struct parallels_header {
    uint32_t cylinders;
    uint32_t tracks;
    uint32_t catalog_entries;
-    uint32_t nb_sectors;
-    char padding[24];
+    uint64_t nb_sectors;
+    uint32_t inuse;
+    uint32_t data_off;
+    char padding[12];
 } QEMU_PACKED;

 typedef struct BDRVParallelsState {
@@ -52,6 +55,8 @@ typedef struct BDRVParallelsState {
    unsigned int catalog_size;

    unsigned int tracks;
+
+    unsigned int off_multiplier;
 } BDRVParallelsState;

 static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename)
@@ -59,11 +64,12 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam
    const struct parallels_header *ph = (const void *)buf;

    if (buf_size < HEADER_SIZE)
-	return 0;
+        return 0;

-    if (!memcmp(ph->magic, HEADER_MAGIC, 16) &&
-	(le32_to_cpu(ph->version) == HEADER_VERSION))
-	return 100;
+    if ((!memcmp(ph->magic, HEADER_MAGIC, 16) ||
+        !memcmp(ph->magic, HEADER_MAGIC2, 16)) &&
+        (le32_to_cpu(ph->version) == HEADER_VERSION))
+        return 100;

    return 0;
 }
@@ -83,14 +89,19 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    if (memcmp(ph.magic, HEADER_MAGIC, 16) ||
-        (le32_to_cpu(ph.version) != HEADER_VERSION)) {
-        error_setg(errp, "Image not in Parallels format");
-        ret = -EINVAL;
-        goto fail;
-    }
+    bs->total_sectors = le64_to_cpu(ph.nb_sectors);

-    bs->total_sectors = le32_to_cpu(ph.nb_sectors);
+    if (le32_to_cpu(ph.version) != HEADER_VERSION) {
+        goto fail_format;
+    }
+    if (!memcmp(ph.magic, HEADER_MAGIC, 16)) {
+        s->off_multiplier = 1;
+        bs->total_sectors = 0xffffffff & bs->total_sectors;
+    } else if (!memcmp(ph.magic, HEADER_MAGIC2, 16)) {
+        s->off_multiplier = le32_to_cpu(ph.tracks);
+    } else {
+        goto fail_format;
+    }

    s->tracks = le32_to_cpu(ph.tracks);
    if (s->tracks == 0) {
@@ -98,6 +109,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EINVAL;
        goto fail;
    }
+    if (s->tracks > INT32_MAX/513) {
+        error_setg(errp, "Invalid image: Too big cluster");
+        ret = -EFBIG;
+        goto fail;
+    }

    s->catalog_size = le32_to_cpu(ph.catalog_entries);
    if (s->catalog_size > INT_MAX / 4) {
@@ -105,7 +121,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EFBIG;
        goto fail;
    }
-    s->catalog_bitmap = g_malloc(s->catalog_size * 4);
+    s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size);
+    if (s->catalog_size && s->catalog_bitmap == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }

    ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4);
    if (ret < 0) {
@@ -113,11 +133,14 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
    }

    for (i = 0; i < s->catalog_size; i++)
-	le32_to_cpus(&s->catalog_bitmap[i]);
+        le32_to_cpus(&s->catalog_bitmap[i]);

    qemu_co_mutex_init(&s->lock);
    return 0;

+fail_format:
+    error_setg(errp, "Image not in Parallels format");
+    ret = -EINVAL;
 fail:
    g_free(s->catalog_bitmap);
    return ret;
@@ -132,9 +155,10 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
    offset = sector_num % s->tracks;

    /* not allocated */
-    if ((index > s->catalog_size) || (s->catalog_bitmap[index] == 0))
-	return -1;
-    return (uint64_t)(s->catalog_bitmap[index] + offset) * 512;
+    if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0))
+        return -1;
+    return
+        ((uint64_t)s->catalog_bitmap[index] * s->off_multiplier + offset) * 512;
 }

 static int parallels_read(BlockDriverState *bs, int64_t sector_num,
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -28,6 +28,7 @@
 #include "qapi-visit.h"
 #include "qapi/qmp-output-visitor.h"
 #include "qapi/qmp/types.h"
+#include "sysemu/block-backend.h"

 BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
 {
@@ -165,19 +166,25 @@ void bdrv_query_image_info(BlockDriverState *bs,
                           ImageInfo **p_info,
                           Error **errp)
 {
-    uint64_t total_sectors;
+    int64_t size;
    const char *backing_filename;
    char backing_filename2[1024];
    BlockDriverInfo bdi;
    int ret;
    Error *err = NULL;
-    ImageInfo *info = g_new0(ImageInfo, 1);
+    ImageInfo *info;

-    bdrv_get_geometry(bs, &total_sectors);
+    size = bdrv_getlength(bs);
+    if (size < 0) {
+        error_setg_errno(errp, -size, "Can't get size of device '%s'",
+                         bdrv_get_device_name(bs));
+        return;
+    }

+    info = g_new0(ImageInfo, 1);
    info->filename        = g_strdup(bs->filename);
    info->format          = g_strdup(bdrv_get_format_name(bs));
-    info->virtual_size    = total_sectors * 512;
+    info->virtual_size    = size;
    info->actual_size     = bdrv_get_allocated_file_size(bs);
    info->has_actual_size = info->actual_size >= 0;
    if (bdrv_is_encrypted(bs)) {
@@ -236,22 +243,22 @@ void bdrv_query_image_info(BlockDriverState *bs,
 }

 /* @p_info will be set only on success. */
-void bdrv_query_info(BlockDriverState *bs,
-                     BlockInfo **p_info,
-                     Error **errp)
+static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
+                            Error **errp)
 {
    BlockInfo *info = g_malloc0(sizeof(*info));
+    BlockDriverState *bs = blk_bs(blk);
    BlockDriverState *bs0;
    ImageInfo **p_image_info;
    Error *local_err = NULL;
-    info->device = g_strdup(bs->device_name);
+    info->device = g_strdup(blk_name(blk));
    info->type = g_strdup("unknown");
-    info->locked = bdrv_dev_is_medium_locked(bs);
-    info->removable = bdrv_dev_has_removable_media(bs);
+    info->locked = blk_dev_is_medium_locked(blk);
+    info->removable = blk_dev_has_removable_media(blk);

-    if (bdrv_dev_has_removable_media(bs)) {
+    if (blk_dev_has_removable_media(blk)) {
        info->has_tray_open = true;
-        info->tray_open = bdrv_dev_is_tray_open(bs);
+        info->tray_open = blk_dev_is_tray_open(blk);
    }

    if (bdrv_iostatus_is_enabled(bs)) {
@@ -299,21 +306,22 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs)

    s = g_malloc0(sizeof(*s));

-    if (bs->device_name[0]) {
+    if (bdrv_get_device_name(bs)[0]) {
        s->has_device = true;
-        s->device = g_strdup(bs->device_name);
+        s->device = g_strdup(bdrv_get_device_name(bs));
    }

    s->stats = g_malloc0(sizeof(*s->stats));
-    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
-    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
-    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
-    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
-    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
-    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
-    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
-    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
-    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
+    s->stats->rd_bytes = bs->stats.nr_bytes[BLOCK_ACCT_READ];
+    s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE];
+    s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ];
+    s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE];
+    s->stats->wr_highest_offset =
+        bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE;
+    s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH];
+    s->stats->wr_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_WRITE];
+    s->stats->rd_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_READ];
+    s->stats->flush_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
@@ -331,12 +339,12 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs)
 BlockInfoList *qmp_query_block(Error **errp)
 {
    BlockInfoList *head = NULL, **p_next = &head;
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk;
    Error *local_err = NULL;

-     while ((bs = bdrv_next(bs))) {
+    for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
        BlockInfoList *info = g_malloc0(sizeof(*info));
-        bdrv_query_info(bs, &info->value, &local_err);
+        bdrv_query_info(blk, &info->value, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto err;
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -124,7 +124,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
        snprintf(version, sizeof(version), "QCOW version %" PRIu32,
                 header.version);
        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                  bs->device_name, "qcow", version);
+                  bdrv_get_device_name(bs), "qcow", version);
        ret = -ENOTSUP;
        goto fail;
    }
@@ -182,7 +182,12 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    }

    s->l1_table_offset = header.l1_table_offset;
-    s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+    s->l1_table = g_try_new(uint64_t, s->l1_size);
+    if (s->l1_table == NULL) {
+        error_setg(errp, "Could not allocate memory for L1 table");
+        ret = -ENOMEM;
+        goto fail;
+    }

    ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
               s->l1_size * sizeof(uint64_t));
@@ -193,8 +198,16 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
-    /* alloc L2 cache */
-    s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+
+    /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */
+    s->l2_cache =
+        qemu_try_blockalign(bs->file,
+                            s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    if (s->l2_cache == NULL) {
+        error_setg(errp, "Could not allocate L2 table cache");
+        ret = -ENOMEM;
+        goto fail;
+    }
    s->cluster_cache = g_malloc(s->cluster_size);
    s->cluster_data = g_malloc(s->cluster_size);
    s->cluster_cache_offset = -1;
@@ -218,7 +231,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when qcow images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "qcow", bs->device_name, "live migration");
+              "qcow", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);

    qemu_co_mutex_init(&s->lock);
@@ -226,7 +239,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,

 fail:
    g_free(s->l1_table);
-    g_free(s->l2_cache);
+    qemu_vfree(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);
    return ret;
@@ -517,7 +530,10 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
    void *orig_buf;

    if (qiov->niov > 1) {
-        buf = orig_buf = qemu_blockalign(bs, qiov->size);
+        buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
+        if (buf == NULL) {
+            return -ENOMEM;
+        }
    } else {
        orig_buf = NULL;
        buf = (uint8_t *)qiov->iov->iov_base;
@@ -619,7 +635,10 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
    s->cluster_cache_offset = -1; /* disable compressed cache */

    if (qiov->niov > 1) {
-        buf = orig_buf = qemu_blockalign(bs, qiov->size);
+        buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
+        if (buf == NULL) {
+            return -ENOMEM;
+        }
        qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
    } else {
        orig_buf = NULL;
@@ -685,7 +704,7 @@ static void qcow_close(BlockDriverState *bs)
    BDRVQcowState *s = bs->opaque;

    g_free(s->l1_table);
-    g_free(s->l2_cache);
+    qemu_vfree(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);

@@ -706,7 +725,8 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    BlockDriverState *qcow_bs;

    /* Read out options */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
        flags |= BLOCK_FLAG_ENCRYPT;
@@ -734,7 +754,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    memset(&header, 0, sizeof(header));
    header.magic = cpu_to_be32(QCOW_MAGIC);
    header.version = cpu_to_be32(QCOW_VERSION);
-    header.size = cpu_to_be64(total_size * 512);
+    header.size = cpu_to_be64(total_size);
    header_size = sizeof(header);
    backing_filename_len = 0;
    if (backing_file) {
@@ -756,7 +776,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    }
    header_size = (header_size + 7) & ~7;
    shift = header.cluster_bits + header.l2_bits;
-    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
+    l1_size = (total_size + (1LL << shift) - 1) >> shift;

    header.l1_table_offset = cpu_to_be64(header_size);
    if (flags & BLOCK_FLAG_ENCRYPT) {
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -48,15 +48,31 @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
    Qcow2Cache *c;
    int i;

-    c = g_malloc0(sizeof(*c));
+    c = g_new0(Qcow2Cache, 1);
    c->size = num_tables;
-    c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
+    c->entries = g_try_new0(Qcow2CachedTable, num_tables);
+    if (!c->entries) {
+        goto fail;
+    }

    for (i = 0; i < c->size; i++) {
-        c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+        c->entries[i].table = qemu_try_blockalign(bs->file, s->cluster_size);
+        if (c->entries[i].table == NULL) {
+            goto fail;
+        }
    }

    return c;
+
+fail:
+    if (c->entries) {
+        for (i = 0; i < c->size; i++) {
+            qemu_vfree(c->entries[i].table);
+        }
+    }
+    g_free(c->entries);
+    g_free(c);
+    return NULL;
 }

 int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c)
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -72,14 +72,20 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
 #endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
-    new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
+    new_l1_table = qemu_try_blockalign(bs->file,
+                                       align_offset(new_l1_size2, 512));
+    if (new_l1_table == NULL) {
+        return -ENOMEM;
+    }
+    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
+
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
-        g_free(new_l1_table);
+        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

@@ -113,7 +119,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
    if (ret < 0) {
        goto fail;
    }
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
@@ -123,7 +129,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        QCOW2_DISCARD_OTHER);
    return 0;
 fail:
-    g_free(new_l1_table);
+    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
@@ -158,12 +164,14 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
 {
    BDRVQcowState *s = bs->opaque;
-    uint64_t buf[L1_ENTRIES_PER_SECTOR];
+    uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 };
    int l1_start_index;
    int i, ret;

    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
-    for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
+    for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size;
+         i++)
+    {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

@@ -372,7 +380,10 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
    }

    iov.iov_len = n * BDRV_SECTOR_SIZE;
-    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
+    if (iov.iov_base == NULL) {
+        return -ENOMEM;
+    }

    qemu_iovec_init_external(&qiov, &iov, 1);

@@ -477,6 +488,13 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
        goto out;
    }

+    if (offset_into_cluster(s, l2_offset)) {
+        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
+                                " unaligned (L1 index: %#" PRIx64 ")",
+                                l2_offset, l1_index);
+        return -EIO;
+    }
+
    /* load the l2 table in memory */

    ret = l2_load(bs, l2_offset, &l2_table);
@@ -499,8 +517,11 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
        break;
    case QCOW2_CLUSTER_ZERO:
        if (s->qcow_version < 3) {
-            qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
-            return -EIO;
+            qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
+                                    " in pre-v3 image (L2 offset: %#" PRIx64
+                                    ", L2 index: %#x)", l2_offset, l2_index);
+            ret = -EIO;
+            goto fail;
        }
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], QCOW_OFLAG_ZERO);
@@ -516,6 +537,14 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], QCOW_OFLAG_ZERO);
        *cluster_offset &= L2E_OFFSET_MASK;
+        if (offset_into_cluster(s, *cluster_offset)) {
+            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#"
+                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
+                                    ", L2 index: %#x)", *cluster_offset,
+                                    l2_offset, l2_index);
+            ret = -EIO;
+            goto fail;
+        }
        break;
    default:
        abort();
@@ -532,6 +561,10 @@ out:
    *num = nb_available - index_in_cluster;

    return ret;
+
+fail:
+    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
+    return ret;
 }

 /*
@@ -567,6 +600,12 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+    if (offset_into_cluster(s, l2_offset)) {
+        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
+                                " unaligned (L1 index: %#" PRIx64 ")",
+                                l2_offset, l1_index);
+        return -EIO;
+    }

    /* seek the l2 table of the given l2 offset */

@@ -702,7 +741,11 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

-    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+    old_cluster = g_try_new(uint64_t, m->nb_clusters);
+    if (old_cluster == NULL) {
+        ret = -ENOMEM;
+        goto err;
+    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m, &m->cow_start);
@@ -935,6 +978,15 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
        bool offset_matches =
            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;

+        if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
+            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
+                                    "%#llx unaligned (guest offset: %#" PRIx64
+                                    ")", cluster_offset & L2E_OFFSET_MASK,
+                                    guest_offset);
+            ret = -EIO;
+            goto out;
+        }
+
        if (*host_offset != 0 && !offset_matches) {
            *bytes = 0;
            ret = 0;
@@ -966,7 +1018,7 @@ out:

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
-    if (ret) {
+    if (ret > 0) {
        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
                     + offset_into_cluster(s, guest_offset);
    }
@@ -1106,6 +1158,17 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
        return 0;
    }

+    /* !*host_offset would overwrite the image header and is reserved for "no
+     * host offset preferred". If 0 was a valid host offset, it'd trigger the
+     * following overlap check; do that now to avoid having an invalid value in
+     * *host_offset. */
+    if (!alloc_cluster_offset) {
+        ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
+                                            nb_clusters * s->cluster_size);
+        assert(ret < 0);
+        goto fail;
+    }
+
    /*
     * Save info needed for meta data update.
     *
@@ -1351,7 +1414,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
 * clusters.
 */
 static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
-    unsigned int nb_clusters, enum qcow2_discard_type type)
+    unsigned int nb_clusters, enum qcow2_discard_type type, bool full_discard)
 {
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table;
@@ -1373,23 +1436,30 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);

        /*
-         * Make sure that a discarded area reads back as zeroes for v3 images
-         * (we cannot do it for v2 without actually writing a zero-filled
-         * buffer). We can skip the operation if the cluster is already marked
-         * as zero, or if it's unallocated and we don't have a backing file.
+         * If full_discard is false, make sure that a discarded area reads back
+         * as zeroes for v3 images (we cannot do it for v2 without actually
+         * writing a zero-filled buffer). We can skip the operation if the
+         * cluster is already marked as zero, or if it's unallocated and we
+         * don't have a backing file.
         *
         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
+         *
+         * If full_discard is true, the sector should not read back as zeroes,
+         * but rather fall through to the backing file.
         */
        switch (qcow2_get_cluster_type(old_l2_entry)) {
            case QCOW2_CLUSTER_UNALLOCATED:
-                if (!bs->backing_hd) {
+                if (full_discard || !bs->backing_hd) {
                    continue;
                }
                break;

            case QCOW2_CLUSTER_ZERO:
-                continue;
+                if (!full_discard) {
+                    continue;
+                }
+                break;

            case QCOW2_CLUSTER_NORMAL:
            case QCOW2_CLUSTER_COMPRESSED:
@@ -1401,7 +1471,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
-        if (s->qcow_version >= 3) {
+        if (!full_discard && s->qcow_version >= 3) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
        } else {
            l2_table[l2_index + i] = cpu_to_be64(0);
@@ -1420,7 +1490,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
 }

 int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
-    int nb_sectors, enum qcow2_discard_type type)
+    int nb_sectors, enum qcow2_discard_type type, bool full_discard)
 {
    BDRVQcowState *s = bs->opaque;
    uint64_t end_offset;
@@ -1443,7 +1513,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
-        ret = discard_single_l2(bs, offset, nb_clusters, type);
+        ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
        if (ret < 0) {
            goto fail;
        }
@@ -1543,15 +1613,14 @@ fail:
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
- * expanded_clusters is a bitmap where every bit corresponds to one cluster in
- * the image file; a bit gets set if the corresponding cluster has been used for
- * zero expansion (i.e., has been filled with zeroes and is referenced from an
- * L2 table). nb_clusters contains the total cluster count of the image file,
- * i.e., the number of bits in expanded_clusters.
+ * l1_entries and *visited_l1_entries are used to keep track of progress for
+ * status_cb(). l1_entries contains the total number of L1 entries and
+ * *visited_l1_entries counts all visited L1 entries.
 */
 static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
-                                      int l1_size, uint8_t **expanded_clusters,
-                                      uint64_t *nb_clusters)
+                                      int l1_size, int64_t *visited_l1_entries,
+                                      int64_t l1_entries,
+                                      BlockDriverAmendStatusCB *status_cb)
 {
    BDRVQcowState *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
@@ -1562,15 +1631,23 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
-        l2_table = qemu_blockalign(bs, s->cluster_size);
+        l2_table = qemu_try_blockalign(bs->file, s->cluster_size);
+        if (l2_table == NULL) {
+            return -ENOMEM;
+        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        bool l2_dirty = false;
+        int l2_refcount;

        if (!l2_offset) {
            /* unallocated */
+            (*visited_l1_entries)++;
+            if (status_cb) {
+                status_cb(bs, *visited_l1_entries, l1_entries);
+            }
            continue;
        }

@@ -1587,33 +1664,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
            goto fail;
        }

+        l2_refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
+        if (l2_refcount < 0) {
+            ret = l2_refcount;
+            goto fail;
+        }
+
        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
-            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
+            int64_t offset = l2_entry & L2E_OFFSET_MASK;
            int cluster_type = qcow2_get_cluster_type(l2_entry);
            bool preallocated = offset != 0;

-            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
-                cluster_index = offset >> s->cluster_bits;
-                assert((cluster_index >= 0) && (cluster_index < *nb_clusters));
-                if ((*expanded_clusters)[cluster_index / 8] &
-                    (1 << (cluster_index % 8))) {
-                    /* Probably a shared L2 table; this cluster was a zero
-                     * cluster which has been expanded, its refcount
-                     * therefore most likely requires an update. */
-                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
-                                                        QCOW2_DISCARD_NEVER);
-                    if (ret < 0) {
-                        goto fail;
-                    }
-                    /* Since we just increased the refcount, the COPIED flag may
-                     * no longer be set. */
-                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
-                    l2_dirty = true;
-                }
-                continue;
-            }
-            else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
+            if (cluster_type != QCOW2_CLUSTER_ZERO) {
                continue;
            }

@@ -1631,6 +1694,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                    ret = offset;
                    goto fail;
                }
+
+                if (l2_refcount > 1) {
+                    /* For shared L2 tables, set the refcount accordingly (it is
+                     * already 1 and needs to be l2_refcount) */
+                    ret = qcow2_update_cluster_refcount(bs,
+                            offset >> s->cluster_bits, l2_refcount - 1,
+                            QCOW2_DISCARD_OTHER);
+                    if (ret < 0) {
+                        qcow2_free_clusters(bs, offset, s->cluster_size,
+                                            QCOW2_DISCARD_OTHER);
+                        goto fail;
+                    }
+                }
            }

            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
@@ -1652,29 +1728,12 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                goto fail;
            }

-            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
-            l2_dirty = true;
-
-            cluster_index = offset >> s->cluster_bits;
-
-            if (cluster_index >= *nb_clusters) {
-                uint64_t old_bitmap_size = (*nb_clusters + 7) / 8;
-                uint64_t new_bitmap_size;
-                /* The offset may lie beyond the old end of the underlying image
-                 * file for growable files only */
-                assert(bs->file->growable);
-                *nb_clusters = size_to_clusters(s, bs->file->total_sectors *
-                                                BDRV_SECTOR_SIZE);
-                new_bitmap_size = (*nb_clusters + 7) / 8;
-                *expanded_clusters = g_realloc(*expanded_clusters,
-                                               new_bitmap_size);
-                /* clear the newly allocated space */
-                memset(&(*expanded_clusters)[old_bitmap_size], 0,
-                       new_bitmap_size - old_bitmap_size);
+            if (l2_refcount == 1) {
+                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
+            } else {
+                l2_table[j] = cpu_to_be64(offset);
            }
-
-            assert((cluster_index >= 0) && (cluster_index < *nb_clusters));
-            (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8);
+            l2_dirty = true;
        }

        if (is_active_l1) {
@@ -1703,6 +1762,11 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                }
            }
        }
+
+        (*visited_l1_entries)++;
+        if (status_cb) {
+            status_cb(bs, *visited_l1_entries, l1_entries);
+        }
    }

    ret = 0;
@@ -1729,21 +1793,25 @@ fail:
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 */
-int qcow2_expand_zero_clusters(BlockDriverState *bs)
+int qcow2_expand_zero_clusters(BlockDriverState *bs,
+                               BlockDriverAmendStatusCB *status_cb)
 {
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table = NULL;
-    uint64_t nb_clusters;
-    uint8_t *expanded_clusters;
+    int64_t l1_entries = 0, visited_l1_entries = 0;
    int ret;
    int i, j;

-    nb_clusters = size_to_clusters(s, bs->file->total_sectors *
-                                   BDRV_SECTOR_SIZE);
-    expanded_clusters = g_malloc0((nb_clusters + 7) / 8);
+    if (status_cb) {
+        l1_entries = s->l1_size;
+        for (i = 0; i < s->nb_snapshots; i++) {
+            l1_entries += s->snapshots[i].l1_size;
+        }
+    }

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
-                                     &expanded_clusters, &nb_clusters);
+                                     &visited_l1_entries, l1_entries,
+                                     status_cb);
    if (ret < 0) {
        goto fail;
    }
@@ -1777,7 +1845,8 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs)
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
-                                         &expanded_clusters, &nb_clusters);
+                                         &visited_l1_entries, l1_entries,
+                                         status_cb);
        if (ret < 0) {
            goto fail;
        }
@@ -1786,7 +1855,6 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs)
    ret = 0;

 fail:
-    g_free(expanded_clusters);
    g_free(l1_table);
    return ret;
 }
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -58,7 +58,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
    }

    offset = s->snapshots_offset;
-    s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot));
+    s->snapshots = g_new0(QCowSnapshot, s->nb_snapshots);

    for(i = 0; i < s->nb_snapshots; i++) {
        /* Read statically sized part of the snapshot header */
@@ -381,7 +381,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    sn->l1_table_offset = l1_table_offset;
    sn->l1_size = s->l1_size;

-    l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+    l1_table = g_try_new(uint64_t, s->l1_size);
+    if (s->l1_size && l1_table == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    for(i = 0; i < s->l1_size; i++) {
        l1_table[i] = cpu_to_be64(s->l1_table[i]);
    }
@@ -412,7 +417,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    }

    /* Append the new snapshot to the snapshot list */
-    new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+    new_snapshot_list = g_new(QCowSnapshot, s->nb_snapshots + 1);
    if (s->snapshots) {
        memcpy(new_snapshot_list, s->snapshots,
               s->nb_snapshots * sizeof(QCowSnapshot));
@@ -436,7 +441,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    qcow2_discard_clusters(bs, qcow2_vm_state_offset(s),
                           align_offset(sn->vm_state_size, s->cluster_size)
                                >> BDRV_SECTOR_BITS,
-                           QCOW2_DISCARD_NEVER);
+                           QCOW2_DISCARD_NEVER, false);

 #ifdef DEBUG_ALLOC
    {
@@ -499,7 +504,11 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
     * Decrease the refcount referenced by the old one only when the L1
     * table is overwritten.
     */
-    sn_l1_table = g_malloc0(cur_l1_bytes);
+    sn_l1_table = g_try_malloc0(cur_l1_bytes);
+    if (cur_l1_bytes && sn_l1_table == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }

    ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes);
    if (ret < 0) {
@@ -652,7 +661,7 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
        return s->nb_snapshots;
    }

-    sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
+    sn_tab = g_new0(QEMUSnapshotInfo, s->nb_snapshots);
    for(i = 0; i < s->nb_snapshots; i++) {
        sn_info = sn_tab + i;
        sn = s->snapshots + i;
@@ -698,17 +707,21 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
        return -EFBIG;
    }
    new_l1_bytes = sn->l1_size * sizeof(uint64_t);
-    new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));
+    new_l1_table = qemu_try_blockalign(bs->file,
+                                       align_offset(new_l1_bytes, 512));
+    if (new_l1_table == NULL) {
+        return -ENOMEM;
+    }

    ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
    if (ret < 0) {
        error_setg(errp, "Failed to read l1 table for snapshot");
-        g_free(new_l1_table);
+        qemu_vfree(new_l1_table);
        return ret;
    }

    /* Switch the L1 table */
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);

    s->l1_size = sn->l1_size;
    s->l1_table_offset = sn->l1_table_offset;
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -30,6 +30,9 @@
 #include "qemu/error-report.h"
 #include "qapi/qmp/qerror.h"
 #include "qapi/qmp/qbool.h"
+#include "qapi/util.h"
+#include "qapi/qmp/types.h"
+#include "qapi-event.h"
 #include "trace.h"
 #include "qemu/option_int.h"

@@ -203,8 +206,8 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
    vsnprintf(msg, sizeof(msg), fmt, ap);
    va_end(ap);

-    error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "qcow2",
-              msg);
+    error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+              bdrv_get_device_name(bs), "qcow2", msg);
 }

 static void report_unsupported_feature(BlockDriverState *bs,
@@ -402,6 +405,12 @@ static QemuOptsList qcow2_runtime_opts = {
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
+        {
+            .name = QCOW2_OPT_OVERLAP_TEMPLATE,
+            .type = QEMU_OPT_STRING,
+            .help = "Selects which overlap checks to perform from a range of "
+                    "templates (none, constant, cached, all)",
+        },
        {
            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
            .type = QEMU_OPT_BOOL,
@@ -442,6 +451,22 @@ static QemuOptsList qcow2_runtime_opts = {
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L2 table",
        },
+        {
+            .name = QCOW2_OPT_CACHE_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Maximum combined metadata (L2 tables and refcount blocks) "
+                    "cache size",
+        },
+        {
+            .name = QCOW2_OPT_L2_CACHE_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Maximum L2 table cache size",
+        },
+        {
+            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Maximum refcount block cache size",
+        },
        { /* end of list */ }
    },
 };
@@ -457,6 +482,61 @@ static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
    [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 };

+/*
+ * Derives the L2 table cache size and the refcount block cache size from
+ * the runtime options in @opts and stores them in *l2_cache_size and
+ * *refcount_cache_size. The values are in the unit returned by
+ * qemu_opt_get_size() (presumably bytes, cf. DEFAULT_L2_CACHE_BYTE_SIZE).
+ *
+ * Three options are evaluated: QCOW2_OPT_CACHE_SIZE (combined size),
+ * QCOW2_OPT_L2_CACHE_SIZE and QCOW2_OPT_REFCOUNT_CACHE_SIZE.
+ *
+ * - Combined size given: at most one of the two individual sizes may be
+ *   given as well and neither may exceed the combined size; the missing
+ *   size is the remainder. If no individual size is given, the combined
+ *   size is split according to DEFAULT_L2_REFCOUNT_SIZE_RATIO.
+ * - Combined size not given: a missing individual size is derived from the
+ *   other one through DEFAULT_L2_REFCOUNT_SIZE_RATIO; if both are missing,
+ *   DEFAULT_L2_CACHE_BYTE_SIZE is used for the L2 cache.
+ *
+ * On invalid combinations an error is set in @errp and the function returns
+ * early; the output parameters then hold the raw option values (0 if unset).
+ */
+static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size,
+                             uint64_t *refcount_cache_size, Error **errp)
+{
+    uint64_t combined_cache_size;
+    bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
+
+    /* qemu_opt_get() yields a pointer; non-NULL means the option was given */
+    combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
+    l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
+    refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
+
+    /* Unset options read back as 0 here; the *_set flags tell them apart */
+    combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
+    *l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0);
+    *refcount_cache_size = qemu_opt_get_size(opts,
+                                             QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
+
+    if (combined_cache_size_set) {
+        if (l2_cache_size_set && refcount_cache_size_set) {
+            error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
+                       " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
+                       "at the same time");
+            return;
+        } else if (*l2_cache_size > combined_cache_size) {
+            error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
+                       QCOW2_OPT_CACHE_SIZE);
+            return;
+        } else if (*refcount_cache_size > combined_cache_size) {
+            error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
+                       QCOW2_OPT_CACHE_SIZE);
+            return;
+        }
+
+        /* The checks above guarantee that the subtractions cannot wrap */
+        if (l2_cache_size_set) {
+            *refcount_cache_size = combined_cache_size - *l2_cache_size;
+        } else if (refcount_cache_size_set) {
+            *l2_cache_size = combined_cache_size - *refcount_cache_size;
+        } else {
+            /* Split so that l2 : refcount is roughly RATIO : 1 */
+            *refcount_cache_size = combined_cache_size
+                                 / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1);
+            *l2_cache_size = combined_cache_size - *refcount_cache_size;
+        }
+    } else {
+        if (!l2_cache_size_set && !refcount_cache_size_set) {
+            *l2_cache_size = DEFAULT_L2_CACHE_BYTE_SIZE;
+            *refcount_cache_size = *l2_cache_size
+                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
+        } else if (!l2_cache_size_set) {
+            *l2_cache_size = *refcount_cache_size
+                           * DEFAULT_L2_REFCOUNT_SIZE_RATIO;
+        } else if (!refcount_cache_size_set) {
+            *refcount_cache_size = *l2_cache_size
+                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
+        }
+    }
+}
+
 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
 {
@@ -464,12 +544,13 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    unsigned int len, i;
    int ret = 0;
    QCowHeader header;
-    QemuOpts *opts;
+    QemuOpts *opts = NULL;
    Error *local_err = NULL;
    uint64_t ext_end;
    uint64_t l1_vm_state_index;
-    const char *opt_overlap_check;
+    const char *opt_overlap_check, *opt_overlap_check_template;
    int overlap_check_template = 0;
+    uint64_t l2_cache_size, refcount_cache_size;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
@@ -617,6 +698,9 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,

    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
+    /* 2^(s->refcount_order - 3) is the refcount width in bytes */
+    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
+    s->refcount_block_size = 1 << s->refcount_block_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
@@ -688,8 +772,13 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,


    if (s->l1_size > 0) {
-        s->l1_table = g_malloc0(
+        s->l1_table = qemu_try_blockalign(bs->file,
            align_offset(s->l1_size * sizeof(uint64_t), 512));
+        if (s->l1_table == NULL) {
+            error_setg(errp, "Could not allocate L1 table");
+            ret = -ENOMEM;
+            goto fail;
+        }
        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
                         s->l1_size * sizeof(uint64_t));
        if (ret < 0) {
@@ -701,14 +790,61 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        }
    }

+    /* get L2 table/refcount block cache size from command line options */
+    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    read_cache_sizes(opts, &l2_cache_size, &refcount_cache_size, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    l2_cache_size /= s->cluster_size;
+    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
+        l2_cache_size = MIN_L2_CACHE_SIZE;
+    }
+    if (l2_cache_size > INT_MAX) {
+        error_setg(errp, "L2 cache size too big");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    refcount_cache_size /= s->cluster_size;
+    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
+        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
+    }
+    if (refcount_cache_size > INT_MAX) {
+        error_setg(errp, "Refcount cache size too big");
+        ret = -EINVAL;
+        goto fail;
+    }
+
    /* alloc L2 table/refcount block cache */
-    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
-    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
+    s->l2_table_cache = qcow2_cache_create(bs, l2_cache_size);
+    s->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size);
+    if (s->l2_table_cache == NULL || s->refcount_block_cache == NULL) {
+        error_setg(errp, "Could not allocate metadata caches");
+        ret = -ENOMEM;
+        goto fail;
+    }

    s->cluster_cache = g_malloc(s->cluster_size);
    /* one more sector for decompressed data alignment */
-    s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
-                                  + 512);
+    s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
+                                                    * s->cluster_size + 512);
+    if (s->cluster_data == NULL) {
+        error_setg(errp, "Could not allocate temporary cluster buffer");
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    s->cluster_cache_offset = -1;
    s->flags = flags;

@@ -774,7 +910,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
        BdrvCheckResult result = {0};

-        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
+        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not repair dirty image");
            goto fail;
@@ -782,14 +918,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* Enable lazy_refcounts according to image and command line options */
-    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
-    qemu_opts_absorb_qdict(opts, options, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        ret = -EINVAL;
-        goto fail;
-    }
-
    s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));

@@ -803,7 +931,21 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    s->discard_passthrough[QCOW2_DISCARD_OTHER] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);

-    opt_overlap_check = qemu_opt_get(opts, "overlap-check") ?: "cached";
+    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
+    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
+    if (opt_overlap_check_template && opt_overlap_check &&
+        strcmp(opt_overlap_check_template, opt_overlap_check))
+    {
+        error_setg(errp, "Conflicting values for qcow2 options '"
+                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
+                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
+        ret = -EINVAL;
+        goto fail;
+    }
+    if (!opt_overlap_check) {
+        opt_overlap_check = opt_overlap_check_template ?: "cached";
+    }
+
    if (!strcmp(opt_overlap_check, "none")) {
        overlap_check_template = 0;
    } else if (!strcmp(opt_overlap_check, "constant")) {
@@ -816,7 +958,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
                   "'overlap-check'. Allowed are either of the following: "
                   "none, constant, cached, all", opt_overlap_check);
-        qemu_opts_del(opts);
        ret = -EINVAL;
        goto fail;
    }
@@ -831,6 +972,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    }

    qemu_opts_del(opts);
+    opts = NULL;

    if (s->use_lazy_refcounts && s->qcow_version < 3) {
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
@@ -848,11 +990,12 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    return ret;

 fail:
+    qemu_opts_del(opts);
    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);
    qcow2_free_snapshots(bs);
    qcow2_refcount_close(bs);
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;
    if (s->l2_table_cache) {
@@ -1082,7 +1225,12 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                 */
                if (!cluster_data) {
                    cluster_data =
-                        qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+                        qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
+                                                      * s->cluster_size);
+                    if (cluster_data == NULL) {
+                        ret = -ENOMEM;
+                        goto fail;
+                    }
                }

                assert(cur_nr_sectors <=
@@ -1182,8 +1330,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,

        if (s->crypt_method) {
            if (!cluster_data) {
-                cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
-                                                 s->cluster_size);
+                cluster_data = qemu_try_blockalign(bs->file,
+                                                   QCOW_MAX_CRYPT_CLUSTERS
+                                                   * s->cluster_size);
+                if (cluster_data == NULL) {
+                    ret = -ENOMEM;
+                    goto fail;
+                }
            }

            assert(hd_qiov.size <=
@@ -1270,7 +1423,7 @@ fail:
 static void qcow2_close(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;

@@ -1557,7 +1710,7 @@ static int preallocate(BlockDriverState *bs)
    int ret;
    QCowL2Meta *meta;

-    nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+    nb_sectors = bdrv_nb_sectors(bs);
    offset = 0;

    while (nb_sectors) {
@@ -1612,7 +1765,7 @@ static int preallocate(BlockDriverState *bs)

 static int qcow2_create2(const char *filename, int64_t total_size,
                         const char *backing_file, const char *backing_format,
-                         int flags, size_t cluster_size, int prealloc,
+                         int flags, size_t cluster_size, PreallocMode prealloc,
                         QemuOpts *opts, int version,
                         Error **errp)
 {
@@ -1645,6 +1798,56 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    Error *local_err = NULL;
    int ret;

+    if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
+        int64_t meta_size = 0;
+        uint64_t nreftablee, nrefblocke, nl1e, nl2e;
+        int64_t aligned_total_size = align_offset(total_size, cluster_size);
+
+        /* header: 1 cluster */
+        meta_size += cluster_size;
+
+        /* total size of L2 tables */
+        nl2e = aligned_total_size / cluster_size;
+        nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t));
+        meta_size += nl2e * sizeof(uint64_t);
+
+        /* total size of L1 tables */
+        nl1e = nl2e * sizeof(uint64_t) / cluster_size;
+        nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t));
+        meta_size += nl1e * sizeof(uint64_t);
+
+        /* total size of refcount blocks
+         *
+         * note: every host cluster is reference-counted, including metadata
+         * (even refcount blocks are recursively included).
+         * Let:
+         *   a = total_size (this is the guest disk size)
+         *   m = meta size not including refcount blocks and refcount tables
+         *   c = cluster size
+         *   y1 = number of refcount blocks entries
+         *   y2 = meta size including everything
+         * then,
+         *   y1 = (y2 + a)/c
+         *   y2 = y1 * sizeof(u16) + y1 * sizeof(u16) * sizeof(u64) / c + m
+         * we can get y1:
+         *   y1 = (a + m) / (c - sizeof(u16) - sizeof(u16) * sizeof(u64) / c)
+         */
+        nrefblocke = (aligned_total_size + meta_size + cluster_size) /
+            (cluster_size - sizeof(uint16_t) -
+             1.0 * sizeof(uint16_t) * sizeof(uint64_t) / cluster_size);
+        nrefblocke = align_offset(nrefblocke, cluster_size / sizeof(uint16_t));
+        meta_size += nrefblocke * sizeof(uint16_t);
+
+        /* total size of refcount tables */
+        nreftablee = nrefblocke * sizeof(uint16_t) / cluster_size;
+        nreftablee = align_offset(nreftablee, cluster_size / sizeof(uint64_t));
+        meta_size += nreftablee * sizeof(uint64_t);
+
+        qemu_opt_set_number(opts, BLOCK_OPT_SIZE,
+                            aligned_total_size + meta_size);
+        qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_lookup[prealloc]);
+    }
+
    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
@@ -1671,7 +1874,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        .l1_size                    = cpu_to_be32(0),
        .refcount_table_offset      = cpu_to_be64(cluster_size),
        .refcount_table_clusters    = cpu_to_be32(1),
-        .refcount_order             = cpu_to_be32(3 + REFCOUNT_SHIFT),
+        .refcount_order             = cpu_to_be32(4),
        .header_length              = cpu_to_be32(sizeof(*header)),
    };

@@ -1733,7 +1936,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    }

    /* Okay, now that we have a valid image, let's give it the right size */
-    ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
+    ret = bdrv_truncate(bs, total_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not resize image");
        goto out;
@@ -1750,7 +1953,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    }

    /* And if we're supposed to preallocate metadata, do that now */
-    if (prealloc) {
+    if (prealloc != PREALLOC_MODE_OFF) {
        BDRVQcowState *s = bs->opaque;
        qemu_co_mutex_lock(&s->lock);
        ret = preallocate(bs);
@@ -1786,16 +1989,17 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
    char *backing_file = NULL;
    char *backing_fmt = NULL;
    char *buf = NULL;
-    uint64_t sectors = 0;
+    uint64_t size = 0;
    int flags = 0;
    size_t cluster_size = DEFAULT_CLUSTER_SIZE;
-    int prealloc = 0;
+    PreallocMode prealloc;
    int version = 3;
    Error *local_err = NULL;
    int ret;

    /* Read out options */
-    sectors = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512;
+    size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                    BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
@@ -1804,12 +2008,11 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
    cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
                                         DEFAULT_CLUSTER_SIZE);
    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
-    if (!buf || !strcmp(buf, "off")) {
-        prealloc = 0;
-    } else if (!strcmp(buf, "metadata")) {
-        prealloc = 1;
-    } else {
-        error_setg(errp, "Invalid preallocation mode: '%s'", buf);
+    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
+                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
+                               &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto finish;
    }
@@ -1831,7 +2034,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
        flags |= BLOCK_FLAG_LAZY_REFCOUNTS;
    }

-    if (backing_file && prealloc) {
+    if (backing_file && prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Backing file and preallocation cannot be used at "
                   "the same time");
        ret = -EINVAL;
@@ -1845,7 +2048,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
        goto finish;
    }

-    ret = qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
+    ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
                        cluster_size, prealloc, opts, version, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
@@ -1886,7 +2089,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,

    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
-        nb_sectors, QCOW2_DISCARD_REQUEST);
+        nb_sectors, QCOW2_DISCARD_REQUEST, false);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
 }
@@ -1947,7 +2150,6 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
        cluster_offset = bdrv_getlength(bs->file);
-        cluster_offset = (cluster_offset + 511) & ~511;
        bdrv_truncate(bs->file, cluster_offset);
        return 0;
    }
@@ -2028,6 +2230,195 @@ fail:
    return ret;
 }

+static int make_completely_empty(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret, l1_clusters;
+    int64_t offset;
+    uint64_t *new_reftable = NULL;
+    uint64_t rt_entry, l1_size2;
+    struct {
+        uint64_t l1_offset;
+        uint64_t reftable_offset;
+        uint32_t reftable_clusters;
+    } QEMU_PACKED l1_ofs_rt_ofs_cls;
+
+    ret = qcow2_cache_empty(bs, s->l2_table_cache);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = qcow2_cache_empty(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Refcounts will be broken utterly, so set the dirty flag first */
+    ret = qcow2_mark_dirty(bs);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
+
+    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
+    l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t);
+
+    /* After this call, neither the in-memory nor the on-disk refcount
+     * information accurately describe the actual references */
+
+    ret = bdrv_write_zeroes(bs->file, s->l1_table_offset / BDRV_SECTOR_SIZE,
+                            l1_clusters * s->cluster_sectors, 0);
+    if (ret < 0) {
+        goto fail_broken_refcounts;
+    }
+    memset(s->l1_table, 0, l1_size2);
+
+    BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
+
+    /* Overwrite enough clusters directly after the image header cluster to
+     * place the refcount table, a refcount block and the L1 table in; this may
+     * overwrite parts of the existing refcount and L1 table, which is not
+     * an issue because the dirty flag is set, complete data loss is in fact
+     * desired and partial data loss is consequently fine as well */
+    ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
+                            (2 + l1_clusters) * s->cluster_size /
+                            BDRV_SECTOR_SIZE, 0);
+    /* This call (even if it failed overall) may have overwritten on-disk
+     * refcount structures; in that case, the in-memory refcount information
+     * will probably differ from the on-disk information which makes the BDS
+     * unusable */
+    if (ret < 0) {
+        goto fail_broken_refcounts;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
+    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
+
+    /* "Create" an empty reftable (one cluster) directly after the image
+     * header and an empty L1 table three clusters after the image header;
+     * the cluster between those two will be used as the first refblock */
+    cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size);
+    cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size);
+    cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1);
+    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
+                           &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
+    if (ret < 0) {
+        goto fail_broken_refcounts;
+    }
+
+    s->l1_table_offset = 3 * s->cluster_size;
+
+    new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
+    if (!new_reftable) {
+        ret = -ENOMEM;
+        goto fail_broken_refcounts;
+    }
+
+    s->refcount_table_offset = s->cluster_size;
+    s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
+
+    g_free(s->refcount_table);
+    s->refcount_table = new_reftable;
+    new_reftable = NULL;
+
+    /* Now the in-memory refcount information again corresponds to the on-disk
+     * information (reftable is empty and no refblocks (the refblock cache is
+     * empty)); however, this means some clusters (e.g. the image header) are
+     * referenced, but not refcounted, but the normal qcow2 code assumes that
+     * the in-memory information is always correct */
+
+    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
+
+    /* Enter the first refblock into the reftable */
+    rt_entry = cpu_to_be64(2 * s->cluster_size);
+    ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
+                           &rt_entry, sizeof(rt_entry));
+    if (ret < 0) {
+        goto fail_broken_refcounts;
+    }
+    s->refcount_table[0] = 2 * s->cluster_size;
+
+    s->free_cluster_index = 0;
+    assert(3 + l1_clusters <= s->refcount_block_size);
+    offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
+    if (offset < 0) {
+        ret = offset;
+        goto fail_broken_refcounts;
+    } else if (offset > 0) {
+        error_report("First cluster in emptied image is in use");
+        abort();
+    }
+
+    /* Now finally the in-memory information corresponds to the on-disk
+     * structures and is correct */
+    ret = qcow2_mark_clean(bs);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    return 0;
+
+fail_broken_refcounts:
+    /* The BDS is unusable at this point. If we wanted to make it usable, we
+     * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
+     * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
+     * again. However, because the functions which could have caused this error
+     * path to be taken are used by those functions as well, it's very likely
+     * that that sequence will fail as well. Therefore, just eject the BDS. */
+    bs->drv = NULL;
+
+fail:
+    g_free(new_reftable);
+    return ret;
+}
+
+static int qcow2_make_empty(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t start_sector;
+    int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
+    int l1_clusters, ret = 0;
+
+    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
+
+    if (s->qcow_version >= 3 && !s->snapshots &&
+        3 + l1_clusters <= s->refcount_block_size) {
+        /* The following function only works for qcow2 v3 images (it requires
+         * the dirty flag) and only as long as there are no snapshots (because
+         * it completely empties the image). Furthermore, the L1 table clusters
+         * plus three more clusters (image header, refcount table, one refcount
+         * block) must not exceed s->refcount_block_size in number. */
+        return make_completely_empty(bs);
+    }
+
+    /* This fallback code simply discards every active cluster; this is slow,
+     * but works in all cases */
+    for (start_sector = 0; start_sector < bs->total_sectors;
+         start_sector += sector_step)
+    {
+        /* As this function is generally used after committing an external
+         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
+         * default action for this kind of discard is to pass the discard,
+         * which will ideally result in an actually smaller image file, as
+         * is probably desired. */
+        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
+                                     MIN(sector_step,
+                                         bs->total_sectors - start_sector),
+                                     QCOW2_DISCARD_SNAPSHOT, true);
+        if (ret < 0) {
+            break;
+        }
+    }
+
+    return ret;
+}
+
 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
@@ -2083,6 +2474,9 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
            .has_lazy_refcounts = true,
+            .corrupt            = s->incompatible_features &
+                                  QCOW2_INCOMPAT_CORRUPT,
+            .has_corrupt        = true,
        };
    }

@@ -2156,7 +2550,8 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
 * Downgrades an image's version. To achieve this, any incompatible features
 * have to be removed.
 */
-static int qcow2_downgrade(BlockDriverState *bs, int target_version)
+static int qcow2_downgrade(BlockDriverState *bs, int target_version,
+                           BlockDriverAmendStatusCB *status_cb)
 {
    BDRVQcowState *s = bs->opaque;
    int current_version = s->qcow_version;
@@ -2205,7 +2600,7 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version)
    /* clearing autoclear features is trivial */
    s->autoclear_features = 0;

-    ret = qcow2_expand_zero_clusters(bs);
+    ret = qcow2_expand_zero_clusters(bs, status_cb);
    if (ret < 0) {
        return ret;
    }
@@ -2219,7 +2614,8 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version)
    return 0;
 }

-static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts)
+static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
+                               BlockDriverAmendStatusCB *status_cb)
 {
    BDRVQcowState *s = bs->opaque;
    int old_version = s->qcow_version, new_version = old_version;
@@ -2297,7 +2693,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts)
                return ret;
            }
        } else {
-            ret = qcow2_downgrade(bs, new_version);
+            ret = qcow2_downgrade(bs, new_version, status_cb);
            if (ret < 0) {
                return ret;
            }
@@ -2353,6 +2749,52 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts)
    return 0;
 }

+/*
+ * Reports image corruption on stderr and via a BLOCK_IMAGE_CORRUPTED event,
+ * using a printf-style message. A negative offset or size is left out of
+ * the event. fatal is ignored for read-only BDS; corruptions found there
+ * will always be considered non-fatal.
+ */
+void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
+                             int64_t size, const char *message_format, ...)
+{
+    BDRVQcowState *s = bs->opaque;
+    char *message;
+    va_list ap;
+
+    fatal = fatal && !bs->read_only;
+
+    if (s->signaled_corruption &&
+        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
+    {
+        return; /* already signaled and nothing new to report */
+    }
+
+    va_start(ap, message_format);
+    message = g_strdup_vprintf(message_format, ap);
+    va_end(ap);
+
+    if (fatal) {
+        fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
+                "corruption events will be suppressed\n", message);
+    } else {
+        fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
+                "corruption events will be suppressed\n", message);
+    }
+
+    qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message,
+                                          offset >= 0, offset, size >= 0, size,
+                                          fatal, &error_abort);
+    g_free(message);
+
+    if (fatal) {
+        qcow2_mark_corrupt(bs);
+        bs->drv = NULL; /* make BDS unusable */
+    }
+
+    s->signaled_corruption = true;
+}
+
 static QemuOptsList qcow2_create_opts = {
    .name = "qcow2-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
@@ -2392,7 +2834,8 @@ static QemuOptsList qcow2_create_opts = {
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
-            .help = "Preallocation mode (allowed values: off, metadata)"
+            .help = "Preallocation mode (allowed values: off, metadata, "
+                    "falloc, full)"
        },
        {
            .name = BLOCK_OPT_LAZY_REFCOUNTS,
@@ -2424,6 +2867,7 @@ static BlockDriver bdrv_qcow2 = {
    .bdrv_co_discard        = qcow2_co_discard,
    .bdrv_truncate          = qcow2_truncate,
    .bdrv_write_compressed  = qcow2_write_compressed,
+    .bdrv_make_empty        = qcow2_make_empty,

    .bdrv_snapshot_create   = qcow2_snapshot_create,
    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -59,15 +59,19 @@
 /* The cluster reads as all zeros */
 #define QCOW_OFLAG_ZERO (1ULL << 0)

-#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
-
 #define MIN_CLUSTER_BITS 9
 #define MAX_CLUSTER_BITS 21

-#define L2_CACHE_SIZE 16
+#define MIN_L2_CACHE_SIZE 1 /* cluster */

 /* Must be at least 4 to cover all cases of refcount table growth */
-#define REFCOUNT_CACHE_SIZE 4
+#define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
+
+#define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */
+
+/* The refblock cache needs only a fourth of the L2 cache size to cover as many
+ * clusters */
+#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4

 #define DEFAULT_CLUSTER_SIZE 65536

@@ -77,6 +81,7 @@
 #define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
 #define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
 #define QCOW2_OPT_OVERLAP "overlap-check"
+#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template"
 #define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header"
 #define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1"
 #define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2"
@@ -85,6 +90,9 @@
 #define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table"
 #define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1"
 #define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2"
+#define QCOW2_OPT_CACHE_SIZE "cache-size"
+#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
+#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"

 typedef struct QCowHeader {
    uint32_t magic;
@@ -213,6 +221,8 @@ typedef struct BDRVQcowState {
    int l2_size;
    int l1_size;
    int l1_vm_state_index;
+    int refcount_block_bits;
+    int refcount_block_size;
    int csize_shift;
    int csize_mask;
    uint64_t cluster_offset_mask;
@@ -252,6 +262,7 @@ typedef struct BDRVQcowState {
    bool discard_passthrough[QCOW2_DISCARD_MAX];

    int overlap_check; /* bitmask of Qcow2MetadataOverlap values */
+    bool signaled_corruption;

    uint64_t incompatible_features;
    uint64_t compatible_features;
@@ -468,10 +479,16 @@ int qcow2_mark_corrupt(BlockDriverState *bs);
 int qcow2_mark_consistent(BlockDriverState *bs);
 int qcow2_update_header(BlockDriverState *bs);

+void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
+                             int64_t size, const char *message_format, ...)
+                             GCC_FMT_ATTR(5, 6);
+
 /* qcow2-refcount.c functions */
 int qcow2_refcount_init(BlockDriverState *bs);
 void qcow2_refcount_close(BlockDriverState *bs);

+int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index);
+
 int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
                                  int addend, enum qcow2_discard_type type);

@@ -519,10 +536,11 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,

 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
 int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
-    int nb_sectors, enum qcow2_discard_type type);
+    int nb_sectors, enum qcow2_discard_type type, bool full_discard);
 int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);

-int qcow2_expand_zero_clusters(BlockDriverState *bs);
+int qcow2_expand_zero_clusters(BlockDriverState *bs,
+                               BlockDriverAmendStatusCB *status_cb);

 /* qcow2-snapshot.c functions */
 int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
--- a/block/qed-check.c
+++ b/block/qed-check.c
@@ -227,8 +227,10 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
    };
    int ret;

-    check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) *
-                                       sizeof(check.used_clusters[0]));
+    check.used_clusters = g_try_new0(uint32_t, (check.nclusters + 31) / 32);
+    if (check.nclusters && check.used_clusters == NULL) {
+        return -ENOMEM;
+    }

    check.result->bfi.total_clusters =
        (s->header.image_size + s->header.cluster_size - 1) /
--- a/block/qed-gencb.c
+++ b/block/qed-gencb.c
@@ -13,7 +13,7 @@

 #include "qed.h"

-void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
+void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque)
 {
    GenericCB *gencb = g_malloc(len);
    gencb->cb = cb;
@@ -24,7 +24,7 @@ void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
 void gencb_complete(void *opaque, int ret)
 {
    GenericCB *gencb = opaque;
-    BlockDriverCompletionFunc *cb = gencb->cb;
+    BlockCompletionFunc *cb = gencb->cb;
    void *user_opaque = gencb->opaque;

    g_free(gencb);
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -49,7 +49,7 @@ out:
 }

 static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
-                           BlockDriverCompletionFunc *cb, void *opaque)
+                           BlockCompletionFunc *cb, void *opaque)
 {
    QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb),
                                                cb, opaque);
@@ -119,7 +119,7 @@ out:
 */
 static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
                            unsigned int index, unsigned int n, bool flush,
-                            BlockDriverCompletionFunc *cb, void *opaque)
+                            BlockCompletionFunc *cb, void *opaque)
 {
    QEDWriteTableCB *write_table_cb;
    unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
@@ -180,7 +180,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s)
 }

 void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
-                        BlockDriverCompletionFunc *cb, void *opaque)
+                        BlockCompletionFunc *cb, void *opaque)
 {
    BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
    qed_write_table(s, s->header.l1_table_offset,
@@ -235,7 +235,7 @@ static void qed_read_l2_table_cb(void *opaque, int ret)
 }

 void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
-                       BlockDriverCompletionFunc *cb, void *opaque)
+                       BlockCompletionFunc *cb, void *opaque)
 {
    QEDReadL2TableCB *read_l2_table_cb;

@@ -275,7 +275,7 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset

 void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
                        unsigned int index, unsigned int n, bool flush,
-                        BlockDriverCompletionFunc *cb, void *opaque)
+                        BlockCompletionFunc *cb, void *opaque)
 {
    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
    qed_write_table(s, request->l2_table->offset,
--- a/block/qed.c
+++ b/block/qed.c
@@ -18,22 +18,8 @@
 #include "qapi/qmp/qerror.h"
 #include "migration/migration.h"

-static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    QEDAIOCB *acb = (QEDAIOCB *)blockacb;
-    AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
-    bool finished = false;
-
-    /* Wait for the request to finish */
-    acb->finished = &finished;
-    while (!finished) {
-        aio_poll(aio_context, true);
-    }
-}
-
 static const AIOCBInfo qed_aiocb_info = {
    .aiocb_size         = sizeof(QEDAIOCB),
-    .cancel             = qed_aio_cancel,
 };

 static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
@@ -144,7 +130,7 @@ static void qed_write_header_read_cb(void *opaque, int ret)
 * This function only updates known header fields in-place and does not affect
 * extra data after the QED header.
 */
-static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb,
+static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
                             void *opaque)
 {
    /* We must write full sectors for O_DIRECT but cannot necessarily generate
@@ -422,7 +408,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
        snprintf(buf, sizeof(buf), "%" PRIx64,
            s->header.features & ~QED_FEATURE_MASK);
        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-            bs->device_name, "QED", buf);
+            bdrv_get_device_name(bs), "QED", buf);
        return -ENOTSUP;
    }
    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
@@ -648,7 +634,8 @@ static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp)
    char *backing_fmt = NULL;
    int ret;

-    image_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
    cluster_size = qemu_opt_get_size_del(opts,
@@ -772,7 +759,7 @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
 static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
                                  QEMUIOVector *qiov,
                                  QEMUIOVector **backing_qiov,
-                                  BlockDriverCompletionFunc *cb, void *opaque)
+                                  BlockCompletionFunc *cb, void *opaque)
 {
    uint64_t backing_length = 0;
    size_t size;
@@ -864,7 +851,7 @@ static void qed_copy_from_backing_file_write(void *opaque, int ret)
 */
 static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
                                       uint64_t len, uint64_t offset,
-                                       BlockDriverCompletionFunc *cb,
+                                       BlockCompletionFunc *cb,
                                       void *opaque)
 {
    CopyFromBackingFileCB *copy_cb;
@@ -915,21 +902,15 @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
 static void qed_aio_complete_bh(void *opaque)
 {
    QEDAIOCB *acb = opaque;
-    BlockDriverCompletionFunc *cb = acb->common.cb;
+    BlockCompletionFunc *cb = acb->common.cb;
    void *user_opaque = acb->common.opaque;
    int ret = acb->bh_ret;
-    bool *finished = acb->finished;

    qemu_bh_delete(acb->bh);
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);

    /* Invoke callback */
    cb(user_opaque, ret);
-
-    /* Signal cancel completion */
-    if (finished) {
-        *finished = true;
-    }
 }

 static void qed_aio_complete(QEDAIOCB *acb, int ret)
@@ -1083,7 +1064,7 @@ static void qed_aio_write_main(void *opaque, int ret)
    BDRVQEDState *s = acb_to_s(acb);
    uint64_t offset = acb->cur_cluster +
                      qed_offset_into_cluster(s, acb->cur_pos);
-    BlockDriverCompletionFunc *next_fn;
+    BlockCompletionFunc *next_fn;

    trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);

@@ -1183,7 +1164,7 @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
 static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 {
    BDRVQEDState *s = acb_to_s(acb);
-    BlockDriverCompletionFunc *cb;
+    BlockCompletionFunc *cb;

    /* Cancel timer when the first allocating request comes in */
    if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
@@ -1240,7 +1221,11 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
        struct iovec *iov = acb->qiov->iov;

        if (!iov->iov_base) {
-            iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
+            iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
+            if (iov->iov_base == NULL) {
+                qed_aio_complete(acb, -ENOMEM);
+                return;
+            }
            memset(iov->iov_base, 0, iov->iov_len);
        }
    }
@@ -1380,11 +1365,11 @@ static void qed_aio_next_io(void *opaque, int ret)
                      io_fn, acb);
 }

-static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
-                                       int64_t sector_num,
-                                       QEMUIOVector *qiov, int nb_sectors,
-                                       BlockDriverCompletionFunc *cb,
-                                       void *opaque, int flags)
+static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
+                                 int64_t sector_num,
+                                 QEMUIOVector *qiov, int nb_sectors,
+                                 BlockCompletionFunc *cb,
+                                 void *opaque, int flags)
 {
    QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);

@@ -1392,7 +1377,6 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
                        opaque, flags);

    acb->flags = flags;
-    acb->finished = NULL;
    acb->qiov = qiov;
    acb->qiov_offset = 0;
    acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
@@ -1406,20 +1390,20 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
    return &acb->common;
 }

-static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
-                                            int64_t sector_num,
-                                            QEMUIOVector *qiov, int nb_sectors,
-                                            BlockDriverCompletionFunc *cb,
-                                            void *opaque)
+static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
+                                      int64_t sector_num,
+                                      QEMUIOVector *qiov, int nb_sectors,
+                                      BlockCompletionFunc *cb,
+                                      void *opaque)
 {
    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
 }

-static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
-                                             int64_t sector_num,
-                                             QEMUIOVector *qiov, int nb_sectors,
-                                             BlockDriverCompletionFunc *cb,
-                                             void *opaque)
+static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
+                                       int64_t sector_num,
+                                       QEMUIOVector *qiov, int nb_sectors,
+                                       BlockCompletionFunc *cb,
+                                       void *opaque)
 {
    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
                         opaque, QED_AIOCB_WRITE);
@@ -1447,7 +1431,7 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
                                                 int nb_sectors,
                                                 BdrvRequestFlags flags)
 {
-    BlockDriverAIOCB *blockacb;
+    BlockAIOCB *blockacb;
    BDRVQEDState *s = bs->opaque;
    QEDWriteZeroesCB cb = { .done = false };
    QEMUIOVector qiov;
--- a/block/qed.h
+++ b/block/qed.h
@@ -128,7 +128,7 @@ enum {
 };

 typedef struct QEDAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;
    int bh_ret;                     /* final return status for completion bh */
    QSIMPLEQ_ENTRY(QEDAIOCB) next;  /* next request */
@@ -203,11 +203,11 @@ typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t l
 * Generic callback for chaining async callbacks
 */
 typedef struct {
-    BlockDriverCompletionFunc *cb;
+    BlockCompletionFunc *cb;
    void *opaque;
 } GenericCB;

-void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque);
+void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque);
 void gencb_complete(void *opaque, int ret);

 /**
@@ -230,16 +230,16 @@ void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table);
 */
 int qed_read_l1_table_sync(BDRVQEDState *s);
 void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
-                        BlockDriverCompletionFunc *cb, void *opaque);
+                        BlockCompletionFunc *cb, void *opaque);
 int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
                            unsigned int n);
 int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
                           uint64_t offset);
 void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
-                       BlockDriverCompletionFunc *cb, void *opaque);
+                       BlockCompletionFunc *cb, void *opaque);
 void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
                        unsigned int index, unsigned int n, bool flush,
-                        BlockDriverCompletionFunc *cb, void *opaque);
+                        BlockCompletionFunc *cb, void *opaque);
 int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
                            unsigned int index, unsigned int n, bool flush);

--- a/block/quorum.c
+++ b/block/quorum.c
@@ -16,7 +16,12 @@
 #include <gnutls/gnutls.h>
 #include <gnutls/crypto.h>
 #include "block/block_int.h"
+#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qint.h"
 #include "qapi/qmp/qjson.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qstring.h"
 #include "qapi-event.h"

 #define HASH_LENGTH 32
@@ -24,6 +29,7 @@
 #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
 #define QUORUM_OPT_BLKVERIFY      "blkverify"
 #define QUORUM_OPT_REWRITE        "rewrite-corrupted"
+#define QUORUM_OPT_READ_PATTERN   "read-pattern"

 /* This union holds a vote hash value */
 typedef union QuorumVoteValue {
@@ -74,6 +80,8 @@ typedef struct BDRVQuorumState {
    bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted
                            * block if Quorum is reached.
                            */
+
+    QuorumReadPattern read_pattern;
 } BDRVQuorumState;

 typedef struct QuorumAIOCB QuorumAIOCB;
@@ -84,7 +92,7 @@ typedef struct QuorumAIOCB QuorumAIOCB;
 * $children_count QuorumChildRequest.
 */
 typedef struct QuorumChildRequest {
-    BlockDriverAIOCB *aiocb;
+    BlockAIOCB *aiocb;
    QEMUIOVector qiov;
    uint8_t *buf;
    int ret;
@@ -97,7 +105,7 @@ typedef struct QuorumChildRequest {
 * used to do operations on each children and track overall progress.
 */
 struct QuorumAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;

    /* Request metadata */
    uint64_t sector_num;
@@ -117,11 +125,12 @@ struct QuorumAIOCB {

    bool is_read;
    int vote_ret;
+    int child_iter;             /* which child to read in fifo pattern */
 };

 static bool quorum_vote(QuorumAIOCB *acb);

-static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
+static void quorum_aio_cancel(BlockAIOCB *blockacb)
 {
    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
    BDRVQuorumState *s = acb->common.bs->opaque;
@@ -129,21 +138,19 @@ static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)

    /* cancel all callbacks */
    for (i = 0; i < s->num_children; i++) {
-        bdrv_aio_cancel(acb->qcrs[i].aiocb);
+        if (acb->qcrs[i].aiocb) {
+            bdrv_aio_cancel_async(acb->qcrs[i].aiocb);
+        }
    }
-
-    g_free(acb->qcrs);
-    qemu_aio_release(acb);
 }

 static AIOCBInfo quorum_aiocb_info = {
    .aiocb_size         = sizeof(QuorumAIOCB),
-    .cancel             = quorum_aio_cancel,
+    .cancel_async       = quorum_aio_cancel,
 };

 static void quorum_aio_finalize(QuorumAIOCB *acb)
 {
-    BDRVQuorumState *s = acb->common.bs->opaque;
    int i, ret = 0;

    if (acb->vote_ret) {
@@ -153,14 +160,15 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
    acb->common.cb(acb->common.opaque, ret);

    if (acb->is_read) {
-        for (i = 0; i < s->num_children; i++) {
+        /* on the quorum case acb->child_iter == s->num_children - 1 */
+        for (i = 0; i <= acb->child_iter; i++) {
            qemu_vfree(acb->qcrs[i].buf);
            qemu_iovec_destroy(&acb->qcrs[i].qiov);
        }
    }

    g_free(acb->qcrs);
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

 static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
@@ -178,7 +186,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
                                   QEMUIOVector *qiov,
                                   uint64_t sector_num,
                                   int nb_sectors,
-                                   BlockDriverCompletionFunc *cb,
+                                   BlockCompletionFunc *cb,
                                   void *opaque)
 {
    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque);
@@ -218,8 +226,8 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret)

 static void quorum_report_failure(QuorumAIOCB *acb)
 {
-    const char *reference = acb->common.bs->device_name[0] ?
-                            acb->common.bs->device_name :
+    const char *reference = bdrv_get_device_name(acb->common.bs)[0] ?
+                            bdrv_get_device_name(acb->common.bs) :
                            acb->common.bs->node_name;

    qapi_event_send_quorum_failure(reference, acb->sector_num,
@@ -256,6 +264,21 @@ static void quorum_rewrite_aio_cb(void *opaque, int ret)
    quorum_aio_finalize(acb);
 }

+static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb);
+
+/*
+ * Copy the data described by @source into the buffers described by @dest,
+ * one vector element at a time.  Both vectors must agree on element count,
+ * total size and every element length; this is checked with assertions.
+ */
+static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
+{
+    int idx = 0;
+
+    assert(dest->niov == source->niov);
+    assert(dest->size == source->size);
+
+    while (idx < source->niov) {
+        struct iovec *to = &dest->iov[idx];
+        struct iovec *from = &source->iov[idx];
+
+        assert(to->iov_len == from->iov_len);
+        memcpy(to->iov_base, from->iov_base, from->iov_len);
+        idx++;
+    }
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
    QuorumChildRequest *sacb = opaque;
@@ -263,6 +286,21 @@ static void quorum_aio_cb(void *opaque, int ret)
    BDRVQuorumState *s = acb->common.bs->opaque;
    bool rewrite = false;

+    if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
+        /*
+         * We try to read next child in FIFO order if we fail to read.
+         *
+         * Only advance child_iter when a next child actually exists: it
+         * must remain a valid index into acb->qcrs[], because
+         * quorum_aio_finalize() releases qcrs[0..child_iter].  Incrementing
+         * it unconditionally would leave it equal to s->num_children after
+         * the last child failed, and finalize would then touch one element
+         * past the end of the array.
+         */
+        if (ret < 0 && acb->child_iter + 1 < s->num_children) {
+            acb->child_iter++;
+            read_fifo_child(acb);
+            return;
+        }
+
+        /* Success: hand the data read from this child back to the caller */
+        if (ret == 0) {
+            quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov);
+        }
+        /* Either success, or every child failed: report the last result */
+        acb->vote_ret = ret;
+        quorum_aio_finalize(acb);
+        return;
+    }
+
    sacb->ret = ret;
    acb->count++;
    if (ret == 0) {
@@ -343,19 +381,6 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
    return count;
 }

-static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
-{
-    int i;
-    assert(dest->niov == source->niov);
-    assert(dest->size == source->size);
-    for (i = 0; i < source->niov; i++) {
-        assert(dest->iov[i].iov_len == source->iov[i].iov_len);
-        memcpy(dest->iov[i].iov_base,
-               source->iov[i].iov_base,
-               source->iov[i].iov_len);
-    }
-}
-
 static void quorum_count_vote(QuorumVotes *votes,
                              QuorumVoteValue *value,
                              int index)
@@ -615,40 +640,68 @@ free_exit:
    return rewrite;
 }

-static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
-                                         int64_t sector_num,
-                                         QEMUIOVector *qiov,
-                                         int nb_sectors,
-                                         BlockDriverCompletionFunc *cb,
-                                         void *opaque)
+static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
 {
-    BDRVQuorumState *s = bs->opaque;
-    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
-                                      nb_sectors, cb, opaque);
+    BDRVQuorumState *s = acb->common.bs->opaque;
    int i;

-    acb->is_read = true;
-
    for (i = 0; i < s->num_children; i++) {
-        acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size);
-        qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov);
-        qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf);
+        acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size);
+        qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
+        qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
    }

    for (i = 0; i < s->num_children; i++) {
-        bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors,
-                       quorum_aio_cb, &acb->qcrs[i]);
+        bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov,
+                       acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]);
    }

    return &acb->common;
 }

-static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
-                                          int64_t sector_num,
-                                          QEMUIOVector *qiov,
-                                          int nb_sectors,
-                                          BlockDriverCompletionFunc *cb,
-                                          void *opaque)
+/*
+ * Submit the read for the single child selected by acb->child_iter
+ * (fifo read pattern).  A buffer obtained from qemu_blockalign() and a
+ * clone of the caller's qiov are prepared for that child only;
+ * quorum_aio_cb() is the completion callback.
+ *
+ * Returns the common AIOCB of the request.
+ */
+static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb)
+{
+    BDRVQuorumState *s = acb->common.bs->opaque;
+    BlockDriverState *child = s->bs[acb->child_iter];
+    QuorumChildRequest *qcr = &acb->qcrs[acb->child_iter];
+
+    /* per-child buffer and qiov mirroring the layout of the caller's qiov */
+    qcr->buf = qemu_blockalign(child, acb->qiov->size);
+    qemu_iovec_init(&qcr->qiov, acb->qiov->niov);
+    qemu_iovec_clone(&qcr->qiov, acb->qiov, qcr->buf);
+
+    bdrv_aio_readv(child, acb->sector_num, &qcr->qiov, acb->nb_sectors,
+                   quorum_aio_cb, qcr);
+
+    return &acb->common;
+}
+
+/*
+ * Start an asynchronous read of @nb_sectors sectors at @sector_num into
+ * @qiov.  Depending on s->read_pattern, either all children are read
+ * (quorum) or reading starts with the first child only (fifo).
+ *
+ * Returns the common AIOCB of the newly created request.
+ */
+static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs,
+                                    int64_t sector_num,
+                                    QEMUIOVector *qiov,
+                                    int nb_sectors,
+                                    BlockCompletionFunc *cb,
+                                    void *opaque)
+{
+    BDRVQuorumState *s = bs->opaque;
+    QuorumAIOCB *acb;
+
+    acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, cb, opaque);
+    acb->is_read = true;
+
+    if (s->read_pattern != QUORUM_READ_PATTERN_QUORUM) {
+        /* fifo: begin with child 0 */
+        acb->child_iter = 0;
+        return read_fifo_child(acb);
+    }
+
+    /* quorum: all children are read, so point at the last one */
+    acb->child_iter = s->num_children - 1;
+    return read_quorum_children(acb);
+}
+
+static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs,
+                                     int64_t sector_num,
+                                     QEMUIOVector *qiov,
+                                     int nb_sectors,
+                                     BlockCompletionFunc *cb,
+                                     void *opaque)
 {
    BDRVQuorumState *s = bs->opaque;
    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors,
@@ -782,16 +835,39 @@ static QemuOptsList quorum_runtime_opts = {
            .type = QEMU_OPT_BOOL,
            .help = "Rewrite corrupted block on read quorum",
        },
+        {
+            .name = QUORUM_OPT_READ_PATTERN,
+            .type = QEMU_OPT_STRING,
+            .help = "Allowed pattern: quorum, fifo. Quorum is default",
+        },
        { /* end of list */ }
    },
 };

+/*
+ * Map the read-pattern option string @opt to its index in
+ * QuorumReadPattern_lookup[].
+ *
+ * Returns QUORUM_READ_PATTERN_QUORUM when @opt is NULL (option not given),
+ * the matching index when @opt names a known pattern, and -EINVAL
+ * otherwise.
+ */
+static int parse_read_pattern(const char *opt)
+{
+    int pattern = 0;
+
+    if (opt == NULL) {
+        /* Set quorum as default */
+        return QUORUM_READ_PATTERN_QUORUM;
+    }
+
+    while (pattern < QUORUM_READ_PATTERN_MAX) {
+        if (strcmp(opt, QuorumReadPattern_lookup[pattern]) == 0) {
+            return pattern;
+        }
+        pattern++;
+    }
+
+    return -EINVAL;
+}
+
 static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
                       Error **errp)
 {
    BDRVQuorumState *s = bs->opaque;
    Error *local_err = NULL;
-    QemuOpts *opts;
+    QemuOpts *opts = NULL;
    bool *opened;
    QDict *sub = NULL;
    QList *list = NULL;
@@ -827,28 +903,37 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
    }

    s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
-
-    /* and validate it against s->num_children */
-    ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+    ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
    if (ret < 0) {
+        error_setg(&local_err, "Please set read-pattern as fifo or quorum");
        goto exit;
    }
+    s->read_pattern = ret;

-    /* is the driver in blkverify mode */
-    if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
-        s->num_children == 2 && s->threshold == 2) {
-        s->is_blkverify = true;
-    } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) {
-        fprintf(stderr, "blkverify mode is set by setting blkverify=on "
-                "and using two files with vote_threshold=2\n");
-    }
+    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
+        /* and validate it against s->num_children */
+        ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+        if (ret < 0) {
+            goto exit;
+        }

-    s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false);
-    if (s->rewrite_corrupted && s->is_blkverify) {
-        error_setg(&local_err,
-                   "rewrite-corrupted=on cannot be used with blkverify=on");
-        ret = -EINVAL;
-        goto exit;
+        /* is the driver in blkverify mode */
+        if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
+            s->num_children == 2 && s->threshold == 2) {
+            s->is_blkverify = true;
+        } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) {
+            fprintf(stderr, "blkverify mode is set by setting blkverify=on "
+                    "and using two files with vote_threshold=2\n");
+        }
+
+        s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE,
+                                                 false);
+        if (s->rewrite_corrupted && s->is_blkverify) {
+            error_setg(&local_err,
+                       "rewrite-corrupted=on cannot be used with blkverify=on");
+            ret = -EINVAL;
+            goto exit;
+        }
    }

    /* allocate the children BlockDriverState array */
@@ -903,6 +988,7 @@ close_exit:
    g_free(s->bs);
    g_free(opened);
 exit:
+    qemu_opts_del(opts);
    /* propagate error */
    if (local_err) {
        error_propagate(errp, local_err);
@@ -945,6 +1031,39 @@ static void quorum_attach_aio_context(BlockDriverState *bs,
    }
 }

+/*
+ * Rebuild bs->full_open_options for a quorum node from its children.
+ *
+ * Each child is refreshed first.  If any child ends up without
+ * full_open_options, the function returns early and leaves
+ * bs->full_open_options untouched.  Otherwise a new QDict is created
+ * holding the driver name, the vote threshold, the blkverify and
+ * rewrite-corrupted settings, and the list of the children's options.
+ *
+ * NOTE(review): s->read_pattern is not recorded in the generated options;
+ * confirm whether a fifo-pattern quorum is expected to be reproducible
+ * from them.
+ */
+static void quorum_refresh_filename(BlockDriverState *bs)
+{
+    BDRVQuorumState *s = bs->opaque;
+    QDict *opts;
+    QList *children;
+    int i;
+
+    /* All children must be describable, otherwise give up */
+    for (i = 0; i < s->num_children; i++) {
+        bdrv_refresh_filename(s->bs[i]);
+        if (!s->bs[i]->full_open_options) {
+            return;
+        }
+    }
+
+    /* Collect the children's options, in child order */
+    children = qlist_new();
+    for (i = 0; i < s->num_children; i++) {
+        /* reference taken before the object is appended to the list */
+        QINCREF(s->bs[i]->full_open_options);
+        qlist_append_obj(children, QOBJECT(s->bs[i]->full_open_options));
+    }
+
+    /* Assemble the quorum node's own options around the children list */
+    opts = qdict_new();
+    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("quorum")));
+    qdict_put_obj(opts, QUORUM_OPT_VOTE_THRESHOLD,
+                  QOBJECT(qint_from_int(s->threshold)));
+    qdict_put_obj(opts, QUORUM_OPT_BLKVERIFY,
+                  QOBJECT(qbool_from_int(s->is_blkverify)));
+    qdict_put_obj(opts, QUORUM_OPT_REWRITE,
+                  QOBJECT(qbool_from_int(s->rewrite_corrupted)));
+    qdict_put_obj(opts, "children", QOBJECT(children));
+
+    bs->full_open_options = opts;
+}
+
 static BlockDriver bdrv_quorum = {
    .format_name                        = "quorum",
    .protocol_name                      = "quorum",
@@ -953,6 +1072,7 @@ static BlockDriver bdrv_quorum = {

    .bdrv_file_open                     = quorum_open,
    .bdrv_close                         = quorum_close,
+    .bdrv_refresh_filename              = quorum_refresh_filename,

    .bdrv_co_flush_to_disk              = quorum_co_flush,

--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -35,9 +35,9 @@
 #ifdef CONFIG_LINUX_AIO
 void *laio_init(void);
 void laio_cleanup(void *s);
-BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type);
+        BlockCompletionFunc *cb, void *opaque, int type);
 void laio_detach_aio_context(void *s, AioContext *old_context);
 void laio_attach_aio_context(void *s, AioContext *new_context);
 void laio_io_plug(BlockDriverState *bs, void *aio_ctx);
@@ -49,10 +49,10 @@ typedef struct QEMUWin32AIOState QEMUWin32AIOState;
 QEMUWin32AIOState *win32_aio_init(void);
 void win32_aio_cleanup(QEMUWin32AIOState *aio);
 int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
-BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
        QEMUWin32AIOState *aio, HANDLE hfile,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type);
+        BlockCompletionFunc *cb, void *opaque, int type);
 void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                  AioContext *old_context);
 void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -30,6 +30,7 @@
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
 #include "raw-aio.h"
+#include "qapi/util.h"

 #if defined(__APPLE__) && (__MACH__)
 #include <paths.h>
@@ -59,9 +60,6 @@
 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
 #endif
 #endif
-#ifdef CONFIG_FIEMAP
-#include <linux/fiemap.h>
-#endif
 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
 #include <linux/falloc.h>
 #endif
@@ -149,9 +147,7 @@ typedef struct BDRVRawState {
    bool has_discard:1;
    bool has_write_zeroes:1;
    bool discard_zeroes:1;
-#ifdef CONFIG_FIEMAP
-    bool skip_fiemap;
-#endif
+    bool needs_alignment;
 } BDRVRawState;

 typedef struct BDRVRawReopenState {
@@ -229,7 +225,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)

    /* For /dev/sg devices the alignment is not really used.
       With buffered I/O, we don't have any restrictions. */
-    if (bs->sg || !(s->open_flags & O_DIRECT)) {
+    if (bs->sg || !s->needs_alignment) {
        bs->request_alignment = 1;
        s->buf_align = 1;
        return;
@@ -445,6 +441,9 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,

    s->has_discard = true;
    s->has_write_zeroes = true;
+    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
+        s->needs_alignment = true;
+    }

    if (fstat(s->fd, &st) < 0) {
        error_setg_errno(errp, errno, "Could not stat file");
@@ -471,6 +470,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
        }
 #endif
    }
+#ifdef __FreeBSD__
+    if (S_ISCHR(st.st_mode)) {
+        /*
+         * The file is a char device (disk), which on FreeBSD isn't behind
+         * a pager, so force all requests to be aligned. This is needed
+         * so QEMU makes sure all IO operations on the device are aligned
+         * to sector size, or else FreeBSD will reject them with EINVAL.
+         */
+        s->needs_alignment = true;
+    }
+#endif

 #ifdef CONFIG_XFS
    if (platform_test_xfs_fd(s->fd)) {
@@ -517,7 +527,7 @@ static int raw_reopen_prepare(BDRVReopenState *state,

    s = state->bs->opaque;

-    state->opaque = g_malloc0(sizeof(BDRVRawReopenState));
+    state->opaque = g_new0(BDRVRawReopenState, 1);
    raw_s = state->opaque;

 #ifdef CONFIG_LINUX_AIO
@@ -807,7 +817,11 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
     * Ok, we have to do it the hard way, copy all segments into
     * a single aligned buffer.
     */
-    buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes);
+    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
+    if (buf == NULL) {
+        return -ENOMEM;
+    }
+
    if (aiocb->aio_type & QEMU_AIO_WRITE) {
        char *p = buf;
        int i;
@@ -1036,9 +1050,9 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
    return thread_pool_submit_co(pool, aio_worker, acb);
 }

-static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
+static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
    ThreadPool *pool;
@@ -1061,9 +1075,9 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }

-static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    BDRVRawState *s = bs->opaque;

@@ -1071,11 +1085,12 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
        return NULL;

    /*
-     * If O_DIRECT is used the buffer needs to be aligned on a sector
-     * boundary.  Check if this is the case or tell the low-level
-     * driver that it needs to copy the buffer.
+     * Check if the underlying device requires requests to be aligned,
+     * and if the request we are trying to submit is aligned or not.
+     * If this is the case tell the low-level driver that it needs
+     * to copy the buffer.
     */
-    if ((bs->open_flags & BDRV_O_NOCACHE)) {
+    if (s->needs_alignment) {
        if (!bdrv_qiov_is_aligned(bs, qiov)) {
            type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
@@ -1120,24 +1135,24 @@ static void raw_aio_flush_io_queue(BlockDriverState *bs)
 #endif
 }

-static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
                          cb, opaque, QEMU_AIO_READ);
 }

-static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
                          cb, opaque, QEMU_AIO_WRITE);
 }

-static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
+        BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;

@@ -1361,128 +1376,199 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
    int result = 0;
    int64_t total_size = 0;
    bool nocow = false;
+    PreallocMode prealloc;
+    char *buf = NULL;
+    Error *local_err = NULL;

    strstart(filename, "file:", &filename);

    /* Read out options */
-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
+    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
+    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
+                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
+                               &local_err);
+    g_free(buf);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        result = -EINVAL;
+        goto out;
+    }

    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
                   0644);
    if (fd < 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not create file");
-    } else {
-        if (nocow) {
-#ifdef __linux__
-            /* Set NOCOW flag to solve performance issue on fs like btrfs.
-             * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
-             * will be ignored since any failure of this operation should not
-             * block the left work.
-             */
-            int attr;
-            if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
-                attr |= FS_NOCOW_FL;
-                ioctl(fd, FS_IOC_SETFLAGS, &attr);
-            }
-#endif
-        }
-
-        if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
-            result = -errno;
-            error_setg_errno(errp, -result, "Could not resize file");
-        }
-        if (qemu_close(fd) != 0) {
-            result = -errno;
-            error_setg_errno(errp, -result, "Could not close the new file");
-        }
+        goto out;
    }
+
+    if (nocow) {
+#ifdef __linux__
+        /* Set NOCOW flag to solve performance issue on fs like btrfs.
+         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
+         * will be ignored since any failure of this operation should not
+         * block the remaining work.
+         */
+        int attr;
+        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
+            attr |= FS_NOCOW_FL;
+            ioctl(fd, FS_IOC_SETFLAGS, &attr);
+        }
+#endif
+    }
+
+    if (ftruncate(fd, total_size) != 0) {
+        result = -errno;
+        error_setg_errno(errp, -result, "Could not resize file");
+        goto out_close;
+    }
+
+    switch (prealloc) {
+#ifdef CONFIG_POSIX_FALLOCATE
+    case PREALLOC_MODE_FALLOC:
+        /* posix_fallocate() doesn't set errno. */
+        result = -posix_fallocate(fd, 0, total_size);
+        if (result != 0) {
+            error_setg_errno(errp, -result,
+                             "Could not preallocate data for the new file");
+        }
+        break;
+#endif
+    case PREALLOC_MODE_FULL:
+    {
+        int64_t num = 0, left = total_size;
+        buf = g_malloc0(65536);
+
+        while (left > 0) {
+            num = MIN(left, 65536);
+            result = write(fd, buf, num);
+            if (result < 0) {
+                result = -errno;
+                error_setg_errno(errp, -result,
+                                 "Could not write to the new file");
+                break;
+            }
+            left -= result;
+        }
+        if (result >= 0) {
+            result = fsync(fd);
+            if (result < 0) {
+                result = -errno;
+                error_setg_errno(errp, -result,
+                                 "Could not flush new file to disk");
+            }
+        }
+        g_free(buf);
+        break;
+    }
+    case PREALLOC_MODE_OFF:
+        break;
+    default:
+        result = -EINVAL;
+        error_setg(errp, "Unsupported preallocation mode: %s",
+                   PreallocMode_lookup[prealloc]);
+        break;
+    }
+
+out_close:
+    if (qemu_close(fd) != 0 && result == 0) {
+        result = -errno;
+        error_setg_errno(errp, -result, "Could not close the new file");
+    }
+out:
    return result;
 }

-static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data,
-                          off_t *hole, int nb_sectors, int *pnum)
-{
-#ifdef CONFIG_FIEMAP
-    BDRVRawState *s = bs->opaque;
-    int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
-    struct {
-        struct fiemap fm;
-        struct fiemap_extent fe;
-    } f;
-
-    if (s->skip_fiemap) {
-        return -ENOTSUP;
-    }
-
-    f.fm.fm_start = start;
-    f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
-    f.fm.fm_flags = 0;
-    f.fm.fm_extent_count = 1;
-    f.fm.fm_reserved = 0;
-    if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
-        s->skip_fiemap = true;
-        return -errno;
-    }
-
-    if (f.fm.fm_mapped_extents == 0) {
-        /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
-         * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
-         */
-        off_t length = lseek(s->fd, 0, SEEK_END);
-        *hole = f.fm.fm_start;
-        *data = MIN(f.fm.fm_start + f.fm.fm_length, length);
-    } else {
-        *data = f.fe.fe_logical;
-        *hole = f.fe.fe_logical + f.fe.fe_length;
-        if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
-            ret |= BDRV_BLOCK_ZERO;
-        }
-    }
-
-    return ret;
-#else
-    return -ENOTSUP;
-#endif
-}
-
-static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
-                             off_t *hole, int *pnum)
+/*
+ * Find allocation range in @bs around offset @start.
+ * May change underlying file descriptor's file offset.
+ * If @start is not in a hole, store @start in @data, and the
+ * beginning of the next hole in @hole, and return 0.
+ * If @start is in a non-trailing hole, store @start in @hole and the
+ * beginning of the next non-hole in @data, and return 0.
+ * If @start is in a trailing hole or beyond EOF, return -ENXIO.
+ * If we can't find out, return a negative errno other than -ENXIO.
+ */
+static int find_allocation(BlockDriverState *bs, off_t start,
+                           off_t *data, off_t *hole)
 {
 #if defined SEEK_HOLE && defined SEEK_DATA
    BDRVRawState *s = bs->opaque;
+    off_t offs;

-    *hole = lseek(s->fd, start, SEEK_HOLE);
-    if (*hole == -1) {
-        /* -ENXIO indicates that sector_num was past the end of the file.
-         * There is a virtual hole there.  */
-        assert(errno != -ENXIO);
+    /*
+     * SEEK_DATA cases:
+     * D1. offs == start: start is in data
+     * D2. offs > start: start is in a hole, next data at offs
+     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
+     *                              or start is beyond EOF
+     *     If the latter happens, the file has been truncated behind
+     *     our back since we opened it.  All bets are off then.
+     *     Treating like a trailing hole is simplest.
+     * D4. offs < 0, errno != ENXIO: we learned nothing
+     */
+    offs = lseek(s->fd, start, SEEK_DATA);
+    if (offs < 0) {
+        return -errno;          /* D3 or D4 */
+    }
+    assert(offs >= start);

-        return -errno;
+    if (offs > start) {
+        /* D2: in hole, next data at offs */
+        *hole = start;
+        *data = offs;
+        return 0;
    }

-    if (*hole > start) {
+    /* D1: in data, end not yet known */
+
+    /*
+     * SEEK_HOLE cases:
+     * H1. offs == start: start is in a hole
+     *     If this happens here, a hole has been dug behind our back
+     *     since the previous lseek().
+     * H2. offs > start: either start is in data, next hole at offs,
+     *                   or start is in trailing hole, EOF at offs
+     *     Linux treats trailing holes like any other hole: offs ==
+     *     start.  Solaris seeks to EOF instead: offs > start (blech).
+     *     If that happens here, a hole has been dug behind our back
+     *     since the previous lseek().
+     * H3. offs < 0, errno = ENXIO: start is beyond EOF
+     *     If this happens, the file has been truncated behind our
+     *     back since we opened it.  Treat it like a trailing hole.
+     * H4. offs < 0, errno != ENXIO: we learned nothing
+     *     Pretend we know nothing at all, i.e. "forget" about D1.
+     */
+    offs = lseek(s->fd, start, SEEK_HOLE);
+    if (offs < 0) {
+        return -errno;          /* D1 and (H3 or H4) */
+    }
+    assert(offs >= start);
+
+    if (offs > start) {
+        /*
+         * D1 and H2: either in data, next hole at offs, or it was in
+         * data but is now in a trailing hole.  In the latter case,
+         * all bets are off.  Treating it as if there was data all
+         * the way to EOF is safe, so simply do that.
+         */
        *data = start;
-    } else {
-        /* On a hole.  We need another syscall to find its end.  */
-        *data = lseek(s->fd, start, SEEK_DATA);
-        if (*data == -1) {
-            *data = lseek(s->fd, 0, SEEK_END);
-        }
+        *hole = offs;
+        return 0;
    }

-    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
+    /* D1 and H1 */
+    return -EBUSY;
 #else
    return -ENOTSUP;
 #endif
 }

 /*
- * Returns true iff the specified sector is present in the disk image. Drivers
- * not implementing the functionality are assumed to not support backing files,
- * hence all their sectors are reported as allocated.
+ * Returns the allocation status of the specified sectors.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
@@ -1499,7 +1585,8 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                                    int nb_sectors, int *pnum)
 {
    off_t start, data = 0, hole = 0;
-    int64_t ret;
+    int64_t total_size;
+    int ret;

    ret = fd_open(bs);
    if (ret < 0) {
@@ -1507,34 +1594,41 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
    }

    start = sector_num * BDRV_SECTOR_SIZE;
-
-    ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum);
-    if (ret < 0) {
-        ret = try_seek_hole(bs, start, &data, &hole, pnum);
-        if (ret < 0) {
-            /* Assume everything is allocated. */
-            data = 0;
-            hole = start + nb_sectors * BDRV_SECTOR_SIZE;
-            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
-        }
+    total_size = bdrv_getlength(bs);
+    if (total_size < 0) {
+        return total_size;
+    } else if (start >= total_size) {
+        *pnum = 0;
+        return 0;
+    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
+        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
    }

-    if (data <= start) {
+    ret = find_allocation(bs, start, &data, &hole);
+    if (ret == -ENXIO) {
+        /* Trailing hole */
+        *pnum = nb_sectors;
+        ret = BDRV_BLOCK_ZERO;
+    } else if (ret < 0) {
+        /* No info available, so pretend there are no holes */
+        *pnum = nb_sectors;
+        ret = BDRV_BLOCK_DATA;
+    } else if (data == start) {
        /* On a data extent, compute sectors to the end of the extent.  */
        *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
+        ret = BDRV_BLOCK_DATA;
    } else {
        /* On a hole, compute sectors to the beginning of the next extent.  */
+        assert(hole == start);
        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
-        ret &= ~BDRV_BLOCK_DATA;
-        ret |= BDRV_BLOCK_ZERO;
+        ret = BDRV_BLOCK_ZERO;
    }
-
-    return ret;
+    return ret | BDRV_BLOCK_OFFSET_VALID | start;
 }

-static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
+static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors,
-    BlockDriverCompletionFunc *cb, void *opaque)
+    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;

@@ -1581,6 +1675,11 @@ static QemuOptsList raw_create_opts = {
            .type = QEMU_OPT_BOOL,
            .help = "Turn off copy-on-write (valid only on btrfs)"
        },
+        {
+            .name = BLOCK_OPT_PREALLOC,
+            .type = QEMU_OPT_STRING,
+            .help = "Preallocation mode (allowed values: off, falloc, full)"
+        },
        { /* end of list */ }
    }
 };
@@ -1867,9 +1966,9 @@ static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
    return ioctl(s->fd, req, buf);
 }

-static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
+static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData *acb;
@@ -1908,9 +2007,9 @@ static int fd_open(BlockDriverState *bs)

 #endif /* !linux && !FreeBSD */

-static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
+static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors,
-    BlockDriverCompletionFunc *cb, void *opaque)
+    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;

@@ -1962,8 +2061,8 @@ static int hdev_create(const char *filename, QemuOpts *opts,
    (void)has_prefix;

    /* Read out options */
-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);

    fd = qemu_open(filename, O_WRONLY | O_BINARY);
    if (fd < 0) {
@@ -1979,7 +2078,7 @@ static int hdev_create(const char *filename, QemuOpts *opts,
        error_setg(errp,
                   "The given file is neither a block nor a character device");
        ret = -ENODEV;
-    } else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) {
+    } else if (lseek(fd, 0, SEEK_END) < total_size) {
        error_setg(errp, "Device is too small");
        ret = -ENOSPC;
    }
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -138,9 +138,9 @@ static int aio_worker(void *arg)
    return ret;
 }

-static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
+static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    RawWin32AIOData *acb = g_slice_new(RawWin32AIOData);
    ThreadPool *pool;
@@ -369,9 +369,9 @@ fail:
    return ret;
 }

-static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
                         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-                         BlockDriverCompletionFunc *cb, void *opaque)
+                         BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
    if (s->aio) {
@@ -383,9 +383,9 @@ static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
    }
 }

-static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
                          int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-                          BlockDriverCompletionFunc *cb, void *opaque)
+                          BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
    if (s->aio) {
@@ -397,8 +397,8 @@ static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
    }
 }

-static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
-                         BlockDriverCompletionFunc *cb, void *opaque)
+static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
+                         BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
    return paio_submit(bs, s->hfile, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
@@ -511,8 +511,8 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
    strstart(filename, "file:", &filename);

    /* Read out options */
-    total_size =
-        qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512;
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);

    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
                   0644);
@@ -521,7 +521,7 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
        return -EIO;
    }
    set_sparse(fd);
-    ftruncate(fd, total_size * 512);
+    ftruncate(fd, total_size);
    qemu_close(fd);
    return 0;
 }
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -129,10 +129,10 @@ static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
    return bdrv_ioctl(bs->file, req, buf);
 }

-static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs,
-                                       unsigned long int req, void *buf,
-                                       BlockDriverCompletionFunc *cb,
-                                       void *opaque)
+static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs,
+                                 unsigned long int req, void *buf,
+                                 BlockCompletionFunc *cb,
+                                 void *opaque)
 {
    return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque);
 }
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -68,7 +68,7 @@ typedef enum {
 } RBDAIOCmd;

 typedef struct RBDAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    QEMUBH *bh;
    int64_t ret;
    QEMUIOVector *qiov;
@@ -77,7 +77,6 @@ typedef struct RBDAIOCB {
    int64_t sector_num;
    int error;
    struct BDRVRBDState *s;
-    int cancelled;
    int status;
 } RBDAIOCB;

@@ -314,7 +313,8 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    /* Read out options */
-    bytes = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                     BDRV_SECTOR_SIZE);
    objsize = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, 0);
    if (objsize) {
        if ((objsize - 1) & objsize) {    /* not a power of 2? */
@@ -407,9 +407,7 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
    acb->status = 0;

-    if (!acb->cancelled) {
-        qemu_aio_release(acb);
-    }
+    qemu_aio_unref(acb);
 }

 /* TODO Convert to fine grained options */
@@ -538,25 +536,8 @@ static void qemu_rbd_close(BlockDriverState *bs)
    rados_shutdown(s->cluster);
 }

-/*
- * Cancel aio. Since we don't reference acb in a non qemu threads,
- * it is safe to access it here.
- */
-static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    RBDAIOCB *acb = (RBDAIOCB *) blockacb;
-    acb->cancelled = 1;
-
-    while (acb->status == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(acb->common.bs), true);
-    }
-
-    qemu_aio_release(acb);
-}
-
 static const AIOCBInfo rbd_aiocb_info = {
    .aiocb_size = sizeof(RBDAIOCB),
-    .cancel = qemu_rbd_aio_cancel,
 };

 static void rbd_finish_bh(void *opaque)
@@ -608,16 +589,16 @@ static int rbd_aio_flush_wrapper(rbd_image_t image,
 #endif
 }

-static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
-                                       int64_t sector_num,
-                                       QEMUIOVector *qiov,
-                                       int nb_sectors,
-                                       BlockDriverCompletionFunc *cb,
-                                       void *opaque,
-                                       RBDAIOCmd cmd)
+static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
+                                 int64_t sector_num,
+                                 QEMUIOVector *qiov,
+                                 int nb_sectors,
+                                 BlockCompletionFunc *cb,
+                                 void *opaque,
+                                 RBDAIOCmd cmd)
 {
    RBDAIOCB *acb;
-    RADOSCB *rcb;
+    RADOSCB *rcb = NULL;
    rbd_completion_t c;
    int64_t off, size;
    char *buf;
@@ -631,12 +612,14 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
    if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
        acb->bounce = NULL;
    } else {
-        acb->bounce = qemu_blockalign(bs, qiov->size);
+        acb->bounce = qemu_try_blockalign(bs, qiov->size);
+        if (acb->bounce == NULL) {
+            goto failed;
+        }
    }
    acb->ret = 0;
    acb->error = 0;
    acb->s = s;
-    acb->cancelled = 0;
    acb->bh = NULL;
    acb->status = -EINPROGRESS;

@@ -649,7 +632,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
    off = sector_num * BDRV_SECTOR_SIZE;
    size = nb_sectors * BDRV_SECTOR_SIZE;

-    rcb = g_malloc(sizeof(RADOSCB));
+    rcb = g_new(RADOSCB, 1);
    rcb->done = 0;
    rcb->acb = acb;
    rcb->buf = buf;
@@ -688,36 +671,36 @@ failed_completion:
 failed:
    g_free(rcb);
    qemu_vfree(acb->bounce);
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
    return NULL;
 }

-static BlockDriverAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
-                                            int64_t sector_num,
-                                            QEMUIOVector *qiov,
-                                            int nb_sectors,
-                                            BlockDriverCompletionFunc *cb,
-                                            void *opaque)
+static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
+                                      int64_t sector_num,
+                                      QEMUIOVector *qiov,
+                                      int nb_sectors,
+                                      BlockCompletionFunc *cb,
+                                      void *opaque)
 {
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
                         RBD_AIO_READ);
 }

-static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
-                                             int64_t sector_num,
-                                             QEMUIOVector *qiov,
-                                             int nb_sectors,
-                                             BlockDriverCompletionFunc *cb,
-                                             void *opaque)
+static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
+                                       int64_t sector_num,
+                                       QEMUIOVector *qiov,
+                                       int nb_sectors,
+                                       BlockCompletionFunc *cb,
+                                       void *opaque)
 {
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
                         RBD_AIO_WRITE);
 }

 #ifdef LIBRBD_SUPPORTS_AIO_FLUSH
-static BlockDriverAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
-                                            BlockDriverCompletionFunc *cb,
-                                            void *opaque)
+static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
+                                      BlockCompletionFunc *cb,
+                                      void *opaque)
 {
    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
 }
@@ -859,7 +842,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
    int max_snaps = RBD_MAX_SNAPS;

    do {
-        snaps = g_malloc(sizeof(*snaps) * max_snaps);
+        snaps = g_new(rbd_snap_info_t, max_snaps);
        snap_count = rbd_snap_list(s->image, snaps, &max_snaps);
        if (snap_count <= 0) {
            g_free(snaps);
@@ -870,7 +853,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
        goto done;
    }

-    sn_tab = g_malloc0(snap_count * sizeof(QEMUSnapshotInfo));
+    sn_tab = g_new0(QEMUSnapshotInfo, snap_count);

    for (i = 0; i < snap_count; i++) {
        const char *snap_name = snaps[i].name;
@@ -893,17 +876,29 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
 }

 #ifdef LIBRBD_SUPPORTS_DISCARD
-static BlockDriverAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
-                                              int64_t sector_num,
-                                              int nb_sectors,
-                                              BlockDriverCompletionFunc *cb,
-                                              void *opaque)
+static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
+                                        int64_t sector_num,
+                                        int nb_sectors,
+                                        BlockCompletionFunc *cb,
+                                        void *opaque)
 {
    return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque,
                         RBD_AIO_DISCARD);
 }
 #endif

+#ifdef LIBRBD_SUPPORTS_INVALIDATE
+/* .bdrv_invalidate_cache hook: forwards to rbd_invalidate_cache(). */
+static void qemu_rbd_invalidate_cache(BlockDriverState *bs, Error **errp)
+{
+    BDRVRBDState *state = bs->opaque;
+    int rc = rbd_invalidate_cache(state->image);
+    if (rc < 0) {
+        error_setg_errno(errp, -rc, "Failed to invalidate the cache");
+    }
+}
+#endif
+
 static QemuOptsList qemu_rbd_create_opts = {
    .name = "rbd-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head),
@@ -953,6 +948,9 @@ static BlockDriver bdrv_rbd = {
    .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
    .bdrv_snapshot_list     = qemu_rbd_snap_list,
    .bdrv_snapshot_goto     = qemu_rbd_snap_rollback,
+#ifdef LIBRBD_SUPPORTS_INVALIDATE
+    .bdrv_invalidate_cache  = qemu_rbd_invalidate_cache,
+#endif
 };

 static void bdrv_rbd_init(void)
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -103,6 +103,9 @@
 #define SD_INODE_SIZE (sizeof(SheepdogInode))
 #define CURRENT_VDI_ID 0

+#define LOCK_TYPE_NORMAL 0
+#define LOCK_TYPE_SHARED 1      /* for iSCSI multipath */
+
 typedef struct SheepdogReq {
    uint8_t proto_ver;
    uint8_t opcode;
@@ -166,7 +169,8 @@ typedef struct SheepdogVdiReq {
    uint8_t copy_policy;
    uint8_t reserved[2];
    uint32_t snapid;
-    uint32_t pad[3];
+    uint32_t type;
+    uint32_t pad[2];
 } SheepdogVdiReq;

 typedef struct SheepdogVdiRsp {
@@ -297,7 +301,7 @@ enum AIOCBState {
 };

 struct SheepdogAIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;

    QEMUIOVector *qiov;

@@ -311,7 +315,6 @@ struct SheepdogAIOCB {
    void (*aio_done_func)(SheepdogAIOCB *);

    bool cancelable;
-    bool *finished;
    int nr_pending;
 };

@@ -442,10 +445,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
 static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
 {
    qemu_coroutine_enter(acb->coroutine, NULL);
-    if (acb->finished) {
-        *acb->finished = true;
-    }
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }

 /*
@@ -473,41 +473,38 @@ static bool sd_acb_cancelable(const SheepdogAIOCB *acb)
    return true;
 }

-static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
+static void sd_aio_cancel(BlockAIOCB *blockacb)
 {
    SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
    BDRVSheepdogState *s = acb->common.bs->opaque;
    AIOReq *aioreq, *next;
-    bool finished = false;

-    acb->finished = &finished;
-    while (!finished) {
-        if (sd_acb_cancelable(acb)) {
-            /* Remove outstanding requests from pending and failed queues.  */
-            QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
-                               next) {
-                if (aioreq->aiocb == acb) {
-                    free_aio_req(s, aioreq);
-                }
+    if (sd_acb_cancelable(acb)) {
+        /* Remove outstanding requests from pending and failed queues.  */
+        QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
+                           next) {
+            if (aioreq->aiocb == acb) {
+                free_aio_req(s, aioreq);
            }
-            QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
-                               next) {
-                if (aioreq->aiocb == acb) {
-                    free_aio_req(s, aioreq);
-                }
-            }
-
-            assert(acb->nr_pending == 0);
-            sd_finish_aiocb(acb);
-            return;
        }
-        aio_poll(s->aio_context, true);
+        QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
+                           next) {
+            if (aioreq->aiocb == acb) {
+                free_aio_req(s, aioreq);
+            }
+        }
+
+        assert(acb->nr_pending == 0);
+        if (acb->common.cb) {
+            acb->common.cb(acb->common.opaque, -ECANCELED);
+        }
+        sd_finish_aiocb(acb);
    }
 }

 static const AIOCBInfo sd_aiocb_info = {
-    .aiocb_size = sizeof(SheepdogAIOCB),
-    .cancel = sd_aio_cancel,
+    .aiocb_size     = sizeof(SheepdogAIOCB),
+    .cancel_async   = sd_aio_cancel,
 };

 static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
@@ -524,7 +521,6 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,

    acb->aio_done_func = NULL;
    acb->cancelable = true;
-    acb->finished = NULL;
    acb->coroutine = qemu_coroutine_self();
    acb->ret = 0;
    acb->nr_pending = 0;
@@ -712,7 +708,6 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)

 static coroutine_fn void reconnect_to_sdog(void *opaque)
 {
-    Error *local_err = NULL;
    BDRVSheepdogState *s = opaque;
    AIOReq *aio_req, *next;

@@ -727,6 +722,7 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)

    /* Try to reconnect the sheepdog server every one second. */
    while (s->fd < 0) {
+        Error *local_err = NULL;
        s->fd = get_sheep_fd(s, &local_err);
        if (s->fd < 0) {
            DPRINTF("Wait for connection to be established\n");
@@ -1090,6 +1086,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
    memset(&hdr, 0, sizeof(hdr));
    if (lock) {
        hdr.opcode = SD_OP_LOCK_VDI;
+        hdr.type = LOCK_TYPE_NORMAL;
    } else {
        hdr.opcode = SD_OP_GET_VDI_INFO;
    }
@@ -1110,6 +1107,8 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
                   sd_strerror(rsp->result), filename, snapid, tag);
        if (rsp->result == SD_RES_NO_VDI) {
            ret = -ENOENT;
+        } else if (rsp->result == SD_RES_VDI_LOCKED) {
+            ret = -EBUSY;
        } else {
            ret = -EIO;
        }
@@ -1682,7 +1681,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
    uint32_t snapid;
    bool prealloc = false;

-    s = g_malloc0(sizeof(BDRVSheepdogState));
+    s = g_new0(BDRVSheepdogState, 1);

    memset(tag, 0, sizeof(tag));
    if (strstr(filename, "://")) {
@@ -1695,7 +1694,8 @@ static int sd_create(const char *filename, QemuOpts *opts,
        goto out;
    }

-    s->inode.vdi_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                                 BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    if (!buf || !strcmp(buf, "off")) {
@@ -1793,6 +1793,7 @@ static void sd_close(BlockDriverState *bs)
    memset(&hdr, 0, sizeof(hdr));

    hdr.opcode = SD_OP_RELEASE_VDI;
+    hdr.type = LOCK_TYPE_NORMAL;
    hdr.base_vdi_id = s->inode.vdi_id;
    wlen = strlen(s->name) + 1;
    hdr.data_length = wlen;
@@ -2129,7 +2130,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,

    ret = sd_co_rw_vector(acb);
    if (ret <= 0) {
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return ret;
    }

@@ -2150,7 +2151,7 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,

    ret = sd_co_rw_vector(acb);
    if (ret <= 0) {
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return ret;
    }

@@ -2273,7 +2274,7 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
    uint32_t snapid = 0;
    int ret = 0;

-    old_s = g_malloc(sizeof(BDRVSheepdogState));
+    old_s = g_new(BDRVSheepdogState, 1);

    memcpy(old_s, s, sizeof(BDRVSheepdogState));

@@ -2357,7 +2358,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
        goto out;
    }

-    sn_tab = g_malloc0(nr * sizeof(*sn_tab));
+    sn_tab = g_new0(QEMUSnapshotInfo, nr);

    /* calculate a vdi id with hash function */
    hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
@@ -2509,7 +2510,7 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,

    ret = sd_co_rw_vector(acb);
    if (ret <= 0) {
-        qemu_aio_release(acb);
+        qemu_aio_unref(acb);
        return ret;
    }

--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -236,6 +236,10 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
        error_setg(errp, "snapshot_id and name are both NULL");
        return -EINVAL;
    }
+
+    /* drain all pending i/o before deleting snapshot */
+    bdrv_drain_all();
+
    if (drv->bdrv_snapshot_delete) {
        return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp);
    }
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -517,6 +517,11 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
    const char *host, *user, *path, *host_key_check;
    int port;

+    if (!qdict_haskey(options, "host")) {
+        ret = -EINVAL;
+        error_setg(errp, "No hostname was specified");
+        goto err;
+    }
    host = qdict_get_str(options, "host");

    if (qdict_haskey(options, "port")) {
@@ -525,6 +530,11 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
        port = 22;
    }

+    if (!qdict_haskey(options, "path")) {
+        ret = -EINVAL;
+        error_setg(errp, "No path was specified");
+        goto err;
+    }
    path = qdict_get_str(options, "path");

    if (qdict_haskey(options, "user")) {
@@ -700,7 +710,8 @@ static int ssh_create(const char *filename, QemuOpts *opts, Error **errp)
    ssh_state_init(&s);

    /* Get desired file size. */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    DPRINTF("total_size=%" PRIi64, total_size);

    uri_options = qdict_new();
--- a/block/stream.c
+++ b/block/stream.c
@@ -79,9 +79,39 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
    bdrv_refresh_limits(top, NULL);
 }

+typedef struct {
+    int ret;            /* result later passed to block_job_completed() */
+    bool reached_end;   /* set when sector_num == end in stream_run */
+} StreamCompleteData;
+
+/* Deferred to the main loop by stream_run; completes the job with data->ret. */
+static void stream_complete(BlockJob *job, void *opaque)
+{
+    StreamBlockJob *s = container_of(job, StreamBlockJob, common);
+    StreamCompleteData *data = opaque;
+    BlockDriverState *base = s->base;
+    bool finished = !block_job_is_cancelled(&s->common) &&
+                    data->reached_end && data->ret == 0;
+
+    if (finished) {
+        const char *base_id = base ? s->backing_file_str : NULL;
+        const char *base_fmt = NULL;
+        if (base && base->drv) {
+            base_fmt = base->drv->format_name;
+        }
+        data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt);
+        close_unused_images(job->bs, base, base_id);
+    }
+
+    g_free(s->backing_file_str);
+    block_job_completed(&s->common, data->ret);
+    g_free(data);
+}
+
 static void coroutine_fn stream_run(void *opaque)
 {
    StreamBlockJob *s = opaque;
+    StreamCompleteData *data;
    BlockDriverState *bs = s->common.bs;
    BlockDriverState *base = s->base;
    int64_t sector_num, end;
@@ -183,21 +213,13 @@ wait:
    /* Do not remove the backing file if an error was there but ignored.  */
    ret = error;

-    if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
-        const char *base_id = NULL, *base_fmt = NULL;
-        if (base) {
-            base_id = s->backing_file_str;
-            if (base->drv) {
-                base_fmt = base->drv->format_name;
-            }
-        }
-        ret = bdrv_change_backing_file(bs, base_id, base_fmt);
-        close_unused_images(bs, base, base_id);
-    }
-
    qemu_vfree(buf);
-    g_free(s->backing_file_str);
-    block_job_completed(&s->common, ret);
+
+    /* Modify backing chain and close BDSes in main loop */
+    data = g_malloc(sizeof(*data));
+    data->ret = ret;
+    data->reached_end = sector_num == end;
+    block_job_defer_to_main_loop(&s->common, stream_complete, data);
 }

 static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -220,7 +242,7 @@ static const BlockJobDriver stream_job_driver = {
 void stream_start(BlockDriverState *bs, BlockDriverState *base,
                  const char *backing_file_str, int64_t speed,
                  BlockdevOnError on_error,
-                  BlockDriverCompletionFunc *cb,
+                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
 {
    StreamBlockJob *s;
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -53,13 +53,6 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
-#ifdef __linux__
-#include <linux/fs.h>
-#include <sys/ioctl.h>
-#ifndef FS_NOCOW_FL
-#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
-#endif
-#endif

 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
@@ -127,8 +120,18 @@ typedef unsigned char uuid_t[16];

 #define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)

-/* max blocks in image is (0xffffffff / 4) */
-#define VDI_BLOCKS_IN_IMAGE_MAX  0x3fffffff
+/* The bmap will take up VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) bytes; since
+ * the bmap is read and written in a single operation, its size needs to be
+ * limited to INT_MAX; furthermore, when opening an image, the bmap size is
+ * rounded up to be aligned on BDRV_SECTOR_SIZE.
+ * Therefore this should satisfy the following:
+ * VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) + BDRV_SECTOR_SIZE == INT_MAX + 1
+ * (INT_MAX + 1 is the first value not representable as an int)
+ * This guarantees that any value below or equal to the constant will, when
+ * multiplied by sizeof(uint32_t) and rounded up to a BDRV_SECTOR_SIZE boundary,
+ * still be below or equal to INT_MAX. */
+#define VDI_BLOCKS_IN_IMAGE_MAX \
+    ((unsigned)((INT_MAX + 1u - BDRV_SECTOR_SIZE) / sizeof(uint32_t)))
 #define VDI_DISK_SIZE_MAX        ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \
                                  (uint64_t)DEFAULT_CLUSTER_SIZE)

@@ -144,12 +147,14 @@ static inline int uuid_is_null(const uuid_t uu)
    return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0;
 }

+# if defined(CONFIG_VDI_DEBUG)
 static inline void uuid_unparse(const uuid_t uu, char *out)
 {
    snprintf(out, 37, UUID_FMT,
            uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
            uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]);
 }
+# endif
 #endif

 typedef struct {
@@ -299,7 +304,12 @@ static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res,
        return -ENOTSUP;
    }

-    bmap = g_malloc(s->header.blocks_in_image * sizeof(uint32_t));
+    bmap = g_try_new(uint32_t, s->header.blocks_in_image);
+    if (s->header.blocks_in_image && bmap == NULL) {
+        res->check_errors++;
+        return -ENOMEM;
+    }
+
    memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t));

    /* Check block map and value of blocks_allocated. */
@@ -357,23 +367,23 @@ static int vdi_make_empty(BlockDriverState *bs)
 static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
    const VdiHeader *header = (const VdiHeader *)buf;
-    int result = 0;
+    int ret = 0;

    logout("\n");

    if (buf_size < sizeof(*header)) {
        /* Header too small, no VDI. */
    } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) {
-        result = 100;
+        ret = 100;
    }

-    if (result == 0) {
+    if (ret == 0) {
        logout("no vdi image\n");
    } else {
        logout("%s", header->text);
    }

-    return result;
+    return ret;
 }

 static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
@@ -409,8 +419,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
           We accept them but round the disk size to the next multiple of
           SECTOR_SIZE. */
        logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size);
-        header.disk_size += SECTOR_SIZE - 1;
-        header.disk_size &= ~(SECTOR_SIZE - 1);
+        header.disk_size = ROUND_UP(header.disk_size, SECTOR_SIZE);
    }

    if (header.signature != VDI_SIGNATURE) {
@@ -477,8 +486,13 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    s->header = header;

    bmap_size = header.blocks_in_image * sizeof(uint32_t);
-    bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE;
-    s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
+    bmap_size = DIV_ROUND_UP(bmap_size, SECTOR_SIZE);
+    s->bmap = qemu_try_blockalign(bs->file, bmap_size * SECTOR_SIZE);
+    if (s->bmap == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
    ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size);
    if (ret < 0) {
        goto fail_free_bmap;
@@ -487,13 +501,13 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when vdi images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vdi", bs->device_name, "live migration");
+              "vdi", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);

    return 0;

 fail_free_bmap:
-    g_free(s->bmap);
+    qemu_vfree(s->bmap);

 fail:
    return ret;
@@ -681,8 +695,7 @@ static int vdi_co_write(BlockDriverState *bs,

 static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
 {
-    int fd;
-    int result = 0;
+    int ret = 0;
    uint64_t bytes = 0;
    uint32_t blocks;
    size_t block_size = DEFAULT_CLUSTER_SIZE;
@@ -690,12 +703,16 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    VdiHeader header;
    size_t i;
    size_t bmap_size;
-    bool nocow = false;
+    int64_t offset = 0;
+    Error *local_err = NULL;
+    BlockDriverState *bs = NULL;
+    uint32_t *bmap = NULL;

    logout("\n");

    /* Read out options. */
-    bytes = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                     BDRV_SECTOR_SIZE);
 #if defined(CONFIG_VDI_BLOCK_SIZE)
    /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). */
    block_size = qemu_opt_get_size_del(opts,
@@ -707,45 +724,33 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
        image_type = VDI_TYPE_STATIC;
    }
 #endif
-    nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false);

    if (bytes > VDI_DISK_SIZE_MAX) {
-        result = -ENOTSUP;
+        ret = -ENOTSUP;
        error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64
                          ", max supported is 0x%" PRIx64 ")",
                          bytes, VDI_DISK_SIZE_MAX);
        goto exit;
    }

-    fd = qemu_open(filename,
-                   O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
-                   0644);
-    if (fd < 0) {
-        result = -errno;
+    ret = bdrv_create_file(filename, opts, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
        goto exit;
    }
-
-    if (nocow) {
-#ifdef __linux__
-        /* Set NOCOW flag to solve performance issue on fs like btrfs.
-         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will
-         * be ignored since any failure of this operation should not block the
-         * left work.
-         */
-        int attr;
-        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
-            attr |= FS_NOCOW_FL;
-            ioctl(fd, FS_IOC_SETFLAGS, &attr);
-        }
-#endif
+    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                    NULL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
    }

    /* We need enough blocks to store the given disk size,
       so always round up. */
-    blocks = (bytes + block_size - 1) / block_size;
+    blocks = DIV_ROUND_UP(bytes, block_size);

    bmap_size = blocks * sizeof(uint32_t);
-    bmap_size = ((bmap_size + SECTOR_SIZE - 1) & ~(SECTOR_SIZE -1));
+    bmap_size = ROUND_UP(bmap_size, SECTOR_SIZE);

    memset(&header, 0, sizeof(header));
    pstrcpy(header.text, sizeof(header.text), VDI_TEXT);
@@ -769,13 +774,20 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    vdi_header_print(&header);
 #endif
    vdi_header_to_le(&header);
-    if (write(fd, &header, sizeof(header)) < 0) {
-        result = -errno;
-        goto close_and_exit;
+    ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header));
+    if (ret < 0) {
+        error_setg(errp, "Error writing header to %s", filename);
+        goto exit;
    }
+    offset += sizeof(header);

    if (bmap_size > 0) {
-        uint32_t *bmap = g_malloc0(bmap_size);
+        bmap = g_try_malloc0(bmap_size);
+        if (bmap == NULL) {
+            ret = -ENOMEM;
+            error_setg(errp, "Could not allocate bmap");
+            goto exit;
+        }
        for (i = 0; i < blocks; i++) {
            if (image_type == VDI_TYPE_STATIC) {
                bmap[i] = i;
@@ -783,35 +795,33 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
                bmap[i] = VDI_UNALLOCATED;
            }
        }
-        if (write(fd, bmap, bmap_size) < 0) {
-            result = -errno;
-            g_free(bmap);
-            goto close_and_exit;
+        ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size);
+        if (ret < 0) {
+            error_setg(errp, "Error writing bmap to %s", filename);
+            goto exit;
        }
-        g_free(bmap);
+        offset += bmap_size;
    }

    if (image_type == VDI_TYPE_STATIC) {
-        if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) {
-            result = -errno;
-            goto close_and_exit;
+        ret = bdrv_truncate(bs, offset + blocks * block_size);
+        if (ret < 0) {
+            error_setg(errp, "Failed to statically allocate %s", filename);
+            goto exit;
        }
    }

-close_and_exit:
-    if ((close(fd) < 0) && !result) {
-        result = -errno;
-    }
-
 exit:
-    return result;
+    bdrv_unref(bs);
+    g_free(bmap);
+    return ret;
 }

 static void vdi_close(BlockDriverState *bs)
 {
    BDRVVdiState *s = bs->opaque;

-    g_free(s->bmap);
+    qemu_vfree(s->bmap);

    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
--- a/block/vhdx-endian.c
+++ b/block/vhdx-endian.c
@@ -82,8 +82,6 @@ void vhdx_log_desc_le_import(VHDXLogDescriptor *d)
    assert(d != NULL);

    le32_to_cpus(&d->signature);
-    le32_to_cpus(&d->trailing_bytes);
-    le64_to_cpus(&d->leading_bytes);
    le64_to_cpus(&d->file_offset);
    le64_to_cpus(&d->sequence_number);
 }
@@ -99,6 +97,15 @@ void vhdx_log_desc_le_export(VHDXLogDescriptor *d)
    cpu_to_le64s(&d->sequence_number);
 }

+void vhdx_log_data_le_import(VHDXLogDataSector *d)
+{
+    assert(d != NULL);
+
+    le32_to_cpus(&d->data_signature);
+    le32_to_cpus(&d->sequence_high);
+    le32_to_cpus(&d->sequence_low);
+}
+
 void vhdx_log_data_le_export(VHDXLogDataSector *d)
 {
    assert(d != NULL);
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -84,6 +84,7 @@ static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
    if (ret < 0) {
        goto exit;
    }
+    vhdx_log_entry_hdr_le_import(hdr);

 exit:
    return ret;
@@ -211,7 +212,7 @@ static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr,
 {
    int valid = false;

-    if (memcmp(&hdr->signature, "loge", 4)) {
+    if (hdr->signature != VHDX_LOG_SIGNATURE) {
        goto exit;
    }

@@ -275,12 +276,12 @@ static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc,
        goto exit;
    }

-    if (!memcmp(&desc->signature, "zero", 4)) {
+    if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
        if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) {
            /* valid */
            ret = true;
        }
-    } else if (!memcmp(&desc->signature, "desc", 4)) {
+    } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
            /* valid */
            ret = true;
    }
@@ -327,13 +328,15 @@ static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
 * passed into this function. Each descriptor will also be validated,
 * and error returned if any are invalid. */
 static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
-                              VHDXLogEntries *log, VHDXLogDescEntries **buffer)
+                              VHDXLogEntries *log, VHDXLogDescEntries **buffer,
+                              bool convert_endian)
 {
    int ret = 0;
    uint32_t desc_sectors;
    uint32_t sectors_read;
    VHDXLogEntryHeader hdr;
    VHDXLogDescEntries *desc_entries = NULL;
+    VHDXLogDescriptor desc;
    int i;

    assert(*buffer == NULL);
@@ -342,14 +345,19 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
    if (ret < 0) {
        goto exit;
    }
-    vhdx_log_entry_hdr_le_import(&hdr);
+
    if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
        ret = -EINVAL;
        goto exit;
    }

    desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
-    desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE);
+    desc_entries = qemu_try_blockalign(bs->file,
+                                       desc_sectors * VHDX_LOG_SECTOR_SIZE);
+    if (desc_entries == NULL) {
+        ret = -ENOMEM;
+        goto exit;
+    }

    ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries,
                                desc_sectors, false);
@@ -363,12 +371,19 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,

    /* put in proper endianness, and validate each desc */
    for (i = 0; i < hdr.descriptor_count; i++) {
-        vhdx_log_desc_le_import(&desc_entries->desc[i]);
-        if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) {
+        desc = desc_entries->desc[i];
+        vhdx_log_desc_le_import(&desc);
+        if (convert_endian) {
+            desc_entries->desc[i] = desc;
+        }
+        if (vhdx_log_desc_is_valid(&desc, &hdr) == false) {
            ret = -EINVAL;
            goto free_and_exit;
        }
    }
+    if (convert_endian) {
+        desc_entries->hdr = hdr;
+    }

    *buffer = desc_entries;
    goto exit;
@@ -403,7 +418,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,

    buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);

-    if (!memcmp(&desc->signature, "desc", 4)) {
+    if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
        /* data sector */
        if (data == NULL) {
            ret = -EFAULT;
@@ -431,10 +446,15 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,

        memcpy(buffer+offset, &desc->trailing_bytes, 4);

-    } else if (!memcmp(&desc->signature, "zero", 4)) {
+    } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
        /* write 'count' sectors of sector */
        memset(buffer, 0, VHDX_LOG_SECTOR_SIZE);
        count = desc->zero_length / VHDX_LOG_SECTOR_SIZE;
+    } else {
+        error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32,
+                      desc->signature);
+        ret = -EINVAL;
+        goto exit;
    }

    file_offset = desc->file_offset;
@@ -493,13 +513,13 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
            goto exit;
        }

-        ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries);
+        ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true);
        if (ret < 0) {
            goto exit;
        }

        for (i = 0; i < desc_entries->hdr.descriptor_count; i++) {
-            if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) {
+            if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) {
                /* data sector, so read a sector to flush */
                ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read,
                                            data, 1, false);
@@ -510,6 +530,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
                    ret = -EINVAL;
                    goto exit;
                }
+                vhdx_log_data_le_import(data);
            }

            ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data);
@@ -558,9 +579,6 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
        goto inc_and_exit;
    }

-    vhdx_log_entry_hdr_le_import(&hdr);
-
-
    if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
        goto inc_and_exit;
    }
@@ -573,13 +591,13 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,

    desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);

-    /* Read desc sectors, and calculate log checksum */
+    /* Read all log sectors, and calculate log checksum */

    total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE;


    /* read_desc() will increment the read idx */
-    ret = vhdx_log_read_desc(bs, s, log, &desc_buffer);
+    ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false);
    if (ret < 0) {
        goto free_and_exit;
    }
@@ -602,7 +620,7 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
        }
    }
    crc ^= 0xffffffff;
-    if (crc != desc_buffer->hdr.checksum) {
+    if (crc != hdr.checksum) {
        goto free_and_exit;
    }

@@ -905,7 +923,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
    buffer = qemu_blockalign(bs, total_length);
    memcpy(buffer, &new_hdr, sizeof(new_hdr));

-    new_desc = (VHDXLogDescriptor *) (buffer + sizeof(new_hdr));
+    new_desc = buffer + sizeof(new_hdr);
    data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE);
    data_tmp = data;

@@ -962,7 +980,6 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
     * last data sector */
    vhdx_update_checksum(buffer, total_length,
                         offsetof(VHDXLogEntryHeader, checksum));
-    cpu_to_le32s((uint32_t *)(buffer + 4));

    /* now write to the log */
    ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer,
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -99,7 +99,8 @@ static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d,
 /* Each parent type must have a valid GUID; this is for parent images
 * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
 * need to make up our own QCOW2 GUID type */
-static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7,
+static const MSGUID parent_vhdx_guid __attribute__((unused))
+                                     = { .data1 = 0xb04aefb7,
                                         .data2 = 0xd19e,
                                         .data3 = 0x4a81,
                                         .data4 = { 0xb7, 0x89, 0x25, 0xb8,
@@ -135,10 +136,8 @@ typedef struct VHDXSectorInfo {
 * buf: buffer pointer
 * size: size of buffer (must be > crc_offset+4)
 *
- * Note: The resulting checksum is in the CPU endianness, not necessarily
- *       in the file format endianness (LE).  Any header export to disk should
- *       make sure that vhdx_header_le_export() is used to convert to the
- *       correct endianness
+ * Note: The buffer should have all multi-byte data in little-endian format,
+ *       and the resulting checksum is in little-endian format.
 */
 uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset)
 {
@@ -149,6 +148,7 @@ uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset)

    memset(buf + crc_offset, 0, sizeof(crc));
    crc =  crc32c(0xffffffff, buf, size);
+    cpu_to_le32s(&crc);
    memcpy(buf + crc_offset, &crc, sizeof(crc));

    return crc;
@@ -300,7 +300,7 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr,
 {
    uint8_t *buffer = NULL;
    int ret;
-    VHDXHeader header_le;
+    VHDXHeader *header_le;

    assert(bs_file != NULL);
    assert(hdr != NULL);
@@ -321,11 +321,12 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr,
    }

    /* overwrite the actual VHDXHeader portion */
-    memcpy(buffer, hdr, sizeof(VHDXHeader));
-    hdr->checksum = vhdx_update_checksum(buffer, VHDX_HEADER_SIZE,
-                                         offsetof(VHDXHeader, checksum));
-    vhdx_header_le_export(hdr, &header_le);
-    ret = bdrv_pwrite_sync(bs_file, offset, &header_le, sizeof(VHDXHeader));
+    header_le = (VHDXHeader *)buffer;
+    memcpy(header_le, hdr, sizeof(VHDXHeader));
+    vhdx_header_le_export(hdr, header_le);
+    vhdx_update_checksum(buffer, VHDX_HEADER_SIZE,
+                         offsetof(VHDXHeader, checksum));
+    ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader));

 exit:
    qemu_vfree(buffer);
@@ -432,13 +433,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
    }
    /* copy over just the relevant portion that we need */
    memcpy(header1, buffer, sizeof(VHDXHeader));
-    vhdx_header_le_import(header1);

-    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
-        !memcmp(&header1->signature, "head", 4)             &&
-        header1->version == 1) {
-        h1_seq = header1->sequence_number;
-        h1_valid = true;
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) {
+        vhdx_header_le_import(header1);
+        if (header1->signature == VHDX_HEADER_SIGNATURE &&
+            header1->version == 1) {
+            h1_seq = header1->sequence_number;
+            h1_valid = true;
+        }
    }

    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
@@ -447,13 +449,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
    }
    /* copy over just the relevant portion that we need */
    memcpy(header2, buffer, sizeof(VHDXHeader));
-    vhdx_header_le_import(header2);

-    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
-        !memcmp(&header2->signature, "head", 4)             &&
-        header2->version == 1) {
-        h2_seq = header2->sequence_number;
-        h2_valid = true;
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) {
+        vhdx_header_le_import(header2);
+        if (header2->signature == VHDX_HEADER_SIGNATURE &&
+            header2->version == 1) {
+            h2_seq = header2->sequence_number;
+            h2_valid = true;
+        }
    }

    /* If there is only 1 valid header (or no valid headers), we
@@ -519,15 +522,21 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
        goto fail;
    }
    memcpy(&s->rt, buffer, sizeof(s->rt));
-    vhdx_region_header_le_import(&s->rt);
    offset += sizeof(s->rt);

-    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
-        memcmp(&s->rt.signature, "regi", 4)) {
+    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4)) {
        ret = -EINVAL;
        goto fail;
    }

+    vhdx_region_header_le_import(&s->rt);
+
+    if (s->rt.signature != VHDX_REGION_SIGNATURE) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+
    /* Per spec, maximum region table entry count is 2047 */
    if (s->rt.entry_count > 2047) {
        ret = -EINVAL;
@@ -630,7 +639,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)

    vhdx_metadata_header_le_import(&s->metadata_hdr);

-    if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) {
+    if (s->metadata_hdr.signature != VHDX_METADATA_SIGNATURE) {
        ret = -EINVAL;
        goto exit;
    }
@@ -950,7 +959,11 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* s->bat is freed in vhdx_close() */
-    s->bat = qemu_blockalign(bs, s->bat_rt.length);
+    s->bat = qemu_try_blockalign(bs->file, s->bat_rt.length);
+    if (s->bat == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }

    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
    if (ret < 0) {
@@ -991,7 +1004,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when VHDX images are used */
    error_set(&s->migration_blocker,
            QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-            "vhdx", bs->device_name, "live migration");
+            "vhdx", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);

    return 0;
@@ -1369,7 +1382,7 @@ static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size,
    int ret = 0;
    VHDXHeader *hdr = NULL;

-    hdr = g_malloc0(sizeof(VHDXHeader));
+    hdr = g_new0(VHDXHeader, 1);

    hdr->signature       = VHDX_HEADER_SIGNATURE;
    hdr->sequence_number = g_random_int();
@@ -1395,6 +1408,12 @@ exit:
    return ret;
 }

+#define VHDX_METADATA_ENTRY_BUFFER_SIZE \
+                                    (sizeof(VHDXFileParameters)               +\
+                                     sizeof(VHDXVirtualDiskSize)              +\
+                                     sizeof(VHDXPage83Data)                   +\
+                                     sizeof(VHDXVirtualDiskLogicalSectorSize) +\
+                                     sizeof(VHDXVirtualDiskPhysicalSectorSize))

 /*
 * Create the Metadata entries.
@@ -1433,11 +1452,7 @@ static int vhdx_create_new_metadata(BlockDriverState *bs,
    VHDXVirtualDiskLogicalSectorSize  *mt_log_sector_size;
    VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size;

-    entry_buffer = g_malloc0(sizeof(VHDXFileParameters)               +
-                             sizeof(VHDXVirtualDiskSize)              +
-                             sizeof(VHDXPage83Data)                   +
-                             sizeof(VHDXVirtualDiskLogicalSectorSize) +
-                             sizeof(VHDXVirtualDiskPhysicalSectorSize));
+    entry_buffer = g_malloc0(VHDX_METADATA_ENTRY_BUFFER_SIZE);

    mt_file_params = entry_buffer;
    offset += sizeof(VHDXFileParameters);
@@ -1518,7 +1533,7 @@ static int vhdx_create_new_metadata(BlockDriverState *bs,
    }

    ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer,
-                      VHDX_HEADER_BLOCK_SIZE);
+                      VHDX_METADATA_ENTRY_BUFFER_SIZE);
    if (ret < 0) {
        goto exit;
    }
@@ -1540,7 +1555,8 @@ exit:
 */
 static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
                           uint64_t image_size, VHDXImageType type,
-                           bool use_zero_blocks, VHDXRegionTableEntry *rt_bat)
+                           bool use_zero_blocks, uint64_t file_offset,
+                           uint32_t length)
 {
    int ret = 0;
    uint64_t data_file_offset;
@@ -1555,7 +1571,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
    /* this gives a data start after BAT/bitmap entries, and well
     * past any metadata entries (with a 4 MB buffer for future
     * expansion */
-    data_file_offset = rt_bat->file_offset + rt_bat->length + 5 * MiB;
+    data_file_offset = file_offset + length + 5 * MiB;
    total_sectors = image_size >> s->logical_sector_size_bits;

    if (type == VHDX_TYPE_DYNAMIC) {
@@ -1579,7 +1595,11 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
                use_zero_blocks ||
                bdrv_has_zero_init(bs) == 0) {
        /* for a fixed file, the default BAT entry is not zero */
-        s->bat = g_malloc0(rt_bat->length);
+        s->bat = g_try_malloc0(length);
+        if (length && s->bat == NULL) {
+            ret = -ENOMEM;
+            goto exit;
+        }
        block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT :
                                                PAYLOAD_BLOCK_NOT_PRESENT;
        block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state;
@@ -1594,7 +1614,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
            cpu_to_le64s(&s->bat[sinfo.bat_idx]);
            sector_num += s->sectors_per_block;
        }
-        ret = bdrv_pwrite(bs, rt_bat->file_offset, s->bat, rt_bat->length);
+        ret = bdrv_pwrite(bs, file_offset, s->bat, length);
        if (ret < 0) {
            goto exit;
        }
@@ -1626,6 +1646,8 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,
    int ret = 0;
    uint32_t offset = 0;
    void *buffer = NULL;
+    uint64_t bat_file_offset;
+    uint32_t bat_length;
    BDRVVHDXState *s = NULL;
    VHDXRegionTableHeader *region_table;
    VHDXRegionTableEntry *rt_bat;
@@ -1635,7 +1657,7 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,

    /* Populate enough of the BDRVVHDXState to be able to use the
     * pre-existing BAT calculation, translation, and update functions */
-    s = g_malloc0(sizeof(BDRVVHDXState));
+    s = g_new0(BDRVVHDXState, 1);

    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
                     (uint64_t) sector_size / (uint64_t) block_size;
@@ -1674,19 +1696,26 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,
    rt_metadata->length      = 1 * MiB; /* min size, and more than enough */
    *metadata_offset = rt_metadata->file_offset;

+    bat_file_offset = rt_bat->file_offset;
+    bat_length = rt_bat->length;
+
+    vhdx_region_header_le_export(region_table);
+    vhdx_region_entry_le_export(rt_bat);
+    vhdx_region_entry_le_export(rt_metadata);
+
    vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE,
                         offsetof(VHDXRegionTableHeader, checksum));


    /* The region table gives us the data we need to create the BAT,
     * so do that now */
-    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, rt_bat);
+    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks,
+                          bat_file_offset, bat_length);
+    if (ret < 0) {
+        goto exit;
+    }

    /* Now write out the region headers to disk */
-    vhdx_region_header_le_export(region_table);
-    vhdx_region_entry_le_export(rt_bat);
-    vhdx_region_entry_le_export(rt_metadata);
-
    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer,
                      VHDX_HEADER_BLOCK_SIZE);
    if (ret < 0) {
@@ -1699,7 +1728,6 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,
        goto exit;
    }

-
 exit:
    g_free(s);
    g_free(buffer);
@@ -1740,7 +1768,8 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    VHDXImageType image_type;
    Error *local_err = NULL;

-    image_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    log_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_LOG_SIZE, 0);
    block_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_BLOCK_SIZE, 0);
    type = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
@@ -1849,7 +1878,6 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    }


-
 delete_and_exit:
    bdrv_unref(bs);
 exit:
--- a/block/vhdx.h
+++ b/block/vhdx.h
@@ -435,6 +435,7 @@ void vhdx_header_le_import(VHDXHeader *h);
 void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h);
 void vhdx_log_desc_le_import(VHDXLogDescriptor *d);
 void vhdx_log_desc_le_export(VHDXLogDescriptor *d);
+void vhdx_log_data_le_import(VHDXLogDataSector *d);
 void vhdx_log_data_le_export(VHDXLogDataSector *d);
 void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr);
 void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr);
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -106,6 +106,7 @@ typedef struct VmdkExtent {
    uint32_t l2_cache_counts[L2_CACHE_SIZE];

    int64_t cluster_sectors;
+    int64_t next_cluster_sector;
    char *type;
 } VmdkExtent;

@@ -124,7 +125,6 @@ typedef struct BDRVVmdkState {
 } BDRVVmdkState;

 typedef struct VmdkMetaData {
-    uint32_t offset;
    unsigned int l1_index;
    unsigned int l2_index;
    unsigned int l2_offset;
@@ -233,7 +233,7 @@ static void vmdk_free_last_extent(BlockDriverState *bs)
        return;
    }
    s->num_extents--;
-    s->extents = g_realloc(s->extents, s->num_extents * sizeof(VmdkExtent));
+    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents);
 }

 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
@@ -397,6 +397,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
 {
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;
+    int64_t nb_sectors;

    if (cluster_sectors > 0x200000) {
        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
@@ -412,8 +413,12 @@ static int vmdk_add_extent(BlockDriverState *bs,
        return -EFBIG;
    }

-    s->extents = g_realloc(s->extents,
-                              (s->num_extents + 1) * sizeof(VmdkExtent));
+    nb_sectors = bdrv_nb_sectors(file);
+    if (nb_sectors < 0) {
+        return nb_sectors;
+    }
+
+    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1);
    extent = &s->extents[s->num_extents];
    s->num_extents++;

@@ -427,6 +432,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
    extent->l1_entry_sectors = l2_size * cluster_sectors;
    extent->l2_size = l2_size;
    extent->cluster_sectors = flat ? sectors : cluster_sectors;
+    extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);

    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
@@ -448,7 +454,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,

    /* read the L1 table */
    l1_size = extent->l1_size * sizeof(uint32_t);
-    extent->l1_table = g_malloc(l1_size);
+    extent->l1_table = g_try_malloc(l1_size);
+    if (l1_size && extent->l1_table == NULL) {
+        return -ENOMEM;
+    }
+
    ret = bdrv_pread(extent->file,
                     extent->l1_table_offset,
                     extent->l1_table,
@@ -464,7 +474,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
    }

    if (extent->l1_backup_table_offset) {
-        extent->l1_backup_table = g_malloc(l1_size);
+        extent->l1_backup_table = g_try_malloc(l1_size);
+        if (l1_size && extent->l1_backup_table == NULL) {
+            ret = -ENOMEM;
+            goto fail_l1;
+        }
        ret = bdrv_pread(extent->file,
                         extent->l1_backup_table_offset,
                         extent->l1_backup_table,
@@ -481,7 +495,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
    }

    extent->l2_cache =
-        g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
+        g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE);
    return 0;
 fail_l1b:
    g_free(extent->l1_backup_table);
@@ -643,7 +657,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
        snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
                 le32_to_cpu(header.version));
        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                  bs->device_name, "vmdk", buf);
+                  bdrv_get_device_name(bs), "vmdk", buf);
        return -ENOTSUP;
    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
        /* VMware KB 2064959 explains that version 3 added support for
@@ -669,8 +683,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
    }
-    if (bdrv_getlength(file) <
-            le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) {
+    if (bdrv_nb_sectors(file) < le64_to_cpu(header.grain_offset)) {
        error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
                   (int64_t)(le64_to_cpu(header.grain_offset)
                             * BDRV_SECTOR_SIZE));
@@ -821,6 +834,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
            ret = vmdk_add_extent(bs, extent_file, true, sectors,
                            0, 0, 0, 0, 0, &extent, errp);
            if (ret < 0) {
+                bdrv_unref(extent_file);
                return ret;
            }
            extent->flat_start_offset = flat_offset << 9;
@@ -832,14 +846,15 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
            } else {
                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp);
            }
+            g_free(buf);
            if (ret) {
-                g_free(buf);
                bdrv_unref(extent_file);
                return ret;
            }
            extent = &s->extents[s->num_extents - 1];
        } else {
            error_setg(errp, "Unsupported extent type '%s'", type);
+            bdrv_unref(extent_file);
            return -ENOTSUP;
        }
        extent->type = g_strdup(type);
@@ -924,7 +939,7 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when VMDK images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vmdk", bs->device_name, "live migration");
+              "vmdk", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);
    g_free(buf);
    return 0;
@@ -952,57 +967,97 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
    }
 }

+/**
+ * get_whole_cluster
+ *
+ * Copy backing file's cluster that covers @sector_num, otherwise write zero,
+ * to the cluster at @cluster_sector_num.
+ *
+ * If @skip_start_sector < @skip_end_sector, the relative range
+ * [@skip_start_sector, @skip_end_sector) is not copied or written; it is left
+ * for the caller to write the user data of the request.
+ */
 static int get_whole_cluster(BlockDriverState *bs,
-                VmdkExtent *extent,
-                uint64_t cluster_offset,
-                uint64_t offset,
-                bool allocate)
+                             VmdkExtent *extent,
+                             uint64_t cluster_sector_num,
+                             uint64_t sector_num,
+                             uint64_t skip_start_sector,
+                             uint64_t skip_end_sector)
 {
    int ret = VMDK_OK;
-    uint8_t *whole_grain = NULL;
+    int64_t cluster_bytes;
+    uint8_t *whole_grain;

+    /* For COW, align request sector_num to cluster start */
+    sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
+    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
+    whole_grain = qemu_blockalign(bs, cluster_bytes);
+
+    if (!bs->backing_hd) {
+        memset(whole_grain, 0,  skip_start_sector << BDRV_SECTOR_BITS);
+        memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
+               cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
+    }
+
+    assert(skip_end_sector <= extent->cluster_sectors);
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
-    if (bs->backing_hd) {
-        whole_grain =
-            qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
-        if (!vmdk_is_cid_valid(bs)) {
-            ret = VMDK_ERROR;
-            goto exit;
-        }
+    if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
+        ret = VMDK_ERROR;
+        goto exit;
+    }

-        /* floor offset to cluster */
-        offset -= offset % (extent->cluster_sectors * 512);
-        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
-                extent->cluster_sectors);
-        if (ret < 0) {
-            ret = VMDK_ERROR;
-            goto exit;
+    /* Read backing data before skip range */
+    if (skip_start_sector > 0) {
+        if (bs->backing_hd) {
+            ret = bdrv_read(bs->backing_hd, sector_num,
+                            whole_grain, skip_start_sector);
+            if (ret < 0) {
+                ret = VMDK_ERROR;
+                goto exit;
+            }
        }
-
-        /* Write grain only into the active image */
-        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
-                extent->cluster_sectors);
+        ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
+                         skip_start_sector);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }
+    /* Read backing data after skip range */
+    if (skip_end_sector < extent->cluster_sectors) {
+        if (bs->backing_hd) {
+            ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector,
+                            whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
+                            extent->cluster_sectors - skip_end_sector);
+            if (ret < 0) {
+                ret = VMDK_ERROR;
+                goto exit;
+            }
+        }
+        ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector,
+                         whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
+                         extent->cluster_sectors - skip_end_sector);
+        if (ret < 0) {
+            ret = VMDK_ERROR;
+            goto exit;
+        }
+    }
+
 exit:
    qemu_vfree(whole_grain);
    return ret;
 }

-static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
+static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
+                         uint32_t offset)
 {
-    uint32_t offset;
-    QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
-    offset = cpu_to_le32(m_data->offset);
+    offset = cpu_to_le32(offset);
    /* update L2 table */
    if (bdrv_pwrite_sync(
                extent->file,
                ((int64_t)m_data->l2_offset * 512)
-                    + (m_data->l2_index * sizeof(m_data->offset)),
+                    + (m_data->l2_index * sizeof(offset)),
                &offset, sizeof(offset)) < 0) {
        return VMDK_ERROR;
    }
@@ -1012,7 +1067,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
        if (bdrv_pwrite_sync(
                    extent->file,
                    ((int64_t)m_data->l2_offset * 512)
-                        + (m_data->l2_index * sizeof(m_data->offset)),
+                        + (m_data->l2_index * sizeof(offset)),
                    &offset, sizeof(offset)) < 0) {
            return VMDK_ERROR;
        }
@@ -1024,17 +1079,41 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
    return VMDK_OK;
 }

+/**
+ * get_cluster_offset
+ *
+ * Look up cluster offset in extent file by sector number, and store in
+ * @cluster_offset.
+ *
+ * For flat extents, the start offset as parsed from the description file is
+ * returned.
+ *
+ * For sparse extents, look up in L1, L2 table. If allocate is true, return an
+ * offset for a new cluster and update L2 cache. If there is a backing file,
+ * COW is done before returning; otherwise, zeroes are written to the allocated
+ * cluster. Both COW and zero writing skip the sector range
+ * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
+ * has new data to write there.
+ *
+ * Returns: VMDK_OK if cluster exists and is mapped in the image.
+ *          VMDK_UNALLOC (or VMDK_ZEROED) if unmapped and @allocate is false.
+ *          VMDK_ERROR if failed.
+ */
 static int get_cluster_offset(BlockDriverState *bs,
-                                    VmdkExtent *extent,
-                                    VmdkMetaData *m_data,
-                                    uint64_t offset,
-                                    int allocate,
-                                    uint64_t *cluster_offset)
+                              VmdkExtent *extent,
+                              VmdkMetaData *m_data,
+                              uint64_t offset,
+                              bool allocate,
+                              uint64_t *cluster_offset,
+                              uint64_t skip_start_sector,
+                              uint64_t skip_end_sector)
 {
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
    uint32_t min_count, *l2_table;
    bool zeroed = false;
+    int64_t ret;
+    int64_t cluster_sector;

    if (m_data) {
        m_data->valid = 0;
@@ -1088,52 +1167,41 @@ static int get_cluster_offset(BlockDriverState *bs,
    extent->l2_cache_counts[min_index] = 1;
 found:
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
-    *cluster_offset = le32_to_cpu(l2_table[l2_index]);
+    cluster_sector = le32_to_cpu(l2_table[l2_index]);

    if (m_data) {
        m_data->valid = 1;
        m_data->l1_index = l1_index;
        m_data->l2_index = l2_index;
-        m_data->offset = *cluster_offset;
        m_data->l2_offset = l2_offset;
        m_data->l2_cache_entry = &l2_table[l2_index];
    }
-    if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
+    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
        zeroed = true;
    }

-    if (!*cluster_offset || zeroed) {
+    if (!cluster_sector || zeroed) {
        if (!allocate) {
            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
        }

-        /* Avoid the L2 tables update for the images that have snapshots. */
-        *cluster_offset = bdrv_getlength(extent->file);
-        if (!extent->compressed) {
-            bdrv_truncate(
-                extent->file,
-                *cluster_offset + (extent->cluster_sectors << 9)
-            );
-        }
-
-        *cluster_offset >>= 9;
-        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
+        cluster_sector = extent->next_cluster_sector;
+        extent->next_cluster_sector += extent->cluster_sectors;

        /* First of all we write grain itself, to avoid race condition
         * that may to corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
-        if (get_whole_cluster(
-                bs, extent, *cluster_offset, offset, allocate) == -1) {
-            return VMDK_ERROR;
-        }
-
-        if (m_data) {
-            m_data->offset = *cluster_offset;
+        ret = get_whole_cluster(bs, extent,
+                                cluster_sector,
+                                offset >> BDRV_SECTOR_BITS,
+                                skip_start_sector, skip_end_sector);
+        if (ret) {
+            return ret;
        }
    }
-    *cluster_offset <<= 9;
+    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
    return VMDK_OK;
 }

@@ -1168,7 +1236,8 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
    }
    qemu_co_mutex_lock(&s->lock);
    ret = get_cluster_offset(bs, extent, NULL,
-                            sector_num * 512, 0, &offset);
+                             sector_num * 512, false, &offset,
+                             0, 0);
    qemu_co_mutex_unlock(&s->lock);

    switch (ret) {
@@ -1321,9 +1390,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
        if (!extent) {
            return -EIO;
        }
-        ret = get_cluster_offset(
-                            bs, extent, NULL,
-                            sector_num << 9, 0, &cluster_offset);
+        ret = get_cluster_offset(bs, extent, NULL,
+                                 sector_num << 9, false, &cluster_offset,
+                                 0, 0);
        extent_begin_sector = extent->end_sector - extent->sectors;
        extent_relative_sector_num = sector_num - extent_begin_sector;
        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
@@ -1404,12 +1473,17 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
        if (!extent) {
            return -EIO;
        }
-        ret = get_cluster_offset(
-                                bs,
-                                extent,
-                                &m_data,
-                                sector_num << 9, !extent->compressed,
-                                &cluster_offset);
+        extent_begin_sector = extent->end_sector - extent->sectors;
+        extent_relative_sector_num = sector_num - extent_begin_sector;
+        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
+        n = extent->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors) {
+            n = nb_sectors;
+        }
+        ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                                 !(extent->compressed || zeroed),
+                                 &cluster_offset,
+                                 index_in_cluster, index_in_cluster + n);
        if (extent->compressed) {
            if (ret == VMDK_OK) {
                /* Refuse write to allocated cluster for streamOptimized */
@@ -1418,24 +1492,13 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                return -EIO;
            } else {
                /* allocate */
-                ret = get_cluster_offset(
-                                        bs,
-                                        extent,
-                                        &m_data,
-                                        sector_num << 9, 1,
-                                        &cluster_offset);
+                ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                                         true, &cluster_offset, 0, 0);
            }
        }
        if (ret == VMDK_ERROR) {
            return -EINVAL;
        }
-        extent_begin_sector = extent->end_sector - extent->sectors;
-        extent_relative_sector_num = sector_num - extent_begin_sector;
-        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
-        n = extent->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors) {
-            n = nb_sectors;
-        }
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
@@ -1443,9 +1506,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                    n >= extent->cluster_sectors) {
                n = extent->cluster_sectors;
                if (!zero_dry_run) {
-                    m_data.offset = VMDK_GTE_ZEROED;
                    /* update L2 tables */
-                    if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
+                            != VMDK_OK) {
                        return -EIO;
                    }
                }
@@ -1461,7 +1524,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
            }
            if (m_data.valid) {
                /* update L2 tables */
-                if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                if (vmdk_L2update(extent, &m_data,
+                                  cluster_offset >> BDRV_SECTOR_BITS)
+                        != VMDK_OK) {
                    return -EIO;
                }
            }
@@ -1742,7 +1807,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
        goto exit;
    }
    /* Read out options */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
@@ -1999,7 +2065,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int64_t sector_num = 0;
-    int64_t total_sectors = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
+    int64_t total_sectors = bdrv_nb_sectors(bs);
    int ret;
    uint64_t cluster_offset;

@@ -2020,7 +2086,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
        }
        ret = get_cluster_offset(bs, extent, NULL,
                                 sector_num << BDRV_SECTOR_BITS,
-                                 0, &cluster_offset);
+                                 false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            fprintf(stderr,
                    "ERROR: could not get cluster_offset for sector %"
@@ -2071,23 +2137,29 @@ static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
    return spec_info;
 }

+static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b)
+{
+    return a->flat == b->flat &&
+           a->compressed == b->compressed &&
+           (a->flat || a->cluster_sectors == b->cluster_sectors);
+}
+
 static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
    int i;
    BDRVVmdkState *s = bs->opaque;
    assert(s->num_extents);
+
+    /* Bail out if the extents are not all of the same type */
+    for (i = 1; i < s->num_extents; i++) {
+        if (!vmdk_extents_type_eq(&s->extents[0], &s->extents[i])) {
+            return -ENOTSUP;
+        }
+    }
    bdi->needs_compressed_writes = s->extents[0].compressed;
    if (!s->extents[0].flat) {
        bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS;
    }
-    /* See if we have multiple extents but they have different cases */
-    for (i = 1; i < s->num_extents; i++) {
-        if (bdi->needs_compressed_writes != s->extents[i].compressed ||
-            (bdi->cluster_size && bdi->cluster_size !=
-                s->extents[i].cluster_sectors << BDRV_SECTOR_BITS)) {
-            return -ENOTSUP;
-        }
-    }
    return 0;
 }

--- a/block/vpc.c
+++ b/block/vpc.c
@@ -29,13 +29,6 @@
 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
 #endif
-#ifdef __linux__
-#include <linux/fs.h>
-#include <sys/ioctl.h>
-#ifndef FS_NOCOW_FL
-#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
-#endif
-#endif

 /**************************************************************/

@@ -214,7 +207,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
            "incorrect.\n", bs->filename);

    /* Write 'checksum' back to footer, or else will leave it with zero. */
-    footer->checksum = be32_to_cpu(checksum);
+    footer->checksum = cpu_to_be32(checksum);

    // The visible size of a image in Virtual PC depends on the geometry
    // rather than on the size stored in the footer (the size in the footer
@@ -276,7 +269,11 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
            goto fail;
        }

-        s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4);
+        s->pagetable = qemu_try_blockalign(bs->file, s->max_table_entries * 4);
+        if (s->pagetable == NULL) {
+            ret = -ENOMEM;
+            goto fail;
+        }

        s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);

@@ -323,7 +320,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    /* Disable migration when VHD images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vpc", bs->device_name, "live migration");
+              "vpc", bdrv_get_device_name(bs), "live migration");
    migrate_add_blocker(s->migration_blocker);

    return 0;
@@ -475,7 +472,7 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)

    // Write BAT entry to disk
    bat_offset = s->bat_offset + (4 * index);
-    bat_value = be32_to_cpu(s->pagetable[index]);
+    bat_value = cpu_to_be32(s->pagetable[index]);
    ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
    if (ret < 0)
        goto fail;
@@ -492,7 +489,7 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
    VHDFooter *footer = (VHDFooter *) s->footer_buf;

-    if (cpu_to_be32(footer->type) != VHD_FIXED) {
+    if (be32_to_cpu(footer->type) != VHD_FIXED) {
        bdi->cluster_size = s->block_size;
    }

@@ -509,7 +506,7 @@ static int vpc_read(BlockDriverState *bs, int64_t sector_num,
    int64_t sectors, sectors_per_block;
    VHDFooter *footer = (VHDFooter *) s->footer_buf;

-    if (cpu_to_be32(footer->type) == VHD_FIXED) {
+    if (be32_to_cpu(footer->type) == VHD_FIXED) {
        return bdrv_read(bs->file, sector_num, buf, nb_sectors);
    }
    while (nb_sectors > 0) {
@@ -558,7 +555,7 @@ static int vpc_write(BlockDriverState *bs, int64_t sector_num,
    int ret;
    VHDFooter *footer =  (VHDFooter *) s->footer_buf;

-    if (cpu_to_be32(footer->type) == VHD_FIXED) {
+    if (be32_to_cpu(footer->type) == VHD_FIXED) {
        return bdrv_write(bs->file, sector_num, buf, nb_sectors);
    }
    while (nb_sectors > 0) {
@@ -656,39 +653,41 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
    return 0;
 }

-static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors)
+static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
+                               int64_t total_sectors)
 {
    VHDDynDiskHeader *dyndisk_header =
        (VHDDynDiskHeader *) buf;
    size_t block_size, num_bat_entries;
    int i;
-    int ret = -EIO;
+    int ret;
+    int64_t offset = 0;

    // Write the footer (twice: at the beginning and at the end)
    block_size = 0x200000;
    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);

-    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
+    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    if (ret) {
        goto fail;
    }

-    if (lseek(fd, 1536 + ((num_bat_entries * 4 + 511) & ~511), SEEK_SET) < 0) {
-        goto fail;
-    }
-    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
+    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
+    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    if (ret < 0) {
        goto fail;
    }

    // Write the initial BAT
-    if (lseek(fd, 3 * 512, SEEK_SET) < 0) {
-        goto fail;
-    }
+    offset = 3 * 512;

    memset(buf, 0xFF, 512);
    for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
-        if (write(fd, buf, 512) != 512) {
+        ret = bdrv_pwrite_sync(bs, offset, buf, 512);
+        if (ret < 0) {
            goto fail;
        }
+        offset += 512;
    }

    // Prepare the Dynamic Disk Header
@@ -700,48 +699,44 @@ static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors)
     * Note: The spec is actually wrong here for data_offset, it says
     * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
     */
-    dyndisk_header->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL);
-    dyndisk_header->table_offset = be64_to_cpu(3 * 512);
-    dyndisk_header->version = be32_to_cpu(0x00010000);
-    dyndisk_header->block_size = be32_to_cpu(block_size);
-    dyndisk_header->max_table_entries = be32_to_cpu(num_bat_entries);
+    dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
+    dyndisk_header->table_offset = cpu_to_be64(3 * 512);
+    dyndisk_header->version = cpu_to_be32(0x00010000);
+    dyndisk_header->block_size = cpu_to_be32(block_size);
+    dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);

-    dyndisk_header->checksum = be32_to_cpu(vpc_checksum(buf, 1024));
+    dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));

    // Write the header
-    if (lseek(fd, 512, SEEK_SET) < 0) {
-        goto fail;
-    }
+    offset = 512;

-    if (write(fd, buf, 1024) != 1024) {
+    ret = bdrv_pwrite_sync(bs, offset, buf, 1024);
+    if (ret < 0) {
        goto fail;
    }
-    ret = 0;

 fail:
    return ret;
 }

-static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size)
+static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf,
+                             int64_t total_size)
 {
-    int ret = -EIO;
+    int ret;

    /* Add footer to total size */
-    total_size += 512;
-    if (ftruncate(fd, total_size) != 0) {
-        ret = -errno;
-        goto fail;
-    }
-    if (lseek(fd, -512, SEEK_END) < 0) {
-        goto fail;
-    }
-    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
-        goto fail;
+    total_size += HEADER_SIZE;
+
+    ret = bdrv_truncate(bs, total_size);
+    if (ret < 0) {
+        return ret;
    }

-    ret = 0;
+    ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE);
+    if (ret < 0) {
+        return ret;
+    }

- fail:
    return ret;
 }

@@ -750,7 +745,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    uint8_t buf[1024];
    VHDFooter *footer = (VHDFooter *) buf;
    char *disk_type_param;
-    int fd, i;
+    int i;
    uint16_t cyls = 0;
    uint8_t heads = 0;
    uint8_t secs_per_cyl = 0;
@@ -758,10 +753,12 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    int64_t total_size;
    int disk_type;
    int ret = -EIO;
-    bool nocow = false;
+    Error *local_err = NULL;
+    BlockDriverState *bs = NULL;

    /* Read out options */
-    total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
    disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    if (disk_type_param) {
        if (!strcmp(disk_type_param, "dynamic")) {
@@ -775,28 +772,17 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    } else {
        disk_type = VHD_DYNAMIC;
    }
-    nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false);

-    /* Create the file */
-    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
-    if (fd < 0) {
-        ret = -EIO;
+    ret = bdrv_create_file(filename, opts, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
        goto out;
    }
-
-    if (nocow) {
-#ifdef __linux__
-        /* Set NOCOW flag to solve performance issue on fs like btrfs.
-         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will
-         * be ignored since any failure of this operation should not block the
-         * left work.
-         */
-        int attr;
-        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
-            attr |= FS_NOCOW_FL;
-            ioctl(fd, FS_IOC_SETFLAGS, &attr);
-        }
-#endif
+    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                    NULL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto out;
    }

    /*
@@ -810,7 +796,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
                               &secs_per_cyl))
        {
            ret = -EFBIG;
-            goto fail;
+            goto out;
        }
    }

@@ -824,46 +810,45 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
    memcpy(footer->creator_app, "qemu", 4);
    memcpy(footer->creator_os, "Wi2k", 4);

-    footer->features = be32_to_cpu(0x02);
-    footer->version = be32_to_cpu(0x00010000);
+    footer->features = cpu_to_be32(0x02);
+    footer->version = cpu_to_be32(0x00010000);
    if (disk_type == VHD_DYNAMIC) {
-        footer->data_offset = be64_to_cpu(HEADER_SIZE);
+        footer->data_offset = cpu_to_be64(HEADER_SIZE);
    } else {
-        footer->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL);
+        footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
    }
-    footer->timestamp = be32_to_cpu(time(NULL) - VHD_TIMESTAMP_BASE);
+    footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);

    /* Version of Virtual PC 2007 */
-    footer->major = be16_to_cpu(0x0005);
-    footer->minor = be16_to_cpu(0x0003);
+    footer->major = cpu_to_be16(0x0005);
+    footer->minor = cpu_to_be16(0x0003);
    if (disk_type == VHD_DYNAMIC) {
-        footer->orig_size = be64_to_cpu(total_sectors * 512);
-        footer->size = be64_to_cpu(total_sectors * 512);
+        footer->orig_size = cpu_to_be64(total_sectors * 512);
+        footer->size = cpu_to_be64(total_sectors * 512);
    } else {
-        footer->orig_size = be64_to_cpu(total_size);
-        footer->size = be64_to_cpu(total_size);
+        footer->orig_size = cpu_to_be64(total_size);
+        footer->size = cpu_to_be64(total_size);
    }
-    footer->cyls = be16_to_cpu(cyls);
+    footer->cyls = cpu_to_be16(cyls);
    footer->heads = heads;
    footer->secs_per_cyl = secs_per_cyl;

-    footer->type = be32_to_cpu(disk_type);
+    footer->type = cpu_to_be32(disk_type);

 #if defined(CONFIG_UUID)
    uuid_generate(footer->uuid);
 #endif

-    footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE));
+    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));

    if (disk_type == VHD_DYNAMIC) {
-        ret = create_dynamic_disk(fd, buf, total_sectors);
+        ret = create_dynamic_disk(bs, buf, total_sectors);
    } else {
-        ret = create_fixed_disk(fd, buf, total_size);
+        ret = create_fixed_disk(bs, buf, total_size);
    }

-fail:
-    qemu_close(fd);
 out:
+    bdrv_unref(bs);
    g_free(disk_type_param);
    return ret;
 }
@@ -873,7 +858,7 @@ static int vpc_has_zero_init(BlockDriverState *bs)
    BDRVVPCState *s = bs->opaque;
    VHDFooter *footer =  (VHDFooter *) s->footer_buf;

-    if (cpu_to_be32(footer->type) == VHD_FIXED) {
+    if (be32_to_cpu(footer->type) == VHD_FIXED) {
        return bdrv_has_zero_init(bs->file);
    } else {
        return 1;
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -52,10 +52,6 @@

 #define DLOG(a) a

-#undef stderr
-#define stderr STDERR
-FILE* stderr = NULL;
-
 static void checkpoint(void);

 #ifdef __MINGW32__
@@ -732,7 +728,7 @@ static int read_directory(BDRVVVFATState* s, int mapping_index)
 	if(first_cluster == 0 && (is_dotdot || is_dot))
 	    continue;

-	buffer=(char*)g_malloc(length);
+	buffer = g_malloc(length);
 	snprintf(buffer,length,"%s/%s",dirname,entry->d_name);

 	if(stat(buffer,&st)<0) {
@@ -767,7 +763,7 @@ static int read_directory(BDRVVVFATState* s, int mapping_index)

 	/* create mapping for this file */
 	if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) {
-	    s->current_mapping=(mapping_t*)array_get_next(&(s->mapping));
+	    s->current_mapping = array_get_next(&(s->mapping));
 	    s->current_mapping->begin=0;
 	    s->current_mapping->end=st.st_size;
 	    /*
@@ -811,12 +807,12 @@ static int read_directory(BDRVVVFATState* s, int mapping_index)
    }

     /* reget the mapping, since s->mapping was possibly realloc()ed */
-    mapping = (mapping_t*)array_get(&(s->mapping), mapping_index);
+    mapping = array_get(&(s->mapping), mapping_index);
    first_cluster += (s->directory.next - mapping->info.dir.first_dir_index)
 	* 0x20 / s->cluster_size;
    mapping->end = first_cluster;

-    direntry = (direntry_t*)array_get(&(s->directory), mapping->dir_index);
+    direntry = array_get(&(s->directory), mapping->dir_index);
    set_begin_of_direntry(direntry, mapping->begin);

    return 0;
@@ -1082,11 +1078,6 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
    vvv = s;
 #endif

-DLOG(if (stderr == NULL) {
-    stderr = fopen("vvfat.log", "a");
-    setbuf(stderr, NULL);
-})
-
    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
@@ -1191,7 +1182,7 @@ DLOG(if (stderr == NULL) {
    if (s->qcow) {
        error_set(&s->migration_blocker,
                  QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-                  "vvfat (rw)", bs->device_name, "live migration");
+                  "vvfat (rw)", bdrv_get_device_name(bs), "live migration");
        migrate_add_blocker(s->migration_blocker);
    }

@@ -2948,9 +2939,9 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp)
    unlink(s->qcow_filename);
 #endif

-    bdrv_set_backing_hd(s->bs, bdrv_new("", &error_abort));
+    bdrv_set_backing_hd(s->bs, bdrv_new());
    s->bs->backing_hd->drv = &vvfat_write_target;
-    s->bs->backing_hd->opaque = g_malloc(sizeof(void*));
+    s->bs->backing_hd->opaque = g_new(void *, 1);
    *(void**)s->bs->backing_hd->opaque = s;

    return 0;
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -44,7 +44,7 @@ struct QEMUWin32AIOState {
 };

 typedef struct QEMUWin32AIOCB {
-    BlockDriverAIOCB common;
+    BlockAIOCB common;
    struct QEMUWin32AIOState *ctx;
    int nbytes;
    OVERLAPPED ov;
@@ -88,7 +88,7 @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,


    waiocb->common.cb(waiocb->common.opaque, ret);
-    qemu_aio_release(waiocb);
+    qemu_aio_unref(waiocb);
 }

 static void win32_aio_completion_cb(EventNotifier *e)
@@ -106,28 +106,14 @@ static void win32_aio_completion_cb(EventNotifier *e)
    }
 }

-static void win32_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    QEMUWin32AIOCB *waiocb = (QEMUWin32AIOCB *)blockacb;
-
-    /*
-     * CancelIoEx is only supported in Vista and newer.  For now, just
-     * wait for completion.
-     */
-    while (!HasOverlappedIoCompleted(&waiocb->ov)) {
-        aio_poll(bdrv_get_aio_context(blockacb->bs), true);
-    }
-}
-
 static const AIOCBInfo win32_aiocb_info = {
    .aiocb_size         = sizeof(QEMUWin32AIOCB),
-    .cancel             = win32_aio_cancel,
 };

-BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
        QEMUWin32AIOState *aio, HANDLE hfile,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type)
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    struct QEMUWin32AIOCB *waiocb;
    uint64_t offset = sector_num * 512;
@@ -139,7 +125,10 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
    waiocb->is_read = (type == QEMU_AIO_READ);

    if (qiov->niov > 1) {
-        waiocb->buf = qemu_blockalign(bs, qiov->size);
+        waiocb->buf = qemu_try_blockalign(bs, qiov->size);
+        if (waiocb->buf == NULL) {
+            goto out;
+        }
        if (type & QEMU_AIO_WRITE) {
            iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size);
        }
@@ -168,7 +157,8 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,

 out_dec_count:
    aio->count--;
-    qemu_aio_release(waiocb);
+out:
+    qemu_aio_unref(waiocb);
    return NULL;
 }

--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -108,7 +108,7 @@ void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,

    nbd_export_set_name(exp, device);

-    n = g_malloc0(sizeof(NBDCloseNotifier));
+    n = g_new0(NBDCloseNotifier, 1);
    n->n.notify = nbd_close_notifier;
    n->exp = exp;
    bdrv_add_close_notifier(bs, &n->n);
--- a/blockdev.c
+++ b/blockdev.c
--- a/blockjob.c
+++ b/blockjob.c
@@ -36,7 +36,7 @@
 #include "qapi-event.h"

 void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
-                       int64_t speed, BlockDriverCompletionFunc *cb,
+                       int64_t speed, BlockCompletionFunc *cb,
                       void *opaque, Error **errp)
 {
    BlockJob *job;
@@ -50,6 +50,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
    error_setg(&job->blocker, "block device is in use by block job: %s",
               BlockJobType_lookup[driver->job_type]);
    bdrv_op_block_all(bs, job->blocker);
+    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);

    job->driver        = driver;
    job->bs            = bs;
@@ -107,7 +108,8 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
 void block_job_complete(BlockJob *job, Error **errp)
 {
    if (job->paused || job->cancelled || !job->driver->complete) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
+        error_set(errp, QERR_BLOCK_JOB_NOT_READY,
+                  bdrv_get_device_name(job->bs));
        return;
    }

@@ -152,27 +154,30 @@ void block_job_iostatus_reset(BlockJob *job)
    }
 }

-struct BlockCancelData {
+struct BlockFinishData {
    BlockJob *job;
-    BlockDriverCompletionFunc *cb;
+    BlockCompletionFunc *cb;
    void *opaque;
    bool cancelled;
    int ret;
 };

-static void block_job_cancel_cb(void *opaque, int ret)
+static void block_job_finish_cb(void *opaque, int ret)
 {
-    struct BlockCancelData *data = opaque;
+    struct BlockFinishData *data = opaque;

    data->cancelled = block_job_is_cancelled(data->job);
    data->ret = ret;
    data->cb(data->opaque, ret);
 }

-int block_job_cancel_sync(BlockJob *job)
+static int block_job_finish_sync(BlockJob *job,
+                                 void (*finish)(BlockJob *, Error **errp),
+                                 Error **errp)
 {
-    struct BlockCancelData data;
+    struct BlockFinishData data;
    BlockDriverState *bs = job->bs;
+    Error *local_err = NULL;

    assert(bs->job == job);

@@ -183,15 +188,37 @@ int block_job_cancel_sync(BlockJob *job)
    data.cb = job->cb;
    data.opaque = job->opaque;
    data.ret = -EINPROGRESS;
-    job->cb = block_job_cancel_cb;
+    job->cb = block_job_finish_cb;
    job->opaque = &data;
-    block_job_cancel(job);
+    finish(job, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return -EBUSY;
+    }
    while (data.ret == -EINPROGRESS) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }
    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
 }

+/* A wrapper around block_job_cancel() taking an Error ** parameter so it may be
+ * used with block_job_finish_sync() without the need for (rather nasty)
+ * function pointer casts there. */
+static void block_job_cancel_err(BlockJob *job, Error **errp)
+{
+    block_job_cancel(job);
+}
+
+int block_job_cancel_sync(BlockJob *job)
+{
+    return block_job_finish_sync(job, &block_job_cancel_err, NULL);
+}
+
+int block_job_complete_sync(BlockJob *job, Error **errp)
+{
+    return block_job_finish_sync(job, &block_job_complete, errp);
+}
+
 void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
 {
    assert(job->busy);
@@ -205,7 +232,7 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
    if (block_job_is_paused(job)) {
        qemu_coroutine_yield();
    } else {
-        co_sleep_ns(type, ns);
+        co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
    }
    job->busy = true;
 }
@@ -235,6 +262,7 @@ BlockJobInfo *block_job_query(BlockJob *job)
    info->offset    = job->offset;
    info->speed     = job->speed;
    info->io_status = job->iostatus;
+    info->ready     = job->ready;
    return info;
 }

@@ -270,6 +298,8 @@ void block_job_event_completed(BlockJob *job, const char *msg)

 void block_job_event_ready(BlockJob *job)
 {
+    job->ready = true;
+
    qapi_event_send_block_job_ready(job->driver->job_type,
                                    bdrv_get_device_name(job->bs),
                                    job->len,
@@ -313,3 +343,48 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
    }
    return action;
 }
+
+/* State handed from block_job_defer_to_main_loop() to its bottom half */
+typedef struct {
+    /* job passed on to fn */
+    BlockJob *job;
+    /* the bottom half itself; deleted by the handler when it runs */
+    QEMUBH *bh;
+    /* AioContext of job->bs at the time the bottom half was scheduled */
+    AioContext *aio_context;
+    /* callback invoked from the bottom half */
+    BlockJobDeferToMainLoopFn *fn;
+    /* caller data passed on to fn */
+    void *opaque;
+} BlockJobDeferToMainLoopData;
+
+/* Bottom half scheduled by block_job_defer_to_main_loop(): calls data->fn
+ * while holding both the AioContext recorded at scheduling time and the
+ * AioContext the job's BDS has now, then frees @opaque. */
+static void block_job_defer_to_main_loop_bh(void *opaque)
+{
+    BlockJobDeferToMainLoopData *data = opaque;
+    AioContext *aio_context;
+
+    /* One-shot: this bottom half is not needed again */
+    qemu_bh_delete(data->bh);
+
+    /* Prevent race with block_job_defer_to_main_loop() */
+    aio_context_acquire(data->aio_context);
+
+    /* Fetch BDS AioContext again, in case it has changed */
+    aio_context = bdrv_get_aio_context(data->job->bs);
+    aio_context_acquire(aio_context);
+
+    data->fn(data->job, data->opaque);
+
+    /* Release in the reverse order of acquisition */
+    aio_context_release(aio_context);
+
+    aio_context_release(data->aio_context);
+
+    g_free(data);
+}
+
+/* Arrange for @fn to be called with @job and @opaque from a bottom half
+ * created with qemu_bh_new().  The tracking structure allocated here is
+ * freed by the bottom half handler after @fn returns. */
+void block_job_defer_to_main_loop(BlockJob *job,
+                                  BlockJobDeferToMainLoopFn *fn,
+                                  void *opaque)
+{
+    BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data));
+    data->job = job;
+    data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data);
+    /* Remembered so the handler can acquire the context in use right now */
+    data->aio_context = bdrv_get_aio_context(job->bs);
+    data->fn = fn;
+    data->opaque = opaque;
+
+    qemu_bh_schedule(data->bh);
+}
--- a/bootdevice.c
+++ b/bootdevice.c
@@ -0,0 +1,258 @@
+/*
+ * QEMU Boot Device Implement
+ *
+ * Copyright (c) 2014 HUAWEI TECHNOLOGIES CO.,LTD.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "sysemu/sysemu.h"
+#include "qapi/visitor.h"
+#include "qemu/error-report.h"
+
+typedef struct FWBootEntry FWBootEntry;
+
+/* One entry of the firmware boot order list */
+struct FWBootEntry {
+    QTAILQ_ENTRY(FWBootEntry) link;
+    /* sort key; add_boot_device_path() refuses duplicates */
+    int32_t bootindex;
+    /* may be NULL when the entry is identified by its suffix only */
+    DeviceState *dev;
+    /* g_strdup() copy made by add_boot_device_path(), freed with the entry */
+    char *suffix;
+};
+
+/* All boot entries; add_boot_device_path() keeps them in ascending
+ * bootindex order */
+static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
+    QTAILQ_HEAD_INITIALIZER(fw_boot_order);
+
+/* Set @errp if a non-negative @bootindex is already present in
+ * fw_boot_order.  Negative values are not checked against the list. */
+void check_boot_index(int32_t bootindex, Error **errp)
+{
+    FWBootEntry *entry;
+
+    if (bootindex < 0) {
+        return;
+    }
+
+    QTAILQ_FOREACH(entry, &fw_boot_order, link) {
+        if (entry->bootindex != bootindex) {
+            continue;
+        }
+        error_setg(errp, "The bootindex %d has already been used", bootindex);
+        return;
+    }
+}
+
+/* Remove the first fw_boot_order entry that belongs to @dev and, when
+ * @suffix is non-NULL, also carries an equal suffix.  Does nothing when
+ * @dev is NULL or no entry matches. */
+void del_boot_device_path(DeviceState *dev, const char *suffix)
+{
+    FWBootEntry *entry;
+
+    if (!dev) {
+        return;
+    }
+
+    QTAILQ_FOREACH(entry, &fw_boot_order, link) {
+        if (entry->dev != dev) {
+            continue;
+        }
+        /* A NULL suffix acts as a wildcard */
+        if (suffix && g_strcmp0(entry->suffix, suffix) != 0) {
+            continue;
+        }
+
+        QTAILQ_REMOVE(&fw_boot_order, entry, link);
+        g_free(entry->suffix);
+        g_free(entry);
+        return;
+    }
+}
+
+/* Register @dev/@suffix in fw_boot_order under @bootindex, keeping the list
+ * sorted by ascending bootindex.  Any existing entry for the same
+ * device/suffix is dropped first; a negative @bootindex only performs that
+ * removal.  A bootindex already used by another entry is reported with
+ * error_report() and terminates the process. */
+void add_boot_device_path(int32_t bootindex, DeviceState *dev,
+                          const char *suffix)
+{
+    FWBootEntry *entry, *successor;
+
+    /* In every case the previous entry, if any, has to go */
+    del_boot_device_path(dev, suffix);
+
+    if (bootindex < 0) {
+        return;
+    }
+
+    assert(dev != NULL || suffix != NULL);
+
+    /* Look for the first entry that must come after the new one */
+    QTAILQ_FOREACH(successor, &fw_boot_order, link) {
+        if (successor->bootindex == bootindex) {
+            error_report("Two devices with same boot index %d", bootindex);
+            exit(1);
+        }
+        if (successor->bootindex > bootindex) {
+            break;
+        }
+    }
+
+    entry = g_malloc0(sizeof(*entry));
+    entry->bootindex = bootindex;
+    entry->dev = dev;
+    entry->suffix = g_strdup(suffix);
+
+    if (successor) {
+        QTAILQ_INSERT_BEFORE(successor, entry, link);
+    } else {
+        /* the loop ran off the end: nothing sorts after the new entry */
+        QTAILQ_INSERT_TAIL(&fw_boot_order, entry, link);
+    }
+}
+
+/* Return the device of the entry at zero-based @position in fw_boot_order,
+ * or NULL when the list has no such entry. */
+DeviceState *get_boot_device(uint32_t position)
+{
+    FWBootEntry *entry;
+    uint32_t index = 0;
+
+    QTAILQ_FOREACH(entry, &fw_boot_order, link) {
+        if (index++ == position) {
+            return entry->dev;
+        }
+    }
+    return NULL;
+}
+
+/*
+ * Build the boot device list from fw_boot_order.
+ *
+ * Returns a NUL-terminated string consisting of newline-separated device
+ * paths, or NULL when the list is empty.  When boot_strict is set and at
+ * least one entry exists, "HALT" is appended as a final line.
+ *
+ * The memory pointed to by "size" is assigned the total length of the
+ * returned array in bytes, including the terminating NUL.
+ *
+ * If ignore_suffixes is true, entry suffixes are left out of the paths.
+ */
+char *get_boot_devices_list(size_t *size, bool ignore_suffixes)
+{
+    FWBootEntry *i;
+    size_t total = 0;
+    char *list = NULL;
+
+    QTAILQ_FOREACH(i, &fw_boot_order, link) {
+        char *devpath = NULL, *bootpath;
+        size_t len;
+
+        if (i->dev) {
+            devpath = qdev_get_fw_dev_path(i->dev);
+            assert(devpath);
+        }
+
+        /* bootpath always ends up as a string owned by this function */
+        if (i->suffix && !ignore_suffixes && devpath) {
+            size_t bootpathlen = strlen(devpath) + strlen(i->suffix) + 1;
+
+            bootpath = g_malloc(bootpathlen);
+            snprintf(bootpath, bootpathlen, "%s%s", devpath, i->suffix);
+            g_free(devpath);
+        } else if (devpath) {
+            bootpath = devpath;
+        } else if (!ignore_suffixes) {
+            assert(i->suffix);
+            bootpath = g_strdup(i->suffix);
+        } else {
+            bootpath = g_strdup("");
+        }
+
+        /* Turn the previous entry's NUL terminator into a separator */
+        if (total) {
+            list[total-1] = '\n';
+        }
+        /* len counts the NUL, so the list stays terminated after each copy */
+        len = strlen(bootpath) + 1;
+        list = g_realloc(list, total + len);
+        memcpy(&list[total], bootpath, len);
+        total += len;
+        g_free(bootpath);
+    }
+
+    *size = total;
+
+    if (boot_strict && *size > 0) {
+        /* 5 bytes: "HALT" plus its NUL terminator */
+        list[total-1] = '\n';
+        list = g_realloc(list, total + 5);
+        memcpy(&list[total], "HALT", 5);
+        *size = total + 5;
+    }
+    return list;
+}
+
+/* Per-property state attached by device_add_bootindex_property() */
+typedef struct {
+    /* storage read by the property getter and written by the setter */
+    int32_t *bootindex;
+    /* passed to add/del_boot_device_path(); the pointer is stored as is */
+    const char *suffix;
+    /* device passed to add/del_boot_device_path() */
+    DeviceState *dev;
+} BootIndexProperty;
+
+/* Property getter: visits the int32 that the BootIndexProperty passed as
+ * @opaque points to. */
+static void device_get_bootindex(Object *obj, Visitor *v, void *opaque,
+                                 const char *name, Error **errp)
+{
+    BootIndexProperty *prop = opaque;
+    visit_type_int32(v, prop->bootindex, name, errp);
+}
+
+/* Property setter: reads an int32 from the visitor, rejects it if
+ * check_boot_index() reports it as already used, otherwise stores it and
+ * re-registers the device in the boot order list.  Any error is propagated
+ * to @errp and leaves the stored value untouched. */
+static void device_set_bootindex(Object *obj, Visitor *v, void *opaque,
+                                 const char *name, Error **errp)
+{
+    BootIndexProperty *prop = opaque;
+    int32_t requested;
+    Error *err = NULL;
+
+    visit_type_int32(v, &requested, name, &err);
+    if (!err) {
+        /* check whether bootindex is present in fw_boot_order list */
+        check_boot_index(requested, &err);
+    }
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    /* change bootindex to a new one */
+    *prop->bootindex = requested;
+    add_boot_device_path(requested, prop->dev, prop->suffix);
+}
+
+/* Property release hook: removes the boot order entry for the property's
+ * device/suffix and frees the BootIndexProperty passed as @opaque. */
+static void property_release_bootindex(Object *obj, const char *name,
+                                       void *opaque)
+
+{
+    BootIndexProperty *prop = opaque;
+
+    del_boot_device_path(prop->dev, prop->suffix);
+    g_free(prop);
+}
+
+/* Add an "int32" property called @name to @obj, backed by *@bootindex and
+ * tied to @dev/@suffix for boot order bookkeeping.  On success the property
+ * is set to -1; on failure the error is propagated to @errp and the
+ * property state is freed. */
+void device_add_bootindex_property(Object *obj, int32_t *bootindex,
+                                   const char *name, const char *suffix,
+                                   DeviceState *dev, Error **errp)
+{
+    BootIndexProperty *prop = g_malloc0(sizeof(*prop));
+    Error *err = NULL;
+
+    prop->dev = dev;
+    prop->suffix = suffix;
+    prop->bootindex = bootindex;
+
+    object_property_add(obj, name, "int32",
+                        device_get_bootindex,
+                        device_set_bootindex,
+                        property_release_bootindex,
+                        prop, &err);
+    if (!err) {
+        /* initialize devices' bootindex property to -1 */
+        object_property_set_int(obj, -1, name, NULL);
+        return;
+    }
+
+    error_propagate(errp, err);
+    g_free(prop);
+}
--- a/170
+++ b/170
@@ -326,7 +326,7 @@ seccomp=""
 glusterfs=""
 glusterfs_discard="no"
 glusterfs_zerofill="no"
-virtio_blk_data_plane=""
+archipelago=""
 gtk=""
 gtkabi=""
 vte=""
@@ -388,6 +388,7 @@ cpp="${CPP-$cc -E}"
 objcopy="${OBJCOPY-${cross_prefix}objcopy}"
 ld="${LD-${cross_prefix}ld}"
 libtool="${LIBTOOL-${cross_prefix}libtool}"
+nm="${NM-${cross_prefix}nm}"
 strip="${STRIP-${cross_prefix}strip}"
 windres="${WINDRES-${cross_prefix}windres}"
 pkg_config_exe="${PKG_CONFIG-${cross_prefix}pkg-config}"
@@ -1087,9 +1088,12 @@ for opt do
  ;;
  --enable-glusterfs) glusterfs="yes"
  ;;
-  --disable-virtio-blk-data-plane) virtio_blk_data_plane="no"
+  --disable-archipelago) archipelago="no"
  ;;
-  --enable-virtio-blk-data-plane) virtio_blk_data_plane="yes"
+  --enable-archipelago) archipelago="yes"
+  ;;
+  --disable-virtio-blk-data-plane|--enable-virtio-blk-data-plane)
+      echo "$0: $opt is obsolete, virtio-blk data-plane is always on" >&2
  ;;
  --disable-gtk) gtk="no"
  ;;
@@ -1344,7 +1348,7 @@ Advanced options (experts only):
  --enable-linux-aio       enable Linux AIO support
  --disable-cap-ng         disable libcap-ng support
  --enable-cap-ng          enable libcap-ng support
-  --disable-attr           disables attr and xattr support
+  --disable-attr           disable attr and xattr support
  --enable-attr            enable attr and xattr support
  --disable-blobs          disable installing provided firmware blobs
  --enable-docs            enable documentation build
@@ -1375,20 +1379,22 @@ Advanced options (experts only):
  --with-vss-sdk=SDK-path  enable Windows VSS support in QEMU Guest Agent
  --with-win-sdk=SDK-path  path to Windows Platform SDK (to build VSS .tlb)
  --disable-seccomp        disable seccomp support
-  --enable-seccomp         enables seccomp support
+  --enable-seccomp         enable seccomp support
  --with-coroutine=BACKEND coroutine backend. Supported options:
                           gthread, ucontext, sigaltstack, windows
  --disable-coroutine-pool disable coroutine freelist (worse performance)
  --enable-coroutine-pool  enable coroutine freelist (better performance)
  --enable-glusterfs       enable GlusterFS backend
  --disable-glusterfs      disable GlusterFS backend
+  --enable-archipelago     enable Archipelago backend
+  --disable-archipelago    disable Archipelago backend
  --enable-gcov            enable test coverage analysis with gcov
  --gcov=GCOV              use specified gcov [$gcov_tool]
  --disable-tpm            disable TPM support
  --enable-tpm             enable TPM support
  --disable-libssh2        disable ssh block device support
  --enable-libssh2         enable ssh block device support
-  --disable-vhdx           disables support for the Microsoft VHDX image format
+  --disable-vhdx           disable support for the Microsoft VHDX image format
  --enable-vhdx            enable support for the Microsoft VHDX image format
  --disable-quorum         disable quorum block filter support
  --enable-quorum          enable quorum block filter support
@@ -1817,7 +1823,8 @@ fi
 # libseccomp check

 if test "$seccomp" != "no" ; then
-    if $pkg_config --atleast-version=2.1.0 libseccomp; then
+    if test "$cpu" = "i386" || test "$cpu" = "x86_64" &&
+        $pkg_config --atleast-version=2.1.1 libseccomp; then
        libs_softmmu="$libs_softmmu `$pkg_config --libs libseccomp`"
        QEMU_CFLAGS="$QEMU_CFLAGS `$pkg_config --cflags libseccomp`"
 	seccomp="yes"
@@ -2709,6 +2716,12 @@ for i in $glib_modules; do
    fi
 done

+# g_test_trap_subprocess added in 2.38. Used by some tests.
+if $pkg_config --atleast-version=2.38 glib-2.0; then
+    glib_subprocess=yes
+else
+    glib_subprocess=no
+fi
+
 ##########################################
 # SHA command probe for modules
 if test "$modules" = yes; then
@@ -2730,7 +2743,7 @@ fi
 if test "$pixman" = ""; then
  if test "$want_tools" = "no" -a "$softmmu" = "no"; then
    pixman="none"
-  elif $pkg_config pixman-1 > /dev/null 2>&1; then
+  elif $pkg_config --atleast-version=0.21.8 pixman-1 > /dev/null 2>&1; then
    pixman="system"
  else
    pixman="internal"
@@ -2746,11 +2759,12 @@ if test "$pixman" = "none"; then
  pixman_cflags=
  pixman_libs=
 elif test "$pixman" = "system"; then
+  # pixman version has been checked above
  pixman_cflags=`$pkg_config --cflags pixman-1`
  pixman_libs=`$pkg_config --libs pixman-1`
 else
  if test ! -d ${source_path}/pixman/pixman; then
-    error_exit "pixman not present. Your options:" \
+    error_exit "pixman >= 0.21.8 not present. Your options:" \
        "  (1) Preferred: Install the pixman devel package (any recent" \
        "      distro should have packages as Xorg needs pixman too)." \
        "  (2) Fetch the pixman submodule, using:" \
@@ -2928,16 +2942,6 @@ else
  tpm_passthrough=no
 fi

-##########################################
-# adjust virtio-blk-data-plane based on linux-aio
-
-if test "$virtio_blk_data_plane" = "yes" -a \
-	"$linux_aio" != "yes" ; then
-  error_exit "virtio-blk-data-plane requires Linux AIO, please try --enable-linux-aio"
-elif test -z "$virtio_blk_data_plane" ; then
-  virtio_blk_data_plane=$linux_aio
-fi
-
 ##########################################
 # attr probe

@@ -3072,6 +3076,33 @@ EOF
  fi
 fi

+
+##########################################
+# archipelago probe
+# Unless archipelago is "no", run compile_prog on a small libxseg program
+# with -lxseg.  On success archipelago becomes "yes" and -lxseg is prepended
+# to libs_tools and libs_softmmu.  On failure archipelago becomes "no",
+# after calling feature_not_found if "yes" had been requested.
+if test "$archipelago" != "no" ; then
+    cat > $TMPC <<EOF
+#include <stdio.h>
+#include <xseg/xseg.h>
+#include <xseg/protocol.h>
+int main(void) {
+    xseg_initialize();
+    return 0;
+}
+EOF
+    archipelago_libs=-lxseg
+    if compile_prog "" "$archipelago_libs"; then
+        archipelago="yes"
+        libs_tools="$archipelago_libs $libs_tools"
+        libs_softmmu="$archipelago_libs $libs_softmmu"
+    else
+      if test "$archipelago" = "yes" ; then
+        feature_not_found "Archipelago backend support" "Install libxseg devel"
+      fi
+      archipelago="no"
+    fi
+fi
+
+
 ##########################################
 # glusterfs probe
 if test "$glusterfs" != "no" ; then
@@ -3087,7 +3118,8 @@ if test "$glusterfs" != "no" ; then
    fi
  else
    if test "$glusterfs" = "yes" ; then
-      feature_not_found "GlusterFS backend support" "Install glusterfs-api devel"
+      feature_not_found "GlusterFS backend support" \
+          "Install glusterfs-api devel >= 3"
    fi
    glusterfs="no"
  fi
@@ -3277,6 +3309,21 @@ if compile_prog "" "" ; then
  fallocate_punch_hole=yes
 fi

+# check for posix_fallocate
+posix_fallocate=no
+cat > $TMPC << EOF
+#include <fcntl.h>
+
+int main(void)
+{
+    posix_fallocate(0, 0, 0);
+    return 0;
+}
+EOF
+if compile_prog "" "" ; then
+    posix_fallocate=yes
+fi
+
 # check for sync_file_range
 sync_file_range=no
 cat > $TMPC << EOF
@@ -3421,6 +3468,37 @@ if compile_prog "" "" ; then
  sendfile=yes
 fi

+# check for timerfd support (glibc 2.8 and newer)
+# (a "yes" result is written to config-host.mak as CONFIG_TIMERFD)
+timerfd=no
+cat > $TMPC << EOF
+#include <sys/timerfd.h>
+
+int main(void)
+{
+    return(timerfd_create(CLOCK_REALTIME, 0));
+}
+EOF
+if compile_prog "" "" ; then
+  timerfd=yes
+fi
+
+# check for setns and unshare support
+setns=no
+cat > $TMPC << EOF
+#include <sched.h>
+
+int main(void)
+{
+    int ret;
+    ret = setns(0, 0);
+    ret = unshare(0);
+    return ret;
+}
+EOF
+if compile_prog "" "" ; then
+  setns=yes
+fi
+
 # Check if tools are available to build documentation.
 if test "$docs" != "no" ; then
  if has makeinfo && has pod2man; then
@@ -3532,7 +3610,8 @@ EOF
    spice_server_version=$($pkg_config --modversion spice-server)
  else
    if test "$spice" = "yes" ; then
-      feature_not_found "spice" "Install spice-server and spice-protocol devel"
+      feature_not_found "spice" \
+          "Install spice-server(>=0.12.0) and spice-protocol(>=0.12.3) devel"
    fi
    spice="no"
  fi
@@ -3563,7 +3642,7 @@ EOF
        smartcard_nss="yes"
    else
        if test "$smartcard_nss" = "yes"; then
-            feature_not_found "nss"
+            feature_not_found "nss" "Install nss devel >= 3.12.8"
        fi
        smartcard_nss="no"
    fi
@@ -3579,7 +3658,7 @@ if test "$libusb" != "no" ; then
        libs_softmmu="$libs_softmmu $libusb_libs"
    else
        if test "$libusb" = "yes"; then
-            feature_not_found "libusb" "Install libusb devel"
+            feature_not_found "libusb" "Install libusb devel >= 1.0.13"
        fi
        libusb="no"
    fi
@@ -3893,12 +3972,11 @@ else
 fi

 ########################################
-# check if we have valgrind/valgrind.h and valgrind/memcheck.h
+# check if we have valgrind/valgrind.h

 valgrind_h=no
 cat > $TMPC << EOF
 #include <valgrind/valgrind.h>
-#include <valgrind/memcheck.h>
 int main(void) {
  return 0;
 }
@@ -4004,7 +4082,7 @@ if test "$libnfs" != "no" ; then
    LIBS="$LIBS $libnfs_libs"
  else
    if test "$libnfs" = "yes" ; then
-      feature_not_found "libnfs"
+      feature_not_found "libnfs" "Install libnfs devel >= 1.9.3"
    fi
    libnfs="no"
  fi
@@ -4134,9 +4212,9 @@ EOF
  fi
 fi

-# add pixman flags after all config tests are done
-QEMU_CFLAGS="$QEMU_CFLAGS $pixman_cflags $fdt_cflags"
-libs_softmmu="$libs_softmmu $pixman_libs"
+# prepend pixman and fdt flags after all config tests are done
+QEMU_CFLAGS="$pixman_cflags $fdt_cflags $QEMU_CFLAGS"
+libs_softmmu="$pixman_libs $libs_softmmu"

 echo "Install prefix    $prefix"
 echo "BIOS directory    `eval echo $qemu_datadir`"
@@ -4251,7 +4329,7 @@ echo "seccomp support   $seccomp"
 echo "coroutine backend $coroutine"
 echo "coroutine pool    $coroutine_pool"
 echo "GlusterFS support $glusterfs"
-echo "virtio-blk-data-plane $virtio_blk_data_plane"
+echo "Archipelago support $archipelago"
 echo "gcov              $gcov_tool"
 echo "gcov enabled      $gcov"
 echo "TPM support       $tpm"
@@ -4460,6 +4538,9 @@ fi
 if test "$fallocate_punch_hole" = "yes" ; then
  echo "CONFIG_FALLOCATE_PUNCH_HOLE=y" >> $config_host_mak
 fi
+if test "$posix_fallocate" = "yes" ; then
+  echo "CONFIG_POSIX_FALLOCATE=y" >> $config_host_mak
+fi
 if test "$sync_file_range" = "yes" ; then
  echo "CONFIG_SYNC_FILE_RANGE=y" >> $config_host_mak
 fi
@@ -4487,6 +4568,12 @@ fi
 if test "$sendfile" = "yes" ; then
  echo "CONFIG_SENDFILE=y" >> $config_host_mak
 fi
+if test "$timerfd" = "yes" ; then
+  echo "CONFIG_TIMERFD=y" >> $config_host_mak
+fi
+if test "$setns" = "yes" ; then
+  echo "CONFIG_SETNS=y" >> $config_host_mak
+fi
 if test "$inotify" = "yes" ; then
  echo "CONFIG_INOTIFY=y" >> $config_host_mak
 fi
@@ -4511,6 +4598,9 @@ if test "$bluez" = "yes" ; then
  echo "CONFIG_BLUEZ=y" >> $config_host_mak
  echo "BLUEZ_CFLAGS=$bluez_cflags" >> $config_host_mak
 fi
+# glib_subprocess is set by the glib >= 2.38 probe earlier in this script
+if test "$glib_subprocess" = "yes" ; then
+  echo "CONFIG_HAS_GLIB_SUBPROCESS_TESTS=y" >> $config_host_mak
+fi
 echo "GLIB_CFLAGS=$glib_cflags" >> $config_host_mak
 if test "$gtk" = "yes" ; then
  echo "CONFIG_GTK=y" >> $config_host_mak
@@ -4689,6 +4779,11 @@ if test "$glusterfs_zerofill" = "yes" ; then
  echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak
 fi

+if test "$archipelago" = "yes" ; then
+  echo "CONFIG_ARCHIPELAGO=m" >> $config_host_mak
+  echo "ARCHIPELAGO_LIBS=$archipelago_libs" >> $config_host_mak
+fi
+
 if test "$libssh2" = "yes" ; then
  echo "CONFIG_LIBSSH2=m" >> $config_host_mak
  echo "LIBSSH2_CFLAGS=$libssh2_cflags" >> $config_host_mak
@@ -4699,10 +4794,6 @@ if test "$quorum" = "yes" ; then
  echo "CONFIG_QUORUM=y" >> $config_host_mak
 fi

-if test "$virtio_blk_data_plane" = "yes" ; then
-  echo 'CONFIG_VIRTIO_BLK_DATA_PLANE=$(CONFIG_VIRTIO)' >> $config_host_mak
-fi
-
 if test "$vhdx" = "yes" ; then
  echo "CONFIG_VHDX=y" >> $config_host_mak
 fi
@@ -4809,6 +4900,7 @@ echo "AS=$as" >> $config_host_mak
 echo "CPP=$cpp" >> $config_host_mak
 echo "OBJCOPY=$objcopy" >> $config_host_mak
 echo "LD=$ld" >> $config_host_mak
+echo "NM=$nm" >> $config_host_mak
 echo "WINDRES=$windres" >> $config_host_mak
 echo "LIBTOOL=$libtool" >> $config_host_mak
 echo "CFLAGS=$CFLAGS" >> $config_host_mak
@@ -4817,6 +4909,7 @@ echo "QEMU_CFLAGS=$QEMU_CFLAGS" >> $config_host_mak
 echo "QEMU_INCLUDES=$QEMU_INCLUDES" >> $config_host_mak
 if test "$sparse" = "yes" ; then
  echo "CC           := REAL_CC=\"\$(CC)\" cgcc"       >> $config_host_mak
+  echo "CXX          := REAL_CC=\"\$(CXX)\" cgcc"      >> $config_host_mak
  echo "HOST_CC      := REAL_CC=\"\$(HOST_CC)\" cgcc"  >> $config_host_mak
  echo "QEMU_CFLAGS  += -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-non-pointer-null" >> $config_host_mak
 fi
@@ -4937,7 +5030,7 @@ case "$target_name" in
  aarch64)
    TARGET_BASE_ARCH=arm
    bflt="yes"
-    gdb_xml_files="aarch64-core.xml aarch64-fpu.xml"
+    gdb_xml_files="aarch64-core.xml aarch64-fpu.xml arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml"
  ;;
  cris)
  ;;
@@ -4966,6 +5059,8 @@ case "$target_name" in
    TARGET_BASE_ARCH=mips
    echo "TARGET_ABI_MIPSN64=y" >> $config_target_mak
  ;;
+  tricore)
+  ;;
  moxie)
  ;;
  or32)
@@ -5014,6 +5109,7 @@ case "$target_name" in
    echo "TARGET_ABI32=y" >> $config_target_mak
  ;;
  s390x)
+    gdb_xml_files="s390x-core64.xml s390-acr.xml s390-fpr.xml"
  ;;
  unicore32)
  ;;
@@ -5293,10 +5389,6 @@ for rom in seabios vgabios ; do
    echo "LD=$ld" >> $config_mak
 done

-if test "$docs" = "yes" ; then
-  mkdir -p QMP
-fi
-
 # set up qemu-iotests in this build directory
 iotests_common_env="tests/qemu-iotests/common.env"
 iotests_check="tests/qemu-iotests/check"
--- a/coroutine-sigaltstack.c
+++ b/coroutine-sigaltstack.c
@@ -155,7 +155,7 @@ Coroutine *qemu_coroutine_new(void)
    stack_t oss;
    sigset_t sigs;
    sigset_t osigs;
-    jmp_buf old_env;
+    sigjmp_buf old_env;

    /* The way to manipulate stack is with the sigaltstack function. We
     * prepare a stack, with it delivering a signal to ourselves and then
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -18,10 +18,114 @@
 */
 #include "config.h"
 #include "cpu.h"
+#include "trace.h"
 #include "disas/disas.h"
 #include "tcg.h"
 #include "qemu/atomic.h"
 #include "sysemu/qtest.h"
+#include "qemu/timer.h"
+
+/* -icount align implementation. */
+
+/* Bookkeeping used by init_delay_params(), align_clocks() and print_delay() */
+typedef struct SyncClocks {
+    /* virtual clock minus realtime clock (plus clock offset) in ns;
+     * align_clocks() sleeps when it exceeds VM_CLOCK_ADVANCE */
+    int64_t diff_clk;
+    /* icount_extra + icount_decr.u16.low as sampled at the last update */
+    int64_t last_cpu_icount;
+    /* QEMU_CLOCK_REALTIME in ns as sampled by init_delay_params() */
+    int64_t realtime_clock;
+} SyncClocks;
+
+#if !defined(CONFIG_USER_ONLY)
+/* Allow the guest to have a max 3ms advance.
+ * The difference between the 2 clocks could therefore
+ * oscillate around 0.
+ */
+#define VM_CLOCK_ADVANCE 3000000
+#define THRESHOLD_REDUCE 1.5
+#define MAX_DELAY_PRINT_RATE 2000000000LL
+#define MAX_NB_PRINTS 100
+
+/* With -icount align enabled, fold the icount change since the previous
+ * sample into sc->diff_clk and, when the result exceeds VM_CLOCK_ADVANCE,
+ * sleep for that amount of host time. */
+static void align_clocks(SyncClocks *sc, const CPUState *cpu)
+{
+    int64_t icount_now;
+
+    if (!icount_align_option) {
+        return;
+    }
+
+    icount_now = cpu->icount_extra + cpu->icount_decr.u16.low;
+    sc->diff_clk += cpu_icount_to_ns(sc->last_cpu_icount - icount_now);
+    sc->last_cpu_icount = icount_now;
+
+    if (sc->diff_clk <= VM_CLOCK_ADVANCE) {
+        return;
+    }
+
+#ifndef _WIN32
+    {
+        struct timespec wanted, left;
+
+        wanted.tv_sec = sc->diff_clk / 1000000000LL;
+        wanted.tv_nsec = sc->diff_clk % 1000000000LL;
+        if (nanosleep(&wanted, &left) >= 0) {
+            sc->diff_clk = 0;
+        } else {
+            /* deduct the requested time minus what nanosleep left over */
+            sc->diff_clk -= (wanted.tv_sec - left.tv_sec) * 1000000000LL;
+            sc->diff_clk -= wanted.tv_nsec - left.tv_nsec;
+        }
+    }
+#else
+    Sleep(sc->diff_clk / SCALE_MS);
+    sc->diff_clk = 0;
+#endif
+}
+
+/* Print a warning about how late the guest is.  Nothing is printed unless
+ * -icount align is enabled, at least MAX_DELAY_PRINT_RATE ns of realtime
+ * passed since the last message, and fewer than MAX_NB_PRINTS messages were
+ * printed so far.  A message is emitted only when the lateness rose above
+ * the last reported threshold or dropped more than THRESHOLD_REDUCE
+ * seconds below it. */
+static void print_delay(const SyncClocks *sc)
+{
+    static float threshold_delay;
+    static int64_t last_realtime_clock;
+    static int nb_prints;
+
+    if (!icount_align_option ||
+        sc->realtime_clock - last_realtime_clock < MAX_DELAY_PRINT_RATE ||
+        nb_prints >= MAX_NB_PRINTS) {
+        return;
+    }
+
+    if ((-sc->diff_clk / (float)1000000000LL > threshold_delay) ||
+        (-sc->diff_clk / (float)1000000000LL <
+         (threshold_delay - THRESHOLD_REDUCE))) {
+        /* whole seconds of lateness, rounded up to the next second */
+        threshold_delay = (-sc->diff_clk / 1000000000LL) + 1;
+        printf("Warning: The guest is now late by %.1f to %.1f seconds\n",
+               threshold_delay - 1,
+               threshold_delay);
+        nb_prints++;
+        last_realtime_clock = sc->realtime_clock;
+    }
+}
+
+/* With -icount align enabled, sample the realtime clock, compute the
+ * current virtual/realtime clock difference into @sc, record the CPU's
+ * icount, update the max_delay/max_advance extremes and possibly print a
+ * lateness warning. */
+static void init_delay_params(SyncClocks *sc,
+                              const CPUState *cpu)
+{
+    if (!icount_align_option) {
+        return;
+    }
+    sc->realtime_clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    sc->diff_clk = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) -
+                   sc->realtime_clock +
+                   cpu_get_clock_offset();
+    sc->last_cpu_icount = cpu->icount_extra + cpu->icount_decr.u16.low;
+    /* max_delay tracks the smallest difference seen, max_advance the
+     * largest */
+    if (sc->diff_clk < max_delay) {
+        max_delay = sc->diff_clk;
+    }
+    if (sc->diff_clk > max_advance) {
+        max_advance = sc->diff_clk;
+    }
+
+    /* Print every 2s max if the guest is late. We limit the number
+       of printed messages to MAX_NB_PRINTS (currently 100) */
+    print_delay(sc);
+}
+#else
+/* CONFIG_USER_ONLY builds: the -icount align helpers are empty stubs */
+static void align_clocks(SyncClocks *sc, const CPUState *cpu)
+{
+}
+
+static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
+{
+}
+#endif /* CONFIG USER ONLY */

 void cpu_loop_exit(CPUState *cpu)
 {
@@ -65,6 +169,9 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, uint8_t *tb_ptr)
 #endif /* DEBUG_DISAS */

    next_tb = tcg_qemu_tb_exec(env, tb_ptr);
+    trace_exec_tb_exit((void *) (next_tb & ~TB_EXIT_MASK),
+                       next_tb & TB_EXIT_MASK);
+
    if ((next_tb & TB_EXIT_MASK) > TB_EXIT_IDX1) {
        /* We didn't start executing this TB (eg because the instruction
         * counter hit zero); we must restore the guest PC to the address
@@ -105,6 +212,7 @@ static void cpu_exec_nocache(CPUArchState *env, int max_cycles,
                     max_cycles);
    cpu->current_tb = tb;
    /* execute the generated code */
+    trace_exec_tb_nocache(tb, tb->pc);
    cpu_tb_exec(cpu, tb->tc_ptr);
    cpu->current_tb = NULL;
    tb_phys_invalidate(tb, -1);
@@ -187,16 +295,10 @@ static inline TranslationBlock *tb_find_fast(CPUArchState *env)
    return tb;
 }

-static CPUDebugExcpHandler *debug_excp_handler;
-
-void cpu_set_debug_excp_handler(CPUDebugExcpHandler *handler)
-{
-    debug_excp_handler = handler;
-}
-
 static void cpu_handle_debug_exception(CPUArchState *env)
 {
    CPUState *cpu = ENV_GET_CPU(env);
+    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUWatchpoint *wp;

    if (!cpu->watchpoint_hit) {
@@ -204,9 +306,8 @@ static void cpu_handle_debug_exception(CPUArchState *env)
            wp->flags &= ~BP_WATCHPOINT_HIT;
        }
    }
-    if (debug_excp_handler) {
-        debug_excp_handler(env);
-    }
+
+    cc->debug_excp_handler(cpu);
 }

 /* main execution loop */
@@ -216,10 +317,7 @@ volatile sig_atomic_t exit_request;
 int cpu_exec(CPUArchState *env)
 {
    CPUState *cpu = ENV_GET_CPU(env);
-#if !(defined(CONFIG_USER_ONLY) && \
-      (defined(TARGET_M68K) || defined(TARGET_PPC) || defined(TARGET_S390X)))
    CPUClass *cc = CPU_GET_CLASS(cpu);
-#endif
 #ifdef TARGET_I386
    X86CPU *x86_cpu = X86_CPU(cpu);
 #endif
@@ -227,6 +325,8 @@ int cpu_exec(CPUArchState *env)
    TranslationBlock *tb;
    uint8_t *tc_ptr;
    uintptr_t next_tb;
+    SyncClocks sc;
+
    /* This must be volatile so it is not trashed by longjmp() */
    volatile bool have_tb_lock = false;

@@ -252,37 +352,16 @@ int cpu_exec(CPUArchState *env)
        cpu->exit_request = 1;
    }

-#if defined(TARGET_I386)
-    /* put eflags in CPU temporary format */
-    CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
-    env->df = 1 - (2 * ((env->eflags >> 10) & 1));
-    CC_OP = CC_OP_EFLAGS;
-    env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
-#elif defined(TARGET_SPARC)
-#elif defined(TARGET_M68K)
-    env->cc_op = CC_OP_FLAGS;
-    env->cc_dest = env->sr & 0xf;
-    env->cc_x = (env->sr >> 4) & 1;
-#elif defined(TARGET_ALPHA)
-#elif defined(TARGET_ARM)
-#elif defined(TARGET_UNICORE32)
-#elif defined(TARGET_PPC)
-    env->reserve_addr = -1;
-#elif defined(TARGET_LM32)
-#elif defined(TARGET_MICROBLAZE)
-#elif defined(TARGET_MIPS)
-#elif defined(TARGET_MOXIE)
-#elif defined(TARGET_OPENRISC)
-#elif defined(TARGET_SH4)
-#elif defined(TARGET_CRIS)
-#elif defined(TARGET_S390X)
-#elif defined(TARGET_XTENSA)
-    /* XXXXX */
-#else
-#error unsupported target CPU
-#endif
+    cc->cpu_exec_enter(cpu);
    cpu->exception_index = -1;

+    /* Calculate difference between guest clock and host clock.
+     * This delay includes the delay of the last cycle, so
+     * what we have to do is sleep until it is 0. As for the
+     * advance/delay we gain here, we try to fix it next time.
+     */
+    init_delay_params(&sc, cpu);
+
    /* prepare setjmp context for exception handling */
    for(;;) {
        if (sigsetjmp(cpu->jmp_env, 0) == 0) {
@@ -325,16 +404,12 @@ int cpu_exec(CPUArchState *env)
                        cpu->exception_index = EXCP_DEBUG;
                        cpu_loop_exit(cpu);
                    }
-#if defined(TARGET_ARM) || defined(TARGET_SPARC) || defined(TARGET_MIPS) || \
-    defined(TARGET_PPC) || defined(TARGET_ALPHA) || defined(TARGET_CRIS) || \
-    defined(TARGET_MICROBLAZE) || defined(TARGET_LM32) || defined(TARGET_UNICORE32)
                    if (interrupt_request & CPU_INTERRUPT_HALT) {
                        cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
                        cpu->halted = 1;
                        cpu->exception_index = EXCP_HLT;
                        cpu_loop_exit(cpu);
                    }
-#endif
 #if defined(TARGET_I386)
                    if (interrupt_request & CPU_INTERRUPT_INIT) {
                        cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
@@ -347,251 +422,15 @@ int cpu_exec(CPUArchState *env)
                        cpu_reset(cpu);
                    }
 #endif
-#if defined(TARGET_I386)
-#if !defined(CONFIG_USER_ONLY)
-                    if (interrupt_request & CPU_INTERRUPT_POLL) {
-                        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
-                        apic_poll_irq(x86_cpu->apic_state);
-                    }
-#endif
-                    if (interrupt_request & CPU_INTERRUPT_SIPI) {
-                            do_cpu_sipi(x86_cpu);
-                    } else if (env->hflags2 & HF2_GIF_MASK) {
-                        if ((interrupt_request & CPU_INTERRUPT_SMI) &&
-                            !(env->hflags & HF_SMM_MASK)) {
-                            cpu_svm_check_intercept_param(env, SVM_EXIT_SMI,
-                                                          0);
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
-                            do_smm_enter(x86_cpu);
-                            next_tb = 0;
-                        } else if ((interrupt_request & CPU_INTERRUPT_NMI) &&
-                                   !(env->hflags2 & HF2_NMI_MASK)) {
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
-                            env->hflags2 |= HF2_NMI_MASK;
-                            do_interrupt_x86_hardirq(env, EXCP02_NMI, 1);
-                            next_tb = 0;
-                        } else if (interrupt_request & CPU_INTERRUPT_MCE) {
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_MCE;
-                            do_interrupt_x86_hardirq(env, EXCP12_MCHK, 0);
-                            next_tb = 0;
-                        } else if ((interrupt_request & CPU_INTERRUPT_HARD) &&
-                                   (((env->hflags2 & HF2_VINTR_MASK) && 
-                                     (env->hflags2 & HF2_HIF_MASK)) ||
-                                    (!(env->hflags2 & HF2_VINTR_MASK) && 
-                                     (env->eflags & IF_MASK && 
-                                      !(env->hflags & HF_INHIBIT_IRQ_MASK))))) {
-                            int intno;
-                            cpu_svm_check_intercept_param(env, SVM_EXIT_INTR,
-                                                          0);
-                            cpu->interrupt_request &= ~(CPU_INTERRUPT_HARD |
-                                                        CPU_INTERRUPT_VIRQ);
-                            intno = cpu_get_pic_interrupt(env);
-                            qemu_log_mask(CPU_LOG_TB_IN_ASM, "Servicing hardware INT=0x%02x\n", intno);
-                            do_interrupt_x86_hardirq(env, intno, 1);
-                            /* ensure that no TB jump will be modified as
-                               the program flow was changed */
-                            next_tb = 0;
-#if !defined(CONFIG_USER_ONLY)
-                        } else if ((interrupt_request & CPU_INTERRUPT_VIRQ) &&
-                                   (env->eflags & IF_MASK) && 
-                                   !(env->hflags & HF_INHIBIT_IRQ_MASK)) {
-                            int intno;
-                            /* FIXME: this should respect TPR */
-                            cpu_svm_check_intercept_param(env, SVM_EXIT_VINTR,
-                                                          0);
-                            intno = ldl_phys(cpu->as,
-                                             env->vm_vmcb
-                                             + offsetof(struct vmcb,
-                                                        control.int_vector));
-                            qemu_log_mask(CPU_LOG_TB_IN_ASM, "Servicing virtual hardware INT=0x%02x\n", intno);
-                            do_interrupt_x86_hardirq(env, intno, 1);
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_VIRQ;
-                            next_tb = 0;
-#endif
-                        }
-                    }
-#elif defined(TARGET_PPC)
-                    if (interrupt_request & CPU_INTERRUPT_HARD) {
-                        ppc_hw_interrupt(env);
-                        if (env->pending_interrupts == 0) {
-                            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
-                        }
+                    /* The target hook has 3 exit conditions:
+                       False when the interrupt isn't processed,
+                       True when it is, and we should restart on a new TB,
+                       and via longjmp via cpu_loop_exit.  */
+                    if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
                        next_tb = 0;
                    }
-#elif defined(TARGET_LM32)
-                    if ((interrupt_request & CPU_INTERRUPT_HARD)
-                        && (env->ie & IE_IE)) {
-                        cpu->exception_index = EXCP_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_MICROBLAZE)
-                    if ((interrupt_request & CPU_INTERRUPT_HARD)
-                        && (env->sregs[SR_MSR] & MSR_IE)
-                        && !(env->sregs[SR_MSR] & (MSR_EIP | MSR_BIP))
-                        && !(env->iflags & (D_FLAG | IMM_FLAG))) {
-                        cpu->exception_index = EXCP_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_MIPS)
-                    if ((interrupt_request & CPU_INTERRUPT_HARD) &&
-                        cpu_mips_hw_interrupts_pending(env)) {
-                        /* Raise it */
-                        cpu->exception_index = EXCP_EXT_INTERRUPT;
-                        env->error_code = 0;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_OPENRISC)
-                    {
-                        int idx = -1;
-                        if ((interrupt_request & CPU_INTERRUPT_HARD)
-                            && (env->sr & SR_IEE)) {
-                            idx = EXCP_INT;
-                        }
-                        if ((interrupt_request & CPU_INTERRUPT_TIMER)
-                            && (env->sr & SR_TEE)) {
-                            idx = EXCP_TICK;
-                        }
-                        if (idx >= 0) {
-                            cpu->exception_index = idx;
-                            cc->do_interrupt(cpu);
-                            next_tb = 0;
-                        }
-                    }
-#elif defined(TARGET_SPARC)
-                    if (interrupt_request & CPU_INTERRUPT_HARD) {
-                        if (cpu_interrupts_enabled(env) &&
-                            env->interrupt_index > 0) {
-                            int pil = env->interrupt_index & 0xf;
-                            int type = env->interrupt_index & 0xf0;
-
-                            if (((type == TT_EXTINT) &&
-                                  cpu_pil_allowed(env, pil)) ||
-                                  type != TT_EXTINT) {
-                                cpu->exception_index = env->interrupt_index;
-                                cc->do_interrupt(cpu);
-                                next_tb = 0;
-                            }
-                        }
-                    }
-#elif defined(TARGET_ARM)
-                    if (interrupt_request & CPU_INTERRUPT_FIQ
-                        && !(env->daif & PSTATE_F)) {
-                        cpu->exception_index = EXCP_FIQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-                    /* ARMv7-M interrupt return works by loading a magic value
-                       into the PC.  On real hardware the load causes the
-                       return to occur.  The qemu implementation performs the
-                       jump normally, then does the exception return when the
-                       CPU tries to execute code at the magic address.
-                       This will cause the magic PC value to be pushed to
-                       the stack if an interrupt occurred at the wrong time.
-                       We avoid this by disabling interrupts when
-                       pc contains a magic address.  */
-                    if (interrupt_request & CPU_INTERRUPT_HARD
-                        && ((IS_M(env) && env->regs[15] < 0xfffffff0)
-                            || !(env->daif & PSTATE_I))) {
-                        cpu->exception_index = EXCP_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_UNICORE32)
-                    if (interrupt_request & CPU_INTERRUPT_HARD
-                        && !(env->uncached_asr & ASR_I)) {
-                        cpu->exception_index = UC32_EXCP_INTR;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_SH4)
-                    if (interrupt_request & CPU_INTERRUPT_HARD) {
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_ALPHA)
-                    {
-                        int idx = -1;
-                        /* ??? This hard-codes the OSF/1 interrupt levels.  */
-                        switch (env->pal_mode ? 7 : env->ps & PS_INT_MASK) {
-                        case 0 ... 3:
-                            if (interrupt_request & CPU_INTERRUPT_HARD) {
-                                idx = EXCP_DEV_INTERRUPT;
-                            }
-                            /* FALLTHRU */
-                        case 4:
-                            if (interrupt_request & CPU_INTERRUPT_TIMER) {
-                                idx = EXCP_CLK_INTERRUPT;
-                            }
-                            /* FALLTHRU */
-                        case 5:
-                            if (interrupt_request & CPU_INTERRUPT_SMP) {
-                                idx = EXCP_SMP_INTERRUPT;
-                            }
-                            /* FALLTHRU */
-                        case 6:
-                            if (interrupt_request & CPU_INTERRUPT_MCHK) {
-                                idx = EXCP_MCHK;
-                            }
-                        }
-                        if (idx >= 0) {
-                            cpu->exception_index = idx;
-                            env->error_code = 0;
-                            cc->do_interrupt(cpu);
-                            next_tb = 0;
-                        }
-                    }
-#elif defined(TARGET_CRIS)
-                    if (interrupt_request & CPU_INTERRUPT_HARD
-                        && (env->pregs[PR_CCS] & I_FLAG)
-                        && !env->locked_irq) {
-                        cpu->exception_index = EXCP_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-                    if (interrupt_request & CPU_INTERRUPT_NMI) {
-                        unsigned int m_flag_archval;
-                        if (env->pregs[PR_VR] < 32) {
-                            m_flag_archval = M_FLAG_V10;
-                        } else {
-                            m_flag_archval = M_FLAG_V32;
-                        }
-                        if ((env->pregs[PR_CCS] & m_flag_archval)) {
-                            cpu->exception_index = EXCP_NMI;
-                            cc->do_interrupt(cpu);
-                            next_tb = 0;
-                        }
-                    }
-#elif defined(TARGET_M68K)
-                    if (interrupt_request & CPU_INTERRUPT_HARD
-                        && ((env->sr & SR_I) >> SR_I_SHIFT)
-                            < env->pending_level) {
-                        /* Real hardware gets the interrupt vector via an
-                           IACK cycle at this point.  Current emulated
-                           hardware doesn't rely on this, so we
-                           provide/save the vector when the interrupt is
-                           first signalled.  */
-                        cpu->exception_index = env->pending_vector;
-                        do_interrupt_m68k_hardirq(env);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_S390X) && !defined(CONFIG_USER_ONLY)
-                    if ((interrupt_request & CPU_INTERRUPT_HARD) &&
-                        (env->psw.mask & PSW_MASK_EXT)) {
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#elif defined(TARGET_XTENSA)
-                    if (interrupt_request & CPU_INTERRUPT_HARD) {
-                        cpu->exception_index = EXC_IRQ;
-                        cc->do_interrupt(cpu);
-                        next_tb = 0;
-                    }
-#endif
-                   /* Don't use the cached interrupt_request value,
-                      do_interrupt may have updated the EXITTB flag. */
+                    /* Don't use the cached interrupt_request value,
+                       do_interrupt may have updated the EXITTB flag. */
                    if (cpu->interrupt_request & CPU_INTERRUPT_EXITTB) {
                        cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
                        /* ensure that no TB jump will be modified as
@@ -637,6 +476,7 @@ int cpu_exec(CPUArchState *env)
                cpu->current_tb = tb;
                barrier();
                if (likely(!cpu->exit_request)) {
+                    trace_exec_tb(tb, tb->pc);
                    tc_ptr = tb->tc_ptr;
                    /* execute the generated code */
                    next_tb = cpu_tb_exec(cpu, tc_ptr);
@@ -672,6 +512,7 @@ int cpu_exec(CPUArchState *env)
                            if (insns_left > 0) {
                                /* Execute remaining instructions.  */
                                cpu_exec_nocache(env, insns_left, tb);
+                                align_clocks(&sc, cpu);
                            }
                            cpu->exception_index = EXCP_INTERRUPT;
                            next_tb = 0;
@@ -684,6 +525,9 @@ int cpu_exec(CPUArchState *env)
                    }
                }
                cpu->current_tb = NULL;
+                /* Try to align the host and virtual clocks
+                   if the guest is in advance */
+                align_clocks(&sc, cpu);
                /* reset soft MMU for next block (it can currently
                   only be set by a memory fault) */
            } /* for(;;) */
@@ -692,10 +536,7 @@ int cpu_exec(CPUArchState *env)
             * local variables as longjmp is marked 'noreturn'. */
            cpu = current_cpu;
            env = cpu->env_ptr;
-#if !(defined(CONFIG_USER_ONLY) && \
-      (defined(TARGET_M68K) || defined(TARGET_PPC) || defined(TARGET_S390X)))
            cc = CPU_GET_CLASS(cpu);
-#endif
 #ifdef TARGET_I386
            x86_cpu = X86_CPU(cpu);
 #endif
@@ -706,35 +547,7 @@ int cpu_exec(CPUArchState *env)
        }
    } /* for(;;) */

-
-#if defined(TARGET_I386)
-    /* restore flags in standard format */
-    env->eflags = env->eflags | cpu_cc_compute_all(env, CC_OP)
-        | (env->df & DF_MASK);
-#elif defined(TARGET_ARM)
-    /* XXX: Save/restore host fpu exception state?.  */
-#elif defined(TARGET_UNICORE32)
-#elif defined(TARGET_SPARC)
-#elif defined(TARGET_PPC)
-#elif defined(TARGET_LM32)
-#elif defined(TARGET_M68K)
-    cpu_m68k_flush_flags(env, env->cc_op);
-    env->cc_op = CC_OP_FLAGS;
-    env->sr = (env->sr & 0xffe0)
-              | env->cc_dest | (env->cc_x << 4);
-#elif defined(TARGET_MICROBLAZE)
-#elif defined(TARGET_MIPS)
-#elif defined(TARGET_MOXIE)
-#elif defined(TARGET_OPENRISC)
-#elif defined(TARGET_SH4)
-#elif defined(TARGET_ALPHA)
-#elif defined(TARGET_CRIS)
-#elif defined(TARGET_S390X)
-#elif defined(TARGET_XTENSA)
-    /* XXXXX */
-#else
-#error unsupported target CPU
-#endif
+    cc->cpu_exec_exit(cpu);

    /* fail safe : never use current_cpu outside cpu_exec() */
    current_cpu = NULL;
--- a/cpus.c
+++ b/cpus.c
@@ -40,6 +40,7 @@
 #include "qemu/bitmap.h"
 #include "qemu/seqlock.h"
 #include "qapi-event.h"
+#include "hw/nmi.h"

 #ifndef _WIN32
 #include "qemu/compatfd.h"
@@ -64,6 +65,8 @@
 #endif /* CONFIG_LINUX */

 static CPUState *next_cpu;
+int64_t max_delay;
+int64_t max_advance;

 bool cpu_is_stopped(CPUState *cpu)
 {
@@ -102,17 +105,12 @@ static bool all_cpu_threads_idle(void)

 /* Protected by TimersState seqlock */

-/* Compensate for varying guest execution speed.  */
-static int64_t qemu_icount_bias;
-static int64_t vm_clock_warp_start;
+static int64_t vm_clock_warp_start = -1;
 /* Conversion factor from emulated instructions to virtual clock ticks.  */
 static int icount_time_shift;
 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 #define MAX_ICOUNT_SHIFT 10

-/* Only written by TCG thread */
-static int64_t qemu_icount;
-
 static QEMUTimer *icount_rt_timer;
 static QEMUTimer *icount_vm_timer;
 static QEMUTimer *icount_warp_timer;
@@ -129,6 +127,11 @@ typedef struct TimersState {
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;
+
+    /* Compensate for varying guest execution speed.  */
+    int64_t qemu_icount_bias;
+    /* Only written by TCG thread */
+    int64_t qemu_icount;
 } TimersState;

 static TimersState timers_state;
@@ -139,14 +142,14 @@ static int64_t cpu_get_icount_locked(void)
    int64_t icount;
    CPUState *cpu = current_cpu;

-    icount = qemu_icount;
+    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu_can_do_io(cpu)) {
            fprintf(stderr, "Bad clock read\n");
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
-    return qemu_icount_bias + (icount << icount_time_shift);
+    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 }

 int64_t cpu_get_icount(void)
@@ -162,6 +165,11 @@ int64_t cpu_get_icount(void)
    return icount;
 }

+int64_t cpu_icount_to_ns(int64_t icount)
+{
+    return icount << icount_time_shift;
+}
+
 /* return the host CPU cycle counter and handle stop/restart */
 /* Caller must hold the BQL */
 int64_t cpu_get_ticks(void)
@@ -214,6 +222,23 @@ int64_t cpu_get_clock(void)
    return ti;
 }

+/* return the offset between the host clock and virtual CPU clock */
+int64_t cpu_get_clock_offset(void)
+{
+    int64_t ti;
+    unsigned start;
+
+    do {
+        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
+        ti = timers_state.cpu_clock_offset;
+        if (!timers_state.cpu_ticks_enabled) {
+            ti -= get_clock();
+        }
+    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
+
+    return -ti;
+}
+
 /* enable cpu_get_ticks()
 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
 */
@@ -284,7 +309,8 @@ static void icount_adjust(void)
        icount_time_shift++;
    }
    last_delta = delta;
-    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
+    timers_state.qemu_icount_bias = cur_icount
+                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 }

@@ -333,7 +359,7 @@ static void icount_warp_rt(void *opaque)
            int64_t delta = cur_time - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
-        qemu_icount_bias += warp_delta;
+        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
@@ -351,7 +377,7 @@ void qtest_clock_warp(int64_t dest)
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
        seqlock_write_lock(&timers_state.vm_clock_seqlock);
-        qemu_icount_bias += warp;
+        timers_state.qemu_icount_bias += warp;
        seqlock_write_unlock(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
@@ -428,6 +454,25 @@ void qemu_clock_warp(QEMUClockType type)
    }
 }

+static bool icount_state_needed(void *opaque)
+{
+    return use_icount;
+}
+
+/*
+ * This is a subsection for icount migration.
+ */
+static const VMStateDescription icount_vmstate_timers = {
+    .name = "timer/icount",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_INT64(qemu_icount_bias, TimersState),
+        VMSTATE_INT64(qemu_icount, TimersState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
@@ -437,23 +482,48 @@ static const VMStateDescription vmstate_timers = {
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
+    },
+    .subsections = (VMStateSubsection[]) {
+        {
+            .vmsd = &icount_vmstate_timers,
+            .needed = icount_state_needed,
+        }, {
+            /* empty */
+        }
    }
 };

-void configure_icount(const char *option)
+void cpu_ticks_init(void)
 {
    seqlock_init(&timers_state.vm_clock_seqlock, NULL);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
+}
+
+void configure_icount(QemuOpts *opts, Error **errp)
+{
+    const char *option;
+    char *rem_str = NULL;
+
+    option = qemu_opt_get(opts, "shift");
    if (!option) {
+        if (qemu_opt_get(opts, "align") != NULL) {
+            error_setg(errp, "Please specify shift option when using align");
+        }
        return;
    }
-
+    icount_align_option = qemu_opt_get_bool(opts, "align", false);
    icount_warp_timer = timer_new_ns(QEMU_CLOCK_REALTIME,
                                          icount_warp_rt, NULL);
    if (strcmp(option, "auto") != 0) {
-        icount_time_shift = strtol(option, NULL, 0);
+        errno = 0;
+        icount_time_shift = strtol(option, &rem_str, 0);
+        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
+            error_setg(errp, "icount: Invalid shift value");
+        }
        use_icount = 1;
        return;
+    } else if (icount_align_option) {
+        error_setg(errp, "shift=auto and align=on are incompatible");
    }

    use_icount = 2;
@@ -523,6 +593,15 @@ void cpu_synchronize_all_post_init(void)
    }
 }

+void cpu_clean_all_dirty(void)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        cpu_clean_state(cpu);
+    }
+}
+
 static int do_vm_stop(RunState state)
 {
    int ret = 0;
@@ -1250,7 +1329,8 @@ static int tcg_cpu_exec(CPUArchState *env)
        int64_t count;
        int64_t deadline;
        int decr;
-        qemu_icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
+        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
@@ -1265,7 +1345,7 @@ static int tcg_cpu_exec(CPUArchState *env)
        }

        count = qemu_icount_round(deadline);
-        qemu_icount += count;
+        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
@@ -1278,7 +1358,8 @@ static int tcg_cpu_exec(CPUArchState *env)
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
-        qemu_icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
+        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+                        + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
    }
@@ -1342,6 +1423,9 @@ CpuInfoList *qmp_query_cpus(Error **errp)
 #elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
+#elif defined(TARGET_TRICORE)
+        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
+        CPUTriCoreState *env = &tricore_cpu->env;
 #endif

        cpu_synchronize_state(cpu);
@@ -1366,6 +1450,9 @@ CpuInfoList *qmp_query_cpus(Error **errp)
 #elif defined(TARGET_MIPS)
        info->value->has_PC = true;
        info->value->PC = env->active_tc.PC;
+#elif defined(TARGET_TRICORE)
+        info->value->has_PC = true;
+        info->value->PC = env->PC;
 #endif

        /* XXX: waiting for the qapi to support GSList */
@@ -1469,21 +1556,24 @@ void qmp_inject_nmi(Error **errp)
            apic_deliver_nmi(cpu->apic_state);
        }
    }
-#elif defined(TARGET_S390X)
-    CPUState *cs;
-    S390CPU *cpu;
-
-    CPU_FOREACH(cs) {
-        cpu = S390_CPU(cs);
-        if (cpu->env.cpu_num == monitor_get_cpu_index()) {
-            if (s390_cpu_restart(S390_CPU(cs)) == -1) {
-                error_set(errp, QERR_UNSUPPORTED);
-                return;
-            }
-            break;
-        }
-    }
 #else
-    error_set(errp, QERR_UNSUPPORTED);
+    nmi_monitor_handle(monitor_get_cpu_index(), errp);
 #endif
 }
+
+void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
+{
+    if (!use_icount) {
+        return;
+    }
+
+    cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
+                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
+    if (icount_align_option) {
+        cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
+        cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
+    } else {
+        cpu_fprintf(f, "Max guest delay     NA\n");
+        cpu_fprintf(f, "Max guest advance   NA\n");
+    }
+}
--- a/cputlb.c
+++ b/cputlb.c
@@ -60,8 +60,10 @@ void tlb_flush(CPUState *cpu, int flush_global)
    cpu->current_tb = NULL;

    memset(env->tlb_table, -1, sizeof(env->tlb_table));
+    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));

+    env->vtlb_index = 0;
    env->tlb_flush_addr = -1;
    env->tlb_flush_mask = 0;
    tlb_flush_count++;
@@ -108,6 +110,14 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
    }

+    /* check whether there are entries that need to be flushed in the vtlb */
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        int k;
+        for (k = 0; k < CPU_VTLB_SIZE; k++) {
+            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
+        }
+    }
+
    tb_flush_jmp_cache(cpu, addr);
 }

@@ -172,6 +182,11 @@ void cpu_tlb_reset_dirty_all(ram_addr_t start1, ram_addr_t length)
                tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
                                      start1, length);
            }
+
+            for (i = 0; i < CPU_VTLB_SIZE; i++) {
+                tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
+                                      start1, length);
+            }
        }
    }
 }
@@ -195,6 +210,13 @@ void tlb_set_dirty(CPUArchState *env, target_ulong vaddr)
    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
        tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
    }
+
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        int k;
+        for (k = 0; k < CPU_VTLB_SIZE; k++) {
+            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
+        }
+    }
 }

 /* Our TLB does not support large pages, so remember the area covered by
@@ -235,6 +257,7 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
    uintptr_t addend;
    CPUTLBEntry *te;
    hwaddr iotlb, xlat, sz;
+    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;

    assert(size >= TARGET_PAGE_SIZE);
    if (size != TARGET_PAGE_SIZE) {
@@ -267,8 +290,14 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
                                            prot, &address);

    index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    env->iotlb[mmu_idx][index] = iotlb - vaddr;
    te = &env->tlb_table[mmu_idx][index];
+
+    /* do not discard the translation in te, evict it into a victim tlb */
+    env->tlb_v_table[mmu_idx][vidx] = *te;
+    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
+
+    /* refill the tlb */
+    env->iotlb[mmu_idx][index] = iotlb - vaddr;
    te->addend = addend - vaddr;
    if (prot & PAGE_READ) {
        te->addr_read = address;
--- a/default-configs/mips-softmmu.mak
+++ b/default-configs/mips-softmmu.mak
@@ -32,6 +32,5 @@ CONFIG_G364FB=y
 CONFIG_I8259=y
 CONFIG_JAZZ_LED=y
 CONFIG_MC146818RTC=y
-CONFIG_VT82C686=y
 CONFIG_ISA_TESTDEV=y
 CONFIG_EMPTY_SLOT=y
--- a/default-configs/mips64-softmmu.mak
+++ b/default-configs/mips64-softmmu.mak
@@ -32,6 +32,5 @@ CONFIG_G364FB=y
 CONFIG_I8259=y
 CONFIG_JAZZ_LED=y
 CONFIG_MC146818RTC=y
-CONFIG_VT82C686=y
 CONFIG_ISA_TESTDEV=y
 CONFIG_EMPTY_SLOT=y
--- a/default-configs/mipsel-softmmu.mak
+++ b/default-configs/mipsel-softmmu.mak
@@ -32,6 +32,5 @@ CONFIG_G364FB=y
 CONFIG_I8259=y
 CONFIG_JAZZ_LED=y
 CONFIG_MC146818RTC=y
-CONFIG_VT82C686=y
 CONFIG_ISA_TESTDEV=y
 CONFIG_EMPTY_SLOT=y
--- a/default-configs/ppc-softmmu.mak
+++ b/default-configs/ppc-softmmu.mak
@@ -45,8 +45,8 @@ CONFIG_PREP=y
 CONFIG_MAC=y
 CONFIG_E500=y
 CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
+CONFIG_ETSEC=y
+CONFIG_LIBDECNUMBER=y
 # For PReP
 CONFIG_MC146818RTC=y
-CONFIG_ETSEC=y
 CONFIG_ISA_TESTDEV=y
-CONFIG_LIBDECNUMBER=y
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -46,6 +46,8 @@ CONFIG_PREP=y
 CONFIG_MAC=y
 CONFIG_E500=y
 CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
+CONFIG_ETSEC=y
+CONFIG_LIBDECNUMBER=y
 # For pSeries
 CONFIG_XICS=$(CONFIG_PSERIES)
 CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
@@ -58,4 +60,3 @@ CONFIG_I82374=y
 CONFIG_I8257=y
 CONFIG_MC146818RTC=y
 CONFIG_ISA_TESTDEV=y
-CONFIG_LIBDECNUMBER=y
--- a/default-configs/tricore-softmmu.mak
+++ b/default-configs/tricore-softmmu.mak
--- a/device-hotplug.c
+++ b/device-hotplug.c
@@ -24,6 +24,7 @@

 #include "hw/hw.h"
 #include "hw/boards.h"
+#include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "qemu/config-file.h"
 #include "sysemu/sysemu.h"
@@ -76,6 +77,6 @@ void drive_hot_add(Monitor *mon, const QDict *qdict)

 err:
    if (dinfo) {
-        drive_del(dinfo);
+        blk_unref(blk_by_legacy_dinfo(dinfo));
    }
 }
--- a/device_tree.c
+++ b/device_tree.c
@@ -20,6 +20,7 @@

 #include "config.h"
 #include "qemu-common.h"
+#include "qemu/error-report.h"
 #include "sysemu/device_tree.h"
 #include "sysemu/sysemu.h"
 #include "hw/loader.h"
@@ -59,13 +60,13 @@ void *create_device_tree(int *sizep)
    }
    ret = fdt_open_into(fdt, fdt, *sizep);
    if (ret) {
-        fprintf(stderr, "Unable to copy device tree in memory\n");
+        error_report("Unable to copy device tree in memory");
        exit(1);
    }

    return fdt;
 fail:
-    fprintf(stderr, "%s Couldn't create dt: %s\n", __func__, fdt_strerror(ret));
+    error_report("%s Couldn't create dt: %s", __func__, fdt_strerror(ret));
    exit(1);
 }

@@ -79,8 +80,8 @@ void *load_device_tree(const char *filename_path, int *sizep)
    *sizep = 0;
    dt_size = get_image_size(filename_path);
    if (dt_size < 0) {
-        printf("Unable to get size of device tree file '%s'\n",
-            filename_path);
+        error_report("Unable to get size of device tree file '%s'",
+                     filename_path);
        goto fail;
    }

@@ -92,21 +93,21 @@ void *load_device_tree(const char *filename_path, int *sizep)

    dt_file_load_size = load_image(filename_path, fdt);
    if (dt_file_load_size < 0) {
-        printf("Unable to open device tree file '%s'\n",
-               filename_path);
+        error_report("Unable to open device tree file '%s'",
+                     filename_path);
        goto fail;
    }

    ret = fdt_open_into(fdt, fdt, dt_size);
    if (ret) {
-        printf("Unable to copy device tree in memory\n");
+        error_report("Unable to copy device tree in memory");
        goto fail;
    }

    /* Check sanity of device tree */
    if (fdt_check_header(fdt)) {
-        printf ("Device tree file loaded into memory is invalid: %s\n",
-            filename_path);
+        error_report("Device tree file loaded into memory is invalid: %s",
+                     filename_path);
        goto fail;
    }
    *sizep = dt_size;
@@ -123,8 +124,8 @@ static int findnode_nofail(void *fdt, const char *node_path)

    offset = fdt_path_offset(fdt, node_path);
    if (offset < 0) {
-        fprintf(stderr, "%s Couldn't find node %s: %s\n", __func__, node_path,
-                fdt_strerror(offset));
+        error_report("%s Couldn't find node %s: %s", __func__, node_path,
+                     fdt_strerror(offset));
        exit(1);
    }

@@ -138,8 +139,8 @@ int qemu_fdt_setprop(void *fdt, const char *node_path,

    r = fdt_setprop(fdt, findnode_nofail(fdt, node_path), property, val, size);
    if (r < 0) {
-        fprintf(stderr, "%s: Couldn't set %s/%s: %s\n", __func__, node_path,
-                property, fdt_strerror(r));
+        error_report("%s: Couldn't set %s/%s: %s", __func__, node_path,
+                     property, fdt_strerror(r));
        exit(1);
    }

@@ -153,8 +154,8 @@ int qemu_fdt_setprop_cell(void *fdt, const char *node_path,

    r = fdt_setprop_cell(fdt, findnode_nofail(fdt, node_path), property, val);
    if (r < 0) {
-        fprintf(stderr, "%s: Couldn't set %s/%s = %#08x: %s\n", __func__,
-                node_path, property, val, fdt_strerror(r));
+        error_report("%s: Couldn't set %s/%s = %#08x: %s", __func__,
+                     node_path, property, val, fdt_strerror(r));
        exit(1);
    }

@@ -175,8 +176,8 @@ int qemu_fdt_setprop_string(void *fdt, const char *node_path,

    r = fdt_setprop_string(fdt, findnode_nofail(fdt, node_path), property, string);
    if (r < 0) {
-        fprintf(stderr, "%s: Couldn't set %s/%s = %s: %s\n", __func__,
-                node_path, property, string, fdt_strerror(r));
+        error_report("%s: Couldn't set %s/%s = %s: %s", __func__,
+                     node_path, property, string, fdt_strerror(r));
        exit(1);
    }

@@ -193,8 +194,8 @@ const void *qemu_fdt_getprop(void *fdt, const char *node_path,
    }
    r = fdt_getprop(fdt, findnode_nofail(fdt, node_path), property, lenp);
    if (!r) {
-        fprintf(stderr, "%s: Couldn't get %s/%s: %s\n", __func__,
-                node_path, property, fdt_strerror(*lenp));
+        error_report("%s: Couldn't get %s/%s: %s", __func__,
+                     node_path, property, fdt_strerror(*lenp));
        exit(1);
    }
    return r;
@@ -206,8 +207,8 @@ uint32_t qemu_fdt_getprop_cell(void *fdt, const char *node_path,
    int len;
    const uint32_t *p = qemu_fdt_getprop(fdt, node_path, property, &len);
    if (len != 4) {
-        fprintf(stderr, "%s: %s/%s not 4 bytes long (not a cell?)\n",
-                __func__, node_path, property);
+        error_report("%s: %s/%s not 4 bytes long (not a cell?)",
+                     __func__, node_path, property);
        exit(1);
    }
    return be32_to_cpu(*p);
@@ -219,8 +220,8 @@ uint32_t qemu_fdt_get_phandle(void *fdt, const char *path)

    r = fdt_get_phandle(fdt, findnode_nofail(fdt, path));
    if (r == 0) {
-        fprintf(stderr, "%s: Couldn't get phandle for %s: %s\n", __func__,
-                path, fdt_strerror(r));
+        error_report("%s: Couldn't get phandle for %s: %s", __func__,
+                     path, fdt_strerror(r));
        exit(1);
    }

@@ -265,8 +266,8 @@ int qemu_fdt_nop_node(void *fdt, const char *node_path)

    r = fdt_nop_node(fdt, findnode_nofail(fdt, node_path));
    if (r < 0) {
-        fprintf(stderr, "%s: Couldn't nop node %s: %s\n", __func__, node_path,
-                fdt_strerror(r));
+        error_report("%s: Couldn't nop node %s: %s", __func__, node_path,
+                     fdt_strerror(r));
        exit(1);
    }

@@ -294,8 +295,8 @@ int qemu_fdt_add_subnode(void *fdt, const char *name)

    retval = fdt_add_subnode(fdt, parent, basename);
    if (retval < 0) {
-        fprintf(stderr, "FDT: Failed to create subnode %s: %s\n", name,
-                fdt_strerror(retval));
+        error_report("FDT: Failed to create subnode %s: %s", name,
+                     fdt_strerror(retval));
        exit(1);
    }

--- a/disas/arm-a64.cc
+++ b/disas/arm-a64.cc
@@ -39,7 +39,7 @@ public:
    ~QEMUDisassembler() { }

 protected:
-    void ProcessOutput(Instruction *instr) {
+    virtual void ProcessOutput(const Instruction *instr) {
        fprintf(stream_, "%08" PRIx32 "      %s",
                instr->InstructionBits(), GetOutput());
    }
--- a/disas/libvixl/README
+++ b/disas/libvixl/README
@@ -2,7 +2,7 @@
 The code in this directory is a subset of libvixl:
 https://github.com/armvixl/vixl
 (specifically, it is the set of files needed for disassembly only,
-taken from libvixl 1.4).
+taken from libvixl 1.6).
 Bugfixes should preferably be sent upstream initially.

 The disassembler does not currently support the entire A64 instruction
--- a/disas/libvixl/a64/assembler-a64.h
+++ b/disas/libvixl/a64/assembler-a64.h
@@ -28,9 +28,11 @@
 #define VIXL_A64_ASSEMBLER_A64_H_

 #include <list>
+#include <stack>

 #include "globals.h"
 #include "utils.h"
+#include "code-buffer.h"
 #include "a64/instructions-a64.h"

 namespace vixl {
@@ -167,6 +169,11 @@ class CPURegister {
    return type_ == kFPRegister;
  }

+  bool IsW() const { return IsValidRegister() && Is32Bits(); }
+  bool IsX() const { return IsValidRegister() && Is64Bits(); }
+  bool IsS() const { return IsValidFPRegister() && Is32Bits(); }
+  bool IsD() const { return IsValidFPRegister() && Is64Bits(); }
+
  const Register& W() const;
  const Register& X() const;
  const FPRegister& S() const;
@@ -190,12 +197,12 @@ class CPURegister {

 class Register : public CPURegister {
 public:
-  explicit Register() : CPURegister() {}
+  Register() : CPURegister() {}
  inline explicit Register(const CPURegister& other)
      : CPURegister(other.code(), other.size(), other.type()) {
    VIXL_ASSERT(IsValidRegister());
  }
-  explicit Register(unsigned code, unsigned size)
+  Register(unsigned code, unsigned size)
      : CPURegister(code, size, kRegister) {}

  bool IsValid() const {
@@ -535,7 +542,7 @@ class Operand {
 class MemOperand {
 public:
  explicit MemOperand(Register base,
-                      ptrdiff_t offset = 0,
+                      int64_t offset = 0,
                      AddrMode addrmode = Offset);
  explicit MemOperand(Register base,
                      Register regoffset,
@@ -551,7 +558,7 @@ class MemOperand {

  const Register& base() const { return base_; }
  const Register& regoffset() const { return regoffset_; }
-  ptrdiff_t offset() const { return offset_; }
+  int64_t offset() const { return offset_; }
  AddrMode addrmode() const { return addrmode_; }
  Shift shift() const { return shift_; }
  Extend extend() const { return extend_; }
@@ -564,7 +571,7 @@ class MemOperand {
 private:
  Register base_;
  Register regoffset_;
-  ptrdiff_t offset_;
+  int64_t offset_;
  AddrMode addrmode_;
  Shift shift_;
  Extend extend_;
@@ -574,71 +581,233 @@ class MemOperand {

 class Label {
 public:
-  Label() : is_bound_(false), link_(NULL), target_(NULL) {}
+  Label() : location_(kLocationUnbound) {}
  ~Label() {
    // If the label has been linked to, it needs to be bound to a target.
    VIXL_ASSERT(!IsLinked() || IsBound());
  }

-  inline Instruction* link() const { return link_; }
-  inline Instruction* target() const { return target_; }
-
-  inline bool IsBound() const { return is_bound_; }
-  inline bool IsLinked() const { return link_ != NULL; }
-
-  inline void set_link(Instruction* new_link) { link_ = new_link; }
-
-  static const int kEndOfChain = 0;
+  inline bool IsBound() const { return location_ >= 0; }
+  inline bool IsLinked() const { return !links_.empty(); }

 private:
-  // Indicates if the label has been bound, ie its location is fixed.
-  bool is_bound_;
-  // Branches instructions branching to this label form a chained list, with
-  // their offset indicating where the next instruction is located.
-  // link_ points to the latest branch instruction generated branching to this
-  // branch.
-  // If link_ is not NULL, the label has been linked to.
-  Instruction* link_;
+  // The list of linked instructions is stored in a stack-like structure. We
+  // don't use std::stack directly because it's slow for the common case where
+  // only one or two instructions refer to a label, and labels themselves are
+  // short-lived. This class behaves like std::stack, but the first few links
+  // are preallocated (configured by kPreallocatedLinks).
+  //
+  // Links beyond kPreallocatedLinks are kept in a heap-allocated std::stack.
+  class LinksStack {
+   public:
+    LinksStack() : size_(0), links_extended_(NULL) {}
+    ~LinksStack() {
+      delete links_extended_;  // No-op if the overflow stack was never needed.
+    }
+
+    size_t size() const {
+      return size_;  // Total links: inline slots plus overflow entries.
+    }
+
+    bool empty() const {
+      return size_ == 0;
+    }
+
+    void push(ptrdiff_t value) {
+      if (size_ < kPreallocatedLinks) {
+        links_[size_] = value;
+      } else {
+        if (links_extended_ == NULL) {
+          links_extended_ = new std::stack<ptrdiff_t>();  // Allocated lazily.
+        }
+        VIXL_ASSERT(size_ == (links_extended_->size() + kPreallocatedLinks));
+        links_extended_->push(value);
+      }
+      size_++;
+    }
+
+    ptrdiff_t top() const {  // Caller must ensure !empty(); not checked here.
+      return (size_ <= kPreallocatedLinks) ? links_[size_ - 1]
+                                           : links_extended_->top();
+    }
+
+    void pop() {  // Caller must ensure !empty(); not checked here.
+      size_--;
+      if (size_ >= kPreallocatedLinks) {  // Removed entry was in the overflow.
+        links_extended_->pop();
+        VIXL_ASSERT(size_ == (links_extended_->size() + kPreallocatedLinks));
+      }
+    }
+
+   private:
+    static const size_t kPreallocatedLinks = 4;  // Number of inline slots.
+
+    size_t size_;
+    ptrdiff_t links_[kPreallocatedLinks];  // Bottom entries of the stack.
+    std::stack<ptrdiff_t> * links_extended_;  // Overflow; NULL until needed.
+  };
+
+  inline ptrdiff_t location() const { return location_; }
+
+  inline void Bind(ptrdiff_t location) {
+    // Labels can only be bound once (IsBound() means location_ >= 0).
+    VIXL_ASSERT(!IsBound());
+    location_ = location;
+  }
+
+  inline void AddLink(ptrdiff_t instruction) {
+    // If a label is bound, the assembler already has the information it needs
+    // to write the instruction, so there is no need to add it to links_.
+    VIXL_ASSERT(!IsBound());
+    links_.push(instruction);  // Stored as an offset (see links_).
+  }
+
+  inline ptrdiff_t GetAndRemoveNextLink() {
+    VIXL_ASSERT(IsLinked());
+    ptrdiff_t link = links_.top();  // Most recently added link.
+    links_.pop();
+    return link;
+  }
+
+  // The offsets of the instructions that have linked to this label.
+  LinksStack links_;
  // The label location.
-  Instruction* target_;
+  ptrdiff_t location_;

+  static const ptrdiff_t kLocationUnbound = -1;
+
+  // It is not safe to copy labels, so disable the copy constructor by declaring
+  // it private (without an implementation).
+  Label(const Label&);
+
+  // The Assembler class is responsible for binding and linking labels, since
+  // the stored offsets need to be consistent with the Assembler's buffer.
  friend class Assembler;
 };


-// TODO: Obtain better values for these, based on real-world data.
-const int kLiteralPoolCheckInterval = 4 * KBytes;
-const int kRecommendedLiteralPoolRange = 2 * kLiteralPoolCheckInterval;
-
-
-// Control whether a branch over the literal pool should also be emitted. This
-// is needed if the literal pool has to be emitted in the middle of the JITted
-// code.
-enum LiteralPoolEmitOption {
-  JumpRequired,
-  NoJumpRequired
-};
-
-
-// Literal pool entry.
-class Literal {
+// A literal is a 32-bit or 64-bit piece of data stored in the instruction
+// stream and loaded through a pc relative load. The same literal can be
+// referred to by multiple instructions but a literal can only reside at one
+// place in memory. A literal can be used by a load before or after being
+// placed in memory.
+//
+// Internally an offset of 0 is associated with a literal which has been
+// neither used nor placed. Then two possibilities arise:
+//  1) the literal is placed; its offset (stored as offset + 1) is used to
+//     resolve any subsequent load using the literal.
+//  2) the literal is not placed and the stored value is the offset of the
+//     last load using it (stored as -offset - 1). If multiple loads refer to
+//     this literal then the last load holds the offset of the preceding load
+//     and all loads form a chain. Once the literal is placed all the loads in
+//     the chain are resolved and future loads fall back to possibility 1.
+class RawLiteral {
 public:
-  Literal(Instruction* pc, uint64_t imm, unsigned size)
-      : pc_(pc), value_(imm), size_(size) {}
+  RawLiteral() : size_(0), offset_(0), raw_value_(0) {}

- private:
-  Instruction* pc_;
-  int64_t value_;
-  unsigned size_;
+  size_t size() {  // Size in bytes; asserted to be W- or X-register sized.
+    VIXL_STATIC_ASSERT(kDRegSizeInBytes == kXRegSizeInBytes);
+    VIXL_STATIC_ASSERT(kSRegSizeInBytes == kWRegSizeInBytes);
+    VIXL_ASSERT((size_ == kXRegSizeInBytes) || (size_ == kWRegSizeInBytes));
+    return size_;
+  }
+  uint64_t raw_value64() {
+    VIXL_ASSERT(size_ == kXRegSizeInBytes);
+    return raw_value_;
+  }
+  uint32_t raw_value32() {
+    VIXL_ASSERT(size_ == kWRegSizeInBytes);
+    VIXL_ASSERT(is_uint32(raw_value_) || is_int32(raw_value_));
+    return static_cast<uint32_t>(raw_value_);
+  }
+  bool IsUsed() { return offset_ < 0; }
+  bool IsPlaced() { return offset_ > 0; }
+
+ protected:
+  ptrdiff_t offset() {
+    VIXL_ASSERT(IsPlaced());
+    return offset_ - 1;  // Undo the +1 bias applied by set_offset().
+  }
+  void set_offset(ptrdiff_t offset) {
+    VIXL_ASSERT(offset >= 0);
+    VIXL_ASSERT(IsWordAligned(offset));
+    VIXL_ASSERT(!IsPlaced());
+    offset_ = offset + 1;  // Biased so a placed literal is always > 0.
+  }
+  ptrdiff_t last_use() {
+    VIXL_ASSERT(IsUsed());
+    return -offset_ - 1;  // Decode the value stored by set_last_use().
+  }
+  void set_last_use(ptrdiff_t offset) {
+    VIXL_ASSERT(offset >= 0);
+    VIXL_ASSERT(IsWordAligned(offset));
+    VIXL_ASSERT(!IsPlaced());
+    offset_ = -offset - 1;  // Always < 0: marks "used but not placed".
+  }
+
+  size_t size_;  // In bytes.
+  ptrdiff_t offset_;  // 0: unused; > 0: placed; < 0: used but not placed.
+  uint64_t raw_value_;  // Raw bytes of the literal's value.

  friend class Assembler;
 };


+template <typename T>
+class Literal : public RawLiteral {  // Typed constructor for RawLiteral.
+ public:
+  explicit Literal(T value) {
+    size_ = sizeof(value);  // size() later asserts this is W- or X-sized.
+    memcpy(&raw_value_, &value, sizeof(value));  // Copies T's bytes as-is.
+  }  // NOTE(review): assumes sizeof(T) <= sizeof(raw_value_); not checked.
+};
+
+
+// Control whether or not position-independent code should be emitted.
+enum PositionIndependentCodeOption {
+  // All code generated will be position-independent; all branches and
+  // references to labels generated with the Label class will use PC-relative
+  // addressing.
+  PositionIndependentCode,
+
+  // Allow VIXL to generate code that refers to absolute addresses. With this
+  // option, it will not be possible to copy the code buffer and run it from a
+  // different address; code must be generated in its final location.
+  PositionDependentCode,
+
+  // Allow VIXL to assume that the bottom 12 bits of the address will be
+  // constant, but that the top 48 bits may change. This allows `adrp` to
+  // function in systems which copy code between pages, but otherwise maintain
+  // 4KB page alignment.
+  PageOffsetDependentCode
+};
+
+
+// Control how scaled- and unscaled-offset loads and stores are generated.
+enum LoadStoreScalingOption {
+  // Prefer scaled-immediate-offset instructions, but emit unscaled-offset,
+  // register-offset, pre-index or post-index instructions if necessary.
+  PreferScaledOffset,
+
+  // Prefer unscaled-immediate-offset instructions, but emit scaled-offset,
+  // register-offset, pre-index or post-index instructions if necessary.
+  PreferUnscaledOffset,
+
+  // Require scaled-immediate-offset instructions.
+  RequireScaledOffset,
+
+  // Require unscaled-immediate-offset instructions.
+  RequireUnscaledOffset
+};
+
+
 // Assembler.
 class Assembler {
 public:
-  Assembler(byte* buffer, unsigned buffer_size);
+  Assembler(size_t capacity,
+            PositionIndependentCodeOption pic = PositionIndependentCode);
+  Assembler(byte* buffer, size_t capacity,
+            PositionIndependentCodeOption pic = PositionIndependentCode);

  // The destructor asserts that one of the following is true:
  //  * The Assembler object has not been used.
@@ -650,9 +819,6 @@ class Assembler {

  // Start generating code from the beginning of the buffer, discarding any code
  // and data that has already been emitted into the buffer.
-  //
-  // In order to avoid any accidental transfer of state, Reset ASSERTs that the
-  // constant pool is not blocked.
  void Reset();

  // Finalize a code buffer of generated instructions. This function must be
@@ -662,12 +828,49 @@ class Assembler {
  // Label.
  // Bind a label to the current PC.
  void bind(Label* label);
-  int UpdateAndGetByteOffsetTo(Label* label);
-  inline int UpdateAndGetInstructionOffsetTo(Label* label) {
-    VIXL_ASSERT(Label::kEndOfChain == 0);
-    return UpdateAndGetByteOffsetTo(label) >> kInstructionSizeLog2;
+
+  // Bind a label to a specified offset from the start of the buffer.
+  void BindToOffset(Label* label, ptrdiff_t offset);
+
+  // Place a literal at the current PC.
+  void place(RawLiteral* literal);
+
+  ptrdiff_t CursorOffset() const {
+    return buffer_->CursorOffset();
  }

+  ptrdiff_t BufferEndOffset() const {
+    return static_cast<ptrdiff_t>(buffer_->capacity());
+  }
+
+  // Return the address of an offset in the buffer.
+  template <typename T>
+  inline T GetOffsetAddress(ptrdiff_t offset) {
+    VIXL_STATIC_ASSERT(sizeof(T) >= sizeof(uintptr_t));
+    return buffer_->GetOffsetAddress<T>(offset);
+  }
+
+  // Return the address of a bound label.
+  template <typename T>
+  inline T GetLabelAddress(const Label * label) {
+    VIXL_ASSERT(label->IsBound());
+    VIXL_STATIC_ASSERT(sizeof(T) >= sizeof(uintptr_t));
+    return GetOffsetAddress<T>(label->location());
+  }
+
+  // Return the address of the cursor.
+  template <typename T>
+  inline T GetCursorAddress() {
+    VIXL_STATIC_ASSERT(sizeof(T) >= sizeof(uintptr_t));
+    return GetOffsetAddress<T>(CursorOffset());
+  }
+
+  // Return the address of the start of the buffer.
+  template <typename T>
+  inline T GetStartAddress() {
+    VIXL_STATIC_ASSERT(sizeof(T) >= sizeof(uintptr_t));
+    return GetOffsetAddress<T>(0);
+  }

  // Instruction set functions.

@@ -733,6 +936,12 @@ class Assembler {
  // Calculate the address of a PC offset.
  void adr(const Register& rd, int imm21);

+  // Calculate the page address of a label.
+  void adrp(const Register& rd, Label* label);
+
+  // Calculate the page address of a PC offset.
+  void adrp(const Register& rd, int imm21);
+
  // Data Processing instructions.
  // Add.
  void add(const Register& rd,
@@ -1112,31 +1321,76 @@ class Assembler {

  // Memory instructions.
  // Load integer or FP register.
-  void ldr(const CPURegister& rt, const MemOperand& src);
+  void ldr(const CPURegister& rt, const MemOperand& src,
+           LoadStoreScalingOption option = PreferScaledOffset);

  // Store integer or FP register.
-  void str(const CPURegister& rt, const MemOperand& dst);
+  void str(const CPURegister& rt, const MemOperand& dst,
+           LoadStoreScalingOption option = PreferScaledOffset);

  // Load word with sign extension.
-  void ldrsw(const Register& rt, const MemOperand& src);
+  void ldrsw(const Register& rt, const MemOperand& src,
+             LoadStoreScalingOption option = PreferScaledOffset);

  // Load byte.
-  void ldrb(const Register& rt, const MemOperand& src);
+  void ldrb(const Register& rt, const MemOperand& src,
+            LoadStoreScalingOption option = PreferScaledOffset);

  // Store byte.
-  void strb(const Register& rt, const MemOperand& dst);
+  void strb(const Register& rt, const MemOperand& dst,
+            LoadStoreScalingOption option = PreferScaledOffset);

  // Load byte with sign extension.
-  void ldrsb(const Register& rt, const MemOperand& src);
+  void ldrsb(const Register& rt, const MemOperand& src,
+             LoadStoreScalingOption option = PreferScaledOffset);

  // Load half-word.
-  void ldrh(const Register& rt, const MemOperand& src);
+  void ldrh(const Register& rt, const MemOperand& src,
+            LoadStoreScalingOption option = PreferScaledOffset);

  // Store half-word.
-  void strh(const Register& rt, const MemOperand& dst);
+  void strh(const Register& rt, const MemOperand& dst,
+            LoadStoreScalingOption option = PreferScaledOffset);

  // Load half-word with sign extension.
-  void ldrsh(const Register& rt, const MemOperand& src);
+  void ldrsh(const Register& rt, const MemOperand& src,
+             LoadStoreScalingOption option = PreferScaledOffset);
+
+  // Load integer or FP register (with unscaled offset).
+  void ldur(const CPURegister& rt, const MemOperand& src,
+            LoadStoreScalingOption option = PreferUnscaledOffset);
+
+  // Store integer or FP register (with unscaled offset).
+  void stur(const CPURegister& rt, const MemOperand& src,
+            LoadStoreScalingOption option = PreferUnscaledOffset);
+
+  // Load word with sign extension (and unscaled offset).
+  void ldursw(const Register& rt, const MemOperand& src,
+              LoadStoreScalingOption option = PreferUnscaledOffset);
+
+  // Load byte (with unscaled offset).
+  void ldurb(const Register& rt, const MemOperand& src,
+             LoadStoreScalingOption option = PreferUnscaledOffset);
+
+  // Store byte (with unscaled offset).
+  void sturb(const Register& rt, const MemOperand& dst,
+             LoadStoreScalingOption option = PreferUnscaledOffset);
+
+  // Load byte with sign extension (and unscaled offset).
+  void ldursb(const Register& rt, const MemOperand& src,
+              LoadStoreScalingOption option = PreferUnscaledOffset);
+
+  // Load half-word (with unscaled offset).
+  void ldurh(const Register& rt, const MemOperand& src,
+             LoadStoreScalingOption option = PreferUnscaledOffset);
+
+  // Store half-word (with unscaled offset).
+  void sturh(const Register& rt, const MemOperand& dst,
+             LoadStoreScalingOption option = PreferUnscaledOffset);
+
+  // Load half-word with sign extension (and unscaled offset).
+  void ldursh(const Register& rt, const MemOperand& src,
+              LoadStoreScalingOption option = PreferUnscaledOffset);

  // Load integer or FP register pair.
  void ldp(const CPURegister& rt, const CPURegister& rt2,
@@ -1157,14 +1411,90 @@ class Assembler {
  void stnp(const CPURegister& rt, const CPURegister& rt2,
            const MemOperand& dst);

-  // Load literal to register.
-  void ldr(const Register& rt, uint64_t imm);
+  // Load integer or FP register from literal pool.
+  void ldr(const CPURegister& rt, RawLiteral* literal);

-  // Load double precision floating point literal to FP register.
-  void ldr(const FPRegister& ft, double imm);
+  // Load word with sign extension from literal pool.
+  void ldrsw(const Register& rt, RawLiteral* literal);
+
+  // Load integer or FP register from pc + imm19 << 2.
+  void ldr(const CPURegister& rt, int imm19);
+
+  // Load word with sign extension from pc + imm19 << 2.
+  void ldrsw(const Register& rt, int imm19);
+
+  // Store exclusive byte.
+  void stxrb(const Register& rs, const Register& rt, const MemOperand& dst);
+
+  // Store exclusive half-word.
+  void stxrh(const Register& rs, const Register& rt, const MemOperand& dst);
+
+  // Store exclusive register.
+  void stxr(const Register& rs, const Register& rt, const MemOperand& dst);
+
+  // Load exclusive byte.
+  void ldxrb(const Register& rt, const MemOperand& src);
+
+  // Load exclusive half-word.
+  void ldxrh(const Register& rt, const MemOperand& src);
+
+  // Load exclusive register.
+  void ldxr(const Register& rt, const MemOperand& src);
+
+  // Store exclusive register pair.
+  void stxp(const Register& rs,
+            const Register& rt,
+            const Register& rt2,
+            const MemOperand& dst);
+
+  // Load exclusive register pair.
+  void ldxp(const Register& rt, const Register& rt2, const MemOperand& src);
+
+  // Store-release exclusive byte.
+  void stlxrb(const Register& rs, const Register& rt, const MemOperand& dst);
+
+  // Store-release exclusive half-word.
+  void stlxrh(const Register& rs, const Register& rt, const MemOperand& dst);
+
+  // Store-release exclusive register.
+  void stlxr(const Register& rs, const Register& rt, const MemOperand& dst);
+
+  // Load-acquire exclusive byte.
+  void ldaxrb(const Register& rt, const MemOperand& src);
+
+  // Load-acquire exclusive half-word.
+  void ldaxrh(const Register& rt, const MemOperand& src);
+
+  // Load-acquire exclusive register.
+  void ldaxr(const Register& rt, const MemOperand& src);
+
+  // Store-release exclusive register pair.
+  void stlxp(const Register& rs,
+             const Register& rt,
+             const Register& rt2,
+             const MemOperand& dst);
+
+  // Load-acquire exclusive register pair.
+  void ldaxp(const Register& rt, const Register& rt2, const MemOperand& src);
+
+  // Store-release byte.
+  void stlrb(const Register& rt, const MemOperand& dst);
+
+  // Store-release half-word.
+  void stlrh(const Register& rt, const MemOperand& dst);
+
+  // Store-release register.
+  void stlr(const Register& rt, const MemOperand& dst);
+
+  // Load-acquire byte.
+  void ldarb(const Register& rt, const MemOperand& src);
+
+  // Load-acquire half-word.
+  void ldarh(const Register& rt, const MemOperand& src);
+
+  // Load-acquire register.
+  void ldar(const Register& rt, const MemOperand& src);

-  // Load single precision floating point literal to FP register.
-  void ldr(const FPRegister& ft, float imm);

  // Move instructions. The default shift of -1 indicates that the move
  // instruction will calculate an appropriate 16-bit immediate and left shift
@@ -1214,6 +1544,9 @@ class Assembler {
  // System hint.
  void hint(SystemHint code);

+  // Clear exclusive monitor.
+  void clrex(int imm4 = 0xf);
+
  // Data memory barrier.
  void dmb(BarrierDomain domain, BarrierType type);

@@ -1375,25 +1708,26 @@ class Assembler {
  inline void dci(Instr raw_inst) { Emit(raw_inst); }

  // Emit 32 bits of data into the instruction stream.
-  inline void dc32(uint32_t data) { EmitData(&data, sizeof(data)); }
+  inline void dc32(uint32_t data) {
+    VIXL_ASSERT(buffer_monitor_ > 0);
+    buffer_->Emit32(data);
+  }

  // Emit 64 bits of data into the instruction stream.
-  inline void dc64(uint64_t data) { EmitData(&data, sizeof(data)); }
+  inline void dc64(uint64_t data) {
+    VIXL_ASSERT(buffer_monitor_ > 0);
+    buffer_->Emit64(data);
+  }

  // Copy a string into the instruction stream, including the terminating NULL
-  // character. The instruction pointer (pc_) is then aligned correctly for
+  // character. The instruction pointer is then aligned correctly for
  // subsequent instructions.
-  void EmitStringData(const char * string) {
+  void EmitString(const char * string) {
    VIXL_ASSERT(string != NULL);
+    VIXL_ASSERT(buffer_monitor_ > 0);

-    size_t len = strlen(string) + 1;
-    EmitData(string, len);
-
-    // Pad with NULL characters until pc_ is aligned.
-    const char pad[] = {'\0', '\0', '\0', '\0'};
-    VIXL_STATIC_ASSERT(sizeof(pad) == kInstructionSize);
-    Instruction* next_pc = AlignUp(pc_, kInstructionSize);
-    EmitData(&pad, next_pc - pc_);
+    buffer_->EmitString(string);
+    buffer_->Align();
  }

  // Code generation helpers.
@@ -1429,6 +1763,11 @@ class Assembler {
    return rt2.code() << Rt2_offset;
  }

+  static Instr Rs(CPURegister rs) {
+    VIXL_ASSERT(rs.code() != kSPRegInternalCode);
+    return rs.code() << Rs_offset;
+  }
+
  // These encoding functions allow the stack pointer to be encoded, and
  // disallow the zero register.
  static Instr RdSP(Register rd) {
@@ -1619,6 +1958,11 @@ class Assembler {
    return imm7 << ImmHint_offset;
  }

+  static Instr CRm(int imm4) {
+    VIXL_ASSERT(is_uint4(imm4));
+    return imm4 << CRm_offset;
+  }
+
  static Instr ImmBarrierDomain(int imm2) {
    VIXL_ASSERT(is_uint2(imm2));
    return imm2 << ImmBarrierDomain_offset;
@@ -1659,55 +2003,73 @@ class Assembler {
    return scale << FPScale_offset;
  }

-  // Size of the code generated in bytes
-  uint64_t SizeOfCodeGenerated() const {
-    VIXL_ASSERT((pc_ >= buffer_) && (pc_ < (buffer_ + buffer_size_)));
-    return pc_ - buffer_;
-  }
-
  // Size of the code generated since label to the current position.
-  uint64_t SizeOfCodeGeneratedSince(Label* label) const {
+  size_t SizeOfCodeGeneratedSince(Label* label) const {
    VIXL_ASSERT(label->IsBound());
-    VIXL_ASSERT((pc_ >= label->target()) && (pc_ < (buffer_ + buffer_size_)));
-    return pc_ - label->target();
+    return buffer_->OffsetFrom(label->location());
  }

+  size_t BufferCapacity() const { return buffer_->capacity(); }

-  inline void BlockLiteralPool() {
-    literal_pool_monitor_++;
-  }
+  size_t RemainingBufferSpace() const { return buffer_->RemainingBytes(); }

-  inline void ReleaseLiteralPool() {
-    if (--literal_pool_monitor_ == 0) {
-      // Has the literal pool been blocked for too long?
-      VIXL_ASSERT(literals_.empty() ||
-             (pc_ < (literals_.back()->pc_ + kMaxLoadLiteralRange)));
+  void EnsureSpaceFor(size_t amount) {
+    if (buffer_->RemainingBytes() < amount) {
+      size_t capacity = buffer_->capacity();
+      size_t size = buffer_->CursorOffset();
+      do {
+        // TODO(all): refine.
+        capacity *= 2;
+      } while ((capacity - size) <  amount);
+      buffer_->Grow(capacity);
    }
  }

-  inline bool IsLiteralPoolBlocked() {
-    return literal_pool_monitor_ != 0;
+#ifdef DEBUG
+  void AcquireBuffer() {
+    VIXL_ASSERT(buffer_monitor_ >= 0);
+    buffer_monitor_++;
  }

-  void CheckLiteralPool(LiteralPoolEmitOption option = JumpRequired);
-  void EmitLiteralPool(LiteralPoolEmitOption option = NoJumpRequired);
-  size_t LiteralPoolSize();
+  void ReleaseBuffer() {
+    buffer_monitor_--;
+    VIXL_ASSERT(buffer_monitor_ >= 0);
+  }
+#endif

- protected:
-  inline const Register& AppropriateZeroRegFor(const CPURegister& reg) const {
+  inline PositionIndependentCodeOption pic() {
+    return pic_;
+  }
+
+  inline bool AllowPageOffsetDependentCode() {
+    return (pic() == PageOffsetDependentCode) ||
+           (pic() == PositionDependentCode);
+  }
+
+  static inline const Register& AppropriateZeroRegFor(const CPURegister& reg) {
    return reg.Is64Bits() ? xzr : wzr;
  }


+ protected:
  void LoadStore(const CPURegister& rt,
                 const MemOperand& addr,
-                 LoadStoreOp op);
-  static bool IsImmLSUnscaled(ptrdiff_t offset);
-  static bool IsImmLSScaled(ptrdiff_t offset, LSDataSize size);
+                 LoadStoreOp op,
+                 LoadStoreScalingOption option = PreferScaledOffset);
+  static bool IsImmLSUnscaled(int64_t offset);
+  static bool IsImmLSScaled(int64_t offset, LSDataSize size);

+  void LoadStorePair(const CPURegister& rt,
+                     const CPURegister& rt2,
+                     const MemOperand& addr,
+                     LoadStorePairOp op);
+  static bool IsImmLSPair(int64_t offset, LSDataSize size);
+
+  // TODO(all): The third parameter should be passed by reference but gcc 4.8.2
+  // reports a bogus uninitialised warning then.
  void Logical(const Register& rd,
               const Register& rn,
-               const Operand& operand,
+               const Operand operand,
               LogicalOp op);
  void LogicalImmediate(const Register& rd,
                        const Register& rn,
@@ -1717,9 +2079,9 @@ class Assembler {
                        LogicalOp op);
  static bool IsImmLogical(uint64_t value,
                           unsigned width,
-                           unsigned* n,
-                           unsigned* imm_s,
-                           unsigned* imm_r);
+                           unsigned* n = NULL,
+                           unsigned* imm_s = NULL,
+                           unsigned* imm_r = NULL);

  void ConditionalCompare(const Register& rn,
                          const Operand& operand,
@@ -1768,6 +2130,7 @@ class Assembler {
    const CPURegister& rt, const CPURegister& rt2);
  static LoadStorePairNonTemporalOp StorePairNonTemporalOpFor(
    const CPURegister& rt, const CPURegister& rt2);
+  static LoadLiteralOp LoadLiteralOpFor(const CPURegister& rt);


 private:
@@ -1786,10 +2149,6 @@ class Assembler {
                                const Operand& operand,
                                FlagsUpdate S,
                                Instr op);
-  void LoadStorePair(const CPURegister& rt,
-                     const CPURegister& rt2,
-                     const MemOperand& addr,
-                     LoadStorePairOp op);
  void LoadStorePairNonTemporal(const CPURegister& rt,
                                const CPURegister& rt2,
                                const MemOperand& addr,
@@ -1821,75 +2180,110 @@ class Assembler {
                               const FPRegister& fa,
                               FPDataProcessing3SourceOp op);

-  void RecordLiteral(int64_t imm, unsigned size);
+  // Link the current (not-yet-emitted) instruction to the specified label, then
+  // return an offset to be encoded in the instruction. If the label is not yet
+  // bound, an offset of 0 is returned.
+  ptrdiff_t LinkAndGetByteOffsetTo(Label * label);
+  ptrdiff_t LinkAndGetInstructionOffsetTo(Label * label);
+  ptrdiff_t LinkAndGetPageOffsetTo(Label * label);

-  // Emit the instruction at pc_.
+  // A common implementation for the LinkAndGet<Type>OffsetTo helpers.
+  template <int element_shift>
+  ptrdiff_t LinkAndGetOffsetTo(Label* label);
+
+  // Literal load offsets are in words (32-bit).
+  ptrdiff_t LinkAndGetWordOffsetTo(RawLiteral* literal);
+
+  // Emit the instruction in buffer_.
  void Emit(Instr instruction) {
-    VIXL_STATIC_ASSERT(sizeof(*pc_) == 1);
    VIXL_STATIC_ASSERT(sizeof(instruction) == kInstructionSize);
-    VIXL_ASSERT((pc_ + sizeof(instruction)) <= (buffer_ + buffer_size_));
-
-#ifdef DEBUG
-    finalized_ = false;
-#endif
-
-    memcpy(pc_, &instruction, sizeof(instruction));
-    pc_ += sizeof(instruction);
-    CheckBufferSpace();
+    VIXL_ASSERT(buffer_monitor_ > 0);
+    buffer_->Emit32(instruction);
  }

-  // Emit data inline in the instruction stream.
-  void EmitData(void const * data, unsigned size) {
-    VIXL_STATIC_ASSERT(sizeof(*pc_) == 1);
-    VIXL_ASSERT((pc_ + size) <= (buffer_ + buffer_size_));
+  // Buffer where the code is emitted.
+  CodeBuffer* buffer_;
+  PositionIndependentCodeOption pic_;

 #ifdef DEBUG
-    finalized_ = false;
-#endif
-
-    // TODO: Record this 'instruction' as data, so that it can be disassembled
-    // correctly.
-    memcpy(pc_, data, size);
-    pc_ += size;
-    CheckBufferSpace();
-  }
-
-  inline void CheckBufferSpace() {
-    VIXL_ASSERT(pc_ < (buffer_ + buffer_size_));
-    if (pc_ > next_literal_pool_check_) {
-      CheckLiteralPool();
-    }
-  }
-
-  // The buffer into which code and relocation info are generated.
-  Instruction* buffer_;
-  // Buffer size, in bytes.
-  unsigned buffer_size_;
-  Instruction* pc_;
-  std::list<Literal*> literals_;
-  Instruction* next_literal_pool_check_;
-  unsigned literal_pool_monitor_;
-
-  friend class BlockLiteralPoolScope;
-
-#ifdef DEBUG
-  bool finalized_;
+  int64_t buffer_monitor_;
 #endif
 };

-class BlockLiteralPoolScope {
+
+// All Assembler emits MUST acquire/release the underlying code buffer. The
+// helper scope below will do so and optionally ensure the buffer is big enough
+// to receive the emit. It is possible to request the scope not to perform any
+// checks (kNoCheck) if for example it is known in advance the buffer size is
+// adequate or there is some other size checking mechanism in place.
+class CodeBufferCheckScope {
 public:
-  explicit BlockLiteralPoolScope(Assembler* assm) : assm_(assm) {
-    assm_->BlockLiteralPool();
+  // Tell whether or not the scope needs to ensure the associated CodeBuffer
+  // has enough space for the requested size.
+  enum CheckPolicy {
+    kNoCheck,
+    kCheck
+  };
+
+  // Tell whether or not the scope should assert the amount of code emitted
+  // within the scope is consistent with the requested amount.
+  enum AssertPolicy {
+    kNoAssert,    // No assert required.
+    kExactSize,   // The code emitted must be exactly size bytes.
+    kMaximumSize  // The code emitted must be at most size bytes.
+  };
+
+  CodeBufferCheckScope(Assembler* assm,
+                       size_t size,
+                       CheckPolicy check_policy = kCheck,
+                       AssertPolicy assert_policy = kMaximumSize)
+      : assm_(assm) {
+    if (check_policy == kCheck) assm->EnsureSpaceFor(size);
+#ifdef DEBUG
+    assm->bind(&start_);
+    size_ = size;
+    assert_policy_ = assert_policy;
+    assm->AcquireBuffer();
+#else
+    USE(assert_policy);
+#endif
  }

-  ~BlockLiteralPoolScope() {
-    assm_->ReleaseLiteralPool();
+  // This is a shortcut for CodeBufferCheckScope(assm, 0, kNoCheck, kNoAssert).
+  explicit CodeBufferCheckScope(Assembler* assm) : assm_(assm) {
+#ifdef DEBUG
+    size_ = 0;
+    assert_policy_ = kNoAssert;
+    assm->AcquireBuffer();
+#endif
  }

- private:
+  ~CodeBufferCheckScope() {
+#ifdef DEBUG
+    assm_->ReleaseBuffer();
+    switch (assert_policy_) {
+      case kNoAssert: break;
+      case kExactSize:
+        VIXL_ASSERT(assm_->SizeOfCodeGeneratedSince(&start_) == size_);
+        break;
+      case kMaximumSize:
+        VIXL_ASSERT(assm_->SizeOfCodeGeneratedSince(&start_) <= size_);
+        break;
+      default:
+        VIXL_UNREACHABLE();
+    }
+#endif
+  }
+
+ protected:
  Assembler* assm_;
+#ifdef DEBUG
+  Label start_;
+  size_t size_;
+  AssertPolicy assert_policy_;
+#endif
 };
+
 }  // namespace vixl

 #endif  // VIXL_A64_ASSEMBLER_A64_H_
--- a/disas/libvixl/a64/constants-a64.h
+++ b/disas/libvixl/a64/constants-a64.h
@@ -46,13 +46,13 @@ R(24) R(25) R(26) R(27) R(28) R(29) R(30) R(31)

 #define INSTRUCTION_FIELDS_LIST(V_)                                            \
 /* Register fields */                                                          \
-V_(Rd, 4, 0, Bits)                        /* Destination register.     */      \
-V_(Rn, 9, 5, Bits)                        /* First source register.    */      \
-V_(Rm, 20, 16, Bits)                      /* Second source register.   */      \
-V_(Ra, 14, 10, Bits)                      /* Third source register.    */      \
-V_(Rt, 4, 0, Bits)                        /* Load dest / store source. */      \
-V_(Rt2, 14, 10, Bits)                     /* Load second dest /        */      \
-                                         /* store second source.      */       \
+V_(Rd, 4, 0, Bits)                        /* Destination register.        */   \
+V_(Rn, 9, 5, Bits)                        /* First source register.       */   \
+V_(Rm, 20, 16, Bits)                      /* Second source register.      */   \
+V_(Ra, 14, 10, Bits)                      /* Third source register.       */   \
+V_(Rt, 4, 0, Bits)                        /* Load/store register.         */   \
+V_(Rt2, 14, 10, Bits)                     /* Load/store second register.  */   \
+V_(Rs, 20, 16, Bits)                      /* Exclusive access status.     */   \
 V_(PrefetchMode, 4, 0, Bits)                                                   \
                                                                               \
 /* Common bits */                                                              \
@@ -126,6 +126,13 @@ V_(SysOp1, 18, 16, Bits)                                                       \
 V_(SysOp2, 7, 5, Bits)                                                         \
 V_(CRn, 15, 12, Bits)                                                          \
 V_(CRm, 11, 8, Bits)                                                           \
+                                                                               \
+/* Load-/store-exclusive */                                                    \
+V_(LdStXLoad, 22, 22, Bits)                                                    \
+V_(LdStXNotExclusive, 23, 23, Bits)                                            \
+V_(LdStXAcquireRelease, 15, 15, Bits)                                          \
+V_(LdStXSizeLog2, 31, 30, Bits)                                                \
+V_(LdStXPair, 21, 21, Bits)                                                    \


 #define SYSTEM_REGISTER_FIELDS_LIST(V_, M_)                                    \
@@ -585,6 +592,13 @@ enum MemBarrierOp {
  ISB             = MemBarrierFixed | 0x00000040
 };

+enum SystemExclusiveMonitorOp {
+  SystemExclusiveMonitorFixed = 0xD503305F,
+  SystemExclusiveMonitorFMask = 0xFFFFF0FF,
+  SystemExclusiveMonitorMask  = 0xFFFFF0FF,
+  CLREX                       = SystemExclusiveMonitorFixed
+};
+
 // Any load or store.
 enum LoadStoreAnyOp {
  LoadStoreAnyFMask = 0x0a000000,
@@ -702,7 +716,7 @@ enum LoadStoreUnscaledOffsetOp {

 // Load/store (post, pre, offset and unsigned.)
 enum LoadStoreOp {
-  LoadStoreOpMask   = 0xC4C00000,
+  LoadStoreOpMask = 0xC4C00000,
  #define LOAD_STORE(A, B, C, D)  \
  A##B##_##C = D
  LOAD_STORE_OP_LIST(LOAD_STORE),
@@ -756,6 +770,44 @@ enum LoadStoreRegisterOffset {
  #undef LOAD_STORE_REGISTER_OFFSET
 };

+enum LoadStoreExclusive {
+  LoadStoreExclusiveFixed = 0x08000000,
+  LoadStoreExclusiveFMask = 0x3F000000,
+  LoadStoreExclusiveMask  = 0xFFE08000,
+  STXRB_w  = LoadStoreExclusiveFixed | 0x00000000,
+  STXRH_w  = LoadStoreExclusiveFixed | 0x40000000,
+  STXR_w   = LoadStoreExclusiveFixed | 0x80000000,
+  STXR_x   = LoadStoreExclusiveFixed | 0xC0000000,
+  LDXRB_w  = LoadStoreExclusiveFixed | 0x00400000,
+  LDXRH_w  = LoadStoreExclusiveFixed | 0x40400000,
+  LDXR_w   = LoadStoreExclusiveFixed | 0x80400000,
+  LDXR_x   = LoadStoreExclusiveFixed | 0xC0400000,
+  STXP_w   = LoadStoreExclusiveFixed | 0x80200000,
+  STXP_x   = LoadStoreExclusiveFixed | 0xC0200000,
+  LDXP_w   = LoadStoreExclusiveFixed | 0x80600000,
+  LDXP_x   = LoadStoreExclusiveFixed | 0xC0600000,
+  STLXRB_w = LoadStoreExclusiveFixed | 0x00008000,
+  STLXRH_w = LoadStoreExclusiveFixed | 0x40008000,
+  STLXR_w  = LoadStoreExclusiveFixed | 0x80008000,
+  STLXR_x  = LoadStoreExclusiveFixed | 0xC0008000,
+  LDAXRB_w = LoadStoreExclusiveFixed | 0x00408000,
+  LDAXRH_w = LoadStoreExclusiveFixed | 0x40408000,
+  LDAXR_w  = LoadStoreExclusiveFixed | 0x80408000,
+  LDAXR_x  = LoadStoreExclusiveFixed | 0xC0408000,
+  STLXP_w  = LoadStoreExclusiveFixed | 0x80208000,
+  STLXP_x  = LoadStoreExclusiveFixed | 0xC0208000,
+  LDAXP_w  = LoadStoreExclusiveFixed | 0x80608000,
+  LDAXP_x  = LoadStoreExclusiveFixed | 0xC0608000,
+  STLRB_w  = LoadStoreExclusiveFixed | 0x00808000,
+  STLRH_w  = LoadStoreExclusiveFixed | 0x40808000,
+  STLR_w   = LoadStoreExclusiveFixed | 0x80808000,
+  STLR_x   = LoadStoreExclusiveFixed | 0xC0808000,
+  LDARB_w  = LoadStoreExclusiveFixed | 0x00C08000,
+  LDARH_w  = LoadStoreExclusiveFixed | 0x40C08000,
+  LDAR_w   = LoadStoreExclusiveFixed | 0x80C08000,
+  LDAR_x   = LoadStoreExclusiveFixed | 0xC0C08000
+};
+
 // Conditional compare.
 enum ConditionalCompareOp {
  ConditionalCompareMask = 0x60000000,
--- a/disas/libvixl/a64/cpu-a64.h
+++ b/disas/libvixl/a64/cpu-a64.h
@@ -28,6 +28,7 @@
 #define VIXL_CPU_A64_H

 #include "globals.h"
+#include "instructions-a64.h"

 namespace vixl {

@@ -42,6 +43,32 @@ class CPU {
  // safely run.
  static void EnsureIAndDCacheCoherency(void *address, size_t length);

+  // Handle tagged pointers.
+  template <typename T>
+  static T SetPointerTag(T pointer, uint64_t tag) {
+    VIXL_ASSERT(is_uintn(kAddressTagWidth, tag));
+
+    // Use C-style casts to get static_cast behaviour for integral types (T),
+    // and reinterpret_cast behaviour for other types.
+
+    uint64_t raw = (uint64_t)pointer;
+    VIXL_STATIC_ASSERT(sizeof(pointer) == sizeof(raw));
+
+    raw = (raw & ~kAddressTagMask) | (tag << kAddressTagOffset);
+    return (T)raw;
+  }
+
+  template <typename T>
+  static uint64_t GetPointerTag(T pointer) {
+    // Use C-style casts to get static_cast behaviour for integral types (T),
+    // and reinterpret_cast behaviour for other types.
+
+    uint64_t raw = (uint64_t)pointer;
+    VIXL_STATIC_ASSERT(sizeof(pointer) == sizeof(raw));
+
+    return (raw & kAddressTagMask) >> kAddressTagOffset;
+  }
+
 private:
  // Return the content of the cache type register.
  static uint32_t GetCacheType();
--- a/disas/libvixl/a64/decoder-a64.cc
+++ b/disas/libvixl/a64/decoder-a64.cc
@@ -29,8 +29,8 @@
 #include "a64/decoder-a64.h"

 namespace vixl {
-// Top-level instruction decode function.
-void Decoder::Decode(Instruction *instr) {
+
+void Decoder::DecodeInstruction(const Instruction *instr) {
  if (instr->Bits(28, 27) == 0) {
    VisitUnallocated(instr);
  } else {
@@ -109,20 +109,17 @@ void Decoder::Decode(Instruction *instr) {
 }

 void Decoder::AppendVisitor(DecoderVisitor* new_visitor) {
-  visitors_.remove(new_visitor);
-  visitors_.push_front(new_visitor);
+  visitors_.push_back(new_visitor);
 }


 void Decoder::PrependVisitor(DecoderVisitor* new_visitor) {
-  visitors_.remove(new_visitor);
-  visitors_.push_back(new_visitor);
+  visitors_.push_front(new_visitor);
 }


 void Decoder::InsertVisitorBefore(DecoderVisitor* new_visitor,
                                  DecoderVisitor* registered_visitor) {
-  visitors_.remove(new_visitor);
  std::list<DecoderVisitor*>::iterator it;
  for (it = visitors_.begin(); it != visitors_.end(); it++) {
    if (*it == registered_visitor) {
@@ -139,7 +136,6 @@ void Decoder::InsertVisitorBefore(DecoderVisitor* new_visitor,

 void Decoder::InsertVisitorAfter(DecoderVisitor* new_visitor,
                                 DecoderVisitor* registered_visitor) {
-  visitors_.remove(new_visitor);
  std::list<DecoderVisitor*>::iterator it;
  for (it = visitors_.begin(); it != visitors_.end(); it++) {
    if (*it == registered_visitor) {
@@ -160,7 +156,7 @@ void Decoder::RemoveVisitor(DecoderVisitor* visitor) {
 }


-void Decoder::DecodePCRelAddressing(Instruction* instr) {
+void Decoder::DecodePCRelAddressing(const Instruction* instr) {
  VIXL_ASSERT(instr->Bits(27, 24) == 0x0);
  // We know bit 28 is set, as <b28:b27> = 0 is filtered out at the top level
  // decode.
@@ -169,11 +165,11 @@ void Decoder::DecodePCRelAddressing(Instruction* instr) {
 }


-void Decoder::DecodeBranchSystemException(Instruction* instr) {
+void Decoder::DecodeBranchSystemException(const Instruction* instr) {
  VIXL_ASSERT((instr->Bits(27, 24) == 0x4) ||
-         (instr->Bits(27, 24) == 0x5) ||
-         (instr->Bits(27, 24) == 0x6) ||
-         (instr->Bits(27, 24) == 0x7) );
+              (instr->Bits(27, 24) == 0x5) ||
+              (instr->Bits(27, 24) == 0x6) ||
+              (instr->Bits(27, 24) == 0x7) );

  switch (instr->Bits(31, 29)) {
    case 0:
@@ -270,18 +266,17 @@ void Decoder::DecodeBranchSystemException(Instruction* instr) {
 }


-void Decoder::DecodeLoadStore(Instruction* instr) {
+void Decoder::DecodeLoadStore(const Instruction* instr) {
  VIXL_ASSERT((instr->Bits(27, 24) == 0x8) ||
-         (instr->Bits(27, 24) == 0x9) ||
-         (instr->Bits(27, 24) == 0xC) ||
-         (instr->Bits(27, 24) == 0xD) );
+              (instr->Bits(27, 24) == 0x9) ||
+              (instr->Bits(27, 24) == 0xC) ||
+              (instr->Bits(27, 24) == 0xD) );

  if (instr->Bit(24) == 0) {
    if (instr->Bit(28) == 0) {
      if (instr->Bit(29) == 0) {
        if (instr->Bit(26) == 0) {
-          // TODO: VisitLoadStoreExclusive.
-          VisitUnimplemented(instr);
+          VisitLoadStoreExclusive(instr);
        } else {
          DecodeAdvSIMDLoadStore(instr);
        }
@@ -389,7 +384,7 @@ void Decoder::DecodeLoadStore(Instruction* instr) {
 }


-void Decoder::DecodeLogical(Instruction* instr) {
+void Decoder::DecodeLogical(const Instruction* instr) {
  VIXL_ASSERT(instr->Bits(27, 24) == 0x2);

  if (instr->Mask(0x80400000) == 0x00400000) {
@@ -408,7 +403,7 @@ void Decoder::DecodeLogical(Instruction* instr) {
 }


-void Decoder::DecodeBitfieldExtract(Instruction* instr) {
+void Decoder::DecodeBitfieldExtract(const Instruction* instr) {
  VIXL_ASSERT(instr->Bits(27, 24) == 0x3);

  if ((instr->Mask(0x80400000) == 0x80000000) ||
@@ -433,7 +428,7 @@ void Decoder::DecodeBitfieldExtract(Instruction* instr) {
 }


-void Decoder::DecodeAddSubImmediate(Instruction* instr) {
+void Decoder::DecodeAddSubImmediate(const Instruction* instr) {
  VIXL_ASSERT(instr->Bits(27, 24) == 0x1);
  if (instr->Bit(23) == 1) {
    VisitUnallocated(instr);
@@ -443,7 +438,7 @@ void Decoder::DecodeAddSubImmediate(Instruction* instr) {
 }


-void Decoder::DecodeDataProcessing(Instruction* instr) {
+void Decoder::DecodeDataProcessing(const Instruction* instr) {
  VIXL_ASSERT((instr->Bits(27, 24) == 0xA) ||
              (instr->Bits(27, 24) == 0xB));

@@ -558,7 +553,7 @@ void Decoder::DecodeDataProcessing(Instruction* instr) {
 }


-void Decoder::DecodeFP(Instruction* instr) {
+void Decoder::DecodeFP(const Instruction* instr) {
  VIXL_ASSERT((instr->Bits(27, 24) == 0xE) ||
              (instr->Bits(27, 24) == 0xF));

@@ -685,14 +680,14 @@ void Decoder::DecodeFP(Instruction* instr) {
 }


-void Decoder::DecodeAdvSIMDLoadStore(Instruction* instr) {
+void Decoder::DecodeAdvSIMDLoadStore(const Instruction* instr) {
  // TODO: Implement Advanced SIMD load/store instruction decode.
  VIXL_ASSERT(instr->Bits(29, 25) == 0x6);
  VisitUnimplemented(instr);
 }


-void Decoder::DecodeAdvSIMDDataProcessing(Instruction* instr) {
+void Decoder::DecodeAdvSIMDDataProcessing(const Instruction* instr) {
  // TODO: Implement Advanced SIMD data processing instruction decode.
  VIXL_ASSERT(instr->Bits(27, 25) == 0x7);
  VisitUnimplemented(instr);
@@ -700,7 +695,7 @@ void Decoder::DecodeAdvSIMDDataProcessing(Instruction* instr) {


 #define DEFINE_VISITOR_CALLERS(A)                                              \
-  void Decoder::Visit##A(Instruction *instr) {                                 \
+  void Decoder::Visit##A(const Instruction *instr) {                           \
    VIXL_ASSERT(instr->Mask(A##FMask) == A##Fixed);                            \
    std::list<DecoderVisitor*>::iterator it;                                   \
    for (it = visitors_.begin(); it != visitors_.end(); it++) {                \
--- a/disas/libvixl/a64/decoder-a64.h
+++ b/disas/libvixl/a64/decoder-a64.h
@@ -59,6 +59,7 @@
  V(LoadStorePreIndex)              \
  V(LoadStoreRegisterOffset)        \
  V(LoadStoreUnsignedOffset)        \
+  V(LoadStoreExclusive)             \
  V(LogicalShifted)                 \
  V(AddSubShifted)                  \
  V(AddSubExtended)                 \
@@ -87,112 +88,152 @@ namespace vixl {
 // must provide implementations for all of these functions.
 class DecoderVisitor {
 public:
-  #define DECLARE(A) virtual void Visit##A(Instruction* instr) = 0;
-  VISITOR_LIST(DECLARE)
-  #undef DECLARE
+  enum VisitorConstness {
+    kConstVisitor,
+    kNonConstVisitor
+  };
+  explicit DecoderVisitor(VisitorConstness constness = kConstVisitor)
+      : constness_(constness) {}

  virtual ~DecoderVisitor() {}

- private:
-  // Visitors are registered in a list.
-  std::list<DecoderVisitor*> visitors_;
+  #define DECLARE(A) virtual void Visit##A(const Instruction* instr) = 0;
+  VISITOR_LIST(DECLARE)
+  #undef DECLARE

-  friend class Decoder;
+  bool IsConstVisitor() const { return constness_ == kConstVisitor; }
+  Instruction* MutableInstruction(const Instruction* instr) {
+    VIXL_ASSERT(!IsConstVisitor());
+    return const_cast<Instruction*>(instr);
+  }
+
+ private:
+  VisitorConstness constness_;
 };


-class Decoder: public DecoderVisitor {
+class Decoder {
 public:
  Decoder() {}

-  // Top-level instruction decoder function. Decodes an instruction and calls
-  // the visitor functions registered with the Decoder class.
-  void Decode(Instruction *instr);
+  // Top-level wrappers around the actual decoding function.
+  void Decode(const Instruction* instr) {
+    std::list<DecoderVisitor*>::iterator it;
+    for (it = visitors_.begin(); it != visitors_.end(); it++) {
+      VIXL_ASSERT((*it)->IsConstVisitor());
+    }
+    DecodeInstruction(instr);
+  }
+  void Decode(Instruction* instr) {
+    DecodeInstruction(const_cast<const Instruction*>(instr));
+  }

  // Register a new visitor class with the decoder.
  // Decode() will call the corresponding visitor method from all registered
  // visitor classes when decoding reaches the leaf node of the instruction
  // decode tree.
-  // Visitors are called in the order.
-  // A visitor can only be registered once.
-  // Registering an already registered visitor will update its position.
+  // Visitors are called in order.
+  // A visitor can be registered multiple times.
  //
  //   d.AppendVisitor(V1);
  //   d.AppendVisitor(V2);
-  //   d.PrependVisitor(V2);            // Move V2 at the start of the list.
-  //   d.InsertVisitorBefore(V3, V2);
-  //   d.AppendVisitor(V4);
-  //   d.AppendVisitor(V4);             // No effect.
+  //   d.PrependVisitor(V2);
+  //   d.AppendVisitor(V3);
  //
  //   d.Decode(i);
  //
-  // will call in order visitor methods in V3, V2, V1, V4.
+  // will call in order visitor methods in V2, V1, V2, V3.
  void AppendVisitor(DecoderVisitor* visitor);
  void PrependVisitor(DecoderVisitor* visitor);
+  // These helpers register `new_visitor` before or after the first instance of
+  // `registered_visitor` in the list.
+  // So if
+  //   V1, V2, V1, V2
+  // are registered in this order in the decoder, calls to
+  //   d.InsertVisitorAfter(V3, V1);
+  //   d.InsertVisitorBefore(V4, V2);
+  // will yield the order
+  //   V1, V3, V4, V2, V1, V2
+  //
+  // For more complex modifications of the order of registered visitors, one can
+  // directly access and modify the list of visitors via the `visitors()`
+  // accessor.
  void InsertVisitorBefore(DecoderVisitor* new_visitor,
                           DecoderVisitor* registered_visitor);
  void InsertVisitorAfter(DecoderVisitor* new_visitor,
                          DecoderVisitor* registered_visitor);

-  // Remove a previously registered visitor class from the list of visitors
-  // stored by the decoder.
+  // Remove all instances of a previously registered visitor class from the list
+  // of visitors stored by the decoder.
  void RemoveVisitor(DecoderVisitor* visitor);

-  #define DECLARE(A) void Visit##A(Instruction* instr);
+  #define DECLARE(A) void Visit##A(const Instruction* instr);
  VISITOR_LIST(DECLARE)
  #undef DECLARE

+
+  std::list<DecoderVisitor*>* visitors() { return &visitors_; }
+
 private:
+  // Decodes an instruction and calls the visitor functions registered with the
+  // Decoder class.
+  void DecodeInstruction(const Instruction* instr);
+
  // Decode the PC relative addressing instruction, and call the corresponding
  // visitors.
  // On entry, instruction bits 27:24 = 0x0.
-  void DecodePCRelAddressing(Instruction* instr);
+  void DecodePCRelAddressing(const Instruction* instr);

  // Decode the add/subtract immediate instruction, and call the correspoding
  // visitors.
  // On entry, instruction bits 27:24 = 0x1.
-  void DecodeAddSubImmediate(Instruction* instr);
+  void DecodeAddSubImmediate(const Instruction* instr);

  // Decode the branch, system command, and exception generation parts of
  // the instruction tree, and call the corresponding visitors.
  // On entry, instruction bits 27:24 = {0x4, 0x5, 0x6, 0x7}.
-  void DecodeBranchSystemException(Instruction* instr);
+  void DecodeBranchSystemException(const Instruction* instr);

  // Decode the load and store parts of the instruction tree, and call
  // the corresponding visitors.
  // On entry, instruction bits 27:24 = {0x8, 0x9, 0xC, 0xD}.
-  void DecodeLoadStore(Instruction* instr);
+  void DecodeLoadStore(const Instruction* instr);

  // Decode the logical immediate and move wide immediate parts of the
  // instruction tree, and call the corresponding visitors.
  // On entry, instruction bits 27:24 = 0x2.
-  void DecodeLogical(Instruction* instr);
+  void DecodeLogical(const Instruction* instr);

  // Decode the bitfield and extraction parts of the instruction tree,
  // and call the corresponding visitors.
  // On entry, instruction bits 27:24 = 0x3.
-  void DecodeBitfieldExtract(Instruction* instr);
+  void DecodeBitfieldExtract(const Instruction* instr);

  // Decode the data processing parts of the instruction tree, and call the
  // corresponding visitors.
  // On entry, instruction bits 27:24 = {0x1, 0xA, 0xB}.
-  void DecodeDataProcessing(Instruction* instr);
+  void DecodeDataProcessing(const Instruction* instr);

  // Decode the floating point parts of the instruction tree, and call the
  // corresponding visitors.
  // On entry, instruction bits 27:24 = {0xE, 0xF}.
-  void DecodeFP(Instruction* instr);
+  void DecodeFP(const Instruction* instr);

  // Decode the Advanced SIMD (NEON) load/store part of the instruction tree,
  // and call the corresponding visitors.
  // On entry, instruction bits 29:25 = 0x6.
-  void DecodeAdvSIMDLoadStore(Instruction* instr);
+  void DecodeAdvSIMDLoadStore(const Instruction* instr);

  // Decode the Advanced SIMD (NEON) data processing part of the instruction
  // tree, and call the corresponding visitors.
  // On entry, instruction bits 27:25 = 0x7.
-  void DecodeAdvSIMDDataProcessing(Instruction* instr);
+  void DecodeAdvSIMDDataProcessing(const Instruction* instr);
+
+ private:
+  // Visitors are registered in a list.
+  std::list<DecoderVisitor*> visitors_;
 };
+
 }  // namespace vixl

 #endif  // VIXL_A64_DECODER_A64_H_
--- a/disas/libvixl/a64/disasm-a64.cc
+++ b/disas/libvixl/a64/disasm-a64.cc
@@ -24,6 +24,7 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+#include <cstdlib>
 #include "a64/disasm-a64.h"

 namespace vixl {
@@ -56,7 +57,7 @@ char* Disassembler::GetOutput() {
 }


-void Disassembler::VisitAddSubImmediate(Instruction* instr) {
+void Disassembler::VisitAddSubImmediate(const Instruction* instr) {
  bool rd_is_zr = RdIsZROrSP(instr);
  bool stack_op = (rd_is_zr || RnIsZROrSP(instr)) &&
                  (instr->ImmAddSub() == 0) ? true : false;
@@ -101,7 +102,7 @@ void Disassembler::VisitAddSubImmediate(Instruction* instr) {
 }


-void Disassembler::VisitAddSubShifted(Instruction* instr) {
+void Disassembler::VisitAddSubShifted(const Instruction* instr) {
  bool rd_is_zr = RdIsZROrSP(instr);
  bool rn_is_zr = RnIsZROrSP(instr);
  const char *mnemonic = "";
@@ -148,7 +149,7 @@ void Disassembler::VisitAddSubShifted(Instruction* instr) {
 }


-void Disassembler::VisitAddSubExtended(Instruction* instr) {
+void Disassembler::VisitAddSubExtended(const Instruction* instr) {
  bool rd_is_zr = RdIsZROrSP(instr);
  const char *mnemonic = "";
  Extend mode = static_cast<Extend>(instr->ExtendMode());
@@ -186,7 +187,7 @@ void Disassembler::VisitAddSubExtended(Instruction* instr) {
 }


-void Disassembler::VisitAddSubWithCarry(Instruction* instr) {
+void Disassembler::VisitAddSubWithCarry(const Instruction* instr) {
  bool rn_is_zr = RnIsZROrSP(instr);
  const char *mnemonic = "";
  const char *form = "'Rd, 'Rn, 'Rm";
@@ -221,7 +222,7 @@ void Disassembler::VisitAddSubWithCarry(Instruction* instr) {
 }


-void Disassembler::VisitLogicalImmediate(Instruction* instr) {
+void Disassembler::VisitLogicalImmediate(const Instruction* instr) {
  bool rd_is_zr = RdIsZROrSP(instr);
  bool rn_is_zr = RnIsZROrSP(instr);
  const char *mnemonic = "";
@@ -293,7 +294,7 @@ bool Disassembler::IsMovzMovnImm(unsigned reg_size, uint64_t value) {
 }


-void Disassembler::VisitLogicalShifted(Instruction* instr) {
+void Disassembler::VisitLogicalShifted(const Instruction* instr) {
  bool rd_is_zr = RdIsZROrSP(instr);
  bool rn_is_zr = RnIsZROrSP(instr);
  const char *mnemonic = "";
@@ -344,7 +345,7 @@ void Disassembler::VisitLogicalShifted(Instruction* instr) {
 }


-void Disassembler::VisitConditionalCompareRegister(Instruction* instr) {
+void Disassembler::VisitConditionalCompareRegister(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Rn, 'Rm, 'INzcv, 'Cond";

@@ -359,7 +360,7 @@ void Disassembler::VisitConditionalCompareRegister(Instruction* instr) {
 }


-void Disassembler::VisitConditionalCompareImmediate(Instruction* instr) {
+void Disassembler::VisitConditionalCompareImmediate(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Rn, 'IP, 'INzcv, 'Cond";

@@ -374,7 +375,7 @@ void Disassembler::VisitConditionalCompareImmediate(Instruction* instr) {
 }


-void Disassembler::VisitConditionalSelect(Instruction* instr) {
+void Disassembler::VisitConditionalSelect(const Instruction* instr) {
  bool rnm_is_zr = (RnIsZROrSP(instr) && RmIsZROrSP(instr));
  bool rn_is_rm = (instr->Rn() == instr->Rm());
  const char *mnemonic = "";
@@ -427,7 +428,7 @@ void Disassembler::VisitConditionalSelect(Instruction* instr) {
 }


-void Disassembler::VisitBitfield(Instruction* instr) {
+void Disassembler::VisitBitfield(const Instruction* instr) {
  unsigned s = instr->ImmS();
  unsigned r = instr->ImmR();
  unsigned rd_size_minus_1 =
@@ -505,7 +506,7 @@ void Disassembler::VisitBitfield(Instruction* instr) {
 }


-void Disassembler::VisitExtract(Instruction* instr) {
+void Disassembler::VisitExtract(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Rd, 'Rn, 'Rm, 'IExtract";

@@ -526,16 +527,16 @@ void Disassembler::VisitExtract(Instruction* instr) {
 }


-void Disassembler::VisitPCRelAddressing(Instruction* instr) {
+void Disassembler::VisitPCRelAddressing(const Instruction* instr) {
  switch (instr->Mask(PCRelAddressingMask)) {
    case ADR: Format(instr, "adr", "'Xd, 'AddrPCRelByte"); break;
-    // ADRP is not implemented.
+    case ADRP: Format(instr, "adrp", "'Xd, 'AddrPCRelPage"); break;
    default: Format(instr, "unimplemented", "(PCRelAddressing)");
  }
 }


-void Disassembler::VisitConditionalBranch(Instruction* instr) {
+void Disassembler::VisitConditionalBranch(const Instruction* instr) {
  switch (instr->Mask(ConditionalBranchMask)) {
    case B_cond: Format(instr, "b.'CBrn", "'BImmCond"); break;
    default: VIXL_UNREACHABLE();
@@ -543,7 +544,8 @@ void Disassembler::VisitConditionalBranch(Instruction* instr) {
 }


-void Disassembler::VisitUnconditionalBranchToRegister(Instruction* instr) {
+void Disassembler::VisitUnconditionalBranchToRegister(
+    const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "'Xn";

@@ -563,7 +565,7 @@ void Disassembler::VisitUnconditionalBranchToRegister(Instruction* instr) {
 }


-void Disassembler::VisitUnconditionalBranch(Instruction* instr) {
+void Disassembler::VisitUnconditionalBranch(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'BImmUncn";

@@ -576,7 +578,7 @@ void Disassembler::VisitUnconditionalBranch(Instruction* instr) {
 }


-void Disassembler::VisitDataProcessing1Source(Instruction* instr) {
+void Disassembler::VisitDataProcessing1Source(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Rd, 'Rn";

@@ -597,7 +599,7 @@ void Disassembler::VisitDataProcessing1Source(Instruction* instr) {
 }


-void Disassembler::VisitDataProcessing2Source(Instruction* instr) {
+void Disassembler::VisitDataProcessing2Source(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "'Rd, 'Rn, 'Rm";

@@ -618,7 +620,7 @@ void Disassembler::VisitDataProcessing2Source(Instruction* instr) {
 }


-void Disassembler::VisitDataProcessing3Source(Instruction* instr) {
+void Disassembler::VisitDataProcessing3Source(const Instruction* instr) {
  bool ra_is_zr = RaIsZROrSP(instr);
  const char *mnemonic = "";
  const char *form = "'Xd, 'Wn, 'Wm, 'Xa";
@@ -696,7 +698,7 @@ void Disassembler::VisitDataProcessing3Source(Instruction* instr) {
 }


-void Disassembler::VisitCompareBranch(Instruction* instr) {
+void Disassembler::VisitCompareBranch(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Rt, 'BImmCmpa";

@@ -711,7 +713,7 @@ void Disassembler::VisitCompareBranch(Instruction* instr) {
 }


-void Disassembler::VisitTestBranch(Instruction* instr) {
+void Disassembler::VisitTestBranch(const Instruction* instr) {
  const char *mnemonic = "";
  // If the top bit of the immediate is clear, the tested register is
  // disassembled as Wt, otherwise Xt. As the top bit of the immediate is
@@ -728,7 +730,7 @@ void Disassembler::VisitTestBranch(Instruction* instr) {
 }


-void Disassembler::VisitMoveWideImmediate(Instruction* instr) {
+void Disassembler::VisitMoveWideImmediate(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Rd, 'IMoveImm";

@@ -767,7 +769,7 @@ void Disassembler::VisitMoveWideImmediate(Instruction* instr) {
  V(LDR_s, "ldr", "'St")      \
  V(LDR_d, "ldr", "'Dt")

-void Disassembler::VisitLoadStorePreIndex(Instruction* instr) {
+void Disassembler::VisitLoadStorePreIndex(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "(LoadStorePreIndex)";

@@ -781,7 +783,7 @@ void Disassembler::VisitLoadStorePreIndex(Instruction* instr) {
 }


-void Disassembler::VisitLoadStorePostIndex(Instruction* instr) {
+void Disassembler::VisitLoadStorePostIndex(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "(LoadStorePostIndex)";

@@ -795,7 +797,7 @@ void Disassembler::VisitLoadStorePostIndex(Instruction* instr) {
 }


-void Disassembler::VisitLoadStoreUnsignedOffset(Instruction* instr) {
+void Disassembler::VisitLoadStoreUnsignedOffset(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "(LoadStoreUnsignedOffset)";

@@ -810,7 +812,7 @@ void Disassembler::VisitLoadStoreUnsignedOffset(Instruction* instr) {
 }


-void Disassembler::VisitLoadStoreRegisterOffset(Instruction* instr) {
+void Disassembler::VisitLoadStoreRegisterOffset(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "(LoadStoreRegisterOffset)";

@@ -825,7 +827,7 @@ void Disassembler::VisitLoadStoreRegisterOffset(Instruction* instr) {
 }


-void Disassembler::VisitLoadStoreUnscaledOffset(Instruction* instr) {
+void Disassembler::VisitLoadStoreUnscaledOffset(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "'Wt, ['Xns'ILS]";
  const char *form_x = "'Xt, ['Xns'ILS]";
@@ -856,7 +858,7 @@ void Disassembler::VisitLoadStoreUnscaledOffset(Instruction* instr) {
 }


-void Disassembler::VisitLoadLiteral(Instruction* instr) {
+void Disassembler::VisitLoadLiteral(const Instruction* instr) {
  const char *mnemonic = "ldr";
  const char *form = "(LoadLiteral)";

@@ -865,6 +867,11 @@ void Disassembler::VisitLoadLiteral(Instruction* instr) {
    case LDR_x_lit: form = "'Xt, 'ILLiteral 'LValue"; break;
    case LDR_s_lit: form = "'St, 'ILLiteral 'LValue"; break;
    case LDR_d_lit: form = "'Dt, 'ILLiteral 'LValue"; break;
+    case LDRSW_x_lit: {
+      mnemonic = "ldrsw";
+      form = "'Xt, 'ILLiteral 'LValue";
+      break;
+    }
    default: mnemonic = "unimplemented";
  }
  Format(instr, mnemonic, form);
@@ -882,7 +889,7 @@ void Disassembler::VisitLoadLiteral(Instruction* instr) {
  V(STP_d, "stp", "'Dt, 'Dt2", "8")     \
  V(LDP_d, "ldp", "'Dt, 'Dt2", "8")

-void Disassembler::VisitLoadStorePairPostIndex(Instruction* instr) {
+void Disassembler::VisitLoadStorePairPostIndex(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "(LoadStorePairPostIndex)";

@@ -896,7 +903,7 @@ void Disassembler::VisitLoadStorePairPostIndex(Instruction* instr) {
 }


-void Disassembler::VisitLoadStorePairPreIndex(Instruction* instr) {
+void Disassembler::VisitLoadStorePairPreIndex(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "(LoadStorePairPreIndex)";

@@ -910,7 +917,7 @@ void Disassembler::VisitLoadStorePairPreIndex(Instruction* instr) {
 }


-void Disassembler::VisitLoadStorePairOffset(Instruction* instr) {
+void Disassembler::VisitLoadStorePairOffset(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "(LoadStorePairOffset)";

@@ -924,7 +931,7 @@ void Disassembler::VisitLoadStorePairOffset(Instruction* instr) {
 }


-void Disassembler::VisitLoadStorePairNonTemporal(Instruction* instr) {
+void Disassembler::VisitLoadStorePairNonTemporal(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form;

@@ -943,7 +950,50 @@ void Disassembler::VisitLoadStorePairNonTemporal(Instruction* instr) {
 }


-void Disassembler::VisitFPCompare(Instruction* instr) {
+void Disassembler::VisitLoadStoreExclusive(const Instruction* instr) {
+  const char *mnemonic = "unimplemented";
+  const char *form;
+
+  switch (instr->Mask(LoadStoreExclusiveMask)) {
+    case STXRB_w: mnemonic = "stxrb"; form = "'Ws, 'Wt, ['Xns]"; break;
+    case STXRH_w: mnemonic = "stxrh"; form = "'Ws, 'Wt, ['Xns]"; break;
+    case STXR_w: mnemonic = "stxr"; form = "'Ws, 'Wt, ['Xns]"; break;
+    case STXR_x: mnemonic = "stxr"; form = "'Ws, 'Xt, ['Xns]"; break;
+    case LDXRB_w: mnemonic = "ldxrb"; form = "'Wt, ['Xns]"; break;
+    case LDXRH_w: mnemonic = "ldxrh"; form = "'Wt, ['Xns]"; break;
+    case LDXR_w: mnemonic = "ldxr"; form = "'Wt, ['Xns]"; break;
+    case LDXR_x: mnemonic = "ldxr"; form = "'Xt, ['Xns]"; break;
+    case STXP_w: mnemonic = "stxp"; form = "'Ws, 'Wt, 'Wt2, ['Xns]"; break;
+    case STXP_x: mnemonic = "stxp"; form = "'Ws, 'Xt, 'Xt2, ['Xns]"; break;
+    case LDXP_w: mnemonic = "ldxp"; form = "'Wt, 'Wt2, ['Xns]"; break;
+    case LDXP_x: mnemonic = "ldxp"; form = "'Xt, 'Xt2, ['Xns]"; break;
+    case STLXRB_w: mnemonic = "stlxrb"; form = "'Ws, 'Wt, ['Xns]"; break;
+    case STLXRH_w: mnemonic = "stlxrh"; form = "'Ws, 'Wt, ['Xns]"; break;
+    case STLXR_w: mnemonic = "stlxr"; form = "'Ws, 'Wt, ['Xns]"; break;
+    case STLXR_x: mnemonic = "stlxr"; form = "'Ws, 'Xt, ['Xns]"; break;
+    case LDAXRB_w: mnemonic = "ldaxrb"; form = "'Wt, ['Xns]"; break;
+    case LDAXRH_w: mnemonic = "ldaxrh"; form = "'Wt, ['Xns]"; break;
+    case LDAXR_w: mnemonic = "ldaxr"; form = "'Wt, ['Xns]"; break;
+    case LDAXR_x: mnemonic = "ldaxr"; form = "'Xt, ['Xns]"; break;
+    case STLXP_w: mnemonic = "stlxp"; form = "'Ws, 'Wt, 'Wt2, ['Xns]"; break;
+    case STLXP_x: mnemonic = "stlxp"; form = "'Ws, 'Xt, 'Xt2, ['Xns]"; break;
+    case LDAXP_w: mnemonic = "ldaxp"; form = "'Wt, 'Wt2, ['Xns]"; break;
+    case LDAXP_x: mnemonic = "ldaxp"; form = "'Xt, 'Xt2, ['Xns]"; break;
+    case STLRB_w: mnemonic = "stlrb"; form = "'Wt, ['Xns]"; break;
+    case STLRH_w: mnemonic = "stlrh"; form = "'Wt, ['Xns]"; break;
+    case STLR_w: mnemonic = "stlr"; form = "'Wt, ['Xns]"; break;
+    case STLR_x: mnemonic = "stlr"; form = "'Xt, ['Xns]"; break;
+    case LDARB_w: mnemonic = "ldarb"; form = "'Wt, ['Xns]"; break;
+    case LDARH_w: mnemonic = "ldarh"; form = "'Wt, ['Xns]"; break;
+    case LDAR_w: mnemonic = "ldar"; form = "'Wt, ['Xns]"; break;
+    case LDAR_x: mnemonic = "ldar"; form = "'Xt, ['Xns]"; break;
+    default: form = "(LoadStoreExclusive)";
+  }
+  Format(instr, mnemonic, form);
+}
+
+
+void Disassembler::VisitFPCompare(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "'Fn, 'Fm";
  const char *form_zero = "'Fn, #0.0";
@@ -959,7 +1009,7 @@ void Disassembler::VisitFPCompare(Instruction* instr) {
 }


-void Disassembler::VisitFPConditionalCompare(Instruction* instr) {
+void Disassembler::VisitFPConditionalCompare(const Instruction* instr) {
  const char *mnemonic = "unmplemented";
  const char *form = "'Fn, 'Fm, 'INzcv, 'Cond";

@@ -974,7 +1024,7 @@ void Disassembler::VisitFPConditionalCompare(Instruction* instr) {
 }


-void Disassembler::VisitFPConditionalSelect(Instruction* instr) {
+void Disassembler::VisitFPConditionalSelect(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Fd, 'Fn, 'Fm, 'Cond";

@@ -987,7 +1037,7 @@ void Disassembler::VisitFPConditionalSelect(Instruction* instr) {
 }


-void Disassembler::VisitFPDataProcessing1Source(Instruction* instr) {
+void Disassembler::VisitFPDataProcessing1Source(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "'Fd, 'Fn";

@@ -1015,7 +1065,7 @@ void Disassembler::VisitFPDataProcessing1Source(Instruction* instr) {
 }


-void Disassembler::VisitFPDataProcessing2Source(Instruction* instr) {
+void Disassembler::VisitFPDataProcessing2Source(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Fd, 'Fn, 'Fm";

@@ -1039,7 +1089,7 @@ void Disassembler::VisitFPDataProcessing2Source(Instruction* instr) {
 }


-void Disassembler::VisitFPDataProcessing3Source(Instruction* instr) {
+void Disassembler::VisitFPDataProcessing3Source(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Fd, 'Fn, 'Fm, 'Fa";

@@ -1058,7 +1108,7 @@ void Disassembler::VisitFPDataProcessing3Source(Instruction* instr) {
 }


-void Disassembler::VisitFPImmediate(Instruction* instr) {
+void Disassembler::VisitFPImmediate(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "(FPImmediate)";

@@ -1071,7 +1121,7 @@ void Disassembler::VisitFPImmediate(Instruction* instr) {
 }


-void Disassembler::VisitFPIntegerConvert(Instruction* instr) {
+void Disassembler::VisitFPIntegerConvert(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "(FPIntegerConvert)";
  const char *form_rf = "'Rd, 'Fn";
@@ -1127,7 +1177,7 @@ void Disassembler::VisitFPIntegerConvert(Instruction* instr) {
 }


-void Disassembler::VisitFPFixedPointConvert(Instruction* instr) {
+void Disassembler::VisitFPFixedPointConvert(const Instruction* instr) {
  const char *mnemonic = "";
  const char *form = "'Rd, 'Fn, 'IFPFBits";
  const char *form_fr = "'Fd, 'Rn, 'IFPFBits";
@@ -1155,14 +1205,22 @@ void Disassembler::VisitFPFixedPointConvert(Instruction* instr) {
 }


-void Disassembler::VisitSystem(Instruction* instr) {
+void Disassembler::VisitSystem(const Instruction* instr) {
  // Some system instructions hijack their Op and Cp fields to represent a
  // range of immediates instead of indicating a different instruction. This
  // makes the decoding tricky.
  const char *mnemonic = "unimplemented";
  const char *form = "(System)";

-  if (instr->Mask(SystemSysRegFMask) == SystemSysRegFixed) {
+  if (instr->Mask(SystemExclusiveMonitorFMask) == SystemExclusiveMonitorFixed) {
+    switch (instr->Mask(SystemExclusiveMonitorMask)) {
+      case CLREX: {
+        mnemonic = "clrex";
+        form = (instr->CRm() == 0xf) ? NULL : "'IX";
+        break;
+      }
+    }
+  } else if (instr->Mask(SystemSysRegFMask) == SystemSysRegFixed) {
    switch (instr->Mask(SystemSysRegMask)) {
      case MRS: {
        mnemonic = "mrs";
@@ -1184,7 +1242,6 @@ void Disassembler::VisitSystem(Instruction* instr) {
      }
    }
  } else if (instr->Mask(SystemHintFMask) == SystemHintFixed) {
-    VIXL_ASSERT(instr->Mask(SystemHintMask) == HINT);
    switch (instr->ImmHint()) {
      case NOP: {
        mnemonic = "nop";
@@ -1216,7 +1273,7 @@ void Disassembler::VisitSystem(Instruction* instr) {
 }


-void Disassembler::VisitException(Instruction* instr) {
+void Disassembler::VisitException(const Instruction* instr) {
  const char *mnemonic = "unimplemented";
  const char *form = "'IDebug";

@@ -1235,22 +1292,75 @@ void Disassembler::VisitException(Instruction* instr) {
 }


-void Disassembler::VisitUnimplemented(Instruction* instr) {
+void Disassembler::VisitUnimplemented(const Instruction* instr) {
  Format(instr, "unimplemented", "(Unimplemented)");
 }


-void Disassembler::VisitUnallocated(Instruction* instr) {
+void Disassembler::VisitUnallocated(const Instruction* instr) {
  Format(instr, "unallocated", "(Unallocated)");
 }


-void Disassembler::ProcessOutput(Instruction* /*instr*/) {
+void Disassembler::ProcessOutput(const Instruction* /*instr*/) {
  // The base disasm does nothing more than disassembling into a buffer.
 }


-void Disassembler::Format(Instruction* instr, const char* mnemonic,
+void Disassembler::AppendRegisterNameToOutput(const Instruction* instr,
+                                              const CPURegister& reg) {
+  USE(instr);
+  VIXL_ASSERT(reg.IsValid());
+  char reg_char;
+
+  if (reg.IsRegister()) {
+    reg_char = reg.Is64Bits() ? 'x' : 'w';
+  } else {
+    VIXL_ASSERT(reg.IsFPRegister());
+    reg_char = reg.Is64Bits() ? 'd' : 's';
+  }
+
+  if (reg.IsFPRegister() || !(reg.Aliases(sp) || reg.Aliases(xzr))) {
+    // A normal register: w0 - w30, x0 - x30, s0 - s31, d0 - d31.
+    AppendToOutput("%c%d", reg_char, reg.code());
+  } else if (reg.Aliases(sp)) {
+    // Disassemble w31/x31 as stack pointer wsp/sp.
+    AppendToOutput("%s", reg.Is64Bits() ? "sp" : "wsp");
+  } else {
+    // Disassemble w31/x31 as zero register wzr/xzr.
+    AppendToOutput("%czr", reg_char);
+  }
+}
+
+
+void Disassembler::AppendPCRelativeOffsetToOutput(const Instruction* instr,
+                                                  int64_t offset) {
+  USE(instr);
+  char sign = (offset < 0) ? '-' : '+';
+  AppendToOutput("#%c0x%" PRIx64, sign, std::abs(offset));
+}
+
+
+void Disassembler::AppendAddressToOutput(const Instruction* instr,
+                                         const void* addr) {
+  USE(instr);
+  AppendToOutput("(addr %p)", addr);
+}
+
+
+void Disassembler::AppendCodeAddressToOutput(const Instruction* instr,
+                                             const void* addr) {
+  AppendAddressToOutput(instr, addr);
+}
+
+
+void Disassembler::AppendDataAddressToOutput(const Instruction* instr,
+                                             const void* addr) {
+  AppendAddressToOutput(instr, addr);
+}
+
+
+void Disassembler::Format(const Instruction* instr, const char* mnemonic,
                          const char* format) {
  VIXL_ASSERT(mnemonic != NULL);
  ResetOutput();
@@ -1264,7 +1374,7 @@ void Disassembler::Format(Instruction* instr, const char* mnemonic,
 }


-void Disassembler::Substitute(Instruction* instr, const char* string) {
+void Disassembler::Substitute(const Instruction* instr, const char* string) {
  char chr = *string++;
  while (chr != '\0') {
    if (chr == '\'') {
@@ -1277,7 +1387,8 @@ void Disassembler::Substitute(Instruction* instr, const char* string) {
 }


-int Disassembler::SubstituteField(Instruction* instr, const char* format) {
+int Disassembler::SubstituteField(const Instruction* instr,
+                                  const char* format) {
  switch (format[0]) {
    case 'R':  // Register. X or W, selected by sf bit.
    case 'F':  // FP Register. S or D, selected by type field.
@@ -1303,7 +1414,7 @@ int Disassembler::SubstituteField(Instruction* instr, const char* format) {
 }


-int Disassembler::SubstituteRegisterField(Instruction* instr,
+int Disassembler::SubstituteRegisterField(const Instruction* instr,
                                          const char* format) {
  unsigned reg_num = 0;
  unsigned field_len = 2;
@@ -1312,6 +1423,7 @@ int Disassembler::SubstituteRegisterField(Instruction* instr,
    case 'n': reg_num = instr->Rn(); break;
    case 'm': reg_num = instr->Rm(); break;
    case 'a': reg_num = instr->Ra(); break;
+    case 's': reg_num = instr->Rs(); break;
    case 't': {
      if (format[2] == '2') {
        reg_num = instr->Rt2();
@@ -1329,34 +1441,47 @@ int Disassembler::SubstituteRegisterField(Instruction* instr,
    field_len = 3;
  }

-  char reg_type;
+  CPURegister::RegisterType reg_type;
+  unsigned reg_size;
+
  if (format[0] == 'R') {
    // Register type is R: use sf bit to choose X and W.
-    reg_type = instr->SixtyFourBits() ? 'x' : 'w';
+    reg_type = CPURegister::kRegister;
+    reg_size = instr->SixtyFourBits() ? kXRegSize : kWRegSize;
  } else if (format[0] == 'F') {
    // Floating-point register: use type field to choose S or D.
-    reg_type = ((instr->FPType() & 1) == 0) ? 's' : 'd';
+    reg_type = CPURegister::kFPRegister;
+    reg_size = ((instr->FPType() & 1) == 0) ? kSRegSize : kDRegSize;
  } else {
-    // Register type is specified. Make it lower case.
-    reg_type = format[0] + 0x20;
+    // The register type is specified.
+    switch (format[0]) {
+      case 'W':
+        reg_type = CPURegister::kRegister; reg_size = kWRegSize; break;
+      case 'X':
+        reg_type = CPURegister::kRegister; reg_size = kXRegSize; break;
+      case 'S':
+        reg_type = CPURegister::kFPRegister; reg_size = kSRegSize; break;
+      case 'D':
+        reg_type = CPURegister::kFPRegister; reg_size = kDRegSize; break;
+      default:
+        VIXL_UNREACHABLE();
+        reg_type = CPURegister::kRegister;
+        reg_size = kXRegSize;
+    }
  }

-  if ((reg_num != kZeroRegCode) || (reg_type == 's') || (reg_type == 'd')) {
-    // A normal register: w0 - w30, x0 - x30, s0 - s31, d0 - d31.
-    AppendToOutput("%c%d", reg_type, reg_num);
-  } else if (format[2] == 's') {
-    // Disassemble w31/x31 as stack pointer wsp/sp.
-    AppendToOutput("%s", (reg_type == 'w') ? "wsp" : "sp");
-  } else {
-    // Disassemble w31/x31 as zero register wzr/xzr.
-    AppendToOutput("%czr", reg_type);
+  if ((reg_type == CPURegister::kRegister) &&
+      (reg_num == kZeroRegCode) && (format[2] == 's')) {
+    reg_num = kSPRegInternalCode;
  }

+  AppendRegisterNameToOutput(instr, CPURegister(reg_num, reg_size, reg_type));
+
  return field_len;
 }


-int Disassembler::SubstituteImmediateField(Instruction* instr,
+int Disassembler::SubstituteImmediateField(const Instruction* instr,
                                           const char* format) {
  VIXL_ASSERT(format[0] == 'I');

@@ -1406,8 +1531,7 @@ int Disassembler::SubstituteImmediateField(Instruction* instr,
    }
    case 'C': {  // ICondB - Immediate Conditional Branch.
      int64_t offset = instr->ImmCondBranch() << 2;
-      char sign = (offset >= 0) ? '+' : '-';
-      AppendToOutput("#%c0x%" PRIx64, sign, offset);
+      AppendPCRelativeOffsetToOutput(instr, offset);
      return 6;
    }
    case 'A': {  // IAddSub.
@@ -1458,6 +1582,10 @@ int Disassembler::SubstituteImmediateField(Instruction* instr,
      AppendToOutput("#0x%" PRIx64, instr->ImmException());
      return 6;
    }
+    case 'X': {  // IX - CLREX instruction.
+      AppendToOutput("#0x%" PRIx64, instr->CRm());
+      return 2;
+    }
    default: {
      VIXL_UNIMPLEMENTED();
      return 0;
@@ -1466,7 +1594,7 @@ int Disassembler::SubstituteImmediateField(Instruction* instr,
 }


-int Disassembler::SubstituteBitfieldImmediateField(Instruction* instr,
+int Disassembler::SubstituteBitfieldImmediateField(const Instruction* instr,
                                                   const char* format) {
  VIXL_ASSERT((format[0] == 'I') && (format[1] == 'B'));
  unsigned r = instr->ImmR();
@@ -1501,7 +1629,7 @@ int Disassembler::SubstituteBitfieldImmediateField(Instruction* instr,
 }


-int Disassembler::SubstituteLiteralField(Instruction* instr,
+int Disassembler::SubstituteLiteralField(const Instruction* instr,
                                         const char* format) {
  VIXL_ASSERT(strncmp(format, "LValue", 6) == 0);
  USE(format);
@@ -1509,16 +1637,21 @@ int Disassembler::SubstituteLiteralField(Instruction* instr,
  switch (instr->Mask(LoadLiteralMask)) {
    case LDR_w_lit:
    case LDR_x_lit:
+    case LDRSW_x_lit:
    case LDR_s_lit:
-    case LDR_d_lit: AppendToOutput("(addr %p)", instr->LiteralAddress()); break;
-    default: VIXL_UNREACHABLE();
+    case LDR_d_lit:
+      AppendDataAddressToOutput(instr, instr->LiteralAddress());
+      break;
+    default:
+      VIXL_UNREACHABLE();
  }

  return 6;
 }


-int Disassembler::SubstituteShiftField(Instruction* instr, const char* format) {
+int Disassembler::SubstituteShiftField(const Instruction* instr,
+                                       const char* format) {
  VIXL_ASSERT(format[0] == 'H');
  VIXL_ASSERT(instr->ShiftDP() <= 0x3);

@@ -1541,7 +1674,7 @@ int Disassembler::SubstituteShiftField(Instruction* instr, const char* format) {
 }


-int Disassembler::SubstituteConditionField(Instruction* instr,
+int Disassembler::SubstituteConditionField(const Instruction* instr,
                                           const char* format) {
  VIXL_ASSERT(format[0] == 'C');
  const char* condition_code[] = { "eq", "ne", "hs", "lo",
@@ -1562,28 +1695,28 @@ int Disassembler::SubstituteConditionField(Instruction* instr,
 }


-int Disassembler::SubstitutePCRelAddressField(Instruction* instr,
+int Disassembler::SubstitutePCRelAddressField(const Instruction* instr,
                                              const char* format) {
-  USE(format);
-  VIXL_ASSERT(strncmp(format, "AddrPCRel", 9) == 0);
+  VIXL_ASSERT((strcmp(format, "AddrPCRelByte") == 0) ||   // Used by `adr`.
+              (strcmp(format, "AddrPCRelPage") == 0));    // Used by `adrp`.

-  int offset = instr->ImmPCRel();
+  int64_t offset = instr->ImmPCRel();
+  const Instruction * base = instr;

-  // Only ADR (AddrPCRelByte) is supported.
-  VIXL_ASSERT(strcmp(format, "AddrPCRelByte") == 0);
-
-  char sign = '+';
-  if (offset < 0) {
-    offset = -offset;
-    sign = '-';
+  if (format[9] == 'P') {
+    offset *= kPageSize;
+    base = AlignDown(base, kPageSize);
  }
-  VIXL_STATIC_ASSERT(sizeof(*instr) == 1);
-  AppendToOutput("#%c0x%x (addr %p)", sign, offset, instr + offset);
+
+  const void* target = reinterpret_cast<const void*>(base + offset);
+  AppendPCRelativeOffsetToOutput(instr, offset);
+  AppendToOutput(" ");
+  AppendAddressToOutput(instr, target);
  return 13;
 }


-int Disassembler::SubstituteBranchTargetField(Instruction* instr,
+int Disassembler::SubstituteBranchTargetField(const Instruction* instr,
                                              const char* format) {
  VIXL_ASSERT(strncmp(format, "BImm", 4) == 0);

@@ -1600,18 +1733,18 @@ int Disassembler::SubstituteBranchTargetField(Instruction* instr,
    default: VIXL_UNIMPLEMENTED();
  }
  offset <<= kInstructionSizeLog2;
-  char sign = '+';
-  if (offset < 0) {
-    offset = -offset;
-    sign = '-';
-  }
+  const void* target_address = reinterpret_cast<const void*>(instr + offset);
  VIXL_STATIC_ASSERT(sizeof(*instr) == 1);
-  AppendToOutput("#%c0x%" PRIx64 " (addr %p)", sign, offset, instr + offset);
+
+  AppendPCRelativeOffsetToOutput(instr, offset);
+  AppendToOutput(" ");
+  AppendCodeAddressToOutput(instr, target_address);
+
  return 8;
 }


-int Disassembler::SubstituteExtendField(Instruction* instr,
+int Disassembler::SubstituteExtendField(const Instruction* instr,
                                        const char* format) {
  VIXL_ASSERT(strncmp(format, "Ext", 3) == 0);
  VIXL_ASSERT(instr->ExtendMode() <= 7);
@@ -1638,7 +1771,7 @@ int Disassembler::SubstituteExtendField(Instruction* instr,
 }


-int Disassembler::SubstituteLSRegOffsetField(Instruction* instr,
+int Disassembler::SubstituteLSRegOffsetField(const Instruction* instr,
                                             const char* format) {
  VIXL_ASSERT(strncmp(format, "Offsetreg", 9) == 0);
  const char* extend_mode[] = { "undefined", "undefined", "uxtw", "lsl",
@@ -1667,7 +1800,7 @@ int Disassembler::SubstituteLSRegOffsetField(Instruction* instr,
 }


-int Disassembler::SubstitutePrefetchField(Instruction* instr,
+int Disassembler::SubstitutePrefetchField(const Instruction* instr,
                                          const char* format) {
  VIXL_ASSERT(format[0] == 'P');
  USE(format);
@@ -1682,7 +1815,7 @@ int Disassembler::SubstitutePrefetchField(Instruction* instr,
  return 6;
 }

-int Disassembler::SubstituteBarrierField(Instruction* instr,
+int Disassembler::SubstituteBarrierField(const Instruction* instr,
                                         const char* format) {
  VIXL_ASSERT(format[0] == 'M');
  USE(format);
@@ -1714,7 +1847,7 @@ void Disassembler::AppendToOutput(const char* format, ...) {
 }


-void PrintDisassembler::ProcessOutput(Instruction* instr) {
+void PrintDisassembler::ProcessOutput(const Instruction* instr) {
  fprintf(stream_, "0x%016" PRIx64 "  %08" PRIx32 "\t\t%s\n",
          reinterpret_cast<uint64_t>(instr),
          instr->InstructionBits(),
--- a/disas/libvixl/a64/disasm-a64.h
+++ b/disas/libvixl/a64/disasm-a64.h
@@ -31,6 +31,7 @@
 #include "utils.h"
 #include "instructions-a64.h"
 #include "decoder-a64.h"
+#include "assembler-a64.h"

 namespace vixl {

@@ -42,50 +43,85 @@ class Disassembler: public DecoderVisitor {
  char* GetOutput();

  // Declare all Visitor functions.
-  #define DECLARE(A)  void Visit##A(Instruction* instr);
+  #define DECLARE(A)  void Visit##A(const Instruction* instr);
  VISITOR_LIST(DECLARE)
  #undef DECLARE

 protected:
-  virtual void ProcessOutput(Instruction* instr);
+  virtual void ProcessOutput(const Instruction* instr);
+
+  // Default output functions.  The functions below implement a default way of
+  // printing elements in the disassembly. A sub-class can override these to
+  // customize the disassembly output.
+
+  // Prints the name of a register.
+  virtual void AppendRegisterNameToOutput(const Instruction* instr,
+                                          const CPURegister& reg);
+
+  // Prints a PC-relative offset. This is used for example when disassembling
+  // branches to immediate offsets.
+  virtual void AppendPCRelativeOffsetToOutput(const Instruction* instr,
+                                              int64_t offset);
+
+  // Prints an address, in the general case. It can be code or data. This is
+  // used for example to print the target address of an ADR instruction.
+  virtual void AppendAddressToOutput(const Instruction* instr,
+                                     const void* addr);
+
+  // Prints the address of some code.
+  // This is used for example to print the target address of a branch to an
+  // immediate offset.
+  // A sub-class can for example override this method to lookup the address and
+  // print an appropriate name.
+  virtual void AppendCodeAddressToOutput(const Instruction* instr,
+                                         const void* addr);
+
+  // Prints the address of some data.
+  // This is used for example to print the source address of a load literal
+  // instruction.
+  virtual void AppendDataAddressToOutput(const Instruction* instr,
+                                         const void* addr);

 private:
-  void Format(Instruction* instr, const char* mnemonic, const char* format);
-  void Substitute(Instruction* instr, const char* string);
-  int SubstituteField(Instruction* instr, const char* format);
-  int SubstituteRegisterField(Instruction* instr, const char* format);
-  int SubstituteImmediateField(Instruction* instr, const char* format);
-  int SubstituteLiteralField(Instruction* instr, const char* format);
-  int SubstituteBitfieldImmediateField(Instruction* instr, const char* format);
-  int SubstituteShiftField(Instruction* instr, const char* format);
-  int SubstituteExtendField(Instruction* instr, const char* format);
-  int SubstituteConditionField(Instruction* instr, const char* format);
-  int SubstitutePCRelAddressField(Instruction* instr, const char* format);
-  int SubstituteBranchTargetField(Instruction* instr, const char* format);
-  int SubstituteLSRegOffsetField(Instruction* instr, const char* format);
-  int SubstitutePrefetchField(Instruction* instr, const char* format);
-  int SubstituteBarrierField(Instruction* instr, const char* format);
+  void Format(
+      const Instruction* instr, const char* mnemonic, const char* format);
+  void Substitute(const Instruction* instr, const char* string);
+  int SubstituteField(const Instruction* instr, const char* format);
+  int SubstituteRegisterField(const Instruction* instr, const char* format);
+  int SubstituteImmediateField(const Instruction* instr, const char* format);
+  int SubstituteLiteralField(const Instruction* instr, const char* format);
+  int SubstituteBitfieldImmediateField(
+      const Instruction* instr, const char* format);
+  int SubstituteShiftField(const Instruction* instr, const char* format);
+  int SubstituteExtendField(const Instruction* instr, const char* format);
+  int SubstituteConditionField(const Instruction* instr, const char* format);
+  int SubstitutePCRelAddressField(const Instruction* instr, const char* format);
+  int SubstituteBranchTargetField(const Instruction* instr, const char* format);
+  int SubstituteLSRegOffsetField(const Instruction* instr, const char* format);
+  int SubstitutePrefetchField(const Instruction* instr, const char* format);
+  int SubstituteBarrierField(const Instruction* instr, const char* format);

-  inline bool RdIsZROrSP(Instruction* instr) const {
+  inline bool RdIsZROrSP(const Instruction* instr) const {
    return (instr->Rd() == kZeroRegCode);
  }

-  inline bool RnIsZROrSP(Instruction* instr) const {
+  inline bool RnIsZROrSP(const Instruction* instr) const {
    return (instr->Rn() == kZeroRegCode);
  }

-  inline bool RmIsZROrSP(Instruction* instr) const {
+  inline bool RmIsZROrSP(const Instruction* instr) const {
    return (instr->Rm() == kZeroRegCode);
  }

-  inline bool RaIsZROrSP(Instruction* instr) const {
+  inline bool RaIsZROrSP(const Instruction* instr) const {
    return (instr->Ra() == kZeroRegCode);
  }

  bool IsMovzMovnImm(unsigned reg_size, uint64_t value);

+ protected:
  void ResetOutput();
-  void AppendToOutput(const char* string, ...);
+  void AppendToOutput(const char* string, ...) PRINTF_CHECK(2, 3);

  char* buffer_;
  uint32_t buffer_pos_;
@@ -97,10 +133,10 @@ class Disassembler: public DecoderVisitor {
 class PrintDisassembler: public Disassembler {
 public:
  explicit PrintDisassembler(FILE* stream) : stream_(stream) { }
-  ~PrintDisassembler() { }
+  virtual ~PrintDisassembler() { }

 protected:
-  virtual void ProcessOutput(Instruction* instr);
+  virtual void ProcessOutput(const Instruction* instr);

 private:
  FILE *stream_;
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .1.2
 .2.0