memory: prevent dma-reentrancy issues

Git-commit: a2e1753b80
References: bsc#1190011 (CVE-2021-3750)

Add a flag to the DeviceState, when a device is engaged in PIO/MMIO/DMA.
This flag is set/checked prior to calling a device's MemoryRegion
handlers, and set when device code initiates DMA. The purpose of this
flag is to prevent two types of DMA-based reentrancy issues:

1.) mmio -> dma -> mmio case
2.) bh -> dma write -> mmio case

These issues have led to problems such as stack-exhaustion and
use-after-frees.

Summary of the problem from Peter Maydell:
https://lore.kernel.org/qemu-devel/CAFEAcA_23vc7hE3iaM-JVA6W38LK4hJoWae5KcknhPRD5fPBZA@mail.gmail.com

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/62
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/540
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/541
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/556
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/557
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/827
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1282
Resolves: CVE-2023-0330

Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-Id: <20230427211013.2994127-2-alxndr@bu.edu>
[thuth: Replace warn_report() with warn_report_once()]
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Dario Faggioli <dfaggioli@suse.com>
io: remove io watch if TLS channel is closed during handshake
2023-10-20 14:57:22 +02:00 · 2023-10-20 14:19:10 +02:00 · 2023-10-20 14:19:01 +02:00 · 2023-10-20 14:18:43 +02:00 · 2023-04-20 16:18:30 +02:00 · 2023-04-20 16:05:53 +02:00
2253 changed files with 58340 additions and 116149 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,6 @@
 /config-target.*
 /config.status
 /config-temp
-/trace-events-all
 /trace/generated-tracers.h
 /trace/generated-tracers.c
 /trace/generated-tracers-dtrace.h
@@ -53,8 +52,7 @@
 /qemu-bridge-helper
 /qemu-monitor.texi
 /qemu-monitor-info.texi
-/qemu-version.h
-/qemu-version.h.tmp
+/qmp-commands.txt
 /vscclient
 /fsdev/virtfs-proxy-helper
 *.[1-9]
@@ -96,10 +94,6 @@
 /pc-bios/optionrom/linuxboot.bin
 /pc-bios/optionrom/linuxboot.raw
 /pc-bios/optionrom/linuxboot.img
-/pc-bios/optionrom/linuxboot_dma.asm
-/pc-bios/optionrom/linuxboot_dma.bin
-/pc-bios/optionrom/linuxboot_dma.raw
-/pc-bios/optionrom/linuxboot_dma.img
 /pc-bios/optionrom/multiboot.asm
 /pc-bios/optionrom/multiboot.bin
 /pc-bios/optionrom/multiboot.raw
@@ -114,5 +108,4 @@
 cscope.*
 tags
 TAGS
-docker-src.*
 *~
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,7 +17,6 @@ addons:
      - libgtk-3-dev
      - libiscsi-dev
      - liblttng-ust-dev
-      - libnfs-dev
      - libncurses5-dev
      - libnss3-dev
      - libpixman-1-dev
@@ -34,13 +33,10 @@ addons:
      - sparse
      - uuid-dev

-# The channel name "irc.oftc.net#qemu" is encrypted against qemu/qemu
-# to prevent IRC notifications from forks. This was created using:
-# $ travis encrypt -r "qemu/qemu" "irc.oftc.net#qemu"
 notifications:
  irc:
    channels:
-      - secure: "F7GDRgjuOo5IUyRLqSkmDL7kvdU4UcH3Lm/W2db2JnDHTGCqgEdaYEYKciyCLZ57vOTsTsOgesN8iUT7hNHBd1KWKjZe9KDTZWppWRYVwAwQMzVeSOsbbU4tRoJ6Pp+3qhH1Z0eGYR9ZgKYAoTumDFgSAYRp4IscKS8jkoedOqM="
+      - "irc.oftc.net#qemu"
    on_success: change
    on_failure: always
 env:
@@ -67,6 +63,9 @@ script:
  - make -j3 && ${TEST_CMD}
 matrix:
  include:
+    # Sparse is GCC only
+    - env: CONFIG="--enable-sparse"
+      compiler: gcc
    # gprof/gcov are GCC features
    - env: CONFIG="--enable-gprof --enable-gcov --disable-pie"
      compiler: gcc
@@ -89,13 +88,3 @@ matrix:
    - env: CONFIG=""
      os: osx
      compiler: clang
-    - env: CONFIG=""
-      sudo: required
-      addons:
-      dist: trusty
-      compiler: gcc
-      before_install:
-        - sudo apt-get update -qq
-        - sudo apt-get build-dep -qq qemu
-        - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
-        - git submodule update --init --recursive
--- a/8
+++ b/8
@@ -31,11 +31,7 @@ Do not leave whitespace dangling off the ends of lines.

 2. Line width

-Lines should be 80 characters; try not to make them longer.
-
-Sometimes it is hard to do, especially when dealing with QEMU subsystems
-that use long function or symbol names.  Even in that case, do not make
-lines much longer than 80 characters.
+Lines are 80 characters; not longer.

 Rationale:
 - Some people like to tile their 24" screens with a 6x4 matrix of 80x24
@@ -43,8 +39,6 @@ Rationale:
   let them keep doing it.
 - Code and especially patches is much more readable if limited to a sane
   line length.  Eighty is traditional.
- - The four-space indentation makes the most common excuse ("But look
-   at all that white space on the left!") moot.
 - It is the QEMU coding style.

 3. Naming
--- a/4
+++ b/4
@@ -158,10 +158,6 @@ painful. These are:
 * you may assume that right shift of a signed integer duplicates
   the sign bit (ie it is an arithmetic shift, not a logical shift)

-In addition, QEMU assumes that the compiler does not use the latitude
-given in C99 and C11 to treat aspects of signed '<<' as undefined, as
-documented in the GNU Compiler Collection manual starting at version 4.0.
-
 7. Error handling and reporting

 7.1 Reporting errors to the human user
--- a/153
+++ b/153
@@ -83,7 +83,6 @@ F: include/exec/cpu*.h
 F: include/exec/exec-all.h
 F: include/exec/helper*.h
 F: include/exec/tb-hash.h
-F: include/sysemu/cpus.h

 FPU emulation
 M: Aurelien Jarno <aurelien@aurel32.net>
@@ -166,13 +165,11 @@ F: hw/openrisc/
 F: tests/tcg/openrisc/

 PowerPC
-M: David Gibson <david@gibson.dropbear.id.au>
 M: Alexander Graf <agraf@suse.de>
 L: qemu-ppc@nongnu.org
 S: Maintained
 F: target-ppc/
 F: hw/ppc/
-F: include/hw/ppc/
 F: disas/ppc.c

 S390
@@ -189,11 +186,10 @@ S: Odd Fixes
 F: target-sh4/
 F: hw/sh4/
 F: disas/sh4.c
-F: include/hw/sh4/

 SPARC
+M: Blue Swirl <blauwirbel@gmail.com>
 M: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
-M: Artyom Tarasenko <atar4qemu@gmail.com>
 S: Maintained
 F: target-sparc/
 F: hw/sparc/
@@ -205,7 +201,6 @@ M: Guan Xuetao <gxt@mprc.pku.edu.cn>
 S: Maintained
 F: target-unicore32/
 F: hw/unicore32/
-F: include/hw/unicore32/

 X86
 M: Paolo Bonzini <pbonzini@redhat.com>
@@ -229,7 +224,6 @@ M: Bastian Koppelmann <kbastian@mail.uni-paderborn.de>
 S: Maintained
 F: target-tricore/
 F: hw/tricore/
-F: include/hw/tricore/

 Guest CPU Cores (KVM):
 ----------------------
@@ -454,22 +448,23 @@ S: Maintained
 F: hw/*/versatile*

 Xilinx Zynq
-M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
 M: Alistair Francis <alistair.francis@xilinx.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 L: qemu-arm@nongnu.org
 S: Maintained
-F: hw/*/xilinx_*
-F: hw/*/cadence_*
+F: hw/arm/xilinx_zynq.c
 F: hw/misc/zynq_slcr.c
-X: hw/ssi/xilinx_*
+F: hw/*/cadence_*
+F: hw/ssi/xilinx_spips.c

 Xilinx ZynqMP
 M: Alistair Francis <alistair.francis@xilinx.com>
-M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 L: qemu-arm@nongnu.org
 S: Maintained
-F: hw/*/xlnx*.c
-F: include/hw/*/xlnx*.h
+F: hw/arm/xlnx-zynqmp.c
+F: hw/arm/xlnx-ep108.c
+F: include/hw/arm/xlnx-zynqmp.h

 ARM ACPI Subsystem
 M: Shannon Zhao <zhaoshenglong@huawei.com>
@@ -575,9 +570,6 @@ L: qemu-ppc@nongnu.org
 S: Supported
 F: hw/ppc/e500.[hc]
 F: hw/ppc/e500plat.c
-F: include/hw/ppc/ppc_e500.h
-F: include/hw/pci-host/ppce500.h
-F: pc-bios/u-boot.e500

 mpc8544ds
 M: Alexander Graf <agraf@suse.de>
@@ -595,8 +587,6 @@ F: hw/ppc/mac_newworld.c
 F: hw/pci-host/uninorth.c
 F: hw/pci-bridge/dec.[hc]
 F: hw/misc/macio/
-F: include/hw/ppc/mac_dbdma.h
-F: hw/nvram/mac_nvram.c

 Old World
 M: Alexander Graf <agraf@suse.de>
@@ -607,7 +597,7 @@ F: hw/pci-host/grackle.c
 F: hw/misc/macio/

 PReP
-L: qemu-devel@nongnu.org
+M: Andreas Färber <andreas.faerber@web.de>
 L: qemu-ppc@nongnu.org
 S: Odd Fixes
 F: hw/ppc/prep.c
@@ -624,14 +614,6 @@ F: include/hw/*/spapr*
 F: hw/*/xics*
 F: include/hw/*/xics*
 F: pc-bios/spapr-rtas/*
-F: pc-bios/spapr-rtas.bin
-F: pc-bios/slof.bin
-F: docs/specs/ppc-spapr-hcalls.txt
-F: docs/specs/ppc-spapr-hotplug.txt
-F: tests/spapr*
-F: tests/libqos/*spapr*
-F: tests/rtas*
-F: tests/libqos/rtas*

 virtex_ml507
 M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
@@ -683,9 +665,6 @@ F: hw/s390x/
 F: include/hw/s390x/
 F: pc-bios/s390-ccw/
 F: hw/watchdog/wdt_diag288.c
-F: include/hw/watchdog/wdt_diag288.h
-F: pc-bios/s390-ccw.img
-F: default-configs/s390x-softmmu.mak
 T: git git://github.com/cohuck/qemu.git s390-next
 T: git git://github.com/borntraeger/qemu.git s390-next

@@ -715,7 +694,7 @@ F: hw/i2c/smbus_ich9.c
 F: hw/acpi/piix4.c
 F: hw/acpi/ich9.c
 F: include/hw/acpi/ich9.h
-F: include/hw/acpi/piix4.h
+F: include/hw/acpi/piix.h
 F: hw/misc/sga.c

 PC Chipset
@@ -735,10 +714,6 @@ F: hw/misc/pc-testdev.c
 F: hw/timer/hpet*
 F: hw/timer/i8254*
 F: hw/timer/mc146818rtc*
-F: include/hw/i2c/pm_smbus.h
-F: include/hw/timer/hpet.h
-F: include/hw/timer/i8254*
-F: include/hw/timer/mc146818rtc*

 Machine core
 M: Eduardo Habkost <ehabkost@redhat.com>
@@ -804,7 +779,6 @@ F: hw/ipack/

 PCI
 M: Michael S. Tsirkin <mst@redhat.com>
-M: Marcel Apfelbaum <marcel@redhat.com>
 S: Supported
 F: include/hw/pci/*
 F: hw/misc/pci-testdev.c
@@ -821,15 +795,16 @@ F: hw/mem/*
 F: hw/acpi/*
 F: hw/smbios/*
 F: hw/i386/acpi-build.[hc]
+F: hw/i386/*dsl
 F: hw/arm/virt-acpi-build.c
 F: include/hw/arm/virt-acpi-build.h
+F: scripts/acpi*py

 ppc4xx
 M: Alexander Graf <agraf@suse.de>
 L: qemu-ppc@nongnu.org
 S: Odd Fixes
 F: hw/ppc/ppc4*.c
-F: include/hw/ppc/ppc4xx.h

 ppce500
 M: Alexander Graf <agraf@suse.de>
@@ -849,15 +824,14 @@ Network devices
 M: Jason Wang <jasowang@redhat.com>
 S: Odd Fixes
 F: hw/net/
-F: tests/virtio-net-test.c
 T: git git://github.com/jasowang/qemu.git net

 SCSI
 M: Paolo Bonzini <pbonzini@redhat.com>
 S: Supported
-F: include/hw/scsi/*
+F: include/hw/scsi*
 F: hw/scsi/*
-F: tests/virtio-scsi-test.c
+F: tests/scsi-disk-test.c
 T: git git://github.com/bonzini/qemu.git scsi-next

 LSI53C895A
@@ -910,17 +884,15 @@ S: Supported
 F: hw/*/virtio*
 F: net/vhost-user.c
 F: include/hw/virtio/
-F: tests/virtio-balloon-test.c

 virtio-9p
 M: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
-M: Greg Kurz <groug@kaod.org>
+M: Greg Kurz <gkurz@linux.vnet.ibm.com>
 S: Supported
 F: hw/9pfs/
 F: fsdev/
 F: tests/virtio-9p-test.c
 T: git git://github.com/kvaneesh/QEMU.git
-T: git git://github.com/gkurz/qemu.git 9p-next

 virtio-blk
 M: Stefan Hajnoczi <stefanha@redhat.com>
@@ -928,7 +900,7 @@ L: qemu-block@nongnu.org
 S: Supported
 F: hw/block/virtio-blk.c
 F: hw/block/dataplane/*
-F: tests/virtio-blk-test.c
+F: hw/virtio/dataplane/*
 T: git git://github.com/stefanha/qemu.git block

 virtio-ccw
@@ -951,8 +923,6 @@ S: Supported
 F: hw/char/virtio-serial-bus.c
 F: hw/char/virtio-console.c
 F: include/hw/virtio/virtio-serial.h
-F: tests/virtio-console-test.c
-F: tests/virtio-serial-test.c

 virtio-rng
 M: Amit Shah <amit.shah@redhat.com>
@@ -961,7 +931,6 @@ F: hw/virtio/virtio-rng.c
 F: include/hw/virtio/virtio-rng.h
 F: include/sysemu/rng*.h
 F: backends/rng*.c
-F: tests/virtio-rng-test.c

 nvme
 M: Keith Busch <keith.busch@intel.com>
@@ -977,13 +946,13 @@ S: Supported
 F: hw/scsi/megasas.c
 F: hw/scsi/mfi.h

-Network packet abstractions
-M: Dmitry Fleytman <dmitry@daynix.com>
+Xilinx EDK
+M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
+M: Alistair Francis <alistair.francis@xilinx.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 S: Maintained
-F: include/net/eth.h
-F: net/eth.c
-F: hw/net/net_rx_pkt*
-F: hw/net/net_tx_pkt*
+F: hw/*/xilinx_*
+F: include/hw/xilinx.h

 Vmware
 M: Dmitry Fleytman <dmitry@daynix.com>
@@ -992,6 +961,7 @@ F: hw/net/vmxnet*
 F: hw/scsi/vmw_pvscsi*

 Rocker
+M: Scott Feldman <sfeldma@gmail.com>
 M: Jiri Pirko <jiri@resnulli.us>
 S: Maintained
 F: hw/net/rocker/
@@ -1003,16 +973,6 @@ F: hw/acpi/nvdimm.c
 F: hw/mem/nvdimm.c
 F: include/hw/mem/nvdimm.h

-e1000x
-M: Dmitry Fleytman <dmitry@daynix.com>
-S: Maintained
-F: hw/net/e1000x*
-
-e1000e
-M: Dmitry Fleytman <dmitry@daynix.com>
-S: Maintained
-F: hw/net/e1000e*
-
 Subsystems
 ----------
 Audio
@@ -1047,7 +1007,6 @@ F: async.c
 F: aio-*.c
 F: block/io.c
 F: migration/block*
-F: include/block/aio.h
 T: git git://github.com/stefanha/qemu.git block

 Block Jobs
@@ -1088,11 +1047,17 @@ S: Supported
 F: scripts/coverity-model.c

 CPU
-L: qemu-devel@nongnu.org
+M: Andreas Färber <afaerber@suse.de>
 S: Supported
 F: qom/cpu.c
 F: include/qom/cpu.h

+ICC Bus
+M: Igor Mammedov <imammedo@redhat.com>
+S: Supported
+F: include/hw/cpu/icc_bus.h
+F: hw/cpu/icc_bus.c
+
 Device Tree
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 M: Alexander Graf <agraf@suse.de>
@@ -1141,6 +1106,7 @@ F: ui/
 F: include/ui/

 Cocoa graphics
+M: Andreas Färber <andreas.faerber@web.de>
 M: Peter Maydell <peter.maydell@linaro.org>
 S: Odd Fixes
 F: ui/cocoa.m
@@ -1192,13 +1158,6 @@ F: numa.c
 F: include/sysemu/numa.h
 T: git git://github.com/ehabkost/qemu.git numa

-Host Memory Backends
-M: Eduardo Habkost <ehabkost@redhat.com>
-M: Igor Mammedov <imammedo@redhat.com>
-S: Maintained
-F: backends/hostmem*.c
-F: include/sysemu/hostmem.h
-
 QAPI
 M: Markus Armbruster <armbru@redhat.com>
 M: Michael Roth <mdroth@linux.vnet.ibm.com>
@@ -1253,6 +1212,7 @@ F: qom/
 X: qom/cpu.c
 F: tests/check-qom-interface.c
 F: tests/check-qom-proplist.c
+F: tests/check-qom-props.c
 F: tests/qom-test.c

 QMP
@@ -1260,16 +1220,11 @@ M: Markus Armbruster <armbru@redhat.com>
 S: Supported
 F: qmp.c
 F: monitor.c
+F: qmp-commands.hx
 F: docs/*qmp-*
 F: scripts/qmp/
 T: git git://repo.or.cz/qemu/armbru.git qapi-next

-Register API
-M: Alistair Francis <alistair.francis@xilinx.com>
-S: Maintained
-F: hw/core/register.c
-F: include/hw/register.h
-
 SLIRP
 M: Samuel Thibault <samuel.thibault@ens-lyon.org>
 M: Jan Kiszka <jan.kiszka@siemens.com>
@@ -1279,11 +1234,6 @@ F: net/slirp.c
 F: include/net/slirp.h
 T: git git://git.kiszka.org/qemu.git queues/slirp

-Stubs
-M: Paolo Bonzini <pbonzini@redhat.com>
-S: Maintained
-F: stubs/
-
 Tracing
 M: Stefan Hajnoczi <stefanha@redhat.com>
 S: Maintained
@@ -1294,6 +1244,7 @@ F: docs/tracing.txt
 T: git git://github.com/stefanha/qemu.git tracing

 Checkpatch
+M: Blue Swirl <blauwirbel@gmail.com>
 S: Odd Fixes
 F: scripts/checkpatch.pl

@@ -1357,13 +1308,6 @@ F: include/qemu/throttle.h
 F: util/throttle.c
 L: qemu-block@nongnu.org

-UUID
-M: Fam Zheng <famz@redhat.com>
-S: Supported
-F: util/uuid.c
-F: include/qemu/uuid.h
-F: tests/test-uuid.c
-
 Usermode Emulation
 ------------------
 Overall
@@ -1373,7 +1317,8 @@ F: thunk.c
 F: user-exec.c

 BSD user
-S: Orphan
+M: Blue Swirl <blauwirbel@gmail.com>
+S: Maintained
 F: bsd-user/

 Linux user
@@ -1436,7 +1381,8 @@ F: tcg/s390/
 F: disas/s390.c

 SPARC target
-S: Odd Fixes
+M: Blue Swirl <blauwirbel@gmail.com>
+S: Maintained
 F: tcg/sparc/
 F: disas/sparc.c

@@ -1456,8 +1402,9 @@ S: Orphan

 Stable 0.15
 L: qemu-stable@nongnu.org
+M: Andreas Färber <afaerber@suse.de>
 T: git git://git.qemu-project.org/qemu-stable-0.15.git
-S: Orphan
+S: Supported

 Stable 0.14
 L: qemu-stable@nongnu.org
@@ -1614,7 +1561,7 @@ M: Kevin Wolf <kwolf@redhat.com>
 L: qemu-block@nongnu.org
 S: Supported
 F: block/linux-aio.c
-F: include/block/raw-aio.h
+F: block/raw-aio.h
 F: block/raw-posix.c
 F: block/raw-win32.c
 F: block/raw_bsd.c
@@ -1658,15 +1605,6 @@ L: qemu-block@nongnu.org
 S: Supported
 F: tests/image-fuzzer/

-Replication
-M: Wen Congyang <wency@cn.fujitsu.com>
-M: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
-S: Supported
-F: replication*
-F: block/replication.c
-F: tests/test-replication.c
-F: docs/block-replication.txt
-
 Build and test automation
 -------------------------
 M: Alex Bennée <alex.bennee@linaro.org>
@@ -1680,10 +1618,3 @@ Build system architecture
 M: Daniel P. Berrange <berrange@redhat.com>
 S: Odd Fixes
 F: docs/build-system.txt
-
-Docker testing
--------------
-Docker based testing framework and cases
-M: Fam Zheng <famz@redhat.com>
-S: Maintained
-F: tests/docker/
--- a/133
+++ b/133
@@ -6,7 +6,7 @@ BUILD_DIR=$(CURDIR)
 # Before including a proper config-host.mak, assume we are in the source tree
 SRC_PATH=.

-UNCHECKED_GOALS := %clean TAGS cscope ctags docker docker-%
+UNCHECKED_GOALS := %clean TAGS cscope ctags

 # All following code might depend on configuration variables
 ifneq ($(wildcard config-host.mak),)
@@ -30,7 +30,8 @@ CONFIG_ALL=y
 -include config-all-devices.mak
 -include config-all-disas.mak

-config-host.mak: $(SRC_PATH)/configure $(SRC_PATH)/pc-bios
+include $(SRC_PATH)/rules.mak
+config-host.mak: $(SRC_PATH)/configure
 	@echo $@ is out-of-date, running configure
 	@# TODO: The next lines include code which supports a smooth
 	@# transition from old configurations without config.status.
@@ -48,9 +49,7 @@ ifneq ($(filter-out $(UNCHECKED_GOALS),$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fa
 endif
 endif

-include $(SRC_PATH)/rules.mak
-
-GENERATED_HEADERS = qemu-version.h config-host.h qemu-options.def
+GENERATED_HEADERS = config-host.h qemu-options.def
 GENERATED_HEADERS += qmp-commands.h qapi-types.h qapi-visit.h qapi-event.h
 GENERATED_SOURCES += qmp-marshal.c qapi-types.c qapi-visit.c qapi-event.c
 GENERATED_HEADERS += qmp-introspect.h
@@ -76,15 +75,13 @@ GENERATED_HEADERS += trace/generated-ust-provider.h
 GENERATED_SOURCES += trace/generated-ust.c
 endif

-GENERATED_HEADERS += module_block.h
-
 # Don't try to regenerate Makefile or configure
 # We don't generate any of them
 Makefile: ;
 configure: ;

 .PHONY: all clean cscope distclean dvi html info install install-doc \
-	pdf recurse-all speed test dist msi FORCE
+	pdf recurse-all speed test dist msi

 $(call set-vpath, $(SRC_PATH))

@@ -94,6 +91,10 @@ HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF)

 ifdef BUILD_DOCS
 DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8
+DOCS+=qmp-commands.txt
+ifdef CONFIG_LINUX
+DOCS+=kvm_stat.1
+endif
 ifdef CONFIG_VIRTFS
 DOCS+=fsdev/virtfs-proxy-helper.1
 endif
@@ -118,7 +119,7 @@ endif

 -include $(SUBDIR_DEVICES_MAK_DEP)

-%/config-devices.mak: default-configs/%.mak $(SRC_PATH)/scripts/make_device_config.sh
+%/config-devices.mak: default-configs/%.mak
 	$(call quiet-command, \
            $(SHELL) $(SRC_PATH)/scripts/make_device_config.sh $< $*-config-devices.mak.d $@ > $@.tmp, "  GEN   $@.tmp")
 	$(call quiet-command, if test -f $@; then \
@@ -163,34 +164,14 @@ dummy := $(call unnest-vars,, \
                common-obj-m)

 ifneq ($(wildcard config-host.mak),)
-include $(SRC_PATH)/tests/Makefile.include
+include $(SRC_PATH)/tests/Makefile
 endif

 all: $(DOCS) $(TOOLS) $(HELPERS-y) recurse-all modules

-qemu-version.h: FORCE
-	$(call quiet-command, \
-		(cd $(SRC_PATH); \
-		printf '#define QEMU_PKGVERSION '; \
-		if test -n "$(PKGVERSION)"; then \
-			printf '"$(PKGVERSION)"\n'; \
-		else \
-			if test -d .git; then \
-				printf '" ('; \
-				git describe --match 'v*' 2>/dev/null | tr -d '\n'; \
-				if ! git diff-index --quiet HEAD &>/dev/null; then \
-					printf -- '-dirty'; \
-				fi; \
-				printf ')"\n'; \
-			else \
-				printf '""\n'; \
-			fi; \
-		fi) > $@.tmp)
-	$(call quiet-command, cmp -s $@ $@.tmp || mv $@.tmp $@)
-
 config-host.h: config-host.h-timestamp
 config-host.h-timestamp: config-host.mak
-qemu-options.def: $(SRC_PATH)/qemu-options.hx $(SRC_PATH)/scripts/hxtool
+qemu-options.def: $(SRC_PATH)/qemu-options.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $@")

 SUBDIR_RULES=$(patsubst %,subdir-%, $(TARGET_DIRS))
@@ -226,9 +207,8 @@ dtc/%:
 $(SUBDIR_RULES): libqemuutil.a libqemustub.a $(common-obj-y) $(qom-obj-y) $(crypto-aes-obj-$(CONFIG_USER_ONLY))

 ROMSUBDIR_RULES=$(patsubst %,romsubdir-%, $(ROMS))
-# Only keep -O and -g cflags
 romsubdir-%:
-	$(call quiet-command,$(MAKE) $(SUBDIR_MAKEFLAGS) -C pc-bios/$* V="$(V)" TARGET_DIR="$*/" CFLAGS="$(filter -O% -g%,$(CFLAGS))",)
+	$(call quiet-command,$(MAKE) $(SUBDIR_MAKEFLAGS) -C pc-bios/$* V="$(V)" TARGET_DIR="$*/",)

 ALL_SUBDIRS=$(TARGET_DIRS) $(patsubst %,pc-bios/%, $(ROMS))

@@ -247,6 +227,9 @@ Makefile: $(version-obj-y) $(version-lobj-y)
 libqemustub.a: $(stub-obj-y)
 libqemuutil.a: $(util-obj-y)

+block-modules = $(foreach o,$(block-obj-m),"$(basename $(subst /,-,$o))",) NULL
+util/module.o-cflags = -D'CONFIG_BLOCK_MODULES=$(block-modules)'
+
 ######################################################################

 qemu-img.o: qemu-img-cmds.h
@@ -260,7 +243,7 @@ qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o libqemuutil.a libqemustub.a
 fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal.o fsdev/9p-iov-marshal.o libqemuutil.a libqemustub.a
 fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap

-qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool
+qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $@")

 qemu-ga$(EXESUF): LIBS = $(LIBS_QGA)
@@ -310,7 +293,7 @@ $(qapi-modules) $(SRC_PATH)/scripts/qapi-event.py $(qapi-py)
 qmp-commands.h qmp-marshal.c :\
 $(qapi-modules) $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py \
-		$(gen-out-type) -o "." $<, \
+		$(gen-out-type) -o "." -m $<, \
 		"  GEN   $@")
 qmp-introspect.h qmp-introspect.c :\
 $(qapi-modules) $(SRC_PATH)/scripts/qapi-introspect.py $(qapi-py)
@@ -351,11 +334,6 @@ ivshmem-client$(EXESUF): $(ivshmem-client-obj-y) libqemuutil.a libqemustub.a
 ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) libqemuutil.a libqemustub.a
 	$(call LINK, $^)

-module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak
-	$(call quiet-command,$(PYTHON) $< $@ \
-	$(addprefix $(SRC_PATH)/,$(patsubst %.mo,%.c,$(block-obj-m))), \
-	"  GEN   $@")
-
 clean:
 # avoid old build problems by removing potentially incorrect old files
 	rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h gen-op-arm.h
@@ -378,7 +356,6 @@ clean:
 	if test -d $$d; then $(MAKE) -C $$d $@ || exit 1; fi; \
 	rm -f $$d/qemu-options.def; \
        done
-	rm -f $(SUBDIR_DEVICES_MAK) config-all-devices.mak

 VERSION ?= $(shell cat VERSION)

@@ -420,10 +397,9 @@ pxe-e1000.rom pxe-eepro100.rom pxe-ne2k_pci.rom \
 pxe-pcnet.rom pxe-rtl8139.rom pxe-virtio.rom \
 efi-e1000.rom efi-eepro100.rom efi-ne2k_pci.rom \
 efi-pcnet.rom efi-rtl8139.rom efi-virtio.rom \
-efi-e1000e.rom efi-vmxnet3.rom \
 qemu-icon.bmp qemu_logo_no_text.svg \
 bamboo.dtb petalogix-s3adsp1800.dtb petalogix-ml605.dtb \
-multiboot.bin linuxboot.bin linuxboot_dma.bin kvmvapic.bin \
+multiboot.bin linuxboot.bin kvmvapic.bin \
 s390-ccw.img \
 spapr-rtas.bin slof.bin \
 palcode-clipper \
@@ -435,7 +411,7 @@ endif
 install-doc: $(DOCS)
 	$(INSTALL_DIR) "$(DESTDIR)$(qemu_docdir)"
 	$(INSTALL_DATA) qemu-doc.html  qemu-tech.html "$(DESTDIR)$(qemu_docdir)"
-	$(INSTALL_DATA) $(SRC_PATH)/docs/qmp-commands.txt "$(DESTDIR)$(qemu_docdir)"
+	$(INSTALL_DATA) qmp-commands.txt "$(DESTDIR)$(qemu_docdir)"
 ifdef CONFIG_POSIX
 	$(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1"
 	$(INSTALL_DATA) qemu.1 "$(DESTDIR)$(mandir)/man1"
@@ -492,7 +468,7 @@ endif
 	set -e; for x in $(KEYMAPS); do \
 		$(INSTALL_DATA) $(SRC_PATH)/pc-bios/keymaps/$$x "$(DESTDIR)$(qemu_datadir)/keymaps"; \
 	done
-	$(INSTALL_DATA) $(BUILD_DIR)/trace-events-all "$(DESTDIR)$(qemu_datadir)/trace-events-all"
+	$(INSTALL_DATA) $(SRC_PATH)/trace-events "$(DESTDIR)$(qemu_datadir)/trace-events"
 	for d in $(TARGET_DIRS); do \
 	$(MAKE) $(SUBDIR_MAKEFLAGS) TARGET_DIR=$$d/ -C $$d $@ || exit 1 ; \
        done
@@ -503,12 +479,12 @@ test speed: all

 .PHONY: ctags
 ctags:
-	rm -f tags
+	rm -f $@
 	find "$(SRC_PATH)" -name '*.[hc]' -exec ctags --append {} +

 .PHONY: TAGS
 TAGS:
-	rm -f TAGS
+	rm -f $@
 	find "$(SRC_PATH)" -name '*.[hc]' -exec etags --append {} +

 cscope:
@@ -549,16 +525,19 @@ TEXIFLAG=$(if $(V),,--quiet)
 %.pdf: %.texi
 	$(call quiet-command,texi2pdf $(TEXIFLAG) -I . $<,"  GEN   $@")

-qemu-options.texi: $(SRC_PATH)/qemu-options.hx $(SRC_PATH)/scripts/hxtool
+qemu-options.texi: $(SRC_PATH)/qemu-options.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")

-qemu-monitor.texi: $(SRC_PATH)/hmp-commands.hx $(SRC_PATH)/scripts/hxtool
+qemu-monitor.texi: $(SRC_PATH)/hmp-commands.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")

-qemu-monitor-info.texi: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxtool
+qemu-monitor-info.texi: $(SRC_PATH)/hmp-commands-info.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")

-qemu-img-cmds.texi: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool
+qmp-commands.txt: $(SRC_PATH)/qmp-commands.hx
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -q < $< > $@,"  GEN   $@")
+
+qemu-img-cmds.texi: $(SRC_PATH)/qemu-img-cmds.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")

 qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi
@@ -566,9 +545,8 @@ qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu.pod && \
 	  $(POD2MAN) --section=1 --center=" " --release=" " qemu.pod > $@, \
 	  "  GEN   $@")
-qemu.1: qemu-option-trace.texi

-qemu-img.1: qemu-img.texi qemu-option-trace.texi qemu-img-cmds.texi
+qemu-img.1: qemu-img.texi qemu-img-cmds.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu-img.pod && \
 	  $(POD2MAN) --section=1 --center=" " --release=" " qemu-img.pod > $@, \
@@ -580,7 +558,7 @@ fsdev/virtfs-proxy-helper.1: fsdev/virtfs-proxy-helper.texi
 	  $(POD2MAN) --section=1 --center=" " --release=" " fsdev/virtfs-proxy-helper.pod > $@, \
 	  "  GEN   $@")

-qemu-nbd.8: qemu-nbd.texi qemu-option-trace.texi
+qemu-nbd.8: qemu-nbd.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu-nbd.pod && \
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-nbd.pod > $@, \
@@ -592,13 +570,19 @@ qemu-ga.8: qemu-ga.texi
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-ga.pod > $@, \
 	  "  GEN   $@")

+kvm_stat.1: scripts/kvm/kvm_stat.texi
+	$(call quiet-command, \
+	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< kvm_stat.pod && \
+	  $(POD2MAN) --section=1 --center=" " --release=" " kvm_stat.pod > $@, \
+	  "  GEN   $@")
+
 dvi: qemu-doc.dvi qemu-tech.dvi
 html: qemu-doc.html qemu-tech.html
 info: qemu-doc.info qemu-tech.info
 pdf: qemu-doc.pdf qemu-tech.pdf

 qemu-doc.dvi qemu-doc.html qemu-doc.info qemu-doc.pdf: \
-	qemu-img.texi qemu-nbd.texi qemu-options.texi qemu-option-trace.texi \
+	qemu-img.texi qemu-nbd.texi qemu-options.texi \
 	qemu-monitor.texi qemu-img-cmds.texi qemu-ga.texi \
 	qemu-monitor-info.texi

@@ -667,42 +651,3 @@ endif
 # Include automatically generated dependency files
 # Dependencies in Makefile.objs files come from our recursive subdir rules
 -include $(wildcard *.d tests/*.d)
-
-include $(SRC_PATH)/tests/docker/Makefile.include
-
-.PHONY: help
-help:
-	@echo  'Generic targets:'
-	@echo  '  all             - Build all'
-	@echo  '  dir/file.o      - Build specified target only'
-	@echo  '  install         - Install QEMU, documentation and tools'
-	@echo  '  ctags/TAGS      - Generate tags file for editors'
-	@echo  '  cscope          - Generate cscope index'
-	@echo  ''
-	@$(if $(TARGET_DIRS), \
-		echo 'Architecture specific targets:'; \
-		$(foreach t, $(TARGET_DIRS), \
-		printf "  %-30s - Build for %s\\n" $(patsubst %,subdir-%,$(t)) $(t);) \
-		echo '')
-	@echo  'Cleaning targets:'
-	@echo  '  clean           - Remove most generated files but keep the config'
-	@echo  '  distclean       - Remove all generated files'
-	@echo  '  dist            - Build a distributable tarball'
-	@echo  ''
-	@echo  'Test targets:'
-	@echo  '  check           - Run all tests (check-help for details)'
-	@echo  '  docker          - Help about targets running tests inside Docker containers'
-	@echo  ''
-	@echo  'Documentation targets:'
-	@echo  '  dvi html info pdf'
-	@echo  '                  - Build documentation in specified format'
-	@echo  ''
-ifdef CONFIG_WIN32
-	@echo  'Windows targets:'
-	@echo  '  installer       - Build NSIS-based installer for qemu-ga'
-ifdef QEMU_GA_MSI_ENABLED
-	@echo  '  msi             - Build MSI-based installer for qemu-ga'
-endif
-	@echo  ''
-endif
-	@echo  '  make V=0|1 [targets] 0 => quiet build (default), 1 => verbose build'
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -15,7 +15,6 @@ block-obj-$(CONFIG_POSIX) += aio-posix.o
 block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
 block-obj-y += qemu-io-cmds.o
-block-obj-$(CONFIG_REPLICATION) += replication.o

 block-obj-m = block/

@@ -53,6 +52,7 @@ common-obj-$(CONFIG_LINUX) += fsdev/
 common-obj-y += migration/
 common-obj-y += qemu-char.o #aio.o
 common-obj-y += page_cache.o
+common-obj-y += qjson.o

 common-obj-$(CONFIG_SPICE) += spice-qemu-char.o

@@ -116,47 +116,3 @@ qga-vss-dll-obj-y = qga/
 # contrib
 ivshmem-client-obj-y = contrib/ivshmem-client/
 ivshmem-server-obj-y = contrib/ivshmem-server/
-
-
-######################################################################
-trace-events-y = trace-events
-trace-events-y += util/trace-events
-trace-events-y += crypto/trace-events
-trace-events-y += io/trace-events
-trace-events-y += migration/trace-events
-trace-events-y += block/trace-events
-trace-events-y += hw/block/trace-events
-trace-events-y += hw/char/trace-events
-trace-events-y += hw/intc/trace-events
-trace-events-y += hw/net/trace-events
-trace-events-y += hw/virtio/trace-events
-trace-events-y += hw/audio/trace-events
-trace-events-y += hw/misc/trace-events
-trace-events-y += hw/usb/trace-events
-trace-events-y += hw/scsi/trace-events
-trace-events-y += hw/nvram/trace-events
-trace-events-y += hw/display/trace-events
-trace-events-y += hw/input/trace-events
-trace-events-y += hw/timer/trace-events
-trace-events-y += hw/dma/trace-events
-trace-events-y += hw/sparc/trace-events
-trace-events-y += hw/sd/trace-events
-trace-events-y += hw/isa/trace-events
-trace-events-y += hw/i386/trace-events
-trace-events-y += hw/9pfs/trace-events
-trace-events-y += hw/ppc/trace-events
-trace-events-y += hw/pci/trace-events
-trace-events-y += hw/s390x/trace-events
-trace-events-y += hw/vfio/trace-events
-trace-events-y += hw/acpi/trace-events
-trace-events-y += hw/arm/trace-events
-trace-events-y += hw/alpha/trace-events
-trace-events-y += ui/trace-events
-trace-events-y += audio/trace-events
-trace-events-y += net/trace-events
-trace-events-y += target-i386/trace-events
-trace-events-y += target-sparc/trace-events
-trace-events-y += target-s390x/trace-events
-trace-events-y += target-ppc/trace-events
-trace-events-y += qom/trace-events
-trace-events-y += linux-user/trace-events
--- a/Makefile.target
+++ b/Makefile.target
@@ -36,6 +36,10 @@ endif
 PROGS=$(QEMU_PROG) $(QEMU_PROGW)
 STPFILES=

+ifdef CONFIG_LINUX_USER
+PROGS+=$(QEMU_PROG)-binfmt
+endif
+
 config-target.h: config-target.h-timestamp
 config-target.h-timestamp: config-target.mak

@@ -48,7 +52,7 @@ else
 TARGET_TYPE=system
 endif

-$(QEMU_PROG).stp-installed: $(BUILD_DIR)/trace-events-all
+$(QEMU_PROG).stp-installed: $(SRC_PATH)/trace-events
 	$(call quiet-command,$(TRACETOOL) \
 		--format=stap \
 		--backends=$(TRACE_BACKENDS) \
@@ -57,7 +61,7 @@ $(QEMU_PROG).stp-installed: $(BUILD_DIR)/trace-events-all
 		--target-type=$(TARGET_TYPE) \
 		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG).stp-installed")

-$(QEMU_PROG).stp: $(BUILD_DIR)/trace-events-all
+$(QEMU_PROG).stp: $(SRC_PATH)/trace-events
 	$(call quiet-command,$(TRACETOOL) \
 		--format=stap \
 		--backends=$(TRACE_BACKENDS) \
@@ -66,7 +70,7 @@ $(QEMU_PROG).stp: $(BUILD_DIR)/trace-events-all
 		--target-type=$(TARGET_TYPE) \
 		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG).stp")

-$(QEMU_PROG)-simpletrace.stp: $(BUILD_DIR)/trace-events-all
+$(QEMU_PROG)-simpletrace.stp: $(SRC_PATH)/trace-events
 	$(call quiet-command,$(TRACETOOL) \
 		--format=simpletrace-stap \
 		--backends=$(TRACE_BACKENDS) \
@@ -108,13 +112,13 @@ obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/dpd/decimal128.o

 ifdef CONFIG_LINUX_USER

-QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) \
-             -I$(SRC_PATH)/linux-user/host/$(ARCH) \
-             -I$(SRC_PATH)/linux-user
+QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) -I$(SRC_PATH)/linux-user

 obj-y += linux-user/
 obj-y += gdbstub.o thunk.o user-exec.o

+obj-binfmt-y += linux-user/
+
 endif #CONFIG_LINUX_USER

 #########################################################
@@ -156,14 +160,18 @@ else
 obj-y += hw/$(TARGET_BASE_ARCH)/
 endif

-GENERATED_HEADERS += hmp-commands.h hmp-commands-info.h
+GENERATED_HEADERS += hmp-commands.h hmp-commands-info.h qmp-commands-old.h

 endif # CONFIG_SOFTMMU

 # Workaround for http://gcc.gnu.org/PR55489, see configure.
 %/translate.o: QEMU_CFLAGS += $(TRANSLATE_OPT_CFLAGS)

+ifdef CONFIG_LINUX_USER
+dummy := $(call unnest-vars,,obj-y obj-binfmt-y)
+else
 dummy := $(call unnest-vars,,obj-y)
+endif
 all-obj-y := $(obj-y)

 target-obj-y :=
@@ -200,19 +208,25 @@ ifdef CONFIG_DARWIN
 	$(call quiet-command,SetFile -a C $@,"  SETFILE $(TARGET_DIR)$@")
 endif

+$(QEMU_PROG)-binfmt: $(obj-binfmt-y)
+	$(call LINK,$^)
+
 gdbstub-xml.c: $(TARGET_XML_FILES) $(SRC_PATH)/scripts/feature_to_c.sh
 	$(call quiet-command,rm -f $@ && $(SHELL) $(SRC_PATH)/scripts/feature_to_c.sh $@ $(TARGET_XML_FILES),"  GEN   $(TARGET_DIR)$@")

-hmp-commands.h: $(SRC_PATH)/hmp-commands.hx $(SRC_PATH)/scripts/hxtool
+hmp-commands.h: $(SRC_PATH)/hmp-commands.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $(TARGET_DIR)$@")

-hmp-commands-info.h: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxtool
+hmp-commands-info.h: $(SRC_PATH)/hmp-commands-info.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $(TARGET_DIR)$@")

-clean: clean-target
+qmp-commands-old.h: $(SRC_PATH)/qmp-commands.hx
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $(TARGET_DIR)$@")
+
+clean:
 	rm -f *.a *~ $(PROGS)
 	rm -f $(shell find . -name '*.[od]')
-	rm -f hmp-commands.h gdbstub-xml.c
+	rm -f hmp-commands.h qmp-commands-old.h gdbstub-xml.c
 ifdef CONFIG_TRACE_SYSTEMTAP
 	rm -f *.stp
 endif
--- a/2
+++ b/2
@@ -1 +1 @@
-2.7.50
+2.6.2
--- a/accel.c
+++ b/accel.c
@@ -77,7 +77,7 @@ static int accel_init_machine(AccelClass *acc, MachineState *ms)
    return ret;
 }

-void configure_accelerator(MachineState *ms)
+int configure_accelerator(MachineState *ms)
 {
    const char *p;
    char buf[10];
@@ -128,6 +128,8 @@ void configure_accelerator(MachineState *ms)
    if (init_failed) {
        fprintf(stderr, "Back to %s accelerator.\n", acc->name);
    }
+
+    return !accel_initialised;
 }


--- a/aio-posix.c
+++ b/aio-posix.c
@@ -485,13 +485,12 @@ bool aio_poll(AioContext *ctx, bool blocking)
    return progress;
 }

-void aio_context_setup(AioContext *ctx)
+void aio_context_setup(AioContext *ctx, Error **errp)
 {
 #ifdef CONFIG_EPOLL_CREATE1
    assert(!ctx->epollfd);
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
-        fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
        ctx->epoll_available = false;
    } else {
        ctx->epoll_available = true;
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -371,6 +371,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
    return progress;
 }

-void aio_context_setup(AioContext *ctx)
+void aio_context_setup(AioContext *ctx, Error **errp)
 {
 }
--- a/arch_init.c
+++ b/arch_init.c
@@ -22,8 +22,6 @@
 * THE SOFTWARE.
 */
 #include "qemu/osdep.h"
-#include "qemu-common.h"
-#include "cpu.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/arch_init.h"
 #include "hw/pci/pci.h"
@@ -255,6 +253,13 @@ void do_smbios_option(QemuOpts *opts)
 #endif
 }

+void cpudef_init(void)
+{
+#if defined(cpudef_setup)
+    cpudef_setup(); /* parse cpu definitions in target config file */
+#endif
+}
+
 int kvm_available(void)
 {
 #ifdef CONFIG_KVM
--- a/async.c
+++ b/async.c
@@ -29,7 +29,6 @@
 #include "block/thread-pool.h"
 #include "qemu/main-loop.h"
 #include "qemu/atomic.h"
-#include "block/raw-aio.h"

 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -218,7 +217,7 @@ aio_ctx_check(GSource *source)
    for (bh = ctx->first_bh; bh; bh = bh->next) {
        if (!bh->deleted && bh->scheduled) {
            return true;
-        }
+	}
    }
    return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0);
 }
@@ -243,14 +242,6 @@ aio_ctx_finalize(GSource     *source)
    qemu_bh_delete(ctx->notify_dummy_bh);
    thread_pool_free(ctx->thread_pool);

-#ifdef CONFIG_LINUX_AIO
-    if (ctx->linux_aio) {
-        laio_detach_aio_context(ctx->linux_aio, ctx);
-        laio_cleanup(ctx->linux_aio);
-        ctx->linux_aio = NULL;
-    }
-#endif
-
    qemu_mutex_lock(&ctx->bh_lock);
    while (ctx->first_bh) {
        QEMUBH *next = ctx->first_bh->next;
@@ -291,17 +282,6 @@ ThreadPool *aio_get_thread_pool(AioContext *ctx)
    return ctx->thread_pool;
 }

-#ifdef CONFIG_LINUX_AIO
-LinuxAioState *aio_get_linux_aio(AioContext *ctx)
-{
-    if (!ctx->linux_aio) {
-        ctx->linux_aio = laio_init();
-        laio_attach_aio_context(ctx->linux_aio, ctx);
-    }
-    return ctx->linux_aio;
-}
-#endif
-
 void aio_notify(AioContext *ctx)
 {
    /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
@@ -347,10 +327,14 @@ AioContext *aio_context_new(Error **errp)
 {
    int ret;
    AioContext *ctx;
+    Error *local_err = NULL;

    ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
-    aio_context_setup(ctx);
-
+    aio_context_setup(ctx, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        goto fail;
+    }
    ret = event_notifier_init(&ctx->notifier, false);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to initialize event notifier");
@@ -361,9 +345,6 @@ AioContext *aio_context_new(Error **errp)
                           false,
                           (EventNotifierHandler *)
                           event_notifier_dummy_cb);
-#ifdef CONFIG_LINUX_AIO
-    ctx->linux_aio = NULL;
-#endif
    ctx->thread_pool = NULL;
    qemu_mutex_init(&ctx->bh_lock);
    rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
--- a/audio/audio.c
+++ b/audio/audio.c
@@ -1131,6 +1131,8 @@ static void audio_timer (void *opaque)
 */
 int AUD_write (SWVoiceOut *sw, void *buf, int size)
 {
+    int bytes;
+
    if (!sw) {
        /* XXX: Consider options */
        return size;
@@ -1141,11 +1143,14 @@ int AUD_write (SWVoiceOut *sw, void *buf, int size)
        return 0;
    }

-    return sw->hw->pcm_ops->write(sw, buf, size);
+    bytes = sw->hw->pcm_ops->write (sw, buf, size);
+    return bytes;
 }

 int AUD_read (SWVoiceIn *sw, void *buf, int size)
 {
+    int bytes;
+
    if (!sw) {
        /* XXX: Consider options */
        return size;
@@ -1156,7 +1161,8 @@ int AUD_read (SWVoiceIn *sw, void *buf, int size)
        return 0;
    }

-    return sw->hw->pcm_ops->read(sw, buf, size);
+    bytes = sw->hw->pcm_ops->read (sw, buf, size);
+    return bytes;
 }

 int AUD_get_buffer_size_out (SWVoiceOut *sw)
@@ -1739,21 +1745,13 @@ static void audio_vm_change_state_handler (void *opaque, int running,
    audio_reset_timer (s);
 }

-static bool is_cleaning_up;
-
-bool audio_is_cleaning_up(void)
-{
-    return is_cleaning_up;
-}
-
-void audio_cleanup(void)
+static void audio_atexit (void)
 {
    AudioState *s = &glob_audio_state;
-    HWVoiceOut *hwo, *hwon;
-    HWVoiceIn *hwi, *hwin;
+    HWVoiceOut *hwo = NULL;
+    HWVoiceIn *hwi = NULL;

-    is_cleaning_up = true;
-    QLIST_FOREACH_SAFE(hwo, &glob_audio_state.hw_head_out, entries, hwon) {
+    while ((hwo = audio_pcm_hw_find_any_out (hwo))) {
        SWVoiceCap *sc;

        if (hwo->enabled) {
@@ -1769,20 +1767,17 @@ void audio_cleanup(void)
                cb->ops.destroy (cb->opaque);
            }
        }
-        QLIST_REMOVE(hwo, entries);
    }

-    QLIST_FOREACH_SAFE(hwi, &glob_audio_state.hw_head_in, entries, hwin) {
+    while ((hwi = audio_pcm_hw_find_any_in (hwi))) {
        if (hwi->enabled) {
            hwi->pcm_ops->ctl_in (hwi, VOICE_DISABLE);
        }
        hwi->pcm_ops->fini_in (hwi);
-        QLIST_REMOVE(hwi, entries);
    }

    if (s->drv) {
        s->drv->fini (s->drv_opaque);
-        s->drv = NULL;
    }
 }

@@ -1810,7 +1805,7 @@ static void audio_init (void)
    QLIST_INIT (&s->hw_head_out);
    QLIST_INIT (&s->hw_head_in);
    QLIST_INIT (&s->cap_head);
-    atexit(audio_cleanup);
+    atexit (audio_atexit);

    s->ts = timer_new_ns(QEMU_CLOCK_VIRTUAL, audio_timer, s);

@@ -1977,7 +1972,8 @@ CaptureVoiceOut *AUD_add_capture (
        QLIST_INSERT_HEAD (&s->cap_head, cap, entries);
        QLIST_INSERT_HEAD (&cap->cb_head, cb, entries);

-        QLIST_FOREACH(hw, &glob_audio_state.hw_head_out, entries) {
+        hw = NULL;
+        while ((hw = audio_pcm_hw_find_any_out (hw))) {
            audio_attach_capture (hw);
        }
        return cap;
@@ -2023,6 +2019,8 @@ void AUD_del_capture (CaptureVoiceOut *cap, void *cb_opaque)
                    sw = sw1;
                }
                QLIST_REMOVE (cap, entries);
+                g_free (cap->hw.mix_buf);
+                g_free (cap->buf);
                g_free (cap);
            }
            return;
--- a/audio/audio.h
+++ b/audio/audio.h
@@ -21,7 +21,6 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-
 #ifndef QEMU_AUDIO_H
 #define QEMU_AUDIO_H

@@ -163,7 +162,4 @@ static inline void *advance (void *p, int incr)
 int wav_start_capture (CaptureState *s, const char *path, int freq,
                       int bits, int nchannels);

-bool audio_is_cleaning_up(void);
-void audio_cleanup(void);
-
-#endif /* QEMU_AUDIO_H */
+#endif  /* audio.h */
--- a/audio/audio_int.h
+++ b/audio/audio_int.h
@@ -21,7 +21,6 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-
 #ifndef QEMU_AUDIO_INT_H
 #define QEMU_AUDIO_INT_H

@@ -258,4 +257,4 @@ static inline int audio_ring_dist (int dst, int src, int len)
 #define AUDIO_FUNC __FILE__ ":" AUDIO_STRINGIFY (__LINE__)
 #endif

-#endif /* QEMU_AUDIO_INT_H */
+#endif /* audio_int.h */
--- a/audio/audio_pt_int.h
+++ b/audio/audio_pt_int.h
@@ -19,4 +19,4 @@ int audio_pt_wait (struct audio_pt *, const char *);
 int audio_pt_unlock_and_signal (struct audio_pt *, const char *);
 int audio_pt_join (struct audio_pt *, void **, const char *);

-#endif /* QEMU_AUDIO_PT_INT_H */
+#endif /* audio_pt_int.h */
--- a/audio/coreaudio.c
+++ b/audio/coreaudio.c
@@ -36,6 +36,8 @@
 #define MAC_OS_X_VERSION_10_6 1060
 #endif

+static int isAtexit;
+
 typedef struct {
    int buffer_frames;
    int nbuffers;
@@ -376,6 +378,11 @@ static inline UInt32 isPlaying (AudioDeviceID outputDeviceID)
    return result;
 }

+static void coreaudio_atexit (void)
+{
+    isAtexit = 1;
+}
+
 static int coreaudio_lock (coreaudioVoiceOut *core, const char *fn_name)
 {
    int err;
@@ -623,7 +630,7 @@ static void coreaudio_fini_out (HWVoiceOut *hw)
    int err;
    coreaudioVoiceOut *core = (coreaudioVoiceOut *) hw;

-    if (!audio_is_cleaning_up()) {
+    if (!isAtexit) {
        /* stop playback */
        if (isPlaying(core->outputDeviceID)) {
            status = AudioDeviceStop(core->outputDeviceID, core->ioprocid);
@@ -666,7 +673,7 @@ static int coreaudio_ctl_out (HWVoiceOut *hw, int cmd, ...)

    case VOICE_DISABLE:
        /* stop playback */
-        if (!audio_is_cleaning_up()) {
+        if (!isAtexit) {
            if (isPlaying(core->outputDeviceID)) {
                status = AudioDeviceStop(core->outputDeviceID,
                                         core->ioprocid);
@@ -690,6 +697,7 @@ static void *coreaudio_audio_init (void)
    CoreaudioConf *conf = g_malloc(sizeof(CoreaudioConf));
    *conf = glob_conf;

+    atexit(coreaudio_atexit);
    return conf;
 }

--- a/audio/mixeng.c
+++ b/audio/mixeng.c
@@ -24,7 +24,6 @@
 */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
-#include "qemu/bswap.h"
 #include "audio.h"

 #define AUDIO_CAP "mixeng"
--- a/audio/mixeng.h
+++ b/audio/mixeng.h
@@ -21,7 +21,6 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-
 #ifndef QEMU_MIXENG_H
 #define QEMU_MIXENG_H

@@ -49,4 +48,4 @@ void st_rate_stop (void *opaque);
 void mixeng_clear (struct st_sample *buf, int len);
 void mixeng_volume (struct st_sample *buf, int len, struct mixeng_volume *vol);

-#endif /* QEMU_MIXENG_H */
+#endif  /* mixeng.h */
--- a/audio/noaudio.c
+++ b/audio/noaudio.c
@@ -23,7 +23,6 @@
 */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
-#include "qemu/host-utils.h"
 #include "audio.h"
 #include "qemu/timer.h"

--- a/audio/ossaudio.c
+++ b/audio/ossaudio.c
@@ -22,6 +22,7 @@
 * THE SOFTWARE.
 */
 #include "qemu/osdep.h"
+#include <sys/mman.h>
 #include <sys/ioctl.h>
 #include <sys/soundcard.h>
 #include "qemu-common.h"
--- a/audio/paaudio.c
+++ b/audio/paaudio.c
@@ -781,22 +781,23 @@ static int qpa_ctl_in (HWVoiceIn *hw, int cmd, ...)

            pa_threaded_mainloop_lock (g->mainloop);

-            op = pa_context_set_source_output_volume (g->context,
-                pa_stream_get_index (pa->stream),
+            /* FIXME: use the upcoming "set_source_output_{volume,mute}" */
+            op = pa_context_set_source_volume_by_index (g->context,
+                pa_stream_get_device_index (pa->stream),
                &v, NULL, NULL);
            if (!op) {
                qpa_logerr (pa_context_errno (g->context),
-                            "set_source_output_volume() failed\n");
+                            "set_source_volume() failed\n");
            } else {
                pa_operation_unref(op);
            }

-            op = pa_context_set_source_output_mute (g->context,
+            op = pa_context_set_source_mute_by_index (g->context,
                pa_stream_get_index (pa->stream),
                sw->vol.mute, NULL, NULL);
            if (!op) {
                qpa_logerr (pa_context_errno (g->context),
-                            "set_source_output_mute() failed\n");
+                            "set_source_mute() failed\n");
            } else {
                pa_operation_unref (op);
            }
--- a/audio/spiceaudio.c
+++ b/audio/spiceaudio.c
@@ -19,7 +19,6 @@

 #include "qemu/osdep.h"
 #include "hw/hw.h"
-#include "qemu/host-utils.h"
 #include "qemu/error-report.h"
 #include "qemu/timer.h"
 #include "ui/qemu-spice.h"
--- a/audio/trace-events
+++ b/audio/trace-events
@@ -1,17 +0,0 @@
-# See docs/tracing.txt for syntax documentation.
-
-# audio/alsaaudio.c
-alsa_revents(int revents) "revents = %d"
-alsa_pollout(int i, int fd) "i = %d fd = %d"
-alsa_set_handler(int events, int index, int fd, int err) "events=%#x index=%d fd=%d err=%d"
-alsa_wrote_zero(int len) "Failed to write %d frames (wrote zero)"
-alsa_read_zero(long len) "Failed to read %ld frames (read zero)"
-alsa_xrun_out(void) "Recovering from playback xrun"
-alsa_xrun_in(void) "Recovering from capture xrun"
-alsa_resume_out(void) "Resuming suspended output stream"
-alsa_resume_in(void) "Resuming suspended input stream"
-alsa_no_frames(int state) "No frames available and ALSA state is %d"
-
-# audio/ossaudio.c
-oss_version(int version) "OSS version = %#x"
-oss_invalid_available_size(int size, int bufsize) "Invalid available size, size=%d bufsize=%d"
--- a/audio/wavaudio.c
+++ b/audio/wavaudio.c
@@ -22,7 +22,7 @@
 * THE SOFTWARE.
 */
 #include "qemu/osdep.h"
-#include "qemu/host-utils.h"
+#include "hw/hw.h"
 #include "qemu/timer.h"
 #include "audio.h"

--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -64,14 +64,6 @@ out:
    error_propagate(errp, local_err);
 }

-static uint16List **host_memory_append_node(uint16List **node,
-                                            unsigned long value)
-{
-     *node = g_malloc0(sizeof(**node));
-     (*node)->value = value;
-     return &(*node)->next;
-}
-
 static void
 host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
@@ -82,23 +74,25 @@ host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
    unsigned long value;

    value = find_first_bit(backend->host_nodes, MAX_NODES);
-
-    node = host_memory_append_node(node, value);
-
    if (value == MAX_NODES) {
-        goto out;
+        return;
    }

+    *node = g_malloc0(sizeof(**node));
+    (*node)->value = value;
+    node = &(*node)->next;
+
    do {
        value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1);
        if (value == MAX_NODES) {
            break;
        }

-        node = host_memory_append_node(node, value);
+        *node = g_malloc0(sizeof(**node));
+        (*node)->value = value;
+        node = &(*node)->next;
    } while (true);

-out:
    visit_type_uint16List(v, name, &host_nodes, errp);
 }

@@ -203,7 +197,6 @@ static bool host_memory_backend_get_prealloc(Object *obj, Error **errp)
 static void host_memory_backend_set_prealloc(Object *obj, bool value,
                                             Error **errp)
 {
-    Error *local_err = NULL;
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (backend->force_prealloc) {
@@ -224,11 +217,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

-        os_mem_prealloc(fd, ptr, sz, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            return;
-        }
+        os_mem_prealloc(fd, ptr, sz);
        backend->prealloc = true;
    }
 }
@@ -269,16 +258,6 @@ host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp)
    return memory_region_size(&backend->mr) ? &backend->mr : NULL;
 }

-void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped)
-{
-    backend->is_mapped = mapped;
-}
-
-bool host_memory_backend_is_mapped(HostMemoryBackend *backend)
-{
-    return backend->is_mapped;
-}
-
 static void
 host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
 {
@@ -291,7 +270,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
    if (bc->alloc) {
        bc->alloc(backend, &local_err);
        if (local_err) {
-            goto out;
+            error_propagate(errp, local_err);
+            return;
        }

        ptr = memory_region_get_ram_ptr(&backend->mr);
@@ -347,21 +327,18 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
         * specified NUMA policy in place.
         */
        if (backend->prealloc) {
-            os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
-                            &local_err);
-            if (local_err) {
-                goto out;
-            }
+            os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz);
        }
    }
-out:
-    error_propagate(errp, local_err);
 }

 static bool
 host_memory_backend_can_be_deleted(UserCreatable *uc, Error **errp)
 {
-    if (host_memory_backend_is_mapped(MEMORY_BACKEND(uc))) {
+    MemoryRegion *mr;
+
+    mr = host_memory_backend_get_memory(MEMORY_BACKEND(uc), errp);
+    if (memory_region_is_mapped(mr)) {
        return false;
    } else {
        return true;
--- a/backends/msmouse.c
+++ b/backends/msmouse.c
@@ -25,51 +25,16 @@
 #include "qemu-common.h"
 #include "sysemu/char.h"
 #include "ui/console.h"
-#include "ui/input.h"

 #define MSMOUSE_LO6(n) ((n) & 0x3f)
 #define MSMOUSE_HI2(n) (((n) & 0xc0) >> 6)

-typedef struct {
-    CharDriverState *chr;
-    QemuInputHandlerState *hs;
-    int axis[INPUT_AXIS__MAX];
-    bool btns[INPUT_BUTTON__MAX];
-    bool btnc[INPUT_BUTTON__MAX];
-    uint8_t outbuf[32];
-    int outlen;
-} MouseState;
-
-static void msmouse_chr_accept_input(CharDriverState *chr)
+static void msmouse_event(void *opaque,
+                          int dx, int dy, int dz, int buttons_state)
 {
-    MouseState *mouse = chr->opaque;
-    int len;
+    CharDriverState *chr = (CharDriverState *)opaque;

-    len = qemu_chr_be_can_write(chr);
-    if (len > mouse->outlen) {
-        len = mouse->outlen;
-    }
-    if (!len) {
-        return;
-    }
-
-    qemu_chr_be_write(chr, mouse->outbuf, len);
-    mouse->outlen -= len;
-    if (mouse->outlen) {
-        memmove(mouse->outbuf, mouse->outbuf + len, mouse->outlen);
-    }
-}
-
-static void msmouse_queue_event(MouseState *mouse)
-{
    unsigned char bytes[4] = { 0x40, 0x00, 0x00, 0x00 };
-    int dx, dy, count = 3;
-
-    dx = mouse->axis[INPUT_AXIS_X];
-    mouse->axis[INPUT_AXIS_X] = 0;
-
-    dy = mouse->axis[INPUT_AXIS_Y];
-    mouse->axis[INPUT_AXIS_Y] = 0;

    /* Movement deltas */
    bytes[0] |= (MSMOUSE_HI2(dy) << 2) | MSMOUSE_HI2(dx);
@@ -77,54 +42,14 @@ static void msmouse_queue_event(MouseState *mouse)
    bytes[2] |= MSMOUSE_LO6(dy);

    /* Buttons */
-    bytes[0] |= (mouse->btns[INPUT_BUTTON_LEFT]   ? 0x20 : 0x00);
-    bytes[0] |= (mouse->btns[INPUT_BUTTON_RIGHT]  ? 0x10 : 0x00);
-    if (mouse->btns[INPUT_BUTTON_MIDDLE] ||
-        mouse->btnc[INPUT_BUTTON_MIDDLE]) {
-        bytes[3] |= (mouse->btns[INPUT_BUTTON_MIDDLE] ? 0x20 : 0x00);
-        mouse->btnc[INPUT_BUTTON_MIDDLE] = false;
-        count = 4;
-    }
+    bytes[0] |= (buttons_state & 0x01 ? 0x20 : 0x00);
+    bytes[0] |= (buttons_state & 0x02 ? 0x10 : 0x00);
+    bytes[3] |= (buttons_state & 0x04 ? 0x20 : 0x00);

-    if (mouse->outlen <= sizeof(mouse->outbuf) - count) {
-        memcpy(mouse->outbuf + mouse->outlen, bytes, count);
-        mouse->outlen += count;
-    } else {
-        /* queue full -> drop event */
-    }
-}
-
-static void msmouse_input_event(DeviceState *dev, QemuConsole *src,
-                                InputEvent *evt)
-{
-    MouseState *mouse = (MouseState *)dev;
-    InputMoveEvent *move;
-    InputBtnEvent *btn;
-
-    switch (evt->type) {
-    case INPUT_EVENT_KIND_REL:
-        move = evt->u.rel.data;
-        mouse->axis[move->axis] += move->value;
-        break;
-
-    case INPUT_EVENT_KIND_BTN:
-        btn = evt->u.btn.data;
-        mouse->btns[btn->button] = btn->down;
-        mouse->btnc[btn->button] = true;
-        break;
-
-    default:
-        /* keep gcc happy */
-        break;
-    }
-}
-
-static void msmouse_input_sync(DeviceState *dev)
-{
-    MouseState *mouse = (MouseState *)dev;
-
-    msmouse_queue_event(mouse);
-    msmouse_chr_accept_input(mouse->chr);
+    /* We always send the packet of, so that we do not have to keep track
+       of previous state of the middle button. This can potentially confuse
+       some very old drivers for two button mice though. */
+    qemu_chr_be_write(chr, bytes, 4);
 }

 static int msmouse_chr_write (struct CharDriverState *s, const uint8_t *buf, int len)
@@ -135,26 +60,15 @@ static int msmouse_chr_write (struct CharDriverState *s, const uint8_t *buf, int

 static void msmouse_chr_close (struct CharDriverState *chr)
 {
-    MouseState *mouse = chr->opaque;
-
-    qemu_input_handler_unregister(mouse->hs);
-    g_free(mouse);
+    g_free (chr);
 }

-static QemuInputHandler msmouse_handler = {
-    .name  = "QEMU Microsoft Mouse",
-    .mask  = INPUT_EVENT_MASK_BTN | INPUT_EVENT_MASK_REL,
-    .event = msmouse_input_event,
-    .sync  = msmouse_input_sync,
-};
-
 static CharDriverState *qemu_chr_open_msmouse(const char *id,
                                              ChardevBackend *backend,
                                              ChardevReturn *ret,
                                              Error **errp)
 {
    ChardevCommon *common = backend->u.msmouse.data;
-    MouseState *mouse;
    CharDriverState *chr;

    chr = qemu_chr_alloc(common, errp);
@@ -163,15 +77,9 @@ static CharDriverState *qemu_chr_open_msmouse(const char *id,
    }
    chr->chr_write = msmouse_chr_write;
    chr->chr_close = msmouse_chr_close;
-    chr->chr_accept_input = msmouse_chr_accept_input;
    chr->explicit_be_open = true;

-    mouse = g_new0(MouseState, 1);
-    mouse->hs = qemu_input_handler_register((DeviceState *)mouse,
-                                            &msmouse_handler);
-
-    mouse->chr = chr;
-    chr->opaque = mouse;
+    qemu_add_mouse_event_handler(msmouse_event, chr, 0, "QEMU Microsoft Mouse");

    return chr;
 }
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@@ -41,9 +41,7 @@ static void rng_egd_request_entropy(RngBackend *b, RngRequest *req)
        header[0] = 0x02;
        header[1] = len;

-        /* XXX this blocks entire thread. Rewrite to use
-         * qemu_chr_fe_write and background I/O callbacks */
-        qemu_chr_fe_write_all(s->chr, header, sizeof(header));
+        qemu_chr_fe_write(s->chr, header, sizeof(header));

        size -= len;
    }
--- a/backends/rng-random.c
+++ b/backends/rng-random.c
@@ -17,7 +17,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/main-loop.h"

-struct RngRandom
+struct RndRandom
 {
    RngBackend parent;

@@ -34,7 +34,7 @@ struct RngRandom

 static void entropy_available(void *opaque)
 {
-    RngRandom *s = RNG_RANDOM(opaque);
+    RndRandom *s = RNG_RANDOM(opaque);

    while (!QSIMPLEQ_EMPTY(&s->parent.requests)) {
        RngRequest *req = QSIMPLEQ_FIRST(&s->parent.requests);
@@ -57,7 +57,7 @@ static void entropy_available(void *opaque)

 static void rng_random_request_entropy(RngBackend *b, RngRequest *req)
 {
-    RngRandom *s = RNG_RANDOM(b);
+    RndRandom *s = RNG_RANDOM(b);

    if (QSIMPLEQ_EMPTY(&s->parent.requests)) {
        /* If there are no pending requests yet, we need to
@@ -68,7 +68,7 @@ static void rng_random_request_entropy(RngBackend *b, RngRequest *req)

 static void rng_random_opened(RngBackend *b, Error **errp)
 {
-    RngRandom *s = RNG_RANDOM(b);
+    RndRandom *s = RNG_RANDOM(b);

    if (s->filename == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
@@ -83,7 +83,7 @@ static void rng_random_opened(RngBackend *b, Error **errp)

 static char *rng_random_get_filename(Object *obj, Error **errp)
 {
-    RngRandom *s = RNG_RANDOM(obj);
+    RndRandom *s = RNG_RANDOM(obj);

    return g_strdup(s->filename);
 }
@@ -92,7 +92,7 @@ static void rng_random_set_filename(Object *obj, const char *filename,
                                 Error **errp)
 {
    RngBackend *b = RNG_BACKEND(obj);
-    RngRandom *s = RNG_RANDOM(obj);
+    RndRandom *s = RNG_RANDOM(obj);

    if (b->opened) {
        error_setg(errp, QERR_PERMISSION_DENIED);
@@ -105,7 +105,7 @@ static void rng_random_set_filename(Object *obj, const char *filename,

 static void rng_random_init(Object *obj)
 {
-    RngRandom *s = RNG_RANDOM(obj);
+    RndRandom *s = RNG_RANDOM(obj);

    object_property_add_str(obj, "filename",
                            rng_random_get_filename,
@@ -118,7 +118,7 @@ static void rng_random_init(Object *obj)

 static void rng_random_finalize(Object *obj)
 {
-    RngRandom *s = RNG_RANDOM(obj);
+    RndRandom *s = RNG_RANDOM(obj);

    if (s->fd != -1) {
        qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
@@ -139,7 +139,7 @@ static void rng_random_class_init(ObjectClass *klass, void *data)
 static const TypeInfo rng_random_info = {
    .name = TYPE_RNG_RANDOM,
    .parent = TYPE_RNG_BACKEND,
-    .instance_size = sizeof(RngRandom),
+    .instance_size = sizeof(RndRandom),
    .class_init = rng_random_class_init,
    .instance_init = rng_random_init,
    .instance_finalize = rng_random_finalize,
--- a/block.c
+++ b/block.c
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,15 +1,15 @@
-block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o dmg.o
+block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
-block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o
+block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
 block-obj-y += quorum.o
 block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o
 block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
-block-obj-y += null.o mirror.o commit.o io.o
+block-obj-y += null.o mirror.o io.o
 block-obj-y += throttle-groups.o

 block-obj-y += nbd.o nbd-client.o sheepdog.o
@@ -21,15 +21,16 @@ block-obj-$(CONFIG_GLUSTERFS) += gluster.o
 block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o dirty-bitmap.o
+block-obj-y += dictzip.o
+block-obj-y += tar.o
 block-obj-y += write-threshold.o
-block-obj-y += backup.o
-block-obj-$(CONFIG_REPLICATION) += replication.o

 block-obj-y += crypto.o

 common-obj-y += stream.o
+common-obj-y += commit.o
+common-obj-y += backup.o

-nfs.o-libs         := $(LIBNFS_LIBS)
 iscsi.o-cflags     := $(LIBISCSI_CFLAGS)
 iscsi.o-libs       := $(LIBISCSI_LIBS)
 curl.o-cflags      := $(CURL_CFLAGS)
@@ -41,6 +42,7 @@ gluster.o-libs     := $(GLUSTERFS_LIBS)
 ssh.o-cflags       := $(LIBSSH2_CFLAGS)
 ssh.o-libs         := $(LIBSSH2_LIBS)
 archipelago.o-libs := $(ARCHIPELAGO_LIBS)
+block-obj-m        += dmg.o
 dmg.o-libs         := $(BZIP2_LIBS)
 qcow.o-libs        := -lz
 linux-aio.o-libs   := -laio
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -974,9 +974,11 @@ err_exit2:

 static int64_t qemu_archipelago_getlength(BlockDriverState *bs)
 {
+    int64_t ret;
    BDRVArchipelagoState *s = bs->opaque;

-    return archipelago_volume_info(s);
+    ret = archipelago_volume_info(s);
+    return ret;
 }

 static int qemu_archipelago_truncate(BlockDriverState *bs, int64_t offset)
--- a/block/backup.c
+++ b/block/backup.c
@@ -17,7 +17,6 @@
 #include "block/block.h"
 #include "block/block_int.h"
 #include "block/blockjob.h"
-#include "block/block_backup.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
@@ -28,9 +27,16 @@
 #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
 #define SLICE_TIME 100000000ULL /* ns */

+typedef struct CowRequest {
+    int64_t start;
+    int64_t end;
+    QLIST_ENTRY(CowRequest) list;
+    CoQueue wait_queue; /* coroutines blocked on this request */
+} CowRequest;
+
 typedef struct BackupBlockJob {
    BlockJob common;
-    BlockBackend *target;
+    BlockDriverState *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
@@ -41,8 +47,6 @@ typedef struct BackupBlockJob {
    uint64_t sectors_read;
    unsigned long *done_bitmap;
    int64_t cluster_size;
-    bool compress;
-    NotifierWithReturn before_write;
    QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;

@@ -89,12 +93,12 @@ static void cow_request_end(CowRequest *req)
    qemu_co_queue_restart_all(&req->wait_queue);
 }

-static int coroutine_fn backup_do_cow(BackupBlockJob *job,
+static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read,
                                      bool is_write_notifier)
 {
-    BlockBackend *blk = job->common.blk;
+    BackupBlockJob *job = (BackupBlockJob *)bs->job;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
@@ -127,15 +131,20 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                start * sectors_per_cluster);

        if (!bounce_buffer) {
-            bounce_buffer = blk_blockalign(blk, job->cluster_size);
+            bounce_buffer = qemu_blockalign(bs, job->cluster_size);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

-        ret = blk_co_preadv(blk, start * job->cluster_size,
-                            bounce_qiov.size, &bounce_qiov,
-                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
+        if (is_write_notifier) {
+            ret = bdrv_co_readv_no_serialising(bs,
+                                           start * sectors_per_cluster,
+                                           n, &bounce_qiov);
+        } else {
+            ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
+                                &bounce_qiov);
+        }
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
@@ -145,12 +154,13 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
-            ret = blk_co_pwrite_zeroes(job->target, start * job->cluster_size,
-                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
+            ret = bdrv_co_write_zeroes(job->target,
+                                       start * sectors_per_cluster,
+                                       n, BDRV_REQ_MAY_UNMAP);
        } else {
-            ret = blk_co_pwritev(job->target, start * job->cluster_size,
-                                 bounce_qiov.size, &bounce_qiov,
-                                 job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
+            ret = bdrv_co_writev(job->target,
+                                 start * sectors_per_cluster, n,
+                                 &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
@@ -187,16 +197,14 @@ static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
 {
-    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

-    assert(req->bs == blk_bs(job->common.blk));
    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

-    return backup_do_cow(job, sector_num, nb_sectors, NULL, true);
+    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
 }

 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -210,10 +218,19 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }

+static void backup_iostatus_reset(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    if (s->target->blk) {
+        blk_iostatus_reset(s->target->blk);
+    }
+}
+
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
 {
    BdrvDirtyBitmap *bm;
-    BlockDriverState *bs = blk_bs(job->common.blk);
+    BlockDriverState *bs = job->common.bs;

    if (ret < 0 || block_job_is_cancelled(&job->common)) {
        /* Merge the successor back into the parent, delete nothing. */
@@ -242,82 +259,24 @@ static void backup_abort(BlockJob *job)
    }
 }

-static void backup_attached_aio_context(BlockJob *job, AioContext *aio_context)
-{
-    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
-
-    blk_set_aio_context(s->target, aio_context);
-}
-
-void backup_do_checkpoint(BlockJob *job, Error **errp)
-{
-    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
-    int64_t len;
-
-    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
-
-    if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
-        error_setg(errp, "The backup job only supports block checkpoint in"
-                   " sync=none mode");
-        return;
-    }
-
-    len = DIV_ROUND_UP(backup_job->common.len, backup_job->cluster_size);
-    bitmap_zero(backup_job->done_bitmap, len);
-}
-
-void backup_wait_for_overlapping_requests(BlockJob *job, int64_t sector_num,
-                                          int nb_sectors)
-{
-    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
-    int64_t sectors_per_cluster = cluster_size_sectors(backup_job);
-    int64_t start, end;
-
-    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
-
-    start = sector_num / sectors_per_cluster;
-    end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster);
-    wait_for_overlapping_requests(backup_job, start, end);
-}
-
-void backup_cow_request_begin(CowRequest *req, BlockJob *job,
-                              int64_t sector_num,
-                              int nb_sectors)
-{
-    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
-    int64_t sectors_per_cluster = cluster_size_sectors(backup_job);
-    int64_t start, end;
-
-    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
-
-    start = sector_num / sectors_per_cluster;
-    end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster);
-    cow_request_begin(req, backup_job, start, end);
-}
-
-void backup_cow_request_end(CowRequest *req)
-{
-    cow_request_end(req);
-}
-
 static const BlockJobDriver backup_job_driver = {
-    .instance_size          = sizeof(BackupBlockJob),
-    .job_type               = BLOCK_JOB_TYPE_BACKUP,
-    .set_speed              = backup_set_speed,
-    .commit                 = backup_commit,
-    .abort                  = backup_abort,
-    .attached_aio_context   = backup_attached_aio_context,
+    .instance_size  = sizeof(BackupBlockJob),
+    .job_type       = BLOCK_JOB_TYPE_BACKUP,
+    .set_speed      = backup_set_speed,
+    .iostatus_reset = backup_iostatus_reset,
+    .commit         = backup_commit,
+    .abort          = backup_abort,
 };

 static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
 {
    if (read) {
-        return block_job_error_action(&job->common, job->on_source_error,
-                                      true, error);
+        return block_job_error_action(&job->common, job->common.bs,
+                                      job->on_source_error, true, error);
    } else {
-        return block_job_error_action(&job->common, job->on_target_error,
-                                      false, error);
+        return block_job_error_action(&job->common, job->target,
+                                      job->on_target_error, false, error);
    }
 }

@@ -330,7 +289,7 @@ static void backup_complete(BlockJob *job, void *opaque)
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    BackupCompleteData *data = opaque;

-    blk_unref(s->target);
+    bdrv_unref(s->target);

    block_job_completed(job, data->ret);
    g_free(data);
@@ -372,6 +331,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
    int64_t end;
    int64_t last_cluster = -1;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
+    BlockDriverState *bs = job->common.bs;
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
@@ -393,7 +353,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                if (yield_and_check(job)) {
                    return ret;
                }
-                ret = backup_do_cow(job, cluster * sectors_per_cluster,
+                ret = backup_do_cow(bs, cluster * sectors_per_cluster,
                                    sectors_per_cluster, &error_is_read,
                                    false);
                if ((ret < 0) &&
@@ -426,8 +386,12 @@ static void coroutine_fn backup_run(void *opaque)
 {
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
-    BlockDriverState *bs = blk_bs(job->common.blk);
-    BlockBackend *target = job->target;
+    BlockDriverState *bs = job->common.bs;
+    BlockDriverState *target = job->target;
+    BlockdevOnError on_target_error = job->on_target_error;
+    NotifierWithReturn before_write = {
+        .notify = backup_before_write_notify,
+    };
    int64_t start, end;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int ret = 0;
@@ -440,14 +404,20 @@ static void coroutine_fn backup_run(void *opaque)

    job->done_bitmap = bitmap_new(end);

-    job->before_write.notify = backup_before_write_notify;
-    bdrv_add_before_write_notifier(bs, &job->before_write);
+    if (target->blk) {
+        blk_set_on_error(target->blk, on_target_error, on_target_error);
+        blk_iostatus_enable(target->blk);
+    }
+
+    bdrv_add_before_write_notifier(bs, &before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
-            block_job_yield(&job->common);
+            job->common.busy = false;
+            qemu_coroutine_yield();
+            job->common.busy = true;
        }
    } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        ret = backup_run_incremental(job);
@@ -491,7 +461,7 @@ static void coroutine_fn backup_run(void *opaque)
                }
            }
            /* FULL sync mode we copy the whole drive. */
-            ret = backup_do_cow(job, start * sectors_per_cluster,
+            ret = backup_do_cow(bs, start * sectors_per_cluster,
                                sectors_per_cluster, &error_is_read, false);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
@@ -507,24 +477,26 @@ static void coroutine_fn backup_run(void *opaque)
        }
    }

-    notifier_with_return_remove(&job->before_write);
+    notifier_with_return_remove(&before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
    g_free(job->done_bitmap);

-    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);
+    if (target->blk) {
+        blk_iostatus_disable(target->blk);
+    }
+    bdrv_op_unblock_all(target, job->common.blocker);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&job->common, backup_complete, data);
 }

-void backup_start(const char *job_id, BlockDriverState *bs,
-                  BlockDriverState *target, int64_t speed,
-                  MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
-                  bool compress,
+void backup_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed, MirrorSyncMode sync_mode,
+                  BdrvDirtyBitmap *sync_bitmap,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockCompletionFunc *cb, void *opaque,
@@ -537,12 +509,20 @@ void backup_start(const char *job_id, BlockDriverState *bs,

    assert(bs);
    assert(target);
+    assert(cb);

    if (bs == target) {
        error_setg(errp, "Source and target cannot be the same");
        return;
    }

+    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
+        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
+        return;
+    }
+
    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
@@ -555,12 +535,6 @@ void backup_start(const char *job_id, BlockDriverState *bs,
        return;
    }

-    if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
-        error_setg(errp, "Compression is not supported for this drive %s",
-                   bdrv_get_device_name(target));
-        return;
-    }
-
    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
        return;
    }
@@ -595,26 +569,22 @@ void backup_start(const char *job_id, BlockDriverState *bs,
        goto error;
    }

-    job = block_job_create(job_id, &backup_job_driver, bs, speed,
-                           cb, opaque, errp);
+    job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp);
    if (!job) {
        goto error;
    }

-    job->target = blk_new();
-    blk_insert_bs(job->target, target);
-
    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
+    job->target = target;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
-    job->compress = compress;

    /* If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible. */
-    ret = bdrv_get_info(target, &bdi);
+    ret = bdrv_get_info(job->target, &bdi);
    if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
@@ -631,9 +601,9 @@ void backup_start(const char *job_id, BlockDriverState *bs,

    bdrv_op_block_all(target, job->common.blocker);
    job->common.len = len;
-    job->common.co = qemu_coroutine_create(backup_run, job);
+    job->common.co = qemu_coroutine_create(backup_run);
    block_job_txn_add_job(txn, &job->common);
-    qemu_coroutine_enter(job->common.co);
+    qemu_coroutine_enter(job->common.co, job);
    return;

 error:
@@ -641,7 +611,6 @@ void backup_start(const char *job_id, BlockDriverState *bs,
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
    if (job) {
-        blk_unref(job->target);
        block_job_unref(&job->common);
    }
 }
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -37,10 +37,6 @@
 typedef struct BDRVBlkdebugState {
    int state;
    int new_state;
-    int align;
-
-    /* For blkdebug_refresh_filename() */
-    char *config_file;

    QLIST_HEAD(, BlkdebugRule) rules[BLKDBG__MAX];
    QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
@@ -354,6 +350,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
    BDRVBlkdebugState *s = bs->opaque;
    QemuOpts *opts;
    Error *local_err = NULL;
+    const char *config;
    uint64_t align;
    int ret;

@@ -366,8 +363,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* Read rules from config file or command line options */
-    s->config_file = g_strdup(qemu_opt_get(opts, "config"));
-    ret = read_config(s, s->config_file, options, errp);
+    config = qemu_opt_get(opts, "config");
+    ret = read_config(s, config, options, errp);
    if (ret) {
        goto out;
    }
@@ -385,10 +382,10 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* Set request alignment */
-    align = qemu_opt_get_size(opts, "align", 0);
-    if (align < INT_MAX && is_power_of_2(align)) {
-        s->align = align;
-    } else if (align) {
+    align = qemu_opt_get_size(opts, "align", bs->request_alignment);
+    if (align > 0 && align < INT_MAX && !(align & (align - 1))) {
+        bs->request_alignment = align;
+    } else {
        error_setg(errp, "Invalid alignment");
        ret = -EINVAL;
        goto fail_unref;
@@ -400,9 +397,6 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
 fail_unref:
    bdrv_unref_child(bs, bs->file);
 out:
-    if (ret < 0) {
-        g_free(s->config_file);
-    }
    qemu_opts_del(opts);
    return ret;
 }
@@ -462,7 +456,7 @@ static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
        return inject_error(bs, cb, opaque, rule);
    }

-    return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors,
+    return bdrv_aio_readv(bs->file->bs, sector_num, qiov, nb_sectors,
                          cb, opaque);
 }

@@ -485,7 +479,7 @@ static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
        return inject_error(bs, cb, opaque, rule);
    }

-    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
+    return bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors,
                           cb, opaque);
 }

@@ -520,8 +514,6 @@ static void blkdebug_close(BlockDriverState *bs)
            remove_rule(rule);
        }
    }
-
-    g_free(s->config_file);
 }

 static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule)
@@ -628,7 +620,7 @@ static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)

    QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) {
        if (!strcmp(r->tag, tag)) {
-            qemu_coroutine_enter(r->co);
+            qemu_coroutine_enter(r->co, NULL);
            return 0;
        }
    }
@@ -654,7 +646,7 @@ static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs,
    }
    QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) {
        if (!strcmp(r->tag, tag)) {
-            qemu_coroutine_enter(r->co);
+            qemu_coroutine_enter(r->co, NULL);
            ret = 0;
        }
    }
@@ -686,7 +678,6 @@ static int blkdebug_truncate(BlockDriverState *bs, int64_t offset)

 static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
 {
-    BDRVBlkdebugState *s = bs->opaque;
    QDict *opts;
    const QDictEntry *e;
    bool force_json = false;
@@ -708,7 +699,8 @@ static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)

    if (!force_json && bs->file->bs->exact_filename[0]) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "blkdebug:%s:%s", s->config_file ?: "",
+                 "blkdebug:%s:%s",
+                 qdict_get_try_str(options, "config") ?: "",
                 bs->file->bs->exact_filename);
    }

@@ -728,15 +720,6 @@ static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
    bs->full_open_options = opts;
 }

-static void blkdebug_refresh_limits(BlockDriverState *bs, Error **errp)
-{
-    BDRVBlkdebugState *s = bs->opaque;
-
-    if (s->align) {
-        bs->bl.request_alignment = s->align;
-    }
-}
-
 static int blkdebug_reopen_prepare(BDRVReopenState *reopen_state,
                                   BlockReopenQueue *queue, Error **errp)
 {
@@ -755,7 +738,6 @@ static BlockDriver bdrv_blkdebug = {
    .bdrv_getlength         = blkdebug_getlength,
    .bdrv_truncate          = blkdebug_truncate,
    .bdrv_refresh_filename  = blkdebug_refresh_filename,
-    .bdrv_refresh_limits    = blkdebug_refresh_limits,

    .bdrv_aio_readv         = blkdebug_aio_readv,
    .bdrv_aio_writev        = blkdebug_aio_writev,
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -65,7 +65,7 @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
 static void blkreplay_bh_cb(void *opaque)
 {
    Request *req = opaque;
-    qemu_coroutine_enter(req->co);
+    qemu_coroutine_enter(req->co, NULL);
    qemu_bh_delete(req->bh);
    g_free(req);
 }
@@ -81,44 +81,44 @@ static void block_request_create(uint64_t reqid, BlockDriverState *bs,
    replay_block_event(req->bh, reqid);
 }

-static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs,
-    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
+static int coroutine_fn blkreplay_co_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
    uint64_t reqid = request_id++;
-    int ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
+    int ret = bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();

    return ret;
 }

-static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs,
-    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
+static int coroutine_fn blkreplay_co_writev(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
    uint64_t reqid = request_id++;
-    int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+    int ret = bdrv_co_writev(bs->file->bs, sector_num, nb_sectors, qiov);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();

    return ret;
 }

-static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
-    int64_t offset, int count, BdrvRequestFlags flags)
+static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
    uint64_t reqid = request_id++;
-    int ret = bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
+    int ret = bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();

    return ret;
 }

-static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs,
-                                              int64_t offset, int count)
+static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors)
 {
    uint64_t reqid = request_id++;
-    int ret = bdrv_co_pdiscard(bs->file->bs, offset, count);
+    int ret = bdrv_co_discard(bs->file->bs, sector_num, nb_sectors);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();

@@ -144,11 +144,11 @@ static BlockDriver bdrv_blkreplay = {
    .bdrv_close             = blkreplay_close,
    .bdrv_getlength         = blkreplay_getlength,

-    .bdrv_co_preadv         = blkreplay_co_preadv,
-    .bdrv_co_pwritev        = blkreplay_co_pwritev,
+    .bdrv_co_readv          = blkreplay_co_readv,
+    .bdrv_co_writev         = blkreplay_co_writev,

-    .bdrv_co_pwrite_zeroes  = blkreplay_co_pwrite_zeroes,
-    .bdrv_co_pdiscard       = blkreplay_co_pdiscard,
+    .bdrv_co_write_zeroes   = blkreplay_co_write_zeroes,
+    .bdrv_co_discard        = blkreplay_co_discard,
    .bdrv_co_flush          = blkreplay_co_flush,
 };

--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -247,9 +247,9 @@ static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs,
    qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov);
    qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf);

-    bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors,
+    bdrv_aio_readv(s->test_file->bs, sector_num, qiov, nb_sectors,
                   blkverify_aio_cb, acb);
-    bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors,
+    bdrv_aio_readv(bs->file->bs, sector_num, &acb->raw_qiov, nb_sectors,
                   blkverify_aio_cb, acb);
    return &acb->common;
 }
@@ -262,9 +262,9 @@ static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs,
    BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov,
                                            nb_sectors, cb, opaque);

-    bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors,
+    bdrv_aio_writev(s->test_file->bs, sector_num, qiov, nb_sectors,
                    blkverify_aio_cb, acb);
-    bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
+    bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors,
                    blkverify_aio_cb, acb);
    return &acb->common;
 }
@@ -293,6 +293,22 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs,
    return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate);
 }

+/* Propagate AioContext changes to ->test_file */
+static void blkverify_detach_aio_context(BlockDriverState *bs)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    bdrv_detach_aio_context(s->test_file->bs);
+}
+
+static void blkverify_attach_aio_context(BlockDriverState *bs,
+                                         AioContext *new_context)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    bdrv_attach_aio_context(s->test_file->bs, new_context);
+}
+
 static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options)
 {
    BDRVBlkverifyState *s = bs->opaque;
@@ -340,6 +356,9 @@ static BlockDriver bdrv_blkverify = {
    .bdrv_aio_writev                  = blkverify_aio_writev,
    .bdrv_aio_flush                   = blkverify_aio_flush,

+    .bdrv_attach_aio_context          = blkverify_attach_aio_context,
+    .bdrv_detach_aio_context          = blkverify_detach_aio_context,
+
    .is_filter                        = true,
    .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter,
 };
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -19,7 +19,6 @@
 #include "sysemu/sysemu.h"
 #include "qapi-event.h"
 #include "qemu/id.h"
-#include "trace.h"

 /* Number of coroutines to reserve per attached device model */
 #define COROUTINE_POOL_RESERVATION 64
@@ -35,9 +34,9 @@ struct BlockBackend {
    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
-    BlockBackendPublic public;

    void *dev;                  /* attached device model, if any */
+    bool xen_dev;               /* true if dev is a Xen disk */
    /* TODO change to DeviceState when all users are qdevified */
    const BlockDevOps *dev_ops;
    void *dev_opaque;
@@ -76,7 +75,6 @@ static const AIOCBInfo block_backend_aiocb_info = {
 };

 static void drive_info_del(DriveInfo *dinfo);
-static BlockBackend *bdrv_first_blk(BlockDriverState *bs);

 /* All BlockBackends */
 static QTAILQ_HEAD(, BlockBackend) block_backends =
@@ -93,26 +91,9 @@ static void blk_root_inherit_options(int *child_flags, QDict *child_options,
    /* We're not supposed to call this function for root nodes */
    abort();
 }
-static void blk_root_drained_begin(BdrvChild *child);
-static void blk_root_drained_end(BdrvChild *child);
-
-static void blk_root_change_media(BdrvChild *child, bool load);
-static void blk_root_resize(BdrvChild *child);
-
-static const char *blk_root_get_name(BdrvChild *child)
-{
-    return blk_name(child->opaque);
-}

 static const BdrvChildRole child_root = {
-    .inherit_options    = blk_root_inherit_options,
-
-    .change_media       = blk_root_change_media,
-    .resize             = blk_root_resize,
-    .get_name           = blk_root_get_name,
-
-    .drained_begin      = blk_root_drained_begin,
-    .drained_end        = blk_root_drained_end,
+    .inherit_options = blk_root_inherit_options,
 };

 /*
@@ -120,26 +101,40 @@ static const BdrvChildRole child_root = {
 * Store an error through @errp on failure, unless it's null.
 * Return the new BlockBackend on success, null on failure.
 */
-BlockBackend *blk_new(void)
+BlockBackend *blk_new(Error **errp)
 {
    BlockBackend *blk;

    blk = g_new0(BlockBackend, 1);
    blk->refcnt = 1;
-    blk_set_enable_write_cache(blk, true);
-
-    qemu_co_queue_init(&blk->public.throttled_reqs[0]);
-    qemu_co_queue_init(&blk->public.throttled_reqs[1]);
-
    notifier_list_init(&blk->remove_bs_notifiers);
    notifier_list_init(&blk->insert_bs_notifiers);
-
    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
    return blk;
 }

 /*
- * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
+ * Create a new BlockBackend with a new BlockDriverState attached.
+ * Otherwise just like blk_new(), which see.
+ */
+BlockBackend *blk_new_with_bs(Error **errp)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs;
+
+    blk = blk_new(errp);
+    if (!blk) {
+        return NULL;
+    }
+
+    bs = bdrv_new_root();
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
+    bs->blk = blk;
+    return blk;
+}
+
+/*
+ * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState.
 *
 * Just as with bdrv_open(), after having called this function the reference to
 * @options belongs to the block layer (even on failure).
@@ -154,16 +149,21 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
                           QDict *options, int flags, Error **errp)
 {
    BlockBackend *blk;
-    BlockDriverState *bs;
+    int ret;

-    blk = blk_new();
-    bs = bdrv_open(filename, reference, options, flags, errp);
-    if (!bs) {
+    blk = blk_new_with_bs(errp);
+    if (!blk) {
+        QDECREF(options);
+        return NULL;
+    }
+
+    ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp);
+    if (ret < 0) {
        blk_unref(blk);
        return NULL;
    }

-    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
+    blk_set_enable_write_cache(blk, true);

    return blk;
 }
@@ -178,6 +178,10 @@ static void blk_delete(BlockBackend *blk)
    }
    assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
    assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
+    if (blk->root_state.throttle_state) {
+        g_free(blk->root_state.throttle_group);
+        throttle_group_unref(blk->root_state.throttle_state);
+    }
    QTAILQ_REMOVE(&block_backends, blk, link);
    drive_info_del(blk->legacy_dinfo);
    block_acct_cleanup(&blk->stats);
@@ -264,45 +268,28 @@ BlockBackend *blk_next(BlockBackend *blk)
               : QTAILQ_FIRST(&monitor_block_backends);
 }

-/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
- * the monitor or attached to a BlockBackend */
-BlockDriverState *bdrv_next(BdrvNextIterator *it)
+/*
+ * Iterates over all BlockDriverStates which are attached to a BlockBackend.
+ * This function is for use by bdrv_next().
+ *
+ * @bs must be NULL or a BDS that is attached to a BB.
+ */
+BlockDriverState *blk_next_root_bs(BlockDriverState *bs)
 {
-    BlockDriverState *bs;
+    BlockBackend *blk;

-    /* First, return all root nodes of BlockBackends. In order to avoid
-     * returning a BDS twice when multiple BBs refer to it, we only return it
-     * if the BB is the first one in the parent list of the BDS. */
-    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
-        do {
-            it->blk = blk_all_next(it->blk);
-            bs = it->blk ? blk_bs(it->blk) : NULL;
-        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
-
-        if (bs) {
-            return bs;
-        }
-        it->phase = BDRV_NEXT_MONITOR_OWNED;
+    if (bs) {
+        assert(bs->blk);
+        blk = bs->blk;
+    } else {
+        blk = NULL;
    }

-    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
-     * BDSes that are attached to a BlockBackend here; they have been handled
-     * by the above block already */
    do {
-        it->bs = bdrv_next_monitor_owned(it->bs);
-        bs = it->bs;
-    } while (bs && bdrv_has_blk(bs));
+        blk = blk_all_next(blk);
+    } while (blk && !blk->root);

-    return bs;
-}
-
-BlockDriverState *bdrv_first(BdrvNextIterator *it)
-{
-    *it = (BdrvNextIterator) {
-        .phase = BDRV_NEXT_BACKEND_ROOTS,
-    };
-
-    return bdrv_next(it);
+    return blk ? blk->root->bs : NULL;
 }

 /*
@@ -389,42 +376,6 @@ BlockDriverState *blk_bs(BlockBackend *blk)
    return blk->root ? blk->root->bs : NULL;
 }

-static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
-{
-    BdrvChild *child;
-    QLIST_FOREACH(child, &bs->parents, next_parent) {
-        if (child->role == &child_root) {
-            return child->opaque;
-        }
-    }
-
-    return NULL;
-}
-
-/*
- * Returns true if @bs has an associated BlockBackend.
- */
-bool bdrv_has_blk(BlockDriverState *bs)
-{
-    return bdrv_first_blk(bs) != NULL;
-}
-
-/*
- * Returns true if @bs has only BlockBackends as parents.
- */
-bool bdrv_is_root_node(BlockDriverState *bs)
-{
-    BdrvChild *c;
-
-    QLIST_FOREACH(c, &bs->parents, next_parent) {
-        if (c->role != &child_root) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
 /*
 * Return @blk's DriveInfo if any, else null.
 */
@@ -460,34 +411,18 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
    abort();
 }

-/*
- * Returns a pointer to the publicly accessible fields of @blk.
- */
-BlockBackendPublic *blk_get_public(BlockBackend *blk)
-{
-    return &blk->public;
-}
-
-/*
- * Returns a BlockBackend given the associated @public fields.
- */
-BlockBackend *blk_by_public(BlockBackendPublic *public)
-{
-    return container_of(public, BlockBackend, public);
-}
-
 /*
 * Disassociates the currently associated BlockDriverState from @blk.
 */
 void blk_remove_bs(BlockBackend *blk)
 {
+    assert(blk->root->bs->blk == blk);
+
    notifier_list_notify(&blk->remove_bs_notifiers, blk);
-    if (blk->public.throttle_state) {
-        throttle_timers_detach_aio_context(&blk->public.throttle_timers);
-    }

    blk_update_root_state(blk);

+    blk->root->bs->blk = NULL;
    bdrv_root_unref_child(blk->root);
    blk->root = NULL;
 }
@@ -497,14 +432,12 @@ void blk_remove_bs(BlockBackend *blk)
 */
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
 {
+    assert(!blk->root && !bs->blk);
    bdrv_ref(bs);
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
+    bs->blk = blk;

    notifier_list_notify(&blk->insert_bs_notifiers, blk);
-    if (blk->public.throttle_state) {
-        throttle_timers_attach_aio_context(
-            &blk->public.throttle_timers, bdrv_get_aio_context(bs));
-    }
 }

 /*
@@ -528,11 +461,12 @@ int blk_attach_dev(BlockBackend *blk, void *dev)
 * @blk must not have a device model attached already.
 * TODO qdevified devices don't use this, remove when devices are qdevified
 */
-void blk_attach_dev_nofail(BlockBackend *blk, void *dev)
+void blk_attach_dev_nofail(BlockBackend *blk, void *dev, bool xen_dev)
 {
    if (blk_attach_dev(blk, dev) < 0) {
        abort();
    }
+    blk->xen_dev = xen_dev;
 }

 /*
@@ -559,25 +493,6 @@ void *blk_get_attached_dev(BlockBackend *blk)
    return blk->dev;
 }

-/*
- * Return the BlockBackend which has the device model @dev attached if it
- * exists, else null.
- *
- * @dev must not be null.
- */
-BlockBackend *blk_by_dev(void *dev)
-{
-    BlockBackend *blk = NULL;
-
-    assert(dev != NULL);
-    while ((blk = blk_all_next(blk)) != NULL) {
-        if (blk->dev == dev) {
-            return blk;
-        }
-    }
-    return NULL;
-}
-
 /*
 * Set @blk's device model callbacks to @ops.
 * @opaque is the opaque argument to pass to the callbacks.
@@ -612,11 +527,6 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load)
    }
 }

-static void blk_root_change_media(BdrvChild *child, bool load)
-{
-    blk_dev_change_media_cb(child->opaque, load);
-}
-
 /*
 * Does @blk's attached device model have removable media?
 * %true if no device model is attached.
@@ -671,10 +581,8 @@ bool blk_dev_is_medium_locked(BlockBackend *blk)
 /*
 * Notify @blk's attached device model of a backend size change.
 */
-static void blk_root_resize(BdrvChild *child)
+void blk_dev_resize_cb(BlockBackend *blk)
 {
-    BlockBackend *blk = child->opaque;
-
    if (blk->dev_ops && blk->dev_ops->resize_cb) {
        blk->dev_ops->resize_cb(blk->dev_opaque);
    }
@@ -762,50 +670,49 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
    return 0;
 }

-int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
-                               unsigned int bytes, QEMUIOVector *qiov,
-                               BdrvRequestFlags flags)
+static int blk_check_request(BlockBackend *blk, int64_t sector_num,
+                             int nb_sectors)
 {
-    int ret;
-
-    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
-
-    ret = blk_check_byte_request(blk, offset, bytes);
-    if (ret < 0) {
-        return ret;
+    if (sector_num < 0 || sector_num > INT64_MAX / BDRV_SECTOR_SIZE) {
+        return -EIO;
    }

-    /* throttling disk I/O */
-    if (blk->public.throttle_state) {
-        throttle_group_co_io_limits_intercept(blk, bytes, false);
+    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
+        return -EIO;
    }

-    return bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
+    return blk_check_byte_request(blk, sector_num * BDRV_SECTOR_SIZE,
+                                  nb_sectors * BDRV_SECTOR_SIZE);
 }

-int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                                unsigned int bytes, QEMUIOVector *qiov,
-                                BdrvRequestFlags flags)
+static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
+                                      unsigned int bytes, QEMUIOVector *qiov,
+                                      BdrvRequestFlags flags)
 {
-    int ret;
-
-    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
-
-    ret = blk_check_byte_request(blk, offset, bytes);
+    int ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

-    /* throttling disk I/O */
-    if (blk->public.throttle_state) {
-        throttle_group_co_io_limits_intercept(blk, bytes, true);
+    return bdrv_co_do_preadv(blk_bs(blk), offset, bytes, qiov, flags);
+}
+
+static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                                      unsigned int bytes, QEMUIOVector *qiov,
+                                      BdrvRequestFlags flags)
+{
+    int ret;
+
+    ret = blk_check_byte_request(blk, offset, bytes);
+    if (ret < 0) {
+        return ret;
    }

    if (!blk->enable_write_cache) {
        flags |= BDRV_REQ_FUA;
    }

-    return bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+    return bdrv_co_do_pwritev(blk_bs(blk), offset, bytes, qiov, flags);
 }

 typedef struct BlkRwCo {
@@ -856,8 +763,8 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
        .ret    = NOT_DONE,
    };

-    co = qemu_coroutine_create(co_entry, &rwco);
-    qemu_coroutine_enter(co);
+    co = qemu_coroutine_create(co_entry);
+    qemu_coroutine_enter(co, &rwco);

    aio_context = blk_get_aio_context(blk);
    while (rwco.ret == NOT_DONE) {
@@ -867,9 +774,29 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
    return rwco.ret;
 }

+static int blk_rw(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
+                  int nb_sectors, CoroutineEntry co_entry,
+                  BdrvRequestFlags flags)
+{
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+        return -EINVAL;
+    }
+
+    return blk_prw(blk, sector_num << BDRV_SECTOR_BITS, buf,
+                   nb_sectors << BDRV_SECTOR_BITS, co_entry, flags);
+}
+
+int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
+             int nb_sectors)
+{
+    return blk_rw(blk, sector_num, buf, nb_sectors, blk_read_entry, 0);
+}
+
 int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
                          int count)
 {
+    BlockDriverState *bs = blk_bs(blk);
+    bool enabled;
    int ret;

    ret = blk_check_byte_request(blk, offset, count);
@@ -877,24 +804,27 @@ int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
        return ret;
    }

-    blk_root_drained_begin(blk->root);
+    enabled = bs->io_limits_enabled;
+    bs->io_limits_enabled = false;
    ret = blk_pread(blk, offset, buf, count);
-    blk_root_drained_end(blk->root);
+    bs->io_limits_enabled = enabled;
    return ret;
 }

-int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
-                      int count, BdrvRequestFlags flags)
+int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
+              int nb_sectors)
+{
+    return blk_rw(blk, sector_num, (uint8_t*) buf, nb_sectors,
+                  blk_write_entry, 0);
+}
+
+int blk_write_zeroes(BlockBackend *blk, int64_t offset,
+                     int count, BdrvRequestFlags flags)
 {
    return blk_prw(blk, offset, NULL, count, blk_write_entry,
                   flags | BDRV_REQ_ZERO_WRITE);
 }

-int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
-{
-    return bdrv_make_zero(blk->root, flags);
-}
-
 static void error_callback_bh(void *opaque)
 {
    struct BlockBackendAIOCB *acb = opaque;
@@ -970,8 +900,8 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
    acb->bh = NULL;
    acb->has_returned = false;

-    co = qemu_coroutine_create(co_entry, acb);
-    qemu_coroutine_enter(co);
+    co = qemu_coroutine_create(co_entry);
+    qemu_coroutine_enter(co, acb);

    acb->has_returned = true;
    if (acb->rwco.ret != NOT_DONE) {
@@ -1004,9 +934,9 @@ static void blk_aio_write_entry(void *opaque)
    blk_aio_complete(acb);
 }

-BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
-                                  int count, BdrvRequestFlags flags,
-                                  BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset,
+                                 int count, BdrvRequestFlags flags,
+                                 BlockCompletionFunc *cb, void *opaque)
 {
    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
@@ -1059,6 +989,19 @@ int64_t blk_nb_sectors(BlockBackend *blk)
    return bdrv_nb_sectors(blk_bs(blk));
 }

+BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
+                          QEMUIOVector *iov, int nb_sectors,
+                          BlockCompletionFunc *cb, void *opaque)
+{
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
+    }
+
+    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
+    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
+                        blk_aio_read_entry, 0, cb, opaque);
+}
+
 BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
                           QEMUIOVector *qiov, BdrvRequestFlags flags,
                           BlockCompletionFunc *cb, void *opaque)
@@ -1067,6 +1010,19 @@ BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
                        blk_aio_read_entry, flags, cb, opaque);
 }

+BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
+                           QEMUIOVector *iov, int nb_sectors,
+                           BlockCompletionFunc *cb, void *opaque)
+{
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
+    }
+
+    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
+    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
+                        blk_aio_write_entry, 0, cb, opaque);
+}
+
 BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
                            QEMUIOVector *qiov, BdrvRequestFlags flags,
                            BlockCompletionFunc *cb, void *opaque)
@@ -1085,16 +1041,16 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk,
    return bdrv_aio_flush(blk_bs(blk), cb, opaque);
 }

-BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
-                             int64_t offset, int count,
-                             BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_discard(BlockBackend *blk,
+                            int64_t sector_num, int nb_sectors,
+                            BlockCompletionFunc *cb, void *opaque)
 {
-    int ret = blk_check_byte_request(blk, offset, count);
+    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
        return blk_abort_aio_request(blk, cb, opaque, ret);
    }

-    return bdrv_aio_pdiscard(blk_bs(blk), offset, count, cb, opaque);
+    return bdrv_aio_discard(blk_bs(blk), sector_num, nb_sectors, cb, opaque);
 }

 void blk_aio_cancel(BlockAIOCB *acb)
@@ -1107,6 +1063,20 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
    bdrv_aio_cancel_async(acb);
 }

+int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)
+{
+    int i, ret;
+
+    for (i = 0; i < num_reqs; i++) {
+        ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return bdrv_aio_multiwrite(blk_bs(blk), reqs, num_reqs);
+}
+
 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
    if (!blk_is_available(blk)) {
@@ -1126,14 +1096,14 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
    return bdrv_aio_ioctl(blk_bs(blk), req, buf, cb, opaque);
 }

-int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count)
+int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
 {
-    int ret = blk_check_byte_request(blk, offset, count);
+    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    }

-    return bdrv_co_pdiscard(blk_bs(blk), offset, count);
+    return bdrv_co_discard(blk_bs(blk), sector_num, nb_sectors);
 }

 int blk_co_flush(BlockBackend *blk)
@@ -1193,7 +1163,6 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
        return BLOCK_ERROR_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BLOCK_ERROR_ACTION_IGNORE;
-    case BLOCKDEV_ON_ERROR_AUTO:
    default:
        abort();
    }
@@ -1329,16 +1298,15 @@ int blk_get_flags(BlockBackend *blk)
    }
 }

-/* Returns the maximum transfer length, in bytes; guaranteed nonzero */
-uint32_t blk_get_max_transfer(BlockBackend *blk)
+int blk_get_max_transfer_length(BlockBackend *blk)
 {
    BlockDriverState *bs = blk_bs(blk);
-    uint32_t max = 0;

    if (bs) {
-        max = bs->bl.max_transfer;
+        return bs->bl.max_transfer_length;
+    } else {
+        return 0;
    }
-    return MIN_NON_ZERO(max, INT_MAX);
 }

 int blk_get_max_iov(BlockBackend *blk)
@@ -1421,14 +1389,7 @@ void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
    BlockDriverState *bs = blk_bs(blk);

    if (bs) {
-        if (blk->public.throttle_state) {
-            throttle_timers_detach_aio_context(&blk->public.throttle_timers);
-        }
        bdrv_set_aio_context(bs, new_context);
-        if (blk->public.throttle_state) {
-            throttle_timers_attach_aio_context(&blk->public.throttle_timers,
-                                               new_context);
-        }
    }
 }

@@ -1497,18 +1458,22 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
 }

-int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
-                                      int count, BdrvRequestFlags flags)
+int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset,
+                                     int count, BdrvRequestFlags flags)
 {
    return blk_co_pwritev(blk, offset, count, NULL,
                          flags | BDRV_REQ_ZERO_WRITE);
 }

-int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
-                          int count)
+int blk_write_compressed(BlockBackend *blk, int64_t sector_num,
+                         const uint8_t *buf, int nb_sectors)
 {
-    return blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
-                   BDRV_REQ_WRITE_COMPRESSED);
+    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return bdrv_write_compressed(blk_bs(blk), sector_num, buf, nb_sectors);
 }

 int blk_truncate(BlockBackend *blk, int64_t offset)
@@ -1520,14 +1485,21 @@ int blk_truncate(BlockBackend *blk, int64_t offset)
    return bdrv_truncate(blk_bs(blk), offset);
 }

-int blk_pdiscard(BlockBackend *blk, int64_t offset, int count)
+void blk_legacy_resize_cb(BlockBackend *blk)
 {
-    int ret = blk_check_byte_request(blk, offset, count);
+    if (blk->xen_dev) {
+        xen_blk_resize_cb(blk->dev);
+    }
+}
+
+int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+{
+    int ret = blk_check_request(blk, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    }

-    return bdrv_pdiscard(blk_bs(blk), offset, count);
+    return bdrv_discard(blk_bs(blk), sector_num, nb_sectors);
 }

 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
@@ -1589,6 +1561,19 @@ void blk_update_root_state(BlockBackend *blk)
    blk->root_state.open_flags    = blk->root->bs->open_flags;
    blk->root_state.read_only     = blk->root->bs->read_only;
    blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
+
+    if (blk->root_state.throttle_group) {
+        g_free(blk->root_state.throttle_group);
+        throttle_group_unref(blk->root_state.throttle_state);
+    }
+    if (blk->root->bs->throttle_state) {
+        const char *name = throttle_group_get_name(blk->root->bs);
+        blk->root_state.throttle_group = g_strdup(name);
+        blk->root_state.throttle_state = throttle_group_incref(name);
+    } else {
+        blk->root_state.throttle_group = NULL;
+        blk->root_state.throttle_state = NULL;
+    }
 }

 /*
@@ -1599,6 +1584,9 @@ void blk_update_root_state(BlockBackend *blk)
 void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
 {
    bs->detect_zeroes = blk->root_state.detect_zeroes;
+    if (blk->root_state.throttle_group) {
+        bdrv_io_limits_enable(bs, blk->root_state.throttle_group);
+    }
 }

 /*
@@ -1639,84 +1627,3 @@ int blk_commit_all(void)
    }
    return 0;
 }
-
-int blk_flush_all(void)
-{
-    BlockBackend *blk = NULL;
-    int result = 0;
-
-    while ((blk = blk_all_next(blk)) != NULL) {
-        AioContext *aio_context = blk_get_aio_context(blk);
-        int ret;
-
-        aio_context_acquire(aio_context);
-        if (blk_is_inserted(blk)) {
-            ret = blk_flush(blk);
-            if (ret < 0 && !result) {
-                result = ret;
-            }
-        }
-        aio_context_release(aio_context);
-    }
-
-    return result;
-}
-
-
-/* throttling disk I/O limits */
-void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
-{
-    throttle_group_config(blk, cfg);
-}
-
-void blk_io_limits_disable(BlockBackend *blk)
-{
-    assert(blk->public.throttle_state);
-    bdrv_drained_begin(blk_bs(blk));
-    throttle_group_unregister_blk(blk);
-    bdrv_drained_end(blk_bs(blk));
-}
-
-/* should be called before blk_set_io_limits if a limit is set */
-void blk_io_limits_enable(BlockBackend *blk, const char *group)
-{
-    assert(!blk->public.throttle_state);
-    throttle_group_register_blk(blk, group);
-}
-
-void blk_io_limits_update_group(BlockBackend *blk, const char *group)
-{
-    /* this BB is not part of any group */
-    if (!blk->public.throttle_state) {
-        return;
-    }
-
-    /* this BB is a part of the same group than the one we want */
-    if (!g_strcmp0(throttle_group_get_name(blk), group)) {
-        return;
-    }
-
-    /* need to change the group this bs belong to */
-    blk_io_limits_disable(blk);
-    blk_io_limits_enable(blk, group);
-}
-
-static void blk_root_drained_begin(BdrvChild *child)
-{
-    BlockBackend *blk = child->opaque;
-
-    /* Note that blk->root may not be accessible here yet if we are just
-     * attaching to a BlockDriverState that is drained. Use child instead. */
-
-    if (blk->public.io_limits_disabled++ == 0) {
-        throttle_group_restart_blk(blk);
-    }
-}
-
-static void blk_root_drained_end(BdrvChild *child)
-{
-    BlockBackend *blk = child->opaque;
-
-    assert(blk->public.io_limits_disabled);
-    --blk->public.io_limits_disabled;
-}
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -27,7 +27,6 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
-#include "qemu/bswap.h"

 /**************************************************************/

@@ -104,9 +103,9 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
    struct bochs_header bochs;
    int ret;

-    bs->read_only = true; /* no write support yet */
+    bs->read_only = 1; // no write support yet

-    ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs));
+    ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs));
    if (ret < 0) {
        return ret;
    }
@@ -140,7 +139,7 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
        return -ENOMEM;
    }

-    ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
+    ret = bdrv_pread(bs->file->bs, le32_to_cpu(bochs.header), s->catalog_bitmap,
                     s->catalog_size * 4);
    if (ret < 0) {
        goto fail;
@@ -188,11 +187,6 @@ fail:
    return ret;
 }

-static void bochs_refresh_limits(BlockDriverState *bs, Error **errp)
-{
-    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
-}
-
 static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
 {
    BDRVBochsState *s = bs->opaque;
@@ -214,7 +208,7 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
        (s->extent_blocks + s->bitmap_blocks));

    /* read in bitmap for current extent */
-    ret = bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8),
+    ret = bdrv_pread(bs->file->bs, bitmap_offset + (extent_offset / 8),
                     &bitmap_entry, 1);
    if (ret < 0) {
        return ret;
@@ -227,52 +221,38 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
    return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
 }

-static int coroutine_fn
-bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-                QEMUIOVector *qiov, int flags)
+static int bochs_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
 {
-    BDRVBochsState *s = bs->opaque;
-    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
-    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-    uint64_t bytes_done = 0;
-    QEMUIOVector local_qiov;
    int ret;

-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-
-    qemu_iovec_init(&local_qiov, qiov->niov);
-    qemu_co_mutex_lock(&s->lock);
-
    while (nb_sectors > 0) {
        int64_t block_offset = seek_to_sector(bs, sector_num);
        if (block_offset < 0) {
-            ret = block_offset;
-            goto fail;
-        }
-
-        qemu_iovec_reset(&local_qiov);
-        qemu_iovec_concat(&local_qiov, qiov, bytes_done, 512);
-
-        if (block_offset > 0) {
-            ret = bdrv_co_preadv(bs->file, block_offset, 512,
-                                 &local_qiov, 0);
+            return block_offset;
+        } else if (block_offset > 0) {
+            ret = bdrv_pread(bs->file->bs, block_offset, buf, 512);
            if (ret < 0) {
-                goto fail;
+                return ret;
            }
        } else {
-            qemu_iovec_memset(&local_qiov, 0, 0, 512);
+            memset(buf, 0, 512);
        }
        nb_sectors--;
        sector_num++;
-        bytes_done += 512;
+        buf += 512;
    }
+    return 0;
+}

-    ret = 0;
-fail:
+static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num,
+                                      uint8_t *buf, int nb_sectors)
+{
+    int ret;
+    BDRVBochsState *s = bs->opaque;
+    qemu_co_mutex_lock(&s->lock);
+    ret = bochs_read(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
-    qemu_iovec_destroy(&local_qiov);
-
    return ret;
 }

@@ -287,8 +267,7 @@ static BlockDriver bdrv_bochs = {
    .instance_size	= sizeof(BDRVBochsState),
    .bdrv_probe		= bochs_probe,
    .bdrv_open		= bochs_open,
-    .bdrv_refresh_limits = bochs_refresh_limits,
-    .bdrv_co_preadv = bochs_co_preadv,
+    .bdrv_read          = bochs_co_read,
    .bdrv_close		= bochs_close,
 };

--- a/block/cloop.c
+++ b/block/cloop.c
@@ -26,7 +26,6 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
-#include "qemu/bswap.h"
 #include <zlib.h>

 /* Maximum compressed block size */
@@ -66,10 +65,10 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
    uint32_t offsets_size, max_compressed_block_size = 1, i;
    int ret;

-    bs->read_only = true;
+    bs->read_only = 1;

    /* read header */
-    ret = bdrv_pread(bs->file, 128, &s->block_size, 4);
+    ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4);
    if (ret < 0) {
        return ret;
    }
@@ -95,7 +94,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
        return -EINVAL;
    }

-    ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4);
+    ret = bdrv_pread(bs->file->bs, 128 + 4, &s->n_blocks, 4);
    if (ret < 0) {
        return ret;
    }
@@ -126,7 +125,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
        return -ENOMEM;
    }

-    ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
+    ret = bdrv_pread(bs->file->bs, 128 + 4 + 4, s->offsets, offsets_size);
    if (ret < 0) {
        goto fail;
    }
@@ -198,11 +197,6 @@ fail:
    return ret;
 }

-static void cloop_refresh_limits(BlockDriverState *bs, Error **errp)
-{
-    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
-}
-
 static inline int cloop_read_block(BlockDriverState *bs, int block_num)
 {
    BDRVCloopState *s = bs->opaque;
@@ -211,7 +205,7 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
        int ret;
        uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num];

-        ret = bdrv_pread(bs->file, s->offsets[block_num],
+        ret = bdrv_pread(bs->file->bs, s->offsets[block_num],
                         s->compressed_block, bytes);
        if (ret != bytes) {
            return -1;
@@ -235,38 +229,33 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
    return 0;
 }

-static int coroutine_fn
-cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-                QEMUIOVector *qiov, int flags)
+static int cloop_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
 {
    BDRVCloopState *s = bs->opaque;
-    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
-    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-    int ret, i;
-
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-
-    qemu_co_mutex_lock(&s->lock);
+    int i;

    for (i = 0; i < nb_sectors; i++) {
-        void *data;
        uint32_t sector_offset_in_block =
            ((sector_num + i) % s->sectors_per_block),
            block_num = (sector_num + i) / s->sectors_per_block;
        if (cloop_read_block(bs, block_num) != 0) {
-            ret = -EIO;
-            goto fail;
+            return -1;
        }
-
-        data = s->uncompressed_block + sector_offset_in_block * 512;
-        qemu_iovec_from_buf(qiov, i * 512, data, 512);
+        memcpy(buf + i * 512,
+            s->uncompressed_block + sector_offset_in_block * 512, 512);
    }
+    return 0;
+}

-    ret = 0;
-fail:
+static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num,
+                                      uint8_t *buf, int nb_sectors)
+{
+    int ret;
+    BDRVCloopState *s = bs->opaque;
+    qemu_co_mutex_lock(&s->lock);
+    ret = cloop_read(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
-
    return ret;
 }

@@ -284,8 +273,7 @@ static BlockDriver bdrv_cloop = {
    .instance_size  = sizeof(BDRVCloopState),
    .bdrv_probe     = cloop_probe,
    .bdrv_open      = cloop_open,
-    .bdrv_refresh_limits = cloop_refresh_limits,
-    .bdrv_co_preadv = cloop_co_preadv,
+    .bdrv_read      = cloop_co_read,
    .bdrv_close     = cloop_close,
 };

--- a/block/commit.c
+++ b/block/commit.c
@@ -36,36 +36,28 @@ typedef struct CommitBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *active;
-    BlockBackend *top;
-    BlockBackend *base;
+    BlockDriverState *top;
+    BlockDriverState *base;
    BlockdevOnError on_error;
    int base_flags;
    int orig_overlay_flags;
    char *backing_file_str;
 } CommitBlockJob;

-static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
+static int coroutine_fn commit_populate(BlockDriverState *bs,
+                                        BlockDriverState *base,
                                        int64_t sector_num, int nb_sectors,
                                        void *buf)
 {
    int ret = 0;
-    QEMUIOVector qiov;
-    struct iovec iov = {
-        .iov_base = buf,
-        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
-    };

-    qemu_iovec_init_external(&qiov, &iov, 1);
-
-    ret = blk_co_preadv(bs, sector_num * BDRV_SECTOR_SIZE,
-                        qiov.size, &qiov, 0);
-    if (ret < 0) {
+    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
+    if (ret) {
        return ret;
    }

-    ret = blk_co_pwritev(base, sector_num * BDRV_SECTOR_SIZE,
-                         qiov.size, &qiov, 0);
-    if (ret < 0) {
+    ret = bdrv_write(base, sector_num, buf, nb_sectors);
+    if (ret) {
        return ret;
    }

@@ -81,9 +73,9 @@ static void commit_complete(BlockJob *job, void *opaque)
    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
    CommitCompleteData *data = opaque;
    BlockDriverState *active = s->active;
-    BlockDriverState *top = blk_bs(s->top);
-    BlockDriverState *base = blk_bs(s->base);
-    BlockDriverState *overlay_bs = bdrv_find_overlay(active, top);
+    BlockDriverState *top = s->top;
+    BlockDriverState *base = s->base;
+    BlockDriverState *overlay_bs;
    int ret = data->ret;

    if (!block_job_is_cancelled(&s->common) && ret == 0) {
@@ -97,12 +89,11 @@ static void commit_complete(BlockJob *job, void *opaque)
    if (s->base_flags != bdrv_get_flags(base)) {
        bdrv_reopen(base, s->base_flags, NULL);
    }
+    overlay_bs = bdrv_find_overlay(active, top);
    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
    }
    g_free(s->backing_file_str);
-    blk_unref(s->top);
-    blk_unref(s->base);
    block_job_completed(&s->common, ret);
    g_free(data);
 }
@@ -111,39 +102,42 @@ static void coroutine_fn commit_run(void *opaque)
 {
    CommitBlockJob *s = opaque;
    CommitCompleteData *data;
+    BlockDriverState *top = s->top;
+    BlockDriverState *base = s->base;
    int64_t sector_num, end;
-    uint64_t delay_ns = 0;
    int ret = 0;
    int n = 0;
    void *buf = NULL;
    int bytes_written = 0;
    int64_t base_len;

-    ret = s->common.len = blk_getlength(s->top);
+    ret = s->common.len = bdrv_getlength(top);


    if (s->common.len < 0) {
        goto out;
    }

-    ret = base_len = blk_getlength(s->base);
+    ret = base_len = bdrv_getlength(base);
    if (base_len < 0) {
        goto out;
    }

    if (base_len < s->common.len) {
-        ret = blk_truncate(s->base, s->common.len);
+        ret = bdrv_truncate(base, s->common.len);
        if (ret) {
            goto out;
        }
    }

    end = s->common.len >> BDRV_SECTOR_BITS;
-    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
+    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);

    for (sector_num = 0; sector_num < end; sector_num += n) {
+        uint64_t delay_ns = 0;
        bool copy;

+wait:
        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
@@ -152,20 +146,25 @@ static void coroutine_fn commit_run(void *opaque)
            break;
        }
        /* Copy if allocated above the base */
-        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
-                                      sector_num,
+        ret = bdrv_is_allocated_above(top, base, sector_num,
                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
                                      &n);
        copy = (ret == 1);
        trace_commit_one_iteration(s, sector_num, n, ret);
        if (copy) {
-            ret = commit_populate(s->top, s->base, sector_num, n, buf);
+            if (s->common.speed) {
+                delay_ns = ratelimit_calculate_delay(&s->limit, n);
+                if (delay_ns > 0) {
+                    goto wait;
+                }
+            }
+            ret = commit_populate(top, base, sector_num, n, buf);
            bytes_written += n * BDRV_SECTOR_SIZE;
        }
        if (ret < 0) {
-            BlockErrorAction action =
-                block_job_error_action(&s->common, false, s->on_error, -ret);
-            if (action == BLOCK_ERROR_ACTION_REPORT) {
+            if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
+                s->on_error == BLOCKDEV_ON_ERROR_REPORT||
+                (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
                goto out;
            } else {
                n = 0;
@@ -174,10 +173,6 @@ static void coroutine_fn commit_run(void *opaque)
        }
        /* Publish progress */
        s->common.offset += n * BDRV_SECTOR_SIZE;
-
-        if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, n);
-        }
    }

    ret = 0;
@@ -207,8 +202,8 @@ static const BlockJobDriver commit_job_driver = {
    .set_speed     = commit_set_speed,
 };

-void commit_start(const char *job_id, BlockDriverState *bs,
-                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
+void commit_start(BlockDriverState *bs, BlockDriverState *base,
+                  BlockDriverState *top, int64_t speed,
                  BlockdevOnError on_error, BlockCompletionFunc *cb,
                  void *opaque, const char *backing_file_str, Error **errp)
 {
@@ -219,6 +214,13 @@ void commit_start(const char *job_id, BlockDriverState *bs,
    BlockDriverState *overlay_bs;
    Error *local_err = NULL;

+    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
+        error_setg(errp, "Invalid parameter combination");
+        return;
+    }
+
    assert(top != bs);
    if (top == base) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
@@ -232,40 +234,34 @@ void commit_start(const char *job_id, BlockDriverState *bs,
        return;
    }

-    s = block_job_create(job_id, &commit_job_driver, bs, speed,
-                         cb, opaque, errp);
-    if (!s) {
-        return;
-    }
-
    orig_base_flags    = bdrv_get_flags(base);
    orig_overlay_flags = bdrv_get_flags(overlay_bs);

    /* convert base & overlay_bs to r/w, if necessary */
-    if (!(orig_base_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
-                                         orig_base_flags | BDRV_O_RDWR);
-    }
    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL,
                                         orig_overlay_flags | BDRV_O_RDWR);
    }
+    if (!(orig_base_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
+                                         orig_base_flags | BDRV_O_RDWR);
+    }
    if (reopen_queue) {
        bdrv_reopen_multiple(reopen_queue, &local_err);
        if (local_err != NULL) {
            error_propagate(errp, local_err);
-            block_job_unref(&s->common);
            return;
        }
    }


-    s->base = blk_new();
-    blk_insert_bs(s->base, base);
-
-    s->top = blk_new();
-    blk_insert_bs(s->top, top);
+    s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp);
+    if (!s) {
+        return;
+    }

+    s->base   = base;
+    s->top    = top;
    s->active = bs;

    s->base_flags          = orig_base_flags;
@@ -274,129 +270,8 @@ void commit_start(const char *job_id, BlockDriverState *bs,
    s->backing_file_str = g_strdup(backing_file_str);

    s->on_error = on_error;
-    s->common.co = qemu_coroutine_create(commit_run, s);
+    s->common.co = qemu_coroutine_create(commit_run);

    trace_commit_start(bs, base, top, s, s->common.co, opaque);
-    qemu_coroutine_enter(s->common.co);
-}
-
-
-#define COMMIT_BUF_SECTORS 2048
-
-/* commit COW file into the raw image */
-int bdrv_commit(BlockDriverState *bs)
-{
-    BlockBackend *src, *backing;
-    BlockDriver *drv = bs->drv;
-    int64_t sector, total_sectors, length, backing_length;
-    int n, ro, open_flags;
-    int ret = 0;
-    uint8_t *buf = NULL;
-
-    if (!drv)
-        return -ENOMEDIUM;
-
-    if (!bs->backing) {
-        return -ENOTSUP;
-    }
-
-    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
-        bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
-        return -EBUSY;
-    }
-
-    ro = bs->backing->bs->read_only;
-    open_flags =  bs->backing->bs->open_flags;
-
-    if (ro) {
-        if (bdrv_reopen(bs->backing->bs, open_flags | BDRV_O_RDWR, NULL)) {
-            return -EACCES;
-        }
-    }
-
-    src = blk_new();
-    blk_insert_bs(src, bs);
-
-    backing = blk_new();
-    blk_insert_bs(backing, bs->backing->bs);
-
-    length = blk_getlength(src);
-    if (length < 0) {
-        ret = length;
-        goto ro_cleanup;
-    }
-
-    backing_length = blk_getlength(backing);
-    if (backing_length < 0) {
-        ret = backing_length;
-        goto ro_cleanup;
-    }
-
-    /* If our top snapshot is larger than the backing file image,
-     * grow the backing file image if possible.  If not possible,
-     * we must return an error */
-    if (length > backing_length) {
-        ret = blk_truncate(backing, length);
-        if (ret < 0) {
-            goto ro_cleanup;
-        }
-    }
-
-    total_sectors = length >> BDRV_SECTOR_BITS;
-
-    /* blk_try_blockalign() for src will choose an alignment that works for
-     * backing as well, so no need to compare the alignment manually. */
-    buf = blk_try_blockalign(src, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
-    if (buf == NULL) {
-        ret = -ENOMEM;
-        goto ro_cleanup;
-    }
-
-    for (sector = 0; sector < total_sectors; sector += n) {
-        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
-        if (ret < 0) {
-            goto ro_cleanup;
-        }
-        if (ret) {
-            ret = blk_pread(src, sector * BDRV_SECTOR_SIZE, buf,
-                            n * BDRV_SECTOR_SIZE);
-            if (ret < 0) {
-                goto ro_cleanup;
-            }
-
-            ret = blk_pwrite(backing, sector * BDRV_SECTOR_SIZE, buf,
-                             n * BDRV_SECTOR_SIZE, 0);
-            if (ret < 0) {
-                goto ro_cleanup;
-            }
-        }
-    }
-
-    if (drv->bdrv_make_empty) {
-        ret = drv->bdrv_make_empty(bs);
-        if (ret < 0) {
-            goto ro_cleanup;
-        }
-        blk_flush(src);
-    }
-
-    /*
-     * Make sure all data we wrote to the backing device is actually
-     * stable on disk.
-     */
-    blk_flush(backing);
-
-    ret = 0;
-ro_cleanup:
-    qemu_vfree(buf);
-
-    blk_unref(src);
-    blk_unref(backing);
-
-    if (ro) {
-        /* ignoring error return here */
-        bdrv_reopen(bs->backing->bs, open_flags & ~BDRV_O_RDWR, NULL);
-    }
-
-    return ret;
+    qemu_coroutine_enter(s->common.co, s);
 }
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -33,7 +33,6 @@
 #define BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG "ivgen-alg"
 #define BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG "ivgen-hash-alg"
 #define BLOCK_CRYPTO_OPT_LUKS_HASH_ALG "hash-alg"
-#define BLOCK_CRYPTO_OPT_LUKS_ITER_TIME "iter-time"

 typedef struct BlockCrypto BlockCrypto;

@@ -65,7 +64,7 @@ static ssize_t block_crypto_read_func(QCryptoBlock *block,
    BlockDriverState *bs = opaque;
    ssize_t ret;

-    ret = bdrv_pread(bs->file, offset, buf, buflen);
+    ret = bdrv_pread(bs->file->bs, offset, buf, buflen);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read encryption header");
        return ret;
@@ -184,11 +183,6 @@ static QemuOptsList block_crypto_create_opts_luks = {
            .type = QEMU_OPT_STRING,
            .help = "Name of encryption hash algorithm",
        },
-        {
-            .name = BLOCK_CRYPTO_OPT_LUKS_ITER_TIME,
-            .type = QEMU_OPT_NUMBER,
-            .help = "Time to spend in PBKDF in milliseconds",
-        },
        { /* end of list */ }
    },
 };
@@ -199,16 +193,18 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
                            QemuOpts *opts,
                            Error **errp)
 {
-    Visitor *v;
+    OptsVisitor *ov;
    QCryptoBlockOpenOptions *ret = NULL;
    Error *local_err = NULL;
+    Error *end_err = NULL;

    ret = g_new0(QCryptoBlockOpenOptions, 1);
    ret->format = format;

-    v = opts_visitor_new(opts);
+    ov = opts_visitor_new(opts);

-    visit_start_struct(v, NULL, NULL, 0, &local_err);
+    visit_start_struct(opts_get_visitor(ov),
+                       NULL, NULL, 0, &local_err);
    if (local_err) {
        goto out;
    }
@@ -216,18 +212,16 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
    switch (format) {
    case Q_CRYPTO_BLOCK_FORMAT_LUKS:
        visit_type_QCryptoBlockOptionsLUKS_members(
-            v, &ret->u.luks, &local_err);
+            opts_get_visitor(ov), &ret->u.luks, &local_err);
        break;

    default:
        error_setg(&local_err, "Unsupported block format %d", format);
        break;
    }
-    if (!local_err) {
-        visit_check_struct(v, &local_err);
-    }

-    visit_end_struct(v, NULL);
+    visit_end_struct(opts_get_visitor(ov), &end_err);
+    error_propagate(&local_err, end_err);

 out:
    if (local_err) {
@@ -235,7 +229,7 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
        qapi_free_QCryptoBlockOpenOptions(ret);
        ret = NULL;
    }
-    visit_free(v);
+    opts_visitor_cleanup(ov);
    return ret;
 }

@@ -245,16 +239,18 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
                              QemuOpts *opts,
                              Error **errp)
 {
-    Visitor *v;
+    OptsVisitor *ov;
    QCryptoBlockCreateOptions *ret = NULL;
    Error *local_err = NULL;
+    Error *end_err = NULL;

    ret = g_new0(QCryptoBlockCreateOptions, 1);
    ret->format = format;

-    v = opts_visitor_new(opts);
+    ov = opts_visitor_new(opts);

-    visit_start_struct(v, NULL, NULL, 0, &local_err);
+    visit_start_struct(opts_get_visitor(ov),
+                       NULL, NULL, 0, &local_err);
    if (local_err) {
        goto out;
    }
@@ -262,18 +258,16 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
    switch (format) {
    case Q_CRYPTO_BLOCK_FORMAT_LUKS:
        visit_type_QCryptoBlockCreateOptionsLUKS_members(
-            v, &ret->u.luks, &local_err);
+            opts_get_visitor(ov), &ret->u.luks, &local_err);
        break;

    default:
        error_setg(&local_err, "Unsupported block format %d", format);
        break;
    }
-    if (!local_err) {
-        visit_check_struct(v, &local_err);
-    }

-    visit_end_struct(v, NULL);
+    visit_end_struct(opts_get_visitor(ov), &end_err);
+    error_propagate(&local_err, end_err);

 out:
    if (local_err) {
@@ -281,7 +275,7 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
        qapi_free_QCryptoBlockCreateOptions(ret);
        ret = NULL;
    }
-    visit_free(v);
+    opts_visitor_cleanup(ov);
    return ret;
 }

@@ -326,8 +320,8 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
        goto cleanup;
    }

-    bs->encrypted = true;
-    bs->valid_key = true;
+    bs->encrypted = 1;
+    bs->valid_key = 1;

    ret = 0;
 cleanup:
@@ -432,7 +426,7 @@ block_crypto_co_readv(BlockDriverState *bs, int64_t sector_num,
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512);

-        ret = bdrv_co_readv(bs->file,
+        ret = bdrv_co_readv(bs->file->bs,
                            payload_offset + sector_num,
                            cur_nr_sectors, &hd_qiov);
        if (ret < 0) {
@@ -511,7 +505,7 @@ block_crypto_co_writev(BlockDriverState *bs, int64_t sector_num,
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512);

-        ret = bdrv_co_writev(bs->file,
+        ret = bdrv_co_writev(bs->file->bs,
                             payload_offset + sector_num,
                             cur_nr_sectors, &hd_qiov);
        if (ret < 0) {
@@ -569,53 +563,6 @@ static int block_crypto_create_luks(const char *filename,
                                       filename, opts, errp);
 }

-static int block_crypto_get_info_luks(BlockDriverState *bs,
-                                      BlockDriverInfo *bdi)
-{
-    BlockDriverInfo subbdi;
-    int ret;
-
-    ret = bdrv_get_info(bs->file->bs, &subbdi);
-    if (ret != 0) {
-        return ret;
-    }
-
-    bdi->unallocated_blocks_are_zero = false;
-    bdi->can_write_zeroes_with_unmap = false;
-    bdi->cluster_size = subbdi.cluster_size;
-
-    return 0;
-}
-
-static ImageInfoSpecific *
-block_crypto_get_specific_info_luks(BlockDriverState *bs)
-{
-    BlockCrypto *crypto = bs->opaque;
-    ImageInfoSpecific *spec_info;
-    QCryptoBlockInfo *info;
-
-    info = qcrypto_block_get_info(crypto->block, NULL);
-    if (!info) {
-        return NULL;
-    }
-    if (info->format != Q_CRYPTO_BLOCK_FORMAT_LUKS) {
-        qapi_free_QCryptoBlockInfo(info);
-        return NULL;
-    }
-
-    spec_info = g_new(ImageInfoSpecific, 1);
-    spec_info->type = IMAGE_INFO_SPECIFIC_KIND_LUKS;
-    spec_info->u.luks.data = g_new(QCryptoBlockInfoLUKS, 1);
-    *spec_info->u.luks.data = info->u.luks;
-
-    /* Blank out pointers we've just stolen to avoid double free */
-    memset(&info->u.luks, 0, sizeof(info->u.luks));
-
-    qapi_free_QCryptoBlockInfo(info);
-
-    return spec_info;
-}
-
 BlockDriver bdrv_crypto_luks = {
    .format_name        = "luks",
    .instance_size      = sizeof(BlockCrypto),
@@ -629,8 +576,6 @@ BlockDriver bdrv_crypto_luks = {
    .bdrv_co_readv      = block_crypto_co_readv,
    .bdrv_co_writev     = block_crypto_co_writev,
    .bdrv_getlength     = block_crypto_getlength,
-    .bdrv_get_info      = block_crypto_get_info_luks,
-    .bdrv_get_specific_info = block_crypto_get_specific_info_luks,
 };

 static void block_crypto_init(void)
--- a/block/curl.c
+++ b/block/curl.c
@@ -36,16 +36,10 @@
 // #define DEBUG_VERBOSE

 #ifdef DEBUG_CURL
-#define DEBUG_CURL_PRINT 1
+#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
 #else
-#define DEBUG_CURL_PRINT 0
+#define DPRINTF(fmt, ...) do { } while (0)
 #endif
-#define DPRINTF(fmt, ...)                                            \
-    do {                                                             \
-        if (DEBUG_CURL_PRINT) {                                      \
-            fprintf(stderr, fmt, ## __VA_ARGS__);                    \
-        }                                                            \
-    } while (0)

 #if LIBCURL_VERSION_NUM >= 0x071000
 /* The multi interface timer callback was introduced in 7.16.0 */
@@ -73,7 +67,6 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,

 #define CURL_NUM_STATES 8
 #define CURL_NUM_ACB    8
-#define SECTOR_SIZE     512
 #define READ_AHEAD_DEFAULT (256 * 1024)
 #define CURL_TIMEOUT_DEFAULT 5
 #define CURL_TIMEOUT_MAX 10000
@@ -106,12 +99,17 @@ typedef struct CURLAIOCB {
    size_t end;
 } CURLAIOCB;

+typedef struct CURLSocket {
+    int fd;
+    QLIST_ENTRY(CURLSocket) next;
+} CURLSocket;
+
 typedef struct CURLState
 {
    struct BDRVCURLState *s;
    CURLAIOCB *acb[CURL_NUM_ACB];
    CURL *curl;
-    curl_socket_t sock_fd;
+    QLIST_HEAD(, CURLSocket) sockets;
    char *orig_buf;
    size_t buf_start;
    size_t buf_off;
@@ -165,10 +163,27 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
 {
    BDRVCURLState *s;
    CURLState *state = NULL;
+    CURLSocket *socket;
+
    curl_easy_getinfo(curl, CURLINFO_PRIVATE, (char **)&state);
-    state->sock_fd = fd;
    s = state->s;

+    QLIST_FOREACH(socket, &state->sockets, next) {
+        if (socket->fd == fd) {
+            if (action == CURL_POLL_REMOVE) {
+                QLIST_REMOVE(socket, next);
+                g_free(socket);
+            }
+            break;
+        }
+    }
+    if (!socket) {
+        socket = g_new0(CURLSocket, 1);
+        socket->fd = fd;
+        QLIST_INSERT_HEAD(&state->sockets, socket, next);
+    }
+    socket = NULL;
+
    DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, (int)fd);
    switch (action) {
        case CURL_POLL_IN:
@@ -214,12 +229,13 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)

    DPRINTF("CURL: Just reading %zd bytes\n", realsize);

-    if (!s || !s->orig_buf)
-        return 0;
+    if (!s || !s->orig_buf) {
+        goto read_end;
+    }

    if (s->buf_off >= s->buf_len) {
        /* buffer full, read nothing */
-        return 0;
+        goto read_end;
    }
    realsize = MIN(realsize, s->buf_len - s->buf_off);
    memcpy(s->orig_buf + s->buf_off, ptr, realsize);
@@ -232,15 +248,26 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
            continue;

        if ((s->buf_off >= acb->end)) {
+            size_t request_length = acb->nb_sectors * BDRV_SECTOR_SIZE;
+
            qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start,
                                acb->end - acb->start);
+
+            if (acb->end - acb->start < request_length) {
+                size_t offset = acb->end - acb->start;
+                qemu_iovec_memset(acb->qiov, offset, 0,
+                                  request_length - offset);
+            }
+
            acb->common.cb(acb->common.opaque, 0);
            qemu_aio_unref(acb);
            s->acb[i] = NULL;
        }
    }

-    return realsize;
+read_end:
+    /* curl will error out if we do not return this value */
+    return size * nmemb;
 }

 static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
@@ -248,6 +275,8 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
 {
    int i;
    size_t end = start + len;
+    size_t clamped_end = MIN(end, s->len);
+    size_t clamped_len = clamped_end - start;

    for (i=0; i<CURL_NUM_STATES; i++) {
        CURLState *state = &s->states[i];
@@ -262,12 +291,15 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
        // Does the existing buffer cover our section?
        if ((start >= state->buf_start) &&
            (start <= buf_end) &&
-            (end >= state->buf_start) &&
-            (end <= buf_end))
+            (clamped_end >= state->buf_start) &&
+            (clamped_end <= buf_end))
        {
            char *buf = state->orig_buf + (start - state->buf_start);

-            qemu_iovec_from_buf(acb->qiov, 0, buf, len);
+            qemu_iovec_from_buf(acb->qiov, 0, buf, clamped_len);
+            if (clamped_len < len) {
+                qemu_iovec_memset(acb->qiov, clamped_len, 0, len - clamped_len);
+            }
            acb->common.cb(acb->common.opaque, 0);

            return FIND_RET_OK;
@@ -277,13 +309,13 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
        if (state->in_use &&
            (start >= state->buf_start) &&
            (start <= buf_fend) &&
-            (end >= state->buf_start) &&
-            (end <= buf_fend))
+            (clamped_end >= state->buf_start) &&
+            (clamped_end <= buf_fend))
        {
            int j;

            acb->start = start - state->buf_start;
-            acb->end = acb->start + len;
+            acb->end = acb->start + clamped_len;

            for (j=0; j<CURL_NUM_ACB; j++) {
                if (!state->acb[j]) {
@@ -353,6 +385,7 @@ static void curl_multi_check_completion(BDRVCURLState *s)
 static void curl_multi_do(void *arg)
 {
    CURLState *s = (CURLState *)arg;
+    CURLSocket *socket, *next_socket;
    int running;
    int r;

@@ -360,10 +393,13 @@ static void curl_multi_do(void *arg)
        return;
    }

-    do {
-        r = curl_multi_socket_action(s->s->multi, s->sock_fd, 0, &running);
-    } while(r == CURLM_CALL_MULTI_PERFORM);
-
+    /* Need to use _SAFE because curl_multi_socket_action() may trigger
+     * curl_sock_cb() which might modify this list */
+    QLIST_FOREACH_SAFE(socket, &s->sockets, next, next_socket) {
+        do {
+            r = curl_multi_socket_action(s->s->multi, socket->fd, 0, &running);
+        } while (r == CURLM_CALL_MULTI_PERFORM);
+    }
 }

 static void curl_multi_read(void *arg)
@@ -467,6 +503,7 @@ static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s)
 #endif
    }

+    QLIST_INIT(&state->sockets);
    state->s = s;

    return state;
@@ -476,6 +513,14 @@ static void curl_clean_state(CURLState *s)
 {
    if (s->s->multi)
        curl_multi_remove_handle(s->s->multi, s->curl);
+
+    while (!QLIST_EMPTY(&s->sockets)) {
+        CURLSocket *socket = QLIST_FIRST(&s->sockets);
+
+        QLIST_REMOVE(socket, next);
+        g_free(socket);
+    }
+
    s->in_use = 0;
 }

@@ -675,28 +720,11 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
    curl_easy_setopt(state->curl, CURLOPT_HEADERDATA, s);
    if (curl_easy_perform(state->curl))
        goto out;
-    if (curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d)) {
+    curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d);
+    if (d)
+        s->len = (size_t)d;
+    else if(!s->len)
        goto out;
-    }
-    /* Prior CURL 7.19.4 return value of 0 could mean that the file size is not
-     * know or the size is zero. From 7.19.4 CURL returns -1 if size is not
-     * known and zero if it is realy zero-length file. */
-#if LIBCURL_VERSION_NUM >= 0x071304
-    if (d < 0) {
-        pstrcpy(state->errmsg, CURL_ERROR_SIZE,
-                "Server didn't report file size.");
-        goto out;
-    }
-#else
-    if (d <= 0) {
-        pstrcpy(state->errmsg, CURL_ERROR_SIZE,
-                "Unknown file size or zero-length file.");
-        goto out;
-    }
-#endif
-
-    s->len = (size_t)d;
-
    if ((!strncasecmp(s->url, "http://", strlen("http://"))
        || !strncasecmp(s->url, "https://", strlen("https://")))
        && !s->accept_range) {
@@ -742,12 +770,12 @@ static void curl_readv_bh_cb(void *p)
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;

-    size_t start = acb->sector_num * SECTOR_SIZE;
+    size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
    size_t end;

    // In case we have the requested data already (e.g. read-ahead),
    // we can just call the callback and be done.
-    switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) {
+    switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
        case FIND_RET_OK:
            qemu_aio_unref(acb);
            // fall through
@@ -766,13 +794,13 @@ static void curl_readv_bh_cb(void *p)
    }

    acb->start = 0;
-    acb->end = (acb->nb_sectors * SECTOR_SIZE);
+    acb->end = MIN(acb->nb_sectors * BDRV_SECTOR_SIZE, s->len - start);

    state->buf_off = 0;
    g_free(state->orig_buf);
    state->buf_start = start;
-    state->buf_len = acb->end + s->readahead_size;
-    end = MIN(start + state->buf_len, s->len) - 1;
+    state->buf_len = MIN(acb->end + s->readahead_size, s->len - start);
+    end = start + state->buf_len - 1;
    state->orig_buf = g_try_malloc(state->buf_len);
    if (state->buf_len && state->orig_buf == NULL) {
        curl_clean_state(state);
@@ -783,8 +811,8 @@ static void curl_readv_bh_cb(void *p)
    state->acb[0] = acb;

    snprintf(state->range, 127, "%zd-%zd", start, end);
-    DPRINTF("CURL (AIO): Reading %d at %zd (%s)\n",
-            (acb->nb_sectors * SECTOR_SIZE), start, state->range);
+    DPRINTF("CURL (AIO): Reading %llu at %zd (%s)\n",
+            (acb->nb_sectors * BDRV_SECTOR_SIZE), start, state->range);
    curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range);

    curl_multi_add_handle(s->multi, state->curl);
--- a/block/dictzip.c
+++ b/block/dictzip.c
@@ -0,0 +1,586 @@
+/*
+ * DictZip Block driver for dictzip enabled gzip files
+ *
+ * Use the "dictzip" tool from the "dictd" package to create gzip files that
+ * contain the extra DictZip headers.
+ *
+ * dictzip(1) is a compression program which creates compressed files in the
+ * gzip format (see RFC 1952). However, unlike gzip(1), dictzip(1) compresses
+ * the file in pieces and stores an index to the pieces in the gzip header.
+ * This allows random access to the file at the granularity of the compressed
+ * pieces (currently about 64kB) while maintaining good compression ratios
+ * (within 5% of the expected ratio for dictionary data).
+ * dictd(8) uses files stored in this format.
+ *
+ * For details on DictZip see http://dict.org/.
+ *
+ * Copyright (c) 2009 Alexander Graf <agraf@suse.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include <zlib.h>
+
+// #define DEBUG
+
+#ifdef DEBUG
+#define dprintf(fmt, ...) do { printf("dzip: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define dprintf(fmt, ...) do { } while (0)
+#endif
+
+#define SECTOR_SIZE 512
+#define Z_STREAM_COUNT 4
+#define CACHE_COUNT 20
+
+/* magic values */
+
+#define GZ_MAGIC1     0x1f
+#define GZ_MAGIC2     0x8b
+#define DZ_MAGIC1      'R'
+#define DZ_MAGIC2      'A'
+
+#define GZ_FEXTRA     0x04      /* Optional field (random access index)    */
+#define GZ_FNAME      0x08      /* Original name                           */
+#define GZ_COMMENT    0x10      /* Zero-terminated, human-readable comment */
+#define GZ_FHCRC      0x02      /* Header CRC16                            */
+
+/* offsets */
+
+#define GZ_ID            0      /* GZ_MAGIC (16bit)                        */
+#define GZ_FLG           3      /* FLaGs (see above)                       */
+#define GZ_XLEN         10      /* eXtra LENgth (16bit)                    */
+#define GZ_SI           12      /* Subfield ID (16bit)                     */
+#define GZ_VERSION      16      /* Version for subfield format             */
+#define GZ_CHUNKSIZE    18      /* Chunk size (16bit)                      */
+#define GZ_CHUNKCNT     20      /* Number of chunks (16bit)                */
+#define GZ_RNDDATA      22      /* Random access data (16bit)              */
+
+#define GZ_99_CHUNKSIZE 18      /* Chunk size (32bit)                      */
+#define GZ_99_CHUNKCNT  22      /* Number of chunks (32bit)                */
+#define GZ_99_FILESIZE  26      /* Size of unpacked file (64bit)           */
+#define GZ_99_RNDDATA   34      /* Random access data (32bit)              */
+
+struct BDRVDictZipState;
+
+typedef struct DictZipAIOCB {
+    BlockAIOCB common;
+    struct BDRVDictZipState *s;
+    QEMUIOVector *qiov;          /* QIOV of the original request */
+    QEMUIOVector *qiov_gz;       /* QIOV of the gz subrequest */
+    QEMUBH *bh;                  /* BH for cache */
+    z_stream *zStream;           /* stream to use for decoding */
+    int zStream_id;              /* stream id of the above pointer */
+    size_t start;                /* offset into the uncompressed file */
+    size_t len;                  /* uncompressed bytes to read */
+    uint8_t *gzipped;            /* the gzipped data */
+    uint8_t *buf;                /* cached result */
+    size_t gz_len;               /* amount of gzip data */
+    size_t gz_start;             /* uncompressed starting point of gzip data */
+    uint64_t offset;             /* offset for "start" into the uncompressed chunk */
+    int chunks_len;              /* amount of uncompressed data in all gzip data */
+} DictZipAIOCB;
+
+typedef struct dict_cache {
+    size_t start;
+    size_t len;
+    uint8_t *buf;
+} DictCache;
+
+typedef struct BDRVDictZipState {
+    BlockDriverState *hd;
+    z_stream zStream[Z_STREAM_COUNT];
+    DictCache cache[CACHE_COUNT];
+    int cache_index;
+    uint8_t  stream_in_use;
+    uint64_t chunk_len;
+    uint32_t chunk_cnt;
+    uint16_t *chunks;
+    uint32_t *chunks32;
+    uint64_t *offsets;
+    int64_t file_len;
+} BDRVDictZipState;
+
+static int start_zStream(z_stream *zStream)
+{
+    zStream->zalloc    = NULL;
+    zStream->zfree     = NULL;
+    zStream->opaque    = NULL;
+    zStream->next_in   = 0;
+    zStream->avail_in  = 0;
+    zStream->next_out  = NULL;
+    zStream->avail_out = 0;
+
+    return inflateInit2( zStream, -15 );
+}
+
+static QemuOptsList runtime_opts = {
+    .name = "dzip",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",
+            .type = QEMU_OPT_STRING,
+            .help = "URL to the dictzip file",
+        },
+        { /* end of list */ }
+    },
+};
+
+/* Parse the gzip/dictzip header of the image and build the chunk tables */
+static int dictzip_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
+{
+    BDRVDictZipState *s = bs->opaque;
+    const char *err = "Unknown (read error?)";
+    uint8_t magic[2];
+    char buf[100];
+    uint8_t header_flags;
+    uint16_t chunk_len16;
+    uint16_t chunk_cnt16;
+    uint32_t chunk_len32;
+    uint16_t header_ver;
+    uint16_t tmp_short;
+    uint64_t offset;
+    int chunks_len;
+    int headerLength = GZ_XLEN - 1;
+    int rnd_offs;
+    int ret;
+    int i;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename;
+
+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err != NULL) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    filename = qemu_opt_get(opts, "filename");
+
+    if (!strncmp(filename, "dzip://", 7))
+        filename += 7;
+    else if (!strncmp(filename, "dzip:", 5))
+        filename += 5;
+
+    ret = bdrv_open(&s->hd, filename, NULL, NULL, flags | BDRV_O_PROTOCOL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        qemu_opts_del(opts);
+        return ret;
+    }
+
+    /* initialize zlib streams */
+    for (i = 0; i < Z_STREAM_COUNT; i++) {
+        if (start_zStream( &s->zStream[i] ) != Z_OK) {
+            err = s->zStream[i].msg;
+            goto fail;
+        }
+    }
+
+    /* gzip header */
+    if (bdrv_pread(s->hd, GZ_ID, &magic, sizeof(magic)) != sizeof(magic))
+        goto fail;
+
+    if (!((magic[0] == GZ_MAGIC1) && (magic[1] == GZ_MAGIC2))) {
+        err = "No gzip file";
+        goto fail;
+    }
+
+    /* dzip header */
+    if (bdrv_pread(s->hd, GZ_FLG, &header_flags, 1) != 1)
+        goto fail;
+
+    if (!(header_flags & GZ_FEXTRA)) {
+        err = "Not a dictzip file (wrong flags)";
+        goto fail;
+    }
+
+    /* extra length */
+    if (bdrv_pread(s->hd, GZ_XLEN, &tmp_short, 2) != 2)
+        goto fail;
+
+    headerLength += le16_to_cpu(tmp_short) + 2;
+
+    /* DictZip magic */
+    if (bdrv_pread(s->hd, GZ_SI, &magic, 2) != 2)
+        goto fail;
+
+    if (magic[0] != DZ_MAGIC1 || magic[1] != DZ_MAGIC2) {
+        err = "Not a dictzip file (missing extra magic)";
+        goto fail;
+    }
+
+    /* DictZip version */
+    if (bdrv_pread(s->hd, GZ_VERSION, &header_ver, 2) != 2)
+        goto fail;
+
+    header_ver = le16_to_cpu(header_ver);
+
+    switch (header_ver) {
+        case 1: /* Normal DictZip */
+            /* chunk length (16 bit) */
+            if (bdrv_pread(s->hd, GZ_CHUNKSIZE, &chunk_len16, 2) != 2)
+                goto fail;
+
+            s->chunk_len = le16_to_cpu(chunk_len16);
+
+            /* chunk count */
+            if (bdrv_pread(s->hd, GZ_CHUNKCNT, &chunk_cnt16, 2) != 2)
+                goto fail;
+
+            s->chunk_cnt = le16_to_cpu(chunk_cnt16);
+            chunks_len = sizeof(short) * s->chunk_cnt;
+            rnd_offs = GZ_RNDDATA;
+            break;
+        case 99: /* Special Alex pigz version */
+            /* chunk length (32 bit) */
+            if (bdrv_pread(s->hd, GZ_99_CHUNKSIZE, &chunk_len32, 4) != 4)
+                goto fail;
+
+            dprintf("chunk len [%#x] = %d\n", GZ_99_CHUNKSIZE, chunk_len32);
+            s->chunk_len = le32_to_cpu(chunk_len32);
+
+            /* chunk count */
+            if (bdrv_pread(s->hd, GZ_99_CHUNKCNT, &s->chunk_cnt, 4) != 4)
+                goto fail;
+
+            s->chunk_cnt = le32_to_cpu(s->chunk_cnt);
+
+            dprintf("chunk len | count = %"PRId64" | %d\n", s->chunk_len, s->chunk_cnt);
+
+            /* file size */
+            if (bdrv_pread(s->hd, GZ_99_FILESIZE, &s->file_len, 8) != 8)
+                goto fail;
+
+            s->file_len = le64_to_cpu(s->file_len);
+            chunks_len = sizeof(int) * s->chunk_cnt;
+            rnd_offs = GZ_99_RNDDATA;
+            break;
+        default:
+            err = "Invalid DictZip version";
+            goto fail;
+    }
+
+    /* random access data: the compressed size of every chunk */
+    s->chunks = g_malloc(chunks_len);
+    if (header_ver == 99)
+        s->chunks32 = (uint32_t *)s->chunks;
+
+    if (bdrv_pread(s->hd, rnd_offs, s->chunks, chunks_len) != chunks_len)
+        goto fail;
+
+    /* orig filename; headerLength is the offset of the last header byte */
+    if (header_flags & GZ_FNAME) {
+        if (bdrv_pread(s->hd, headerLength + 1, buf, sizeof(buf)) != sizeof(buf))
+            goto fail;
+
+        buf[sizeof(buf) - 1] = '\0';
+        headerLength += strlen(buf) + 1;
+
+        if (strlen(buf) == sizeof(buf) - 1) /* no NUL seen: name too long */
+            goto fail;
+
+        dprintf("filename: %s\n", buf);
+    }
+
+    /* comment field, starts right behind the header parsed so far */
+    if (header_flags & GZ_COMMENT) {
+        if (bdrv_pread(s->hd, headerLength + 1, buf, sizeof(buf)) != sizeof(buf))
+            goto fail;
+
+        buf[sizeof(buf) - 1] = '\0';
+        headerLength += strlen(buf) + 1;
+
+        if (strlen(buf) == sizeof(buf) - 1) /* no NUL seen: comment too long */
+            goto fail;
+
+        dprintf("comment: %s\n", buf);
+    }
+
+    if (header_flags & GZ_FHCRC)
+        headerLength += 2;
+
+    /* uncompressed file length, taken from the last 4 bytes if still unknown */
+    if (!s->file_len) {
+        uint32_t file_len;
+
+        if (bdrv_pread(s->hd, bdrv_getlength(s->hd) - 4, &file_len, 4) != 4)
+            goto fail;
+
+        s->file_len = le32_to_cpu(file_len);
+    }
+
+    /* compute the offset of every compressed chunk in the underlying file */
+    s->offsets = g_malloc(sizeof( *s->offsets ) * s->chunk_cnt);
+
+    for (offset = headerLength + 1, i = 0; i < s->chunk_cnt; i++) {
+        s->offsets[i] = offset;
+        switch (header_ver) {
+        case 1:
+            offset += le16_to_cpu(s->chunks[i]);
+            break;
+        case 99:
+            offset += le32_to_cpu(s->chunks32[i]);
+            break;
+        }
+
+        dprintf("chunk %#"PRIx64" - %#"PRIx64" = offset %#"PRIx64" -> %#"PRIx64"\n", i * s->chunk_len, (i+1) * s->chunk_len, s->offsets[i], offset);
+    }
+    qemu_opts_del(opts);
+
+    return 0;
+
+fail:
+    fprintf(stderr, "DictZip: Error opening file: %s\n", err);
+    bdrv_unref(s->hd);
+    g_free(s->chunks);
+    qemu_opts_del(opts);
+    return -EINVAL;
+}
+
+/* Bottom half for a cache hit: copy the cached data and complete the request */
+static void dictzip_cache_cb(void *opaque)
+{
+    DictZipAIOCB *acb = (DictZipAIOCB *)opaque;
+
+    qemu_iovec_from_buf(acb->qiov, 0, acb->buf, acb->len); /* buf is in cache */
+    acb->common.cb(acb->common.opaque, 0); /* always reports success */
+    qemu_bh_delete(acb->bh); /* the BH was created per request in aio_readv */
+    qemu_aio_unref(acb);
+}
+
+/* Read completion: inflate the chunks, complete the request, cache the data */
+static void dictzip_read_cb(void *opaque, int ret)
+{
+    DictZipAIOCB *acb = (DictZipAIOCB *)opaque;
+    struct BDRVDictZipState *s = acb->s;
+    uint8_t *buf = NULL;
+    DictCache *cache;
+    int r, i;
+
+    if (ret < 0) /* the read of the compressed data failed: nothing to inflate */
+        goto out;
+    buf = g_malloc(acb->chunks_len);
+
+    /* try to find zlib stream for decoding */
+    do {
+        for (i = 0; i < Z_STREAM_COUNT; i++) {
+            if (!(s->stream_in_use & (1 << i))) {
+                s->stream_in_use |= (1 << i);
+                acb->zStream_id = i;
+                acb->zStream = &s->zStream[i];
+                break;
+            }
+        }
+    } while(!acb->zStream);
+
+    /* this callback is expected to run single threaded; catch it when not */
+    assert(i == 0);
+
+    /* uncompress the chunk */
+    acb->zStream->next_in   = acb->gzipped;
+    acb->zStream->avail_in  = acb->gz_len;
+    acb->zStream->next_out  = buf;
+    acb->zStream->avail_out = acb->chunks_len;
+
+    r = inflate( acb->zStream,  Z_PARTIAL_FLUSH );
+    if ( (r != Z_OK) && (r != Z_STREAM_END) ) {
+        fprintf(stderr, "Error inflating: [%d] %s\n", r, acb->zStream->msg);
+        ret = -EIO; /* never hand out or cache data that failed to decode */
+    }
+    if ( r != Z_OK ) /* stream end or error: make the stream reusable */
+        inflateReset(acb->zStream);
+    dprintf("inflating [%d] left: %d | %d bytes\n", r, acb->zStream->avail_in, acb->zStream->avail_out);
+    s->stream_in_use &= ~(1 << acb->zStream_id);
+    if (ret < 0)
+        goto out;
+
+    /* notify the caller */
+    qemu_iovec_from_buf(acb->qiov, 0, buf + acb->offset, acb->len);
+    acb->common.cb(acb->common.opaque, 0);
+    /* fill the cache, which takes over ownership of buf */
+    cache = &s->cache[s->cache_index];
+    s->cache_index = (s->cache_index + 1) % CACHE_COUNT;
+    g_free(cache->buf);
+    cache->start = acb->gz_start;
+    cache->buf = buf;
+    cache->len = acb->chunks_len;
+    buf = NULL;
+
+out:
+    /* report a failure to the caller and free occupied resources */
+    if (ret < 0)
+        acb->common.cb(acb->common.opaque, ret);
+    g_free(buf);
+    g_free(acb->qiov_gz);
+    qemu_aio_unref(acb);
+}
+
+static const AIOCBInfo dictzip_aiocb_info = { /* passed to qemu_aio_get() */
+    .aiocb_size         = sizeof(DictZipAIOCB),
+};
+
+/* Read entry point: serve from the cache, or read and inflate the chunks */
+static BlockAIOCB *dictzip_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockCompletionFunc *cb, void *opaque)
+{
+    BDRVDictZipState *s = bs->opaque;
+    DictZipAIOCB *acb;
+    QEMUIOVector *qiov_gz;
+    struct iovec *iov;
+    uint8_t *buf;
+    size_t  start = sector_num * SECTOR_SIZE;
+    size_t  len = nb_sectors * SECTOR_SIZE;
+    size_t  end = start + len;
+    size_t  gz_start;
+    size_t  gz_len;
+    int64_t gz_sector_num;
+    int     gz_nb_sectors;
+    int     first_chunk, last_chunk;
+    int     first_offset;
+    int     i;
+
+    acb = qemu_aio_get(&dictzip_aiocb_info, bs, cb, opaque);
+    if (!acb)
+        return NULL;
+
+    /* Search Cache */
+    for (i = 0; i < CACHE_COUNT; i++) {
+        if (!s->cache[i].len)
+            continue;
+
+        if ((start >= s->cache[i].start) &&
+            (end <= (s->cache[i].start + s->cache[i].len))) {
+            acb->buf = s->cache[i].buf + (start - s->cache[i].start);
+            acb->len = len;
+            acb->qiov = qiov;
+            acb->bh = qemu_bh_new(dictzip_cache_cb, acb);
+            qemu_bh_schedule(acb->bh);
+
+            return &acb->common;
+        }
+    }
+
+    /* No cache, so let's decode. We need the chunks that hold the bytes */
+    /* start .. end - 1; end is exclusive and must not pull in another chunk */
+    first_chunk  = start / s->chunk_len;
+    first_offset = start - first_chunk * s->chunk_len;
+    last_chunk   = (len ? end - 1 : end) / s->chunk_len;
+
+    gz_start = s->offsets[first_chunk];
+    gz_len = 0;
+    for (i = first_chunk; i <= last_chunk; i++) {
+        if (s->chunks32)
+            gz_len += le32_to_cpu(s->chunks32[i]);
+        else
+            gz_len += le16_to_cpu(s->chunks[i]);
+    }
+
+    gz_sector_num = gz_start / SECTOR_SIZE;
+    gz_nb_sectors = (gz_len / SECTOR_SIZE);
+
+    /* account for tail and heads */
+    while ((gz_start + gz_len) > ((gz_sector_num + gz_nb_sectors) * SECTOR_SIZE))
+        gz_nb_sectors++;
+
+    /* Allocate qiov, iov and buf in one chunk so we only need to free qiov */
+    qiov_gz = g_malloc0(sizeof(QEMUIOVector) + sizeof(struct iovec) +
+                           (gz_nb_sectors * SECTOR_SIZE));
+    iov = (struct iovec *)(((char *)qiov_gz) + sizeof(QEMUIOVector));
+    buf = ((uint8_t *)iov) + sizeof(struct iovec); /* data follows the iovec */
+
+    /* Kick off the read by the backing file, so we can start decompressing */
+    iov->iov_base = (void *)buf;
+    iov->iov_len = gz_nb_sectors * SECTOR_SIZE;
+    qemu_iovec_init_external(qiov_gz, iov, 1);
+
+    dprintf("read %zd - %zd => %zd - %zd\n", start, end, gz_start, gz_start + gz_len);
+
+    acb->s = s;
+    acb->qiov = qiov;
+    acb->qiov_gz = qiov_gz;
+    acb->start = start;
+    acb->len = len;
+    acb->gzipped = buf + (gz_start % SECTOR_SIZE);
+    acb->gz_len = gz_len;
+    acb->gz_start = first_chunk * s->chunk_len;
+    acb->offset = first_offset;
+    acb->chunks_len = (last_chunk - first_chunk + 1) * s->chunk_len;
+
+    return bdrv_aio_readv(s->hd, gz_sector_num, qiov_gz, gz_nb_sectors,
+                          dictzip_read_cb, acb);
+}
+
+static void dictzip_close(BlockDriverState *bs)
+{
+    BDRVDictZipState *s = bs->opaque;
+    int i;
+
+    /* drop the decompressed data of every populated cache slot */
+    for (i = 0; i < CACHE_COUNT; i++) {
+        if (s->cache[i].len) {
+            g_free(s->cache[i].buf);
+        }
+    }
+
+    /* tear down the zlib inflate state of every decoding stream */
+    for (i = 0; i < Z_STREAM_COUNT; i++) {
+        inflateEnd(&s->zStream[i]);
+    }
+
+    /* chunk size table and chunk offset table built by dictzip_open();
+     * g_free() ignores NULL, so no guards are needed */
+    g_free(s->chunks);
+    g_free(s->offsets);
+
+    dprintf("Close\n");
+}
+
+static int64_t dictzip_getlength(BlockDriverState *bs)
+{
+    BDRVDictZipState *s = bs->opaque;
+    dprintf("getlength -> %"PRId64"\n", (int64_t)s->file_len); /* 64-bit safe */
+    return s->file_len; /* uncompressed file length found by dictzip_open() */
+}
+
+static BlockDriver bdrv_dictzip = {
+    .format_name     = "dzip",
+    .protocol_name   = "dzip", /* dictzip_open() strips a dzip: prefix */
+
+    .instance_size   = sizeof(BDRVDictZipState),
+    .bdrv_file_open  = dictzip_open,
+    .bdrv_close      = dictzip_close,
+    .bdrv_getlength  = dictzip_getlength,
+
+    .bdrv_aio_readv  = dictzip_aio_readv, /* no write callback is set here */
+};
+
+static void dictzip_block_init(void)
+{
+    bdrv_register(&bdrv_dictzip); /* registers the driver table defined above */
+}
+
+block_init(dictzip_block_init); /* presumably runs it at block init - confirm */
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -326,14 +326,14 @@ void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
 }

 void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
-                           int64_t cur_sector, int64_t nr_sectors)
+                           int64_t cur_sector, int nr_sectors)
 {
    assert(bdrv_dirty_bitmap_enabled(bitmap));
    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
 }

 void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
-                             int64_t cur_sector, int64_t nr_sectors)
+                             int64_t cur_sector, int nr_sectors)
 {
    assert(bdrv_dirty_bitmap_enabled(bitmap));
    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
@@ -361,7 +361,7 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
 }

 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
-                    int64_t nr_sectors)
+                    int nr_sectors)
 {
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -32,6 +32,7 @@
 #ifdef CONFIG_BZIP2
 #include <bzlib.h>
 #endif
+#include <glib.h>

 enum {
    /* Limit chunk sizes to prevent unreasonable amounts of memory being used
@@ -86,7 +87,7 @@ static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result)
    uint64_t buffer;
    int ret;

-    ret = bdrv_pread(bs->file, offset, &buffer, 8);
+    ret = bdrv_pread(bs->file->bs, offset, &buffer, 8);
    if (ret < 0) {
        return ret;
    }
@@ -100,7 +101,7 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result)
    uint32_t buffer;
    int ret;

-    ret = bdrv_pread(bs->file, offset, &buffer, 4);
+    ret = bdrv_pread(bs->file->bs, offset, &buffer, 4);
    if (ret < 0) {
        return ret;
    }
@@ -153,9 +154,8 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk,
    }
 }

-static int64_t dmg_find_koly_offset(BdrvChild *file, Error **errp)
+static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp)
 {
-    BlockDriverState *file_bs = file->bs;
    int64_t length;
    int64_t offset = 0;
    uint8_t buffer[515];
@@ -179,7 +179,7 @@ static int64_t dmg_find_koly_offset(BdrvChild *file, Error **errp)
        offset = length - 511 - 512;
    }
    length = length < 515 ? length : 515;
-    ret = bdrv_pread(file, offset, buffer, length);
+    ret = bdrv_pread(file_bs, offset, buffer, length);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed while reading UDIF trailer");
        return ret;
@@ -356,7 +356,7 @@ static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds,
        offset += 4;

        buffer = g_realloc(buffer, count);
-        ret = bdrv_pread(bs->file, offset, buffer, count);
+        ret = bdrv_pread(bs->file->bs, offset, buffer, count);
        if (ret < 0) {
            goto fail;
        }
@@ -393,7 +393,7 @@ static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds,

    buffer = g_malloc(info_length + 1);
    buffer[info_length] = '\0';
-    ret = bdrv_pread(bs->file, info_begin, buffer, info_length);
+    ret = bdrv_pread(bs->file->bs, info_begin, buffer, info_length);
    if (ret != info_length) {
        ret = -EINVAL;
        goto fail;
@@ -439,8 +439,7 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    int64_t offset;
    int ret;

-    bs->read_only = true;
-
+    bs->read_only = 1;
    s->n_chunks = 0;
    s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
    /* used by dmg_read_mish_block to keep track of the current I/O position */
@@ -449,7 +448,7 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    ds.max_sectors_per_chunk = 1;

    /* locate the UDIF trailer */
-    offset = dmg_find_koly_offset(bs->file, errp);
+    offset = dmg_find_koly_offset(bs->file->bs, errp);
    if (offset < 0) {
        ret = offset;
        goto fail;
@@ -547,11 +546,6 @@ fail:
    return ret;
 }

-static void dmg_refresh_limits(BlockDriverState *bs, Error **errp)
-{
-    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
-}
-
 static inline int is_sector_in_chunk(BDRVDMGState* s,
                uint32_t chunk_num, uint64_t sector_num)
 {
@@ -600,7 +594,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
        case 0x80000005: { /* zlib compressed */
            /* we need to buffer, because only the chunk as whole can be
             * inflated. */
-            ret = bdrv_pread(bs->file, s->offsets[chunk],
+            ret = bdrv_pread(bs->file->bs, s->offsets[chunk],
                             s->compressed_chunk, s->lengths[chunk]);
            if (ret != s->lengths[chunk]) {
                return -1;
@@ -624,7 +618,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
        case 0x80000006: /* bzip2 compressed */
            /* we need to buffer, because only the chunk as whole can be
             * inflated. */
-            ret = bdrv_pread(bs->file, s->offsets[chunk],
+            ret = bdrv_pread(bs->file->bs, s->offsets[chunk],
                             s->compressed_chunk, s->lengths[chunk]);
            if (ret != s->lengths[chunk]) {
                return -1;
@@ -649,7 +643,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
            break;
 #endif /* CONFIG_BZIP2 */
        case 1: /* copy */
-            ret = bdrv_pread(bs->file, s->offsets[chunk],
+            ret = bdrv_pread(bs->file->bs, s->offsets[chunk],
                             s->uncompressed_chunk, s->lengths[chunk]);
            if (ret != s->lengths[chunk]) {
                return -1;
@@ -665,42 +659,38 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
    return 0;
 }

-static int coroutine_fn
-dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-              QEMUIOVector *qiov, int flags)
+static int dmg_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
 {
    BDRVDMGState *s = bs->opaque;
-    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
-    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-    int ret, i;
-
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-
-    qemu_co_mutex_lock(&s->lock);
+    int i;

    for (i = 0; i < nb_sectors; i++) {
        uint32_t sector_offset_in_chunk;
-        void *data;
-
        if (dmg_read_chunk(bs, sector_num + i) != 0) {
-            ret = -EIO;
-            goto fail;
+            return -1;
        }
        /* Special case: current chunk is all zeroes. Do not perform a memcpy as
         * s->uncompressed_chunk may be too small to cover the large all-zeroes
         * section. dmg_read_chunk is called to find s->current_chunk */
        if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */
-            qemu_iovec_memset(qiov, i * 512, 0, 512);
+            memset(buf + i * 512, 0, 512);
            continue;
        }
        sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk];
-        data = s->uncompressed_chunk + sector_offset_in_chunk * 512;
-        qemu_iovec_from_buf(qiov, i * 512, data, 512);
+        memcpy(buf + i * 512,
+               s->uncompressed_chunk + sector_offset_in_chunk * 512, 512);
    }
+    return 0;
+}

-    ret = 0;
-fail:
+static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num,
+                                    uint8_t *buf, int nb_sectors)
+{
+    int ret;
+    BDRVDMGState *s = bs->opaque;
+    qemu_co_mutex_lock(&s->lock);
+    ret = dmg_read(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
 }
@@ -725,8 +715,7 @@ static BlockDriver bdrv_dmg = {
    .instance_size  = sizeof(BDRVDMGState),
    .bdrv_probe     = dmg_probe,
    .bdrv_open      = dmg_open,
-    .bdrv_refresh_limits = dmg_refresh_limits,
-    .bdrv_co_preadv = dmg_co_preadv,
+    .bdrv_read      = dmg_co_read,
    .bdrv_close     = dmg_close,
 };

--- a/block/gluster.c
+++ b/block/gluster.c
--- a/block/io.c
+++ b/block/io.c
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,7 +2,7 @@
 * QEMU Block driver for iSCSI images
 *
 * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
- * Copyright (c) 2012-2016 Peter Lieven <pl@kamp.de>
+ * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -46,6 +46,7 @@

 #ifdef __linux__
 #include <scsi/sg.h>
+#include <block/scsi.h>
 #endif

 typedef struct IscsiLun {
@@ -61,23 +62,7 @@ typedef struct IscsiLun {
    struct scsi_inquiry_logical_block_provisioning lbp;
    struct scsi_inquiry_block_limits bl;
    unsigned char *zeroblock;
-    /* The allocmap tracks which clusters (pages) on the iSCSI target are
-     * allocated and which are not. In case a target returns zeros for
-     * unallocated pages (iscsilun->lprz) we can directly return zeros instead
-     * of reading zeros over the wire if a read request falls within an
-     * unallocated block. As there are 3 possible states we need 2 bitmaps to
-     * track. allocmap_valid keeps track if QEMU's information about a page is
-     * valid. allocmap tracks if a page is allocated or not. In case QEMU has no
-     * valid information about a page the corresponding allocmap entry should be
-     * switched to unallocated as well to force a new lookup of the allocation
-     * status as lookups are generally skipped if a page is suspect to be
-     * allocated. If a iSCSI target is opened with cache.direct = on the
-     * allocmap_valid does not exist turning all cached information invalid so
-     * that a fresh lookup is made for any page even if allocmap entry returns
-     * it's unallocated. */
-    unsigned long *allocmap;
-    unsigned long *allocmap_valid;
-    long allocmap_size;
+    unsigned long *allocationmap;
    int cluster_sectors;
    bool use_16_for_rw;
    bool write_protected;
@@ -168,7 +153,7 @@ static void iscsi_co_generic_bh_cb(void *opaque)
    struct IscsiTask *iTask = opaque;
    iTask->complete = 1;
    qemu_bh_delete(iTask->bh);
-    qemu_coroutine_enter(iTask->co);
+    qemu_coroutine_enter(iTask->co, NULL);
 }

 static void iscsi_retry_timer_expired(void *opaque)
@@ -176,7 +161,7 @@ static void iscsi_retry_timer_expired(void *opaque)
    struct IscsiTask *iTask = opaque;
    iTask->complete = 1;
    if (iTask->co) {
-        qemu_coroutine_enter(iTask->co);
+        qemu_coroutine_enter(iTask->co, NULL);
    }
 }

@@ -416,159 +401,55 @@ static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
    return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
 }

-static bool is_byte_request_lun_aligned(int64_t offset, int count,
-                                        IscsiLun *iscsilun)
+static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
+                                      IscsiLun *iscsilun)
 {
-    if (offset % iscsilun->block_size || count % iscsilun->block_size) {
-        error_report("iSCSI misaligned request: "
-                     "iscsilun->block_size %u, offset %" PRIi64
-                     ", count %d",
-                     iscsilun->block_size, offset, count);
-        return false;
+    if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size ||
+        (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) {
+            error_report("iSCSI misaligned request: "
+                         "iscsilun->block_size %u, sector_num %" PRIi64
+                         ", nb_sectors %d",
+                         iscsilun->block_size, sector_num, nb_sectors);
+            return 0;
    }
-    return true;
+    return 1;
 }

-static bool is_sector_request_lun_aligned(int64_t sector_num, int nb_sectors,
-                                          IscsiLun *iscsilun)
+static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
 {
-    assert(nb_sectors <= BDRV_REQUEST_MAX_SECTORS);
-    return is_byte_request_lun_aligned(sector_num << BDRV_SECTOR_BITS,
-                                       nb_sectors << BDRV_SECTOR_BITS,
-                                       iscsilun);
+    return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks,
+                                                       iscsilun),
+                                       iscsilun->cluster_sectors));
 }

-static void iscsi_allocmap_free(IscsiLun *iscsilun)
+static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num,
+                                    int nb_sectors)
 {
-    g_free(iscsilun->allocmap);
-    g_free(iscsilun->allocmap_valid);
-    iscsilun->allocmap = NULL;
-    iscsilun->allocmap_valid = NULL;
-}
-
-
-static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
-{
-    iscsi_allocmap_free(iscsilun);
-
-    iscsilun->allocmap_size =
-        DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
-                     iscsilun->cluster_sectors);
-
-    iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
-    if (!iscsilun->allocmap) {
-        return -ENOMEM;
-    }
-
-    if (open_flags & BDRV_O_NOCACHE) {
-        /* in case that cache.direct = on all allocmap entries are
-         * treated as invalid to force a relookup of the block
-         * status on every read request */
-        return 0;
-    }
-
-    iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size);
-    if (!iscsilun->allocmap_valid) {
-        /* if we are under memory pressure free the allocmap as well */
-        iscsi_allocmap_free(iscsilun);
-        return -ENOMEM;
-    }
-
-    return 0;
-}
-
-static void
-iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
-                      int nb_sectors, bool allocated, bool valid)
-{
-    int64_t cl_num_expanded, nb_cls_expanded, cl_num_shrunk, nb_cls_shrunk;
-
-    if (iscsilun->allocmap == NULL) {
+    int64_t cluster_num, nb_clusters;
+    if (iscsilun->allocationmap == NULL) {
        return;
    }
-    /* expand to entirely contain all affected clusters */
-    cl_num_expanded = sector_num / iscsilun->cluster_sectors;
-    nb_cls_expanded = DIV_ROUND_UP(sector_num + nb_sectors,
-                                   iscsilun->cluster_sectors) - cl_num_expanded;
-    /* shrink to touch only completely contained clusters */
-    cl_num_shrunk = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
-    nb_cls_shrunk = (sector_num + nb_sectors) / iscsilun->cluster_sectors
-                      - cl_num_shrunk;
-    if (allocated) {
-        bitmap_set(iscsilun->allocmap, cl_num_expanded, nb_cls_expanded);
-    } else {
-        bitmap_clear(iscsilun->allocmap, cl_num_shrunk, nb_cls_shrunk);
-    }
+    cluster_num = sector_num / iscsilun->cluster_sectors;
+    nb_clusters = DIV_ROUND_UP(sector_num + nb_sectors,
+                               iscsilun->cluster_sectors) - cluster_num;
+    bitmap_set(iscsilun->allocationmap, cluster_num, nb_clusters);
+}

-    if (iscsilun->allocmap_valid == NULL) {
+static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num,
+                                      int nb_sectors)
+{
+    int64_t cluster_num, nb_clusters;
+    if (iscsilun->allocationmap == NULL) {
        return;
    }
-    if (valid) {
-        bitmap_set(iscsilun->allocmap_valid, cl_num_shrunk, nb_cls_shrunk);
-    } else {
-        bitmap_clear(iscsilun->allocmap_valid, cl_num_expanded,
-                     nb_cls_expanded);
+    cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
+    nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors
+                  - cluster_num;
+    if (nb_clusters > 0) {
+        bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters);
    }
 }

-static void
-iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t sector_num,
-                             int nb_sectors)
-{
-    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, true, true);
-}
-
-static void
-iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t sector_num,
-                               int nb_sectors)
-{
-    /* Note: if cache.direct=on the fifth argument to iscsi_allocmap_update
-     * is ignored, so this will in effect be an iscsi_allocmap_set_invalid.
-     */
-    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, true);
-}
-
-static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t sector_num,
-                                       int nb_sectors)
-{
-    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, false);
-}
-
-static void iscsi_allocmap_invalidate(IscsiLun *iscsilun)
-{
-    if (iscsilun->allocmap) {
-        bitmap_zero(iscsilun->allocmap, iscsilun->allocmap_size);
-    }
-    if (iscsilun->allocmap_valid) {
-        bitmap_zero(iscsilun->allocmap_valid, iscsilun->allocmap_size);
-    }
-}
-
-static inline bool
-iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num,
-                            int nb_sectors)
-{
-    unsigned long size;
-    if (iscsilun->allocmap == NULL) {
-        return true;
-    }
-    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
-    return !(find_next_bit(iscsilun->allocmap, size,
-                           sector_num / iscsilun->cluster_sectors) == size);
-}
-
-static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
-                                           int64_t sector_num, int nb_sectors)
-{
-    unsigned long size;
-    if (iscsilun->allocmap_valid == NULL) {
-        return false;
-    }
-    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
-    return (find_next_zero_bit(iscsilun->allocmap_valid, size,
-                               sector_num / iscsilun->cluster_sectors) == size);
-}
-
 static int coroutine_fn
 iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      QEMUIOVector *iov, int flags)
@@ -577,23 +458,23 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    struct IscsiTask iTask;
    uint64_t lba;
    uint32_t num_sectors;
-    bool fua = flags & BDRV_REQ_FUA;
+    bool fua;

-    if (fua) {
-        assert(iscsilun->dpofua);
-    }
-    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        return -EINVAL;
    }

-    if (bs->bl.max_transfer) {
-        assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer);
+    if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) {
+        error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len "
+                     "of %d sectors", nb_sectors, bs->bl.max_transfer_length);
+        return -EINVAL;
    }

    lba = sector_qemu2lun(sector_num, iscsilun);
    num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
    iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
+    fua = iscsilun->dpofua && (flags & BDRV_REQ_FUA);
    if (iscsilun->use_16_for_rw) {
        iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
                                        NULL, num_sectors * iscsilun->block_size,
@@ -626,17 +507,34 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
        return iTask.err_code;
    }

-    iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors);
+    iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);

    return 0;
 }

+static int coroutine_fn
+iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+                QEMUIOVector *iov)
+{
+    return iscsi_co_writev_flags(bs, sector_num, nb_sectors, iov, 0);
+}


+static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun,
+                                             int64_t sector_num, int nb_sectors)
+{
+    unsigned long size;
+    if (iscsilun->allocationmap == NULL) {
+        return true;
+    }
+    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
+    return !(find_next_bit(iscsilun->allocationmap, size,
+                           sector_num / iscsilun->cluster_sectors) == size);
+}
+
 static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
                                                  int64_t sector_num,
                                                  int nb_sectors, int *pnum,
@@ -646,11 +544,11 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
    struct scsi_get_lba_status *lbas = NULL;
    struct scsi_lba_status_descriptor *lbasd = NULL;
    struct IscsiTask iTask;
-    int64_t ret;
+    int64_t ret, max_sector;

    iscsi_co_init_iscsitask(iscsilun, &iTask);

-    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        ret = -EINVAL;
        goto out;
    }
@@ -665,6 +563,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
        goto out;
    }

+    max_sector = iscsilun->num_blocks - sector_num;
 retry:
    if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun,
                                  sector_qemu2lun(sector_num, iscsilun),
@@ -709,7 +608,7 @@ retry:
        goto out;
    }

-    *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun);
+    *pnum = MIN(sector_lun2qemu(lbasd->num_blocks, iscsilun), max_sector);

    if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED ||
        lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) {
@@ -720,9 +619,9 @@ retry:
    }

    if (ret & BDRV_BLOCK_ZERO) {
-        iscsi_allocmap_set_unallocated(iscsilun, sector_num, *pnum);
+        iscsi_allocationmap_clear(iscsilun, sector_num, *pnum);
    } else {
-        iscsi_allocmap_set_allocated(iscsilun, sector_num, *pnum);
+        iscsi_allocationmap_set(iscsilun, sector_num, *pnum);
    }

    if (*pnum > nb_sectors) {
@@ -747,40 +646,26 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
    uint64_t lba;
    uint32_t num_sectors;

-    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        return -EINVAL;
    }

-    if (bs->bl.max_transfer) {
-        assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer);
+    if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) {
+        error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len "
+                     "of %d sectors", nb_sectors, bs->bl.max_transfer_length);
+        return -EINVAL;
    }

-    /* if cache.direct is off and we have a valid entry in our allocation map
-     * we can skip checking the block status and directly return zeroes if
-     * the request falls within an unallocated area */
-    if (iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
-        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
-            qemu_iovec_memset(iov, 0, 0x00, iov->size);
-            return 0;
-    }
-
-    if (nb_sectors >= ISCSI_CHECKALLOC_THRES &&
-        !iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
-        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
+    if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES &&
+        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
+        int64_t ret;
        int pnum;
        BlockDriverState *file;
-        /* check the block status from the beginning of the cluster
-         * containing the start sector */
-        int64_t ret = iscsi_co_get_block_status(bs,
-                          sector_num - sector_num % iscsilun->cluster_sectors,
-                          BDRV_REQUEST_MAX_SECTORS, &pnum, &file);
+        ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum, &file);
        if (ret < 0) {
            return ret;
        }
-        /* if the whole request falls into an unallocated area we can avoid
-         * to read and directly return zeroes instead */
-        if (ret & BDRV_BLOCK_ZERO &&
-            pnum >= nb_sectors + sector_num % iscsilun->cluster_sectors) {
+        if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) {
            qemu_iovec_memset(iov, 0, 0x00, iov->size);
            return 0;
        }
@@ -894,8 +779,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
        acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE;

        acb->ioh->sb_len_wr = acb->task->datain.size - 2;
-        ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ?
-             acb->ioh->mx_sb_len : acb->ioh->sb_len_wr;
+        ss = MIN(acb->ioh->mx_sb_len, acb->ioh->sb_len_wr);
        memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss);
    }

@@ -1042,26 +926,29 @@ iscsi_getlength(BlockDriverState *bs)
 }

 static int
-coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
+coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
+                                   int nb_sectors)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct IscsiTask iTask;
    struct unmap_list list;

-    assert(is_byte_request_lun_aligned(offset, count, iscsilun));
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+        return -EINVAL;
+    }

    if (!iscsilun->lbp.lbpu) {
        /* UNMAP is not supported by the target */
        return 0;
    }

-    list.lba = offset / iscsilun->block_size;
-    list.num = count / iscsilun->block_size;
+    list.lba = sector_qemu2lun(sector_num, iscsilun);
+    list.num = sector_qemu2lun(nb_sectors, iscsilun);

    iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
    if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1,
-                         iscsi_co_generic_cb, &iTask) == NULL) {
+                     iscsi_co_generic_cb, &iTask) == NULL) {
        return -ENOMEM;
    }

@@ -1091,15 +978,14 @@ retry:
        return iTask.err_code;
    }

-    iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                               count >> BDRV_SECTOR_BITS);
+    iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);

    return 0;
 }

 static int
-coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
-                                    int count, BdrvRequestFlags flags)
+coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+                                   int nb_sectors, BdrvRequestFlags flags)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct IscsiTask iTask;
@@ -1107,8 +993,8 @@ coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
    uint32_t nb_blocks;
    bool use_16_for_ws = iscsilun->use_16_for_rw;

-    if (!is_byte_request_lun_aligned(offset, count, iscsilun)) {
-        return -ENOTSUP;
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+        return -EINVAL;
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
@@ -1129,8 +1015,8 @@ coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
        return -ENOTSUP;
    }

-    lba = offset / iscsilun->block_size;
-    nb_blocks = count / iscsilun->block_size;
+    lba = sector_qemu2lun(sector_num, iscsilun);
+    nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);

    if (iscsilun->zeroblock == NULL) {
        iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size);
@@ -1182,17 +1068,13 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                                   count >> BDRV_SECTOR_BITS);
        return iTask.err_code;
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
-        iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                                   count >> BDRV_SECTOR_BITS);
+        iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
    } else {
-        iscsi_allocmap_set_allocated(iscsilun, offset >> BDRV_SECTOR_BITS,
-                                     count >> BDRV_SECTOR_BITS);
+        iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
    }

    return 0;
@@ -1683,10 +1565,6 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    task = NULL;

    iscsi_modesense_sync(iscsilun);
-    if (iscsilun->dpofua) {
-        bs->supported_write_flags = BDRV_REQ_FUA;
-    }
-    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;

    /* Check the write protect flag of the LUN if we want to write */
    if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
@@ -1703,13 +1581,14 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
        goto out;
    }
    bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun);
+    bs->request_alignment = iscsilun->block_size;

    /* We don't have any emulation for devices other than disks and CD-ROMs, so
     * this must be sg ioctl compatible. We force it to be sg, otherwise qemu
     * will try to read from the device to guess the image format.
     */
    if (iscsilun->type != TYPE_DISK && iscsilun->type != TYPE_ROM) {
-        bs->sg = true;
+        bs->sg = 1;
    }

    task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1,
@@ -1765,7 +1644,10 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
        iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
                                     iscsilun->block_size) >> BDRV_SECTOR_BITS;
        if (iscsilun->lbprz) {
-            ret = iscsi_allocmap_init(iscsilun, bs->open_flags);
+            iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
+            if (iscsilun->allocationmap == NULL) {
+                ret = -ENOMEM;
+            }
        }
    }

@@ -1802,57 +1684,48 @@ static void iscsi_close(BlockDriverState *bs)
    }
    iscsi_destroy_context(iscsi);
    g_free(iscsilun->zeroblock);
-    iscsi_allocmap_free(iscsilun);
+    g_free(iscsilun->allocationmap);
    memset(iscsilun, 0, sizeof(IscsiLun));
 }

+static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun)
+{
+    return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1);
+}
+
 static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    /* We don't actually refresh here, but just return data queried in
     * iscsi_open(): iscsi targets don't change their limits. */

    IscsiLun *iscsilun = bs->opaque;
-    uint64_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff;
-    unsigned int block_size = MAX(BDRV_SECTOR_SIZE, iscsilun->block_size);
-
-    assert(iscsilun->block_size >= BDRV_SECTOR_SIZE || bs->sg);
-
-    bs->bl.request_alignment = block_size;
+    uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff;

    if (iscsilun->bl.max_xfer_len) {
        max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len);
    }

-    if (max_xfer_len * block_size < INT_MAX) {
-        bs->bl.max_transfer = max_xfer_len * iscsilun->block_size;
-    }
+    bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun);

    if (iscsilun->lbp.lbpu) {
-        if (iscsilun->bl.max_unmap < 0xffffffff / block_size) {
-            bs->bl.max_pdiscard =
-                iscsilun->bl.max_unmap * iscsilun->block_size;
+        if (iscsilun->bl.max_unmap < 0xffffffff) {
+            bs->bl.max_discard =
+                sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun);
        }
-        bs->bl.pdiscard_alignment =
-            iscsilun->bl.opt_unmap_gran * iscsilun->block_size;
-    } else {
-        bs->bl.pdiscard_alignment = iscsilun->block_size;
+        bs->bl.discard_alignment =
+            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
    }

-    if (iscsilun->bl.max_ws_len < 0xffffffff / block_size) {
-        bs->bl.max_pwrite_zeroes =
-            iscsilun->bl.max_ws_len * iscsilun->block_size;
+    if (iscsilun->bl.max_ws_len < 0xffffffff) {
+        bs->bl.max_write_zeroes =
+            sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun);
    }
    if (iscsilun->lbp.lbpws) {
-        bs->bl.pwrite_zeroes_alignment =
-            iscsilun->bl.opt_unmap_gran * iscsilun->block_size;
-    } else {
-        bs->bl.pwrite_zeroes_alignment = iscsilun->block_size;
-    }
-    if (iscsilun->bl.opt_xfer_len &&
-        iscsilun->bl.opt_xfer_len < INT_MAX / block_size) {
-        bs->bl.opt_transfer = pow2floor(iscsilun->bl.opt_xfer_len *
-                                        iscsilun->block_size);
+        bs->bl.write_zeroes_alignment =
+            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
    }
+    bs->bl.opt_transfer_length =
+        sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun);
 }

 /* Note that this will not re-establish a connection with an iSCSI target - it
@@ -1869,16 +1742,6 @@ static int iscsi_reopen_prepare(BDRVReopenState *state,
    return 0;
 }

-static void iscsi_reopen_commit(BDRVReopenState *reopen_state)
-{
-    IscsiLun *iscsilun = reopen_state->bs->opaque;
-
-    /* the cache.direct status might have changed */
-    if (iscsilun->allocmap != NULL) {
-        iscsi_allocmap_init(iscsilun, reopen_state->flags);
-    }
-}
-
 static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
 {
    IscsiLun *iscsilun = bs->opaque;
@@ -1898,8 +1761,9 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
        return -EINVAL;
    }

-    if (iscsilun->allocmap != NULL) {
-        iscsi_allocmap_init(iscsilun, bs->open_flags);
+    if (iscsilun->allocationmap != NULL) {
+        g_free(iscsilun->allocationmap);
+        iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
    }

    return 0;
@@ -1959,13 +1823,6 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

-static void iscsi_invalidate_cache(BlockDriverState *bs,
-                                   Error **errp)
-{
-    IscsiLun *iscsilun = bs->opaque;
-    iscsi_allocmap_invalidate(iscsilun);
-}
-
 static QemuOptsList iscsi_create_opts = {
    .name = "iscsi-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head),
@@ -1989,9 +1846,7 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_close      = iscsi_close,
    .bdrv_create     = iscsi_create,
    .create_opts     = &iscsi_create_opts,
-    .bdrv_reopen_prepare   = iscsi_reopen_prepare,
-    .bdrv_reopen_commit    = iscsi_reopen_commit,
-    .bdrv_invalidate_cache = iscsi_invalidate_cache,
+    .bdrv_reopen_prepare  = iscsi_reopen_prepare,

    .bdrv_getlength  = iscsi_getlength,
    .bdrv_get_info   = iscsi_get_info,
@@ -1999,10 +1854,12 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_refresh_limits = iscsi_refresh_limits,

    .bdrv_co_get_block_status = iscsi_co_get_block_status,
-    .bdrv_co_pdiscard      = iscsi_co_pdiscard,
-    .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
+    .bdrv_co_discard      = iscsi_co_discard,
+    .bdrv_co_write_zeroes = iscsi_co_write_zeroes,
    .bdrv_co_readv         = iscsi_co_readv,
+    .bdrv_co_writev        = iscsi_co_writev,
    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
+    .supported_write_flags = BDRV_REQ_FUA,
    .bdrv_co_flush_to_disk = iscsi_co_flush,

 #ifdef __linux__
@@ -2013,9 +1870,45 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_attach_aio_context = iscsi_attach_aio_context,
 };

+static QemuOptsList qemu_iscsi_opts = {
+    .name = "iscsi",
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_iscsi_opts.head),
+    .desc = {
+        {
+            .name = "user",
+            .type = QEMU_OPT_STRING,
+            .help = "username for CHAP authentication to target",
+        },{
+            .name = "password",
+            .type = QEMU_OPT_STRING,
+            .help = "password for CHAP authentication to target",
+        },{
+            .name = "password-secret",
+            .type = QEMU_OPT_STRING,
+            .help = "ID of the secret providing password for CHAP "
+                    "authentication to target",
+        },{
+            .name = "header-digest",
+            .type = QEMU_OPT_STRING,
+            .help = "HeaderDigest setting. "
+                    "{CRC32C|CRC32C-NONE|NONE-CRC32C|NONE}",
+        },{
+            .name = "initiator-name",
+            .type = QEMU_OPT_STRING,
+            .help = "Initiator iqn name to use when connecting",
+        },{
+            .name = "timeout",
+            .type = QEMU_OPT_NUMBER,
+            .help = "Request timeout in seconds (default 0 = no timeout)",
+        },
+        { /* end of list */ }
+    },
+};
+
 static void iscsi_block_init(void)
 {
    bdrv_register(&bdrv_iscsi);
+    qemu_add_opts(&qemu_iscsi_opts);
 }

 block_init(iscsi_block_init);
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -11,10 +11,8 @@
 #include "qemu-common.h"
 #include "block/aio.h"
 #include "qemu/queue.h"
-#include "block/block.h"
 #include "block/raw-aio.h"
 #include "qemu/event_notifier.h"
-#include "qemu/coroutine.h"

 #include <libaio.h>

@@ -28,10 +26,11 @@
 */
 #define MAX_EVENTS 128

+#define MAX_QUEUED_IO  128
+
 struct qemu_laiocb {
    BlockAIOCB common;
-    Coroutine *co;
-    LinuxAioState *ctx;
+    struct qemu_laio_state *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
@@ -42,15 +41,12 @@ struct qemu_laiocb {

 typedef struct {
    int plugged;
-    unsigned int in_queue;
-    unsigned int in_flight;
+    unsigned int n;
    bool blocked;
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
 } LaioQueue;

-struct LinuxAioState {
-    AioContext *aio_context;
-
+struct qemu_laio_state {
    io_context_t ctx;
    EventNotifier e;

@@ -59,11 +55,12 @@ struct LinuxAioState {

    /* I/O completion processing */
    QEMUBH *completion_bh;
+    struct io_event events[MAX_EVENTS];
    int event_idx;
    int event_max;
 };

-static void ioq_submit(LinuxAioState *s);
+static void ioq_submit(struct qemu_laio_state *s);

 static inline ssize_t io_event_ret(struct io_event *ev)
 {
@@ -73,7 +70,8 @@ static inline ssize_t io_event_ret(struct io_event *ev)
 /*
 * Completes an AIO request (calls the callback and frees the ACB).
 */
-static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
+static void qemu_laio_process_completion(struct qemu_laio_state *s,
+    struct qemu_laiocb *laiocb)
 {
    int ret;

@@ -87,168 +85,71 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                    laiocb->qiov->size - ret);
            } else {
-                ret = -ENOSPC;
+                ret = -EINVAL;
            }
        }
    }
+    laiocb->common.cb(laiocb->common.opaque, ret);

-    laiocb->ret = ret;
-    if (laiocb->co) {
-        /* Jump and continue completion for foreign requests, don't do
-         * anything for current request, it will be completed shortly. */
-        if (laiocb->co != qemu_coroutine_self()) {
-            qemu_coroutine_enter(laiocb->co);
-        }
-    } else {
-        laiocb->common.cb(laiocb->common.opaque, ret);
-        qemu_aio_unref(laiocb);
-    }
+    qemu_aio_unref(laiocb);
 }

-/**
- * aio_ring buffer which is shared between userspace and kernel.
- *
- * This copied from linux/fs/aio.c, common header does not exist
- * but AIO exists for ages so we assume ABI is stable.
- */
-struct aio_ring {
-    unsigned    id;    /* kernel internal index number */
-    unsigned    nr;    /* number of io_events */
-    unsigned    head;  /* Written to by userland or by kernel. */
-    unsigned    tail;
-
-    unsigned    magic;
-    unsigned    compat_features;
-    unsigned    incompat_features;
-    unsigned    header_length;  /* size of aio_ring */
-
-    struct io_event io_events[0];
-};
-
-/**
- * io_getevents_peek:
- * @ctx: AIO context
- * @events: pointer on events array, output value
-
- * Returns the number of completed events and sets a pointer
- * on events array.  This function does not update the internal
- * ring buffer, only reads head and tail.  When @events has been
- * processed io_getevents_commit() must be called.
- */
-static inline unsigned int io_getevents_peek(io_context_t ctx,
-                                             struct io_event **events)
-{
-    struct aio_ring *ring = (struct aio_ring *)ctx;
-    unsigned int head = ring->head, tail = ring->tail;
-    unsigned int nr;
-
-    nr = tail >= head ? tail - head : ring->nr - head;
-    *events = ring->io_events + head;
-    /* To avoid speculative loads of s->events[i] before observing tail.
-       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
-    smp_rmb();
-
-    return nr;
-}
-
-/**
- * io_getevents_commit:
- * @ctx: AIO context
- * @nr: the number of events on which head should be advanced
- *
- * Advances head of a ring buffer.
- */
-static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
-{
-    struct aio_ring *ring = (struct aio_ring *)ctx;
-
-    if (nr) {
-        ring->head = (ring->head + nr) % ring->nr;
-    }
-}
-
-/**
- * io_getevents_advance_and_peek:
- * @ctx: AIO context
- * @events: pointer on events array, output value
- * @nr: the number of events on which head should be advanced
- *
- * Advances head of a ring buffer and returns number of elements left.
- */
-static inline unsigned int
-io_getevents_advance_and_peek(io_context_t ctx,
-                              struct io_event **events,
-                              unsigned int nr)
-{
-    io_getevents_commit(ctx, nr);
-    return io_getevents_peek(ctx, events);
-}
-
-/**
- * qemu_laio_process_completions:
- * @s: AIO state
- *
- * Fetches completed I/O requests and invokes their callbacks.
+/* The completion BH fetches completed I/O requests and invokes their
+ * callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
- * indices are kept in LinuxAioState.  Function schedules BH completion so it
- * can be called again in a nested event loop.  When there are no events left
- * to complete the BH is being canceled.
+ * the completion events array and index are kept in qemu_laio_state.  The BH
+ * reschedules itself as long as there are completions pending so it will
+ * either be called again in a nested event loop or will be called after all
+ * events have been completed.  When there are no events left to complete, the
+ * BH returns without rescheduling.
 */
-static void qemu_laio_process_completions(LinuxAioState *s)
+static void qemu_laio_completion_bh(void *opaque)
 {
-    struct io_event *events;
+    struct qemu_laio_state *s = opaque;
+
+    /* Fetch more completion events when empty */
+    if (s->event_idx == s->event_max) {
+        do {
+            struct timespec ts = { 0 };
+            s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS,
+                                        s->events, &ts);
+        } while (s->event_max == -EINTR);
+
+        s->event_idx = 0;
+        if (s->event_max <= 0) {
+            s->event_max = 0;
+            return; /* no more events */
+        }
+    }

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

-    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
-                                                         s->event_idx))) {
-        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
-            struct iocb *iocb = events[s->event_idx].obj;
-            struct qemu_laiocb *laiocb =
+    /* Process completion events */
+    while (s->event_idx < s->event_max) {
+        struct iocb *iocb = s->events[s->event_idx].obj;
+        struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

-            laiocb->ret = io_event_ret(&events[s->event_idx]);
+        laiocb->ret = io_event_ret(&s->events[s->event_idx]);
+        s->event_idx++;

-            /* Change counters one-by-one because we can be nested. */
-            s->io_q.in_flight--;
-            s->event_idx++;
-            qemu_laio_process_completion(laiocb);
-        }
+        qemu_laio_process_completion(s, laiocb);
    }

-    qemu_bh_cancel(s->completion_bh);
-
-    /* If we are nested we have to notify the level above that we are done
-     * by setting event_max to zero, upper level will then jump out of it's
-     * own `for` loop.  If we are the last all counters droped to zero. */
-    s->event_max = 0;
-    s->event_idx = 0;
-}
-
-static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
-{
-    qemu_laio_process_completions(s);
    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
 }

-static void qemu_laio_completion_bh(void *opaque)
-{
-    LinuxAioState *s = opaque;
-
-    qemu_laio_process_completions_and_submit(s);
-}
-
 static void qemu_laio_completion_cb(EventNotifier *e)
 {
-    LinuxAioState *s = container_of(e, LinuxAioState, e);
+    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);

    if (event_notifier_test_and_clear(&s->e)) {
-        qemu_laio_process_completions_and_submit(s);
+        qemu_bh_schedule(s->completion_bh);
    }
 }

@@ -280,26 +181,22 @@ static void ioq_init(LaioQueue *io_q)
 {
    QSIMPLEQ_INIT(&io_q->pending);
    io_q->plugged = 0;
-    io_q->in_queue = 0;
-    io_q->in_flight = 0;
+    io_q->n = 0;
    io_q->blocked = false;
 }

-static void ioq_submit(LinuxAioState *s)
+static void ioq_submit(struct qemu_laio_state *s)
 {
    int ret, len;
    struct qemu_laiocb *aiocb;
-    struct iocb *iocbs[MAX_EVENTS];
+    struct iocb *iocbs[MAX_QUEUED_IO];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
-        if (s->io_q.in_flight >= MAX_EVENTS) {
-            break;
-        }
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
-            if (s->io_q.in_flight + len >= MAX_EVENTS) {
+            if (len == MAX_QUEUED_IO) {
                break;
            }
        }
@@ -309,56 +206,55 @@ static void ioq_submit(LinuxAioState *s)
            break;
        }
        if (ret < 0) {
-            /* Fail the first request, retry the rest */
-            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
-            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
-            s->io_q.in_queue--;
-            aiocb->ret = ret;
-            qemu_laio_process_completion(aiocb);
-            continue;
+            abort();
        }

-        s->io_q.in_flight += ret;
-        s->io_q.in_queue  -= ret;
+        s->io_q.n -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
-    s->io_q.blocked = (s->io_q.in_queue > 0);
-
-    if (s->io_q.in_flight) {
-        /* We can try to complete something just right away if there are
-         * still requests in-flight. */
-        qemu_laio_process_completions(s);
-        /*
-         * Even we have completed everything (in_flight == 0), the queue can
-         * have still pended requests (in_queue > 0).  We do not attempt to
-         * repeat submission to avoid IO hang.  The reason is simple: s->e is
-         * still set and completion callback will be called shortly and all
-         * pended requests will be submitted from there.
-         */
-    }
+    s->io_q.blocked = (s->io_q.n > 0);
 }

-void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
+void laio_io_plug(BlockDriverState *bs, void *aio_ctx)
 {
+    struct qemu_laio_state *s = aio_ctx;
+
    s->io_q.plugged++;
 }

-void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
+void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug)
 {
-    assert(s->io_q.plugged);
-    if (--s->io_q.plugged == 0 &&
-        !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+    struct qemu_laio_state *s = aio_ctx;
+
+    assert(s->io_q.plugged > 0 || !unplug);
+
+    if (unplug && --s->io_q.plugged > 0) {
+        return;
+    }
+
+    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
 }

-static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
-                          int type)
+BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
-    LinuxAioState *s = laiocb->ctx;
-    struct iocb *iocbs = &laiocb->iocb;
-    QEMUIOVector *qiov = laiocb->qiov;
+    struct qemu_laio_state *s = aio_ctx;
+    struct qemu_laiocb *laiocb;
+    struct iocb *iocbs;
+    off_t offset = sector_num * 512;
+
+    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
+    laiocb->nbytes = nb_sectors * 512;
+    laiocb->ctx = s;
+    laiocb->ret = -EINPROGRESS;
+    laiocb->is_read = (type == QEMU_AIO_READ);
+    laiocb->qiov = qiov;
+
+    iocbs = &laiocb->iocb;

    switch (type) {
    case QEMU_AIO_WRITE:
@@ -371,86 +267,43 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
-        return -EIO;
+        goto out_free_aiocb;
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
-    s->io_q.in_queue++;
+    s->io_q.n++;
    if (!s->io_q.blocked &&
-        (!s->io_q.plugged ||
-         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
+        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
        ioq_submit(s);
    }
-
-    return 0;
-}
-
-int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
-                                uint64_t offset, QEMUIOVector *qiov, int type)
-{
-    int ret;
-    struct qemu_laiocb laiocb = {
-        .co         = qemu_coroutine_self(),
-        .nbytes     = qiov->size,
-        .ctx        = s,
-        .ret        = -EINPROGRESS,
-        .is_read    = (type == QEMU_AIO_READ),
-        .qiov       = qiov,
-    };
-
-    ret = laio_do_submit(fd, &laiocb, offset, type);
-    if (ret < 0) {
-        return ret;
-    }
-
-    if (laiocb.ret == -EINPROGRESS) {
-        qemu_coroutine_yield();
-    }
-    return laiocb.ret;
-}
-
-BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque, int type)
-{
-    struct qemu_laiocb *laiocb;
-    off_t offset = sector_num * BDRV_SECTOR_SIZE;
-    int ret;
-
-    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
-    laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE;
-    laiocb->ctx = s;
-    laiocb->ret = -EINPROGRESS;
-    laiocb->is_read = (type == QEMU_AIO_READ);
-    laiocb->qiov = qiov;
-
-    ret = laio_do_submit(fd, laiocb, offset, type);
-    if (ret < 0) {
-        qemu_aio_unref(laiocb);
-        return NULL;
-    }
-
    return &laiocb->common;
+
+out_free_aiocb:
+    qemu_aio_unref(laiocb);
+    return NULL;
 }

-void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
+void laio_detach_aio_context(void *s_, AioContext *old_context)
 {
+    struct qemu_laio_state *s = s_;
+
    aio_set_event_notifier(old_context, &s->e, false, NULL);
    qemu_bh_delete(s->completion_bh);
 }

-void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
+void laio_attach_aio_context(void *s_, AioContext *new_context)
 {
-    s->aio_context = new_context;
+    struct qemu_laio_state *s = s_;
+
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb);
 }

-LinuxAioState *laio_init(void)
+void *laio_init(void)
 {
-    LinuxAioState *s;
+    struct qemu_laio_state *s;

    s = g_malloc0(sizeof(*s));
    if (event_notifier_init(&s->e, false) < 0) {
@@ -472,8 +325,10 @@ out_free_state:
    return NULL;
 }

-void laio_cleanup(LinuxAioState *s)
+void laio_cleanup(void *s_)
 {
+    struct qemu_laio_state *s = s_;
+
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -20,12 +20,11 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
+#include "qemu/error-report.h"

 #define SLICE_TIME    100000000ULL /* ns */
 #define MAX_IN_FLIGHT 16
-#define MAX_IO_SECTORS ((1 << 20) >> BDRV_SECTOR_BITS) /* 1 Mb */
-#define DEFAULT_MIRROR_BUF_SIZE \
-    (MAX_IN_FLIGHT * MAX_IO_SECTORS * BDRV_SECTOR_SIZE)
+#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)

 /* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
@@ -37,7 +36,7 @@ typedef struct MirrorBuffer {
 typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
-    BlockBackend *target;
+    BlockDriverState *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
@@ -46,7 +45,6 @@ typedef struct MirrorBlockJob {
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
-    BlockMirrorBackingMode backing_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
@@ -60,10 +58,9 @@ typedef struct MirrorBlockJob {
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

-    uint64_t last_pause_ns;
    unsigned long *in_flight_bitmap;
    int in_flight;
-    int64_t sectors_in_flight;
+    int sectors_in_flight;
    int ret;
    bool unmap;
    bool waiting_for_io;
@@ -83,11 +80,11 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
 {
    s->synced = false;
    if (read) {
-        return block_job_error_action(&s->common, s->on_source_error,
-                                      true, error);
+        return block_job_error_action(&s->common, s->common.bs,
+                                      s->on_source_error, true, error);
    } else {
-        return block_job_error_action(&s->common, s->on_target_error,
-                                      false, error);
+        return block_job_error_action(&s->common, s->target,
+                                      s->on_target_error, false, error);
    }
 }

@@ -124,7 +121,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
    g_free(op);

    if (s->waiting_for_io) {
-        qemu_coroutine_enter(s->common.co);
+        qemu_coroutine_enter(s->common.co, NULL);
    }
 }

@@ -160,8 +157,8 @@ static void mirror_read_complete(void *opaque, int ret)
        mirror_iteration_done(op, ret);
        return;
    }
-    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
-                    0, mirror_write_complete, op);
+    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
+                    mirror_write_complete, op);
 }

 static inline void mirror_clip_sectors(MirrorBlockJob *s,
@@ -189,9 +186,8 @@ static int mirror_cow_align(MirrorBlockJob *s,
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
-        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
-                                       *nb_sectors, &align_sector_num,
-                                       &align_nb_sectors);
+        bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors,
+                               &align_sector_num, &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
@@ -221,29 +217,23 @@ static inline void mirror_wait_for_io(MirrorBlockJob *s)
 }

 /* Submit async read while handling COW.
- * Returns: The number of sectors copied after and including sector_num,
- *          excluding any sectors copied prior to sector_num due to alignment.
- *          This will be nb_sectors if no alignment is necessary, or
+ * Returns: nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
 static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
 {
-    BlockBackend *source = s->common.blk;
+    BlockDriverState *source = s->common.bs;
    int sectors_per_chunk, nb_chunks;
-    int ret;
+    int ret = nb_sectors;
    MirrorOp *op;
-    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
-    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
-    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
-    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
@@ -284,7 +274,7 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

-    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
+    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                   mirror_read_complete, op);
    return ret;
 }
@@ -306,12 +296,10 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
-        blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS,
-                         op->nb_sectors << BDRV_SECTOR_BITS,
+        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
                         mirror_write_complete, op);
    } else {
-        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
-                              op->nb_sectors * BDRV_SECTOR_SIZE,
+        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
@@ -319,16 +307,13 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,

 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
-    BlockDriverState *source = blk_bs(s->common.blk);
+    BlockDriverState *source = s->common.bs;
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
-    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
-    int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT,
-                             MAX_IO_SECTORS);

    sector_num = hbitmap_iter_next(&s->hbi);
    if (sector_num < 0) {
@@ -340,12 +325,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)

    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
-        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+        trace_mirror_yield_in_flight(s, first_chunk, s->in_flight);
        mirror_wait_for_io(s);
    }

-    block_job_pause_point(&s->common);
-
    /* Find the number of consecutive dirty chunks following the first dirty
     * one, and wait for in flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
@@ -379,7 +362,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int ret;
-        int io_sectors, io_sectors_acct;
+        int io_sectors;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
@@ -392,9 +375,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
-            io_sectors = MIN(nb_chunks * sectors_per_chunk, max_io_sectors);
-        } else if (ret & BDRV_BLOCK_DATA) {
-            io_sectors = MIN(io_sectors, max_io_sectors);
+            io_sectors = nb_chunks * sectors_per_chunk;
        }

        io_sectors -= io_sectors % sectors_per_chunk;
@@ -403,9 +384,8 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
-            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
-                                           io_sectors,  &target_sector_num,
-                                           &target_nb_sectors);
+            bdrv_round_to_clusters(s->target, sector_num, io_sectors,
+                                   &target_sector_num, &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
@@ -414,30 +394,16 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
            }
        }

-        while (s->in_flight >= MAX_IN_FLIGHT) {
-            trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
-            mirror_wait_for_io(s);
-        }
-
-        if (s->ret < 0) {
-            return 0;
-        }
-
        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
-            io_sectors_acct = io_sectors;
            break;
        case MIRROR_METHOD_ZERO:
+            mirror_do_zero_or_discard(s, sector_num, io_sectors, false);
+            break;
        case MIRROR_METHOD_DISCARD:
-            mirror_do_zero_or_discard(s, sector_num, io_sectors,
-                                      mirror_method == MIRROR_METHOD_DISCARD);
-            if (write_zeroes_ok) {
-                io_sectors_acct = 0;
-            } else {
-                io_sectors_acct = io_sectors;
-            }
+            mirror_do_zero_or_discard(s, sector_num, io_sectors, true);
            break;
        default:
            abort();
@@ -445,9 +411,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
-        if (s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct);
-        }
+        delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors);
    }
    return delay_ns;
 }
@@ -485,8 +449,7 @@ static void mirror_exit(BlockJob *job, void *opaque)
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = blk_bs(s->common.blk);
-    BlockDriverState *target_bs = blk_bs(s->target);
+    BlockDriverState *src = s->common.bs;

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
@@ -498,25 +461,26 @@ static void mirror_exit(BlockJob *job, void *opaque)
    }

    if (s->should_complete && data->ret == 0) {
-        BlockDriverState *to_replace = src;
+        BlockDriverState *to_replace = s->common.bs;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

-        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
-            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
+        /* This was checked in mirror_start_job(), but meanwhile one of the
+         * nodes could have been newly attached to a BlockBackend. */
+        if (to_replace->blk && s->target->blk) {
+            error_report("block job: Can't create node with two BlockBackends");
+            data->ret = -EINVAL;
+            goto out;
        }

-        /* The mirror job has no requests in flight any more, but we need to
-         * drain potential other users of the BDS before changing the graph. */
-        bdrv_drained_begin(target_bs);
-        bdrv_replace_in_backing_chain(to_replace, target_bs);
-        bdrv_drained_end(target_bs);
-
-        /* We just changed the BDS the job BB refers to */
-        blk_remove_bs(job->blk);
-        blk_insert_bs(job->blk, src);
+        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
+            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
+        }
+        bdrv_replace_in_backing_chain(to_replace, s->target);
    }
+
+out:
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
@@ -526,102 +490,30 @@ static void mirror_exit(BlockJob *job, void *opaque)
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
-    bdrv_op_unblock_all(target_bs, s->common.blocker);
-    blk_unref(s->target);
+    bdrv_op_unblock_all(s->target, s->common.blocker);
+    bdrv_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
+    if (qemu_get_aio_context() == bdrv_get_aio_context(src)) {
+        aio_enable_external(iohandler_get_aio_context());
+    }
    bdrv_unref(src);
 }

-static void mirror_throttle(MirrorBlockJob *s)
-{
-    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-
-    if (now - s->last_pause_ns > SLICE_TIME) {
-        s->last_pause_ns = now;
-        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
-    } else {
-        block_job_pause_point(&s->common);
-    }
-}
-
-static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
-{
-    int64_t sector_num, end;
-    BlockDriverState *base = s->base;
-    BlockDriverState *bs = blk_bs(s->common.blk);
-    BlockDriverState *target_bs = blk_bs(s->target);
-    int ret, n;
-
-    end = s->bdev_length / BDRV_SECTOR_SIZE;
-
-    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
-        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
-            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
-            return 0;
-        }
-
-        for (sector_num = 0; sector_num < end; ) {
-            int nb_sectors = MIN(end - sector_num,
-                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);
-
-            mirror_throttle(s);
-
-            if (block_job_is_cancelled(&s->common)) {
-                return 0;
-            }
-
-            if (s->in_flight >= MAX_IN_FLIGHT) {
-                trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1);
-                mirror_wait_for_io(s);
-                continue;
-            }
-
-            mirror_do_zero_or_discard(s, sector_num, nb_sectors, false);
-            sector_num += nb_sectors;
-        }
-
-        mirror_drain(s);
-    }
-
-    /* First part, loop on the sectors and initialize the dirty bitmap.  */
-    for (sector_num = 0; sector_num < end; ) {
-        /* Just to make sure we are not exceeding int limit. */
-        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
-                             end - sector_num);
-
-        mirror_throttle(s);
-
-        if (block_job_is_cancelled(&s->common)) {
-            return 0;
-        }
-
-        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
-        if (ret < 0) {
-            return ret;
-        }
-
-        assert(n > 0);
-        if (ret == 1) {
-            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
-        }
-        sector_num += n;
-    }
-    return 0;
-}
-
 static void coroutine_fn mirror_run(void *opaque)
 {
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
-    BlockDriverState *bs = blk_bs(s->common.blk);
-    BlockDriverState *target_bs = blk_bs(s->target);
-    int64_t length;
+    BlockDriverState *bs = s->common.bs;
+    bool need_drain = true;
+    int64_t sector_num, end, length;
+    uint64_t last_pause_ns;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for a NULL string */
    int ret = 0;
+    int n;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
@@ -650,19 +542,20 @@ static void coroutine_fn mirror_run(void *opaque)
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
-    bdrv_get_backing_filename(target_bs, backing_filename,
+    bdrv_get_backing_filename(s->target, backing_filename,
                              sizeof(backing_filename));
-    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
+    if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
-    if (backing_filename[0] && !target_bs->backing
+    if (backing_filename[0] && !s->target->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
-    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);
+    s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov);

+    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
@@ -671,18 +564,45 @@ static void coroutine_fn mirror_run(void *opaque)

    mirror_free_init(s);

-    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
-        ret = mirror_dirty_init(s);
-        if (ret < 0 || block_job_is_cancelled(&s->common)) {
-            goto immediate_exit;
+        /* First part, loop on the sectors and initialize the dirty bitmap.  */
+        BlockDriverState *base = s->base;
+        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target);
+
+        for (sector_num = 0; sector_num < end; ) {
+            /* Just to make sure we are not exceeding int limit. */
+            int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
+                                 end - sector_num);
+            int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+
+            if (now - last_pause_ns > SLICE_TIME) {
+                last_pause_ns = now;
+                block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
+            }
+
+            if (block_job_is_cancelled(&s->common)) {
+                goto immediate_exit;
+            }
+
+            ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
+
+            if (ret < 0) {
+                goto immediate_exit;
+            }
+
+            assert(n > 0);
+            if (ret == 1 || mark_all_dirty) {
+                bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
+            }
+            sector_num += n;
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
    for (;;) {
        uint64_t delay_ns = 0;
-        int64_t cnt, delta;
+        int64_t cnt;
        bool should_complete;

        if (s->ret < 0) {
@@ -690,8 +610,6 @@ static void coroutine_fn mirror_run(void *opaque)
            goto immediate_exit;
        }

-        block_job_pause_point(&s->common);
-
        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
@@ -705,10 +623,9 @@ static void coroutine_fn mirror_run(void *opaque)
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
-        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
-        if (delta < SLICE_TIME &&
+        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
-            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
+            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                mirror_wait_for_io(s);
@@ -721,7 +638,7 @@ static void coroutine_fn mirror_run(void *opaque)
        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
-            ret = blk_flush(s->target);
+            ret = bdrv_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
@@ -751,11 +668,26 @@ static void coroutine_fn mirror_run(void *opaque)
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
-             * mirror_populate runs.
+             * mirror_populate runs, so pause it now.  Before deciding
+             * whether to switch to target check one last time if I/O has
+             * come in the meanwhile, and if not flush the data to disk.
             */
            trace_mirror_before_drain(s, cnt);
-            bdrv_co_drain(bs);
+
+            bdrv_drained_begin(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
+            if (cnt > 0) {
+                bdrv_drained_end(bs);
+                continue;
+            }
+
+            /* The two disks are in sync.  Exit and report successful
+             * completion.
+             */
+            assert(QLIST_EMPTY(&bs->tracked_requests));
+            s->common.cancelled = false;
+            need_drain = false;
+            break;
        }

        ret = 0;
@@ -768,15 +700,8 @@ static void coroutine_fn mirror_run(void *opaque)
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
-        } else if (cnt == 0) {
-            /* The two disks are in sync.  Exit and report successful
-             * completion.
-             */
-            assert(QLIST_EMPTY(&bs->tracked_requests));
-            s->common.cancelled = false;
-            break;
        }
-        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+        last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

 immediate_exit:
@@ -786,6 +711,7 @@ immediate_exit:
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
+        assert(need_drain);
        mirror_drain(s);
    }

@@ -794,12 +720,22 @@ immediate_exit:
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
+    if (s->target->blk) {
+        blk_iostatus_disable(s->target->blk);
+    }

    data = g_malloc(sizeof(*data));
    data->ret = ret;
-    /* Before we switch to target in mirror_exit, make sure data doesn't
-     * change. */
-    bdrv_drained_begin(bs);
+
+    if (need_drain) {
+        bdrv_drained_begin(s->common.bs);
+    }
+    if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
+        /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
+         * above bdrv_drained_begin isn't enough to quiesce it. This is ugly, we
+         * need a block layer API change to achieve this. */
+        aio_disable_external(iohandler_get_aio_context());
+    }
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
 }

@@ -814,31 +750,32 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }

+static void mirror_iostatus_reset(BlockJob *job)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    if (s->target->blk) {
+        blk_iostatus_reset(s->target->blk);
+    }
+}
+
 static void mirror_complete(BlockJob *job, Error **errp)
 {
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-    BlockDriverState *src, *target;
-
-    src = blk_bs(job->blk);
-    target = blk_bs(s->target);
+    Error *local_err = NULL;
+    int ret;

+    ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        return;
+    }
    if (!s->synced) {
-        error_setg(errp, "The active block job '%s' cannot be completed",
-                   job->id);
+        error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id);
        return;
    }

-    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
-        int ret;
-
-        assert(!target->backing);
-        ret = bdrv_open_backing_file(target, NULL, "backing", errp);
-        if (ret < 0) {
-            return;
-        }
-    }
-
-    /* block all operations on to_replace bs */
+    /* check the target bs is not blocked and block all operations on it */
    if (s->replaces) {
        AioContext *replace_aio_context;

@@ -859,67 +796,41 @@ static void mirror_complete(BlockJob *job, Error **errp)
        aio_context_release(replace_aio_context);
    }

-    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
-        BlockDriverState *backing = s->is_none_mode ? src : s->base;
-        if (backing_bs(target) != backing) {
-            bdrv_set_backing_hd(target, backing);
-        }
-    }
-
    s->should_complete = true;
    block_job_enter(&s->common);
 }

-/* There is no matching mirror_resume() because mirror_run() will begin
- * iterating again when the job is resumed.
- */
-static void coroutine_fn mirror_pause(BlockJob *job)
-{
-    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-
-    mirror_drain(s);
-}
-
-static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
-{
-    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-
-    blk_set_aio_context(s->target, new_context);
-}
-
 static const BlockJobDriver mirror_job_driver = {
-    .instance_size          = sizeof(MirrorBlockJob),
-    .job_type               = BLOCK_JOB_TYPE_MIRROR,
-    .set_speed              = mirror_set_speed,
-    .complete               = mirror_complete,
-    .pause                  = mirror_pause,
-    .attached_aio_context   = mirror_attached_aio_context,
+    .instance_size = sizeof(MirrorBlockJob),
+    .job_type      = BLOCK_JOB_TYPE_MIRROR,
+    .set_speed     = mirror_set_speed,
+    .iostatus_reset= mirror_iostatus_reset,
+    .complete      = mirror_complete,
 };

 static const BlockJobDriver commit_active_job_driver = {
-    .instance_size          = sizeof(MirrorBlockJob),
-    .job_type               = BLOCK_JOB_TYPE_COMMIT,
-    .set_speed              = mirror_set_speed,
-    .complete               = mirror_complete,
-    .pause                  = mirror_pause,
-    .attached_aio_context   = mirror_attached_aio_context,
+    .instance_size = sizeof(MirrorBlockJob),
+    .job_type      = BLOCK_JOB_TYPE_COMMIT,
+    .set_speed     = mirror_set_speed,
+    .iostatus_reset
+                   = mirror_iostatus_reset,
+    .complete      = mirror_complete,
 };

-static void mirror_start_job(const char *job_id, BlockDriverState *bs,
-                             BlockDriverState *target, const char *replaces,
+static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
+                             const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
-                             BlockMirrorBackingMode backing_mode,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
-                             bool is_none_mode, BlockDriverState *base,
-                             bool auto_complete)
+                             bool is_none_mode, BlockDriverState *base)
 {
    MirrorBlockJob *s;
+    BlockDriverState *replaced_bs;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
@@ -927,6 +838,13 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,

    assert ((granularity & (granularity - 1)) == 0);

+    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
+        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
+        return;
+    }
+
    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
@@ -936,47 +854,58 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

-    s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp);
+    /* We can't support this case as long as the block layer can't handle
+     * multiple BlockBackends per BlockDriverState. */
+    if (replaces) {
+        replaced_bs = bdrv_lookup_bs(replaces, replaces, errp);
+        if (replaced_bs == NULL) {
+            return;
+        }
+    } else {
+        replaced_bs = bs;
+    }
+    if (replaced_bs->blk && target->blk) {
+        error_setg(errp, "Can't create node with two BlockBackends");
+        return;
+    }
+
+    s = block_job_create(driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

-    s->target = blk_new();
-    blk_insert_bs(s->target, target);
-
    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
+    s->target = target;
    s->is_none_mode = is_none_mode;
-    s->backing_mode = backing_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
    s->unmap = unmap;
-    if (auto_complete) {
-        s->should_complete = true;
-    }

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
-        blk_unref(s->target);
        block_job_unref(&s->common);
        return;
    }

-    bdrv_op_block_all(target, s->common.blocker);
+    bdrv_op_block_all(s->target, s->common.blocker);

-    s->common.co = qemu_coroutine_create(mirror_run, s);
+    if (s->target->blk) {
+        blk_set_on_error(s->target->blk, on_target_error, on_target_error);
+        blk_iostatus_enable(s->target->blk);
+    }
+    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
-    qemu_coroutine_enter(s->common.co);
+    qemu_coroutine_enter(s->common.co, s);
 }

-void mirror_start(const char *job_id, BlockDriverState *bs,
-                  BlockDriverState *target, const char *replaces,
+void mirror_start(BlockDriverState *bs, BlockDriverState *target,
+                  const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
-                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
-                  BlockdevOnError on_source_error,
+                  MirrorSyncMode mode, BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
@@ -991,18 +920,17 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
-    mirror_start_job(job_id, bs, target, replaces,
-                     speed, granularity, buf_size, backing_mode,
+    mirror_start_job(bs, target, replaces,
+                     speed, granularity, buf_size,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
-                     &mirror_job_driver, is_none_mode, base, false);
+                     &mirror_job_driver, is_none_mode, base);
 }

-void commit_active_start(const char *job_id, BlockDriverState *bs,
-                         BlockDriverState *base, int64_t speed,
+void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
+                         int64_t speed,
                         BlockdevOnError on_error,
                         BlockCompletionFunc *cb,
-                         void *opaque, Error **errp,
-                         bool auto_complete)
+                         void *opaque, Error **errp)
 {
    int64_t length, base_length;
    int orig_base_flags;
@@ -1040,10 +968,10 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
        }
    }

-    mirror_start_job(job_id, bs, base, NULL, speed, 0, 0,
-                     MIRROR_LEAVE_BACKING_CHAIN,
+    bdrv_ref(base);
+    mirror_start_job(bs, base, NULL, speed, 0, 0,
                     on_error, on_error, false, cb, opaque, &local_err,
-                     &commit_active_job_driver, false, base, auto_complete);
+                     &commit_active_job_driver, false, base);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -38,7 +38,7 @@ static void nbd_recv_coroutines_enter_all(NbdClientSession *s)

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i]) {
-            qemu_coroutine_enter(s->recv_coroutine[i]);
+            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        }
    }
 }
@@ -99,7 +99,7 @@ static void nbd_reply_ready(void *opaque)
    }

    if (s->recv_coroutine[i]) {
-        qemu_coroutine_enter(s->recv_coroutine[i]);
+        qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        return;
    }

@@ -111,12 +111,12 @@ static void nbd_restart_write(void *opaque)
 {
    BlockDriverState *bs = opaque;

-    qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine);
+    qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine, NULL);
 }

 static int nbd_co_send_request(BlockDriverState *bs,
                               struct nbd_request *request,
-                               QEMUIOVector *qiov)
+                               QEMUIOVector *qiov, int offset)
 {
    NbdClientSession *s = nbd_get_client_session(bs);
    AioContext *aio_context;
@@ -149,8 +149,8 @@ static int nbd_co_send_request(BlockDriverState *bs,
        qio_channel_set_cork(s->ioc, true);
        rc = nbd_send_request(s->ioc, request);
        if (rc >= 0) {
-            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len,
-                               false);
+            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov,
+                               offset, request->len, 0);
            if (ret != request->len) {
                rc = -EIO;
            }
@@ -167,9 +167,8 @@ static int nbd_co_send_request(BlockDriverState *bs,
 }

 static void nbd_co_receive_reply(NbdClientSession *s,
-                                 struct nbd_request *request,
-                                 struct nbd_reply *reply,
-                                 QEMUIOVector *qiov)
+    struct nbd_request *request, struct nbd_reply *reply,
+    QEMUIOVector *qiov, int offset)
 {
    int ret;

@@ -182,8 +181,8 @@ static void nbd_co_receive_reply(NbdClientSession *s,
        reply->error = EIO;
    } else {
        if (qiov && reply->error == 0) {
-            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len,
-                               true);
+            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov,
+                               offset, request->len, 1);
            if (ret != request->len) {
                reply->error = EIO;
            }
@@ -218,60 +217,91 @@ static void nbd_coroutine_end(NbdClientSession *s,
    }
 }

-int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
-                         uint64_t bytes, QEMUIOVector *qiov, int flags)
+static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
+                          int nb_sectors, QEMUIOVector *qiov,
+                          int offset)
 {
    NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = {
-        .type = NBD_CMD_READ,
-        .from = offset,
-        .len = bytes,
-    };
+    struct nbd_request request = { .type = NBD_CMD_READ };
    struct nbd_reply reply;
    ssize_t ret;

-    assert(bytes <= NBD_MAX_BUFFER_SIZE);
-    assert(!flags);
+    request.from = sector_num * 512;
+    request.len = nb_sectors * 512;

    nbd_coroutine_start(client, &request);
-    ret = nbd_co_send_request(bs, &request, NULL);
+    ret = nbd_co_send_request(bs, &request, NULL, 0);
    if (ret < 0) {
        reply.error = -ret;
    } else {
-        nbd_co_receive_reply(client, &request, &reply, qiov);
+        nbd_co_receive_reply(client, &request, &reply, qiov, offset);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+
+}
+
+static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
+                           int nb_sectors, QEMUIOVector *qiov,
+                           int offset, int *flags)
+{
+    NbdClientSession *client = nbd_get_client_session(bs);
+    struct nbd_request request = { .type = NBD_CMD_WRITE };
+    struct nbd_reply reply;
+    ssize_t ret;
+
+    if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) {
+        *flags &= ~BDRV_REQ_FUA;
+        request.type |= NBD_CMD_FLAG_FUA;
+    }
+
+    request.from = sector_num * 512;
+    request.len = nb_sectors * 512;
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(bs, &request, qiov, offset);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(client, &request);
    return -reply.error;
 }

-int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
-                          uint64_t bytes, QEMUIOVector *qiov, int flags)
+int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
+                        int nb_sectors, QEMUIOVector *qiov)
 {
-    NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = {
-        .type = NBD_CMD_WRITE,
-        .from = offset,
-        .len = bytes,
-    };
-    struct nbd_reply reply;
-    ssize_t ret;
-
-    if (flags & BDRV_REQ_FUA) {
-        assert(client->nbdflags & NBD_FLAG_SEND_FUA);
-        request.type |= NBD_CMD_FLAG_FUA;
+    int offset = 0;
+    int ret;
+    while (nb_sectors > NBD_MAX_SECTORS) {
+        ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
+        if (ret < 0) {
+            return ret;
+        }
+        offset += NBD_MAX_SECTORS * 512;
+        sector_num += NBD_MAX_SECTORS;
+        nb_sectors -= NBD_MAX_SECTORS;
    }
+    return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
+}

-    assert(bytes <= NBD_MAX_BUFFER_SIZE);
-
-    nbd_coroutine_start(client, &request);
-    ret = nbd_co_send_request(bs, &request, qiov);
-    if (ret < 0) {
-        reply.error = -ret;
-    } else {
-        nbd_co_receive_reply(client, &request, &reply, NULL);
+int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
+                         int nb_sectors, QEMUIOVector *qiov, int *flags)
+{
+    int offset = 0;
+    int ret;
+    while (nb_sectors > NBD_MAX_SECTORS) {
+        ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset,
+                              flags);
+        if (ret < 0) {
+            return ret;
+        }
+        offset += NBD_MAX_SECTORS * 512;
+        sector_num += NBD_MAX_SECTORS;
+        nb_sectors -= NBD_MAX_SECTORS;
    }
-    nbd_coroutine_end(client, &request);
-    return -reply.error;
+    return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset, flags);
 }

 int nbd_client_co_flush(BlockDriverState *bs)
@@ -289,37 +319,36 @@ int nbd_client_co_flush(BlockDriverState *bs)
    request.len = 0;

    nbd_coroutine_start(client, &request);
-    ret = nbd_co_send_request(bs, &request, NULL);
+    ret = nbd_co_send_request(bs, &request, NULL, 0);
    if (ret < 0) {
        reply.error = -ret;
    } else {
-        nbd_co_receive_reply(client, &request, &reply, NULL);
+        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(client, &request);
    return -reply.error;
 }

-int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
+int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
+                          int nb_sectors)
 {
    NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = {
-        .type = NBD_CMD_TRIM,
-        .from = offset,
-        .len = count,
-    };
+    struct nbd_request request = { .type = NBD_CMD_TRIM };
    struct nbd_reply reply;
    ssize_t ret;

    if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) {
        return 0;
    }
+    request.from = sector_num * 512;
+    request.len = nb_sectors * 512;

    nbd_coroutine_start(client, &request);
-    ret = nbd_co_send_request(bs, &request, NULL);
+    ret = nbd_co_send_request(bs, &request, NULL, 0);
    if (ret < 0) {
        reply.error = -ret;
    } else {
-        nbd_co_receive_reply(client, &request, &reply, NULL);
+        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(client, &request);
    return -reply.error;
@@ -381,9 +410,6 @@ int nbd_client_init(BlockDriverState *bs,
        logout("Failed to negotiate with the NBD server\n");
        return ret;
    }
-    if (client->nbdflags & NBD_FLAG_SEND_FUA) {
-        bs->supported_write_flags = BDRV_REQ_FUA;
-    }

    qemu_co_mutex_init(&client->send_mutex);
    qemu_co_mutex_init(&client->free_sema);
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -44,12 +44,13 @@ int nbd_client_init(BlockDriverState *bs,
                    Error **errp);
 void nbd_client_close(BlockDriverState *bs);

-int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
+int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
+                          int nb_sectors);
 int nbd_client_co_flush(BlockDriverState *bs);
-int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
-                          uint64_t bytes, QEMUIOVector *qiov, int flags);
-int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
-                         uint64_t bytes, QEMUIOVector *qiov, int flags);
+int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
+                         int nb_sectors, QEMUIOVector *qiov, int *flags);
+int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
+                        int nb_sectors, QEMUIOVector *qiov);

 void nbd_client_detach_aio_context(BlockDriverState *bs);
 void nbd_client_attach_aio_context(BlockDriverState *bs,
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -42,9 +42,6 @@

 typedef struct BDRVNBDState {
    NbdClientSession client;
-
-    /* For nbd_refresh_filename() */
-    char *path, *host, *port, *export, *tlscredsid;
 } BDRVNBDState;

 static int nbd_parse_uri(const char *filename, QDict *options)
@@ -191,15 +188,13 @@ out:
    g_free(file);
 }

-static SocketAddress *nbd_config(BDRVNBDState *s, QemuOpts *opts, Error **errp)
+static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export,
+                                 Error **errp)
 {
    SocketAddress *saddr;

-    s->path = g_strdup(qemu_opt_get(opts, "path"));
-    s->host = g_strdup(qemu_opt_get(opts, "host"));
-
-    if (!s->path == !s->host) {
-        if (s->path) {
+    if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) {
+        if (qdict_haskey(options, "path")) {
            error_setg(errp, "path and host may not be used at the same time.");
        } else {
            error_setg(errp, "one of path and host must be specified.");
@@ -209,28 +204,32 @@ static SocketAddress *nbd_config(BDRVNBDState *s, QemuOpts *opts, Error **errp)

    saddr = g_new0(SocketAddress, 1);

-    if (s->path) {
+    if (qdict_haskey(options, "path")) {
        UnixSocketAddress *q_unix;
        saddr->type = SOCKET_ADDRESS_KIND_UNIX;
        q_unix = saddr->u.q_unix.data = g_new0(UnixSocketAddress, 1);
-        q_unix->path = g_strdup(s->path);
+        q_unix->path = g_strdup(qdict_get_str(options, "path"));
+        qdict_del(options, "path");
    } else {
        InetSocketAddress *inet;
-
-        s->port = g_strdup(qemu_opt_get(opts, "port"));
-
        saddr->type = SOCKET_ADDRESS_KIND_INET;
        inet = saddr->u.inet.data = g_new0(InetSocketAddress, 1);
-        inet->host = g_strdup(s->host);
-        inet->port = g_strdup(s->port);
-        if (!inet->port) {
+        inet->host = g_strdup(qdict_get_str(options, "host"));
+        if (!qdict_get_try_str(options, "port")) {
            inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
+        } else {
+            inet->port = g_strdup(qdict_get_str(options, "port"));
        }
+        qdict_del(options, "host");
+        qdict_del(options, "port");
    }

    s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX;

-    s->export = g_strdup(qemu_opt_get(opts, "export"));
+    *export = g_strdup(qdict_get_try_str(options, "export"));
+    if (*export) {
+        qdict_del(options, "export");
+    }

    return saddr;
 }
@@ -293,66 +292,28 @@ static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp)
 }


-static QemuOptsList nbd_runtime_opts = {
-    .name = "nbd",
-    .head = QTAILQ_HEAD_INITIALIZER(nbd_runtime_opts.head),
-    .desc = {
-        {
-            .name = "host",
-            .type = QEMU_OPT_STRING,
-            .help = "TCP host to connect to",
-        },
-        {
-            .name = "port",
-            .type = QEMU_OPT_STRING,
-            .help = "TCP port to connect to",
-        },
-        {
-            .name = "path",
-            .type = QEMU_OPT_STRING,
-            .help = "Unix socket path to connect to",
-        },
-        {
-            .name = "export",
-            .type = QEMU_OPT_STRING,
-            .help = "Name of the NBD export to open",
-        },
-        {
-            .name = "tls-creds",
-            .type = QEMU_OPT_STRING,
-            .help = "ID of the TLS credentials to use",
-        },
-    },
-};
-
 static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    BDRVNBDState *s = bs->opaque;
-    QemuOpts *opts = NULL;
-    Error *local_err = NULL;
+    char *export = NULL;
    QIOChannelSocket *sioc = NULL;
-    SocketAddress *saddr = NULL;
+    SocketAddress *saddr;
+    const char *tlscredsid;
    QCryptoTLSCreds *tlscreds = NULL;
    const char *hostname = NULL;
    int ret = -EINVAL;

-    opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort);
-    qemu_opts_absorb_qdict(opts, options, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        goto error;
-    }
-
    /* Pop the config into our state object. Exit if invalid. */
-    saddr = nbd_config(s, opts, errp);
+    saddr = nbd_config(s, options, &export, errp);
    if (!saddr) {
        goto error;
    }

-    s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds"));
-    if (s->tlscredsid) {
-        tlscreds = nbd_get_tls_creds(s->tlscredsid, errp);
+    tlscredsid = g_strdup(qdict_get_try_str(options, "tls-creds"));
+    if (tlscredsid) {
+        qdict_del(options, "tls-creds");
+        tlscreds = nbd_get_tls_creds(tlscredsid, errp);
        if (!tlscreds) {
            goto error;
        }
@@ -374,7 +335,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
    }

    /* NBD handshake */
-    ret = nbd_client_init(bs, sioc, s->export,
+    ret = nbd_client_init(bs, sioc, export,
                          tlscreds, hostname, errp);
 error:
    if (sioc) {
@@ -383,18 +344,42 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
    if (tlscreds) {
        object_unref(OBJECT(tlscreds));
    }
-    if (ret < 0) {
-        g_free(s->path);
-        g_free(s->host);
-        g_free(s->port);
-        g_free(s->export);
-        g_free(s->tlscredsid);
-    }
    qapi_free_SocketAddress(saddr);
-    qemu_opts_del(opts);
+    g_free(export);
    return ret;
 }

+static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
+                        int nb_sectors, QEMUIOVector *qiov)
+{
+    return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov);
+}
+
+static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num,
+                               int nb_sectors, QEMUIOVector *qiov, int flags)
+{
+    int ret;
+
+    ret = nbd_client_co_writev(bs, sector_num, nb_sectors, qiov, &flags);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* The flag wasn't sent to the server, so we need to emulate it with an
+     * explicit flush */
+    if (flags & BDRV_REQ_FUA) {
+        ret = nbd_client_co_flush(bs);
+    }
+
+    return ret;
+}
+
+static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
+                         int nb_sectors, QEMUIOVector *qiov)
+{
+    return nbd_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0);
+}
+
 static int nbd_co_flush(BlockDriverState *bs)
 {
    return nbd_client_co_flush(bs);
@@ -402,21 +387,19 @@ static int nbd_co_flush(BlockDriverState *bs)

 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
-    bs->bl.max_pdiscard = NBD_MAX_BUFFER_SIZE;
-    bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE;
+    bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
+    bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
+}
+
+static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
+                          int nb_sectors)
+{
+    return nbd_client_co_discard(bs, sector_num, nb_sectors);
 }

 static void nbd_close(BlockDriverState *bs)
 {
-    BDRVNBDState *s = bs->opaque;
-
    nbd_client_close(bs);
-
-    g_free(s->path);
-    g_free(s->host);
-    g_free(s->port);
-    g_free(s->export);
-    g_free(s->tlscredsid);
 }

 static int64_t nbd_getlength(BlockDriverState *bs)
@@ -439,45 +422,48 @@ static void nbd_attach_aio_context(BlockDriverState *bs,

 static void nbd_refresh_filename(BlockDriverState *bs, QDict *options)
 {
-    BDRVNBDState *s = bs->opaque;
    QDict *opts = qdict_new();
+    const char *path   = qdict_get_try_str(options, "path");
+    const char *host   = qdict_get_try_str(options, "host");
+    const char *port   = qdict_get_try_str(options, "port");
+    const char *export = qdict_get_try_str(options, "export");
+    const char *tlscreds = qdict_get_try_str(options, "tls-creds");

    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd")));

-    if (s->path && s->export) {
+    if (path && export) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd+unix:///%s?socket=%s", s->export, s->path);
-    } else if (s->path && !s->export) {
+                 "nbd+unix:///%s?socket=%s", export, path);
+    } else if (path && !export) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd+unix://?socket=%s", s->path);
-    } else if (!s->path && s->export && s->port) {
+                 "nbd+unix://?socket=%s", path);
+    } else if (!path && export && port) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd://%s:%s/%s", s->host, s->port, s->export);
-    } else if (!s->path && s->export && !s->port) {
+                 "nbd://%s:%s/%s", host, port, export);
+    } else if (!path && export && !port) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd://%s/%s", s->host, s->export);
-    } else if (!s->path && !s->export && s->port) {
+                 "nbd://%s/%s", host, export);
+    } else if (!path && !export && port) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd://%s:%s", s->host, s->port);
-    } else if (!s->path && !s->export && !s->port) {
+                 "nbd://%s:%s", host, port);
+    } else if (!path && !export && !port) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd://%s", s->host);
+                 "nbd://%s", host);
    }

-    if (s->path) {
-        qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(s->path)));
-    } else if (s->port) {
-        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(s->host)));
-        qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(s->port)));
+    if (path) {
+        qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(path)));
+    } else if (port) {
+        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host)));
+        qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(port)));
    } else {
-        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(s->host)));
+        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host)));
    }
-    if (s->export) {
-        qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(s->export)));
+    if (export) {
+        qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export)));
    }
-    if (s->tlscredsid) {
-        qdict_put_obj(opts, "tls-creds",
-                      QOBJECT(qstring_from_str(s->tlscredsid)));
+    if (tlscreds) {
+        qdict_put_obj(opts, "tls-creds", QOBJECT(qstring_from_str(tlscreds)));
    }

    bs->full_open_options = opts;
@@ -489,11 +475,13 @@ static BlockDriver bdrv_nbd = {
    .instance_size              = sizeof(BDRVNBDState),
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
-    .bdrv_co_preadv             = nbd_client_co_preadv,
-    .bdrv_co_pwritev            = nbd_client_co_pwritev,
+    .bdrv_co_readv              = nbd_co_readv,
+    .bdrv_co_writev             = nbd_co_writev,
+    .bdrv_co_writev_flags       = nbd_co_writev_flags,
+    .supported_write_flags      = BDRV_REQ_FUA,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
-    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
+    .bdrv_co_discard            = nbd_co_discard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_detach_aio_context,
@@ -507,11 +495,13 @@ static BlockDriver bdrv_nbd_tcp = {
    .instance_size              = sizeof(BDRVNBDState),
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
-    .bdrv_co_preadv             = nbd_client_co_preadv,
-    .bdrv_co_pwritev            = nbd_client_co_pwritev,
+    .bdrv_co_readv              = nbd_co_readv,
+    .bdrv_co_writev             = nbd_co_writev,
+    .bdrv_co_writev_flags       = nbd_co_writev_flags,
+    .supported_write_flags      = BDRV_REQ_FUA,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
-    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
+    .bdrv_co_discard            = nbd_co_discard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_detach_aio_context,
@@ -525,11 +515,13 @@ static BlockDriver bdrv_nbd_unix = {
    .instance_size              = sizeof(BDRVNBDState),
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
-    .bdrv_co_preadv             = nbd_client_co_preadv,
-    .bdrv_co_pwritev            = nbd_client_co_pwritev,
+    .bdrv_co_readv              = nbd_co_readv,
+    .bdrv_co_writev             = nbd_co_writev,
+    .bdrv_co_writev_flags       = nbd_co_writev_flags,
+    .supported_write_flags      = BDRV_REQ_FUA,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
-    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
+    .bdrv_co_discard            = nbd_co_discard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_detach_aio_context,
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -38,7 +38,6 @@
 #include <nfsc/libnfs.h>

 #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576
-#define QEMU_NFS_MAX_PAGECACHE_SIZE (8388608 / NFS_BLKSIZE)
 #define QEMU_NFS_MAX_DEBUG_LEVEL 2

 typedef struct NFSClient {
@@ -104,7 +103,7 @@ static void nfs_co_generic_bh_cb(void *opaque)
    NFSRPC *task = opaque;
    task->complete = 1;
    qemu_bh_delete(task->bh);
-    qemu_coroutine_enter(task->co);
+    qemu_coroutine_enter(task->co, NULL);
 }

 static void
@@ -343,26 +342,6 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
                val = QEMU_NFS_MAX_READAHEAD_SIZE;
            }
            nfs_set_readahead(client->context, val);
-#ifdef LIBNFS_FEATURE_PAGECACHE
-            nfs_set_pagecache_ttl(client->context, 0);
-#endif
-            client->cache_used = true;
-#endif
-#ifdef LIBNFS_FEATURE_PAGECACHE
-            nfs_set_pagecache_ttl(client->context, 0);
-        } else if (!strcmp(qp->p[i].name, "pagecache")) {
-            if (open_flags & BDRV_O_NOCACHE) {
-                error_setg(errp, "Cannot enable NFS pagecache "
-                                 "if cache.direct = on");
-                goto fail;
-            }
-            if (val > QEMU_NFS_MAX_PAGECACHE_SIZE) {
-                error_report("NFS Warning: Truncating NFS pagecache"
-                             " size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE);
-                val = QEMU_NFS_MAX_PAGECACHE_SIZE;
-            }
-            nfs_set_pagecache(client->context, val);
-            nfs_set_pagecache_ttl(client->context, 0);
            client->cache_used = true;
 #endif
 #ifdef LIBNFS_FEATURE_DEBUG
@@ -545,8 +524,7 @@ static int nfs_reopen_prepare(BDRVReopenState *state,
    }

    if ((state->flags & BDRV_O_NOCACHE) && client->cache_used) {
-        error_setg(errp, "Cannot disable cache if libnfs readahead or"
-                         " pagecache is enabled");
+        error_setg(errp, "Cannot disable cache if libnfs readahead is enabled");
        return -EINVAL;
    }

@@ -564,15 +542,6 @@ static int nfs_reopen_prepare(BDRVReopenState *state,
    return 0;
 }

-#ifdef LIBNFS_FEATURE_PAGECACHE
-static void nfs_invalidate_cache(BlockDriverState *bs,
-                                 Error **errp)
-{
-    NFSClient *client = bs->opaque;
-    nfs_pagecache_invalidate(client->context, client->fh);
-}
-#endif
-
 static BlockDriver bdrv_nfs = {
    .format_name                    = "nfs",
    .protocol_name                  = "nfs",
@@ -596,10 +565,6 @@ static BlockDriver bdrv_nfs = {

    .bdrv_detach_aio_context        = nfs_detach_aio_context,
    .bdrv_attach_aio_context        = nfs_attach_aio_context,
-
-#ifdef LIBNFS_FEATURE_PAGECACHE
-    .bdrv_invalidate_cache          = nfs_invalidate_cache,
-#endif
 };

 static void nfs_block_init(void)
--- a/block/null.c
+++ b/block/null.c
@@ -12,8 +12,6 @@

 #include "qemu/osdep.h"
 #include "qapi/error.h"
-#include "qapi/qmp/qdict.h"
-#include "qapi/qmp/qstring.h"
 #include "block/block_int.h"

 #define NULL_OPT_LATENCY "latency-ns"
@@ -225,20 +223,6 @@ static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs,
    }
 }

-static void null_refresh_filename(BlockDriverState *bs, QDict *opts)
-{
-    QINCREF(opts);
-    qdict_del(opts, "filename");
-
-    if (!qdict_size(opts)) {
-        snprintf(bs->exact_filename, sizeof(bs->exact_filename), "%s://",
-                 bs->drv->format_name);
-    }
-
-    qdict_put(opts, "driver", qstring_from_str(bs->drv->format_name));
-    bs->full_open_options = opts;
-}
-
 static BlockDriver bdrv_null_co = {
    .format_name            = "null-co",
    .protocol_name          = "null-co",
@@ -254,8 +238,6 @@ static BlockDriver bdrv_null_co = {
    .bdrv_reopen_prepare    = null_reopen_prepare,

    .bdrv_co_get_block_status   = null_co_get_block_status,
-
-    .bdrv_refresh_filename  = null_refresh_filename,
 };

 static BlockDriver bdrv_null_aio = {
@@ -273,8 +255,6 @@ static BlockDriver bdrv_null_aio = {
    .bdrv_reopen_prepare    = null_reopen_prepare,

    .bdrv_co_get_block_status   = null_co_get_block_status,
-
-    .bdrv_refresh_filename  = null_refresh_filename,
 };

 static void bdrv_null_init(void)
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -33,7 +33,6 @@
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
-#include "qemu/bswap.h"
 #include "qemu/bitmap.h"
 #include "qapi/util.h"

@@ -43,7 +42,6 @@
 #define HEADER_MAGIC2 "WithouFreSpacExt"
 #define HEADER_VERSION 2
 #define HEADER_INUSE_MAGIC  (0x746F6E59)
-#define MAX_PARALLELS_IMAGE_FACTOR (1ull << 32)

 #define DEFAULT_CLUSTER_SIZE 1048576        /* 1 MiB */

@@ -205,15 +203,13 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
        return -EINVAL;
    }

-    to_allocate = DIV_ROUND_UP(sector_num + *pnum, s->tracks) - idx;
+    to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx;
    space = to_allocate * s->tracks;
    if (s->data_end + space > bdrv_getlength(bs->file->bs) >> BDRV_SECTOR_BITS) {
        int ret;
        space += s->prealloc_size;
        if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
-            ret = bdrv_pwrite_zeroes(bs->file,
-                                     s->data_end << BDRV_SECTOR_BITS,
-                                     space << BDRV_SECTOR_BITS, 0);
+            ret = bdrv_write_zeroes(bs->file->bs, s->data_end, space, 0);
        } else {
            ret = bdrv_truncate(bs->file->bs,
                                (s->data_end + space) << BDRV_SECTOR_BITS);
@@ -251,7 +247,7 @@ static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs)
        if (off + to_write > s->header_size) {
            to_write = s->header_size - off;
        }
-        ret = bdrv_pwrite(bs->file, off, (uint8_t *)s->header + off,
+        ret = bdrv_pwrite(bs->file->bs, off, (uint8_t *)s->header + off,
                          to_write);
        if (ret < 0) {
            qemu_co_mutex_unlock(&s->lock);
@@ -312,7 +308,7 @@ static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes);

-        ret = bdrv_co_writev(bs->file, position, n, &hd_qiov);
+        ret = bdrv_co_writev(bs->file->bs, position, n, &hd_qiov);
        if (ret < 0) {
            break;
        }
@@ -352,7 +348,7 @@ static coroutine_fn int parallels_co_readv(BlockDriverState *bs,
            qemu_iovec_reset(&hd_qiov);
            qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes);

-            ret = bdrv_co_readv(bs->file, position, n, &hd_qiov);
+            ret = bdrv_co_readv(bs->file->bs, position, n, &hd_qiov);
            if (ret < 0) {
                break;
            }
@@ -433,7 +429,7 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res,
    }

    if (flush_bat) {
-        ret = bdrv_pwrite_sync(bs->file, 0, s->header, s->header_size);
+        ret = bdrv_pwrite_sync(bs->file->bs, 0, s->header, s->header_size);
        if (ret < 0) {
            res->check_errors++;
            return ret;
@@ -476,10 +472,6 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
                          BDRV_SECTOR_SIZE);
    cl_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
                          DEFAULT_CLUSTER_SIZE), BDRV_SECTOR_SIZE);
-    if (total_size >= MAX_PARALLELS_IMAGE_FACTOR * cl_size) {
-        error_propagate(errp, local_err);
-        return -E2BIG;
-    }

    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
@@ -524,8 +516,8 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    if (ret < 0) {
        goto exit;
    }
-    ret = blk_pwrite_zeroes(file, BDRV_SECTOR_SIZE,
-                            (bat_sectors - 1) << BDRV_SECTOR_BITS, 0);
+    ret = blk_write_zeroes(file, BDRV_SECTOR_SIZE,
+                           (bat_sectors - 1) << BDRV_SECTOR_BITS, 0);
    if (ret < 0) {
        goto exit;
    }
@@ -568,7 +560,7 @@ static int parallels_update_header(BlockDriverState *bs)
    if (size > s->header_size) {
        size = s->header_size;
    }
-    return bdrv_pwrite_sync(bs->file, 0, s->header, size);
+    return bdrv_pwrite_sync(bs->file->bs, 0, s->header, size);
 }

 static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
@@ -581,7 +573,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
    Error *local_err = NULL;
    char *buf;

-    ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph));
+    ret = bdrv_pread(bs->file->bs, 0, &ph, sizeof(ph));
    if (ret < 0) {
        goto fail;
    }
@@ -636,7 +628,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
        s->header_size = size;
    }

-    ret = bdrv_pread(bs->file, 0, s->header, s->header_size);
+    ret = bdrv_pread(bs->file->bs, 0, s->header, s->header_size);
    if (ret < 0) {
        goto fail;
    }
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -67,10 +67,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
    info->backing_file_depth = bdrv_get_backing_file_depth(bs);
    info->detect_zeroes = bs->detect_zeroes;

-    if (blk && blk_get_public(blk)->throttle_state) {
+    if (bs->throttle_state) {
        ThrottleConfig cfg;

-        throttle_group_get_config(blk, &cfg);
+        throttle_group_get_config(bs, &cfg);

        info->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
        info->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
@@ -118,7 +118,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
        info->iops_size = cfg.op_size;

        info->has_group = true;
-        info->group = g_strdup(throttle_group_get_name(blk));
+        info->group = g_strdup(throttle_group_get_name(bs));
    }

    info->write_threshold = bdrv_write_threshold_get(bs);
@@ -690,15 +690,16 @@ static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation,
 void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f,
                                   ImageInfoSpecific *info_spec)
 {
+    QmpOutputVisitor *ov = qmp_output_visitor_new();
    QObject *obj, *data;
-    Visitor *v = qmp_output_visitor_new(&obj);

-    visit_type_ImageInfoSpecific(v, NULL, &info_spec, &error_abort);
-    visit_complete(v, &obj);
+    visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), NULL, &info_spec,
+                                 &error_abort);
+    obj = qmp_output_get_qobject(ov);
    assert(qobject_type(obj) == QTYPE_QDICT);
    data = qdict_get(qobject_to_qdict(obj), "data");
    dump_qobject(func_fprintf, f, 1, data);
-    visit_free(v);
+    qmp_output_visitor_cleanup(ov);
 }

 void bdrv_image_info_dump(fprintf_function func_fprintf, void *f,
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -28,7 +28,6 @@
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
-#include "qemu/bswap.h"
 #include <zlib.h>
 #include "qapi/qmp/qerror.h"
 #include "crypto/cipher.h"
@@ -105,7 +104,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    int ret;
    QCowHeader header;

-    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+    ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header));
    if (ret < 0) {
        goto fail;
    }
@@ -162,19 +161,13 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    if (s->crypt_method_header) {
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
-            error_setg(errp,
-                       "Use of AES-CBC encrypted qcow images is no longer "
-                       "supported in system emulators");
-            error_append_hint(errp,
-                              "You can use 'qemu-img convert' to convert your "
-                              "image to an alternative supported format, such "
-                              "as unencrypted qcow, or raw with the LUKS "
-                              "format instead.\n");
-            ret = -ENOSYS;
-            goto fail;
+            error_report("qcow built-in AES encryption is deprecated");
+            error_printf("Support for it will be removed in a future release.\n"
+                         "You can use 'qemu-img convert' to switch to an\n"
+                         "unencrypted qcow image, or a LUKS raw image.\n");
        }

-        bs->encrypted = true;
+        bs->encrypted = 1;
    }
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
@@ -208,7 +201,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
+    ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table,
               s->l1_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
@@ -239,7 +232,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
            ret = -EINVAL;
            goto fail;
        }
-        ret = bdrv_pread(bs->file, header.backing_file_offset,
+        ret = bdrv_pread(bs->file->bs, header.backing_file_offset,
                   bs->backing_file, len);
        if (ret < 0) {
            goto fail;
@@ -390,7 +383,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
        /* update the L1 entry */
        s->l1_table[l1_index] = l2_offset;
        tmp = cpu_to_be64(l2_offset);
-        if (bdrv_pwrite_sync(bs->file,
+        if (bdrv_pwrite_sync(bs->file->bs,
                s->l1_table_offset + l1_index * sizeof(tmp),
                &tmp, sizeof(tmp)) < 0)
            return 0;
@@ -420,11 +413,11 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
    l2_table = s->l2_cache + (min_index << s->l2_bits);
    if (new_l2_table) {
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
-        if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
+        if (bdrv_pwrite_sync(bs->file->bs, l2_offset, l2_table,
                s->l2_size * sizeof(uint64_t)) < 0)
            return 0;
    } else {
-        if (bdrv_pread(bs->file, l2_offset, l2_table,
+        if (bdrv_pread(bs->file->bs, l2_offset, l2_table,
                       s->l2_size * sizeof(uint64_t)) !=
            s->l2_size * sizeof(uint64_t))
            return 0;
@@ -450,7 +443,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
            cluster_offset = (cluster_offset + s->cluster_size - 1) &
                ~(s->cluster_size - 1);
            /* write the cluster content */
-            if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache,
+            if (bdrv_pwrite(bs->file->bs, cluster_offset, s->cluster_cache,
                            s->cluster_size) !=
                s->cluster_size)
                return -1;
@@ -480,7 +473,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
                                errno = EIO;
                                return -1;
                            }
-                            if (bdrv_pwrite(bs->file,
+                            if (bdrv_pwrite(bs->file->bs,
                                            cluster_offset + i * 512,
                                            s->cluster_data, 512) != 512)
                                return -1;
@@ -495,7 +488,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
        /* update L2 table */
        tmp = cpu_to_be64(cluster_offset);
        l2_table[l2_index] = tmp;
-        if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
+        if (bdrv_pwrite_sync(bs->file->bs, l2_offset + l2_index * sizeof(tmp),
                &tmp, sizeof(tmp)) < 0)
            return 0;
    }
@@ -565,7 +558,7 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
    if (s->cluster_cache_offset != coffset) {
        csize = cluster_offset >> (63 - s->cluster_bits);
        csize &= (s->cluster_size - 1);
-        ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
+        ret = bdrv_pread(bs->file->bs, coffset, s->cluster_data, csize);
        if (ret != csize)
            return -1;
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
@@ -619,7 +612,8 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
                hd_iov.iov_len = n * 512;
                qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
                qemu_co_mutex_unlock(&s->lock);
-                ret = bdrv_co_readv(bs->backing, sector_num, n, &hd_qiov);
+                ret = bdrv_co_readv(bs->backing->bs, sector_num,
+                                    n, &hd_qiov);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
                    goto fail;
@@ -643,7 +637,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
            hd_iov.iov_len = n * 512;
            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
            qemu_co_mutex_unlock(&s->lock);
-            ret = bdrv_co_readv(bs->file,
+            ret = bdrv_co_readv(bs->file->bs,
                                (cluster_offset >> 9) + index_in_cluster,
                                n, &hd_qiov);
            qemu_co_mutex_lock(&s->lock);
@@ -745,7 +739,7 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
        hd_iov.iov_len = n * 512;
        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
        qemu_co_mutex_unlock(&s->lock);
-        ret = bdrv_co_writev(bs->file,
+        ret = bdrv_co_writev(bs->file->bs,
                             (cluster_offset >> 9) + index_in_cluster,
                             n, &hd_qiov);
        qemu_co_mutex_lock(&s->lock);
@@ -873,8 +867,8 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    tmp = g_malloc0(BDRV_SECTOR_SIZE);
-    for (i = 0; i < DIV_ROUND_UP(sizeof(uint64_t) * l1_size, BDRV_SECTOR_SIZE);
-         i++) {
+    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
+        BDRV_SECTOR_SIZE); i++) {
        ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i,
                         tmp, BDRV_SECTOR_SIZE, 0);
        if (ret != BDRV_SECTOR_SIZE) {
@@ -899,7 +893,7 @@ static int qcow_make_empty(BlockDriverState *bs)
    int ret;

    memset(s->l1_table, 0, l1_length);
-    if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
+    if (bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, s->l1_table,
            l1_length) < 0)
        return -1;
    ret = bdrv_truncate(bs->file->bs, s->l1_table_offset + l1_length);
@@ -915,32 +909,32 @@ static int qcow_make_empty(BlockDriverState *bs)

 /* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
-static coroutine_fn int
-qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
-                           uint64_t bytes, QEMUIOVector *qiov)
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+                                 const uint8_t *buf, int nb_sectors)
 {
    BDRVQcowState *s = bs->opaque;
-    QEMUIOVector hd_qiov;
-    struct iovec iov;
    z_stream strm;
    int ret, out_len;
-    uint8_t *buf, *out_buf;
+    uint8_t *out_buf;
    uint64_t cluster_offset;

-    buf = qemu_blockalign(bs, s->cluster_size);
-    if (bytes != s->cluster_size) {
-        if (bytes > s->cluster_size ||
-            offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
-        {
-            qemu_vfree(buf);
-            return -EINVAL;
-        }
-        /* Zero-pad last write if image size is not cluster aligned */
-        memset(buf + bytes, 0, s->cluster_size - bytes);
-    }
-    qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
+    if (nb_sectors != s->cluster_sectors) {
+        ret = -EINVAL;

-    out_buf = g_malloc(s->cluster_size);
+        /* Zero-pad last write if image size is not cluster aligned */
+        if (sector_num + nb_sectors == bs->total_sectors &&
+            nb_sectors < s->cluster_sectors) {
+            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+            memset(pad_buf, 0, s->cluster_size);
+            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+            ret = qcow_write_compressed(bs, sector_num,
+                                        pad_buf, s->cluster_sectors);
+            qemu_vfree(pad_buf);
+        }
+        return ret;
+    }
+
+    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
@@ -969,35 +963,27 @@ qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
-        ret = qcow_co_writev(bs, offset >> BDRV_SECTOR_BITS,
-                             bytes >> BDRV_SECTOR_BITS, qiov);
+        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
        if (ret < 0) {
            goto fail;
        }
-        goto success;
-    }
-    qemu_co_mutex_lock(&s->lock);
-    cluster_offset = get_cluster_offset(bs, offset, 2, out_len, 0, 0);
-    qemu_co_mutex_unlock(&s->lock);
-    if (cluster_offset == 0) {
-        ret = -EIO;
-        goto fail;
-    }
-    cluster_offset &= s->cluster_offset_mask;
+    } else {
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
+                                            out_len, 0, 0);
+        if (cluster_offset == 0) {
+            ret = -EIO;
+            goto fail;
+        }

-    iov = (struct iovec) {
-        .iov_base   = out_buf,
-        .iov_len    = out_len,
-    };
-    qemu_iovec_init_external(&hd_qiov, &iov, 1);
-    ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
-    if (ret < 0) {
-        goto fail;
+        cluster_offset &= s->cluster_offset_mask;
+        ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len);
+        if (ret < 0) {
+            goto fail;
+        }
    }
-success:
+
    ret = 0;
 fail:
-    qemu_vfree(buf);
    g_free(out_buf);
    return ret;
 }
@@ -1050,7 +1036,7 @@ static BlockDriver bdrv_qcow = {

    .bdrv_set_key           = qcow_set_key,
    .bdrv_make_empty        = qcow_make_empty,
-    .bdrv_co_pwritev_compressed = qcow_co_pwritev_compressed,
+    .bdrv_write_compressed  = qcow_write_compressed,
    .bdrv_get_info          = qcow_get_info,

    .create_opts            = &qcow_create_opts,
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -24,6 +24,11 @@

 /* Needed for CONFIG_MADVISE */
 #include "qemu/osdep.h"
+
+#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE)
+#include <sys/mman.h>
+#endif
+
 #include "block/block_int.h"
 #include "qemu-common.h"
 #include "qcow2.h"
@@ -210,7 +215,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
        BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
    }

-    ret = bdrv_pwrite(bs->file, c->entries[i].offset,
+    ret = bdrv_pwrite(bs->file->bs, c->entries[i].offset,
                      qcow2_cache_get_table_addr(bs, c, i), s->cluster_size);
    if (ret < 0) {
        return ret;
@@ -357,7 +362,7 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
            BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
        }

-        ret = bdrv_pread(bs->file, offset,
+        ret = bdrv_pread(bs->file->bs, offset,
                         qcow2_cache_get_table_addr(bs, c, i),
                         s->cluster_size);
        if (ret < 0) {
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -29,7 +29,6 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "block/qcow2.h"
-#include "qemu/bswap.h"
 #include "trace.h"

 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
@@ -83,9 +82,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
    }
    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));

-    if (s->l1_size) {
-        memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
-    }
+    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
@@ -111,7 +108,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
-    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
+    ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset,
                           new_l1_table, new_l1_size2);
    if (ret < 0)
        goto fail;
@@ -120,9 +117,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
-    stl_be_p(data, new_l1_size);
+    cpu_to_be32w((uint32_t*)data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
-    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
+    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size),
                           data, sizeof(data));
    if (ret < 0) {
        goto fail;
@@ -157,9 +154,11 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
    uint64_t **l2_table)
 {
    BDRVQcow2State *s = bs->opaque;
+    int ret;

-    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
-                           (void **)l2_table);
+    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
+
+    return ret;
 }

 /*
@@ -188,7 +187,7 @@ int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
-    ret = bdrv_pwrite_sync(bs->file,
+    ret = bdrv_pwrite_sync(bs->file->bs,
                           s->l1_table_offset + 8 * l1_start_index,
                           buf, sizeof(buf));
    if (ret < 0) {
@@ -391,18 +390,22 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
    return 0;
 }

-static int coroutine_fn do_perform_cow(BlockDriverState *bs,
-                                       uint64_t src_cluster_offset,
-                                       uint64_t cluster_offset,
-                                       int offset_in_cluster,
-                                       int bytes)
+static int coroutine_fn copy_sectors(BlockDriverState *bs,
+                                     uint64_t start_sect,
+                                     uint64_t cluster_offset,
+                                     int n_start, int n_end)
 {
    BDRVQcow2State *s = bs->opaque;
    QEMUIOVector qiov;
    struct iovec iov;
-    int ret;
+    int n, ret;

-    iov.iov_len = bytes;
+    n = n_end - n_start;
+    if (n <= 0) {
+        return 0;
+    }
+
+    iov.iov_len = n * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
    if (iov.iov_base == NULL) {
        return -ENOMEM;
@@ -421,21 +424,17 @@ static int coroutine_fn do_perform_cow(BlockDriverState *bs,
     * interface.  This avoids double I/O throttling and request tracking,
     * which can lead to deadlock when block layer copy-on-read is enabled.
     */
-    ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
-                                  bytes, &qiov, 0);
+    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
    if (ret < 0) {
        goto out;
    }

    if (bs->encrypted) {
        Error *err = NULL;
-        int64_t sector = (src_cluster_offset + offset_in_cluster)
-                         >> BDRV_SECTOR_BITS;
        assert(s->cipher);
-        assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
-        assert((bytes & ~BDRV_SECTOR_MASK) == 0);
-        if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
-                                  bytes >> BDRV_SECTOR_BITS, true, &err) < 0) {
+        if (qcow2_encrypt_sectors(s, start_sect + n_start,
+                                  iov.iov_base, iov.iov_base, n,
+                                  true, &err) < 0) {
            ret = -EIO;
            error_free(err);
            goto out;
@@ -443,14 +442,14 @@ static int coroutine_fn do_perform_cow(BlockDriverState *bs,
    }

    ret = qcow2_pre_write_overlap_check(bs, 0,
-            cluster_offset + offset_in_cluster, bytes);
+            cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE);
    if (ret < 0) {
        goto out;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
-    ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
-                          bytes, &qiov, 0);
+    ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n,
+                         &qiov);
    if (ret < 0) {
        goto out;
    }
@@ -465,43 +464,47 @@ out:
 /*
 * get_cluster_offset
 *
- * For a given offset of the virtual disk, find the cluster type and offset in
- * the qcow2 file. The offset is stored in *cluster_offset.
+ * For a given offset of the disk image, find the cluster offset in
+ * qcow2 file. The offset is stored in *cluster_offset.
 *
- * On entry, *bytes is the maximum number of contiguous bytes starting at
- * offset that we are interested in.
+ * on entry, *num is the number of contiguous sectors we'd like to
+ * access following offset.
 *
- * On exit, *bytes is the number of bytes starting at offset that have the same
- * cluster type and (if applicable) are stored contiguously in the image file.
- * Compressed clusters are always returned one by one.
+ * on exit, *num is the number of contiguous sectors we can read.
 *
 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 * cases.
 */
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
-                             unsigned int *bytes, uint64_t *cluster_offset)
+    int *num, uint64_t *cluster_offset)
 {
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset, *l2_table;
    int l1_bits, c;
-    unsigned int offset_in_cluster;
-    uint64_t bytes_available, bytes_needed, nb_clusters;
+    unsigned int index_in_cluster, nb_clusters;
+    uint64_t nb_available, nb_needed;
    int ret;

-    offset_in_cluster = offset_into_cluster(s, offset);
-    bytes_needed = (uint64_t) *bytes + offset_in_cluster;
+    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
+    nb_needed = *num + index_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

-    /* compute how many bytes there are between the start of the cluster
-     * containing offset and the end of the l1 entry */
-    bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
-                    + offset_in_cluster;
+    /* compute how many bytes there are between the offset and
+     * the end of the l1 entry
+     */

-    if (bytes_needed > bytes_available) {
-        bytes_needed = bytes_available;
+    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
+
+    /* compute the number of available sectors */
+
+    nb_available = (nb_available >> 9) + index_in_cluster;
+
+    if (nb_needed > nb_available) {
+        nb_needed = nb_available;
    }
+    assert(nb_needed <= INT_MAX);

    *cluster_offset = 0;

@@ -538,11 +541,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    *cluster_offset = be64_to_cpu(l2_table[l2_index]);

-    nb_clusters = size_to_clusters(s, bytes_needed);
-    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
-     * integers; the minimum cluster size is 512, so this assertion is always
-     * true */
-    assert(nb_clusters <= INT_MAX);
+    /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */
+    nb_clusters = size_to_clusters(s, nb_needed << 9);

    ret = qcow2_get_cluster_type(*cluster_offset);
    switch (ret) {
@@ -589,18 +589,13 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,

    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);

-    bytes_available = (int64_t)c * s->cluster_size;
+    nb_available = (c * s->cluster_sectors);

 out:
-    if (bytes_available > bytes_needed) {
-        bytes_available = bytes_needed;
-    }
+    if (nb_available > nb_needed)
+        nb_available = nb_needed;

-    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
-     * subtracting offset_in_cluster will therefore definitely yield something
-     * not exceeding UINT_MAX */
-    assert(bytes_available - offset_in_cluster <= UINT_MAX);
-    *bytes = bytes_available - offset_in_cluster;
+    *num = nb_available - index_in_cluster;

    return ret;

@@ -746,12 +741,14 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
    BDRVQcow2State *s = bs->opaque;
    int ret;

-    if (r->nb_bytes == 0) {
+    if (r->nb_sectors == 0) {
        return 0;
    }

    qemu_co_mutex_unlock(&s->lock);
-    ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes);
+    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
+                       r->offset / BDRV_SECTOR_SIZE,
+                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
    qemu_co_mutex_lock(&s->lock);

    if (ret < 0) {
@@ -813,14 +810,13 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
    assert(l2_index + m->nb_clusters <= s->l2_size);
    for (i = 0; i < m->nb_clusters; i++) {
        /* if two concurrent writes happen to the same unallocated cluster
-         * each write allocates separate cluster and writes data concurrently.
-         * The first one to complete updates l2 table with pointer to its
-         * cluster the second one has to do RMW (which is done above by
-         * perform_cow()), update l2 table with its cluster pointer and free
-         * old cluster. This is what this loop does */
-        if (l2_table[l2_index + i] != 0) {
+	 * each write allocates separate cluster and writes data concurrently.
+	 * The first one to complete updates l2 table with pointer to its
+	 * cluster the second one has to do RMW (which is done above by
+	 * copy_sectors()), update l2 table with its cluster pointer and free
+	 * old cluster. This is what this loop does */
+        if(l2_table[l2_index + i] != 0)
            old_cluster[j++] = l2_table[l2_index + i];
-        }

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
@@ -1202,20 +1198,25 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    /*
     * Save info needed for meta data update.
     *
-     * requested_bytes: Number of bytes from the start of the first
+     * requested_sectors: Number of sectors from the start of the first
     * newly allocated cluster to the end of the (possibly shortened
     * before) write request.
     *
-     * avail_bytes: Number of bytes from the start of the first
+     * avail_sectors: Number of sectors from the start of the first
     * newly allocated to the end of the last newly allocated cluster.
     *
-     * nb_bytes: The number of bytes from the start of the first
+     * nb_sectors: The number of sectors from the start of the first
     * newly allocated cluster to the end of the area that the write
     * request actually writes to (excluding COW at the end)
     */
-    uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
-    int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits);
-    int nb_bytes = MIN(requested_bytes, avail_bytes);
+    int requested_sectors =
+        (*bytes + offset_into_cluster(s, guest_offset))
+        >> BDRV_SECTOR_BITS;
+    int avail_sectors = nb_clusters
+                        << (s->cluster_bits - BDRV_SECTOR_BITS);
+    int alloc_n_start = offset_into_cluster(s, guest_offset)
+                        >> BDRV_SECTOR_BITS;
+    int nb_sectors = MIN(requested_sectors, avail_sectors);
    QCowL2Meta *old_m = *m;

    *m = g_malloc0(sizeof(**m));
@@ -1226,21 +1227,23 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
        .alloc_offset   = alloc_cluster_offset,
        .offset         = start_of_cluster(s, guest_offset),
        .nb_clusters    = nb_clusters,
+        .nb_available   = nb_sectors,

        .cow_start = {
            .offset     = 0,
-            .nb_bytes   = offset_into_cluster(s, guest_offset),
+            .nb_sectors = alloc_n_start,
        },
        .cow_end = {
-            .offset     = nb_bytes,
-            .nb_bytes   = avail_bytes - nb_bytes,
+            .offset     = nb_sectors * BDRV_SECTOR_SIZE,
+            .nb_sectors = avail_sectors - nb_sectors,
        },
    };
    qemu_co_queue_init(&(*m)->dependent_requests);
    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);

    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
-    *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
+    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
+                         - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    return 1;
@@ -1272,8 +1275,7 @@ fail:
 * Return 0 on success and -errno in error cases
 */
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-                               unsigned int *bytes, uint64_t *host_offset,
-                               QCowL2Meta **m)
+    int *num, uint64_t *host_offset, QCowL2Meta **m)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, remaining;
@@ -1281,11 +1283,13 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
    uint64_t cur_bytes;
    int ret;

-    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);
+    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num);
+
+    assert((offset & ~BDRV_SECTOR_MASK) == 0);

 again:
    start = offset;
-    remaining = *bytes;
+    remaining = (uint64_t)*num << BDRV_SECTOR_BITS;
    cluster_offset = 0;
    *host_offset = 0;
    cur_bytes = 0;
@@ -1371,8 +1375,8 @@ again:
        }
    }

-    *bytes -= remaining;
-    assert(*bytes > 0);
+    *num -= remaining >> BDRV_SECTOR_BITS;
+    assert(*num > 0);
    assert(*host_offset != 0);

    return 0;
@@ -1417,7 +1421,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;
        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
-        ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data,
+        ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data,
                        nb_csectors);
        if (ret < 0) {
            return ret;
@@ -1686,7 +1690,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                    (void **)&l2_table);
        } else {
            /* load inactive L2 tables from disk */
-            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
+            ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
                            (void *)l2_table, s->cluster_sectors);
        }
        if (ret < 0) {
@@ -1761,7 +1765,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                goto fail;
            }

-            ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
+            ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE,
+                                    s->cluster_sectors, 0);
            if (ret < 0) {
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
@@ -1793,7 +1798,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                    goto fail;
                }

-                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
+                ret = bdrv_write(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
                                 (void *)l2_table, s->cluster_sectors);
                if (ret < 0) {
                    goto fail;
@@ -1863,12 +1868,12 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs,
    }

    for (i = 0; i < s->nb_snapshots; i++) {
-        int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size *
-                                      sizeof(uint64_t), BDRV_SECTOR_SIZE);
+        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
+                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;

        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);

-        ret = bdrv_read(bs->file,
+        ret = bdrv_read(bs->file->bs,
                        s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
                        (void *)l1_table, l1_sectors);
        if (ret < 0) {
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -28,7 +28,6 @@
 #include "block/block_int.h"
 #include "block/qcow2.h"
 #include "qemu/range.h"
-#include "qemu/bswap.h"

 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
@@ -104,7 +103,7 @@ int qcow2_refcount_init(BlockDriverState *bs)
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
-        ret = bdrv_pread(bs->file, s->refcount_table_offset,
+        ret = bdrv_pread(bs->file->bs, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
@@ -218,10 +217,13 @@ static int load_refcount_block(BlockDriverState *bs,
                               void **refcount_block)
 {
    BDRVQcow2State *s = bs->opaque;
+    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
-    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
-                           refcount_block);
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+        refcount_block);
+
+    return ret;
 }

 /*
@@ -431,7 +433,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
-        ret = bdrv_pwrite_sync(bs->file,
+        ret = bdrv_pwrite_sync(bs->file->bs,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
@@ -487,12 +489,14 @@ static int alloc_refcount_block(BlockDriverState *bs,
        uint64_t table_clusters =
            size_to_clusters(s, table_size * sizeof(uint64_t));
        blocks_clusters = 1 +
-            DIV_ROUND_UP(table_clusters, s->refcount_block_size);
+            ((table_clusters + s->refcount_block_size - 1)
+            / s->refcount_block_size);
        uint64_t meta_clusters = table_clusters + blocks_clusters;

        last_table_size = table_size;
        table_size = next_refcount_table_size(s, blocks_used +
-            DIV_ROUND_UP(meta_clusters, s->refcount_block_size));
+            ((meta_clusters + s->refcount_block_size - 1)
+            / s->refcount_block_size));

    } while (last_table_size != table_size);

@@ -533,7 +537,7 @@ static int alloc_refcount_block(BlockDriverState *bs,

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
-    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
+    ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks,
        blocks_clusters * s->cluster_size);
    g_free(new_blocks);
    new_blocks = NULL;
@@ -547,7 +551,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
-    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
+    ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail_table;
@@ -562,10 +566,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
        uint64_t d64;
        uint32_t d32;
    } data;
-    data.d64 = cpu_to_be64(table_offset);
-    data.d32 = cpu_to_be32(table_clusters);
+    cpu_to_be64w(&data.d64, table_offset);
+    cpu_to_be32w(&data.d32, table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
-    ret = bdrv_pwrite_sync(bs->file,
+    ret = bdrv_pwrite_sync(bs->file->bs,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
@@ -615,7 +619,9 @@ void qcow2_process_discards(BlockDriverState *bs, int ret)

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
-            bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
+            bdrv_discard(bs->file->bs,
+                         d->offset >> BDRV_SECTOR_BITS,
+                         d->bytes >> BDRV_SECTOR_BITS);
        }

        g_free(d);
@@ -1068,7 +1074,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
        }
        l1_allocated = true;

-        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
+        ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }
@@ -1221,7 +1227,7 @@ fail:
            cpu_to_be64s(&l1_table[i]);
        }

-        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
+        ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
@@ -1380,7 +1386,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = g_malloc(l2_size);

-    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
+    ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
@@ -1512,7 +1518,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
            res->check_errors++;
            goto fail;
        }
-        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
+        ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
            res->check_errors++;
@@ -1610,7 +1616,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
            }
        }

-        ret = bdrv_pread(bs->file, l2_offset, l2_table,
+        ret = bdrv_pread(bs->file->bs, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
@@ -1662,7 +1668,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                goto fail;
            }

-            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
+            ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table,
                              s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
@@ -2096,7 +2102,7 @@ write_refblocks:
        on_disk_refblock = (void *)((char *) *refcount_table +
                                    refblock_index * s->cluster_size);

-        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
+        ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE,
                         on_disk_refblock, s->cluster_sectors);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
@@ -2145,7 +2151,7 @@ write_refblocks:
    }

    assert(reftable_size < INT_MAX / sizeof(uint64_t));
-    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
+    ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable,
                      reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
@@ -2153,11 +2159,12 @@ write_refblocks:
    }

    /* Enter new reftable into the image header */
-    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
-    reftable_offset_and_clusters.reftable_clusters =
-        cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
-    ret = bdrv_pwrite_sync(bs->file,
-                           offsetof(QCowHeader, refcount_table_offset),
+    cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset,
+                 reftable_offset);
+    cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters,
+                 size_to_clusters(s, reftable_size * sizeof(uint64_t)));
+    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader,
+                                                  refcount_table_offset),
                           &reftable_offset_and_clusters,
                           sizeof(reftable_offset_and_clusters));
    if (ret < 0) {
@@ -2404,7 +2411,7 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
                return -ENOMEM;
            }

-            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
+            ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2);
            if (ret < 0) {
                g_free(l1);
                return ret;
@@ -2557,7 +2564,7 @@ static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
            return ret;
        }

-        ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
+        ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to write refblock");
            return ret;
@@ -2827,7 +2834,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
        cpu_to_be64s(&new_reftable[i]);
    }

-    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
+    ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable,
                      new_reftable_size * sizeof(uint64_t));

    for (i = 0; i < new_reftable_size; i++) {
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -26,7 +26,6 @@
 #include "qapi/error.h"
 #include "block/block_int.h"
 #include "block/qcow2.h"
-#include "qemu/bswap.h"
 #include "qemu/error-report.h"
 #include "qemu/cutils.h"

@@ -67,7 +66,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
    for(i = 0; i < s->nb_snapshots; i++) {
        /* Read statically sized part of the snapshot header */
        offset = align_offset(offset, 8);
-        ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
+        ret = bdrv_pread(bs->file->bs, offset, &h, sizeof(h));
        if (ret < 0) {
            goto fail;
        }
@@ -86,7 +85,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
        name_size = be16_to_cpu(h.name_size);

        /* Read extra data */
-        ret = bdrv_pread(bs->file, offset, &extra,
+        ret = bdrv_pread(bs->file->bs, offset, &extra,
                         MIN(sizeof(extra), extra_data_size));
        if (ret < 0) {
            goto fail;
@@ -105,7 +104,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)

        /* Read snapshot ID */
        sn->id_str = g_malloc(id_str_size + 1);
-        ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
+        ret = bdrv_pread(bs->file->bs, offset, sn->id_str, id_str_size);
        if (ret < 0) {
            goto fail;
        }
@@ -114,7 +113,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)

        /* Read snapshot name */
        sn->name = g_malloc(name_size + 1);
-        ret = bdrv_pread(bs->file, offset, sn->name, name_size);
+        ret = bdrv_pread(bs->file->bs, offset, sn->name, name_size);
        if (ret < 0) {
            goto fail;
        }
@@ -217,25 +216,25 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
        h.name_size = cpu_to_be16(name_size);
        offset = align_offset(offset, 8);

-        ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h));
+        ret = bdrv_pwrite(bs->file->bs, offset, &h, sizeof(h));
        if (ret < 0) {
            goto fail;
        }
        offset += sizeof(h);

-        ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra));
+        ret = bdrv_pwrite(bs->file->bs, offset, &extra, sizeof(extra));
        if (ret < 0) {
            goto fail;
        }
        offset += sizeof(extra);

-        ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size);
+        ret = bdrv_pwrite(bs->file->bs, offset, sn->id_str, id_str_size);
        if (ret < 0) {
            goto fail;
        }
        offset += id_str_size;

-        ret = bdrv_pwrite(bs->file, offset, sn->name, name_size);
+        ret = bdrv_pwrite(bs->file->bs, offset, sn->name, name_size);
        if (ret < 0) {
            goto fail;
        }
@@ -257,7 +256,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
    header_data.nb_snapshots        = cpu_to_be32(s->nb_snapshots);
    header_data.snapshots_offset    = cpu_to_be64(snapshots_offset);

-    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
+    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, nb_snapshots),
                           &header_data, sizeof(header_data));
    if (ret < 0) {
        goto fail;
@@ -399,7 +398,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
        goto fail;
    }

-    ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table,
+    ret = bdrv_pwrite(bs->file->bs, sn->l1_table_offset, l1_table,
                      s->l1_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
@@ -512,7 +511,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
        goto fail;
    }

-    ret = bdrv_pread(bs->file, sn->l1_table_offset,
+    ret = bdrv_pread(bs->file->bs, sn->l1_table_offset,
                     sn_l1_table, sn_l1_bytes);
    if (ret < 0) {
        goto fail;
@@ -530,7 +529,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
        goto fail;
    }

-    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table,
+    ret = bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, sn_l1_table,
                           cur_l1_bytes);
    if (ret < 0) {
        goto fail;
@@ -716,7 +715,7 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
        return -ENOMEM;
    }

-    ret = bdrv_pread(bs->file, sn->l1_table_offset,
+    ret = bdrv_pread(bs->file->bs, sn->l1_table_offset,
                     new_l1_table, new_l1_bytes);
    if (ret < 0) {
        error_setg(errp, "Failed to read l1 table for snapshot");
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -36,7 +36,6 @@
 #include "trace.h"
 #include "qemu/option_int.h"
 #include "qemu/cutils.h"
-#include "qemu/bswap.h"

 /*
  Differences with QCOW:
@@ -107,7 +106,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
        printf("attempting to read extended header in offset %lu\n", offset);
 #endif

-        ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
+        ret = bdrv_pread(bs->file->bs, offset, &ext, sizeof(ext));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
                             "pread fail from offset %" PRIu64, offset);
@@ -135,7 +134,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
                           sizeof(bs->backing_format));
                return 2;
            }
-            ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
+            ret = bdrv_pread(bs->file->bs, offset, bs->backing_format, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
                                 "Could not read format name");
@@ -151,7 +150,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
            if (p_feature_table != NULL) {
                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
-                ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
+                ret = bdrv_pread(bs->file->bs, offset , feature_table, ext.len);
                if (ret < 0) {
                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
                                     "Could not read table");
@@ -172,7 +171,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
                uext->len = ext.len;
                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);

-                ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
+                ret = bdrv_pread(bs->file->bs, offset , uext->data, uext->len);
                if (ret < 0) {
                    error_setg_errno(errp, -ret, "ERROR: unknown extension: "
                                     "Could not read data");
@@ -249,7 +248,7 @@ int qcow2_mark_dirty(BlockDriverState *bs)
    }

    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
-    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
+    ret = bdrv_pwrite(bs->file->bs, offsetof(QCowHeader, incompatible_features),
                      &val, sizeof(val));
    if (ret < 0) {
        return ret;
@@ -817,7 +816,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    uint64_t ext_end;
    uint64_t l1_vm_state_index;

-    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+    ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read qcow2 header");
        goto fail;
@@ -892,7 +891,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    if (header.header_length > sizeof(header)) {
        s->unknown_header_fields_size = header.header_length - sizeof(header);
        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
-        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
+        ret = bdrv_pread(bs->file->bs, sizeof(header), s->unknown_header_fields,
                         s->unknown_header_fields_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
@@ -968,19 +967,13 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    if (s->crypt_method_header) {
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
-            error_setg(errp,
-                       "Use of AES-CBC encrypted qcow2 images is no longer "
-                       "supported in system emulators");
-            error_append_hint(errp,
-                              "You can use 'qemu-img convert' to convert your "
-                              "image to an alternative supported format, such "
-                              "as unencrypted qcow2, or raw with the LUKS "
-                              "format instead.\n");
-            ret = -ENOSYS;
-            goto fail;
+            error_report("qcow2 built-in AES encryption is deprecated");
+            error_printf("Support for it will be removed in a future release.\n"
+                         "You can use 'qemu-img convert' to switch to an\n"
+                         "unencrypted qcow2 image, or a LUKS raw image.\n");
        }

-        bs->encrypted = true;
+        bs->encrypted = 1;
    }

    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
@@ -1066,7 +1059,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
            ret = -ENOMEM;
            goto fail;
        }
-        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
+        ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table,
                         s->l1_size * sizeof(uint64_t));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read L1 table");
@@ -1122,7 +1115,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
            ret = -EINVAL;
            goto fail;
        }
-        ret = bdrv_pread(bs->file, header.backing_file_offset,
+        ret = bdrv_pread(bs->file->bs, header.backing_file_offset,
                         bs->backing_file, len);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read backing file name");
@@ -1199,11 +1192,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVQcow2State *s = bs->opaque;

-    if (bs->encrypted) {
-        /* Encryption works on a sector granularity */
-        bs->bl.request_alignment = BDRV_SECTOR_SIZE;
-    }
-    bs->bl.pwrite_zeroes_alignment = s->cluster_size;
+    bs->bl.write_zeroes_alignment = s->cluster_sectors;
 }

 static int qcow2_set_key(BlockDriverState *bs, const char *key)
@@ -1341,20 +1330,16 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_offset;
    int index_in_cluster, ret;
-    unsigned int bytes;
    int64_t status = 0;

-    bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE);
+    *pnum = nb_sectors;
    qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_get_cluster_offset(bs, sector_num << 9, &bytes,
-                                   &cluster_offset);
+    ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        return ret;
    }

-    *pnum = bytes >> BDRV_SECTOR_BITS;
-
    if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
        !s->cipher) {
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
@@ -1372,34 +1357,28 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,

 /* handle reading after the end of the backing file */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                        int64_t offset, int bytes)
+                  int64_t sector_num, int nb_sectors)
 {
-    uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    int n1;
-
-    if ((offset + bytes) <= bs_size) {
-        return bytes;
-    }
-
-    if (offset >= bs_size) {
+    if ((sector_num + nb_sectors) <= bs->total_sectors)
+        return nb_sectors;
+    if (sector_num >= bs->total_sectors)
        n1 = 0;
-    } else {
-        n1 = bs_size - offset;
-    }
+    else
+        n1 = bs->total_sectors - sector_num;

-    qemu_iovec_memset(qiov, n1, 0, bytes - n1);
+    qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));

    return n1;
 }

-static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
-                                        uint64_t bytes, QEMUIOVector *qiov,
-                                        int flags)
+static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
+                          int remaining_sectors, QEMUIOVector *qiov)
 {
    BDRVQcow2State *s = bs->opaque;
-    int offset_in_cluster, n1;
+    int index_in_cluster, n1;
    int ret;
-    unsigned int cur_bytes; /* number of bytes in current iteration */
+    int cur_nr_sectors; /* number of sectors in current iteration */
    uint64_t cluster_offset = 0;
    uint64_t bytes_done = 0;
    QEMUIOVector hd_qiov;
@@ -1409,24 +1388,26 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,

    qemu_co_mutex_lock(&s->lock);

-    while (bytes != 0) {
+    while (remaining_sectors != 0) {

        /* prepare next request */
-        cur_bytes = MIN(bytes, INT_MAX);
+        cur_nr_sectors = remaining_sectors;
        if (s->cipher) {
-            cur_bytes = MIN(cur_bytes,
-                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+            cur_nr_sectors = MIN(cur_nr_sectors,
+                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
        }

-        ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset);
+        ret = qcow2_get_cluster_offset(bs, sector_num << 9,
+            &cur_nr_sectors, &cluster_offset);
        if (ret < 0) {
            goto fail;
        }

-        offset_in_cluster = offset_into_cluster(s, offset);
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);

        qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
+            cur_nr_sectors * 512);

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:
@@ -1434,17 +1415,18 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
            if (bs->backing) {
                /* read from the base image */
                n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
-                                         offset, cur_bytes);
+                    sector_num, cur_nr_sectors);
                if (n1 > 0) {
                    QEMUIOVector local_qiov;

                    qemu_iovec_init(&local_qiov, hd_qiov.niov);
-                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
+                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0,
+                                      n1 * BDRV_SECTOR_SIZE);

                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                    qemu_co_mutex_unlock(&s->lock);
-                    ret = bdrv_co_preadv(bs->backing, offset, n1,
-                                         &local_qiov, 0);
+                    ret = bdrv_co_readv(bs->backing->bs, sector_num,
+                                        n1, &local_qiov);
                    qemu_co_mutex_lock(&s->lock);

                    qemu_iovec_destroy(&local_qiov);
@@ -1455,12 +1437,12 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                }
            } else {
                /* Note: in this case, no need to wait */
-                qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
+                qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
            }
            break;

        case QCOW2_CLUSTER_ZERO:
-            qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
+            qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
            break;

        case QCOW2_CLUSTER_COMPRESSED:
@@ -1471,8 +1453,8 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
            }

            qemu_iovec_from_buf(&hd_qiov, 0,
-                                s->cluster_cache + offset_in_cluster,
-                                cur_bytes);
+                s->cluster_cache + index_in_cluster * 512,
+                512 * cur_nr_sectors);
            break;

        case QCOW2_CLUSTER_NORMAL:
@@ -1499,34 +1481,34 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                    }
                }

-                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+                assert(cur_nr_sectors <=
+                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
                qemu_iovec_reset(&hd_qiov);
-                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
+                qemu_iovec_add(&hd_qiov, cluster_data,
+                    512 * cur_nr_sectors);
            }

            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
            qemu_co_mutex_unlock(&s->lock);
-            ret = bdrv_co_preadv(bs->file,
-                                 cluster_offset + offset_in_cluster,
-                                 cur_bytes, &hd_qiov, 0);
+            ret = bdrv_co_readv(bs->file->bs,
+                                (cluster_offset >> 9) + index_in_cluster,
+                                cur_nr_sectors, &hd_qiov);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
            if (bs->encrypted) {
                assert(s->cipher);
-                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
                Error *err = NULL;
-                if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS,
-                                          cluster_data, cluster_data,
-                                          cur_bytes >> BDRV_SECTOR_BITS,
-                                          false, &err) < 0) {
+                if (qcow2_encrypt_sectors(s, sector_num,  cluster_data,
+                                          cluster_data, cur_nr_sectors, false,
+                                          &err) < 0) {
                    error_free(err);
                    ret = -EIO;
                    goto fail;
                }
-                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
+                qemu_iovec_from_buf(qiov, bytes_done,
+                    cluster_data, 512 * cur_nr_sectors);
            }
            break;

@@ -1536,9 +1518,9 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
            goto fail;
        }

-        bytes -= cur_bytes;
-        offset += cur_bytes;
-        bytes_done += cur_bytes;
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * 512;
    }
    ret = 0;

@@ -1551,21 +1533,23 @@ fail:
    return ret;
 }

-static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
-                                         uint64_t bytes, QEMUIOVector *qiov,
-                                         int flags)
+static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
+                           int64_t sector_num,
+                           int remaining_sectors,
+                           QEMUIOVector *qiov)
 {
    BDRVQcow2State *s = bs->opaque;
-    int offset_in_cluster;
+    int index_in_cluster;
    int ret;
-    unsigned int cur_bytes; /* number of sectors in current iteration */
+    int cur_nr_sectors; /* number of sectors in current iteration */
    uint64_t cluster_offset;
    QEMUIOVector hd_qiov;
    uint64_t bytes_done = 0;
    uint8_t *cluster_data = NULL;
    QCowL2Meta *l2meta = NULL;

-    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
+    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
+                                 remaining_sectors);

    qemu_iovec_init(&hd_qiov, qiov->niov);

@@ -1573,21 +1557,22 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,

    qemu_co_mutex_lock(&s->lock);

-    while (bytes != 0) {
+    while (remaining_sectors != 0) {

        l2meta = NULL;

        trace_qcow2_writev_start_part(qemu_coroutine_self());
-        offset_in_cluster = offset_into_cluster(s, offset);
-        cur_bytes = MIN(bytes, INT_MAX);
-        if (bs->encrypted) {
-            cur_bytes = MIN(cur_bytes,
-                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
-                            - offset_in_cluster);
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        cur_nr_sectors = remaining_sectors;
+        if (bs->encrypted &&
+            cur_nr_sectors >
+            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) {
+            cur_nr_sectors =
+                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster;
        }

-        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
-                                         &cluster_offset, &l2meta);
+        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
+            &cur_nr_sectors, &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }
@@ -1595,7 +1580,8 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
        assert((cluster_offset & 511) == 0);

        qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
+            cur_nr_sectors * 512);

        if (bs->encrypted) {
            Error *err = NULL;
@@ -1614,9 +1600,8 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);

-            if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS,
-                                      cluster_data, cluster_data,
-                                      cur_bytes >>BDRV_SECTOR_BITS,
+            if (qcow2_encrypt_sectors(s, sector_num, cluster_data,
+                                      cluster_data, cur_nr_sectors,
                                      true, &err) < 0) {
                error_free(err);
                ret = -EIO;
@@ -1624,11 +1609,13 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
            }

            qemu_iovec_reset(&hd_qiov);
-            qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
+            qemu_iovec_add(&hd_qiov, cluster_data,
+                cur_nr_sectors * 512);
        }

        ret = qcow2_pre_write_overlap_check(bs, 0,
-                cluster_offset + offset_in_cluster, cur_bytes);
+                cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE,
+                cur_nr_sectors * BDRV_SECTOR_SIZE);
        if (ret < 0) {
            goto fail;
        }
@@ -1636,10 +1623,10 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
        qemu_co_mutex_unlock(&s->lock);
        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        trace_qcow2_writev_data(qemu_coroutine_self(),
-                                cluster_offset + offset_in_cluster);
-        ret = bdrv_co_pwritev(bs->file,
-                              cluster_offset + offset_in_cluster,
-                              cur_bytes, &hd_qiov, 0);
+                                (cluster_offset >> 9) + index_in_cluster);
+        ret = bdrv_co_writev(bs->file->bs,
+                             (cluster_offset >> 9) + index_in_cluster,
+                             cur_nr_sectors, &hd_qiov);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto fail;
@@ -1665,10 +1652,10 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
            l2meta = next;
        }

-        bytes -= cur_bytes;
-        offset += cur_bytes;
-        bytes_done += cur_bytes;
-        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * 512;
+        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
    }
    ret = 0;

@@ -1770,6 +1757,13 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)

    qcow2_close(bs);

+    bdrv_invalidate_cache(bs->file->bs, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        bs->drv = NULL;
+        return;
+    }
+
    memset(s, 0, sizeof(BDRVQcow2State));
    options = qdict_clone_shallow(bs->options);

@@ -1804,10 +1798,7 @@ static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
        .magic  = cpu_to_be32(magic),
        .len    = cpu_to_be32(len),
    };
-
-    if (len) {
-        memcpy(buf + sizeof(QCowExtension), s, len);
-    }
+    memcpy(buf + sizeof(QCowExtension), s, len);

    return ext_len;
 }
@@ -1979,7 +1970,7 @@ int qcow2_update_header(BlockDriverState *bs)
    }

    /* Write the new header */
-    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
+    ret = bdrv_pwrite(bs->file->bs, 0, header, s->cluster_size);
    if (ret < 0) {
        goto fail;
    }
@@ -2013,19 +2004,19 @@ static int qcow2_change_backing_file(BlockDriverState *bs,

 static int preallocate(BlockDriverState *bs)
 {
-    uint64_t bytes;
+    uint64_t nb_sectors;
    uint64_t offset;
    uint64_t host_offset = 0;
-    unsigned int cur_bytes;
+    int num;
    int ret;
    QCowL2Meta *meta;

-    bytes = bdrv_getlength(bs);
+    nb_sectors = bdrv_nb_sectors(bs);
    offset = 0;

-    while (bytes) {
-        cur_bytes = MIN(bytes, INT_MAX);
-        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
+    while (nb_sectors) {
+        num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS);
+        ret = qcow2_alloc_cluster_offset(bs, offset, &num,
                                         &host_offset, &meta);
        if (ret < 0) {
            return ret;
@@ -2051,8 +2042,8 @@ static int preallocate(BlockDriverState *bs)

        /* TODO Preallocate data if requested */

-        bytes -= cur_bytes;
-        offset += cur_bytes;
+        nb_sectors -= num;
+        offset += num << BDRV_SECTOR_BITS;
    }

    /*
@@ -2061,9 +2052,11 @@ static int preallocate(BlockDriverState *bs)
     * EOF). Extend the image to the last allocated sector.
     */
    if (host_offset != 0) {
-        uint8_t data = 0;
-        ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1,
-                          &data, 1);
+        uint8_t buf[BDRV_SECTOR_SIZE];
+        memset(buf, 0, BDRV_SECTOR_SIZE);
+        ret = bdrv_write(bs->file->bs,
+                         (host_offset >> BDRV_SECTOR_BITS) + num - 1,
+                         buf, 1);
        if (ret < 0) {
            return ret;
        }
@@ -2407,7 +2400,9 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
    ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
                        cluster_size, prealloc, opts, version, refcount_order,
                        &local_err);
-    error_propagate(errp, local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }

 finish:
    g_free(backing_file);
@@ -2416,81 +2411,35 @@ finish:
    return ret;
 }

-
-static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
-                            uint32_t count)
-{
-    int nr;
-    BlockDriverState *file;
-    int64_t res;
-
-    if (!count) {
-        return true;
-    }
-    res = bdrv_get_block_status_above(bs, NULL, start, count,
-                                      &nr, &file);
-    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
-}
-
-static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
-    int64_t offset, int count, BdrvRequestFlags flags)
+static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
    int ret;
    BDRVQcow2State *s = bs->opaque;

-    uint32_t head = offset % s->cluster_size;
-    uint32_t tail = (offset + count) % s->cluster_size;
-
-    trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count);
-
-    if (head || tail) {
-        int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS;
-        uint64_t off;
-        unsigned int nr;
-
-        assert(head + count <= s->cluster_size);
-
-        /* check whether remainder of cluster already reads as zero */
-        if (!(is_zero_sectors(bs, cl_start,
-                              DIV_ROUND_UP(head, BDRV_SECTOR_SIZE)) &&
-              is_zero_sectors(bs, (offset + count) >> BDRV_SECTOR_BITS,
-                              DIV_ROUND_UP(-tail & (s->cluster_size - 1),
-                                           BDRV_SECTOR_SIZE)))) {
-            return -ENOTSUP;
-        }
-
-        qemu_co_mutex_lock(&s->lock);
-        /* We can have new write after previous check */
-        offset = cl_start << BDRV_SECTOR_BITS;
-        count = s->cluster_size;
-        nr = s->cluster_size;
-        ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
-        if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) {
-            qemu_co_mutex_unlock(&s->lock);
-            return -ENOTSUP;
-        }
-    } else {
-        qemu_co_mutex_lock(&s->lock);
+    /* Emulate misaligned zero writes */
+    if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
+        return -ENOTSUP;
    }

-    trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count);
-
    /* Whatever is left can use real zero clusters */
-    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS);
+    qemu_co_mutex_lock(&s->lock);
+    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
+        nb_sectors);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
 }

-static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
-                                          int64_t offset, int count)
+static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors)
 {
    int ret;
    BDRVQcow2State *s = bs->opaque;

    qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS,
-                                 QCOW2_DISCARD_REQUEST, false);
+    ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
+        nb_sectors, QCOW2_DISCARD_REQUEST, false);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
 }
@@ -2526,7 +2475,7 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset)

    /* write updated header.size */
    offset = cpu_to_be64(offset);
-    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
+    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, size),
                           &offset, sizeof(uint64_t));
    if (ret < 0) {
        return ret;
@@ -2538,39 +2487,39 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset)

 /* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
-static coroutine_fn int
-qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
-                            uint64_t bytes, QEMUIOVector *qiov)
+static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
+                                  const uint8_t *buf, int nb_sectors)
 {
    BDRVQcow2State *s = bs->opaque;
-    QEMUIOVector hd_qiov;
-    struct iovec iov;
    z_stream strm;
    int ret, out_len;
-    uint8_t *buf, *out_buf;
+    uint8_t *out_buf;
    uint64_t cluster_offset;

-    if (bytes == 0) {
+    if (nb_sectors == 0) {
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
        cluster_offset = bdrv_getlength(bs->file->bs);
        return bdrv_truncate(bs->file->bs, cluster_offset);
    }

-    buf = qemu_blockalign(bs, s->cluster_size);
-    if (bytes != s->cluster_size) {
-        if (bytes > s->cluster_size ||
-            offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
-        {
-            qemu_vfree(buf);
-            return -EINVAL;
-        }
-        /* Zero-pad last write if image size is not cluster aligned */
-        memset(buf + bytes, 0, s->cluster_size - bytes);
-    }
-    qemu_iovec_to_buf(qiov, 0, buf, bytes);
+    if (nb_sectors != s->cluster_sectors) {
+        ret = -EINVAL;

-    out_buf = g_malloc(s->cluster_size);
+        /* Zero-pad last write if image size is not cluster aligned */
+        if (sector_num + nb_sectors == bs->total_sectors &&
+            nb_sectors < s->cluster_sectors) {
+            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+            memset(pad_buf, 0, s->cluster_size);
+            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+            ret = qcow2_write_compressed(bs, sector_num,
+                                         pad_buf, s->cluster_sectors);
+            qemu_vfree(pad_buf);
+        }
+        return ret;
+    }
+
+    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
@@ -2599,44 +2548,33 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
-        ret = qcow2_co_pwritev(bs, offset, bytes, qiov, 0);
+        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
+        if (ret < 0) {
+            goto fail;
+        }
+    } else {
+        cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
+            sector_num << 9, out_len);
+        if (!cluster_offset) {
+            ret = -EIO;
+            goto fail;
+        }
+        cluster_offset &= s->cluster_offset_mask;
+
+        ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
+        ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len);
        if (ret < 0) {
            goto fail;
        }
-        goto success;
    }

-    qemu_co_mutex_lock(&s->lock);
-    cluster_offset =
-        qcow2_alloc_compressed_cluster_offset(bs, offset, out_len);
-    if (!cluster_offset) {
-        qemu_co_mutex_unlock(&s->lock);
-        ret = -EIO;
-        goto fail;
-    }
-    cluster_offset &= s->cluster_offset_mask;
-
-    ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len);
-    qemu_co_mutex_unlock(&s->lock);
-    if (ret < 0) {
-        goto fail;
-    }
-
-    iov = (struct iovec) {
-        .iov_base   = out_buf,
-        .iov_len    = out_len,
-    };
-    qemu_iovec_init_external(&hd_qiov, &iov, 1);
-
-    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
-    ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
-    if (ret < 0) {
-        goto fail;
-    }
-success:
    ret = 0;
 fail:
-    qemu_vfree(buf);
    g_free(out_buf);
    return ret;
 }
@@ -2678,8 +2616,8 @@ static int make_completely_empty(BlockDriverState *bs)
    /* After this call, neither the in-memory nor the on-disk refcount
     * information accurately describe the actual references */

-    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset,
-                             l1_clusters * s->cluster_size, 0);
+    ret = bdrv_write_zeroes(bs->file->bs, s->l1_table_offset / BDRV_SECTOR_SIZE,
+                            l1_clusters * s->cluster_sectors, 0);
    if (ret < 0) {
        goto fail_broken_refcounts;
    }
@@ -2692,8 +2630,9 @@ static int make_completely_empty(BlockDriverState *bs)
     * overwrite parts of the existing refcount and L1 table, which is not
     * an issue because the dirty flag is set, complete data loss is in fact
     * desired and partial data loss is consequently fine as well */
-    ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size,
-                             (2 + l1_clusters) * s->cluster_size, 0);
+    ret = bdrv_write_zeroes(bs->file->bs, s->cluster_size / BDRV_SECTOR_SIZE,
+                            (2 + l1_clusters) * s->cluster_size /
+                            BDRV_SECTOR_SIZE, 0);
    /* This call (even if it failed overall) may have overwritten on-disk
     * refcount structures; in that case, the in-memory refcount information
     * will probably differ from the on-disk information which makes the BDS
@@ -2708,10 +2647,10 @@ static int make_completely_empty(BlockDriverState *bs)
    /* "Create" an empty reftable (one cluster) directly after the image
     * header and an empty L1 table three clusters after the image header;
     * the cluster between those two will be used as the first refblock */
-    l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size);
-    l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size);
-    l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1);
-    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
+    cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size);
+    cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size);
+    cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1);
+    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_table_offset),
                           &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
    if (ret < 0) {
        goto fail_broken_refcounts;
@@ -2742,7 +2681,7 @@ static int make_completely_empty(BlockDriverState *bs)

    /* Enter the first refblock into the reftable */
    rt_entry = cpu_to_be64(2 * s->cluster_size);
-    ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
+    ret = bdrv_pwrite_sync(bs->file->bs, s->cluster_size,
                           &rt_entry, sizeof(rt_entry));
    if (ret < 0) {
        goto fail_broken_refcounts;
@@ -2922,20 +2861,36 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos)
 {
    BDRVQcow2State *s = bs->opaque;
+    int64_t total_sectors = bs->total_sectors;
+    bool zero_beyond_eof = bs->zero_beyond_eof;
+    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
-    return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos,
-                                    qiov->size, qiov, 0);
+    bs->zero_beyond_eof = false;
+    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
+    bs->zero_beyond_eof = zero_beyond_eof;
+
+    /* bdrv_co_do_writev will have increased the total_sectors value to include
+     * the VM state - the VM state is however not an actual part of the block
+     * device, therefore, we need to restore the old value. */
+    bs->total_sectors = total_sectors;
+
+    return ret;
 }

-static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
-                              int64_t pos)
+static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+                              int64_t pos, int size)
 {
    BDRVQcow2State *s = bs->opaque;
+    bool zero_beyond_eof = bs->zero_beyond_eof;
+    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
-    return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos,
-                                   qiov->size, qiov, 0);
+    bs->zero_beyond_eof = false;
+    ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
+    bs->zero_beyond_eof = zero_beyond_eof;
+
+    return ret;
 }

 /*
@@ -3374,14 +3329,14 @@ BlockDriver bdrv_qcow2 = {
    .bdrv_co_get_block_status = qcow2_co_get_block_status,
    .bdrv_set_key       = qcow2_set_key,

-    .bdrv_co_preadv         = qcow2_co_preadv,
-    .bdrv_co_pwritev        = qcow2_co_pwritev,
+    .bdrv_co_readv          = qcow2_co_readv,
+    .bdrv_co_writev         = qcow2_co_writev,
    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,

-    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
-    .bdrv_co_pdiscard       = qcow2_co_pdiscard,
+    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
+    .bdrv_co_discard        = qcow2_co_discard,
    .bdrv_truncate          = qcow2_truncate,
-    .bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed,
+    .bdrv_write_compressed  = qcow2_write_compressed,
    .bdrv_make_empty        = qcow2_make_empty,

    .bdrv_snapshot_create   = qcow2_snapshot_create,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -302,8 +302,8 @@ typedef struct Qcow2COWRegion {
     */
    uint64_t    offset;

-    /** Number of bytes to copy */
-    int         nb_bytes;
+    /** Number of sectors to copy */
+    int         nb_sectors;
 } Qcow2COWRegion;

 /**
@@ -318,6 +318,12 @@ typedef struct QCowL2Meta
    /** Host offset of the first newly allocated cluster */
    uint64_t alloc_offset;

+    /**
+     * Number of sectors from the start of the first allocated cluster to
+     * the end of the (possibly shortened) request
+     */
+    int nb_available;
+
    /** Number of newly allocated clusters */
    int nb_clusters;

@@ -465,7 +471,8 @@ static inline uint64_t l2meta_cow_start(QCowL2Meta *m)

 static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
 {
-    return m->offset + m->cow_end.offset + m->cow_end.nb_bytes;
+    return m->offset + m->cow_end.offset
+        + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
 }

 static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)
@@ -530,16 +537,16 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size);
 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
+void qcow2_l2_cache_reset(BlockDriverState *bs);
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
 int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
                          uint8_t *out_buf, const uint8_t *in_buf,
                          int nb_sectors, bool enc, Error **errp);

 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
-                             unsigned int *bytes, uint64_t *cluster_offset);
+    int *num, uint64_t *cluster_offset);
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-                               unsigned int *bytes, uint64_t *host_offset,
-                               QCowL2Meta **m);
+    int *num, uint64_t *host_offset, QCowL2Meta **m);
 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                         uint64_t offset,
                                         int compressed_size);
--- a/block/qed-check.c
+++ b/block/qed-check.c
@@ -234,7 +234,8 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
    }

    check.result->bfi.total_clusters =
-        DIV_ROUND_UP(s->header.image_size, s->header.cluster_size);
+        (s->header.image_size + s->header.cluster_size - 1) /
+            s->header.cluster_size;
    ret = qed_check_l1_table(&check, s->l1_table);
    if (ret == 0) {
        /* Only check for leaks if entire image was scanned successfully */
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -16,7 +16,6 @@
 #include "trace.h"
 #include "qemu/sockets.h" /* for EINPROGRESS on Windows */
 #include "qed.h"
-#include "qemu/bswap.h"

 typedef struct {
    GenericCB gencb;
@@ -65,7 +64,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
    read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size,

    qemu_iovec_init_external(qiov, &read_table_cb->iov, 1);
-    bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov,
+    bdrv_aio_readv(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, qiov,
                   qiov->size / BDRV_SECTOR_SIZE,
                   qed_read_table_cb, read_table_cb);
 }
@@ -154,7 +153,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
    /* Adjust for offset into table */
    offset += start * sizeof(uint64_t);

-    bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
+    bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE,
                    &write_table_cb->qiov,
                    write_table_cb->qiov.size / BDRV_SECTOR_SIZE,
                    qed_write_table_cb, write_table_cb);
--- a/block/qed.c
+++ b/block/qed.c
@@ -15,7 +15,6 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/timer.h"
-#include "qemu/bswap.h"
 #include "trace.h"
 #include "qed.h"
 #include "qapi/qmp/qerror.h"
@@ -86,7 +85,7 @@ int qed_write_header_sync(BDRVQEDState *s)
    int ret;

    qed_header_cpu_to_le(&s->header, &le);
-    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
+    ret = bdrv_pwrite(s->bs->file->bs, 0, &le, sizeof(le));
    if (ret != sizeof(le)) {
        return ret;
    }
@@ -123,7 +122,7 @@ static void qed_write_header_read_cb(void *opaque, int ret)
    /* Update header */
    qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);

-    bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
+    bdrv_aio_writev(s->bs->file->bs, 0, &write_header_cb->qiov,
                    write_header_cb->nsectors, qed_write_header_cb,
                    write_header_cb);
 }
@@ -143,7 +142,8 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
     * them, and write back.
     */

-    int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
+    int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
+                   BDRV_SECTOR_SIZE;
    size_t len = nsectors * BDRV_SECTOR_SIZE;
    QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
                                                    cb, opaque);
@@ -155,7 +155,7 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
    write_header_cb->iov.iov_len = len;
    qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);

-    bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
+    bdrv_aio_readv(s->bs->file->bs, 0, &write_header_cb->qiov, nsectors,
                   qed_write_header_read_cb, write_header_cb);
 }

@@ -218,7 +218,7 @@ static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
 *
 * The string is NUL-terminated.
 */
-static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n,
+static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
                           char *buf, size_t buflen)
 {
    int ret;
@@ -389,7 +389,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
    s->bs = bs;
    QSIMPLEQ_INIT(&s->allocating_write_reqs);

-    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
+    ret = bdrv_pread(bs->file->bs, 0, &le_header, sizeof(le_header));
    if (ret < 0) {
        return ret;
    }
@@ -446,7 +446,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
            return -EINVAL;
        }

-        ret = qed_read_string(bs->file, s->header.backing_filename_offset,
+        ret = qed_read_string(bs->file->bs, s->header.backing_filename_offset,
                              s->header.backing_filename_size, bs->backing_file,
                              sizeof(bs->backing_file));
        if (ret < 0) {
@@ -517,7 +517,7 @@ static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVQEDState *s = bs->opaque;

-    bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
+    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
 }

 /* We have nothing to do for QED reopen, stubs just return
@@ -708,7 +708,7 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
    }

    if (cb->co) {
-        qemu_coroutine_enter(cb->co);
+        qemu_coroutine_enter(cb->co, NULL);
    }
 }

@@ -800,7 +800,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
    qemu_iovec_concat(*backing_qiov, qiov, 0, size);

    BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
-    bdrv_aio_readv(s->bs->backing, pos / BDRV_SECTOR_SIZE,
+    bdrv_aio_readv(s->bs->backing->bs, pos / BDRV_SECTOR_SIZE,
                   *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
 }

@@ -837,7 +837,7 @@ static void qed_copy_from_backing_file_write(void *opaque, int ret)
    }

    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
-    bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
+    bdrv_aio_writev(s->bs->file->bs, copy_cb->offset / BDRV_SECTOR_SIZE,
                    &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
                    qed_copy_from_backing_file_cb, copy_cb);
 }
@@ -1087,7 +1087,7 @@ static void qed_aio_write_main(void *opaque, int ret)
    }

    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
-    bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
+    bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE,
                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
                    next_fn, acb);
 }
@@ -1319,7 +1319,7 @@ static void qed_aio_read_data(void *opaque, int ret,
    }

    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-    bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
+    bdrv_aio_readv(bs->file->bs, offset / BDRV_SECTOR_SIZE,
                   &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
                   qed_aio_next_io, acb);
    return;
@@ -1418,21 +1418,21 @@ typedef struct {
    bool done;
 } QEDWriteZeroesCB;

-static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
+static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
 {
    QEDWriteZeroesCB *cb = opaque;

    cb->done = true;
    cb->ret = ret;
    if (cb->co) {
-        qemu_coroutine_enter(cb->co);
+        qemu_coroutine_enter(cb->co, NULL);
    }
 }

-static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
-                                                  int64_t offset,
-                                                  int count,
-                                                  BdrvRequestFlags flags)
+static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
+                                                 int64_t sector_num,
+                                                 int nb_sectors,
+                                                 BdrvRequestFlags flags)
 {
    BlockAIOCB *blockacb;
    BDRVQEDState *s = bs->opaque;
@@ -1440,22 +1440,25 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
    QEMUIOVector qiov;
    struct iovec iov;

-    /* Fall back if the request is not aligned */
-    if (qed_offset_into_cluster(s, offset) ||
-        qed_offset_into_cluster(s, count)) {
-        return -ENOTSUP;
+    /* Refuse if there are untouched backing file sectors */
+    if (bs->backing) {
+        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
+            return -ENOTSUP;
+        }
+        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
+            return -ENOTSUP;
+        }
    }

    /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
     * then it will be allocated during request processing.
     */
-    iov.iov_base = NULL;
-    iov.iov_len = count;
+    iov.iov_base = NULL,
+    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE,

    qemu_iovec_init_external(&qiov, &iov, 1);
-    blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
-                             count >> BDRV_SECTOR_BITS,
-                             qed_co_pwrite_zeroes_cb, &cb,
+    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
+                             qed_co_write_zeroes_cb, &cb,
                             QED_AIOCB_WRITE | QED_AIOCB_ZERO);
    if (!blockacb) {
        return -EIO;
@@ -1575,7 +1578,7 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs,
    }

    /* Write new header */
-    ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
+    ret = bdrv_pwrite_sync(bs->file->bs, 0, buffer, buffer_len);
    g_free(buffer);
    if (ret == 0) {
        memcpy(&s->header, &new_header, sizeof(new_header));
@@ -1591,6 +1594,12 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)

    bdrv_qed_close(bs);

+    bdrv_invalidate_cache(bs->file->bs, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
    memset(s, 0, sizeof(BDRVQEDState));
    ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err);
    if (local_err) {
@@ -1660,7 +1669,7 @@ static BlockDriver bdrv_qed = {
    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
    .bdrv_aio_readv           = bdrv_qed_aio_readv,
    .bdrv_aio_writev          = bdrv_qed_aio_writev,
-    .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
+    .bdrv_co_write_zeroes     = bdrv_qed_co_write_zeroes,
    .bdrv_truncate            = bdrv_qed_truncate,
    .bdrv_getlength           = bdrv_qed_getlength,
    .bdrv_get_info            = bdrv_qed_get_info,
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -14,7 +14,6 @@
 */

 #include "qemu/osdep.h"
-#include "qemu/cutils.h"
 #include "block/block_int.h"
 #include "qapi/qmp/qbool.h"
 #include "qapi/qmp/qdict.h"
@@ -68,9 +67,6 @@ typedef struct QuorumVotes {
 typedef struct BDRVQuorumState {
    BdrvChild **children;  /* children BlockDriverStates */
    int num_children;      /* children count */
-    unsigned next_child_index;  /* the index of the next child that should
-                                 * be added
-                                 */
    int threshold;         /* if less than threshold children reads gave the
                            * same result a quorum error occurs.
                            */
@@ -383,7 +379,7 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
            continue;
        }
        QLIST_FOREACH(item, &version->items, next) {
-            bdrv_aio_writev(s->children[item->index], acb->sector_num,
+            bdrv_aio_writev(s->children[item->index]->bs, acb->sector_num,
                            acb->qiov, acb->nb_sectors, quorum_rewrite_aio_cb,
                            acb);
        }
@@ -660,7 +656,7 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
    }

    for (i = 0; i < s->num_children; i++) {
-        acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i], acb->sector_num,
+        acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num,
                                            &acb->qcrs[i].qiov, acb->nb_sectors,
                                            quorum_aio_cb, &acb->qcrs[i]);
    }
@@ -678,7 +674,7 @@ static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb)
    qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
                     acb->qcrs[acb->child_iter].buf);
    acb->qcrs[acb->child_iter].aiocb =
-        bdrv_aio_readv(s->children[acb->child_iter], acb->sector_num,
+        bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num,
                       &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
                       quorum_aio_cb, &acb->qcrs[acb->child_iter]);

@@ -719,7 +715,7 @@ static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs,
    int i;

    for (i = 0; i < s->num_children; i++) {
-        acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i], sector_num,
+        acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i]->bs, sector_num,
                                             qiov, nb_sectors, &quorum_aio_cb,
                                             &acb->qcrs[i]);
    }
@@ -751,6 +747,21 @@ static int64_t quorum_getlength(BlockDriverState *bs)
    return result;
 }

+/* Invalidate each child's cache in order; the first error goes to errp. */
+static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp)
+{
+    BDRVQuorumState *state = bs->opaque;
+    Error *err = NULL;
+    int idx;
+
+    for (idx = 0; idx < state->num_children && !err; idx++) {
+        bdrv_invalidate_cache(state->children[idx]->bs, &err);
+    }
+    if (err) {
+        error_propagate(errp, err);
+    }
+}
+
 static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
 {
    BDRVQuorumState *s = bs->opaque;
@@ -887,9 +898,9 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EINVAL;
        goto exit;
    }
-    if (s->num_children < 1) {
+    if (s->num_children < 2) {
        error_setg(&local_err,
-                   "Number of provided children must be 1 or more");
+                   "Number of provided children must be greater than 1");
        ret = -EINVAL;
        goto exit;
    }
@@ -953,7 +964,6 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,

        opened[i] = true;
    }
-    s->next_child_index = s->num_children;

    g_free(opened);
    goto exit;
@@ -971,7 +981,9 @@ close_exit:
 exit:
    qemu_opts_del(opts);
    /* propagate error */
-    error_propagate(errp, local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }
    return ret;
 }

@@ -987,70 +999,25 @@ static void quorum_close(BlockDriverState *bs)
    g_free(s->children);
 }

-static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
-                             Error **errp)
-{
-    BDRVQuorumState *s = bs->opaque;
-    BdrvChild *child;
-    char indexstr[32];
-    int ret;
-
-    assert(s->num_children <= INT_MAX / sizeof(BdrvChild *));
-    if (s->num_children == INT_MAX / sizeof(BdrvChild *) ||
-        s->next_child_index == UINT_MAX) {
-        error_setg(errp, "Too many children");
-        return;
-    }
-
-    ret = snprintf(indexstr, 32, "children.%u", s->next_child_index);
-    if (ret < 0 || ret >= 32) {
-        error_setg(errp, "cannot generate child name");
-        return;
-    }
-    s->next_child_index++;
-
-    bdrv_drained_begin(bs);
-
-    /* We can safely add the child now */
-    bdrv_ref(child_bs);
-    child = bdrv_attach_child(bs, child_bs, indexstr, &child_format);
-    s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
-    s->children[s->num_children++] = child;
-
-    bdrv_drained_end(bs);
-}
-
-static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
-                             Error **errp)
+static void quorum_detach_aio_context(BlockDriverState *bs)
 {
    BDRVQuorumState *s = bs->opaque;
    int i;

    for (i = 0; i < s->num_children; i++) {
-        if (s->children[i] == child) {
-            break;
-        }
+        bdrv_detach_aio_context(s->children[i]->bs);
    }
+}

-    /* we have checked it in bdrv_del_child() */
-    assert(i < s->num_children);
+static void quorum_attach_aio_context(BlockDriverState *bs,
+                                      AioContext *new_context)
+{
+    BDRVQuorumState *s = bs->opaque;
+    int i;

-    if (s->num_children <= s->threshold) {
-        error_setg(errp,
-            "The number of children cannot be lower than the vote threshold %d",
-            s->threshold);
-        return;
+    for (i = 0; i < s->num_children; i++) {
+        bdrv_attach_aio_context(s->children[i]->bs, new_context);
    }
-
-    bdrv_drained_begin(bs);
-
-    /* We can safely remove this child now */
-    memmove(&s->children[i], &s->children[i + 1],
-            (s->num_children - i - 1) * sizeof(BdrvChild *));
-    s->children = g_renew(BdrvChild *, s->children, --s->num_children);
-    bdrv_unref_child(bs, child);
-
-    bdrv_drained_end(bs);
 }

 static void quorum_refresh_filename(BlockDriverState *bs, QDict *options)
@@ -1103,9 +1070,10 @@ static BlockDriver bdrv_quorum = {

    .bdrv_aio_readv                     = quorum_aio_readv,
    .bdrv_aio_writev                    = quorum_aio_writev,
+    .bdrv_invalidate_cache              = quorum_invalidate_cache,

-    .bdrv_add_child                     = quorum_add_child,
-    .bdrv_del_child                     = quorum_del_child,
+    .bdrv_detach_aio_context            = quorum_detach_aio_context,
+    .bdrv_attach_aio_context            = quorum_attach_aio_context,

    .is_filter                          = true,
    .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -15,7 +15,6 @@
 #ifndef QEMU_RAW_AIO_H
 #define QEMU_RAW_AIO_H

-#include "qemu/coroutine.h"
 #include "qemu/iov.h"

 /* AIO request types */
@@ -36,18 +35,15 @@

 /* linux-aio.c - Linux native implementation */
 #ifdef CONFIG_LINUX_AIO
-typedef struct LinuxAioState LinuxAioState;
-LinuxAioState *laio_init(void);
-void laio_cleanup(LinuxAioState *s);
-int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
-                                uint64_t offset, QEMUIOVector *qiov, int type);
-BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
+void *laio_init(void);
+void laio_cleanup(void *s);
+BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type);
-void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
-void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
-void laio_io_plug(BlockDriverState *bs, LinuxAioState *s);
-void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s);
+void laio_detach_aio_context(void *s, AioContext *old_context);
+void laio_attach_aio_context(void *s, AioContext *new_context);
+void laio_io_plug(BlockDriverState *bs, void *aio_ctx);
+void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug);
 #endif

 #ifdef _WIN32
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -32,7 +32,7 @@
 #include "trace.h"
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
-#include "block/raw-aio.h"
+#include "raw-aio.h"
 #include "qapi/util.h"
 #include "qapi/qmp/qstring.h"

@@ -137,6 +137,10 @@ typedef struct BDRVRawState {
    int open_flags;
    size_t buf_align;

+#ifdef CONFIG_LINUX_AIO
+    int use_aio;
+    void *aio_ctx;
+#endif
 #ifdef CONFIG_XFS
    bool is_xfs:1;
 #endif
@@ -150,6 +154,9 @@ typedef struct BDRVRawState {
 typedef struct BDRVRawReopenState {
    int fd;
    int open_flags;
+#ifdef CONFIG_LINUX_AIO
+    int use_aio;
+#endif
 } BDRVRawReopenState;

 static int fd_open(BlockDriverState *bs);
@@ -295,22 +302,22 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
    /* For SCSI generic devices the alignment is not really used.
       With buffered I/O, we don't have any restrictions. */
    if (bdrv_is_sg(bs) || !s->needs_alignment) {
-        bs->bl.request_alignment = 1;
+        bs->request_alignment = 1;
        s->buf_align = 1;
        return;
    }

-    bs->bl.request_alignment = 0;
+    bs->request_alignment = 0;
    s->buf_align = 0;
    /* Let's try to use the logical blocksize for the alignment. */
-    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
-        bs->bl.request_alignment = 0;
+    if (probe_logical_blocksize(fd, &bs->request_alignment) < 0) {
+        bs->request_alignment = 0;
    }
 #ifdef CONFIG_XFS
    if (s->is_xfs) {
        struct dioattr da;
        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
-            bs->bl.request_alignment = da.d_miniosz;
+            bs->request_alignment = da.d_miniosz;
            /* The kernel returns wrong information for d_mem */
            /* s->buf_align = da.d_mem; */
        }
@@ -330,21 +337,21 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
        qemu_vfree(buf);
    }

-    if (!bs->bl.request_alignment) {
+    if (!bs->request_alignment) {
        size_t align;
        buf = qemu_memalign(s->buf_align, max_align);
        for (align = 512; align <= max_align; align <<= 1) {
            if (raw_is_io_aligned(fd, buf, align)) {
-                bs->bl.request_alignment = align;
+                bs->request_alignment = align;
                break;
            }
        }
        qemu_vfree(buf);
    }

-    if (!s->buf_align || !bs->bl.request_alignment) {
-        error_setg(errp, "Could not find working O_DIRECT alignment");
-        error_append_hint(errp, "Try cache.direct=off\n");
+    if (!s->buf_align || !bs->request_alignment) {
+        error_setg(errp, "Could not find working O_DIRECT alignment. "
+                         "Try cache.direct=off.");
    }
 }

@@ -367,15 +374,58 @@ static void raw_parse_flags(int bdrv_flags, int *open_flags)
    }
 }

-#ifdef CONFIG_LINUX_AIO
-static bool raw_use_aio(int bdrv_flags)
+/* Detach the Linux AIO state from bs's AioContext when use_aio is set. */
+static void raw_detach_aio_context(BlockDriverState *bs)
 {
+#ifdef CONFIG_LINUX_AIO
+    BDRVRawState *raw = bs->opaque;
+    if (raw->use_aio) {
+        laio_detach_aio_context(raw->aio_ctx, bdrv_get_aio_context(bs));
+    }
+#endif
+}
+
+/* Attach the Linux AIO state to new_context when use_aio is set. */
+static void raw_attach_aio_context(BlockDriverState *bs,
+                                   AioContext *new_context)
+{
+#ifdef CONFIG_LINUX_AIO
+    BDRVRawState *raw = bs->opaque;
+    if (raw->use_aio) {
+        laio_attach_aio_context(raw->aio_ctx, new_context);
+    }
+#endif
+}
+
+#ifdef CONFIG_LINUX_AIO
+static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
+{
+    int ret = -1;
+    assert(aio_ctx != NULL);
+    assert(use_aio != NULL);
    /*
     * Currently Linux do AIO only for files opened with O_DIRECT
     * specified so check NOCACHE flag too
     */
-    return (bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
-                         (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO);
+    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
+                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
+
+        /* if non-NULL, laio_init() has already been run */
+        if (*aio_ctx == NULL) {
+            *aio_ctx = laio_init();
+            if (!*aio_ctx) {
+                goto error;
+            }
+        }
+        *use_aio = 1;
+    } else {
+        *use_aio = 0;
+    }
+
+    ret = 0;
+
+error:
+    return ret;
 }
 #endif

@@ -444,7 +494,13 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
    s->fd = fd;

 #ifdef CONFIG_LINUX_AIO
-    if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
+    if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
+        ret = errno ? -errno : -EIO; /* read errno before qemu_close() */
+        qemu_close(fd);
+        error_setg_errno(errp, -ret, "Could not set AIO state");
+        goto fail;
+    }
+    if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
        error_setg(errp, "aio=native was specified, but it requires "
                         "cache.direct=on, which was not specified.");
        ret = -EINVAL;
@@ -461,7 +517,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,

    s->has_discard = true;
    s->has_write_zeroes = true;
-    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
        s->needs_alignment = true;
    }
@@ -511,6 +566,8 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
    }
 #endif

+    raw_attach_aio_context(bs, bdrv_get_aio_context(bs));
+
    ret = 0;
 fail:
    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
@@ -524,9 +581,15 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    BDRVRawState *s = bs->opaque;
+    Error *local_err = NULL;
+    int ret;

    s->type = FTYPE_FILE;
-    return raw_open_common(bs, options, flags, 0, errp);
+    ret = raw_open_common(bs, options, flags, 0, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }
+    return ret;
 }

 static int raw_reopen_prepare(BDRVReopenState *state,
@@ -545,6 +608,18 @@ static int raw_reopen_prepare(BDRVReopenState *state,
    state->opaque = g_new0(BDRVRawReopenState, 1);
    raw_s = state->opaque;

+#ifdef CONFIG_LINUX_AIO
+    raw_s->use_aio = s->use_aio;
+
+    /* we can use s->aio_ctx instead of a copy, because the use_aio flag is
+     * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio()
+     * won't override aio_ctx if aio_ctx is non-NULL */
+    if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) {
+        error_setg(errp, "Could not set AIO state");
+        return -1;
+    }
+#endif
+
    if (s->type == FTYPE_CD) {
        raw_s->open_flags |= O_NONBLOCK;
    }
@@ -569,7 +644,15 @@ static int raw_reopen_prepare(BDRVReopenState *state,

    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
        /* dup the original fd */
-        raw_s->fd = qemu_dup(s->fd);
+        /* TODO: use qemu fcntl wrapper */
+#ifdef F_DUPFD_CLOEXEC
+        raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0);
+#else
+        raw_s->fd = dup(s->fd);
+        if (raw_s->fd != -1) {
+            qemu_set_cloexec(raw_s->fd);
+        }
+#endif
        if (raw_s->fd >= 0) {
            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
            if (ret) {
@@ -619,6 +702,9 @@ static void raw_reopen_commit(BDRVReopenState *state)

    qemu_close(s->fd);
    s->fd = raw_s->fd;
+#ifdef CONFIG_LINUX_AIO
+    s->use_aio = raw_s->use_aio;
+#endif

    g_free(state->opaque);
    state->opaque = NULL;
@@ -642,33 +728,9 @@ static void raw_reopen_abort(BDRVReopenState *state)
    state->opaque = NULL;
 }

-static int hdev_get_max_transfer_length(int fd)
-{
-#ifdef BLKSECTGET
-    int max_sectors = 0;
-    if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
-        return max_sectors;
-    } else {
-        return -errno;
-    }
-#else
-    return -ENOSYS;
-#endif
-}
-
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVRawState *s = bs->opaque;
-    struct stat st;
-
-    if (!fstat(s->fd, &st)) {
-        if (S_ISBLK(st.st_mode)) {
-            int ret = hdev_get_max_transfer_length(s->fd);
-            if (ret > 0 && ret <= BDRV_REQUEST_MAX_SECTORS) {
-                bs->bl.max_transfer = pow2floor(ret << BDRV_SECTOR_BITS);
-            }
-        }
-    }

    raw_probe_alignment(bs, s->fd, errp);
    bs->bl.min_mem_alignment = s->buf_align;
@@ -1189,8 +1251,8 @@ static int aio_worker(void *arg)
 }

 static int paio_submit_co(BlockDriverState *bs, int fd,
-                          int64_t offset, QEMUIOVector *qiov,
-                          int count, int type)
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        int type)
 {
    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
    ThreadPool *pool;
@@ -1199,22 +1261,22 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
    acb->aio_type = type;
    acb->aio_fildes = fd;

-    acb->aio_nbytes = count;
-    acb->aio_offset = offset;
+    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
+    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;

    if (qiov) {
        acb->aio_iov = qiov->iov;
        acb->aio_niov = qiov->niov;
-        assert(qiov->size == count);
+        assert(qiov->size == acb->aio_nbytes);
    }

-    trace_paio_submit_co(offset, count, type);
+    trace_paio_submit_co(sector_num, nb_sectors, type);
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_co(pool, aio_worker, acb);
 }

 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
-        int64_t offset, QEMUIOVector *qiov, int count,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type)
 {
    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
@@ -1224,8 +1286,8 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
    acb->aio_type = type;
    acb->aio_fildes = fd;

-    acb->aio_nbytes = count;
-    acb->aio_offset = offset;
+    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
+    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;

    if (qiov) {
        acb->aio_iov = qiov->iov;
@@ -1233,18 +1295,19 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
        assert(qiov->size == acb->aio_nbytes);
    }

-    trace_paio_submit(acb, opaque, offset, count, type);
+    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }

-static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
-                                   uint64_t bytes, QEMUIOVector *qiov, int type)
+static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockCompletionFunc *cb, void *opaque, int type)
 {
    BDRVRawState *s = bs->opaque;

    if (fd_open(bs) < 0)
-        return -EIO;
+        return NULL;

    /*
     * Check if the underlying device requires requests to be aligned,
@@ -1256,38 +1319,23 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
        if (!bdrv_qiov_is_aligned(bs, qiov)) {
            type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
-        } else if (bs->open_flags & BDRV_O_NATIVE_AIO) {
-            LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
-            assert(qiov->size == bytes);
-            return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
+        } else if (s->use_aio) {
+            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
+                               nb_sectors, cb, opaque, type);
 #endif
        }
    }

-    return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
-}
-
-static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
-                                      uint64_t bytes, QEMUIOVector *qiov,
-                                      int flags)
-{
-    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
-}
-
-static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
-                                       uint64_t bytes, QEMUIOVector *qiov,
-                                       int flags)
-{
-    assert(flags == 0);
-    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
+    return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors,
+                       cb, opaque, type);
 }

 static void raw_aio_plug(BlockDriverState *bs)
 {
 #ifdef CONFIG_LINUX_AIO
-    if (bs->open_flags & BDRV_O_NATIVE_AIO) {
-        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
-        laio_io_plug(bs, aio);
+    BDRVRawState *s = bs->opaque;
+    if (s->use_aio) {
+        laio_io_plug(bs, s->aio_ctx);
    }
 #endif
 }
@@ -1295,13 +1343,39 @@ static void raw_aio_plug(BlockDriverState *bs)
 static void raw_aio_unplug(BlockDriverState *bs)
 {
 #ifdef CONFIG_LINUX_AIO
-    if (bs->open_flags & BDRV_O_NATIVE_AIO) {
-        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
-        laio_io_unplug(bs, aio);
+    BDRVRawState *s = bs->opaque;
+    if (s->use_aio) {
+        laio_io_unplug(bs, s->aio_ctx, true);
    }
 #endif
 }

+static void raw_aio_flush_io_queue(BlockDriverState *bs)
+{
+#ifdef CONFIG_LINUX_AIO /* without it this function has an empty body */
+    BDRVRawState *s = bs->opaque;
+    if (s->use_aio) {
+        laio_io_unplug(bs, s->aio_ctx, false); /* unplug=false */
+    }
+#endif
+}
+
+/* AIO read entry point: forwards to raw_aio_submit() with QEMU_AIO_READ. */
+static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, int64_t sector_num,
+    QEMUIOVector *qiov, int nb_sectors, BlockCompletionFunc *cb, void *opaque)
+{
+    return raw_aio_submit(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                          QEMU_AIO_READ);
+}
+
+/* AIO write entry point: forwards to raw_aio_submit() with QEMU_AIO_WRITE. */
+static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, int64_t sector_num,
+    QEMUIOVector *qiov, int nb_sectors, BlockCompletionFunc *cb, void *opaque)
+{
+    return raw_aio_submit(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                          QEMU_AIO_WRITE);
+}
+
 static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
 {
@@ -1317,6 +1391,13 @@ static void raw_close(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;

+    raw_detach_aio_context(bs);
+
+#ifdef CONFIG_LINUX_AIO
+    if (s->use_aio) {
+        laio_cleanup(s->aio_ctx);
+    }
+#endif
    if (s->fd >= 0) {
        qemu_close(s->fd);
        s->fd = -1;
@@ -1786,27 +1867,27 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
    return ret | BDRV_BLOCK_OFFSET_VALID | start;
 }

-static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
-    int64_t offset, int count,
+static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors,
    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;

-    return paio_submit(bs, s->fd, offset, NULL, count,
+    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
                       cb, opaque, QEMU_AIO_DISCARD);
 }

-static int coroutine_fn raw_co_pwrite_zeroes(
-    BlockDriverState *bs, int64_t offset,
-    int count, BdrvRequestFlags flags)
+static int coroutine_fn raw_co_write_zeroes(
+    BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, BdrvRequestFlags flags)
 {
    BDRVRawState *s = bs->opaque;

    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
-        return paio_submit_co(bs, s->fd, offset, NULL, count,
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
                              QEMU_AIO_WRITE_ZEROES);
    } else if (s->discard_zeroes) {
-        return paio_submit_co(bs, s->fd, offset, NULL, count,
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
                              QEMU_AIO_DISCARD);
    }
    return -ENOTSUP;
@@ -1859,15 +1940,16 @@ BlockDriver bdrv_file = {
    .bdrv_create = raw_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = raw_co_get_block_status,
-    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
+    .bdrv_co_write_zeroes = raw_co_write_zeroes,

-    .bdrv_co_preadv         = raw_co_preadv,
-    .bdrv_co_pwritev        = raw_co_pwritev,
+    .bdrv_aio_readv = raw_aio_readv,
+    .bdrv_aio_writev = raw_aio_writev,
    .bdrv_aio_flush = raw_aio_flush,
-    .bdrv_aio_pdiscard = raw_aio_pdiscard,
+    .bdrv_aio_discard = raw_aio_discard,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
+    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
@@ -1875,6 +1957,9 @@ BlockDriver bdrv_file = {
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,

+    .bdrv_detach_aio_context = raw_detach_aio_context,
+    .bdrv_attach_aio_context = raw_attach_aio_context,
+
    .create_opts = &raw_create_opts,
 };

@@ -2140,7 +2225,9 @@ hdev_open_Mac_error:

    ret = raw_open_common(bs, options, flags, 0, &local_err);
    if (ret < 0) {
-        error_propagate(errp, local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+        }
 #if defined(__APPLE__) && defined(__MACH__)
        if (*bsd_path) {
            filename = bsd_path;
@@ -2203,8 +2290,8 @@ static int fd_open(BlockDriverState *bs)
    return -EIO;
 }

-static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs,
-    int64_t offset, int count,
+static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors,
    BlockCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
@@ -2212,12 +2299,12 @@ static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs,
    if (fd_open(bs) < 0) {
        return NULL;
    }
-    return paio_submit(bs, s->fd, offset, NULL, count,
+    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
                       cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
 }

-static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
-    int64_t offset, int count, BdrvRequestFlags flags)
+static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
    BDRVRawState *s = bs->opaque;
    int rc;
@@ -2227,10 +2314,10 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
        return rc;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
-        return paio_submit_co(bs, s->fd, offset, NULL, count,
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
                              QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
    } else if (s->discard_zeroes) {
-        return paio_submit_co(bs, s->fd, offset, NULL, count,
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
                              QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
    }
    return -ENOTSUP;
@@ -2302,15 +2389,16 @@ static BlockDriver bdrv_host_device = {
    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_create         = hdev_create,
    .create_opts         = &raw_create_opts,
-    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
+    .bdrv_co_write_zeroes = hdev_co_write_zeroes,

-    .bdrv_co_preadv         = raw_co_preadv,
-    .bdrv_co_pwritev        = raw_co_pwritev,
+    .bdrv_aio_readv	= raw_aio_readv,
+    .bdrv_aio_writev	= raw_aio_writev,
    .bdrv_aio_flush	= raw_aio_flush,
-    .bdrv_aio_pdiscard   = hdev_aio_pdiscard,
+    .bdrv_aio_discard   = hdev_aio_discard,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
+    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength	= raw_getlength,
@@ -2320,6 +2408,9 @@ static BlockDriver bdrv_host_device = {
    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
    .bdrv_probe_geometry = hdev_probe_geometry,

+    .bdrv_detach_aio_context = raw_detach_aio_context,
+    .bdrv_attach_aio_context = raw_attach_aio_context,
+
    /* generic scsi device */
 #ifdef __linux__
    .bdrv_aio_ioctl     = hdev_aio_ioctl,
@@ -2342,11 +2433,17 @@ static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
 {
    BDRVRawState *s = bs->opaque;
+    Error *local_err = NULL;
+    int ret;

    s->type = FTYPE_CD;

    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
-    return raw_open_common(bs, options, flags, O_NONBLOCK, errp);
+    ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }
+    return ret;
 }

 static int cdrom_probe_device(const char *filename)
@@ -2425,13 +2522,13 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_create         = hdev_create,
    .create_opts         = &raw_create_opts,

-
-    .bdrv_co_preadv         = raw_co_preadv,
-    .bdrv_co_pwritev        = raw_co_pwritev,
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
    .bdrv_aio_flush	= raw_aio_flush,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
+    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength      = raw_getlength,
@@ -2439,6 +2536,9 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,

+    .bdrv_detach_aio_context = raw_detach_aio_context,
+    .bdrv_attach_aio_context = raw_attach_aio_context,
+
    /* removable device support */
    .bdrv_is_inserted   = cdrom_is_inserted,
    .bdrv_eject         = cdrom_eject,
@@ -2461,7 +2561,9 @@ static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,

    ret = raw_open_common(bs, options, flags, 0, &local_err);
    if (ret) {
-        error_propagate(errp, local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+        }
        return ret;
    }

@@ -2556,12 +2658,13 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_create        = hdev_create,
    .create_opts        = &raw_create_opts,

-    .bdrv_co_preadv         = raw_co_preadv,
-    .bdrv_co_pwritev        = raw_co_pwritev,
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
    .bdrv_aio_flush	= raw_aio_flush,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
+    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength      = raw_getlength,
@@ -2569,6 +2672,9 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,

+    .bdrv_detach_aio_context = raw_detach_aio_context,
+    .bdrv_attach_aio_context = raw_attach_aio_context,
+
    /* removable device support */
    .bdrv_is_inserted   = cdrom_is_inserted,
    .bdrv_eject         = cdrom_eject,
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -27,7 +27,7 @@
 #include "qemu/timer.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
-#include "block/raw-aio.h"
+#include "raw-aio.h"
 #include "trace.h"
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
@@ -142,7 +142,7 @@ static int aio_worker(void *arg)
 }

 static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
-        int64_t offset, QEMUIOVector *qiov, int count,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type)
 {
    RawWin32AIOData *acb = g_new(RawWin32AIOData, 1);
@@ -155,12 +155,11 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
    if (qiov) {
        acb->aio_iov = qiov->iov;
        acb->aio_niov = qiov->niov;
-        assert(qiov->size == count);
    }
-    acb->aio_nbytes = count;
-    acb->aio_offset = offset;
+    acb->aio_nbytes = nb_sectors * 512;
+    acb->aio_offset = sector_num * 512;

-    trace_paio_submit(acb, opaque, offset, count, type);
+    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
@@ -223,7 +222,7 @@ static void raw_attach_aio_context(BlockDriverState *bs,
    }
 }

-static void raw_probe_alignment(BlockDriverState *bs, Error **errp)
+static void raw_probe_alignment(BlockDriverState *bs)
 {
    BDRVRawState *s = bs->opaque;
    DWORD sectorsPerCluster, freeClusters, totalClusters, count;
@@ -231,14 +230,14 @@ static void raw_probe_alignment(BlockDriverState *bs, Error **errp)
    BOOL status;

    if (s->type == FTYPE_CD) {
-        bs->bl.request_alignment = 2048;
+        bs->request_alignment = 2048;
        return;
    }
    if (s->type == FTYPE_HARDDISK) {
        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
                                 NULL, 0, &dg, sizeof(dg), &count, NULL);
        if (status != 0) {
-            bs->bl.request_alignment = dg.Geometry.BytesPerSector;
+            bs->request_alignment = dg.Geometry.BytesPerSector;
            return;
        }
        /* try GetDiskFreeSpace too */
@@ -248,7 +247,7 @@ static void raw_probe_alignment(BlockDriverState *bs, Error **errp)
        GetDiskFreeSpace(s->drive_path, &sectorsPerCluster,
                         &dg.Geometry.BytesPerSector,
                         &freeClusters, &totalClusters);
-        bs->bl.request_alignment = dg.Geometry.BytesPerSector;
+        bs->request_alignment = dg.Geometry.BytesPerSector;
    }
 }

@@ -366,6 +365,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
        win32_aio_attach_aio_context(s->aio, bdrv_get_aio_context(bs));
    }

+    raw_probe_alignment(bs);
    ret = 0;
 fail:
    qemu_opts_del(opts);
@@ -379,10 +379,9 @@ static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
    BDRVRawState *s = bs->opaque;
    if (s->aio) {
        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
-                                nb_sectors, cb, opaque, QEMU_AIO_READ);
+                                nb_sectors, cb, opaque, QEMU_AIO_READ); 
    } else {
-        return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
-                           nb_sectors << BDRV_SECTOR_BITS,
+        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
                           cb, opaque, QEMU_AIO_READ);
    }
 }
@@ -394,10 +393,9 @@ static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
    BDRVRawState *s = bs->opaque;
    if (s->aio) {
        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
-                                nb_sectors, cb, opaque, QEMU_AIO_WRITE);
+                                nb_sectors, cb, opaque, QEMU_AIO_WRITE); 
    } else {
-        return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
-                           nb_sectors << BDRV_SECTOR_BITS,
+        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
                           cb, opaque, QEMU_AIO_WRITE);
    }
 }
@@ -552,7 +550,6 @@ BlockDriver bdrv_file = {
    .bdrv_needs_filename = true,
    .bdrv_parse_filename = raw_parse_filename,
    .bdrv_file_open     = raw_open,
-    .bdrv_refresh_limits = raw_probe_alignment,
    .bdrv_close         = raw_close,
    .bdrv_create        = raw_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -1,6 +1,6 @@
 /* BlockDriver implementation for "raw"
 *
- * Copyright (C) 2010-2016 Red Hat, Inc.
+ * Copyright (C) 2010, 2013, Red Hat, Inc.
 * Copyright (C) 2010, Blue Swirl <blauwirbel@gmail.com>
 * Copyright (C) 2009, Anthony Liguori <aliguori@us.ibm.com>
 *
@@ -50,30 +50,33 @@ static int raw_reopen_prepare(BDRVReopenState *reopen_state,
    return 0;
 }

-static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
-                                      uint64_t bytes, QEMUIOVector *qiov,
-                                      int flags)
+static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
+                                     int nb_sectors, QEMUIOVector *qiov)
 {
    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-    return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
+    return bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov);
 }

-static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
-                                       uint64_t bytes, QEMUIOVector *qiov,
-                                       int flags)
+static int coroutine_fn
+raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+                    QEMUIOVector *qiov, int flags)
 {
    void *buf = NULL;
    BlockDriver *drv;
    QEMUIOVector local_qiov;
    int ret;

-    if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) {
-        /* Handling partial writes would be a pain - so we just
-         * require that guests have 512-byte request alignment if
-         * probing occurred */
+    if (bs->probed && sector_num == 0) {
+        /* As long as these conditions are true, we can't get partial writes to
+         * the probe buffer and can just directly check the request. */
        QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512);
        QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512);
-        assert(offset == 0 && bytes >= BLOCK_PROBE_BUF_SIZE);
+
+        if (nb_sectors == 0) {
+            /* qemu_iovec_to_buf() would fail, but we want to return success
+             * instead of -EINVAL in this case. */
+            return 0;
+        }

        buf = qemu_try_blockalign(bs->file->bs, 512);
        if (!buf) {
@@ -102,7 +105,8 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
    }

    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-    ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+    ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE,
+                             nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);

 fail:
    if (qiov == &local_qiov) {
@@ -112,6 +116,13 @@ fail:
    return ret;
 }

+static int coroutine_fn
+raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+              QEMUIOVector *qiov)
+{
+    return raw_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0);
+}
+
 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum,
@@ -123,17 +134,17 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
           (sector_num << BDRV_SECTOR_BITS);
 }

-static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
-                                             int64_t offset, int count,
-                                             BdrvRequestFlags flags)
+static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs,
+                                            int64_t sector_num, int nb_sectors,
+                                            BdrvRequestFlags flags)
 {
-    return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
+    return bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags);
 }

-static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
-                                        int64_t offset, int count)
+static int coroutine_fn raw_co_discard(BlockDriverState *bs,
+                                       int64_t sector_num, int nb_sectors)
 {
-    return bdrv_co_pdiscard(bs->file->bs, offset, count);
+    return bdrv_co_discard(bs->file->bs, sector_num, nb_sectors);
 }

 static int64_t raw_getlength(BlockDriverState *bs)
@@ -148,12 +159,7 @@ static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)

 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
-    if (bs->probed) {
-        /* To make it easier to protect the first sector, any probed
-         * image is restricted to read-modify-write on sub-sector
-         * operations. */
-        bs->bl.request_alignment = BDRV_SECTOR_SIZE;
-    }
+    bs->bl = bs->file->bs->bl;
 }

 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -191,17 +197,20 @@ static int raw_has_zero_init(BlockDriverState *bs)

 static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
 {
-    return bdrv_create_file(filename, opts, errp);
+    Error *local_err = NULL;
+    int ret;
+
+    ret = bdrv_create_file(filename, opts, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }
+    return ret;
 }

 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    bs->sg = bs->file->bs->sg;
-    bs->supported_write_flags = BDRV_REQ_FUA &
-        bs->file->bs->supported_write_flags;
-    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-        bs->file->bs->supported_zero_flags;

    if (bs->probed && !bdrv_is_read_only(bs)) {
        fprintf(stderr,
@@ -246,10 +255,12 @@ BlockDriver bdrv_raw = {
    .bdrv_open            = &raw_open,
    .bdrv_close           = &raw_close,
    .bdrv_create          = &raw_create,
-    .bdrv_co_preadv       = &raw_co_preadv,
-    .bdrv_co_pwritev      = &raw_co_pwritev,
-    .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
-    .bdrv_co_pdiscard     = &raw_co_pdiscard,
+    .bdrv_co_readv        = &raw_co_readv,
+    .bdrv_co_writev       = &raw_co_writev,
+    .bdrv_co_writev_flags = &raw_co_writev_flags,
+    .supported_write_flags = BDRV_REQ_FUA,
+    .bdrv_co_write_zeroes = &raw_co_write_zeroes,
+    .bdrv_co_discard      = &raw_co_discard,
    .bdrv_co_get_block_status = &raw_co_get_block_status,
    .bdrv_truncate        = &raw_truncate,
    .bdrv_getlength       = &raw_getlength,
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -290,8 +290,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
            if (only_read_conf_file) {
                ret = rados_conf_read_file(cluster, value);
                if (ret < 0) {
-                    error_setg_errno(errp, -ret, "error reading conf file %s",
-                                     value);
+                    error_setg(errp, "error reading conf file %s", value);
                    break;
                }
            }
@@ -300,7 +299,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
        } else if (!only_read_conf_file) {
            ret = rados_conf_set(cluster, name, value);
            if (ret < 0) {
-                error_setg_errno(errp, -ret, "invalid conf option %s", name);
+                error_setg(errp, "invalid conf option %s", name);
                ret = -EINVAL;
                break;
            }
@@ -355,10 +354,9 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
-    ret = rados_create(&cluster, clientname);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "error initializing");
-        return ret;
+    if (rados_create(&cluster, clientname) < 0) {
+        error_setg(errp, "error initializing");
+        return -EIO;
    }

    if (strstr(conf, "conf=") == NULL) {
@@ -383,27 +381,21 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
        return -EIO;
    }

-    ret = rados_connect(cluster);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "error connecting");
+    if (rados_connect(cluster) < 0) {
+        error_setg(errp, "error connecting");
        rados_shutdown(cluster);
-        return ret;
+        return -EIO;
    }

-    ret = rados_ioctx_create(cluster, pool, &io_ctx);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "error opening pool %s", pool);
+    if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) {
+        error_setg(errp, "error opening pool %s", pool);
        rados_shutdown(cluster);
-        return ret;
+        return -EIO;
    }

    ret = rbd_create(io_ctx, name, bytes, &obj_order);
    rados_ioctx_destroy(io_ctx);
    rados_shutdown(cluster);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "error rbd create");
-        return ret;
-    }

    return ret;
 }
@@ -508,7 +500,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
    r = rados_create(&s->cluster, clientname);
    if (r < 0) {
-        error_setg_errno(errp, -r, "error initializing");
+        error_setg(errp, "error initializing");
        goto failed_opts;
    }

@@ -554,19 +546,19 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,

    r = rados_connect(s->cluster);
    if (r < 0) {
-        error_setg_errno(errp, -r, "error connecting");
+        error_setg(errp, "error connecting");
        goto failed_shutdown;
    }

    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);
    if (r < 0) {
-        error_setg_errno(errp, -r, "error opening pool %s", pool);
+        error_setg(errp, "error opening pool %s", pool);
        goto failed_shutdown;
    }

    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
    if (r < 0) {
-        error_setg_errno(errp, -r, "error reading header from %s", s->name);
+        error_setg(errp, "error reading header from %s", s->name);
        goto failed_open;
    }

@@ -737,7 +729,7 @@ static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
                                      void *opaque)
 {
    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
-                         nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
+                         (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
                         RBD_AIO_READ);
 }

@@ -749,7 +741,7 @@ static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
                                       void *opaque)
 {
    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
-                         nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
+                         (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
                         RBD_AIO_WRITE);
 }

@@ -882,8 +874,10 @@ static int qemu_rbd_snap_rollback(BlockDriverState *bs,
                                  const char *snapshot_name)
 {
    BDRVRBDState *s = bs->opaque;
+    int r;

-    return rbd_snap_rollback(s->image, snapshot_name);
+    r = rbd_snap_rollback(s->image, snapshot_name);
+    return r;
 }

 static int qemu_rbd_snap_list(BlockDriverState *bs,
@@ -930,13 +924,14 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
 }

 #ifdef LIBRBD_SUPPORTS_DISCARD
-static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs,
-                                         int64_t offset,
-                                         int count,
-                                         BlockCompletionFunc *cb,
-                                         void *opaque)
+static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
+                                        int64_t sector_num,
+                                        int nb_sectors,
+                                        BlockCompletionFunc *cb,
+                                        void *opaque)
 {
-    return rbd_start_aio(bs, offset, NULL, count, cb, opaque,
+    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, NULL,
+                         nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
                         RBD_AIO_DISCARD);
 }
 #endif
@@ -1000,7 +995,7 @@ static BlockDriver bdrv_rbd = {
 #endif

 #ifdef LIBRBD_SUPPORTS_DISCARD
-    .bdrv_aio_pdiscard      = qemu_rbd_aio_pdiscard,
+    .bdrv_aio_discard       = qemu_rbd_aio_discard,
 #endif

    .bdrv_snapshot_create   = qemu_rbd_snap_create,
--- a/block/replication.c
+++ b/block/replication.c
@@ -1,659 +0,0 @@
-/*
- * Replication Block filter
- *
- * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
- * Copyright (c) 2016 Intel Corporation
- * Copyright (c) 2016 FUJITSU LIMITED
- *
- * Author:
- *   Wen Congyang <wency@cn.fujitsu.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- */
-
-#include "qemu/osdep.h"
-#include "qemu-common.h"
-#include "block/nbd.h"
-#include "block/blockjob.h"
-#include "block/block_int.h"
-#include "block/block_backup.h"
-#include "sysemu/block-backend.h"
-#include "qapi/error.h"
-#include "replication.h"
-
-typedef struct BDRVReplicationState {
-    ReplicationMode mode;
-    int replication_state;
-    BdrvChild *active_disk;
-    BdrvChild *hidden_disk;
-    BdrvChild *secondary_disk;
-    char *top_id;
-    ReplicationState *rs;
-    Error *blocker;
-    int orig_hidden_flags;
-    int orig_secondary_flags;
-    int error;
-} BDRVReplicationState;
-
-enum {
-    BLOCK_REPLICATION_NONE,             /* block replication is not started */
-    BLOCK_REPLICATION_RUNNING,          /* block replication is running */
-    BLOCK_REPLICATION_FAILOVER,         /* failover is running in background */
-    BLOCK_REPLICATION_FAILOVER_FAILED,  /* failover failed */
-    BLOCK_REPLICATION_DONE,             /* block replication is done */
-};
-
-static void replication_start(ReplicationState *rs, ReplicationMode mode,
-                              Error **errp);
-static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
-static void replication_get_error(ReplicationState *rs, Error **errp);
-static void replication_stop(ReplicationState *rs, bool failover,
-                             Error **errp);
-
-#define REPLICATION_MODE        "mode"
-#define REPLICATION_TOP_ID      "top-id"
-static QemuOptsList replication_runtime_opts = {
-    .name = "replication",
-    .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
-    .desc = {
-        {
-            .name = REPLICATION_MODE,
-            .type = QEMU_OPT_STRING,
-        },
-        {
-            .name = REPLICATION_TOP_ID,
-            .type = QEMU_OPT_STRING,
-        },
-        { /* end of list */ }
-    },
-};
-
-static ReplicationOps replication_ops = {
-    .start = replication_start,
-    .checkpoint = replication_do_checkpoint,
-    .get_error = replication_get_error,
-    .stop = replication_stop,
-};
-
-static int replication_open(BlockDriverState *bs, QDict *options,
-                            int flags, Error **errp)
-{
-    int ret;
-    BDRVReplicationState *s = bs->opaque;
-    Error *local_err = NULL;
-    QemuOpts *opts = NULL;
-    const char *mode;
-    const char *top_id;
-
-    ret = -EINVAL;
-    opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
-    qemu_opts_absorb_qdict(opts, options, &local_err);
-    if (local_err) {
-        goto fail;
-    }
-
-    mode = qemu_opt_get(opts, REPLICATION_MODE);
-    if (!mode) {
-        error_setg(&local_err, "Missing the option mode");
-        goto fail;
-    }
-
-    if (!strcmp(mode, "primary")) {
-        s->mode = REPLICATION_MODE_PRIMARY;
-    } else if (!strcmp(mode, "secondary")) {
-        s->mode = REPLICATION_MODE_SECONDARY;
-        top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
-        s->top_id = g_strdup(top_id);
-        if (!s->top_id) {
-            error_setg(&local_err, "Missing the option top-id");
-            goto fail;
-        }
-    } else {
-        error_setg(&local_err,
-                   "The option mode's value should be primary or secondary");
-        goto fail;
-    }
-
-    s->rs = replication_new(bs, &replication_ops);
-
-    ret = 0;
-
-fail:
-    qemu_opts_del(opts);
-    error_propagate(errp, local_err);
-
-    return ret;
-}
-
-static void replication_close(BlockDriverState *bs)
-{
-    BDRVReplicationState *s = bs->opaque;
-
-    if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
-        replication_stop(s->rs, false, NULL);
-    }
-
-    if (s->mode == REPLICATION_MODE_SECONDARY) {
-        g_free(s->top_id);
-    }
-
-    replication_remove(s->rs);
-}
-
-static int64_t replication_getlength(BlockDriverState *bs)
-{
-    return bdrv_getlength(bs->file->bs);
-}
-
-static int replication_get_io_status(BDRVReplicationState *s)
-{
-    switch (s->replication_state) {
-    case BLOCK_REPLICATION_NONE:
-        return -EIO;
-    case BLOCK_REPLICATION_RUNNING:
-        return 0;
-    case BLOCK_REPLICATION_FAILOVER:
-        return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
-    case BLOCK_REPLICATION_FAILOVER_FAILED:
-        return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
-    case BLOCK_REPLICATION_DONE:
-        /*
-         * active commit job completes, and active disk and secondary_disk
-         * is swapped, so we can operate bs->file directly
-         */
-        return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
-    default:
-        abort();
-    }
-}
-
-static int replication_return_value(BDRVReplicationState *s, int ret)
-{
-    if (s->mode == REPLICATION_MODE_SECONDARY) {
-        return ret;
-    }
-
-    if (ret < 0) {
-        s->error = ret;
-        ret = 0;
-    }
-
-    return ret;
-}
-
-static coroutine_fn int replication_co_readv(BlockDriverState *bs,
-                                             int64_t sector_num,
-                                             int remaining_sectors,
-                                             QEMUIOVector *qiov)
-{
-    BDRVReplicationState *s = bs->opaque;
-    BdrvChild *child = s->secondary_disk;
-    BlockJob *job = NULL;
-    CowRequest req;
-    int ret;
-
-    if (s->mode == REPLICATION_MODE_PRIMARY) {
-        /* We only use it to forward primary write requests */
-        return -EIO;
-    }
-
-    ret = replication_get_io_status(s);
-    if (ret < 0) {
-        return ret;
-    }
-
-    if (child && child->bs) {
-        job = child->bs->job;
-    }
-
-    if (job) {
-        backup_wait_for_overlapping_requests(child->bs->job, sector_num,
-                                             remaining_sectors);
-        backup_cow_request_begin(&req, child->bs->job, sector_num,
-                                 remaining_sectors);
-        ret = bdrv_co_readv(bs->file, sector_num, remaining_sectors,
-                            qiov);
-        backup_cow_request_end(&req);
-        goto out;
-    }
-
-    ret = bdrv_co_readv(bs->file, sector_num, remaining_sectors, qiov);
-out:
-    return replication_return_value(s, ret);
-}
-
-static coroutine_fn int replication_co_writev(BlockDriverState *bs,
-                                              int64_t sector_num,
-                                              int remaining_sectors,
-                                              QEMUIOVector *qiov)
-{
-    BDRVReplicationState *s = bs->opaque;
-    QEMUIOVector hd_qiov;
-    uint64_t bytes_done = 0;
-    BdrvChild *top = bs->file;
-    BdrvChild *base = s->secondary_disk;
-    BdrvChild *target;
-    int ret, n;
-
-    ret = replication_get_io_status(s);
-    if (ret < 0) {
-        goto out;
-    }
-
-    if (ret == 0) {
-        ret = bdrv_co_writev(top, sector_num,
-                             remaining_sectors, qiov);
-        return replication_return_value(s, ret);
-    }
-
-    /*
-     * Failover failed, only write to active disk if the sectors
-     * have already been allocated in active disk/hidden disk.
-     */
-    qemu_iovec_init(&hd_qiov, qiov->niov);
-    while (remaining_sectors > 0) {
-        ret = bdrv_is_allocated_above(top->bs, base->bs, sector_num,
-                                      remaining_sectors, &n);
-        if (ret < 0) {
-            goto out1;
-        }
-
-        qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, n * BDRV_SECTOR_SIZE);
-
-        target = ret ? top : base;
-        ret = bdrv_co_writev(target, sector_num, n, &hd_qiov);
-        if (ret < 0) {
-            goto out1;
-        }
-
-        remaining_sectors -= n;
-        sector_num += n;
-        bytes_done += n * BDRV_SECTOR_SIZE;
-    }
-
-out1:
-    qemu_iovec_destroy(&hd_qiov);
-out:
-    return ret;
-}
-
-static bool replication_recurse_is_first_non_filter(BlockDriverState *bs,
-                                                    BlockDriverState *candidate)
-{
-    return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
-}
-
-static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
-{
-    Error *local_err = NULL;
-    int ret;
-
-    if (!s->secondary_disk->bs->job) {
-        error_setg(errp, "Backup job was cancelled unexpectedly");
-        return;
-    }
-
-    backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        return;
-    }
-
-    ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
-    if (ret < 0) {
-        error_setg(errp, "Cannot make active disk empty");
-        return;
-    }
-
-    ret = s->hidden_disk->bs->drv->bdrv_make_empty(s->hidden_disk->bs);
-    if (ret < 0) {
-        error_setg(errp, "Cannot make hidden disk empty");
-        return;
-    }
-}
-
-static void reopen_backing_file(BDRVReplicationState *s, bool writable,
-                                Error **errp)
-{
-    BlockReopenQueue *reopen_queue = NULL;
-    int orig_hidden_flags, orig_secondary_flags;
-    int new_hidden_flags, new_secondary_flags;
-    Error *local_err = NULL;
-
-    if (writable) {
-        orig_hidden_flags = s->orig_hidden_flags =
-                                bdrv_get_flags(s->hidden_disk->bs);
-        new_hidden_flags = (orig_hidden_flags | BDRV_O_RDWR) &
-                                                    ~BDRV_O_INACTIVE;
-        orig_secondary_flags = s->orig_secondary_flags =
-                                bdrv_get_flags(s->secondary_disk->bs);
-        new_secondary_flags = (orig_secondary_flags | BDRV_O_RDWR) &
-                                                     ~BDRV_O_INACTIVE;
-    } else {
-        orig_hidden_flags = (s->orig_hidden_flags | BDRV_O_RDWR) &
-                                                    ~BDRV_O_INACTIVE;
-        new_hidden_flags = s->orig_hidden_flags;
-        orig_secondary_flags = (s->orig_secondary_flags | BDRV_O_RDWR) &
-                                                    ~BDRV_O_INACTIVE;
-        new_secondary_flags = s->orig_secondary_flags;
-    }
-
-    if (orig_hidden_flags != new_hidden_flags) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
-                                         new_hidden_flags);
-    }
-
-    if (!(orig_secondary_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
-                                         NULL, new_secondary_flags);
-    }
-
-    if (reopen_queue) {
-        bdrv_reopen_multiple(reopen_queue, &local_err);
-        error_propagate(errp, local_err);
-    }
-}
-
-static void backup_job_cleanup(BDRVReplicationState *s)
-{
-    BlockDriverState *top_bs;
-
-    top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
-    if (!top_bs) {
-        return;
-    }
-    bdrv_op_unblock_all(top_bs, s->blocker);
-    error_free(s->blocker);
-    reopen_backing_file(s, false, NULL);
-}
-
-static void backup_job_completed(void *opaque, int ret)
-{
-    BDRVReplicationState *s = opaque;
-
-    if (s->replication_state != BLOCK_REPLICATION_FAILOVER) {
-        /* The backup job is cancelled unexpectedly */
-        s->error = -EIO;
-    }
-
-    backup_job_cleanup(s);
-}
-
-static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
-{
-    BdrvChild *child;
-
-    /* The bs itself is the top_bs */
-    if (top_bs == bs) {
-        return true;
-    }
-
-    /* Iterate over top_bs's children */
-    QLIST_FOREACH(child, &top_bs->children, next) {
-        if (child->bs == bs || check_top_bs(child->bs, bs)) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-static void replication_start(ReplicationState *rs, ReplicationMode mode,
-                              Error **errp)
-{
-    BlockDriverState *bs = rs->opaque;
-    BDRVReplicationState *s;
-    BlockDriverState *top_bs;
-    int64_t active_length, hidden_length, disk_length;
-    AioContext *aio_context;
-    Error *local_err = NULL;
-
-    aio_context = bdrv_get_aio_context(bs);
-    aio_context_acquire(aio_context);
-    s = bs->opaque;
-
-    if (s->replication_state != BLOCK_REPLICATION_NONE) {
-        error_setg(errp, "Block replication is running or done");
-        aio_context_release(aio_context);
-        return;
-    }
-
-    if (s->mode != mode) {
-        error_setg(errp, "The parameter mode's value is invalid, needs %d,"
-                   " but got %d", s->mode, mode);
-        aio_context_release(aio_context);
-        return;
-    }
-
-    switch (s->mode) {
-    case REPLICATION_MODE_PRIMARY:
-        break;
-    case REPLICATION_MODE_SECONDARY:
-        s->active_disk = bs->file;
-        if (!s->active_disk || !s->active_disk->bs ||
-                                    !s->active_disk->bs->backing) {
-            error_setg(errp, "Active disk doesn't have backing file");
-            aio_context_release(aio_context);
-            return;
-        }
-
-        s->hidden_disk = s->active_disk->bs->backing;
-        if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
-            error_setg(errp, "Hidden disk doesn't have backing file");
-            aio_context_release(aio_context);
-            return;
-        }
-
-        s->secondary_disk = s->hidden_disk->bs->backing;
-        if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
-            error_setg(errp, "The secondary disk doesn't have block backend");
-            aio_context_release(aio_context);
-            return;
-        }
-
-        /* verify the length */
-        active_length = bdrv_getlength(s->active_disk->bs);
-        hidden_length = bdrv_getlength(s->hidden_disk->bs);
-        disk_length = bdrv_getlength(s->secondary_disk->bs);
-        if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
-            active_length != hidden_length || hidden_length != disk_length) {
-            error_setg(errp, "Active disk, hidden disk, secondary disk's length"
-                       " are not the same");
-            aio_context_release(aio_context);
-            return;
-        }
-
-        if (!s->active_disk->bs->drv->bdrv_make_empty ||
-            !s->hidden_disk->bs->drv->bdrv_make_empty) {
-            error_setg(errp,
-                       "Active disk or hidden disk doesn't support make_empty");
-            aio_context_release(aio_context);
-            return;
-        }
-
-        /* reopen the backing file in r/w mode */
-        reopen_backing_file(s, true, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            aio_context_release(aio_context);
-            return;
-        }
-
-        /* start backup job now */
-        error_setg(&s->blocker,
-                   "Block device is in use by internal backup job");
-
-        top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
-        if (!top_bs || !bdrv_is_root_node(top_bs) ||
-            !check_top_bs(top_bs, bs)) {
-            error_setg(errp, "No top_bs or it is invalid");
-            reopen_backing_file(s, false, NULL);
-            aio_context_release(aio_context);
-            return;
-        }
-        bdrv_op_block_all(top_bs, s->blocker);
-        bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
-
-        backup_start("replication-backup", s->secondary_disk->bs,
-                     s->hidden_disk->bs, 0, MIRROR_SYNC_MODE_NONE, NULL, false,
-                     BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
-                     backup_job_completed, s, NULL, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            backup_job_cleanup(s);
-            aio_context_release(aio_context);
-            return;
-        }
-        break;
-    default:
-        aio_context_release(aio_context);
-        abort();
-    }
-
-    s->replication_state = BLOCK_REPLICATION_RUNNING;
-
-    if (s->mode == REPLICATION_MODE_SECONDARY) {
-        secondary_do_checkpoint(s, errp);
-    }
-
-    s->error = 0;
-    aio_context_release(aio_context);
-}
-
-static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
-{
-    BlockDriverState *bs = rs->opaque;
-    BDRVReplicationState *s;
-    AioContext *aio_context;
-
-    aio_context = bdrv_get_aio_context(bs);
-    aio_context_acquire(aio_context);
-    s = bs->opaque;
-
-    if (s->mode == REPLICATION_MODE_SECONDARY) {
-        secondary_do_checkpoint(s, errp);
-    }
-    aio_context_release(aio_context);
-}
-
-static void replication_get_error(ReplicationState *rs, Error **errp)
-{
-    BlockDriverState *bs = rs->opaque;
-    BDRVReplicationState *s;
-    AioContext *aio_context;
-
-    aio_context = bdrv_get_aio_context(bs);
-    aio_context_acquire(aio_context);
-    s = bs->opaque;
-
-    if (s->replication_state != BLOCK_REPLICATION_RUNNING) {
-        error_setg(errp, "Block replication is not running");
-        aio_context_release(aio_context);
-        return;
-    }
-
-    if (s->error) {
-        error_setg(errp, "I/O error occurred");
-        aio_context_release(aio_context);
-        return;
-    }
-    aio_context_release(aio_context);
-}
-
-static void replication_done(void *opaque, int ret)
-{
-    BlockDriverState *bs = opaque;
-    BDRVReplicationState *s = bs->opaque;
-
-    if (ret == 0) {
-        s->replication_state = BLOCK_REPLICATION_DONE;
-
-        /* refresh top bs's filename */
-        bdrv_refresh_filename(bs);
-        s->active_disk = NULL;
-        s->secondary_disk = NULL;
-        s->hidden_disk = NULL;
-        s->error = 0;
-    } else {
-        s->replication_state = BLOCK_REPLICATION_FAILOVER_FAILED;
-        s->error = -EIO;
-    }
-}
-
-static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
-{
-    BlockDriverState *bs = rs->opaque;
-    BDRVReplicationState *s;
-    AioContext *aio_context;
-
-    aio_context = bdrv_get_aio_context(bs);
-    aio_context_acquire(aio_context);
-    s = bs->opaque;
-
-    if (s->replication_state != BLOCK_REPLICATION_RUNNING) {
-        error_setg(errp, "Block replication is not running");
-        aio_context_release(aio_context);
-        return;
-    }
-
-    switch (s->mode) {
-    case REPLICATION_MODE_PRIMARY:
-        s->replication_state = BLOCK_REPLICATION_DONE;
-        s->error = 0;
-        break;
-    case REPLICATION_MODE_SECONDARY:
-        /*
-         * This BDS will be closed, and the job should be completed
-         * before the BDS is closed, because we will access hidden
-         * disk, secondary disk in backup_job_completed().
-         */
-        if (s->secondary_disk->bs->job) {
-            block_job_cancel_sync(s->secondary_disk->bs->job);
-        }
-
-        if (!failover) {
-            secondary_do_checkpoint(s, errp);
-            s->replication_state = BLOCK_REPLICATION_DONE;
-            aio_context_release(aio_context);
-            return;
-        }
-
-        s->replication_state = BLOCK_REPLICATION_FAILOVER;
-        commit_active_start("replication-commit", s->active_disk->bs,
-                            s->secondary_disk->bs, 0, BLOCKDEV_ON_ERROR_REPORT,
-                            replication_done,
-                            bs, errp, true);
-        break;
-    default:
-        aio_context_release(aio_context);
-        abort();
-    }
-    aio_context_release(aio_context);
-}
-
-BlockDriver bdrv_replication = {
-    .format_name                = "replication",
-    .protocol_name              = "replication",
-    .instance_size              = sizeof(BDRVReplicationState),
-
-    .bdrv_open                  = replication_open,
-    .bdrv_close                 = replication_close,
-
-    .bdrv_getlength             = replication_getlength,
-    .bdrv_co_readv              = replication_co_readv,
-    .bdrv_co_writev             = replication_co_writev,
-
-    .is_filter                  = true,
-    .bdrv_recurse_is_first_non_filter = replication_recurse_is_first_non_filter,
-
-    .has_variable_length        = true,
-};
-
-static void bdrv_replication_init(void)
-{
-    bdrv_register(&bdrv_replication);
-}
-
-block_init(bdrv_replication_init);
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -294,16 +294,13 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode)

 #undef DPRINTF
 #ifdef DEBUG_SDOG
-#define DEBUG_SDOG_PRINT 1
-#else
-#define DEBUG_SDOG_PRINT 0
-#endif
-#define DPRINTF(fmt, args...)                                           \
-    do {                                                                \
-        if (DEBUG_SDOG_PRINT) {                                         \
-            fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
-        }                                                               \
+#define DPRINTF(fmt, args...)                                       \
+    do {                                                            \
+        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
    } while (0)
+#else
+#define DPRINTF(fmt, args...)
+#endif

 typedef struct SheepdogAIOCB SheepdogAIOCB;

@@ -495,7 +492,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)

 static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
 {
-    qemu_coroutine_enter(acb->coroutine);
+    qemu_coroutine_enter(acb->coroutine, NULL);
    qemu_aio_unref(acb);
 }

@@ -636,7 +633,7 @@ static void restart_co_req(void *opaque)
 {
    Coroutine *co = opaque;

-    qemu_coroutine_enter(co);
+    qemu_coroutine_enter(co, NULL);
 }

 typedef struct SheepdogReqCo {
@@ -726,8 +723,8 @@ static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
    if (qemu_in_coroutine()) {
        do_co_req(&srco);
    } else {
-        co = qemu_coroutine_create(do_co_req, &srco);
-        qemu_coroutine_enter(co);
+        co = qemu_coroutine_create(do_co_req);
+        qemu_coroutine_enter(co, &srco);
        while (!srco.finished) {
            aio_poll(aio_context, true);
        }
@@ -925,17 +922,17 @@ static void co_read_response(void *opaque)
    BDRVSheepdogState *s = opaque;

    if (!s->co_recv) {
-        s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
+        s->co_recv = qemu_coroutine_create(aio_read_response);
    }

-    qemu_coroutine_enter(s->co_recv);
+    qemu_coroutine_enter(s->co_recv, opaque);
 }

 static void co_write_request(void *opaque)
 {
    BDRVSheepdogState *s = opaque;

-    qemu_coroutine_enter(s->co_send);
+    qemu_coroutine_enter(s->co_send, NULL);
 }

 /*
@@ -1049,7 +1046,7 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
    const char *host_spec, *vdi_spec;
    int nr_sep, ret;

-    strstart(filename, "sheepdog:", &filename);
+    strstart(filename, "sheepdog:", (const char **)&filename);
    p = q = g_strdup(filename);

    /* count the number of separators */
@@ -2652,7 +2649,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
    req.opcode = SD_OP_READ_VDIS;
    req.data_length = max;

-    ret = do_req(fd, s->aio_context, &req,
+    ret = do_req(fd, s->aio_context, (SheepdogReq *)&req,
                 vdi_inuse, &wlen, &rlen);

    closesocket(fd);
@@ -2784,24 +2781,17 @@ static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
    return ret;
 }

-static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
-                           int64_t pos)
+static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
+                           int64_t pos, int size)
 {
    BDRVSheepdogState *s = bs->opaque;
-    void *buf;
-    int ret;

-    buf = qemu_blockalign(bs, qiov->size);
-    ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1);
-    qemu_iovec_from_buf(qiov, 0, buf, qiov->size);
-    qemu_vfree(buf);
-
-    return ret;
+    return do_load_save_vmstate(s, data, pos, size, 1);
 }


-static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
-                                      int count)
+static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors)
 {
    SheepdogAIOCB *acb;
    BDRVSheepdogState *s = bs->opaque;
@@ -2811,7 +2801,7 @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
    uint32_t zero = 0;

    if (!s->discard_supported) {
-        return 0;
+            return 0;
    }

    memset(&discard_iov, 0, sizeof(discard_iov));
@@ -2820,10 +2810,7 @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
    iov.iov_len = sizeof(zero);
    discard_iov.iov = &iov;
    discard_iov.niov = 1;
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((count & (BDRV_SECTOR_SIZE - 1)) == 0);
-    acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS,
-                       count >> BDRV_SECTOR_BITS);
+    acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors);
    acb->aiocb_type = AIOCB_DISCARD_OBJ;
    acb->aio_done_func = sd_finish_aiocb;

@@ -2957,7 +2944,7 @@ static BlockDriver bdrv_sheepdog = {
    .bdrv_co_readv  = sd_co_readv,
    .bdrv_co_writev = sd_co_writev,
    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
-    .bdrv_co_pdiscard = sd_co_pdiscard,
+    .bdrv_co_discard = sd_co_discard,
    .bdrv_co_get_block_status = sd_co_get_block_status,

    .bdrv_snapshot_create   = sd_snapshot_create,
@@ -2993,7 +2980,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
    .bdrv_co_readv  = sd_co_readv,
    .bdrv_co_writev = sd_co_writev,
    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
-    .bdrv_co_pdiscard = sd_co_pdiscard,
+    .bdrv_co_discard = sd_co_discard,
    .bdrv_co_get_block_status = sd_co_get_block_status,

    .bdrv_snapshot_create   = sd_snapshot_create,
@@ -3029,7 +3016,7 @@ static BlockDriver bdrv_sheepdog_unix = {
    .bdrv_co_readv  = sd_co_readv,
    .bdrv_co_writev = sd_co_writev,
    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
-    .bdrv_co_pdiscard = sd_co_pdiscard,
+    .bdrv_co_discard = sd_co_discard,
    .bdrv_co_get_block_status = sd_co_get_block_status,

    .bdrv_snapshot_create   = sd_snapshot_create,
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -358,7 +358,9 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
        ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err);
    }

-    error_propagate(errp, local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }

    return ret;
 }
@@ -371,10 +373,9 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
 bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
 {
    bool ok = true;
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;

-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    while (ok && (bs = bdrv_next(bs))) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -382,12 +383,8 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
            ok = bdrv_can_snapshot(bs);
        }
        aio_context_release(ctx);
-        if (!ok) {
-            goto fail;
-        }
    }

-fail:
    *first_bad_bs = bs;
    return ok;
 }
@@ -396,11 +393,10 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
                             Error **err)
 {
    int ret = 0;
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;
    QEMUSnapshotInfo sn1, *snapshot = &sn1;

-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    while (ret == 0 && (bs = bdrv_next(bs))) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -409,12 +405,8 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
            ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err);
        }
        aio_context_release(ctx);
-        if (ret < 0) {
-            goto fail;
-        }
    }

-fail:
    *first_bad_bs = bs;
    return ret;
 }
@@ -423,10 +415,9 @@ fail:
 int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
 {
    int err = 0;
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;

-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    while (err == 0 && (bs = bdrv_next(bs))) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -434,12 +425,8 @@ int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
            err = bdrv_snapshot_goto(bs, name);
        }
        aio_context_release(ctx);
-        if (err < 0) {
-            goto fail;
-        }
    }

-fail:
    *first_bad_bs = bs;
    return err;
 }
@@ -448,10 +435,9 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
 {
    QEMUSnapshotInfo sn;
    int err = 0;
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;

-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    while (err == 0 && (bs = bdrv_next(bs))) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -459,12 +445,8 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
            err = bdrv_snapshot_find(bs, &sn, name);
        }
        aio_context_release(ctx);
-        if (err < 0) {
-            goto fail;
-        }
    }

-fail:
    *first_bad_bs = bs;
    return err;
 }
@@ -475,10 +457,9 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
                             BlockDriverState **first_bad_bs)
 {
    int err = 0;
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;

-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    while (err == 0 && (bs = bdrv_next(bs))) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -490,32 +471,23 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
            err = bdrv_snapshot_create(bs, sn);
        }
        aio_context_release(ctx);
-        if (err < 0) {
-            goto fail;
-        }
    }

-fail:
    *first_bad_bs = bs;
    return err;
 }

 BlockDriverState *bdrv_all_find_vmstate_bs(void)
 {
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    bool not_found = true;
+    BlockDriverState *bs = NULL;

-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    while (not_found && (bs = bdrv_next(bs))) {
        AioContext *ctx = bdrv_get_aio_context(bs);
-        bool found;

        aio_context_acquire(ctx);
-        found = bdrv_can_snapshot(bs);
+        not_found = !bdrv_can_snapshot(bs);
        aio_context_release(ctx);
-
-        if (found) {
-            break;
-        }
    }
    return bs;
 }
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -508,73 +508,36 @@ static int authenticate(BDRVSSHState *s, const char *user, Error **errp)
    return ret;
 }

-static QemuOptsList ssh_runtime_opts = {
-    .name = "ssh",
-    .head = QTAILQ_HEAD_INITIALIZER(ssh_runtime_opts.head),
-    .desc = {
-        {
-            .name = "host",
-            .type = QEMU_OPT_STRING,
-            .help = "Host to connect to",
-        },
-        {
-            .name = "port",
-            .type = QEMU_OPT_NUMBER,
-            .help = "Port to connect to",
-        },
-        {
-            .name = "path",
-            .type = QEMU_OPT_STRING,
-            .help = "Path of the image on the host",
-        },
-        {
-            .name = "user",
-            .type = QEMU_OPT_STRING,
-            .help = "User as which to connect",
-        },
-        {
-            .name = "host_key_check",
-            .type = QEMU_OPT_STRING,
-            .help = "Defines how and what to check the host key against",
-        },
-    },
-};
-
 static int connect_to_ssh(BDRVSSHState *s, QDict *options,
                          int ssh_flags, int creat_mode, Error **errp)
 {
    int r, ret;
-    QemuOpts *opts = NULL;
-    Error *local_err = NULL;
    const char *host, *user, *path, *host_key_check;
    int port;

-    opts = qemu_opts_create(&ssh_runtime_opts, NULL, 0, &error_abort);
-    qemu_opts_absorb_qdict(opts, options, &local_err);
-    if (local_err) {
-        ret = -EINVAL;
-        error_propagate(errp, local_err);
-        goto err;
-    }
-
-    host = qemu_opt_get(opts, "host");
-    if (!host) {
+    if (!qdict_haskey(options, "host")) {
        ret = -EINVAL;
        error_setg(errp, "No hostname was specified");
        goto err;
    }
+    host = qdict_get_str(options, "host");

-    port = qemu_opt_get_number(opts, "port", 22);
+    if (qdict_haskey(options, "port")) {
+        port = qdict_get_int(options, "port");
+    } else {
+        port = 22;
+    }

-    path = qemu_opt_get(opts, "path");
-    if (!path) {
+    if (!qdict_haskey(options, "path")) {
        ret = -EINVAL;
        error_setg(errp, "No path was specified");
        goto err;
    }
+    path = qdict_get_str(options, "path");

-    user = qemu_opt_get(opts, "user");
-    if (!user) {
+    if (qdict_haskey(options, "user")) {
+        user = qdict_get_str(options, "user");
+    } else {
        user = g_get_user_name();
        if (!user) {
            error_setg_errno(errp, errno, "Can't get user name");
@@ -583,8 +546,9 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
        }
    }

-    host_key_check = qemu_opt_get(opts, "host_key_check");
-    if (!host_key_check) {
+    if (qdict_haskey(options, "host_key_check")) {
+        host_key_check = qdict_get_str(options, "host_key_check");
+    } else {
        host_key_check = "yes";
    }

@@ -648,14 +612,21 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
        goto err;
    }

-    qemu_opts_del(opts);
-
    r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs);
    if (r < 0) {
        sftp_error_setg(errp, s, "failed to read file attributes");
        return -EINVAL;
    }

+    /* Delete the options we've used; any not deleted will cause the
+     * block layer to give an error about unused options.
+     */
+    qdict_del(options, "host");
+    qdict_del(options, "port");
+    qdict_del(options, "user");
+    qdict_del(options, "path");
+    qdict_del(options, "host_key_check");
+
    return 0;

 err:
@@ -675,8 +646,6 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
    }
    s->session = NULL;

-    qemu_opts_del(opts);
-
    return ret;
 }

@@ -808,7 +777,7 @@ static void restart_coroutine(void *opaque)

    DPRINTF("co=%p", co);

-    qemu_coroutine_enter(co);
+    qemu_coroutine_enter(co, NULL);
 }

 static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
--- a/block/stream.c
+++ b/block/stream.c
@@ -39,7 +39,7 @@ typedef struct StreamBlockJob {
    char *backing_file_str;
 } StreamBlockJob;

-static int coroutine_fn stream_populate(BlockBackend *blk,
+static int coroutine_fn stream_populate(BlockDriverState *bs,
                                        int64_t sector_num, int nb_sectors,
                                        void *buf)
 {
@@ -52,8 +52,7 @@ static int coroutine_fn stream_populate(BlockBackend *blk,
    qemu_iovec_init_external(&qiov, &iov, 1);

    /* Copy-on-read the unallocated clusters */
-    return blk_co_preadv(blk, sector_num * BDRV_SECTOR_SIZE, qiov.size, &qiov,
-                         BDRV_REQ_COPY_ON_READ);
+    return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov);
 }

 typedef struct {
@@ -65,7 +64,6 @@ static void stream_complete(BlockJob *job, void *opaque)
 {
    StreamBlockJob *s = container_of(job, StreamBlockJob, common);
    StreamCompleteData *data = opaque;
-    BlockDriverState *bs = blk_bs(job->blk);
    BlockDriverState *base = s->base;

    if (!block_job_is_cancelled(&s->common) && data->reached_end &&
@@ -77,8 +75,8 @@ static void stream_complete(BlockJob *job, void *opaque)
                base_fmt = base->drv->format_name;
            }
        }
-        data->ret = bdrv_change_backing_file(bs, base_id, base_fmt);
-        bdrv_set_backing_hd(bs, base);
+        data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt);
+        bdrv_set_backing_hd(job->bs, base);
    }

    g_free(s->backing_file_str);
@@ -90,12 +88,10 @@ static void coroutine_fn stream_run(void *opaque)
 {
    StreamBlockJob *s = opaque;
    StreamCompleteData *data;
-    BlockBackend *blk = s->common.blk;
-    BlockDriverState *bs = blk_bs(blk);
+    BlockDriverState *bs = s->common.bs;
    BlockDriverState *base = s->base;
    int64_t sector_num = 0;
    int64_t end = -1;
-    uint64_t delay_ns = 0;
    int error = 0;
    int ret = 0;
    int n = 0;
@@ -124,8 +120,10 @@ static void coroutine_fn stream_run(void *opaque)
    }

    for (sector_num = 0; sector_num < end; sector_num += n) {
+        uint64_t delay_ns = 0;
        bool copy;

+wait:
        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
@@ -155,11 +153,18 @@ static void coroutine_fn stream_run(void *opaque)
        }
        trace_stream_one_iteration(s, sector_num, n, ret);
        if (copy) {
-            ret = stream_populate(blk, sector_num, n, buf);
+            if (s->common.speed) {
+                delay_ns = ratelimit_calculate_delay(&s->limit, n);
+                if (delay_ns > 0) {
+                    goto wait;
+                }
+            }
+            ret = stream_populate(bs, sector_num, n, buf);
        }
        if (ret < 0) {
            BlockErrorAction action =
-                block_job_error_action(&s->common, s->on_error, true, -ret);
+                block_job_error_action(&s->common, s->common.bs, s->on_error,
+                                       true, -ret);
            if (action == BLOCK_ERROR_ACTION_STOP) {
                n = 0;
                continue;
@@ -175,9 +180,6 @@ static void coroutine_fn stream_run(void *opaque)

        /* Publish progress */
        s->common.offset += n * BDRV_SECTOR_SIZE;
-        if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, n);
-        }
    }

    if (!base) {
@@ -214,15 +216,22 @@ static const BlockJobDriver stream_job_driver = {
    .set_speed     = stream_set_speed,
 };

-void stream_start(const char *job_id, BlockDriverState *bs,
-                  BlockDriverState *base, const char *backing_file_str,
-                  int64_t speed, BlockdevOnError on_error,
-                  BlockCompletionFunc *cb, void *opaque, Error **errp)
+void stream_start(BlockDriverState *bs, BlockDriverState *base,
+                  const char *backing_file_str, int64_t speed,
+                  BlockdevOnError on_error,
+                  BlockCompletionFunc *cb,
+                  void *opaque, Error **errp)
 {
    StreamBlockJob *s;

-    s = block_job_create(job_id, &stream_job_driver, bs, speed,
-                         cb, opaque, errp);
+    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
+        error_setg(errp, QERR_INVALID_PARAMETER, "on-error");
+        return;
+    }
+
+    s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }
@@ -231,7 +240,7 @@ void stream_start(const char *job_id, BlockDriverState *bs,
    s->backing_file_str = g_strdup(backing_file_str);

    s->on_error = on_error;
-    s->common.co = qemu_coroutine_create(stream_run, s);
+    s->common.co = qemu_coroutine_create(stream_run);
    trace_stream_start(bs, base, s, s->common.co, opaque);
-    qemu_coroutine_enter(s->common.co);
+    qemu_coroutine_enter(s->common.co, s);
 }
--- a/block/tar.c
+++ b/block/tar.c
@@ -0,0 +1,379 @@
+/*
+ * Tar block driver
+ *
+ * Copyright (c) 2009 Alexander Graf <agraf@suse.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "block/block_int.h"
+
+// #define DEBUG
+
+#ifdef DEBUG
+#define dprintf(fmt, ...) do { printf("tar: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define dprintf(fmt, ...) do { } while (0)
+#endif
+
+#define SECTOR_SIZE      512
+
+#define POSIX_TAR_MAGIC  "ustar"
+#define OFFS_LENGTH      0x7c
+#define OFFS_TYPE        0x9c
+#define OFFS_MAGIC       0x101
+
+#define OFFS_S_SP        0x182
+#define OFFS_S_EXT       0x1e2
+#define OFFS_S_LENGTH    0x1e3
+#define OFFS_SX_EXT      0x1f8
+
+typedef struct SparseCache {    /* one gap appended by tar_sparse() */
+    uint64_t start;             /* s->last_end when the gap was recorded */
+    uint64_t end;               /* offset at which the next chunk starts */
+} SparseCache;
+
+typedef struct BDRVTarState {   /* driver state populated by tar_open() */
+    BlockDriverState *hd;       /* node opened on the archive by bdrv_open() */
+    size_t file_sec;            /* SECTOR_SIZE-unit index past the header(s) */
+    uint64_t file_len;          /* rounded-up size; OFFS_S_LENGTH if 'S' */
+    SparseCache *sparse;        /* array of gaps, grown by tar_sparse() */
+    int sparse_num;             /* number of entries in sparse[] */
+    uint64_t last_end;          /* end of the last chunk seen by tar_sparse() */
+    char longfile[2048];        /* name read from the data of an 'L' entry */
+} BDRVTarState;
+
+/* Return 1 if @str ends with the suffix @end, 0 otherwise.  Lengths are kept
+ * as size_t so that strlen() results are never narrowed to int. */
+static int str_ends(char *str, const char *end)
+{
+    size_t end_len = strlen(end);
+    size_t str_len = strlen(str);
+
+    return str_len >= end_len &&
+           !strncmp(str + str_len - end_len, end, end_len);
+}
+
+/*
+ * Decide whether the archive member named @filename is an image to expose:
+ * the name must carry one of the known image suffixes and the header type
+ * flag must be '0' or 'S'.  @filename is cleared before returning so that a
+ * stale name is not matched against a later header.  @bs is unused.
+ */
+static int is_target_file(BlockDriverState *bs, char *filename,
+                          char *header)
+{
+    static const char *const exts[] = { ".raw", ".qcow", ".qcow2", ".vmdk" };
+    char type = header[OFFS_TYPE];
+    int matched = 0;
+    size_t k;
+
+    for (k = 0; k < sizeof(exts) / sizeof(exts[0]); k++) {
+        if (str_ends(filename, exts[k])) {
+            matched = 1;
+        }
+    }
+    if (type != '0' && type != 'S') {
+        matched = 0;
+    }
+
+    dprintf("does filename %s match? %s\n", filename, matched ? "yes" : "no");
+
+    /* make sure we're not using this name again */
+    filename[0] = '\0';
+
+    return matched;
+}
+
+/* Decode a 12-byte tar number: octal text, or (bit 7 of byte 0 set) the
+ * big-endian 64-bit value in bytes 4..11.  ptr[12] is saved and restored. */
+static uint64_t tar2u64(char *ptr)
+{
+    uint64_t retval;
+    char oldend = ptr[12];
+
+    ptr[12] = '\0';
+    if (*ptr & 0x80) {
+        /* copy out instead of casting: the field may not be 8-byte aligned */
+        memcpy(&retval, ptr + 4, sizeof(retval));
+        retval = be64_to_cpu(retval);
+    } else {
+        retval = strtoull(ptr, NULL, 8);
+    }
+    dprintf("Convert -> %#llx\n", (unsigned long long)retval);
+    ptr[12] = oldend;
+    return retval;
+}
+
+/* Note one data chunk [@offs, @offs + @len) of a sparse member.  A chunk that
+ * starts at s->last_end extends it; one that starts later appends the gap
+ * [last_end, @offs) to s->sparse.  Empty or out-of-order chunks are ignored. */
+static void tar_sparse(BDRVTarState *s, uint64_t offs, uint64_t len)
+{
+    SparseCache *gap;
+
+    if (len == 0 || offs < s->last_end) {
+        return;
+    }
+    if (offs == s->last_end) {
+        s->last_end += len;
+        return;
+    }
+
+    dprintf("Last chunk until %lx new chunk at %lx\n", s->last_end, offs);
+    /* append the gap between the previous chunk and this one */
+    s->sparse = g_realloc(s->sparse, (s->sparse_num + 1) * sizeof(SparseCache));
+    gap = &s->sparse[s->sparse_num++];
+    gap->start = s->last_end;
+    gap->end = offs;
+    s->last_end = offs + len;
+    dprintf("Sparse at %lx end=%lx\n", gap->start, gap->end);
+}
+
+static QemuOptsList runtime_opts = {    /* options absorbed by tar_open() */
+    .name = "tar",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "filename",     /* "tar:"/"tar://" prefix is stripped */
+            .type = QEMU_OPT_STRING,
+            .help = "URL to the tar file",
+        },
+        { /* end of list */ }
+    },
+};
+
+/* Open the archive named by the "filename" option ("tar:"/"tar://" prefix
+ * stripped) and locate the first member accepted by is_target_file().
+ * Returns 0 on success or a negative value on failure. */
+static int tar_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
+{
+    BDRVTarState *s = bs->opaque;
+    char header[SECTOR_SIZE];
+    char *real_file = header;
+    size_t header_offs = 0;
+    int ret;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    const char *filename;
+
+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err != NULL) {
+        error_propagate(errp, local_err);
+        goto fail;
+    }
+
+    filename = qemu_opt_get(opts, "filename");
+    if (!filename) {
+        /* strncmp() below must not be handed a NULL pointer */
+        error_setg(errp, "tar: no filename was specified");
+        goto fail;
+    }
+    if (!strncmp(filename, "tar://", 6)) {
+        filename += 6;
+    } else if (!strncmp(filename, "tar:", 4)) {
+        filename += 4;
+    }
+
+    ret = bdrv_open(&s->hd, filename, NULL, NULL, flags | BDRV_O_PROTOCOL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        qemu_opts_del(opts);
+        return ret;
+    }
+
+    /* Search the file for an image */
+    do {
+        /* tar header */
+        if (bdrv_pread(s->hd, header_offs, header,
+                       SECTOR_SIZE) != SECTOR_SIZE) {
+            goto fail;
+        }
+        if ((header_offs > 1) && !header[0]) {
+            fprintf(stderr, "Tar: No image file found in archive\n");
+            goto fail;
+        }
+        if (strncmp(&header[OFFS_MAGIC], POSIX_TAR_MAGIC, 5)) {
+            /* %.5s: nothing guarantees a NUL after the magic bytes */
+            fprintf(stderr, "Tar: Invalid magic: %.5s\n", &header[OFFS_MAGIC]);
+            goto fail;
+        }
+        dprintf("file type: %c\n", header[OFFS_TYPE]);
+
+        /* file length*/
+        s->file_len = (tar2u64(&header[OFFS_LENGTH]) + (SECTOR_SIZE - 1)) &
+                      ~(SECTOR_SIZE - 1);
+        s->file_sec = (header_offs / SECTOR_SIZE) + 1;
+        header_offs += s->file_len + SECTOR_SIZE;
+        real_file = header;
+        if (header[OFFS_TYPE] == 'L') {
+            /* the data of an 'L' entry is read as a long member name */
+            if (bdrv_pread(s->hd, header_offs - s->file_len, s->longfile,
+                           sizeof(s->longfile)) < 0) {
+                goto fail;
+            }
+            s->longfile[sizeof(s->longfile)-1] = '\0';
+        } else if (s->longfile[0]) {
+            real_file = s->longfile;
+        }
+    } while(!is_target_file(bs, real_file, header));
+
+    /* We found an image! */
+    if (header[OFFS_TYPE] == 'S') {
+        uint8_t isextended = header[OFFS_S_EXT];
+        int i;
+
+        for (i = OFFS_S_SP; i < (OFFS_S_SP + (4 * 24)); i += 24) {
+            tar_sparse(s, tar2u64(&header[i]), tar2u64(&header[i+12]));
+        }
+        s->file_len = tar2u64(&header[OFFS_S_LENGTH]);
+        while (isextended) {
+            if (bdrv_pread(s->hd, s->file_sec * SECTOR_SIZE, header,
+                           SECTOR_SIZE) != SECTOR_SIZE) {
+                goto fail;
+            }
+            for (i = 0; i < (21 * 24); i += 24) {
+                tar_sparse(s, tar2u64(&header[i]), tar2u64(&header[i+12]));
+            }
+            isextended = header[OFFS_SX_EXT];
+            s->file_sec++;
+        }
+        tar_sparse(s, s->file_len, 1);
+    }
+    qemu_opts_del(opts);
+    return 0;
+
+fail:
+    fprintf(stderr, "Tar: Error opening file\n");
+    bdrv_unref(s->hd);
+    qemu_opts_del(opts);
+    return -EINVAL;
+}
+
+/* Request state for reads that tar_aio_readv() completes from a bottom half */
+typedef struct TarAIOCB {
+    BlockAIOCB common;
+    QEMUBH *bh;     /* bottom half that runs tar_sparse_cb() for this request */
+} TarAIOCB;
+
+/*
+ * Bottom-half handler for requests that tar_aio_readv() has already
+ * satisfied itself: report completion to the caller, then dispose of the
+ * bottom half and the request.
+ */
+static void tar_sparse_cb(void *opaque)
+{
+    TarAIOCB *req = opaque;
+
+    /* completion is always reported with status 0 */
+    req->common.cb(req->common.opaque, 0);
+
+    qemu_bh_delete(req->bh);
+    qemu_aio_unref(req);
+}
+
+/* AIOCB descriptor passed to qemu_aio_get(); it only sets the object size */
+static AIOCBInfo tar_aiocb_info = {
+    .aiocb_size         = sizeof(TarAIOCB),
+};
+
+/*
+ * Read nb_sectors sectors starting at sector_num of the image into qiov.
+ *
+ * The sparse table is scanned in order and each entry falls into one of
+ * these cases:
+ *  - it lies entirely before the request: the archive sector to read from
+ *    is moved back by the size of that hole;
+ *  - it covers the whole request: qiov is filled with zeroes;
+ *  - it overlaps the request partially: the request is served one sector
+ *    at a time through synchronous reads on bs itself.
+ * In the last two cases completion is reported from a bottom half
+ * (tar_sparse_cb). If no entry overlaps the request it is forwarded to
+ * the underlying archive at the adjusted sector.
+ *
+ * Returns the AIOCB that tracks the request.
+ */
+static BlockAIOCB *tar_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockCompletionFunc *cb, void *opaque)
+{
+    BDRVTarState *s = bs->opaque;
+    SparseCache *sparse;
+    int64_t sec_file = sector_num + s->file_sec;
+    int64_t start = sector_num * SECTOR_SIZE;
+    int64_t end = start + (nb_sectors * SECTOR_SIZE);
+    int i;
+    TarAIOCB *acb;
+
+    for (i = 0; i < s->sparse_num; i++) {
+        sparse = &s->sparse[i];
+        if (sparse->start > end) {
+            /* We expect the cache to be start increasing */
+            break;
+        } else if ((sparse->start < start) && (sparse->end <= start)) {
+            /* sparse before our offset */
+            sec_file -= (sparse->end - sparse->start) / SECTOR_SIZE;
+        } else if ((sparse->start <= start) && (sparse->end >= end)) {
+            /* all our sectors are sparse */
+            char *buf = g_malloc0(nb_sectors * SECTOR_SIZE);
+
+            acb = qemu_aio_get(&tar_aiocb_info, bs, cb, opaque);
+            qemu_iovec_from_buf(qiov, 0, buf, nb_sectors * SECTOR_SIZE);
+            g_free(buf);
+            acb->bh = qemu_bh_new(tar_sparse_cb, acb);
+            qemu_bh_schedule(acb->bh);
+
+            return &acb->common;
+        } else if (((sparse->start >= start) && (sparse->start < end)) ||
+                   ((sparse->end >= start) && (sparse->end < end))) {
+            /* we're semi-sparse (worst case) */
+            /* let's go synchronous and read all sectors individually */
+            char *buf = g_malloc(nb_sectors * SECTOR_SIZE);
+            uint64_t offs;
+
+            /* NOTE(review): the results of these reads are not checked */
+            for (offs = 0; offs < (nb_sectors * SECTOR_SIZE);
+                 offs += SECTOR_SIZE) {
+                bdrv_pread(bs, (sector_num * SECTOR_SIZE) + offs,
+                           buf + offs, SECTOR_SIZE);
+            }
+
+            qemu_iovec_from_buf(qiov, 0, buf, nb_sectors * SECTOR_SIZE);
+            /* the data now lives in qiov; the bounce buffer used to leak */
+            g_free(buf);
+            acb = qemu_aio_get(&tar_aiocb_info, bs, cb, opaque);
+            acb->bh = qemu_bh_new(tar_sparse_cb, acb);
+            qemu_bh_schedule(acb->bh);
+
+            return &acb->common;
+        }
+    }
+
+    return bdrv_aio_readv(s->hd, sec_file, qiov, nb_sectors,
+                          cb, opaque);
+}
+
+/*
+ * Release what the driver acquired while open: the sparse table grown by
+ * tar_sparse() and the reference on the underlying archive taken in
+ * tar_open(). Previously neither was released.
+ */
+static void tar_close(BlockDriverState *bs)
+{
+    BDRVTarState *s = bs->opaque;
+
+    dprintf("Close\n");
+
+    g_free(s->sparse);
+    s->sparse = NULL;
+    s->sparse_num = 0;
+
+    bdrv_unref(s->hd);
+    s->hd = NULL;
+}
+
+/* Report the image length that tar_open() stored in the driver state. */
+static int64_t tar_getlength(BlockDriverState *bs)
+{
+    BDRVTarState *state = bs->opaque;
+
+    dprintf("getlength -> %ld\n", state->file_len);
+
+    return state->file_len;
+}
+
+/*
+ * Driver description registered by tar_block_init(). It uses "tar" as both
+ * format and protocol name and provides open, close, getlength and an AIO
+ * read callback; no write callback is set here.
+ */
+static BlockDriver bdrv_tar = {
+    .format_name     = "tar",
+    .protocol_name   = "tar",
+
+    .instance_size   = sizeof(BDRVTarState),
+    .bdrv_file_open  = tar_open,
+    .bdrv_close      = tar_close,
+    .bdrv_getlength  = tar_getlength,
+
+    .bdrv_aio_readv  = tar_aio_readv,
+};
+
+/* Register the tar driver with the block layer. */
+static void tar_block_init(void)
+{
+    bdrv_register(&bdrv_tar);
+}
+
+block_init(tar_block_init);
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -23,14 +23,13 @@
 */

 #include "qemu/osdep.h"
-#include "sysemu/block-backend.h"
 #include "block/throttle-groups.h"
 #include "qemu/queue.h"
 #include "qemu/thread.h"
 #include "sysemu/qtest.h"

 /* The ThrottleGroup structure (with its ThrottleState) is shared
- * among different BlockBackends and it's independent from
+ * among different BlockDriverState and it's independent from
 * AioContext, so in order to use it from different threads it needs
 * its own locking.
 *
@@ -40,26 +39,26 @@
 * The whole ThrottleGroup structure is private and invisible to
 * outside users, that only use it through its ThrottleState.
 *
- * In addition to the ThrottleGroup structure, BlockBackendPublic has
+ * In addition to the ThrottleGroup structure, BlockDriverState has
 * fields that need to be accessed by other members of the group and
- * therefore also need to be protected by this lock. Once a
- * BlockBackend is registered in a group those fields can be accessed
- * by other threads any time.
+ * therefore also need to be protected by this lock. Once a BDS is
+ * registered in a group those fields can be accessed by other threads
+ * any time.
 *
 * Again, all this is handled internally and is mostly transparent to
 * the outside. The 'throttle_timers' field however has an additional
 * constraint because it may be temporarily invalid (see for example
 * bdrv_set_aio_context()). Therefore in this file a thread will
- * access some other BlockBackend's timers only after verifying that
- * that BlockBackend has throttled requests in the queue.
+ * access some other BDS's timers only after verifying that that BDS
+ * has throttled requests in the queue.
 */
 typedef struct ThrottleGroup {
    char *name; /* This is constant during the lifetime of the group */

    QemuMutex lock; /* This lock protects the following four fields */
    ThrottleState ts;
-    QLIST_HEAD(, BlockBackendPublic) head;
-    BlockBackend *tokens[2];
+    QLIST_HEAD(, BlockDriverState) head;
+    BlockDriverState *tokens[2];
    bool any_timer_armed[2];

    /* These two are protected by the global throttle_groups_lock */
@@ -133,98 +132,93 @@ void throttle_group_unref(ThrottleState *ts)
    qemu_mutex_unlock(&throttle_groups_lock);
 }

-/* Get the name from a BlockBackend's ThrottleGroup. The name (and the pointer)
- * is guaranteed to remain constant during the lifetime of the group.
+/* Get the name from a BlockDriverState's ThrottleGroup. The name (and
+ * the pointer) is guaranteed to remain constant during the lifetime
+ * of the group.
 *
- * @blk:  a BlockBackend that is member of a throttling group
+ * @bs:   a BlockDriverState that is a member of a throttling group
 * @ret:  the name of the group.
 */
-const char *throttle_group_get_name(BlockBackend *blk)
+const char *throttle_group_get_name(BlockDriverState *bs)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
    return tg->name;
 }

-/* Return the next BlockBackend in the round-robin sequence, simulating a
- * circular list.
+/* Return the next BlockDriverState in the round-robin sequence,
+ * simulating a circular list.
 *
 * This assumes that tg->lock is held.
 *
- * @blk: the current BlockBackend
- * @ret: the next BlockBackend in the sequence
+ * @bs:  the current BlockDriverState
+ * @ret: the next BlockDriverState in the sequence
 */
-static BlockBackend *throttle_group_next_blk(BlockBackend *blk)
+static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleState *ts = blkp->throttle_state;
+    ThrottleState *ts = bs->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
-    BlockBackendPublic *next = QLIST_NEXT(blkp, round_robin);
+    BlockDriverState *next = QLIST_NEXT(bs, round_robin);

    if (!next) {
-        next = QLIST_FIRST(&tg->head);
+        return QLIST_FIRST(&tg->head);
    }

-    return blk_by_public(next);
+    return next;
 }

-/* Return the next BlockBackend in the round-robin sequence with pending I/O
- * requests.
+/* Return the next BlockDriverState in the round-robin sequence with
+ * pending I/O requests.
 *
 * This assumes that tg->lock is held.
 *
- * @blk:       the current BlockBackend
+ * @bs:        the current BlockDriverState
 * @is_write:  the type of operation (read/write)
- * @ret:       the next BlockBackend with pending requests, or blk if there is
- *             none.
+ * @ret:       the next BlockDriverState with pending requests, or bs
+ *             if there is none.
 */
-static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)
+static BlockDriverState *next_throttle_token(BlockDriverState *bs,
+                                             bool is_write)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
-    BlockBackend *token, *start;
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockDriverState *token, *start;

    start = token = tg->tokens[is_write];

    /* get next bs round in round robin style */
-    token = throttle_group_next_blk(token);
-    while (token != start && !blkp->pending_reqs[is_write]) {
-        token = throttle_group_next_blk(token);
+    token = throttle_group_next_bs(token);
+    while (token != start && !token->pending_reqs[is_write]) {
+        token = throttle_group_next_bs(token);
    }

    /* If no IO are queued for scheduling on the next round robin token
     * then decide the token is the current bs because chances are
     * the current bs get the current request queued.
     */
-    if (token == start && !blkp->pending_reqs[is_write]) {
-        token = blk;
+    if (token == start && !token->pending_reqs[is_write]) {
+        token = bs;
    }

    return token;
 }

-/* Check if the next I/O request for a BlockBackend needs to be throttled or
- * not. If there's no timer set in this group, set one and update the token
- * accordingly.
+/* Check if the next I/O request for a BlockDriverState needs to be
+ * throttled or not. If there's no timer set in this group, set one
+ * and update the token accordingly.
 *
 * This assumes that tg->lock is held.
 *
- * @blk:        the current BlockBackend
+ * @bs:         the current BlockDriverState
 * @is_write:   the type of operation (read/write)
 * @ret:        whether the I/O request needs to be throttled or not
 */
-static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write)
+static bool throttle_group_schedule_timer(BlockDriverState *bs,
+                                          bool is_write)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleState *ts = blkp->throttle_state;
-    ThrottleTimers *tt = &blkp->throttle_timers;
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleTimers *tt = &bs->throttle_timers;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    bool must_wait;

-    if (blkp->io_limits_disabled) {
-        return false;
-    }
-
    /* Check if any of the timers in this group is already armed */
    if (tg->any_timer_armed[is_write]) {
        return true;
@@ -232,9 +226,9 @@ static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write)

    must_wait = throttle_schedule_timer(ts, tt, is_write);

-    /* If a timer just got armed, set blk as the current token */
+    /* If a timer just got armed, set bs as the current token */
    if (must_wait) {
-        tg->tokens[is_write] = blk;
+        tg->tokens[is_write] = bs;
        tg->any_timer_armed[is_write] = true;
    }

@@ -245,19 +239,18 @@ static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write)
 *
 * This assumes that tg->lock is held.
 *
- * @blk:       the current BlockBackend
+ * @bs:        the current BlockDriverState
 * @is_write:  the type of operation (read/write)
 */
-static void schedule_next_request(BlockBackend *blk, bool is_write)
+static void schedule_next_request(BlockDriverState *bs, bool is_write)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
    bool must_wait;
-    BlockBackend *token;
+    BlockDriverState *token;

    /* Check if there's any pending request to schedule next */
-    token = next_throttle_token(blk, is_write);
-    if (!blkp->pending_reqs[is_write]) {
+    token = next_throttle_token(bs, is_write);
+    if (!token->pending_reqs[is_write]) {
        return;
    }

@@ -266,12 +259,12 @@ static void schedule_next_request(BlockBackend *blk, bool is_write)

    /* If it doesn't have to wait, queue it for immediate execution */
    if (!must_wait) {
-        /* Give preference to requests from the current blk */
+        /* Give preference to requests from the current bs */
        if (qemu_in_coroutine() &&
-            qemu_co_queue_next(&blkp->throttled_reqs[is_write])) {
-            token = blk;
+            qemu_co_queue_next(&bs->throttled_reqs[is_write])) {
+            token = bs;
        } else {
-            ThrottleTimers *tt = &blkp->throttle_timers;
+            ThrottleTimers *tt = &token->throttle_timers;
            int64_t now = qemu_clock_get_ns(tt->clock_type);
            timer_mod(tt->timers[is_write], now + 1);
            tg->any_timer_armed[is_write] = true;
@@ -284,67 +277,53 @@ static void schedule_next_request(BlockBackend *blk, bool is_write)
 * if necessary, and schedule the next request using a round robin
 * algorithm.
 *
- * @blk:       the current BlockBackend
+ * @bs:        the current BlockDriverState
 * @bytes:     the number of bytes for this I/O
 * @is_write:  the type of operation (read/write)
 */
-void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
+void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs,
                                                        unsigned int bytes,
                                                        bool is_write)
 {
    bool must_wait;
-    BlockBackend *token;
+    BlockDriverState *token;

-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
    qemu_mutex_lock(&tg->lock);

    /* First we check if this I/O has to be throttled. */
-    token = next_throttle_token(blk, is_write);
+    token = next_throttle_token(bs, is_write);
    must_wait = throttle_group_schedule_timer(token, is_write);

    /* Wait if there's a timer set or queued requests of this type */
-    if (must_wait || blkp->pending_reqs[is_write]) {
-        blkp->pending_reqs[is_write]++;
+    if (must_wait || bs->pending_reqs[is_write]) {
+        bs->pending_reqs[is_write]++;
        qemu_mutex_unlock(&tg->lock);
-        qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
+        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
        qemu_mutex_lock(&tg->lock);
-        blkp->pending_reqs[is_write]--;
+        bs->pending_reqs[is_write]--;
    }

    /* The I/O will be executed, so do the accounting */
-    throttle_account(blkp->throttle_state, is_write, bytes);
+    throttle_account(bs->throttle_state, is_write, bytes);

    /* Schedule the next request */
-    schedule_next_request(blk, is_write);
+    schedule_next_request(bs, is_write);

    qemu_mutex_unlock(&tg->lock);
 }

-void throttle_group_restart_blk(BlockBackend *blk)
-{
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    int i;
-
-    for (i = 0; i < 2; i++) {
-        while (qemu_co_enter_next(&blkp->throttled_reqs[i])) {
-            ;
-        }
-    }
-}
-
 /* Update the throttle configuration for a particular group. Similar
 * to throttle_config(), but guarantees atomicity within the
 * throttling group.
 *
- * @blk: a BlockBackend that is a member of the group
+ * @bs:  a BlockDriverState that is a member of the group
 * @cfg: the configuration to set
 */
-void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg)
+void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleTimers *tt = &blkp->throttle_timers;
-    ThrottleState *ts = blkp->throttle_state;
+    ThrottleTimers *tt = &bs->throttle_timers;
+    ThrottleState *ts = bs->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    qemu_mutex_lock(&tg->lock);
    /* throttle_config() cancels the timers */
@@ -356,22 +335,18 @@ void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg)
    }
    throttle_config(ts, tt, cfg);
    qemu_mutex_unlock(&tg->lock);
-
-    qemu_co_enter_next(&blkp->throttled_reqs[0]);
-    qemu_co_enter_next(&blkp->throttled_reqs[1]);
 }

 /* Get the throttle configuration from a particular group. Similar to
 * throttle_get_config(), but guarantees atomicity within the
 * throttling group.
 *
- * @blk: a BlockBackend that is a member of the group
+ * @bs:  a BlockDriverState that is a member of the group
 * @cfg: the configuration will be written here
 */
-void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg)
+void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleState *ts = blkp->throttle_state;
+    ThrottleState *ts = bs->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    qemu_mutex_lock(&tg->lock);
    throttle_get_config(ts, cfg);
@@ -381,13 +356,12 @@ void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg)
 /* ThrottleTimers callback. This wakes up a request that was waiting
 * because it had been throttled.
 *
- * @blk:       the BlockBackend whose request had been throttled
+ * @bs:        the BlockDriverState whose request had been throttled
 * @is_write:  the type of operation (read/write)
 */
-static void timer_cb(BlockBackend *blk, bool is_write)
+static void timer_cb(BlockDriverState *bs, bool is_write)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleState *ts = blkp->throttle_state;
+    ThrottleState *ts = bs->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    bool empty_queue;

@@ -397,13 +371,13 @@ static void timer_cb(BlockBackend *blk, bool is_write)
    qemu_mutex_unlock(&tg->lock);

    /* Run the request that was waiting for this timer */
-    empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);
+    empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]);

    /* If the request queue was empty then we have to take care of
     * scheduling the next one */
    if (empty_queue) {
        qemu_mutex_lock(&tg->lock);
-        schedule_next_request(blk, is_write);
+        schedule_next_request(bs, is_write);
        qemu_mutex_unlock(&tg->lock);
    }
 }
@@ -418,17 +392,17 @@ static void write_timer_cb(void *opaque)
    timer_cb(opaque, true);
 }

-/* Register a BlockBackend in the throttling group, also initializing its
- * timers and updating its throttle_state pointer to point to it. If a
- * throttling group with that name does not exist yet, it will be created.
+/* Register a BlockDriverState in the throttling group, also
+ * initializing its timers and updating its throttle_state pointer to
+ * point to it. If a throttling group with that name does not exist
+ * yet, it will be created.
 *
- * @blk:       the BlockBackend to insert
+ * @bs:        the BlockDriverState to insert
 * @groupname: the name of the group
 */
-void throttle_group_register_blk(BlockBackend *blk, const char *groupname)
+void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
 {
    int i;
-    BlockBackendPublic *blkp = blk_get_public(blk);
    ThrottleState *ts = throttle_group_incref(groupname);
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    int clock_type = QEMU_CLOCK_REALTIME;
@@ -438,67 +412,67 @@ void throttle_group_register_blk(BlockBackend *blk, const char *groupname)
        clock_type = QEMU_CLOCK_VIRTUAL;
    }

-    blkp->throttle_state = ts;
+    bs->throttle_state = ts;

    qemu_mutex_lock(&tg->lock);
-    /* If the ThrottleGroup is new set this BlockBackend as the token */
+    /* If the ThrottleGroup is new set this BlockDriverState as the token */
    for (i = 0; i < 2; i++) {
        if (!tg->tokens[i]) {
-            tg->tokens[i] = blk;
+            tg->tokens[i] = bs;
        }
    }

-    QLIST_INSERT_HEAD(&tg->head, blkp, round_robin);
+    QLIST_INSERT_HEAD(&tg->head, bs, round_robin);

-    throttle_timers_init(&blkp->throttle_timers,
-                         blk_get_aio_context(blk),
+    throttle_timers_init(&bs->throttle_timers,
+                         bdrv_get_aio_context(bs),
                         clock_type,
                         read_timer_cb,
                         write_timer_cb,
-                         blk);
+                         bs);

    qemu_mutex_unlock(&tg->lock);
 }

-/* Unregister a BlockBackend from its group, removing it from the list,
- * destroying the timers and setting the throttle_state pointer to NULL.
+/* Unregister a BlockDriverState from its group, removing it from the
+ * list, destroying the timers and setting the throttle_state pointer
+ * to NULL.
 *
- * The BlockBackend must not have pending throttled requests, so the caller has
- * to drain them first.
+ * The BlockDriverState must not have pending throttled requests, so
+ * the caller has to drain them first.
 *
 * The group will be destroyed if it's empty after this operation.
 *
- * @blk: the BlockBackend to remove
+ * @bs: the BlockDriverState to remove
 */
-void throttle_group_unregister_blk(BlockBackend *blk)
+void throttle_group_unregister_bs(BlockDriverState *bs)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
    int i;

-    assert(blkp->pending_reqs[0] == 0 && blkp->pending_reqs[1] == 0);
-    assert(qemu_co_queue_empty(&blkp->throttled_reqs[0]));
-    assert(qemu_co_queue_empty(&blkp->throttled_reqs[1]));
+    assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0);
+    assert(qemu_co_queue_empty(&bs->throttled_reqs[0]));
+    assert(qemu_co_queue_empty(&bs->throttled_reqs[1]));

    qemu_mutex_lock(&tg->lock);
    for (i = 0; i < 2; i++) {
-        if (tg->tokens[i] == blk) {
-            BlockBackend *token = throttle_group_next_blk(blk);
-            /* Take care of the case where this is the last blk in the group */
-            if (token == blk) {
+        if (tg->tokens[i] == bs) {
+            BlockDriverState *token = throttle_group_next_bs(bs);
+            /* Take care of the case where this is the last bs in the group */
+            if (token == bs) {
                token = NULL;
            }
            tg->tokens[i] = token;
        }
    }

-    /* remove the current blk from the list */
-    QLIST_REMOVE(blkp, round_robin);
-    throttle_timers_destroy(&blkp->throttle_timers);
+    /* remove the current bs from the list */
+    QLIST_REMOVE(bs, round_robin);
+    throttle_timers_destroy(&bs->throttle_timers);
    qemu_mutex_unlock(&tg->lock);

    throttle_group_unref(&tg->ts);
-    blkp->throttle_state = NULL;
+    bs->throttle_state = NULL;
 }

 static void throttle_groups_init(void)
--- a/block/trace-events
+++ b/block/trace-events
@@ -1,116 +0,0 @@
-# See docs/tracing.txt for syntax documentation.
-
-# block.c
-bdrv_open_common(void *bs, const char *filename, int flags, const char *format_name) "bs %p filename \"%s\" flags %#x format_name \"%s\""
-bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
-
-# block/block-backend.c
-blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
-blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
-
-# block/io.c
-bdrv_aio_pdiscard(void *bs, int64_t offset, int count, void *opaque) "bs %p offset %"PRId64" count %d opaque %p"
-bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
-bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
-bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
-bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
-bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
-bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x"
-bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %u"
-
-# block/stream.c
-stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
-stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p"
-
-# block/commit.c
-commit_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
-commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) "bs %p base %p top %p s %p co %p opaque %p"
-
-# block/mirror.c
-mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
-mirror_restart_iter(void *s, int64_t cnt) "s %p dirty count %"PRId64
-mirror_before_flush(void *s) "s %p"
-mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
-mirror_before_sleep(void *s, int64_t cnt, int synced, uint64_t delay_ns) "s %p dirty count %"PRId64" synced %d delay %"PRIu64"ns"
-mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
-mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors, int ret) "s %p sector_num %"PRId64" nb_sectors %d ret %d"
-mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d"
-mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d"
-mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
-mirror_break_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
-
-# block/backup.c
-backup_do_cow_enter(void *job, int64_t start, int64_t sector_num, int nb_sectors) "job %p start %"PRId64" sector_num %"PRId64" nb_sectors %d"
-backup_do_cow_return(void *job, int64_t sector_num, int nb_sectors, int ret) "job %p sector_num %"PRId64" nb_sectors %d ret %d"
-backup_do_cow_skip(void *job, int64_t start) "job %p start %"PRId64
-backup_do_cow_process(void *job, int64_t start) "job %p start %"PRId64
-backup_do_cow_read_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d"
-backup_do_cow_write_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d"
-
-# blockdev.c
-qmp_block_job_cancel(void *job) "job %p"
-qmp_block_job_pause(void *job) "job %p"
-qmp_block_job_resume(void *job) "job %p"
-qmp_block_job_complete(void *job) "job %p"
-block_job_cb(void *bs, void *job, int ret) "bs %p job %p ret %d"
-qmp_block_stream(void *bs, void *job) "bs %p job %p"
-
-# block/raw-win32.c
-# block/raw-posix.c
-paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d"
-paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
-
-# block/qcow2.c
-qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
-qcow2_writev_done_req(void *co, int ret) "co %p ret %d"
-qcow2_writev_start_part(void *co) "co %p"
-qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d"
-qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64
-qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
-qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
-
-# block/qcow2-cluster.c
-qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
-qcow2_handle_copied(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
-qcow2_handle_alloc(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
-qcow2_do_alloc_clusters_offset(void *co, uint64_t guest_offset, uint64_t host_offset, int nb_clusters) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " nb_clusters %d"
-qcow2_cluster_alloc_phys(void *co) "co %p"
-qcow2_cluster_link_l2(void *co, int nb_clusters) "co %p nb_clusters %d"
-
-qcow2_l2_allocate(void *bs, int l1_index) "bs %p l1_index %d"
-qcow2_l2_allocate_get_empty(void *bs, int l1_index) "bs %p l1_index %d"
-qcow2_l2_allocate_write_l2(void *bs, int l1_index) "bs %p l1_index %d"
-qcow2_l2_allocate_write_l1(void *bs, int l1_index) "bs %p l1_index %d"
-qcow2_l2_allocate_done(void *bs, int l1_index, int ret) "bs %p l1_index %d ret %d"
-
-# block/qcow2-cache.c
-qcow2_cache_get(void *co, int c, uint64_t offset, bool read_from_disk) "co %p is_l2_cache %d offset %" PRIx64 " read_from_disk %d"
-qcow2_cache_get_replace_entry(void *co, int c, int i) "co %p is_l2_cache %d index %d"
-qcow2_cache_get_read(void *co, int c, int i) "co %p is_l2_cache %d index %d"
-qcow2_cache_get_done(void *co, int c, int i) "co %p is_l2_cache %d index %d"
-qcow2_cache_flush(void *co, int c) "co %p is_l2_cache %d"
-qcow2_cache_entry_flush(void *co, int c, int i) "co %p is_l2_cache %d index %d"
-
-# block/qed-l2-cache.c
-qed_alloc_l2_cache_entry(void *l2_cache, void *entry) "l2_cache %p entry %p"
-qed_unref_l2_cache_entry(void *entry, int ref) "entry %p ref %d"
-qed_find_l2_cache_entry(void *l2_cache, void *entry, uint64_t offset, int ref) "l2_cache %p entry %p offset %"PRIu64" ref %d"
-
-# block/qed-table.c
-qed_read_table(void *s, uint64_t offset, void *table) "s %p offset %"PRIu64" table %p"
-qed_read_table_cb(void *s, void *table, int ret) "s %p table %p ret %d"
-qed_write_table(void *s, uint64_t offset, void *table, unsigned int index, unsigned int n) "s %p offset %"PRIu64" table %p index %u n %u"
-qed_write_table_cb(void *s, void *table, int flush, int ret) "s %p table %p flush %d ret %d"
-
-# block/qed.c
-qed_need_check_timer_cb(void *s) "s %p"
-qed_start_need_check_timer(void *s) "s %p"
-qed_cancel_need_check_timer(void *s) "s %p"
-qed_aio_complete(void *s, void *acb, int ret) "s %p acb %p ret %d"
-qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void *opaque, int flags) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p flags %#x"
-qed_aio_next_io(void *s, void *acb, int ret, uint64_t cur_pos) "s %p acb %p ret %d cur_pos %"PRIu64
-qed_aio_read_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
-qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
-qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
-qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
-qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -54,12 +54,19 @@
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
-#include "qemu/bswap.h"
 #include "migration/migration.h"
 #include "qemu/coroutine.h"
 #include "qemu/cutils.h"
 #include "qemu/uuid.h"

+#if defined(CONFIG_UUID)
+#include <uuid/uuid.h>
+#else
+/* TODO: move uuid emulation to some central place in QEMU. */
+#include "sysemu/sysemu.h"     /* UUID_FMT */
+typedef unsigned char uuid_t[16];
+#endif
+
 /* Code configuration options. */

 /* Enable debug messages. */
@@ -133,6 +140,28 @@
 #define VDI_DISK_SIZE_MAX        ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \
                                  (uint64_t)DEFAULT_CLUSTER_SIZE)

+#if !defined(CONFIG_UUID)
+static inline void uuid_generate(uuid_t out)
+{
+    memset(out, 0, sizeof(uuid_t));
+}
+
+static inline int uuid_is_null(const uuid_t uu)
+{
+    uuid_t null_uuid = { 0 };
+    return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0;
+}
+
+# if defined(CONFIG_VDI_DEBUG)
+static inline void uuid_unparse(const uuid_t uu, char *out)
+{
+    snprintf(out, 37, UUID_FMT,
+            uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
+            uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]);
+}
+# endif
+#endif
+
 typedef struct {
    char text[0x40];
    uint32_t signature;
@@ -153,10 +182,10 @@ typedef struct {
    uint32_t block_extra;       /* unused here */
    uint32_t blocks_in_image;
    uint32_t blocks_allocated;
-    QemuUUID uuid_image;
-    QemuUUID uuid_last_snap;
-    QemuUUID uuid_link;
-    QemuUUID uuid_parent;
+    uuid_t uuid_image;
+    uuid_t uuid_last_snap;
+    uuid_t uuid_link;
+    uuid_t uuid_parent;
    uint64_t unused2[7];
 } QEMU_PACKED VdiHeader;

@@ -177,6 +206,16 @@ typedef struct {
    Error *migration_blocker;
 } BDRVVdiState;

+/* Change UUID from little endian (IPRT = VirtualBox format) to big endian
+ * format (network byte order, standard, see RFC 4122) and vice versa.
+ */
+static void uuid_convert(uuid_t uuid)
+{
+    bswap32s((uint32_t *)&uuid[0]);
+    bswap16s((uint16_t *)&uuid[4]);
+    bswap16s((uint16_t *)&uuid[6]);
+}
+
 static void vdi_header_to_cpu(VdiHeader *header)
 {
    le32_to_cpus(&header->signature);
@@ -195,10 +234,10 @@ static void vdi_header_to_cpu(VdiHeader *header)
    le32_to_cpus(&header->block_extra);
    le32_to_cpus(&header->blocks_in_image);
    le32_to_cpus(&header->blocks_allocated);
-    qemu_uuid_bswap(&header->uuid_image);
-    qemu_uuid_bswap(&header->uuid_last_snap);
-    qemu_uuid_bswap(&header->uuid_link);
-    qemu_uuid_bswap(&header->uuid_parent);
+    uuid_convert(header->uuid_image);
+    uuid_convert(header->uuid_last_snap);
+    uuid_convert(header->uuid_link);
+    uuid_convert(header->uuid_parent);
 }

 static void vdi_header_to_le(VdiHeader *header)
@@ -219,10 +258,10 @@ static void vdi_header_to_le(VdiHeader *header)
    cpu_to_le32s(&header->block_extra);
    cpu_to_le32s(&header->blocks_in_image);
    cpu_to_le32s(&header->blocks_allocated);
-    qemu_uuid_bswap(&header->uuid_image);
-    qemu_uuid_bswap(&header->uuid_last_snap);
-    qemu_uuid_bswap(&header->uuid_link);
-    qemu_uuid_bswap(&header->uuid_parent);
+    uuid_convert(header->uuid_image);
+    uuid_convert(header->uuid_last_snap);
+    uuid_convert(header->uuid_link);
+    uuid_convert(header->uuid_parent);
 }

 #if defined(CONFIG_VDI_DEBUG)
@@ -364,7 +403,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,

    logout("\n");

-    ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1);
+    ret = bdrv_read(bs->file->bs, 0, (uint8_t *)&header, 1);
    if (ret < 0) {
        goto fail;
    }
@@ -430,11 +469,11 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
                   (uint64_t)header.blocks_in_image * header.block_size);
        ret = -ENOTSUP;
        goto fail;
-    } else if (!qemu_uuid_is_null(&header.uuid_link)) {
+    } else if (!uuid_is_null(header.uuid_link)) {
        error_setg(errp, "unsupported VDI image (non-NULL link UUID)");
        ret = -ENOTSUP;
        goto fail;
-    } else if (!qemu_uuid_is_null(&header.uuid_parent)) {
+    } else if (!uuid_is_null(header.uuid_parent)) {
        error_setg(errp, "unsupported VDI image (non-NULL parent UUID)");
        ret = -ENOTSUP;
        goto fail;
@@ -461,7 +500,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap,
+    ret = bdrv_read(bs->file->bs, s->bmap_sector, (uint8_t *)s->bmap,
                    bmap_size);
    if (ret < 0) {
        goto fail_free_bmap;
@@ -519,109 +558,98 @@ static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs,
    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 }

-static int coroutine_fn
-vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-              QEMUIOVector *qiov, int flags)
+static int vdi_co_read(BlockDriverState *bs,
+        int64_t sector_num, uint8_t *buf, int nb_sectors)
 {
    BDRVVdiState *s = bs->opaque;
-    QEMUIOVector local_qiov;
    uint32_t bmap_entry;
    uint32_t block_index;
-    uint32_t offset_in_block;
-    uint32_t n_bytes;
-    uint64_t bytes_done = 0;
+    uint32_t sector_in_block;
+    uint32_t n_sectors;
    int ret = 0;

    logout("\n");

-    qemu_iovec_init(&local_qiov, qiov->niov);
+    while (ret >= 0 && nb_sectors > 0) {
+        block_index = sector_num / s->block_sectors;
+        sector_in_block = sector_num % s->block_sectors;
+        n_sectors = s->block_sectors - sector_in_block;
+        if (n_sectors > nb_sectors) {
+            n_sectors = nb_sectors;
+        }

-    while (ret >= 0 && bytes > 0) {
-        block_index = offset / s->block_size;
-        offset_in_block = offset % s->block_size;
-        n_bytes = MIN(bytes, s->block_size - offset_in_block);
-
-        logout("will read %u bytes starting at offset %" PRIu64 "\n",
-               n_bytes, offset);
+        logout("will read %u sectors starting at sector %" PRIu64 "\n",
+               n_sectors, sector_num);

        /* prepare next AIO request */
        bmap_entry = le32_to_cpu(s->bmap[block_index]);
        if (!VDI_IS_ALLOCATED(bmap_entry)) {
            /* Block not allocated, return zeros, no need to wait. */
-            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
+            memset(buf, 0, n_sectors * SECTOR_SIZE);
            ret = 0;
        } else {
-            uint64_t data_offset = s->header.offset_data +
-                                   (uint64_t)bmap_entry * s->block_size +
-                                   offset_in_block;
-
-            qemu_iovec_reset(&local_qiov);
-            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
-
-            ret = bdrv_co_preadv(bs->file, data_offset, n_bytes,
-                                 &local_qiov, 0);
+            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
+                              (uint64_t)bmap_entry * s->block_sectors +
+                              sector_in_block;
+            ret = bdrv_read(bs->file->bs, offset, buf, n_sectors);
        }
-        logout("%u bytes read\n", n_bytes);
+        logout("%u sectors read\n", n_sectors);

-        bytes -= n_bytes;
-        offset += n_bytes;
-        bytes_done += n_bytes;
+        nb_sectors -= n_sectors;
+        sector_num += n_sectors;
+        buf += n_sectors * SECTOR_SIZE;
    }

-    qemu_iovec_destroy(&local_qiov);
-
    return ret;
 }

-static int coroutine_fn
-vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-               QEMUIOVector *qiov, int flags)
+static int vdi_co_write(BlockDriverState *bs,
+        int64_t sector_num, const uint8_t *buf, int nb_sectors)
 {
    BDRVVdiState *s = bs->opaque;
-    QEMUIOVector local_qiov;
    uint32_t bmap_entry;
    uint32_t block_index;
-    uint32_t offset_in_block;
-    uint32_t n_bytes;
+    uint32_t sector_in_block;
+    uint32_t n_sectors;
    uint32_t bmap_first = VDI_UNALLOCATED;
    uint32_t bmap_last = VDI_UNALLOCATED;
    uint8_t *block = NULL;
-    uint64_t bytes_done = 0;
    int ret = 0;

    logout("\n");

-    qemu_iovec_init(&local_qiov, qiov->niov);
+    while (ret >= 0 && nb_sectors > 0) {
+        block_index = sector_num / s->block_sectors;
+        sector_in_block = sector_num % s->block_sectors;
+        n_sectors = s->block_sectors - sector_in_block;
+        if (n_sectors > nb_sectors) {
+            n_sectors = nb_sectors;
+        }

-    while (ret >= 0 && bytes > 0) {
-        block_index = offset / s->block_size;
-        offset_in_block = offset % s->block_size;
-        n_bytes = MIN(bytes, s->block_size - offset_in_block);
-
-        logout("will write %u bytes starting at offset %" PRIu64 "\n",
-               n_bytes, offset);
+        logout("will write %u sectors starting at sector %" PRIu64 "\n",
+               n_sectors, sector_num);

        /* prepare next AIO request */
        bmap_entry = le32_to_cpu(s->bmap[block_index]);
        if (!VDI_IS_ALLOCATED(bmap_entry)) {
            /* Allocate new block and write to it. */
-            uint64_t data_offset;
+            uint64_t offset;
            bmap_entry = s->header.blocks_allocated;
            s->bmap[block_index] = cpu_to_le32(bmap_entry);
            s->header.blocks_allocated++;
-            data_offset = s->header.offset_data +
-                          (uint64_t)bmap_entry * s->block_size;
+            offset = s->header.offset_data / SECTOR_SIZE +
+                     (uint64_t)bmap_entry * s->block_sectors;
            if (block == NULL) {
                block = g_malloc(s->block_size);
                bmap_first = block_index;
            }
            bmap_last = block_index;
            /* Copy data to be written to new block and zero unused parts. */
-            memset(block, 0, offset_in_block);
-            qemu_iovec_to_buf(qiov, bytes_done, block + offset_in_block,
-                              n_bytes);
-            memset(block + offset_in_block + n_bytes, 0,
-                   s->block_size - n_bytes - offset_in_block);
+            memset(block, 0, sector_in_block * SECTOR_SIZE);
+            memcpy(block + sector_in_block * SECTOR_SIZE,
+                   buf, n_sectors * SECTOR_SIZE);
+            memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
+                   (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);

            /* Note that this coroutine does not yield anywhere from reading the
             * bmap entry until here, so in regards to all the coroutines trying
@@ -631,12 +659,12 @@ vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
             * acquire the lock and thus the padded cluster is written before
             * the other coroutines can write to the affected area. */
            qemu_co_mutex_lock(&s->write_lock);
-            ret = bdrv_pwrite(bs->file, data_offset, block, s->block_size);
+            ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors);
            qemu_co_mutex_unlock(&s->write_lock);
        } else {
-            uint64_t data_offset = s->header.offset_data +
-                                   (uint64_t)bmap_entry * s->block_size +
-                                   offset_in_block;
+            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
+                              (uint64_t)bmap_entry * s->block_sectors +
+                              sector_in_block;
            qemu_co_mutex_lock(&s->write_lock);
            /* This lock is only used to make sure the following write operation
             * is executed after the write issued by the coroutine allocating
@@ -647,23 +675,16 @@ vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
             * that that write operation has returned (there may be other writes
             * in flight, but they do not concern this very operation). */
            qemu_co_mutex_unlock(&s->write_lock);
-
-            qemu_iovec_reset(&local_qiov);
-            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
-
-            ret = bdrv_co_pwritev(bs->file, data_offset, n_bytes,
-                                  &local_qiov, 0);
+            ret = bdrv_write(bs->file->bs, offset, buf, n_sectors);
        }

-        bytes -= n_bytes;
-        offset += n_bytes;
-        bytes_done += n_bytes;
+        nb_sectors -= n_sectors;
+        sector_num += n_sectors;
+        buf += n_sectors * SECTOR_SIZE;

-        logout("%u bytes written\n", n_bytes);
+        logout("%u sectors written\n", n_sectors);
    }

-    qemu_iovec_destroy(&local_qiov);
-
    logout("finished data write\n");
    if (ret < 0) {
        return ret;
@@ -674,13 +695,12 @@ vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
        VdiHeader *header = (VdiHeader *) block;
        uint8_t *base;
        uint64_t offset;
-        uint32_t n_sectors;

        logout("now writing modified header\n");
        assert(VDI_IS_ALLOCATED(bmap_first));
        *header = s->header;
        vdi_header_to_le(header);
-        ret = bdrv_write(bs->file, 0, block, 1);
+        ret = bdrv_write(bs->file->bs, 0, block, 1);
        g_free(block);
        block = NULL;

@@ -698,7 +718,7 @@ vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
        base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE;
        logout("will write %u block map sectors starting from entry %u\n",
               n_sectors, bmap_first);
-        ret = bdrv_write(bs->file, offset, base, n_sectors);
+        ret = bdrv_write(bs->file->bs, offset, base, n_sectors);
    }

    return ret;
@@ -782,8 +802,8 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    if (image_type == VDI_TYPE_STATIC) {
        header.blocks_allocated = blocks;
    }
-    qemu_uuid_generate(&header.uuid_image);
-    qemu_uuid_generate(&header.uuid_last_snap);
+    uuid_generate(header.uuid_image);
+    uuid_generate(header.uuid_last_snap);
    /* There is no need to set header.uuid_link or header.uuid_parent here. */
 #if defined(CONFIG_VDI_DEBUG)
    vdi_header_print(&header);
@@ -884,9 +904,9 @@ static BlockDriver bdrv_vdi = {
    .bdrv_co_get_block_status = vdi_co_get_block_status,
    .bdrv_make_empty = vdi_make_empty,

-    .bdrv_co_preadv     = vdi_co_preadv,
+    .bdrv_read = vdi_co_read,
 #if defined(CONFIG_VDI_WRITE)
-    .bdrv_co_pwritev    = vdi_co_pwritev,
+    .bdrv_write = vdi_co_write,
 #endif

    .bdrv_get_info = vdi_get_info,
--- a/block/vhdx-endian.c
+++ b/block/vhdx-endian.c
@@ -18,9 +18,11 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
-#include "qemu/bswap.h"
 #include "block/vhdx.h"

+#include <uuid/uuid.h>
+
+
 /*
 * All the VHDX formats on disk are little endian - the following
 * are helper import/export functions to correctly convert
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -23,7 +23,6 @@
 #include "block/block_int.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
-#include "qemu/bswap.h"
 #include "block/vhdx.h"


@@ -84,7 +83,7 @@ static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,

    offset = log->offset + read;

-    ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader));
+    ret = bdrv_pread(bs->file->bs, offset, hdr, sizeof(VHDXLogEntryHeader));
    if (ret < 0) {
        goto exit;
    }
@@ -144,7 +143,7 @@ static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log,
        }
        offset = log->offset + read;

-        ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE);
+        ret = bdrv_pread(bs->file->bs, offset, buffer, VHDX_LOG_SECTOR_SIZE);
        if (ret < 0) {
            goto exit;
        }
@@ -194,7 +193,7 @@ static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log,
            /* full */
            break;
        }
-        ret = bdrv_pwrite(bs->file, offset, buffer_tmp,
+        ret = bdrv_pwrite(bs->file->bs, offset, buffer_tmp,
                          VHDX_LOG_SECTOR_SIZE);
        if (ret < 0) {
            goto exit;
@@ -466,7 +465,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,

    /* count is only > 1 if we are writing zeroes */
    for (i = 0; i < count; i++) {
-        ret = bdrv_pwrite_sync(bs->file, file_offset, buffer,
+        ret = bdrv_pwrite_sync(bs->file->bs, file_offset, buffer,
                               VHDX_LOG_SECTOR_SIZE);
        if (ret < 0) {
            goto exit;
@@ -945,7 +944,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,

        if (i == 0 && leading_length) {
            /* partial sector at the front of the buffer */
-            ret = bdrv_pread(bs->file, file_offset, merged_sector,
+            ret = bdrv_pread(bs->file->bs, file_offset, merged_sector,
                             VHDX_LOG_SECTOR_SIZE);
            if (ret < 0) {
                goto exit;
@@ -955,7 +954,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
            sector_write = merged_sector;
        } else if (i == sectors - 1 && trailing_length) {
            /* partial sector at the end of the buffer */
-            ret = bdrv_pread(bs->file,
+            ret = bdrv_pread(bs->file->bs,
                            file_offset,
                            merged_sector + trailing_length,
                            VHDX_LOG_SECTOR_SIZE - trailing_length);
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -22,10 +22,11 @@
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "qemu/crc32c.h"
-#include "qemu/bswap.h"
 #include "block/vhdx.h"
 #include "migration/migration.h"
-#include "qemu/uuid.h"
+
+#include <uuid/uuid.h>
+#include <glib.h>

 /* Options for VHDX creation */

@@ -212,11 +213,11 @@ bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
 */
 void vhdx_guid_generate(MSGUID *guid)
 {
-    QemuUUID uuid;
+    uuid_t uuid;
    assert(guid != NULL);

-    qemu_uuid_generate(&uuid);
-    memcpy(guid, &uuid, sizeof(MSGUID));
+    uuid_generate(uuid);
+    memcpy(guid, uuid, sizeof(MSGUID));
 }

 /* Check for region overlaps inside the VHDX image */
@@ -297,10 +298,9 @@ static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
 * and then update the header checksum.  Header is converted to proper
 * endianness before being written to the specified file offset
 */
-static int vhdx_write_header(BdrvChild *file, VHDXHeader *hdr,
+static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr,
                             uint64_t offset, bool read)
 {
-    BlockDriverState *bs_file = file->bs;
    uint8_t *buffer = NULL;
    int ret;
    VHDXHeader *header_le;
@@ -315,7 +315,7 @@ static int vhdx_write_header(BdrvChild *file, VHDXHeader *hdr,
    buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE);
    if (read) {
        /* if true, we can't assume the extra reserved bytes are 0 */
-        ret = bdrv_pread(file, offset, buffer, VHDX_HEADER_SIZE);
+        ret = bdrv_pread(bs_file, offset, buffer, VHDX_HEADER_SIZE);
        if (ret < 0) {
            goto exit;
        }
@@ -329,7 +329,7 @@ static int vhdx_write_header(BdrvChild *file, VHDXHeader *hdr,
    vhdx_header_le_export(hdr, header_le);
    vhdx_update_checksum(buffer, VHDX_HEADER_SIZE,
                         offsetof(VHDXHeader, checksum));
-    ret = bdrv_pwrite_sync(file, offset, header_le, sizeof(VHDXHeader));
+    ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader));

 exit:
    qemu_vfree(buffer);
@@ -378,7 +378,7 @@ static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s,
        inactive_header->log_guid = *log_guid;
    }

-    ret = vhdx_write_header(bs->file, inactive_header, header_offset, true);
+    ret = vhdx_write_header(bs->file->bs, inactive_header, header_offset, true);
    if (ret < 0) {
        goto exit;
    }
@@ -430,7 +430,7 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
    /* We have to read the whole VHDX_HEADER_SIZE instead of
     * sizeof(VHDXHeader), because the checksum is over the whole
     * region */
-    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer,
+    ret = bdrv_pread(bs->file->bs, VHDX_HEADER1_OFFSET, buffer,
                     VHDX_HEADER_SIZE);
    if (ret < 0) {
        goto fail;
@@ -447,7 +447,7 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
        }
    }

-    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer,
+    ret = bdrv_pread(bs->file->bs, VHDX_HEADER2_OFFSET, buffer,
                     VHDX_HEADER_SIZE);
    if (ret < 0) {
        goto fail;
@@ -521,7 +521,7 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
     * whole block */
    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);

-    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
+    ret = bdrv_pread(bs->file->bs, VHDX_REGION_TABLE_OFFSET, buffer,
                     VHDX_HEADER_BLOCK_SIZE);
    if (ret < 0) {
        goto fail;
@@ -634,7 +634,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)

    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);

-    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
+    ret = bdrv_pread(bs->file->bs, s->metadata_rt.file_offset, buffer,
                     VHDX_METADATA_TABLE_MAX_SIZE);
    if (ret < 0) {
        goto exit;
@@ -737,7 +737,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
        goto exit;
    }

-    ret = bdrv_pread(bs->file,
+    ret = bdrv_pread(bs->file->bs,
                     s->metadata_entries.file_parameters_entry.offset
                                         + s->metadata_rt.file_offset,
                     &s->params,
@@ -772,7 +772,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
    /* determine virtual disk size, logical sector size,
     * and phys sector size */

-    ret = bdrv_pread(bs->file,
+    ret = bdrv_pread(bs->file->bs,
                     s->metadata_entries.virtual_disk_size_entry.offset
                                           + s->metadata_rt.file_offset,
                     &s->virtual_disk_size,
@@ -780,7 +780,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
    if (ret < 0) {
        goto exit;
    }
-    ret = bdrv_pread(bs->file,
+    ret = bdrv_pread(bs->file->bs,
                     s->metadata_entries.logical_sector_size_entry.offset
                                             + s->metadata_rt.file_offset,
                     &s->logical_sector_size,
@@ -788,7 +788,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
    if (ret < 0) {
        goto exit;
    }
-    ret = bdrv_pread(bs->file,
+    ret = bdrv_pread(bs->file->bs,
                     s->metadata_entries.phys_sector_size_entry.offset
                                          + s->metadata_rt.file_offset,
                     &s->physical_sector_size,
@@ -905,7 +905,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
    QLIST_INIT(&s->regions);

    /* validate the file signature */
-    ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t));
+    ret = bdrv_pread(bs->file->bs, 0, &signature, sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }
@@ -964,7 +964,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
+    ret = bdrv_pread(bs->file->bs, s->bat_offset, s->bat, s->bat_rt.length);
    if (ret < 0) {
        goto fail;
    }
@@ -1117,7 +1117,7 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
                break;
            case PAYLOAD_BLOCK_FULLY_PRESENT:
                qemu_co_mutex_unlock(&s->lock);
-                ret = bdrv_co_readv(bs->file,
+                ret = bdrv_co_readv(bs->file->bs,
                                    sinfo.file_offset >> BDRV_SECTOR_BITS,
                                    sinfo.sectors_avail, &hd_qiov);
                qemu_co_mutex_lock(&s->lock);
@@ -1326,7 +1326,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
                }
                /* block exists, so we can just overwrite it */
                qemu_co_mutex_unlock(&s->lock);
-                ret = bdrv_co_writev(bs->file,
+                ret = bdrv_co_writev(bs->file->bs,
                                    sinfo.file_offset >> BDRV_SECTOR_BITS,
                                    sectors_to_write, &hd_qiov);
                qemu_co_mutex_lock(&s->lock);
@@ -1387,11 +1387,9 @@ exit:
 * There are 2 headers, and the highest sequence number will represent
 * the active header
 */
-static int vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size,
+static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size,
                                   uint32_t log_size)
 {
-    BlockDriverState *bs = blk_bs(blk);
-    BdrvChild *child;
    int ret = 0;
    VHDXHeader *hdr = NULL;

@@ -1406,18 +1404,12 @@ static int vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size,
    vhdx_guid_generate(&hdr->file_write_guid);
    vhdx_guid_generate(&hdr->data_write_guid);

-    /* XXX Ugly way to get blk->root, but that's a feature, not a bug. This
-     * hack makes it obvious that vhdx_write_header() bypasses the BlockBackend
-     * here, which it really shouldn't be doing. */
-    child = QLIST_FIRST(&bs->parents);
-    assert(!QLIST_NEXT(child, next_parent));
-
-    ret = vhdx_write_header(child, hdr, VHDX_HEADER1_OFFSET, false);
+    ret = vhdx_write_header(bs, hdr, VHDX_HEADER1_OFFSET, false);
    if (ret < 0) {
        goto exit;
    }
    hdr->sequence_number++;
-    ret = vhdx_write_header(child, hdr, VHDX_HEADER2_OFFSET, false);
+    ret = vhdx_write_header(bs, hdr, VHDX_HEADER2_OFFSET, false);
    if (ret < 0) {
        goto exit;
    }
@@ -1450,7 +1442,7 @@ exit:
 * The first 64KB of the Metadata section is reserved for the metadata
 * header and entries; beyond that, the metadata items themselves reside.
 */
-static int vhdx_create_new_metadata(BlockBackend *blk,
+static int vhdx_create_new_metadata(BlockDriverState *bs,
                                    uint64_t image_size,
                                    uint32_t block_size,
                                    uint32_t sector_size,
@@ -1546,13 +1538,13 @@ static int vhdx_create_new_metadata(BlockBackend *blk,
                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
    vhdx_metadata_entry_le_export(&md_table_entry[4]);

-    ret = blk_pwrite(blk, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE, 0);
+    ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE);
    if (ret < 0) {
        goto exit;
    }

-    ret = blk_pwrite(blk, metadata_offset + (64 * KiB), entry_buffer,
-                     VHDX_METADATA_ENTRY_BUFFER_SIZE, 0);
+    ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer,
+                      VHDX_METADATA_ENTRY_BUFFER_SIZE);
    if (ret < 0) {
        goto exit;
    }
@@ -1572,7 +1564,7 @@ exit:
 *  Fixed images: default state of the BAT is fully populated, with
 *                file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT.
 */
-static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
+static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
                           uint64_t image_size, VHDXImageType type,
                           bool use_zero_blocks, uint64_t file_offset,
                           uint32_t length)
@@ -1596,12 +1588,12 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
    if (type == VHDX_TYPE_DYNAMIC) {
        /* All zeroes, so we can just extend the file - the end of the BAT
         * is the furthest thing we have written yet */
-        ret = blk_truncate(blk, data_file_offset);
+        ret = bdrv_truncate(bs, data_file_offset);
        if (ret < 0) {
            goto exit;
        }
    } else if (type == VHDX_TYPE_FIXED) {
-        ret = blk_truncate(blk, data_file_offset + image_size);
+        ret = bdrv_truncate(bs, data_file_offset + image_size);
        if (ret < 0) {
            goto exit;
        }
@@ -1612,7 +1604,7 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,

    if (type == VHDX_TYPE_FIXED ||
                use_zero_blocks ||
-                bdrv_has_zero_init(blk_bs(blk)) == 0) {
+                bdrv_has_zero_init(bs) == 0) {
        /* for a fixed file, the default BAT entry is not zero */
        s->bat = g_try_malloc0(length);
        if (length && s->bat == NULL) {
@@ -1628,12 +1620,12 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
            sinfo.file_offset = data_file_offset +
                                (sector_num << s->logical_sector_size_bits);
            sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB);
-            vhdx_update_bat_table_entry(blk_bs(blk), s, &sinfo, &unused, &unused,
+            vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused,
                                        block_state);
            cpu_to_le64s(&s->bat[sinfo.bat_idx]);
            sector_num += s->sectors_per_block;
        }
-        ret = blk_pwrite(blk, file_offset, s->bat, length, 0);
+        ret = bdrv_pwrite(bs, file_offset, s->bat, length);
        if (ret < 0) {
            goto exit;
        }
@@ -1653,7 +1645,7 @@ exit:
 * to create the BAT itself, we will also cause the BAT to be
 * created.
 */
-static int vhdx_create_new_region_table(BlockBackend *blk,
+static int vhdx_create_new_region_table(BlockDriverState *bs,
                                        uint64_t image_size,
                                        uint32_t block_size,
                                        uint32_t sector_size,
@@ -1728,21 +1720,21 @@ static int vhdx_create_new_region_table(BlockBackend *blk,

    /* The region table gives us the data we need to create the BAT,
     * so do that now */
-    ret = vhdx_create_bat(blk, s, image_size, type, use_zero_blocks,
+    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks,
                          bat_file_offset, bat_length);
    if (ret < 0) {
        goto exit;
    }

    /* Now write out the region headers to disk */
-    ret = blk_pwrite(blk, VHDX_REGION_TABLE_OFFSET, buffer,
-                     VHDX_HEADER_BLOCK_SIZE, 0);
+    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
    if (ret < 0) {
        goto exit;
    }

-    ret = blk_pwrite(blk, VHDX_REGION_TABLE2_OFFSET, buffer,
-                     VHDX_HEADER_BLOCK_SIZE, 0);
+    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
    if (ret < 0) {
        goto exit;
    }
@@ -1879,13 +1871,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)


    /* Creates (B),(C) */
-    ret = vhdx_create_new_headers(blk, image_size, log_size);
+    ret = vhdx_create_new_headers(blk_bs(blk), image_size, log_size);
    if (ret < 0) {
        goto delete_and_exit;
    }

    /* Creates (D),(E),(G) explicitly. (F) created as by-product */
-    ret = vhdx_create_new_region_table(blk, image_size, block_size, 512,
+    ret = vhdx_create_new_region_table(blk_bs(blk), image_size, block_size, 512,
                                       log_size, use_zero_blocks, image_type,
                                       &metadata_offset);
    if (ret < 0) {
@@ -1893,7 +1885,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    /* Creates (H) */
-    ret = vhdx_create_new_metadata(blk, image_size, block_size, 512,
+    ret = vhdx_create_new_metadata(blk_bs(blk), image_size, block_size, 512,
                                   metadata_offset, image_type);
    if (ret < 0) {
        goto delete_and_exit;
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -30,10 +30,10 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
-#include "qemu/bswap.h"
 #include "migration/migration.h"
 #include "qemu/cutils.h"
 #include <zlib.h>
+#include <glib.h>

 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
 #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
@@ -252,7 +252,7 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
    int ret;

    desc = g_malloc0(DESC_SIZE);
-    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
+    ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        g_free(desc);
        return 0;
@@ -286,7 +286,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)

    desc = g_malloc0(DESC_SIZE);
    tmp_desc = g_malloc0(DESC_SIZE);
-    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
+    ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }
@@ -306,7 +306,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
        pstrcat(desc, DESC_SIZE, tmp_desc);
    }

-    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);
+    ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE);

 out:
    g_free(desc);
@@ -350,7 +350,7 @@ static int vmdk_parent_open(BlockDriverState *bs)
    int ret;

    desc = g_malloc0(DESC_SIZE + 1);
-    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
+    ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }
@@ -454,7 +454,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
        return -ENOMEM;
    }

-    ret = bdrv_pread(extent->file,
+    ret = bdrv_pread(extent->file->bs,
                     extent->l1_table_offset,
                     extent->l1_table,
                     l1_size);
@@ -474,7 +474,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
            ret = -ENOMEM;
            goto fail_l1;
        }
-        ret = bdrv_pread(extent->file,
+        ret = bdrv_pread(extent->file->bs,
                         extent->l1_backup_table_offset,
                         extent->l1_backup_table,
                         l1_size);
@@ -508,7 +508,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
    VMDK3Header header;
    VmdkExtent *extent;

-    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
+    ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header));
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
@@ -538,13 +538,14 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
                               QDict *options, Error **errp);

-static char *vmdk_read_desc(BdrvChild *file, uint64_t desc_offset, Error **errp)
+static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
+                            Error **errp)
 {
    int64_t size;
    char *buf;
    int ret;

-    size = bdrv_getlength(file->bs);
+    size = bdrv_getlength(file);
    if (size < 0) {
        error_setg_errno(errp, -size, "Could not access file");
        return NULL;
@@ -585,7 +586,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    int64_t l1_backup_offset = 0;
    bool compressed;

-    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
+    ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header));
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
@@ -595,7 +596,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    if (header.capacity == 0) {
        uint64_t desc_offset = le64_to_cpu(header.desc_offset);
        if (desc_offset) {
-            char *buf = vmdk_read_desc(file, desc_offset << 9, errp);
+            char *buf = vmdk_read_desc(file->bs, desc_offset << 9, errp);
            if (!buf) {
                return -EINVAL;
            }
@@ -635,7 +636,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
            } QEMU_PACKED eos_marker;
        } QEMU_PACKED footer;

-        ret = bdrv_pread(file,
+        ret = bdrv_pread(file->bs,
            bs->file->bs->total_sectors * 512 - 1536,
            &footer, sizeof(footer));
        if (ret < 0) {
@@ -873,7 +874,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
            extent->flat_start_offset = flat_offset << 9;
        } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
            /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
-            char *buf = vmdk_read_desc(extent_file, 0, errp);
+            char *buf = vmdk_read_desc(extent_file->bs, 0, errp);
            if (!buf) {
                ret = -EINVAL;
            } else {
@@ -942,7 +943,7 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
    BDRVVmdkState *s = bs->opaque;
    uint32_t magic;

-    buf = vmdk_read_desc(bs->file, 0, errp);
+    buf = vmdk_read_desc(bs->file->bs, 0, errp);
    if (!buf) {
        return -EINVAL;
    }
@@ -996,9 +997,9 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)

    for (i = 0; i < s->num_extents; i++) {
        if (!s->extents[i].flat) {
-            bs->bl.pwrite_zeroes_alignment =
-                MAX(bs->bl.pwrite_zeroes_alignment,
-                    s->extents[i].cluster_sectors << BDRV_SECTOR_BITS);
+            bs->bl.write_zeroes_alignment =
+                MAX(bs->bl.write_zeroes_alignment,
+                    s->extents[i].cluster_sectors);
        }
    }
 }
@@ -1015,26 +1016,27 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
 */
 static int get_whole_cluster(BlockDriverState *bs,
                             VmdkExtent *extent,
-                             uint64_t cluster_offset,
-                             uint64_t offset,
-                             uint64_t skip_start_bytes,
-                             uint64_t skip_end_bytes)
+                             uint64_t cluster_sector_num,
+                             uint64_t sector_num,
+                             uint64_t skip_start_sector,
+                             uint64_t skip_end_sector)
 {
    int ret = VMDK_OK;
    int64_t cluster_bytes;
    uint8_t *whole_grain;

    /* For COW, align request sector_num to cluster start */
+    sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
-    offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
    whole_grain = qemu_blockalign(bs, cluster_bytes);

    if (!bs->backing) {
-        memset(whole_grain, 0, skip_start_bytes);
-        memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
+        memset(whole_grain, 0,  skip_start_sector << BDRV_SECTOR_BITS);
+        memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
+               cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
    }

-    assert(skip_end_bytes <= cluster_bytes);
+    assert(skip_end_sector <= extent->cluster_sectors);
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
    if (bs->backing && !vmdk_is_cid_valid(bs)) {
@@ -1043,43 +1045,42 @@ static int get_whole_cluster(BlockDriverState *bs,
    }

    /* Read backing data before skip range */
-    if (skip_start_bytes > 0) {
+    if (skip_start_sector > 0) {
        if (bs->backing) {
-            ret = bdrv_pread(bs->backing, offset, whole_grain,
-                             skip_start_bytes);
+            ret = bdrv_read(bs->backing->bs, sector_num,
+                            whole_grain, skip_start_sector);
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
-        ret = bdrv_pwrite(extent->file, cluster_offset, whole_grain,
-                          skip_start_bytes);
+        ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain,
+                         skip_start_sector);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }
    /* Read backing data after skip range */
-    if (skip_end_bytes < cluster_bytes) {
+    if (skip_end_sector < extent->cluster_sectors) {
        if (bs->backing) {
-            ret = bdrv_pread(bs->backing, offset + skip_end_bytes,
-                             whole_grain + skip_end_bytes,
-                             cluster_bytes - skip_end_bytes);
+            ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector,
+                            whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
+                            extent->cluster_sectors - skip_end_sector);
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
-        ret = bdrv_pwrite(extent->file, cluster_offset + skip_end_bytes,
-                          whole_grain + skip_end_bytes,
-                          cluster_bytes - skip_end_bytes);
+        ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector,
+                         whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
+                         extent->cluster_sectors - skip_end_sector);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }

-    ret = VMDK_OK;
 exit:
    qemu_vfree(whole_grain);
    return ret;
@@ -1090,7 +1091,8 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
 {
    offset = cpu_to_le32(offset);
    /* update L2 table */
-    if (bdrv_pwrite_sync(extent->file,
+    if (bdrv_pwrite_sync(
+                extent->file->bs,
                ((int64_t)m_data->l2_offset * 512)
                    + (m_data->l2_index * sizeof(offset)),
                &offset, sizeof(offset)) < 0) {
@@ -1099,7 +1101,8 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
    /* update backup L2 table */
    if (extent->l1_backup_table_offset != 0) {
        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
-        if (bdrv_pwrite_sync(extent->file,
+        if (bdrv_pwrite_sync(
+                    extent->file->bs,
                    ((int64_t)m_data->l2_offset * 512)
                        + (m_data->l2_index * sizeof(offset)),
                    &offset, sizeof(offset)) < 0) {
@@ -1139,8 +1142,8 @@ static int get_cluster_offset(BlockDriverState *bs,
                              uint64_t offset,
                              bool allocate,
                              uint64_t *cluster_offset,
-                              uint64_t skip_start_bytes,
-                              uint64_t skip_end_bytes)
+                              uint64_t skip_start_sector,
+                              uint64_t skip_end_sector)
 {
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
@@ -1188,7 +1191,8 @@ static int get_cluster_offset(BlockDriverState *bs,
        }
    }
    l2_table = extent->l2_cache + (min_index * extent->l2_size);
-    if (bdrv_pread(extent->file,
+    if (bdrv_pread(
+                extent->file->bs,
                (int64_t)l2_offset * 512,
                l2_table,
                extent->l2_size * sizeof(uint32_t)
@@ -1202,6 +1206,13 @@ static int get_cluster_offset(BlockDriverState *bs,
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
    cluster_sector = le32_to_cpu(l2_table[l2_index]);

+    if (m_data) {
+        m_data->valid = 1;
+        m_data->l1_index = l1_index;
+        m_data->l2_index = l2_index;
+        m_data->l2_offset = l2_offset;
+        m_data->l2_cache_entry = &l2_table[l2_index];
+    }
    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
        zeroed = true;
    }
@@ -1219,18 +1230,13 @@ static int get_cluster_offset(BlockDriverState *bs,
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
-        ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
-                                offset, skip_start_bytes, skip_end_bytes);
+        ret = get_whole_cluster(bs, extent,
+                                cluster_sector,
+                                offset >> BDRV_SECTOR_BITS,
+                                skip_start_sector, skip_end_sector);
        if (ret) {
            return ret;
        }
-        if (m_data) {
-            m_data->valid = 1;
-            m_data->l1_index = l1_index;
-            m_data->l2_index = l2_index;
-            m_data->l2_offset = l2_offset;
-            m_data->l2_cache_entry = &l2_table[l2_index];
-        }
    }
    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
    return VMDK_OK;
@@ -1253,24 +1259,15 @@ static VmdkExtent *find_extent(BDRVVmdkState *s,
    return NULL;
 }

-static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
-                                                   int64_t offset)
-{
-    uint64_t extent_begin_offset, extent_relative_offset;
-    uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
-
-    extent_begin_offset =
-        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
-    extent_relative_offset = offset - extent_begin_offset;
-    return extent_relative_offset % cluster_size;
-}
-
 static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
                                                  int64_t sector_num)
 {
-    uint64_t offset;
-    offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE);
-    return offset / BDRV_SECTOR_SIZE;
+    uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num;
+
+    extent_begin_sector = extent->end_sector - extent->sectors;
+    extent_relative_sector_num = sector_num - extent_begin_sector;
+    index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
+    return index_in_cluster;
 }

 static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
@@ -1322,57 +1319,38 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
 }

 static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
-                            int64_t offset_in_cluster, QEMUIOVector *qiov,
-                            uint64_t qiov_offset, uint64_t n_bytes,
-                            uint64_t offset)
+                            int64_t offset_in_cluster, const uint8_t *buf,
+                            int nb_sectors, int64_t sector_num)
 {
    int ret;
    VmdkGrainMarker *data = NULL;
    uLongf buf_len;
-    QEMUIOVector local_qiov;
-    struct iovec iov;
+    const uint8_t *write_buf = buf;
+    int write_len = nb_sectors * 512;
    int64_t write_offset;
    int64_t write_end_sector;

    if (extent->compressed) {
-        void *compressed_data;
-
        if (!extent->has_marker) {
            ret = -EINVAL;
            goto out;
        }
        buf_len = (extent->cluster_sectors << 9) * 2;
        data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
-
-        compressed_data = g_malloc(n_bytes);
-        qemu_iovec_to_buf(qiov, qiov_offset, compressed_data, n_bytes);
-        ret = compress(data->data, &buf_len, compressed_data, n_bytes);
-        g_free(compressed_data);
-
-        if (ret != Z_OK || buf_len == 0) {
+        if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
+                buf_len == 0) {
            ret = -EINVAL;
            goto out;
        }
-
-        data->lba = offset >> BDRV_SECTOR_BITS;
+        data->lba = sector_num;
        data->size = buf_len;
-
-        n_bytes = buf_len + sizeof(VmdkGrainMarker);
-        iov = (struct iovec) {
-            .iov_base   = data,
-            .iov_len    = n_bytes,
-        };
-        qemu_iovec_init_external(&local_qiov, &iov, 1);
-    } else {
-        qemu_iovec_init(&local_qiov, qiov->niov);
-        qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes);
+        write_buf = (uint8_t *)data;
+        write_len = buf_len + sizeof(VmdkGrainMarker);
    }
-
    write_offset = cluster_offset + offset_in_cluster,
-    ret = bdrv_co_pwritev(extent->file, write_offset, n_bytes,
-                          &local_qiov, 0);
+    ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len);

-    write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE);
+    write_end_sector = DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE);

    if (extent->compressed) {
        extent->next_cluster_sector = write_end_sector;
@@ -1381,21 +1359,19 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
                                          write_end_sector);
    }

-    if (ret < 0) {
+    if (ret != write_len) {
+        ret = ret < 0 ? ret : -EIO;
        goto out;
    }
    ret = 0;
 out:
    g_free(data);
-    if (!extent->compressed) {
-        qemu_iovec_destroy(&local_qiov);
-    }
    return ret;
 }

 static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
-                            int64_t offset_in_cluster, QEMUIOVector *qiov,
-                            int bytes)
+                            int64_t offset_in_cluster, uint8_t *buf,
+                            int nb_sectors)
 {
    int ret;
    int cluster_bytes, buf_bytes;
@@ -1407,20 +1383,21 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,


    if (!extent->compressed) {
-        ret = bdrv_co_preadv(extent->file,
-                             cluster_offset + offset_in_cluster, bytes,
-                             qiov, 0);
-        if (ret < 0) {
-            return ret;
+        ret = bdrv_pread(extent->file->bs,
+                          cluster_offset + offset_in_cluster,
+                          buf, nb_sectors * 512);
+        if (ret == nb_sectors * 512) {
+            return 0;
+        } else {
+            return -EIO;
        }
-        return 0;
    }
    cluster_bytes = extent->cluster_sectors * 512;
    /* Read two clusters in case GrainMarker + compressed data > one cluster */
    buf_bytes = cluster_bytes * 2;
    cluster_buf = g_malloc(buf_bytes);
    uncomp_buf = g_malloc(cluster_bytes);
-    ret = bdrv_pread(extent->file,
+    ret = bdrv_pread(extent->file->bs,
                cluster_offset,
                cluster_buf, buf_bytes);
    if (ret < 0) {
@@ -1445,11 +1422,11 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,

    }
    if (offset_in_cluster < 0 ||
-            offset_in_cluster + bytes > buf_len) {
+            offset_in_cluster + nb_sectors * 512 > buf_len) {
        ret = -EINVAL;
        goto out;
    }
-    qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
+    memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
    ret = 0;

 out:
@@ -1458,73 +1435,64 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
    return ret;
 }

-static int coroutine_fn
-vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-               QEMUIOVector *qiov, int flags)
+static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
 {
    BDRVVmdkState *s = bs->opaque;
    int ret;
-    uint64_t n_bytes, offset_in_cluster;
+    uint64_t n, index_in_cluster;
    VmdkExtent *extent = NULL;
-    QEMUIOVector local_qiov;
    uint64_t cluster_offset;
-    uint64_t bytes_done = 0;

-    qemu_iovec_init(&local_qiov, qiov->niov);
-    qemu_co_mutex_lock(&s->lock);
-
-    while (bytes > 0) {
-        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
+    while (nb_sectors > 0) {
+        extent = find_extent(s, sector_num, extent);
        if (!extent) {
-            ret = -EIO;
-            goto fail;
+            return -EIO;
        }
        ret = get_cluster_offset(bs, extent, NULL,
-                                 offset, false, &cluster_offset, 0, 0);
-        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
-
-        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
-                             - offset_in_cluster);
-
+                                 sector_num << 9, false, &cluster_offset,
+                                 0, 0);
+        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
+        n = extent->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors) {
+            n = nb_sectors;
+        }
        if (ret != VMDK_OK) {
            /* if not allocated, try to read from parent image, if exist */
            if (bs->backing && ret != VMDK_ZEROED) {
                if (!vmdk_is_cid_valid(bs)) {
-                    ret = -EINVAL;
-                    goto fail;
+                    return -EINVAL;
                }
-
-                qemu_iovec_reset(&local_qiov);
-                qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
-
-                ret = bdrv_co_preadv(bs->backing, offset, n_bytes,
-                                     &local_qiov, 0);
+                ret = bdrv_read(bs->backing->bs, sector_num, buf, n);
                if (ret < 0) {
-                    goto fail;
+                    return ret;
                }
            } else {
-                qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
+                memset(buf, 0, 512 * n);
            }
        } else {
-            qemu_iovec_reset(&local_qiov);
-            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
-
-            ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster,
-                                   &local_qiov, n_bytes);
+            ret = vmdk_read_extent(extent,
+                            cluster_offset, index_in_cluster * 512,
+                            buf, n);
            if (ret) {
-                goto fail;
+                return ret;
            }
        }
-        bytes -= n_bytes;
-        offset += n_bytes;
-        bytes_done += n_bytes;
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
    }
+    return 0;
+}

-    ret = 0;
-fail:
+static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
+                                     uint8_t *buf, int nb_sectors)
+{
+    int ret;
+    BDRVVmdkState *s = bs->opaque;
+    qemu_co_mutex_lock(&s->lock);
+    ret = vmdk_read(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
-    qemu_iovec_destroy(&local_qiov);
-
    return ret;
 }

@@ -1538,38 +1506,38 @@ fail:
 *
 * Returns: error code with 0 for success.
 */
-static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
-                       uint64_t bytes, QEMUIOVector *qiov,
-                       bool zeroed, bool zero_dry_run)
+static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
+                      const uint8_t *buf, int nb_sectors,
+                      bool zeroed, bool zero_dry_run)
 {
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int ret;
-    int64_t offset_in_cluster, n_bytes;
+    int64_t index_in_cluster, n;
    uint64_t cluster_offset;
-    uint64_t bytes_done = 0;
    VmdkMetaData m_data;

-    if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
-        error_report("Wrong offset: offset=0x%" PRIx64
+    if (sector_num > bs->total_sectors) {
+        error_report("Wrong offset: sector_num=0x%" PRIx64
                     " total_sectors=0x%" PRIx64,
-                     offset, bs->total_sectors);
+                     sector_num, bs->total_sectors);
        return -EIO;
    }

-    while (bytes > 0) {
-        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
+    while (nb_sectors > 0) {
+        extent = find_extent(s, sector_num, extent);
        if (!extent) {
            return -EIO;
        }
-        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
-        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
-                             - offset_in_cluster);
-
-        ret = get_cluster_offset(bs, extent, &m_data, offset,
+        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
+        n = extent->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors) {
+            n = nb_sectors;
+        }
+        ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
                                 !(extent->compressed || zeroed),
-                                 &cluster_offset, offset_in_cluster,
-                                 offset_in_cluster + n_bytes);
+                                 &cluster_offset,
+                                 index_in_cluster, index_in_cluster + n);
        if (extent->compressed) {
            if (ret == VMDK_OK) {
                /* Refuse write to allocated cluster for streamOptimized */
@@ -1578,7 +1546,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
                return -EIO;
            } else {
                /* allocate */
-                ret = get_cluster_offset(bs, extent, &m_data, offset,
+                ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
                                         true, &cluster_offset, 0, 0);
            }
        }
@@ -1588,9 +1556,9 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
-                    offset_in_cluster == 0 &&
-                    n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
-                n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
+                    index_in_cluster == 0 &&
+                    n >= extent->cluster_sectors) {
+                n = extent->cluster_sectors;
                if (!zero_dry_run) {
                    /* update L2 tables */
                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
@@ -1602,8 +1570,9 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
                return -ENOTSUP;
            }
        } else {
-            ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
-                                    qiov, bytes_done, n_bytes, offset);
+            ret = vmdk_write_extent(extent,
+                            cluster_offset, index_in_cluster * 512,
+                            buf, n, sector_num);
            if (ret) {
                return ret;
            }
@@ -1616,9 +1585,9 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
                }
            }
        }
-        bytes -= n_bytes;
-        offset += n_bytes;
-        bytes_done += n_bytes;
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;

        /* update CID on the first write every time the virtual disk is
         * opened */
@@ -1633,39 +1602,43 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
    return 0;
 }

-static int coroutine_fn
-vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-                QEMUIOVector *qiov, int flags)
+static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
+                                      const uint8_t *buf, int nb_sectors)
 {
    int ret;
    BDRVVmdkState *s = bs->opaque;
    qemu_co_mutex_lock(&s->lock);
-    ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
+    ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
 }

-static int coroutine_fn
-vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
-                           uint64_t bytes, QEMUIOVector *qiov)
+static int vmdk_write_compressed(BlockDriverState *bs,
+                                 int64_t sector_num,
+                                 const uint8_t *buf,
+                                 int nb_sectors)
 {
-    return vmdk_co_pwritev(bs, offset, bytes, qiov, 0);
+    BDRVVmdkState *s = bs->opaque;
+    if (s->num_extents == 1 && s->extents[0].compressed) {
+        return vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
+    } else {
+        return -ENOTSUP;
+    }
 }

-static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
-                                              int64_t offset,
-                                              int bytes,
-                                              BdrvRequestFlags flags)
+static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
+                                             int64_t sector_num,
+                                             int nb_sectors,
+                                             BdrvRequestFlags flags)
 {
    int ret;
    BDRVVmdkState *s = bs->opaque;
-
    qemu_co_mutex_lock(&s->lock);
    /* write zeroes could fail if sectors not aligned to cluster, test it with
     * dry_run == true before really updating image */
-    ret = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
+    ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
    if (!ret) {
-        ret = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
+        ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
    }
    qemu_co_mutex_unlock(&s->lock);
    return ret;
@@ -1856,8 +1829,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
    int64_t total_size = 0, filesize;
    char *adapter_type = NULL;
    char *backing_file = NULL;
-    char *hw_version = NULL;
    char *fmt = NULL;
+    int flags = 0;
    int ret = 0;
    bool flat, split, compress;
    GString *ext_desc_lines;
@@ -1888,7 +1861,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
        "# The Disk Data Base\n"
        "#DDB\n"
        "\n"
-        "ddb.virtualHWVersion = \"%s\"\n"
+        "ddb.virtualHWVersion = \"%d\"\n"
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
        "ddb.geometry.heads = \"%" PRIu32 "\"\n"
        "ddb.geometry.sectors = \"63\"\n"
@@ -1905,20 +1878,11 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
-    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
-        if (strcmp(hw_version, "undefined")) {
-            error_setg(errp,
-                       "compat6 cannot be enabled with hwversion set");
-            ret = -EINVAL;
-            goto exit;
-        }
-        g_free(hw_version);
-        hw_version = g_strdup("6");
+        flags |= BLOCK_FLAG_COMPAT6;
    }
-    if (strcmp(hw_version, "undefined") == 0) {
-        g_free(hw_version);
-        hw_version = g_strdup("4");
+    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_SCSI, false)) {
+        flags |= BLOCK_FLAG_SCSI;
    }
    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) {
@@ -1926,7 +1890,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    if (!adapter_type) {
-        adapter_type = g_strdup("ide");
+        adapter_type = g_strdup(flags & BLOCK_FLAG_SCSI ? "lsilogic" : "ide");
    } else if (strcmp(adapter_type, "ide") &&
               strcmp(adapter_type, "buslogic") &&
               strcmp(adapter_type, "lsilogic") &&
@@ -2040,7 +2004,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
                           fmt,
                           parent_desc_line,
                           ext_desc_lines->str,
-                           hw_version,
+                           (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
                           total_size /
                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                           number_heads,
@@ -2086,7 +2050,6 @@ exit:
    }
    g_free(adapter_type);
    g_free(backing_file);
-    g_free(hw_version);
    g_free(fmt);
    g_free(desc);
    g_free(path);
@@ -2290,6 +2253,27 @@ static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

+static void vmdk_detach_aio_context(BlockDriverState *bs)
+{
+    BDRVVmdkState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_extents; i++) {
+        bdrv_detach_aio_context(s->extents[i].file->bs);
+    }
+}
+
+static void vmdk_attach_aio_context(BlockDriverState *bs,
+                                    AioContext *new_context)
+{
+    BDRVVmdkState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_extents; i++) {
+        bdrv_attach_aio_context(s->extents[i].file->bs, new_context);
+    }
+}
+
 static QemuOptsList vmdk_create_opts = {
    .name = "vmdk-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
@@ -2316,12 +2300,6 @@ static QemuOptsList vmdk_create_opts = {
            .help = "VMDK version 6 image",
            .def_value_str = "off"
        },
-        {
-            .name = BLOCK_OPT_HWVERSION,
-            .type = QEMU_OPT_STRING,
-            .help = "VMDK hardware version",
-            .def_value_str = "undefined"
-        },
        {
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
@@ -2335,6 +2313,12 @@ static QemuOptsList vmdk_create_opts = {
            .help = "Enable efficient zero writes "
                    "using the zeroed-grain GTE feature"
        },
+        {
+            .name = BLOCK_OPT_SCSI,
+            .type = QEMU_OPT_BOOL,
+            .help = "SCSI image",
+            .def_value_str = "off"
+        },
        { /* end of list */ }
    }
 };
@@ -2346,10 +2330,10 @@ static BlockDriver bdrv_vmdk = {
    .bdrv_open                    = vmdk_open,
    .bdrv_check                   = vmdk_check,
    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
-    .bdrv_co_preadv               = vmdk_co_preadv,
-    .bdrv_co_pwritev              = vmdk_co_pwritev,
-    .bdrv_co_pwritev_compressed   = vmdk_co_pwritev_compressed,
-    .bdrv_co_pwrite_zeroes        = vmdk_co_pwrite_zeroes,
+    .bdrv_read                    = vmdk_co_read,
+    .bdrv_write                   = vmdk_co_write,
+    .bdrv_write_compressed        = vmdk_write_compressed,
+    .bdrv_co_write_zeroes         = vmdk_co_write_zeroes,
    .bdrv_close                   = vmdk_close,
    .bdrv_create                  = vmdk_create,
    .bdrv_co_flush_to_disk        = vmdk_co_flush,
@@ -2359,6 +2343,8 @@ static BlockDriver bdrv_vmdk = {
    .bdrv_get_specific_info       = vmdk_get_specific_info,
    .bdrv_refresh_limits          = vmdk_refresh_limits,
    .bdrv_get_info                = vmdk_get_info,
+    .bdrv_detach_aio_context      = vmdk_detach_aio_context,
+    .bdrv_attach_aio_context      = vmdk_attach_aio_context,

    .supports_backing             = true,
    .create_opts                  = &vmdk_create_opts,
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -29,8 +29,9 @@
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
-#include "qemu/bswap.h"
-#include "qemu/uuid.h"
+#if defined(CONFIG_UUID)
+#include <uuid/uuid.h>
+#endif

 /**************************************************************/

@@ -87,7 +88,7 @@ typedef struct vhd_footer {
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
-    QemuUUID    uuid;
+    uint8_t     uuid[16];

    uint8_t     in_saved_state;
 } QEMU_PACKED VHDFooter;
@@ -235,7 +236,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
+    ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE);
    if (ret < 0) {
        error_setg(errp, "Unable to read VHD header");
        goto fail;
@@ -255,7 +256,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
        }

        /* If a fixed disk, the footer is found only at the end of the file */
-        ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
+        ret = bdrv_pread(bs->file->bs, offset-HEADER_SIZE, s->footer_buf,
                         HEADER_SIZE);
        if (ret < 0) {
            goto fail;
@@ -326,7 +327,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
    }

    if (disk_type == VHD_DYNAMIC) {
-        ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
+        ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf,
                         HEADER_SIZE);
        if (ret < 0) {
            error_setg(errp, "Error reading dynamic VHD header");
@@ -383,7 +384,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,

        s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);

-        ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
+        ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable,
                         pagetable_size);
        if (ret < 0) {
            error_setg(errp, "Error reading pagetable");
@@ -453,21 +454,22 @@ static int vpc_reopen_prepare(BDRVReopenState *state,
 * The parameter write must be 1 if the offset will be used for a write
 * operation (the block bitmaps is updated then), 0 otherwise.
 */
-static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
-                                       bool write)
+static inline int64_t get_sector_offset(BlockDriverState *bs,
+    int64_t sector_num, int write)
 {
    BDRVVPCState *s = bs->opaque;
+    uint64_t offset = sector_num * 512;
    uint64_t bitmap_offset, block_offset;
-    uint32_t pagetable_index, offset_in_block;
+    uint32_t pagetable_index, pageentry_index;

    pagetable_index = offset / s->block_size;
-    offset_in_block = offset % s->block_size;
+    pageentry_index = (offset % s->block_size) / 512;

    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
        return -1; /* not allocated */

    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
-    block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
+    block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);

    /* We must ensure that we don't write to any sectors which are marked as
       unused in the bitmap. We get away with setting all bits in the block
@@ -479,18 +481,12 @@ static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,

        s->last_bitmap_offset = bitmap_offset;
        memset(bitmap, 0xff, s->bitmap_size);
-        bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
+        bdrv_pwrite_sync(bs->file->bs, bitmap_offset, bitmap, s->bitmap_size);
    }

    return block_offset;
 }

-static inline int64_t get_sector_offset(BlockDriverState *bs,
-                                        int64_t sector_num, bool write)
-{
-    return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write);
-}
-
 /*
 * Writes the footer to the end of the image file. This is needed when the
 * file grows as it overwrites the old footer
@@ -503,7 +499,7 @@ static int rewrite_footer(BlockDriverState* bs)
    BDRVVPCState *s = bs->opaque;
    int64_t offset = s->free_data_block_offset;

-    ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
+    ret = bdrv_pwrite_sync(bs->file->bs, offset, s->footer_buf, HEADER_SIZE);
    if (ret < 0)
        return ret;

@@ -517,7 +513,7 @@ static int rewrite_footer(BlockDriverState* bs)
 *
 * Returns the sectors' offset in the image file on success and < 0 on error
 */
-static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
+static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
 {
    BDRVVPCState *s = bs->opaque;
    int64_t bat_offset;
@@ -526,18 +522,19 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
    uint8_t bitmap[s->bitmap_size];

    /* Check if sector_num is valid */
-    if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
-        return -EINVAL;
-    }
+    if ((sector_num < 0) || (sector_num > bs->total_sectors))
+        return -1;

    /* Write entry into in-memory BAT */
-    index = offset / s->block_size;
-    assert(s->pagetable[index] == 0xFFFFFFFF);
+    index = (sector_num * 512) / s->block_size;
+    if (s->pagetable[index] != 0xFFFFFFFF)
+        return -1;
+
    s->pagetable[index] = s->free_data_block_offset / 512;

    /* Initialize the block's bitmap */
    memset(bitmap, 0xff, s->bitmap_size);
-    ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
+    ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap,
        s->bitmap_size);
    if (ret < 0) {
        return ret;
@@ -552,15 +549,15 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
    /* Write BAT entry to disk */
    bat_offset = s->bat_offset + (4 * index);
    bat_value = cpu_to_be32(s->pagetable[index]);
-    ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
+    ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4);
    if (ret < 0)
        goto fail;

-    return get_image_offset(bs, offset, false);
+    return get_sector_offset(bs, sector_num, 0);

 fail:
    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
-    return ret;
+    return -1;
 }

 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
@@ -576,105 +573,104 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

-static int coroutine_fn
-vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-              QEMUIOVector *qiov, int flags)
+static int vpc_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
 {
    BDRVVPCState *s = bs->opaque;
    int ret;
-    int64_t image_offset;
-    int64_t n_bytes;
-    int64_t bytes_done = 0;
+    int64_t offset;
+    int64_t sectors, sectors_per_block;
    VHDFooter *footer = (VHDFooter *) s->footer_buf;
-    QEMUIOVector local_qiov;

    if (be32_to_cpu(footer->type) == VHD_FIXED) {
-        return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
+        return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors);
    }
+    while (nb_sectors > 0) {
+        offset = get_sector_offset(bs, sector_num, 0);

-    qemu_co_mutex_lock(&s->lock);
-    qemu_iovec_init(&local_qiov, qiov->niov);
+        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
+        sectors = sectors_per_block - (sector_num % sectors_per_block);
+        if (sectors > nb_sectors) {
+            sectors = nb_sectors;
+        }

-    while (bytes > 0) {
-        image_offset = get_image_offset(bs, offset, false);
-        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
-
-        if (image_offset == -1) {
-            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
+        if (offset == -1) {
+            memset(buf, 0, sectors * BDRV_SECTOR_SIZE);
        } else {
-            qemu_iovec_reset(&local_qiov);
-            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
-
-            ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
-                                 &local_qiov, 0);
-            if (ret < 0) {
-                goto fail;
+            ret = bdrv_pread(bs->file->bs, offset, buf,
+                sectors * BDRV_SECTOR_SIZE);
+            if (ret != sectors * BDRV_SECTOR_SIZE) {
+                return -1;
            }
        }

-        bytes -= n_bytes;
-        offset += n_bytes;
-        bytes_done += n_bytes;
+        nb_sectors -= sectors;
+        sector_num += sectors;
+        buf += sectors * BDRV_SECTOR_SIZE;
    }
+    return 0;
+}

-    ret = 0;
-fail:
-    qemu_iovec_destroy(&local_qiov);
+static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num,
+                                    uint8_t *buf, int nb_sectors)
+{
+    int ret;
+    BDRVVPCState *s = bs->opaque;
+    qemu_co_mutex_lock(&s->lock);
+    ret = vpc_read(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
-
    return ret;
 }

-static int coroutine_fn
-vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-               QEMUIOVector *qiov, int flags)
+static int vpc_write(BlockDriverState *bs, int64_t sector_num,
+    const uint8_t *buf, int nb_sectors)
 {
    BDRVVPCState *s = bs->opaque;
-    int64_t image_offset;
-    int64_t n_bytes;
-    int64_t bytes_done = 0;
+    int64_t offset;
+    int64_t sectors, sectors_per_block;
    int ret;
    VHDFooter *footer =  (VHDFooter *) s->footer_buf;
-    QEMUIOVector local_qiov;

    if (be32_to_cpu(footer->type) == VHD_FIXED) {
-        return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
+        return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors);
+    }
+    while (nb_sectors > 0) {
+        offset = get_sector_offset(bs, sector_num, 1);
+
+        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
+        sectors = sectors_per_block - (sector_num % sectors_per_block);
+        if (sectors > nb_sectors) {
+            sectors = nb_sectors;
+        }
+
+        if (offset == -1) {
+            offset = alloc_block(bs, sector_num);
+            if (offset < 0)
+                return -1;
+        }
+
+        ret = bdrv_pwrite(bs->file->bs, offset, buf,
+                          sectors * BDRV_SECTOR_SIZE);
+        if (ret != sectors * BDRV_SECTOR_SIZE) {
+            return -1;
+        }
+
+        nb_sectors -= sectors;
+        sector_num += sectors;
+        buf += sectors * BDRV_SECTOR_SIZE;
    }

+    return 0;
+}
+
+static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
+                                     const uint8_t *buf, int nb_sectors)
+{
+    int ret;
+    BDRVVPCState *s = bs->opaque;
    qemu_co_mutex_lock(&s->lock);
-    qemu_iovec_init(&local_qiov, qiov->niov);
-
-    while (bytes > 0) {
-        image_offset = get_image_offset(bs, offset, true);
-        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
-
-        if (image_offset == -1) {
-            image_offset = alloc_block(bs, offset);
-            if (image_offset < 0) {
-                ret = image_offset;
-                goto fail;
-            }
-        }
-
-        qemu_iovec_reset(&local_qiov);
-        qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
-
-        ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
-                              &local_qiov, 0);
-        if (ret < 0) {
-            goto fail;
-        }
-
-        bytes -= n_bytes;
-        offset += n_bytes;
-        bytes_done += n_bytes;
-    }
-
-    ret = 0;
-fail:
-    qemu_iovec_destroy(&local_qiov);
+    ret = vpc_write(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
-
    return ret;
 }

@@ -978,7 +974,9 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)

    footer->type = cpu_to_be32(disk_type);

-    qemu_uuid_generate(&footer->uuid);
+#if defined(CONFIG_UUID)
+    uuid_generate(footer->uuid);
+#endif

    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));

@@ -1058,8 +1056,8 @@ static BlockDriver bdrv_vpc = {
    .bdrv_reopen_prepare    = vpc_reopen_prepare,
    .bdrv_create            = vpc_create,

-    .bdrv_co_preadv             = vpc_co_preadv,
-    .bdrv_co_pwritev            = vpc_co_pwritev,
+    .bdrv_read                  = vpc_co_read,
+    .bdrv_write                 = vpc_co_write,
    .bdrv_co_get_block_status   = vpc_co_get_block_status,

    .bdrv_get_info          = vpc_get_info,
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -27,7 +27,6 @@
 #include "qapi/error.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
-#include "qemu/bswap.h"
 #include "migration/migration.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qbool.h"
@@ -114,12 +113,15 @@ static inline int array_ensure_allocated(array_t* array, int index)

 static inline void* array_get_next(array_t* array) {
    unsigned int next = array->next;
+    void* result;

    if (array_ensure_allocated(array, next) < 0)
 	return NULL;

    array->next = next + 1;
-    return array_get(array, next);
+    result = array_get(array, next);
+
+    return result;
 }

 static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) {
@@ -341,8 +343,9 @@ typedef struct BDRVVVFATState {
    unsigned int current_cluster;

    /* write support */
+    BlockDriverState* write_target;
    char* qcow_filename;
-    BdrvChild* qcow;
+    BlockDriverState* qcow;
    void* fat2;
    char* used_clusters;
    array_t commits;
@@ -980,7 +983,7 @@ static int init_directories(BDRVVVFATState* s,
 static BDRVVVFATState *vvv = NULL;
 #endif

-static int enable_write_target(BlockDriverState *bs, Error **errp);
+static int enable_write_target(BDRVVVFATState *s, Error **errp);
 static int is_consistent(BDRVVVFATState *s);

 static QemuOptsList runtime_opts = {
@@ -1157,8 +1160,8 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
    s->current_cluster=0xffffffff;

    /* read only is the default for safety */
-    bs->read_only = true;
-    s->qcow = NULL;
+    bs->read_only = 1;
+    s->qcow = s->write_target = NULL;
    s->qcow_filename = NULL;
    s->fat2 = NULL;
    s->downcase_short_names = 1;
@@ -1169,11 +1172,11 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
    s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1);

    if (qemu_opt_get_bool(opts, "rw", false)) {
-        ret = enable_write_target(bs, errp);
+        ret = enable_write_target(s, errp);
        if (ret < 0) {
            goto fail;
        }
-        bs->read_only = false;
+        bs->read_only = 0;
    }

    bs->total_sectors = cyls * heads * secs;
@@ -1207,11 +1210,6 @@ fail:
    return ret;
 }

-static void vvfat_refresh_limits(BlockDriverState *bs, Error **errp)
-{
-    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
-}
-
 static inline void vvfat_close_current_file(BDRVVVFATState *s)
 {
    if(s->current_mapping) {
@@ -1390,10 +1388,9 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
 	   return -1;
 	if (s->qcow) {
 	    int n;
-            if (bdrv_is_allocated(s->qcow->bs, sector_num, nb_sectors-i, &n)) {
-                DLOG(fprintf(stderr, "sectors %d+%d allocated\n",
-                             (int)sector_num, n));
-                if (bdrv_read(s->qcow, sector_num, buf + i * 0x200, n)) {
+            if (bdrv_is_allocated(s->qcow, sector_num, nb_sectors-i, &n)) {
+DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n));
+                if (bdrv_read(s->qcow, sector_num, buf + i*0x200, n)) {
                    return -1;
                }
                i += n - 1;
@@ -1424,31 +1421,14 @@ DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
    return 0;
 }

-static int coroutine_fn
-vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-                QEMUIOVector *qiov, int flags)
+static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num,
+                                      uint8_t *buf, int nb_sectors)
 {
    int ret;
    BDRVVVFATState *s = bs->opaque;
-    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
-    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-    void *buf;
-
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-
-    buf = g_try_malloc(bytes);
-    if (bytes && buf == NULL) {
-        return -ENOMEM;
-    }
-
    qemu_co_mutex_lock(&s->lock);
    ret = vvfat_read(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
-
-    qemu_iovec_from_buf(qiov, 0, buf, bytes);
-    g_free(buf);
-
    return ret;
 }

@@ -1669,15 +1649,12 @@ static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num)
    int was_modified = 0;
    int i, dummy;

-    if (s->qcow == NULL) {
-        return 0;
-    }
+    if (s->qcow == NULL)
+	return 0;

-    for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) {
-        was_modified = bdrv_is_allocated(s->qcow->bs,
-                                         cluster2sector(s, cluster_num) + i,
-                                         1, &dummy);
-    }
+    for (i = 0; !was_modified && i < s->sectors_per_cluster; i++)
+	was_modified = bdrv_is_allocated(s->qcow,
+		cluster2sector(s, cluster_num) + i, 1, &dummy);

    return was_modified;
 }
@@ -1826,16 +1803,11 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,

 		vvfat_close_current_file(s);
                for (i = 0; i < s->sectors_per_cluster; i++) {
-                    int res;
-
-                    res = bdrv_is_allocated(s->qcow->bs, offset + i, 1, &dummy);
-                    if (!res) {
-                        res = vvfat_read(s->bs, offset, s->cluster_buffer, 1);
-                        if (res) {
+                    if (!bdrv_is_allocated(s->qcow, offset + i, 1, &dummy)) {
+                        if (vvfat_read(s->bs, offset, s->cluster_buffer, 1)) {
                            return -1;
                        }
-                        res = bdrv_write(s->qcow, offset, s->cluster_buffer, 1);
-                        if (res) {
+                        if (bdrv_write(s->qcow, offset, s->cluster_buffer, 1)) {
                            return -2;
                        }
                    }
@@ -1969,7 +1941,8 @@ DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i))
 		/* check file size with FAT */
 		cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2);
 		if (cluster_count !=
-            DIV_ROUND_UP(le32_to_cpu(direntries[i].size), s->cluster_size)) {
+			(le32_to_cpu(direntries[i].size) + s->cluster_size
+			 - 1) / s->cluster_size) {
 		    DLOG(fprintf(stderr, "Cluster count mismatch\n"));
 		    goto fail;
 		}
@@ -2791,8 +2764,8 @@ static int do_commit(BDRVVVFATState* s)
 	return ret;
    }

-    if (s->qcow->bs->drv->bdrv_make_empty) {
-        s->qcow->bs->drv->bdrv_make_empty(s->qcow->bs);
+    if (s->qcow->drv->bdrv_make_empty) {
+        s->qcow->drv->bdrv_make_empty(s->qcow);
    }

    memset(s->used_clusters, 0, sector2cluster(s, s->sector_count));
@@ -2907,31 +2880,14 @@ DLOG(checkpoint());
    return 0;
 }

-static int coroutine_fn
-vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-                 QEMUIOVector *qiov, int flags)
+static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num,
+                                       const uint8_t *buf, int nb_sectors)
 {
    int ret;
    BDRVVVFATState *s = bs->opaque;
-    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
-    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-    void *buf;
-
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-
-    buf = g_try_malloc(bytes);
-    if (bytes && buf == NULL) {
-        return -ENOMEM;
-    }
-    qemu_iovec_to_buf(qiov, 0, buf, bytes);
-
    qemu_co_mutex_lock(&s->lock);
    ret = vvfat_write(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
-
-    g_free(buf);
-
    return ret;
 }

@@ -2948,40 +2904,26 @@ static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs,
    return BDRV_BLOCK_DATA;
 }

-static int coroutine_fn
-write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-                    QEMUIOVector *qiov, int flags)
-{
+static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
+	const uint8_t* buffer, int nb_sectors) {
    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
    return try_commit(s);
 }

 static void write_target_close(BlockDriverState *bs) {
    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
-    bdrv_unref_child(s->bs, s->qcow);
+    bdrv_unref(s->qcow);
    g_free(s->qcow_filename);
 }

 static BlockDriver vvfat_write_target = {
    .format_name        = "vvfat_write_target",
-    .bdrv_co_pwritev    = write_target_commit,
+    .bdrv_write         = write_target_commit,
    .bdrv_close         = write_target_close,
 };

-static void vvfat_qcow_options(int *child_flags, QDict *child_options,
-                               int parent_flags, QDict *parent_options)
+static int enable_write_target(BDRVVVFATState *s, Error **errp)
 {
-    qdict_set_default_str(child_options, BDRV_OPT_READ_ONLY, "off");
-    *child_flags = BDRV_O_NO_FLUSH;
-}
-
-static const BdrvChildRole child_vvfat_qcow = {
-    .inherit_options    = vvfat_qcow_options,
-};
-
-static int enable_write_target(BlockDriverState *bs, Error **errp)
-{
-    BDRVVVFATState *s = bs->opaque;
    BlockDriver *bdrv_qcow = NULL;
    BlockDriverState *backing;
    QemuOpts *opts = NULL;
@@ -3018,13 +2960,12 @@ static int enable_write_target(BlockDriverState *bs, Error **errp)
        goto err;
    }

+    s->qcow = NULL;
    options = qdict_new();
-    qdict_put(options, "write-target.driver", qstring_from_str("qcow"));
-    s->qcow = bdrv_open_child(s->qcow_filename, options, "write-target", bs,
-                              &child_vvfat_qcow, false, errp);
-    QDECREF(options);
-    if (!s->qcow) {
-        ret = -EINVAL;
+    qdict_put(options, "driver", qstring_from_str("qcow"));
+    ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options,
+                    BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp);
+    if (ret < 0) {
        goto err;
    }

@@ -3071,11 +3012,10 @@ static BlockDriver bdrv_vvfat = {

    .bdrv_parse_filename    = vvfat_parse_filename,
    .bdrv_file_open         = vvfat_open,
-    .bdrv_refresh_limits    = vvfat_refresh_limits,
    .bdrv_close             = vvfat_close,

-    .bdrv_co_preadv         = vvfat_co_preadv,
-    .bdrv_co_pwritev        = vvfat_co_pwritev,
+    .bdrv_read              = vvfat_co_read,
+    .bdrv_write             = vvfat_co_write,
    .bdrv_co_get_block_status = vvfat_co_get_block_status,
 };

--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -27,7 +27,7 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "block/aio.h"
-#include "block/raw-aio.h"
+#include "raw-aio.h"
 #include "qemu/event_notifier.h"
 #include "qemu/iov.h"
 #include <windows.h>
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -28,6 +28,10 @@ typedef struct NBDServerData {

 static NBDServerData *nbd_server;

+static void nbd_blockdev_client_closed(NBDClient *client, bool ignored)
+{
+    nbd_client_put(client);
+}

 static gboolean nbd_accept(QIOChannel *ioc, GIOCondition condition,
                           gpointer opaque)
@@ -46,7 +50,7 @@ static gboolean nbd_accept(QIOChannel *ioc, GIOCondition condition,

    nbd_client_new(NULL, cioc,
                   nbd_server->tlscreds, NULL,
-                   nbd_client_put);
+                   nbd_blockdev_client_closed);
    object_unref(OBJECT(cioc));
    return TRUE;
 }
@@ -145,8 +149,7 @@ void qmp_nbd_server_start(SocketAddress *addr,
 void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
                        Error **errp)
 {
-    BlockDriverState *bs = NULL;
-    BlockBackend *on_eject_blk;
+    BlockBackend *blk;
    NBDExport *exp;

    if (!nbd_server) {
@@ -159,22 +162,26 @@ void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
        return;
    }

-    on_eject_blk = blk_by_name(device);
-
-    bs = bdrv_lookup_bs(device, device, errp);
-    if (!bs) {
+    blk = blk_by_name(device);
+    if (!blk) {
+        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+                  "Device '%s' not found", device);
+        return;
+    }
+    if (!blk_is_inserted(blk)) {
+        error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, device);
        return;
    }

    if (!has_writable) {
        writable = false;
    }
-    if (bdrv_is_read_only(bs)) {
+    if (blk_is_read_only(blk)) {
        writable = false;
    }

-    exp = nbd_export_new(bs, 0, -1, writable ? 0 : NBD_FLAG_READ_ONLY,
-                         NULL, false, on_eject_blk, errp);
+    exp = nbd_export_new(blk, 0, -1, writable ? 0 : NBD_FLAG_READ_ONLY, NULL,
+                         errp);
    if (!exp) {
        return;
    }
--- a/blockdev.c
+++ b/blockdev.c
--- a/blockjob.c
+++ b/blockjob.c
@@ -33,7 +33,6 @@
 #include "qapi/qmp/qerror.h"
 #include "qapi/qmp/qjson.h"
 #include "qemu/coroutine.h"
-#include "qemu/id.h"
 #include "qmp-commands.h"
 #include "qemu/timer.h"
 #include "qapi-event.h"
@@ -51,106 +50,17 @@ struct BlockJobTxn {
    int refcnt;
 };

-static QLIST_HEAD(, BlockJob) block_jobs = QLIST_HEAD_INITIALIZER(block_jobs);
-
-BlockJob *block_job_next(BlockJob *job)
-{
-    if (!job) {
-        return QLIST_FIRST(&block_jobs);
-    }
-    return QLIST_NEXT(job, job_list);
-}
-
-BlockJob *block_job_get(const char *id)
+void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
+                       int64_t speed, BlockCompletionFunc *cb,
+                       void *opaque, Error **errp)
 {
    BlockJob *job;

-    QLIST_FOREACH(job, &block_jobs, job_list) {
-        if (!strcmp(id, job->id)) {
-            return job;
-        }
-    }
-
-    return NULL;
-}
-
-/* Normally the job runs in its BlockBackend's AioContext.  The exception is
- * block_job_defer_to_main_loop() where it runs in the QEMU main loop.  Code
- * that supports both cases uses this helper function.
- */
-static AioContext *block_job_get_aio_context(BlockJob *job)
-{
-    return job->deferred_to_main_loop ?
-           qemu_get_aio_context() :
-           blk_get_aio_context(job->blk);
-}
-
-static void block_job_attached_aio_context(AioContext *new_context,
-                                           void *opaque)
-{
-    BlockJob *job = opaque;
-
-    if (job->driver->attached_aio_context) {
-        job->driver->attached_aio_context(job, new_context);
-    }
-
-    block_job_resume(job);
-}
-
-static void block_job_detach_aio_context(void *opaque)
-{
-    BlockJob *job = opaque;
-
-    /* In case the job terminates during aio_poll()... */
-    block_job_ref(job);
-
-    block_job_pause(job);
-
-    if (!job->paused) {
-        /* If job is !job->busy this kicks it into the next pause point. */
-        block_job_enter(job);
-    }
-    while (!job->paused && !job->completed) {
-        aio_poll(block_job_get_aio_context(job), true);
-    }
-
-    block_job_unref(job);
-}
-
-void *block_job_create(const char *job_id, const BlockJobDriver *driver,
-                       BlockDriverState *bs, int64_t speed,
-                       BlockCompletionFunc *cb, void *opaque, Error **errp)
-{
-    BlockBackend *blk;
-    BlockJob *job;
-
-    assert(cb);
    if (bs->job) {
        error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
-
-    if (job_id == NULL) {
-        job_id = bdrv_get_device_name(bs);
-        if (!*job_id) {
-            error_setg(errp, "An explicit job ID is required for this node");
-            return NULL;
-        }
-    }
-
-    if (!id_wellformed(job_id)) {
-        error_setg(errp, "Invalid job ID '%s'", job_id);
-        return NULL;
-    }
-
-    if (block_job_get(job_id)) {
-        error_setg(errp, "Job ID '%s' already in use", job_id);
-        return NULL;
-    }
-
-    blk = blk_new();
-    blk_insert_bs(blk, bs);
-
+    bdrv_ref(bs);
    job = g_malloc0(driver->instance_size);
    error_setg(&job->blocker, "block device is in use by block job: %s",
               BlockJobType_lookup[driver->job_type]);
@@ -158,19 +68,14 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);

    job->driver        = driver;
-    job->id            = g_strdup(job_id);
-    job->blk           = blk;
+    job->id            = g_strdup(bdrv_get_device_name(bs));
+    job->bs            = bs;
    job->cb            = cb;
    job->opaque        = opaque;
    job->busy          = true;
    job->refcnt        = 1;
    bs->job = job;

-    QLIST_INSERT_HEAD(&block_jobs, job, job_list);
-
-    blk_add_aio_context_notifier(blk, block_job_attached_aio_context,
-                                 block_job_detach_aio_context, job);
-
    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;
@@ -193,16 +98,11 @@ void block_job_ref(BlockJob *job)
 void block_job_unref(BlockJob *job)
 {
    if (--job->refcnt == 0) {
-        BlockDriverState *bs = blk_bs(job->blk);
-        bs->job = NULL;
-        bdrv_op_unblock_all(bs, job->blocker);
-        blk_remove_aio_context_notifier(job->blk,
-                                        block_job_attached_aio_context,
-                                        block_job_detach_aio_context, job);
-        blk_unref(job->blk);
+        job->bs->job = NULL;
+        bdrv_op_unblock_all(job->bs, job->blocker);
+        bdrv_unref(job->bs);
        error_free(job->blocker);
        g_free(job->id);
-        QLIST_REMOVE(job, job_list);
        g_free(job);
    }
 }
@@ -240,7 +140,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
    txn->aborting = true;
    /* We are the first failed job. Cancel other jobs. */
    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
-        ctx = blk_get_aio_context(other_job->blk);
+        ctx = bdrv_get_aio_context(other_job->bs);
        aio_context_acquire(ctx);
    }
    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
@@ -257,7 +157,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
        assert(other_job->completed);
    }
    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = blk_get_aio_context(other_job->blk);
+        ctx = bdrv_get_aio_context(other_job->bs);
        block_job_completed_single(other_job);
        aio_context_release(ctx);
    }
@@ -279,7 +179,7 @@ static void block_job_completed_txn_success(BlockJob *job)
    }
    /* We are the last completed job, commit the transaction. */
    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = blk_get_aio_context(other_job->blk);
+        ctx = bdrv_get_aio_context(other_job->bs);
        aio_context_acquire(ctx);
        assert(other_job->ret == 0);
        block_job_completed_single(other_job);
@@ -289,7 +189,9 @@ static void block_job_completed_txn_success(BlockJob *job)

 void block_job_completed(BlockJob *job, int ret)
 {
-    assert(blk_bs(job->blk)->job == job);
+    BlockDriverState *bs = job->bs;
+
+    assert(bs->job == job);
    assert(!job->completed);
    job->completed = true;
    job->ret = ret;
@@ -322,8 +224,7 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
 void block_job_complete(BlockJob *job, Error **errp)
 {
    if (job->pause_count || job->cancelled || !job->driver->complete) {
-        error_setg(errp, "The active block job '%s' cannot be completed",
-                   job->id);
+        error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id);
        return;
    }

@@ -335,37 +236,11 @@ void block_job_pause(BlockJob *job)
    job->pause_count++;
 }

-static bool block_job_should_pause(BlockJob *job)
+bool block_job_is_paused(BlockJob *job)
 {
    return job->pause_count > 0;
 }

-void coroutine_fn block_job_pause_point(BlockJob *job)
-{
-    if (!block_job_should_pause(job)) {
-        return;
-    }
-    if (block_job_is_cancelled(job)) {
-        return;
-    }
-
-    if (job->driver->pause) {
-        job->driver->pause(job);
-    }
-
-    if (block_job_should_pause(job) && !block_job_is_cancelled(job)) {
-        job->paused = true;
-        job->busy = false;
-        qemu_coroutine_yield(); /* wait for block_job_resume() */
-        job->busy = true;
-        job->paused = false;
-    }
-
-    if (job->driver->resume) {
-        job->driver->resume(job);
-    }
-}
-
 void block_job_resume(BlockJob *job)
 {
    assert(job->pause_count > 0);
@@ -378,15 +253,15 @@ void block_job_resume(BlockJob *job)

 void block_job_enter(BlockJob *job)
 {
+    block_job_iostatus_reset(job);
    if (job->co && !job->busy) {
-        qemu_coroutine_enter(job->co);
+        qemu_coroutine_enter(job->co, NULL);
    }
 }

 void block_job_cancel(BlockJob *job)
 {
    job->cancelled = true;
-    block_job_iostatus_reset(job);
    block_job_enter(job);
 }

@@ -407,10 +282,11 @@ static int block_job_finish_sync(BlockJob *job,
                                 void (*finish)(BlockJob *, Error **errp),
                                 Error **errp)
 {
+    BlockDriverState *bs = job->bs;
    Error *local_err = NULL;
    int ret;

-    assert(blk_bs(job->blk)->job == job);
+    assert(bs->job == job);

    block_job_ref(job);
    finish(job, &local_err);
@@ -420,7 +296,9 @@ static int block_job_finish_sync(BlockJob *job,
        return -EBUSY;
    }
    while (!job->completed) {
-        aio_poll(block_job_get_aio_context(job), true);
+        aio_poll(job->deferred_to_main_loop ? qemu_get_aio_context() :
+                                              bdrv_get_aio_context(bs),
+                 true);
    }
    ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
    block_job_unref(job);
@@ -440,19 +318,6 @@ int block_job_cancel_sync(BlockJob *job)
    return block_job_finish_sync(job, &block_job_cancel_err, NULL);
 }

-void block_job_cancel_sync_all(void)
-{
-    BlockJob *job;
-    AioContext *aio_context;
-
-    while ((job = QLIST_FIRST(&block_jobs))) {
-        aio_context = blk_get_aio_context(job->blk);
-        aio_context_acquire(aio_context);
-        block_job_cancel_sync(job);
-        aio_context_release(aio_context);
-    }
-}
-
 int block_job_complete_sync(BlockJob *job, Error **errp)
 {
    return block_job_finish_sync(job, &block_job_complete, errp);
@@ -468,12 +333,12 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
    }

    job->busy = false;
-    if (!block_job_should_pause(job)) {
-        co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns);
+    if (block_job_is_paused(job)) {
+        qemu_coroutine_yield();
+    } else {
+        co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
    }
    job->busy = true;
-
-    block_job_pause_point(job);
 }

 void block_job_yield(BlockJob *job)
@@ -486,12 +351,8 @@ void block_job_yield(BlockJob *job)
    }

    job->busy = false;
-    if (!block_job_should_pause(job)) {
-        qemu_coroutine_yield();
-    }
+    qemu_coroutine_yield();
    job->busy = true;
-
-    block_job_pause_point(job);
 }

 BlockJobInfo *block_job_query(BlockJob *job)
@@ -550,14 +411,14 @@ void block_job_event_ready(BlockJob *job)
                                    job->speed, &error_abort);
 }

-BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
+BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
+                                        BlockdevOnError on_err,
                                        int is_read, int error)
 {
    BlockErrorAction action;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
-    case BLOCKDEV_ON_ERROR_AUTO:
        action = (error == ENOSPC) ?
                 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
        break;
@@ -582,6 +443,9 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
        job->user_paused = true;
        block_job_pause(job);
        block_job_iostatus_set_err(job, error);
+        if (bs->blk && bs != job->bs) {
+            blk_iostatus_set_err(bs->blk, error);
+        }
    }
    return action;
 }
@@ -605,7 +469,7 @@ static void block_job_defer_to_main_loop_bh(void *opaque)
    aio_context_acquire(data->aio_context);

    /* Fetch BDS AioContext again, in case it has changed */
-    aio_context = blk_get_aio_context(data->job->blk);
+    aio_context = bdrv_get_aio_context(data->job->bs);
    aio_context_acquire(aio_context);

    data->job->deferred_to_main_loop = false;
@@ -625,7 +489,7 @@ void block_job_defer_to_main_loop(BlockJob *job,
    BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data));
    data->job = job;
    data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data);
-    data->aio_context = blk_get_aio_context(job->blk);
+    data->aio_context = bdrv_get_aio_context(job->bs);
    data->fn = fn;
    data->opaque = opaque;
    job->deferred_to_main_loop = true;
--- a/bootdevice.c
+++ b/bootdevice.c
@@ -28,7 +28,6 @@
 #include "qapi/visitor.h"
 #include "qemu/error-report.h"
 #include "hw/hw.h"
-#include "hw/qdev-core.h"

 typedef struct FWBootEntry FWBootEntry;

@@ -302,7 +301,9 @@ static void device_set_bootindex(Object *obj, Visitor *v, const char *name,
    add_boot_device_path(*prop->bootindex, prop->dev, prop->suffix);

 out:
-    error_propagate(errp, local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }
 }

 static void property_release_bootindex(Object *obj, const char *name,
--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@@ -1,6 +1,7 @@
 /* This is the Linux kernel elf-loading code, ported into user space */

 #include "qemu/osdep.h"
+#include <sys/mman.h>

 #include "qemu.h"
 #include "disas/disas.h"
--- a/bsd-user/i386/target_syscall.h
+++ b/bsd-user/i386/target_syscall.h
@@ -162,4 +162,4 @@ struct target_vm86plus_struct {

 #define UNAME_MACHINE "i386"

-#endif /* TARGET_SYSCALL_H */
+#endif  /* TARGET_SYSCALL_H */
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -17,23 +17,18 @@
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
 #include "qemu/osdep.h"
-#include "qemu-version.h"
 #include <machine/trap.h>
+#include <sys/mman.h>

-#include "qapi/error.h"
 #include "qemu.h"
-#include "qemu/config-file.h"
 #include "qemu/path.h"
 #include "qemu/help_option.h"
 /* For tb_lock */
 #include "cpu.h"
-#include "exec/exec-all.h"
 #include "tcg.h"
 #include "qemu/timer.h"
 #include "qemu/envlist.h"
 #include "exec/log.h"
-#include "trace/control.h"
-#include "glib-compat.h"

 int singlestep;
 unsigned long mmap_min_addr;
@@ -172,7 +167,7 @@ void cpu_loop(CPUX86State *env)
    //target_siginfo_t info;

    for(;;) {
-        trapnr = cpu_exec(cs);
+        trapnr = cpu_x86_exec(cs);
        switch(trapnr) {
        case 0x80:
            /* syscall from int $0x80 */
@@ -513,7 +508,7 @@ void cpu_loop(CPUSPARCState *env)
    //target_siginfo_t info;

    while (1) {
-        trapnr = cpu_exec(cs);
+        trapnr = cpu_sparc_exec(cs);

        switch (trapnr) {
 #ifndef TARGET_SPARC64
@@ -668,8 +663,7 @@ void cpu_loop(CPUSPARCState *env)

 static void usage(void)
 {
-    printf("qemu-" TARGET_NAME " version " QEMU_VERSION QEMU_PKGVERSION
-           ", " QEMU_COPYRIGHT "\n"
+    printf("qemu-" TARGET_NAME " version " QEMU_VERSION ", Copyright (c) 2003-2008 Fabrice Bellard\n"
           "usage: qemu-" TARGET_NAME " [options] program [arguments...]\n"
           "BSD CPU emulator (compiled for %s emulation)\n"
           "\n"
@@ -692,8 +686,6 @@ static void usage(void)
           "-p pagesize       set the host page size to 'pagesize'\n"
           "-singlestep       always run in singlestep mode\n"
           "-strace           log system calls\n"
-           "-trace            [[enable=]<pattern>][,events=<file>][,file=<file>]\n"
-           "                  specify tracing options\n"
           "\n"
           "Environment variables:\n"
           "QEMU_STRACE       Print system calls and arguments similar to the\n"
@@ -742,7 +734,6 @@ int main(int argc, char **argv)
    int gdbstub_port = 0;
    char **target_environ, **wrk;
    envlist_t *envlist = NULL;
-    char *trace_file = NULL;
    bsd_type = target_openbsd;

    if (argc <= 1)
@@ -761,11 +752,12 @@ int main(int argc, char **argv)
    }

    cpu_model = NULL;
-
-    qemu_add_opts(&qemu_trace_opts);
+#if defined(cpudef_setup)
+    cpudef_setup(); /* parse cpu definitions in target config file (TBD) */
+#endif

    optind = 1;
-    for (;;) {
+    for(;;) {
        if (optind >= argc)
            break;
        r = argv[optind];
@@ -850,17 +842,14 @@ int main(int argc, char **argv)
            singlestep = 1;
        } else if (!strcmp(r, "strace")) {
            do_strace = 1;
-        } else if (!strcmp(r, "trace")) {
-            g_free(trace_file);
-            trace_file = trace_opt_parse(optarg);
-        } else {
+        } else
+        {
            usage();
        }
    }

    /* init debug */
-    qemu_log_needs_buffers();
-    qemu_set_log_filename(log_file, &error_fatal);
+    qemu_set_log_filename(log_file);
    if (log_mask) {
        int mask;

@@ -877,11 +866,6 @@ int main(int argc, char **argv)
    }
    filename = argv[optind];

-    if (!trace_init_backends()) {
-        exit(1);
-    }
-    trace_init_file(trace_file);
-
    /* Zero out regs */
    memset(regs, 0, sizeof(struct target_pt_regs));

@@ -1133,7 +1117,6 @@ int main(int argc, char **argv)
        gdbserver_start (gdbstub_port);
        gdb_handlesig(cpu, 0);
    }
-    trace_init_vcpu_events();
    cpu_loop(env);
    /* never exits */
    return 0;
--- a/bsd-user/mmap.c
+++ b/bsd-user/mmap.c
@@ -17,6 +17,7 @@
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
 #include "qemu/osdep.h"
+#include <sys/mman.h>

 #include "qemu.h"
 #include "qemu-common.h"
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -19,7 +19,6 @@


 #include "cpu.h"
-#include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"

 #undef DEBUG_REMAP
@@ -209,6 +208,8 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
                       abi_ulong new_addr);
 int target_msync(abi_ulong start, abi_ulong len, int flags);
 extern unsigned long last_brk;
+void cpu_list_lock(void);
+void cpu_list_unlock(void);
 #if defined(CONFIG_USE_NPTL)
 void mmap_fork_start(void);
 void mmap_fork_end(int child);
@@ -356,7 +357,7 @@ static inline void *lock_user(int type, abi_ulong guest_addr, long len, int copy
 #ifdef DEBUG_REMAP
    {
        void *addr;
-        addr = g_malloc(len);
+        addr = malloc(len);
        if (copy)
            memcpy(addr, g2h(guest_addr), len);
        else
@@ -382,7 +383,7 @@ static inline void unlock_user(void *host_ptr, abi_ulong guest_addr,
        return;
    if (len > 0)
        memcpy(g2h(guest_addr), host_ptr, len);
-    g_free(host_ptr);
+    free(host_ptr);
 #endif
 }

--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .7.50
 .6.2