seabios: update binaries from 1.9.1 to 1.9.3

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
seabios: update 128k config
2016-07-04 11:28:58 +02:00 · 2016-07-04 11:28:58 +02:00 · 2016-07-04 11:28:58 +02:00 · 2016-07-04 11:28:58 +02:00 · 2016-07-01 19:29:27 +01:00 · 2016-07-01 16:06:57 +01:00
1418 changed files with 64682 additions and 25436 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 /config-target.*
 /config.status
 /config-temp
+/trace-events-all
 /trace/generated-tracers.h
 /trace/generated-tracers.c
 /trace/generated-tracers-dtrace.h
@@ -108,4 +109,5 @@
 cscope.*
 tags
 TAGS
+docker-src.*
 *~
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,6 +17,7 @@ addons:
      - libgtk-3-dev
      - libiscsi-dev
      - liblttng-ust-dev
+      - libnfs-dev
      - libncurses5-dev
      - libnss3-dev
      - libpixman-1-dev
@@ -63,9 +64,6 @@ script:
  - make -j3 && ${TEST_CMD}
 matrix:
  include:
-    # Sparse is GCC only
-    - env: CONFIG="--enable-sparse"
-      compiler: gcc
    # gprof/gcov are GCC features
    - env: CONFIG="--enable-gprof --enable-gcov --disable-pie"
      compiler: gcc
@@ -88,3 +86,13 @@ matrix:
    - env: CONFIG=""
      os: osx
      compiler: clang
+    - env: CONFIG=""
+      sudo: required
+      addons:
+      dist: trusty
+      compiler: gcc
+      before_install:
+        - sudo apt-get update -qq
+        - sudo apt-get build-dep -qq qemu
+        - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
+        - git submodule update --init --recursive
--- a/47
+++ b/47
@@ -165,6 +165,7 @@ F: hw/openrisc/
 F: tests/tcg/openrisc/

 PowerPC
+M: David Gibson <david@gibson.dropbear.id.au>
 M: Alexander Graf <agraf@suse.de>
 L: qemu-ppc@nongnu.org
 S: Maintained
@@ -188,8 +189,8 @@ F: hw/sh4/
 F: disas/sh4.c

 SPARC
-M: Blue Swirl <blauwirbel@gmail.com>
 M: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
+M: Artyom Tarasenko <atar4qemu@gmail.com>
 S: Maintained
 F: target-sparc/
 F: hw/sparc/
@@ -597,7 +598,7 @@ F: hw/pci-host/grackle.c
 F: hw/misc/macio/

 PReP
-M: Andreas Färber <andreas.faerber@web.de>
+L: qemu-devel@nongnu.org
 L: qemu-ppc@nongnu.org
 S: Odd Fixes
 F: hw/ppc/prep.c
@@ -779,6 +780,7 @@ F: hw/ipack/

 PCI
 M: Michael S. Tsirkin <mst@redhat.com>
+M: Marcel Apfelbaum <marcel@redhat.com>
 S: Supported
 F: include/hw/pci/*
 F: hw/misc/pci-testdev.c
@@ -886,12 +888,13 @@ F: include/hw/virtio/

 virtio-9p
 M: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
-M: Greg Kurz <gkurz@linux.vnet.ibm.com>
+M: Greg Kurz <groug@kaod.org>
 S: Supported
 F: hw/9pfs/
 F: fsdev/
 F: tests/virtio-9p-test.c
 T: git git://github.com/kvaneesh/QEMU.git
+T: git git://github.com/gkurz/qemu.git 9p-next

 virtio-blk
 M: Stefan Hajnoczi <stefanha@redhat.com>
@@ -953,6 +956,14 @@ S: Maintained
 F: hw/*/xilinx_*
 F: include/hw/xilinx.h

+Network packet abstractions
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: include/net/eth.h
+F: net/eth.c
+F: hw/net/net_rx_pkt*
+F: hw/net/net_tx_pkt*
+
 Vmware
 M: Dmitry Fleytman <dmitry@daynix.com>
 S: Maintained
@@ -972,6 +983,16 @@ F: hw/acpi/nvdimm.c
 F: hw/mem/nvdimm.c
 F: include/hw/mem/nvdimm.h

+e1000x
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: hw/net/e1000x*
+
+e1000e
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: hw/net/e1000e*
+
 Subsystems
 ----------
 Audio
@@ -1046,7 +1067,7 @@ S: Supported
 F: scripts/coverity-model.c

 CPU
-M: Andreas Färber <afaerber@suse.de>
+L: qemu-devel@nongnu.org
 S: Supported
 F: qom/cpu.c
 F: include/qom/cpu.h
@@ -1105,7 +1126,6 @@ F: ui/
 F: include/ui/

 Cocoa graphics
-M: Andreas Färber <andreas.faerber@web.de>
 M: Peter Maydell <peter.maydell@linaro.org>
 S: Odd Fixes
 F: ui/cocoa.m
@@ -1242,7 +1262,6 @@ F: docs/tracing.txt
 T: git git://github.com/stefanha/qemu.git tracing

 Checkpatch
-M: Blue Swirl <blauwirbel@gmail.com>
 S: Odd Fixes
 F: scripts/checkpatch.pl

@@ -1315,8 +1334,7 @@ F: thunk.c
 F: user-exec.c

 BSD user
-M: Blue Swirl <blauwirbel@gmail.com>
-S: Maintained
+S: Orphan
 F: bsd-user/

 Linux user
@@ -1379,8 +1397,7 @@ F: tcg/s390/
 F: disas/s390.c

 SPARC target
-M: Blue Swirl <blauwirbel@gmail.com>
-S: Maintained
+S: Odd Fixes
 F: tcg/sparc/
 F: disas/sparc.c

@@ -1400,9 +1417,8 @@ S: Orphan

 Stable 0.15
 L: qemu-stable@nongnu.org
-M: Andreas Färber <afaerber@suse.de>
 T: git git://git.qemu-project.org/qemu-stable-0.15.git
-S: Supported
+S: Orphan

 Stable 0.14
 L: qemu-stable@nongnu.org
@@ -1616,3 +1632,10 @@ Build system architecture
 M: Daniel P. Berrange <berrange@redhat.com>
 S: Odd Fixes
 F: docs/build-system.txt
+
+Docker testing
+--------------
+Docker based testing framework and cases
+M: Fam Zheng <famz@redhat.com>
+S: Maintained
+F: tests/docker/
--- a/75
+++ b/75
@@ -6,7 +6,7 @@ BUILD_DIR=$(CURDIR)
 # Before including a proper config-host.mak, assume we are in the source tree
 SRC_PATH=.

-UNCHECKED_GOALS := %clean TAGS cscope ctags
+UNCHECKED_GOALS := %clean TAGS cscope ctags docker docker-%

 # All following code might depend on configuration variables
 ifneq ($(wildcard config-host.mak),)
@@ -30,7 +30,6 @@ CONFIG_ALL=y
 -include config-all-devices.mak
 -include config-all-disas.mak

-include $(SRC_PATH)/rules.mak
 config-host.mak: $(SRC_PATH)/configure
 	@echo $@ is out-of-date, running configure
 	@# TODO: The next lines include code which supports a smooth
@@ -49,7 +48,9 @@ ifneq ($(filter-out $(UNCHECKED_GOALS),$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fa
 endif
 endif

-GENERATED_HEADERS = config-host.h qemu-options.def
+include $(SRC_PATH)/rules.mak
+
+GENERATED_HEADERS = qemu-version.h config-host.h qemu-options.def
 GENERATED_HEADERS += qmp-commands.h qapi-types.h qapi-visit.h qapi-event.h
 GENERATED_SOURCES += qmp-marshal.c qapi-types.c qapi-visit.c qapi-event.c
 GENERATED_HEADERS += qmp-introspect.h
@@ -81,7 +82,7 @@ Makefile: ;
 configure: ;

 .PHONY: all clean cscope distclean dvi html info install install-doc \
-	pdf recurse-all speed test dist msi
+	pdf recurse-all speed test dist msi FORCE

 $(call set-vpath, $(SRC_PATH))

@@ -92,9 +93,6 @@ HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF)
 ifdef BUILD_DOCS
 DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8
 DOCS+=qmp-commands.txt
-ifdef CONFIG_LINUX
-DOCS+=kvm_stat.1
-endif
 ifdef CONFIG_VIRTFS
 DOCS+=fsdev/virtfs-proxy-helper.1
 endif
@@ -119,7 +117,7 @@ endif

 -include $(SUBDIR_DEVICES_MAK_DEP)

-%/config-devices.mak: default-configs/%.mak
+%/config-devices.mak: default-configs/%.mak $(SRC_PATH)/scripts/make_device_config.sh
 	$(call quiet-command, \
            $(SHELL) $(SRC_PATH)/scripts/make_device_config.sh $< $*-config-devices.mak.d $@ > $@.tmp, "  GEN   $@.tmp")
 	$(call quiet-command, if test -f $@; then \
@@ -164,14 +162,34 @@ dummy := $(call unnest-vars,, \
                common-obj-m)

 ifneq ($(wildcard config-host.mak),)
-include $(SRC_PATH)/tests/Makefile
+include $(SRC_PATH)/tests/Makefile.include
 endif

 all: $(DOCS) $(TOOLS) $(HELPERS-y) recurse-all modules

+qemu-version.h: FORCE
+	$(call quiet-command, \
+		(cd $(SRC_PATH); \
+		printf '#define QEMU_PKGVERSION '; \
+		if test -n "$(PKGVERSION)"; then \
+			printf '"$(PKGVERSION)"\n'; \
+		else \
+			if test -d .git; then \
+				printf '" ('; \
+				git describe --match 'v*' 2>/dev/null | tr -d '\n'; \
+				if ! git diff-index --quiet HEAD &>/dev/null; then \
+					printf -- '-dirty'; \
+				fi; \
+				printf ')"\n'; \
+			else \
+				printf '""\n'; \
+			fi; \
+		fi) > $@.tmp)
+	$(call quiet-command, cmp --quiet $@ $@.tmp || mv $@.tmp $@)
+
 config-host.h: config-host.h-timestamp
 config-host.h-timestamp: config-host.mak
-qemu-options.def: $(SRC_PATH)/qemu-options.hx
+qemu-options.def: $(SRC_PATH)/qemu-options.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $@")

 SUBDIR_RULES=$(patsubst %,subdir-%, $(TARGET_DIRS))
@@ -243,7 +261,7 @@ qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o libqemuutil.a libqemustub.a
 fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal.o fsdev/9p-iov-marshal.o libqemuutil.a libqemustub.a
 fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap

-qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx
+qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $@")

 qemu-ga$(EXESUF): LIBS = $(LIBS_QGA)
@@ -356,6 +374,7 @@ clean:
 	if test -d $$d; then $(MAKE) -C $$d $@ || exit 1; fi; \
 	rm -f $$d/qemu-options.def; \
        done
+	rm -f $(SUBDIR_DEVICES_MAK) config-all-devices.mak

 VERSION ?= $(shell cat VERSION)

@@ -389,7 +408,8 @@ common  de-ch  es     fo  fr-ca  hu     ja  mk  nl-be      pt  sl     tr \
 bepo    cz

 ifdef INSTALL_BLOBS
-BLOBS=bios.bin bios-256k.bin sgabios.bin vgabios.bin vgabios-cirrus.bin \
+BLOBS=bios.bin bios-256k.bin bios-fast.bin \
+sgabios.bin vgabios.bin vgabios-cirrus.bin \
 vgabios-stdvga.bin vgabios-vmware.bin vgabios-qxl.bin vgabios-virtio.bin \
 acpi-dsdt.aml \
 ppc_rom.bin openbios-sparc32 openbios-sparc64 openbios-ppc QEMU,tcx.bin QEMU,cgthree.bin \
@@ -468,7 +488,7 @@ endif
 	set -e; for x in $(KEYMAPS); do \
 		$(INSTALL_DATA) $(SRC_PATH)/pc-bios/keymaps/$$x "$(DESTDIR)$(qemu_datadir)/keymaps"; \
 	done
-	$(INSTALL_DATA) $(SRC_PATH)/trace-events "$(DESTDIR)$(qemu_datadir)/trace-events"
+	$(INSTALL_DATA) $(BUILD_DIR)/trace-events-all "$(DESTDIR)$(qemu_datadir)/trace-events-all"
 	for d in $(TARGET_DIRS); do \
 	$(MAKE) $(SUBDIR_MAKEFLAGS) TARGET_DIR=$$d/ -C $$d $@ || exit 1 ; \
        done
@@ -479,12 +499,12 @@ test speed: all

 .PHONY: ctags
 ctags:
-	rm -f $@
+	rm -f tags
 	find "$(SRC_PATH)" -name '*.[hc]' -exec ctags --append {} +

 .PHONY: TAGS
 TAGS:
-	rm -f $@
+	rm -f TAGS
 	find "$(SRC_PATH)" -name '*.[hc]' -exec etags --append {} +

 cscope:
@@ -525,19 +545,19 @@ TEXIFLAG=$(if $(V),,--quiet)
 %.pdf: %.texi
 	$(call quiet-command,texi2pdf $(TEXIFLAG) -I . $<,"  GEN   $@")

-qemu-options.texi: $(SRC_PATH)/qemu-options.hx
+qemu-options.texi: $(SRC_PATH)/qemu-options.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")

-qemu-monitor.texi: $(SRC_PATH)/hmp-commands.hx
+qemu-monitor.texi: $(SRC_PATH)/hmp-commands.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")

-qemu-monitor-info.texi: $(SRC_PATH)/hmp-commands-info.hx
+qemu-monitor-info.texi: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")

-qmp-commands.txt: $(SRC_PATH)/qmp-commands.hx
+qmp-commands.txt: $(SRC_PATH)/qmp-commands.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -q < $< > $@,"  GEN   $@")

-qemu-img-cmds.texi: $(SRC_PATH)/qemu-img-cmds.hx
+qemu-img-cmds.texi: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")

 qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi
@@ -545,8 +565,9 @@ qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu.pod && \
 	  $(POD2MAN) --section=1 --center=" " --release=" " qemu.pod > $@, \
 	  "  GEN   $@")
+qemu.1: qemu-option-trace.texi

-qemu-img.1: qemu-img.texi qemu-img-cmds.texi
+qemu-img.1: qemu-img.texi qemu-option-trace.texi qemu-img-cmds.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu-img.pod && \
 	  $(POD2MAN) --section=1 --center=" " --release=" " qemu-img.pod > $@, \
@@ -558,7 +579,7 @@ fsdev/virtfs-proxy-helper.1: fsdev/virtfs-proxy-helper.texi
 	  $(POD2MAN) --section=1 --center=" " --release=" " fsdev/virtfs-proxy-helper.pod > $@, \
 	  "  GEN   $@")

-qemu-nbd.8: qemu-nbd.texi
+qemu-nbd.8: qemu-nbd.texi qemu-option-trace.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu-nbd.pod && \
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-nbd.pod > $@, \
@@ -570,19 +591,13 @@ qemu-ga.8: qemu-ga.texi
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-ga.pod > $@, \
 	  "  GEN   $@")

-kvm_stat.1: scripts/kvm/kvm_stat.texi
-	$(call quiet-command, \
-	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< kvm_stat.pod && \
-	  $(POD2MAN) --section=1 --center=" " --release=" " kvm_stat.pod > $@, \
-	  "  GEN   $@")
-
 dvi: qemu-doc.dvi qemu-tech.dvi
 html: qemu-doc.html qemu-tech.html
 info: qemu-doc.info qemu-tech.info
 pdf: qemu-doc.pdf qemu-tech.pdf

 qemu-doc.dvi qemu-doc.html qemu-doc.info qemu-doc.pdf: \
-	qemu-img.texi qemu-nbd.texi qemu-options.texi \
+	qemu-img.texi qemu-nbd.texi qemu-options.texi qemu-option-trace.texi \
 	qemu-monitor.texi qemu-img-cmds.texi qemu-ga.texi \
 	qemu-monitor-info.texi

@@ -651,3 +666,5 @@ endif
 # Include automatically generated dependency files
 # Dependencies in Makefile.objs files come from our recursive subdir rules
 -include $(wildcard *.d tests/*.d)
+
+include $(SRC_PATH)/tests/docker/Makefile.include
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -52,7 +52,6 @@ common-obj-$(CONFIG_LINUX) += fsdev/
 common-obj-y += migration/
 common-obj-y += qemu-char.o #aio.o
 common-obj-y += page_cache.o
-common-obj-y += qjson.o

 common-obj-$(CONFIG_SPICE) += spice-qemu-char.o

@@ -116,3 +115,46 @@ qga-vss-dll-obj-y = qga/
 # contrib
 ivshmem-client-obj-y = contrib/ivshmem-client/
 ivshmem-server-obj-y = contrib/ivshmem-server/
+
+
+######################################################################
+trace-events-y = trace-events
+trace-events-y += util/trace-events
+trace-events-y += crypto/trace-events
+trace-events-y += io/trace-events
+trace-events-y += migration/trace-events
+trace-events-y += block/trace-events
+trace-events-y += hw/block/trace-events
+trace-events-y += hw/char/trace-events
+trace-events-y += hw/intc/trace-events
+trace-events-y += hw/net/trace-events
+trace-events-y += hw/virtio/trace-events
+trace-events-y += hw/audio/trace-events
+trace-events-y += hw/misc/trace-events
+trace-events-y += hw/usb/trace-events
+trace-events-y += hw/scsi/trace-events
+trace-events-y += hw/nvram/trace-events
+trace-events-y += hw/display/trace-events
+trace-events-y += hw/input/trace-events
+trace-events-y += hw/timer/trace-events
+trace-events-y += hw/dma/trace-events
+trace-events-y += hw/sparc/trace-events
+trace-events-y += hw/sd/trace-events
+trace-events-y += hw/isa/trace-events
+trace-events-y += hw/i386/trace-events
+trace-events-y += hw/9pfs/trace-events
+trace-events-y += hw/ppc/trace-events
+trace-events-y += hw/pci/trace-events
+trace-events-y += hw/s390x/trace-events
+trace-events-y += hw/vfio/trace-events
+trace-events-y += hw/acpi/trace-events
+trace-events-y += hw/arm/trace-events
+trace-events-y += hw/alpha/trace-events
+trace-events-y += ui/trace-events
+trace-events-y += audio/trace-events
+trace-events-y += net/trace-events
+trace-events-y += target-sparc/trace-events
+trace-events-y += target-s390x/trace-events
+trace-events-y += target-ppc/trace-events
+trace-events-y += qom/trace-events
+trace-events-y += linux-user/trace-events
--- a/Makefile.target
+++ b/Makefile.target
@@ -48,7 +48,7 @@ else
 TARGET_TYPE=system
 endif

-$(QEMU_PROG).stp-installed: $(SRC_PATH)/trace-events
+$(QEMU_PROG).stp-installed: $(BUILD_DIR)/trace-events-all
 	$(call quiet-command,$(TRACETOOL) \
 		--format=stap \
 		--backends=$(TRACE_BACKENDS) \
@@ -57,7 +57,7 @@ $(QEMU_PROG).stp-installed: $(SRC_PATH)/trace-events
 		--target-type=$(TARGET_TYPE) \
 		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG).stp-installed")

-$(QEMU_PROG).stp: $(SRC_PATH)/trace-events
+$(QEMU_PROG).stp: $(BUILD_DIR)/trace-events-all
 	$(call quiet-command,$(TRACETOOL) \
 		--format=stap \
 		--backends=$(TRACE_BACKENDS) \
@@ -66,7 +66,7 @@ $(QEMU_PROG).stp: $(SRC_PATH)/trace-events
 		--target-type=$(TARGET_TYPE) \
 		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG).stp")

-$(QEMU_PROG)-simpletrace.stp: $(SRC_PATH)/trace-events
+$(QEMU_PROG)-simpletrace.stp: $(BUILD_DIR)/trace-events-all
 	$(call quiet-command,$(TRACETOOL) \
 		--format=simpletrace-stap \
 		--backends=$(TRACE_BACKENDS) \
@@ -108,7 +108,9 @@ obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/dpd/decimal128.o

 ifdef CONFIG_LINUX_USER

-QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) -I$(SRC_PATH)/linux-user
+QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) \
+             -I$(SRC_PATH)/linux-user/host/$(ARCH) \
+             -I$(SRC_PATH)/linux-user

 obj-y += linux-user/
 obj-y += gdbstub.o thunk.o user-exec.o
@@ -201,13 +203,13 @@ endif
 gdbstub-xml.c: $(TARGET_XML_FILES) $(SRC_PATH)/scripts/feature_to_c.sh
 	$(call quiet-command,rm -f $@ && $(SHELL) $(SRC_PATH)/scripts/feature_to_c.sh $@ $(TARGET_XML_FILES),"  GEN   $(TARGET_DIR)$@")

-hmp-commands.h: $(SRC_PATH)/hmp-commands.hx
+hmp-commands.h: $(SRC_PATH)/hmp-commands.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $(TARGET_DIR)$@")

-hmp-commands-info.h: $(SRC_PATH)/hmp-commands-info.hx
+hmp-commands-info.h: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $(TARGET_DIR)$@")

-qmp-commands-old.h: $(SRC_PATH)/qmp-commands.hx
+qmp-commands-old.h: $(SRC_PATH)/qmp-commands.hx $(SRC_PATH)/scripts/hxtool
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $(TARGET_DIR)$@")

 clean:
--- a/2
+++ b/2
@@ -1 +1 @@
-2.6.0
+2.6.50
--- a/accel.c
+++ b/accel.c
@@ -77,7 +77,7 @@ static int accel_init_machine(AccelClass *acc, MachineState *ms)
    return ret;
 }

-int configure_accelerator(MachineState *ms)
+void configure_accelerator(MachineState *ms)
 {
    const char *p;
    char buf[10];
@@ -128,8 +128,6 @@ int configure_accelerator(MachineState *ms)
    if (init_failed) {
        fprintf(stderr, "Back to %s accelerator.\n", acc->name);
    }
-
-    return !accel_initialised;
 }


--- a/arch_init.c
+++ b/arch_init.c
@@ -22,6 +22,8 @@
 * THE SOFTWARE.
 */
 #include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "cpu.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/arch_init.h"
 #include "hw/pci/pci.h"
@@ -272,13 +274,6 @@ void do_smbios_option(QemuOpts *opts)
 #endif
 }

-void cpudef_init(void)
-{
-#if defined(cpudef_setup)
-    cpudef_setup(); /* parse cpu definitions in target config file */
-#endif
-}
-
 int kvm_available(void)
 {
 #ifdef CONFIG_KVM
--- a/audio/audio.c
+++ b/audio/audio.c
@@ -1131,8 +1131,6 @@ static void audio_timer (void *opaque)
 */
 int AUD_write (SWVoiceOut *sw, void *buf, int size)
 {
-    int bytes;
-
    if (!sw) {
        /* XXX: Consider options */
        return size;
@@ -1143,14 +1141,11 @@ int AUD_write (SWVoiceOut *sw, void *buf, int size)
        return 0;
    }

-    bytes = sw->hw->pcm_ops->write (sw, buf, size);
-    return bytes;
+    return sw->hw->pcm_ops->write(sw, buf, size);
 }

 int AUD_read (SWVoiceIn *sw, void *buf, int size)
 {
-    int bytes;
-
    if (!sw) {
        /* XXX: Consider options */
        return size;
@@ -1161,8 +1156,7 @@ int AUD_read (SWVoiceIn *sw, void *buf, int size)
        return 0;
    }

-    bytes = sw->hw->pcm_ops->read (sw, buf, size);
-    return bytes;
+    return sw->hw->pcm_ops->read(sw, buf, size);
 }

 int AUD_get_buffer_size_out (SWVoiceOut *sw)
--- a/audio/mixeng.c
+++ b/audio/mixeng.c
@@ -24,6 +24,7 @@
 */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
+#include "qemu/bswap.h"
 #include "audio.h"

 #define AUDIO_CAP "mixeng"
@@ -270,7 +271,7 @@ f_sample *mixeng_clip[2][2][2][3] = {
 * August 21, 1998
 * Copyright 1998 Fabrice Bellard.
 *
- * [Rewrote completly the code of Lance Norskog And Sundry
+ * [Rewrote completely the code of Lance Norskog And Sundry
 * Contributors with a more efficient algorithm.]
 *
 * This source code is freely redistributable and may be used for
--- a/audio/noaudio.c
+++ b/audio/noaudio.c
@@ -23,6 +23,7 @@
 */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
+#include "qemu/host-utils.h"
 #include "audio.h"
 #include "qemu/timer.h"

--- a/audio/ossaudio.c
+++ b/audio/ossaudio.c
@@ -22,7 +22,6 @@
 * THE SOFTWARE.
 */
 #include "qemu/osdep.h"
-#include <sys/mman.h>
 #include <sys/ioctl.h>
 #include <sys/soundcard.h>
 #include "qemu-common.h"
@@ -898,7 +897,7 @@ static struct audio_option oss_options[] = {
        .name  = "EXCLUSIVE",
        .tag   = AUD_OPT_BOOL,
        .valp  = &glob_conf.exclusive,
-        .descr = "Open device in exclusive mode (vmix wont work)"
+        .descr = "Open device in exclusive mode (vmix won't work)"
    },
 #ifdef USE_DSP_POLICY
    {
--- a/audio/paaudio.c
+++ b/audio/paaudio.c
@@ -781,23 +781,22 @@ static int qpa_ctl_in (HWVoiceIn *hw, int cmd, ...)

            pa_threaded_mainloop_lock (g->mainloop);

-            /* FIXME: use the upcoming "set_source_output_{volume,mute}" */
-            op = pa_context_set_source_volume_by_index (g->context,
-                pa_stream_get_device_index (pa->stream),
+            op = pa_context_set_source_output_volume (g->context,
+                pa_stream_get_index (pa->stream),
                &v, NULL, NULL);
            if (!op) {
                qpa_logerr (pa_context_errno (g->context),
-                            "set_source_volume() failed\n");
+                            "set_source_output_volume() failed\n");
            } else {
                pa_operation_unref(op);
            }

-            op = pa_context_set_source_mute_by_index (g->context,
+            op = pa_context_set_source_output_mute (g->context,
                pa_stream_get_index (pa->stream),
                sw->vol.mute, NULL, NULL);
            if (!op) {
                qpa_logerr (pa_context_errno (g->context),
-                            "set_source_mute() failed\n");
+                            "set_source_output_mute() failed\n");
            } else {
                pa_operation_unref (op);
            }
--- a/audio/spiceaudio.c
+++ b/audio/spiceaudio.c
@@ -19,6 +19,7 @@

 #include "qemu/osdep.h"
 #include "hw/hw.h"
+#include "qemu/host-utils.h"
 #include "qemu/error-report.h"
 #include "qemu/timer.h"
 #include "ui/qemu-spice.h"
--- a/audio/trace-events
+++ b/audio/trace-events
@@ -0,0 +1,17 @@
+# See docs/trace-events.txt for syntax documentation.
+
+# audio/alsaaudio.c
+alsa_revents(int revents) "revents = %d"
+alsa_pollout(int i, int fd) "i = %d fd = %d"
+alsa_set_handler(int events, int index, int fd, int err) "events=%#x index=%d fd=%d err=%d"
+alsa_wrote_zero(int len) "Failed to write %d frames (wrote zero)"
+alsa_read_zero(long len) "Failed to read %ld frames (read zero)"
+alsa_xrun_out(void) "Recovering from playback xrun"
+alsa_xrun_in(void) "Recovering from capture xrun"
+alsa_resume_out(void) "Resuming suspended output stream"
+alsa_resume_in(void) "Resuming suspended input stream"
+alsa_no_frames(int state) "No frames available and ALSA state is %d"
+
+# audio/ossaudio.c
+oss_version(int version) "OSS version = %#x"
+oss_invalid_available_size(int size, int bufsize) "Invalid available size, size=%d bufsize=%d"
--- a/audio/wavaudio.c
+++ b/audio/wavaudio.c
@@ -22,7 +22,7 @@
 * THE SOFTWARE.
 */
 #include "qemu/osdep.h"
-#include "hw/hw.h"
+#include "qemu/host-utils.h"
 #include "qemu/timer.h"
 #include "audio.h"

--- a/backends/rng-random.c
+++ b/backends/rng-random.c
@@ -17,7 +17,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/main-loop.h"

-struct RndRandom
+struct RngRandom
 {
    RngBackend parent;

@@ -34,7 +34,7 @@ struct RndRandom

 static void entropy_available(void *opaque)
 {
-    RndRandom *s = RNG_RANDOM(opaque);
+    RngRandom *s = RNG_RANDOM(opaque);

    while (!QSIMPLEQ_EMPTY(&s->parent.requests)) {
        RngRequest *req = QSIMPLEQ_FIRST(&s->parent.requests);
@@ -57,7 +57,7 @@ static void entropy_available(void *opaque)

 static void rng_random_request_entropy(RngBackend *b, RngRequest *req)
 {
-    RndRandom *s = RNG_RANDOM(b);
+    RngRandom *s = RNG_RANDOM(b);

    if (QSIMPLEQ_EMPTY(&s->parent.requests)) {
        /* If there are no pending requests yet, we need to
@@ -68,7 +68,7 @@ static void rng_random_request_entropy(RngBackend *b, RngRequest *req)

 static void rng_random_opened(RngBackend *b, Error **errp)
 {
-    RndRandom *s = RNG_RANDOM(b);
+    RngRandom *s = RNG_RANDOM(b);

    if (s->filename == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
@@ -83,7 +83,7 @@ static void rng_random_opened(RngBackend *b, Error **errp)

 static char *rng_random_get_filename(Object *obj, Error **errp)
 {
-    RndRandom *s = RNG_RANDOM(obj);
+    RngRandom *s = RNG_RANDOM(obj);

    return g_strdup(s->filename);
 }
@@ -92,7 +92,7 @@ static void rng_random_set_filename(Object *obj, const char *filename,
                                 Error **errp)
 {
    RngBackend *b = RNG_BACKEND(obj);
-    RndRandom *s = RNG_RANDOM(obj);
+    RngRandom *s = RNG_RANDOM(obj);

    if (b->opened) {
        error_setg(errp, QERR_PERMISSION_DENIED);
@@ -105,7 +105,7 @@ static void rng_random_set_filename(Object *obj, const char *filename,

 static void rng_random_init(Object *obj)
 {
-    RndRandom *s = RNG_RANDOM(obj);
+    RngRandom *s = RNG_RANDOM(obj);

    object_property_add_str(obj, "filename",
                            rng_random_get_filename,
@@ -118,7 +118,7 @@ static void rng_random_init(Object *obj)

 static void rng_random_finalize(Object *obj)
 {
-    RndRandom *s = RNG_RANDOM(obj);
+    RngRandom *s = RNG_RANDOM(obj);

    if (s->fd != -1) {
        qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
@@ -139,7 +139,7 @@ static void rng_random_class_init(ObjectClass *klass, void *data)
 static const TypeInfo rng_random_info = {
    .name = TYPE_RNG_RANDOM,
    .parent = TYPE_RNG_BACKEND,
-    .instance_size = sizeof(RndRandom),
+    .instance_size = sizeof(RngRandom),
    .class_init = rng_random_class_init,
    .instance_init = rng_random_init,
    .instance_finalize = rng_random_finalize,
--- a/block.c
+++ b/block.c
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -974,11 +974,9 @@ err_exit2:

 static int64_t qemu_archipelago_getlength(BlockDriverState *bs)
 {
-    int64_t ret;
    BDRVArchipelagoState *s = bs->opaque;

-    ret = archipelago_volume_info(s);
-    return ret;
+    return archipelago_volume_info(s);
 }

 static int qemu_archipelago_truncate(BlockDriverState *bs, int64_t offset)
--- a/block/backup.c
+++ b/block/backup.c
@@ -36,7 +36,7 @@ typedef struct CowRequest {

 typedef struct BackupBlockJob {
    BlockJob common;
-    BlockDriverState *target;
+    BlockBackend *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
@@ -47,6 +47,7 @@ typedef struct BackupBlockJob {
    uint64_t sectors_read;
    unsigned long *done_bitmap;
    int64_t cluster_size;
+    NotifierWithReturn before_write;
    QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;

@@ -93,12 +94,12 @@ static void cow_request_end(CowRequest *req)
    qemu_co_queue_restart_all(&req->wait_queue);
 }

-static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read,
                                      bool is_write_notifier)
 {
-    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    BlockBackend *blk = job->common.blk;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
@@ -131,20 +132,15 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                start * sectors_per_cluster);

        if (!bounce_buffer) {
-            bounce_buffer = qemu_blockalign(bs, job->cluster_size);
+            bounce_buffer = blk_blockalign(blk, job->cluster_size);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

-        if (is_write_notifier) {
-            ret = bdrv_co_readv_no_serialising(bs,
-                                           start * sectors_per_cluster,
-                                           n, &bounce_qiov);
-        } else {
-            ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
-                                &bounce_qiov);
-        }
+        ret = blk_co_preadv(blk, start * job->cluster_size,
+                            bounce_qiov.size, &bounce_qiov,
+                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
@@ -154,13 +150,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
-            ret = bdrv_co_write_zeroes(job->target,
-                                       start * sectors_per_cluster,
-                                       n, BDRV_REQ_MAY_UNMAP);
+            ret = blk_co_pwrite_zeroes(job->target, start * job->cluster_size,
+                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
        } else {
-            ret = bdrv_co_writev(job->target,
-                                 start * sectors_per_cluster, n,
-                                 &bounce_qiov);
+            ret = blk_co_pwritev(job->target, start * job->cluster_size,
+                                 bounce_qiov.size, &bounce_qiov, 0);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
@@ -197,14 +191,16 @@ static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
 {
+    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

+    assert(req->bs == blk_bs(job->common.blk));
    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

-    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
+    return backup_do_cow(job, sector_num, nb_sectors, NULL, true);
 }

 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -218,19 +214,10 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }

-static void backup_iostatus_reset(BlockJob *job)
-{
-    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
-
-    if (s->target->blk) {
-        blk_iostatus_reset(s->target->blk);
-    }
-}
-
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
 {
    BdrvDirtyBitmap *bm;
-    BlockDriverState *bs = job->common.bs;
+    BlockDriverState *bs = blk_bs(job->common.blk);

    if (ret < 0 || block_job_is_cancelled(&job->common)) {
        /* Merge the successor back into the parent, delete nothing. */
@@ -259,24 +246,31 @@ static void backup_abort(BlockJob *job)
    }
 }

+static void backup_attached_aio_context(BlockJob *job, AioContext *aio_context)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    blk_set_aio_context(s->target, aio_context);
+}
+
 static const BlockJobDriver backup_job_driver = {
-    .instance_size  = sizeof(BackupBlockJob),
-    .job_type       = BLOCK_JOB_TYPE_BACKUP,
-    .set_speed      = backup_set_speed,
-    .iostatus_reset = backup_iostatus_reset,
-    .commit         = backup_commit,
-    .abort          = backup_abort,
+    .instance_size          = sizeof(BackupBlockJob),
+    .job_type               = BLOCK_JOB_TYPE_BACKUP,
+    .set_speed              = backup_set_speed,
+    .commit                 = backup_commit,
+    .abort                  = backup_abort,
+    .attached_aio_context   = backup_attached_aio_context,
 };

 static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
 {
    if (read) {
-        return block_job_error_action(&job->common, job->common.bs,
-                                      job->on_source_error, true, error);
+        return block_job_error_action(&job->common, job->on_source_error,
+                                      true, error);
    } else {
-        return block_job_error_action(&job->common, job->target,
-                                      job->on_target_error, false, error);
+        return block_job_error_action(&job->common, job->on_target_error,
+                                      false, error);
    }
 }

@@ -289,7 +283,7 @@ static void backup_complete(BlockJob *job, void *opaque)
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    BackupCompleteData *data = opaque;

-    bdrv_unref(s->target);
+    blk_unref(s->target);

    block_job_completed(job, data->ret);
    g_free(data);
@@ -331,7 +325,6 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
    int64_t end;
    int64_t last_cluster = -1;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
-    BlockDriverState *bs = job->common.bs;
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
@@ -353,7 +346,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                if (yield_and_check(job)) {
                    return ret;
                }
-                ret = backup_do_cow(bs, cluster * sectors_per_cluster,
+                ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                    sectors_per_cluster, &error_is_read,
                                    false);
                if ((ret < 0) &&
@@ -386,12 +379,8 @@ static void coroutine_fn backup_run(void *opaque)
 {
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
-    BlockDriverState *bs = job->common.bs;
-    BlockDriverState *target = job->target;
-    BlockdevOnError on_target_error = job->on_target_error;
-    NotifierWithReturn before_write = {
-        .notify = backup_before_write_notify,
-    };
+    BlockDriverState *bs = blk_bs(job->common.blk);
+    BlockBackend *target = job->target;
    int64_t start, end;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int ret = 0;
@@ -404,20 +393,14 @@ static void coroutine_fn backup_run(void *opaque)

    job->done_bitmap = bitmap_new(end);

-    if (target->blk) {
-        blk_set_on_error(target->blk, on_target_error, on_target_error);
-        blk_iostatus_enable(target->blk);
-    }
-
-    bdrv_add_before_write_notifier(bs, &before_write);
+    job->before_write.notify = backup_before_write_notify;
+    bdrv_add_before_write_notifier(bs, &job->before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
-            job->common.busy = false;
-            qemu_coroutine_yield();
-            job->common.busy = true;
+            block_job_yield(&job->common);
        }
    } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        ret = backup_run_incremental(job);
@@ -461,7 +444,7 @@ static void coroutine_fn backup_run(void *opaque)
                }
            }
            /* FULL sync mode we copy the whole drive. */
-            ret = backup_do_cow(bs, start * sectors_per_cluster,
+            ret = backup_do_cow(job, start * sectors_per_cluster,
                                sectors_per_cluster, &error_is_read, false);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
@@ -477,17 +460,14 @@ static void coroutine_fn backup_run(void *opaque)
        }
    }

-    notifier_with_return_remove(&before_write);
+    notifier_with_return_remove(&job->before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
    g_free(job->done_bitmap);

-    if (target->blk) {
-        blk_iostatus_disable(target->blk);
-    }
-    bdrv_op_unblock_all(target, job->common.blocker);
+    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
@@ -504,24 +484,17 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
 {
    int64_t len;
    BlockDriverInfo bdi;
+    BackupBlockJob *job = NULL;
    int ret;

    assert(bs);
    assert(target);
-    assert(cb);

    if (bs == target) {
        error_setg(errp, "Source and target cannot be the same");
        return;
    }

-    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
-        return;
-    }
-
    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
@@ -568,15 +541,16 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
        goto error;
    }

-    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
-                                           cb, opaque, errp);
+    job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp);
    if (!job) {
        goto error;
    }

+    job->target = blk_new();
+    blk_insert_bs(job->target, target);
+
    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
-    job->target = target;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
@@ -584,7 +558,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
    /* If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible. */
-    ret = bdrv_get_info(job->target, &bdi);
+    ret = bdrv_get_info(target, &bdi);
    if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
@@ -610,4 +584,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
+    if (job) {
+        blk_unref(job->target);
+        block_job_unref(&job->common);
+    }
 }
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -103,11 +103,11 @@ static int coroutine_fn blkreplay_co_writev(BlockDriverState *bs,
    return ret;
 }

-static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags)
 {
    uint64_t reqid = request_id++;
-    int ret = bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags);
+    int ret = bdrv_co_pwrite_zeroes(bs->file->bs, offset, count, flags);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();

@@ -147,7 +147,7 @@ static BlockDriver bdrv_blkreplay = {
    .bdrv_co_readv          = blkreplay_co_readv,
    .bdrv_co_writev         = blkreplay_co_writev,

-    .bdrv_co_write_zeroes   = blkreplay_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes  = blkreplay_co_pwrite_zeroes,
    .bdrv_co_discard        = blkreplay_co_discard,
    .bdrv_co_flush          = blkreplay_co_flush,
 };
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -293,22 +293,6 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs,
    return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate);
 }

-/* Propagate AioContext changes to ->test_file */
-static void blkverify_detach_aio_context(BlockDriverState *bs)
-{
-    BDRVBlkverifyState *s = bs->opaque;
-
-    bdrv_detach_aio_context(s->test_file->bs);
-}
-
-static void blkverify_attach_aio_context(BlockDriverState *bs,
-                                         AioContext *new_context)
-{
-    BDRVBlkverifyState *s = bs->opaque;
-
-    bdrv_attach_aio_context(s->test_file->bs, new_context);
-}
-
 static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options)
 {
    BDRVBlkverifyState *s = bs->opaque;
@@ -356,9 +340,6 @@ static BlockDriver bdrv_blkverify = {
    .bdrv_aio_writev                  = blkverify_aio_writev,
    .bdrv_aio_flush                   = blkverify_aio_flush,

-    .bdrv_attach_aio_context          = blkverify_attach_aio_context,
-    .bdrv_detach_aio_context          = blkverify_detach_aio_context,
-
    .is_filter                        = true,
    .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter,
 };
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1,7 +1,7 @@
 /*
 * QEMU Block backends
 *
- * Copyright (C) 2014 Red Hat, Inc.
+ * Copyright (C) 2014-2016 Red Hat, Inc.
 *
 * Authors:
 *  Markus Armbruster <armbru@redhat.com>,
@@ -19,6 +19,7 @@
 #include "sysemu/sysemu.h"
 #include "qapi-event.h"
 #include "qemu/id.h"
+#include "trace.h"

 /* Number of coroutines to reserve per attached device model */
 #define COROUTINE_POOL_RESERVATION 64
@@ -34,6 +35,7 @@ struct BlockBackend {
    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
+    BlockBackendPublic public;

    void *dev;                  /* attached device model, if any */
    /* TODO change to DeviceState when all users are qdevified */
@@ -74,6 +76,7 @@ static const AIOCBInfo block_backend_aiocb_info = {
 };

 static void drive_info_del(DriveInfo *dinfo);
+static BlockBackend *bdrv_first_blk(BlockDriverState *bs);

 /* All BlockBackends */
 static QTAILQ_HEAD(, BlockBackend) block_backends =
@@ -90,9 +93,26 @@ static void blk_root_inherit_options(int *child_flags, QDict *child_options,
    /* We're not supposed to call this function for root nodes */
    abort();
 }
+static void blk_root_drained_begin(BdrvChild *child);
+static void blk_root_drained_end(BdrvChild *child);
+
+static void blk_root_change_media(BdrvChild *child, bool load);
+static void blk_root_resize(BdrvChild *child);
+
+static const char *blk_root_get_name(BdrvChild *child)
+{
+    return blk_name(child->opaque);
+}

 static const BdrvChildRole child_root = {
-    .inherit_options = blk_root_inherit_options,
+    .inherit_options    = blk_root_inherit_options,
+
+    .change_media       = blk_root_change_media,
+    .resize             = blk_root_resize,
+    .get_name           = blk_root_get_name,
+
+    .drained_begin      = blk_root_drained_begin,
+    .drained_end        = blk_root_drained_end,
 };

 /*
@@ -100,40 +120,26 @@ static const BdrvChildRole child_root = {
 * Store an error through @errp on failure, unless it's null.
 * Return the new BlockBackend on success, null on failure.
 */
-BlockBackend *blk_new(Error **errp)
+BlockBackend *blk_new(void)
 {
    BlockBackend *blk;

    blk = g_new0(BlockBackend, 1);
    blk->refcnt = 1;
+    blk_set_enable_write_cache(blk, true);
+
+    qemu_co_queue_init(&blk->public.throttled_reqs[0]);
+    qemu_co_queue_init(&blk->public.throttled_reqs[1]);
+
    notifier_list_init(&blk->remove_bs_notifiers);
    notifier_list_init(&blk->insert_bs_notifiers);
+
    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
    return blk;
 }

 /*
- * Create a new BlockBackend with a new BlockDriverState attached.
- * Otherwise just like blk_new(), which see.
- */
-BlockBackend *blk_new_with_bs(Error **errp)
-{
-    BlockBackend *blk;
-    BlockDriverState *bs;
-
-    blk = blk_new(errp);
-    if (!blk) {
-        return NULL;
-    }
-
-    bs = bdrv_new_root();
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    bs->blk = blk;
-    return blk;
-}
-
-/*
- * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState.
+ * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 *
 * Just as with bdrv_open(), after having called this function the reference to
 * @options belongs to the block layer (even on failure).
@@ -148,21 +154,16 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
                           QDict *options, int flags, Error **errp)
 {
    BlockBackend *blk;
-    int ret;
+    BlockDriverState *bs;

-    blk = blk_new_with_bs(errp);
-    if (!blk) {
-        QDECREF(options);
-        return NULL;
-    }
-
-    ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp);
-    if (ret < 0) {
+    blk = blk_new();
+    bs = bdrv_open(filename, reference, options, flags, errp);
+    if (!bs) {
        blk_unref(blk);
        return NULL;
    }

-    blk_set_enable_write_cache(blk, true);
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);

    return blk;
 }
@@ -177,10 +178,6 @@ static void blk_delete(BlockBackend *blk)
    }
    assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
    assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
-    if (blk->root_state.throttle_state) {
-        g_free(blk->root_state.throttle_group);
-        throttle_group_unref(blk->root_state.throttle_state);
-    }
    QTAILQ_REMOVE(&block_backends, blk, link);
    drive_info_del(blk->legacy_dinfo);
    block_acct_cleanup(&blk->stats);
@@ -267,28 +264,45 @@ BlockBackend *blk_next(BlockBackend *blk)
               : QTAILQ_FIRST(&monitor_block_backends);
 }

-/*
- * Iterates over all BlockDriverStates which are attached to a BlockBackend.
- * This function is for use by bdrv_next().
- *
- * @bs must be NULL or a BDS that is attached to a BB.
- */
-BlockDriverState *blk_next_root_bs(BlockDriverState *bs)
+/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
+ * the monitor or attached to a BlockBackend */
+BlockDriverState *bdrv_next(BdrvNextIterator *it)
 {
-    BlockBackend *blk;
+    BlockDriverState *bs;

-    if (bs) {
-        assert(bs->blk);
-        blk = bs->blk;
-    } else {
-        blk = NULL;
+    /* First, return all root nodes of BlockBackends. In order to avoid
+     * returning a BDS twice when multiple BBs refer to it, we only return it
+     * if the BB is the first one in the parent list of the BDS. */
+    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
+        do {
+            it->blk = blk_all_next(it->blk);
+            bs = it->blk ? blk_bs(it->blk) : NULL;
+        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
+
+        if (bs) {
+            return bs;
+        }
+        it->phase = BDRV_NEXT_MONITOR_OWNED;
    }

+    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
+     * BDSes that are attached to a BlockBackend here; they have been handled
+     * by the above block already */
    do {
-        blk = blk_all_next(blk);
-    } while (blk && !blk->root);
+        it->bs = bdrv_next_monitor_owned(it->bs);
+        bs = it->bs;
+    } while (bs && bdrv_has_blk(bs));

-    return blk ? blk->root->bs : NULL;
+    return bs;
+}
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it)
+{
+    *it = (BdrvNextIterator) {
+        .phase = BDRV_NEXT_BACKEND_ROOTS,
+    };
+
+    return bdrv_next(it);
 }

 /*
@@ -375,6 +389,26 @@ BlockDriverState *blk_bs(BlockBackend *blk)
    return blk->root ? blk->root->bs : NULL;
 }

+static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
+{
+    BdrvChild *child;
+    QLIST_FOREACH(child, &bs->parents, next_parent) {
+        if (child->role == &child_root) {
+            return child->opaque;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Returns true if @bs has an associated BlockBackend.
+ */
+bool bdrv_has_blk(BlockDriverState *bs)
+{
+    return bdrv_first_blk(bs) != NULL;
+}
+
 /*
 * Return @blk's DriveInfo if any, else null.
 */
@@ -410,18 +444,34 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
    abort();
 }

+/*
+ * Returns a pointer to the publicly accessible fields of @blk.
+ */
+BlockBackendPublic *blk_get_public(BlockBackend *blk)
+{
+    return &blk->public;
+}
+
+/*
+ * Returns a BlockBackend given the associated @public fields.
+ */
+BlockBackend *blk_by_public(BlockBackendPublic *public)
+{
+    return container_of(public, BlockBackend, public);
+}
+
 /*
 * Disassociates the currently associated BlockDriverState from @blk.
 */
 void blk_remove_bs(BlockBackend *blk)
 {
-    assert(blk->root->bs->blk == blk);
-
    notifier_list_notify(&blk->remove_bs_notifiers, blk);
+    if (blk->public.throttle_state) {
+        throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+    }

    blk_update_root_state(blk);

-    blk->root->bs->blk = NULL;
    bdrv_root_unref_child(blk->root);
    blk->root = NULL;
 }
@@ -431,12 +481,14 @@ void blk_remove_bs(BlockBackend *blk)
 */
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
 {
-    assert(!blk->root && !bs->blk);
    bdrv_ref(bs);
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    bs->blk = blk;
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);

    notifier_list_notify(&blk->insert_bs_notifiers, blk);
+    if (blk->public.throttle_state) {
+        throttle_timers_attach_aio_context(
+            &blk->public.throttle_timers, bdrv_get_aio_context(bs));
+    }
 }

 /*
@@ -525,6 +577,11 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load)
    }
 }

+static void blk_root_change_media(BdrvChild *child, bool load)
+{
+    blk_dev_change_media_cb(child->opaque, load);
+}
+
 /*
 * Does @blk's attached device model have removable media?
 * %true if no device model is attached.
@@ -579,8 +636,10 @@ bool blk_dev_is_medium_locked(BlockBackend *blk)
 /*
 * Notify @blk's attached device model of a backend size change.
 */
-void blk_dev_resize_cb(BlockBackend *blk)
+static void blk_root_resize(BdrvChild *child)
 {
+    BlockBackend *blk = child->opaque;
+
    if (blk->dev_ops && blk->dev_ops->resize_cb) {
        blk->dev_ops->resize_cb(blk->dev_opaque);
    }
@@ -683,34 +742,50 @@ static int blk_check_request(BlockBackend *blk, int64_t sector_num,
                                  nb_sectors * BDRV_SECTOR_SIZE);
 }

-static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
-{
-    int ret = blk_check_byte_request(blk, offset, bytes);
-    if (ret < 0) {
-        return ret;
-    }
-
-    return bdrv_co_do_preadv(blk_bs(blk), offset, bytes, qiov, flags);
-}
-
-static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
+int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
+                               unsigned int bytes, QEMUIOVector *qiov,
+                               BdrvRequestFlags flags)
 {
    int ret;

+    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+
    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

+    /* throttling disk I/O */
+    if (blk->public.throttle_state) {
+        throttle_group_co_io_limits_intercept(blk, bytes, false);
+    }
+
+    return bdrv_co_preadv(blk_bs(blk), offset, bytes, qiov, flags);
+}
+
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                                unsigned int bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags)
+{
+    int ret;
+
+    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+
+    ret = blk_check_byte_request(blk, offset, bytes);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* throttling disk I/O */
+    if (blk->public.throttle_state) {
+        throttle_group_co_io_limits_intercept(blk, bytes, true);
+    }
+
    if (!blk->enable_write_cache) {
        flags |= BDRV_REQ_FUA;
    }

-    return bdrv_co_do_pwritev(blk_bs(blk), offset, bytes, qiov, flags);
+    return bdrv_co_pwritev(blk_bs(blk), offset, bytes, qiov, flags);
 }

 typedef struct BlkRwCo {
@@ -772,55 +847,27 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
    return rwco.ret;
 }

-static int blk_rw(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-                  int nb_sectors, CoroutineEntry co_entry,
-                  BdrvRequestFlags flags)
+int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
+                          int count)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return -EINVAL;
-    }
-
-    return blk_prw(blk, sector_num << BDRV_SECTOR_BITS, buf,
-                   nb_sectors << BDRV_SECTOR_BITS, co_entry, flags);
-}
-
-int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-             int nb_sectors)
-{
-    return blk_rw(blk, sector_num, buf, nb_sectors, blk_read_entry, 0);
-}
-
-int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-                         int nb_sectors)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    bool enabled;
    int ret;

-    ret = blk_check_request(blk, sector_num, nb_sectors);
+    ret = blk_check_byte_request(blk, offset, count);
    if (ret < 0) {
        return ret;
    }

-    enabled = bs->io_limits_enabled;
-    bs->io_limits_enabled = false;
-    ret = blk_read(blk, sector_num, buf, nb_sectors);
-    bs->io_limits_enabled = enabled;
+    blk_root_drained_begin(blk->root);
+    ret = blk_pread(blk, offset, buf, count);
+    blk_root_drained_end(blk->root);
    return ret;
 }

-int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
-              int nb_sectors)
+int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                      int count, BdrvRequestFlags flags)
 {
-    return blk_rw(blk, sector_num, (uint8_t*) buf, nb_sectors,
-                  blk_write_entry, 0);
-}
-
-int blk_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                     int nb_sectors, BdrvRequestFlags flags)
-{
-    return blk_rw(blk, sector_num, NULL, nb_sectors, blk_write_entry,
-                  flags | BDRV_REQ_ZERO_WRITE);
+    return blk_prw(blk, offset, NULL, count, blk_write_entry,
+                   flags | BDRV_REQ_ZERO_WRITE);
 }

 static void error_callback_bh(void *opaque)
@@ -932,18 +979,12 @@ static void blk_aio_write_entry(void *opaque)
    blk_aio_complete(acb);
 }

-BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                                 int nb_sectors, BdrvRequestFlags flags,
-                                 BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                  int count, BdrvRequestFlags flags,
+                                  BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS,
-                        nb_sectors << BDRV_SECTOR_BITS, NULL,
-                        blk_aio_write_entry, flags | BDRV_REQ_ZERO_WRITE,
-                        cb, opaque);
+    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
+                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
 }

 int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
@@ -955,9 +996,11 @@ int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
    return count;
 }

-int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count)
+int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
+               BdrvRequestFlags flags)
 {
-    int ret = blk_prw(blk, offset, (void*) buf, count, blk_write_entry, 0);
+    int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
+                      flags);
    if (ret < 0) {
        return ret;
    }
@@ -991,30 +1034,20 @@ int64_t blk_nb_sectors(BlockBackend *blk)
    return bdrv_nb_sectors(blk_bs(blk));
 }

-BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
-                          QEMUIOVector *iov, int nb_sectors,
-                          BlockCompletionFunc *cb, void *opaque)
-{
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
-                        blk_aio_read_entry, 0, cb, opaque);
-}
-
-BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
-                           QEMUIOVector *iov, int nb_sectors,
+BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
+                           QEMUIOVector *qiov, BdrvRequestFlags flags,
                           BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
+    return blk_aio_prwv(blk, offset, qiov->size, qiov,
+                        blk_aio_read_entry, flags, cb, opaque);
+}

-    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
-                        blk_aio_write_entry, 0, cb, opaque);
+BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
+                            QEMUIOVector *qiov, BdrvRequestFlags flags,
+                            BlockCompletionFunc *cb, void *opaque)
+{
+    return blk_aio_prwv(blk, offset, qiov->size, qiov,
+                        blk_aio_write_entry, flags, cb, opaque);
 }

 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
@@ -1049,20 +1082,6 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
    bdrv_aio_cancel_async(acb);
 }

-int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)
-{
-    int i, ret;
-
-    for (i = 0; i < num_reqs; i++) {
-        ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors);
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
-    return bdrv_aio_multiwrite(blk_bs(blk), reqs, num_reqs);
-}
-
 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
    if (!blk_is_available(blk)) {
@@ -1375,7 +1394,14 @@ void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
    BlockDriverState *bs = blk_bs(blk);

    if (bs) {
+        if (blk->public.throttle_state) {
+            throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+        }
        bdrv_set_aio_context(bs, new_context);
+        if (blk->public.throttle_state) {
+            throttle_timers_attach_aio_context(&blk->public.throttle_timers,
+                                               new_context);
+        }
    }
 }

@@ -1444,15 +1470,10 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
 }

-int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                                     int nb_sectors, BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                      int count, BdrvRequestFlags flags)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return -EINVAL;
-    }
-
-    return blk_co_pwritev(blk, sector_num << BDRV_SECTOR_BITS,
-                          nb_sectors << BDRV_SECTOR_BITS, NULL,
+    return blk_co_pwritev(blk, offset, count, NULL,
                          flags | BDRV_REQ_ZERO_WRITE);
 }

@@ -1545,19 +1566,6 @@ void blk_update_root_state(BlockBackend *blk)
    blk->root_state.open_flags    = blk->root->bs->open_flags;
    blk->root_state.read_only     = blk->root->bs->read_only;
    blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
-
-    if (blk->root_state.throttle_group) {
-        g_free(blk->root_state.throttle_group);
-        throttle_group_unref(blk->root_state.throttle_state);
-    }
-    if (blk->root->bs->throttle_state) {
-        const char *name = throttle_group_get_name(blk->root->bs);
-        blk->root_state.throttle_group = g_strdup(name);
-        blk->root_state.throttle_state = throttle_group_incref(name);
-    } else {
-        blk->root_state.throttle_group = NULL;
-        blk->root_state.throttle_state = NULL;
-    }
 }

 /*
@@ -1568,9 +1576,6 @@ void blk_update_root_state(BlockBackend *blk)
 void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
 {
    bs->detect_zeroes = blk->root_state.detect_zeroes;
-    if (blk->root_state.throttle_group) {
-        bdrv_io_limits_enable(bs, blk->root_state.throttle_group);
-    }
 }

 /*
@@ -1633,3 +1638,62 @@ int blk_flush_all(void)

    return result;
 }
+
+
+/* throttling disk I/O limits */
+void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
+{
+    throttle_group_config(blk, cfg);
+}
+
+void blk_io_limits_disable(BlockBackend *blk)
+{
+    assert(blk->public.throttle_state);
+    bdrv_drained_begin(blk_bs(blk));
+    throttle_group_unregister_blk(blk);
+    bdrv_drained_end(blk_bs(blk));
+}
+
+/* should be called before blk_set_io_limits if a limit is set */
+void blk_io_limits_enable(BlockBackend *blk, const char *group)
+{
+    assert(!blk->public.throttle_state);
+    throttle_group_register_blk(blk, group);
+}
+
+void blk_io_limits_update_group(BlockBackend *blk, const char *group)
+{
+    /* this BB is not part of any group */
+    if (!blk->public.throttle_state) {
+        return;
+    }
+
+    /* this BB is a part of the same group than the one we want */
+    if (!g_strcmp0(throttle_group_get_name(blk), group)) {
+        return;
+    }
+
+    /* need to change the group this bs belong to */
+    blk_io_limits_disable(blk);
+    blk_io_limits_enable(blk, group);
+}
+
+static void blk_root_drained_begin(BdrvChild *child)
+{
+    BlockBackend *blk = child->opaque;
+
+    /* Note that blk->root may not be accessible here yet if we are just
+     * attaching to a BlockDriverState that is drained. Use child instead. */
+
+    if (blk->public.io_limits_disabled++ == 0) {
+        throttle_group_restart_blk(blk);
+    }
+}
+
+static void blk_root_drained_end(BdrvChild *child)
+{
+    BlockBackend *blk = child->opaque;
+
+    assert(blk->public.io_limits_disabled);
+    --blk->public.io_limits_disabled;
+}
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -27,6 +27,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"

 /**************************************************************/

@@ -104,6 +105,7 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
    int ret;

    bs->read_only = 1; // no write support yet
+    bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */

    ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs));
    if (ret < 0) {
@@ -221,38 +223,52 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
    return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
 }

-static int bochs_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
+    BDRVBochsState *s = bs->opaque;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    uint64_t bytes_done = 0;
+    QEMUIOVector local_qiov;
    int ret;

+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_iovec_init(&local_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+
    while (nb_sectors > 0) {
        int64_t block_offset = seek_to_sector(bs, sector_num);
        if (block_offset < 0) {
-            return block_offset;
-        } else if (block_offset > 0) {
-            ret = bdrv_pread(bs->file->bs, block_offset, buf, 512);
+            ret = block_offset;
+            goto fail;
+        }
+
+        qemu_iovec_reset(&local_qiov);
+        qemu_iovec_concat(&local_qiov, qiov, bytes_done, 512);
+
+        if (block_offset > 0) {
+            ret = bdrv_co_preadv(bs->file->bs, block_offset, 512,
+                                 &local_qiov, 0);
            if (ret < 0) {
-                return ret;
+                goto fail;
            }
        } else {
-            memset(buf, 0, 512);
+            qemu_iovec_memset(&local_qiov, 0, 0, 512);
        }
        nb_sectors--;
        sector_num++;
-        buf += 512;
+        bytes_done += 512;
    }
-    return 0;
-}

-static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVBochsState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = bochs_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&local_qiov);
+
    return ret;
 }

@@ -267,7 +283,7 @@ static BlockDriver bdrv_bochs = {
    .instance_size	= sizeof(BDRVBochsState),
    .bdrv_probe		= bochs_probe,
    .bdrv_open		= bochs_open,
-    .bdrv_read          = bochs_co_read,
+    .bdrv_co_preadv = bochs_co_preadv,
    .bdrv_close		= bochs_close,
 };

--- a/block/cloop.c
+++ b/block/cloop.c
@@ -26,6 +26,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include <zlib.h>

 /* Maximum compressed block size */
@@ -66,6 +67,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
    int ret;

    bs->read_only = 1;
+    bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */

    /* read header */
    ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4);
@@ -229,33 +231,38 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
    return 0;
 }

-static int cloop_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
    BDRVCloopState *s = bs->opaque;
-    int i;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    int ret, i;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_co_mutex_lock(&s->lock);

    for (i = 0; i < nb_sectors; i++) {
+        void *data;
        uint32_t sector_offset_in_block =
            ((sector_num + i) % s->sectors_per_block),
            block_num = (sector_num + i) / s->sectors_per_block;
        if (cloop_read_block(bs, block_num) != 0) {
-            return -1;
+            ret = -EIO;
+            goto fail;
        }
-        memcpy(buf + i * 512,
-            s->uncompressed_block + sector_offset_in_block * 512, 512);
-    }
-    return 0;
-}

-static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVCloopState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = cloop_read(bs, sector_num, buf, nb_sectors);
+        data = s->uncompressed_block + sector_offset_in_block * 512;
+        qemu_iovec_from_buf(qiov, i * 512, data, 512);
+    }
+
+    ret = 0;
+fail:
    qemu_co_mutex_unlock(&s->lock);
+
    return ret;
 }

@@ -273,7 +280,7 @@ static BlockDriver bdrv_cloop = {
    .instance_size  = sizeof(BDRVCloopState),
    .bdrv_probe     = cloop_probe,
    .bdrv_open      = cloop_open,
-    .bdrv_read      = cloop_co_read,
+    .bdrv_co_preadv = cloop_co_preadv,
    .bdrv_close     = cloop_close,
 };

--- a/block/commit.c
+++ b/block/commit.c
@@ -36,28 +36,36 @@ typedef struct CommitBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *active;
-    BlockDriverState *top;
-    BlockDriverState *base;
+    BlockBackend *top;
+    BlockBackend *base;
    BlockdevOnError on_error;
    int base_flags;
    int orig_overlay_flags;
    char *backing_file_str;
 } CommitBlockJob;

-static int coroutine_fn commit_populate(BlockDriverState *bs,
-                                        BlockDriverState *base,
+static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
                                        int64_t sector_num, int nb_sectors,
                                        void *buf)
 {
    int ret = 0;
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = buf,
+        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+    };

-    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
-    if (ret) {
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    ret = blk_co_preadv(bs, sector_num * BDRV_SECTOR_SIZE,
+                        qiov.size, &qiov, 0);
+    if (ret < 0) {
        return ret;
    }

-    ret = bdrv_write(base, sector_num, buf, nb_sectors);
-    if (ret) {
+    ret = blk_co_pwritev(base, sector_num * BDRV_SECTOR_SIZE,
+                         qiov.size, &qiov, 0);
+    if (ret < 0) {
        return ret;
    }

@@ -73,8 +81,8 @@ static void commit_complete(BlockJob *job, void *opaque)
    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
    CommitCompleteData *data = opaque;
    BlockDriverState *active = s->active;
-    BlockDriverState *top = s->top;
-    BlockDriverState *base = s->base;
+    BlockDriverState *top = blk_bs(s->top);
+    BlockDriverState *base = blk_bs(s->base);
    BlockDriverState *overlay_bs;
    int ret = data->ret;

@@ -94,6 +102,8 @@ static void commit_complete(BlockJob *job, void *opaque)
        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
    }
    g_free(s->backing_file_str);
+    blk_unref(s->top);
+    blk_unref(s->base);
    block_job_completed(&s->common, ret);
    g_free(data);
 }
@@ -102,8 +112,6 @@ static void coroutine_fn commit_run(void *opaque)
 {
    CommitBlockJob *s = opaque;
    CommitCompleteData *data;
-    BlockDriverState *top = s->top;
-    BlockDriverState *base = s->base;
    int64_t sector_num, end;
    int ret = 0;
    int n = 0;
@@ -111,27 +119,27 @@ static void coroutine_fn commit_run(void *opaque)
    int bytes_written = 0;
    int64_t base_len;

-    ret = s->common.len = bdrv_getlength(top);
+    ret = s->common.len = blk_getlength(s->top);


    if (s->common.len < 0) {
        goto out;
    }

-    ret = base_len = bdrv_getlength(base);
+    ret = base_len = blk_getlength(s->base);
    if (base_len < 0) {
        goto out;
    }

    if (base_len < s->common.len) {
-        ret = bdrv_truncate(base, s->common.len);
+        ret = blk_truncate(s->base, s->common.len);
        if (ret) {
            goto out;
        }
    }

    end = s->common.len >> BDRV_SECTOR_BITS;
-    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
+    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

    for (sector_num = 0; sector_num < end; sector_num += n) {
        uint64_t delay_ns = 0;
@@ -146,7 +154,8 @@ wait:
            break;
        }
        /* Copy if allocated above the base */
-        ret = bdrv_is_allocated_above(top, base, sector_num,
+        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
+                                      sector_num,
                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
                                      &n);
        copy = (ret == 1);
@@ -158,7 +167,7 @@ wait:
                    goto wait;
                }
            }
-            ret = commit_populate(top, base, sector_num, n, buf);
+            ret = commit_populate(s->top, s->base, sector_num, n, buf);
            bytes_written += n * BDRV_SECTOR_SIZE;
        }
        if (ret < 0) {
@@ -214,13 +223,6 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
    BlockDriverState *overlay_bs;
    Error *local_err = NULL;

-    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, "Invalid parameter combination");
-        return;
-    }
-
    assert(top != bs);
    if (top == base) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
@@ -234,6 +236,11 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
        return;
    }

+    s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp);
+    if (!s) {
+        return;
+    }
+
    orig_base_flags    = bdrv_get_flags(base);
    orig_overlay_flags = bdrv_get_flags(overlay_bs);

@@ -250,18 +257,18 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
        bdrv_reopen_multiple(reopen_queue, &local_err);
        if (local_err != NULL) {
            error_propagate(errp, local_err);
+            block_job_unref(&s->common);
            return;
        }
    }


-    s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp);
-    if (!s) {
-        return;
-    }
+    s->base = blk_new();
+    blk_insert_bs(s->base, base);
+
+    s->top = blk_new();
+    blk_insert_bs(s->top, top);

-    s->base   = base;
-    s->top    = top;
    s->active = bs;

    s->base_flags          = orig_base_flags;
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -91,7 +91,7 @@ static ssize_t block_crypto_write_func(QCryptoBlock *block,
    struct BlockCryptoCreateData *data = opaque;
    ssize_t ret;

-    ret = blk_pwrite(data->blk, offset, buf, buflen);
+    ret = blk_pwrite(data->blk, offset, buf, buflen, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write encryption header");
        return ret;
@@ -196,7 +196,6 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
    OptsVisitor *ov;
    QCryptoBlockOpenOptions *ret = NULL;
    Error *local_err = NULL;
-    Error *end_err = NULL;

    ret = g_new0(QCryptoBlockOpenOptions, 1);
    ret->format = format;
@@ -219,9 +218,11 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
        error_setg(&local_err, "Unsupported block format %d", format);
        break;
    }
+    if (!local_err) {
+        visit_check_struct(opts_get_visitor(ov), &local_err);
+    }

-    visit_end_struct(opts_get_visitor(ov), &end_err);
-    error_propagate(&local_err, end_err);
+    visit_end_struct(opts_get_visitor(ov));

 out:
    if (local_err) {
@@ -242,7 +243,6 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
    OptsVisitor *ov;
    QCryptoBlockCreateOptions *ret = NULL;
    Error *local_err = NULL;
-    Error *end_err = NULL;

    ret = g_new0(QCryptoBlockCreateOptions, 1);
    ret->format = format;
@@ -265,9 +265,11 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
        error_setg(&local_err, "Unsupported block format %d", format);
        break;
    }
+    if (!local_err) {
+        visit_check_struct(opts_get_visitor(ov), &local_err);
+    }

-    visit_end_struct(opts_get_visitor(ov), &end_err);
-    error_propagate(&local_err, end_err);
+    visit_end_struct(opts_get_visitor(ov));

 out:
    if (local_err) {
--- a/block/curl.c
+++ b/block/curl.c
@@ -36,10 +36,16 @@
 // #define DEBUG_VERBOSE

 #ifdef DEBUG_CURL
-#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
+#define DEBUG_CURL_PRINT 1
 #else
-#define DPRINTF(fmt, ...) do { } while (0)
+#define DEBUG_CURL_PRINT 0
 #endif
+#define DPRINTF(fmt, ...)                                            \
+    do {                                                             \
+        if (DEBUG_CURL_PRINT) {                                      \
+            fprintf(stderr, fmt, ## __VA_ARGS__);                    \
+        }                                                            \
+    } while (0)

 #if LIBCURL_VERSION_NUM >= 0x071000
 /* The multi interface timer callback was introduced in 7.16.0 */
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -32,7 +32,6 @@
 #ifdef CONFIG_BZIP2
 #include <bzlib.h>
 #endif
-#include <glib.h>

 enum {
    /* Limit chunk sizes to prevent unreasonable amounts of memory being used
@@ -440,6 +439,8 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    int ret;

    bs->read_only = 1;
+    bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */
+
    s->n_chunks = 0;
    s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
    /* used by dmg_read_mish_block to keep track of the current I/O position */
@@ -659,38 +660,42 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
    return 0;
 }

-static int dmg_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+              QEMUIOVector *qiov, int flags)
 {
    BDRVDMGState *s = bs->opaque;
-    int i;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    int ret, i;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_co_mutex_lock(&s->lock);

    for (i = 0; i < nb_sectors; i++) {
        uint32_t sector_offset_in_chunk;
+        void *data;
+
        if (dmg_read_chunk(bs, sector_num + i) != 0) {
-            return -1;
+            ret = -EIO;
+            goto fail;
        }
        /* Special case: current chunk is all zeroes. Do not perform a memcpy as
         * s->uncompressed_chunk may be too small to cover the large all-zeroes
         * section. dmg_read_chunk is called to find s->current_chunk */
        if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */
-            memset(buf + i * 512, 0, 512);
+            qemu_iovec_memset(qiov, i * 512, 0, 512);
            continue;
        }
        sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk];
-        memcpy(buf + i * 512,
-               s->uncompressed_chunk + sector_offset_in_chunk * 512, 512);
+        data = s->uncompressed_chunk + sector_offset_in_chunk * 512;
+        qemu_iovec_from_buf(qiov, i * 512, data, 512);
    }
-    return 0;
-}

-static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num,
-                                    uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVDMGState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = dmg_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
    qemu_co_mutex_unlock(&s->lock);
    return ret;
 }
@@ -715,7 +720,7 @@ static BlockDriver bdrv_dmg = {
    .instance_size  = sizeof(BDRVDMGState),
    .bdrv_probe     = dmg_probe,
    .bdrv_open      = dmg_open,
-    .bdrv_read      = dmg_co_read,
+    .bdrv_co_preadv = dmg_co_preadv,
    .bdrv_close     = dmg_close,
 };

--- a/block/gluster.c
+++ b/block/gluster.c
@@ -24,6 +24,8 @@ typedef struct GlusterAIOCB {
 typedef struct BDRVGlusterState {
    struct glfs *glfs;
    struct glfs_fd *fd;
+    bool supports_seek_data;
+    int debug_level;
 } BDRVGlusterState;

 typedef struct GlusterConf {
@@ -32,6 +34,7 @@ typedef struct GlusterConf {
    char *volname;
    char *image;
    char *transport;
+    int debug_level;
 } GlusterConf;

 static void qemu_gluster_gconf_free(GlusterConf *gconf)
@@ -194,11 +197,7 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename,
        goto out;
    }

-    /*
-     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
-     * GlusterFS makes GF_LOG_* macros available to libgfapi users.
-     */
-    ret = glfs_set_logging(glfs, "-", 4);
+    ret = glfs_set_logging(glfs, "-", gconf->debug_level);
    if (ret < 0) {
        goto out;
    }
@@ -256,16 +255,26 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
    qemu_bh_schedule(acb->bh);
 }

+#define GLUSTER_OPT_FILENAME "filename"
+#define GLUSTER_OPT_DEBUG "debug"
+#define GLUSTER_DEBUG_DEFAULT 4
+#define GLUSTER_DEBUG_MAX 9
+
 /* TODO Convert to fine grained options */
 static QemuOptsList runtime_opts = {
    .name = "gluster",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
-            .name = "filename",
+            .name = GLUSTER_OPT_FILENAME,
            .type = QEMU_OPT_STRING,
            .help = "URL to the gluster image",
        },
+        {
+            .name = GLUSTER_OPT_DEBUG,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Gluster log level, valid range is 0-9",
+        },
        { /* end of list */ }
    },
 };
@@ -287,6 +296,28 @@ static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
    }
 }

+/*
+ * Do SEEK_DATA/HOLE to detect if it is functional. Older broken versions of
+ * gfapi incorrectly return the current offset when SEEK_DATA/HOLE is used.
+ * - Corrected versions return -1 and set errno to EINVAL.
+ * - Versions that support SEEK_DATA/HOLE correctly, will return -1 and set
+ *   errno to ENXIO when SEEK_DATA is called with a position of EOF.
+ */
+static bool qemu_gluster_test_seek(struct glfs_fd *fd)
+{
+    off_t ret, eof;
+
+    eof = glfs_lseek(fd, 0, SEEK_END);
+    if (eof < 0) {
+        /* this should never occur */
+        return false;
+    }
+
+    /* this should always fail with ENXIO if SEEK_DATA is supported */
+    ret = glfs_lseek(fd, eof, SEEK_DATA);
+    return (ret < 0) && (errno == ENXIO);
+}
+
 static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
                             int bdrv_flags, Error **errp)
 {
@@ -306,8 +337,17 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
        goto out;
    }

-    filename = qemu_opt_get(opts, "filename");
+    filename = qemu_opt_get(opts, GLUSTER_OPT_FILENAME);

+    s->debug_level = qemu_opt_get_number(opts, GLUSTER_OPT_DEBUG,
+                                         GLUSTER_DEBUG_DEFAULT);
+    if (s->debug_level < 0) {
+        s->debug_level = 0;
+    } else if (s->debug_level > GLUSTER_DEBUG_MAX) {
+        s->debug_level = GLUSTER_DEBUG_MAX;
+    }
+
+    gconf->debug_level = s->debug_level;
    s->glfs = qemu_gluster_init(gconf, filename, errp);
    if (!s->glfs) {
        ret = -errno;
@@ -338,6 +378,8 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
        ret = -errno;
    }

+    s->supports_seek_data = qemu_gluster_test_seek(s->fd);
+
 out:
    qemu_opts_del(opts);
    qemu_gluster_gconf_free(gconf);
@@ -363,6 +405,7 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
                                       BlockReopenQueue *queue, Error **errp)
 {
    int ret = 0;
+    BDRVGlusterState *s;
    BDRVGlusterReopenState *reop_s;
    GlusterConf *gconf = NULL;
    int open_flags = 0;
@@ -370,6 +413,8 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
    assert(state != NULL);
    assert(state->bs != NULL);

+    s = state->bs->opaque;
+
    state->opaque = g_new0(BDRVGlusterReopenState, 1);
    reop_s = state->opaque;

@@ -377,6 +422,7 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,

    gconf = g_new0(GlusterConf, 1);

+    gconf->debug_level = s->debug_level;
    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
    if (reop_s->glfs == NULL) {
        ret = -errno;
@@ -454,14 +500,12 @@ static void qemu_gluster_reopen_abort(BDRVReopenState *state)
 }

 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
+        int64_t offset, int size, BdrvRequestFlags flags)
 {
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
-    off_t size = nb_sectors * BDRV_SECTOR_SIZE;
-    off_t offset = sector_num * BDRV_SECTOR_SIZE;

    acb.size = size;
    acb.ret = 0;
@@ -512,6 +556,14 @@ static int qemu_gluster_create(const char *filename,
    char *tmp = NULL;
    GlusterConf *gconf = g_new0(GlusterConf, 1);

+    gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG,
+                                                 GLUSTER_DEBUG_DEFAULT);
+    if (gconf->debug_level < 0) {
+        gconf->debug_level = 0;
+    } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) {
+        gconf->debug_level = GLUSTER_DEBUG_MAX;
+    }
+
    glfs = qemu_gluster_init(gconf, filename, errp);
    if (!glfs) {
        ret = -errno;
@@ -729,6 +781,159 @@ static int qemu_gluster_has_zero_init(BlockDriverState *bs)
    return 0;
 }

+/*
+ * Find allocation range in @bs around offset @start.
+ * May change underlying file descriptor's file offset.
+ * If @start is not in a hole, store @start in @data, and the
+ * beginning of the next hole in @hole, and return 0.
+ * If @start is in a non-trailing hole, store @start in @hole and the
+ * beginning of the next non-hole in @data, and return 0.
+ * If @start is in a trailing hole or beyond EOF, return -ENXIO.
+ * If we can't find out, return a negative errno other than -ENXIO.
+ *
+ * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
+ */
+static int find_allocation(BlockDriverState *bs, off_t start,
+                           off_t *data, off_t *hole)
+{
+    BDRVGlusterState *s = bs->opaque;
+    off_t offs;
+
+    if (!s->supports_seek_data) {
+        return -ENOTSUP;
+    }
+
+    /*
+     * SEEK_DATA cases:
+     * D1. offs == start: start is in data
+     * D2. offs > start: start is in a hole, next data at offs
+     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
+     *                              or start is beyond EOF
+     *     If the latter happens, the file has been truncated behind
+     *     our back since we opened it.  All bets are off then.
+     *     Treating like a trailing hole is simplest.
+     * D4. offs < 0, errno != ENXIO: we learned nothing
+     */
+    offs = glfs_lseek(s->fd, start, SEEK_DATA);
+    if (offs < 0) {
+        return -errno;          /* D3 or D4 */
+    }
+    assert(offs >= start);
+
+    if (offs > start) {
+        /* D2: in hole, next data at offs */
+        *hole = start;
+        *data = offs;
+        return 0;
+    }
+
+    /* D1: in data, end not yet known */
+
+    /*
+     * SEEK_HOLE cases:
+     * H1. offs == start: start is in a hole
+     *     If this happens here, a hole has been dug behind our back
+     *     since the previous lseek().
+     * H2. offs > start: either start is in data, next hole at offs,
+     *                   or start is in trailing hole, EOF at offs
+     *     Linux treats trailing holes like any other hole: offs ==
+     *     start.  Solaris seeks to EOF instead: offs > start (blech).
+     *     If that happens here, a hole has been dug behind our back
+     *     since the previous lseek().
+     * H3. offs < 0, errno = ENXIO: start is beyond EOF
+     *     If this happens, the file has been truncated behind our
+     *     back since we opened it.  Treat it like a trailing hole.
+     * H4. offs < 0, errno != ENXIO: we learned nothing
+     *     Pretend we know nothing at all, i.e. "forget" about D1.
+     */
+    offs = glfs_lseek(s->fd, start, SEEK_HOLE);
+    if (offs < 0) {
+        return -errno;          /* D1 and (H3 or H4) */
+    }
+    assert(offs >= start);
+
+    if (offs > start) {
+        /*
+         * D1 and H2: either in data, next hole at offs, or it was in
+         * data but is now in a trailing hole.  In the latter case,
+         * all bets are off.  Treating it as if it there was data all
+         * the way to EOF is safe, so simply do that.
+         */
+        *data = start;
+        *hole = offs;
+        return 0;
+    }
+
+    /* D1 and H1 */
+    return -EBUSY;
+}
+
+/*
+ * Returns the allocation status of the specified sectors.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ *
+ * (Based on raw_co_get_block_status() from raw-posix.c.)
+ */
+static int64_t coroutine_fn qemu_gluster_co_get_block_status(
+        BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
+        BlockDriverState **file)
+{
+    BDRVGlusterState *s = bs->opaque;
+    off_t start, data = 0, hole = 0;
+    int64_t total_size;
+    int ret = -EINVAL;
+
+    if (!s->fd) {
+        return ret;
+    }
+
+    start = sector_num * BDRV_SECTOR_SIZE;
+    total_size = bdrv_getlength(bs);
+    if (total_size < 0) {
+        return total_size;
+    } else if (start >= total_size) {
+        *pnum = 0;
+        return 0;
+    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
+        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
+    }
+
+    ret = find_allocation(bs, start, &data, &hole);
+    if (ret == -ENXIO) {
+        /* Trailing hole */
+        *pnum = nb_sectors;
+        ret = BDRV_BLOCK_ZERO;
+    } else if (ret < 0) {
+        /* No info available, so pretend there are no holes */
+        *pnum = nb_sectors;
+        ret = BDRV_BLOCK_DATA;
+    } else if (data == start) {
+        /* On a data extent, compute sectors to the end of the extent,
+         * possibly including a partial sector at EOF. */
+        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
+        ret = BDRV_BLOCK_DATA;
+    } else {
+        /* On a hole, compute sectors to the beginning of the next extent.  */
+        assert(hole == start);
+        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
+        ret = BDRV_BLOCK_ZERO;
+    }
+
+    *file = bs;
+
+    return ret | BDRV_BLOCK_OFFSET_VALID | start;
+}
+
+
 static QemuOptsList qemu_gluster_create_opts = {
    .name = "qemu-gluster-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
@@ -743,6 +948,11 @@ static QemuOptsList qemu_gluster_create_opts = {
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, full)"
        },
+        {
+            .name = GLUSTER_OPT_DEBUG,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Gluster log level, valid range is 0-9",
+        },
        { /* end of list */ }
    }
 };
@@ -769,8 +979,9 @@ static BlockDriver bdrv_gluster = {
    .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
+    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

@@ -796,8 +1007,9 @@ static BlockDriver bdrv_gluster_tcp = {
    .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
+    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

@@ -823,8 +1035,9 @@ static BlockDriver bdrv_gluster_unix = {
    .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
+    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

@@ -850,8 +1063,9 @@ static BlockDriver bdrv_gluster_rdma = {
    .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
+    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

--- a/block/io.c
+++ b/block/io.c
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -401,18 +401,26 @@ static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
    return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
 }

-static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
-                                      IscsiLun *iscsilun)
+static bool is_byte_request_lun_aligned(int64_t offset, int count,
+                                        IscsiLun *iscsilun)
 {
-    if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size ||
-        (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) {
-            error_report("iSCSI misaligned request: "
-                         "iscsilun->block_size %u, sector_num %" PRIi64
-                         ", nb_sectors %d",
-                         iscsilun->block_size, sector_num, nb_sectors);
-            return 0;
+    if (offset % iscsilun->block_size || count % iscsilun->block_size) {
+        error_report("iSCSI misaligned request: "
+                     "iscsilun->block_size %u, offset %" PRIi64
+                     ", count %d",
+                     iscsilun->block_size, offset, count);
+        return false;
    }
-    return 1;
+    return true;
+}
+
+static bool is_sector_request_lun_aligned(int64_t sector_num, int nb_sectors,
+                                          IscsiLun *iscsilun)
+{
+    assert(nb_sectors <= BDRV_REQUEST_MAX_SECTORS);
+    return is_byte_request_lun_aligned(sector_num << BDRV_SECTOR_BITS,
+                                       nb_sectors << BDRV_SECTOR_BITS,
+                                       iscsilun);
 }

 static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
@@ -456,9 +464,12 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    struct IscsiTask iTask;
    uint64_t lba;
    uint32_t num_sectors;
-    bool fua;
+    bool fua = flags & BDRV_REQ_FUA;

-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (fua) {
+        assert(iscsilun->dpofua);
+    }
+    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        return -EINVAL;
    }

@@ -472,7 +483,6 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
    iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
-    fua = iscsilun->dpofua && (flags & BDRV_REQ_FUA);
    if (iscsilun->use_16_for_rw) {
        iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
                                        NULL, num_sectors * iscsilun->block_size,
@@ -513,13 +523,6 @@ retry:
    return 0;
 }

-static int coroutine_fn
-iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                QEMUIOVector *iov)
-{
-    return iscsi_co_writev_flags(bs, sector_num, nb_sectors, iov, 0);
-}
-

 static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun,
                                             int64_t sector_num, int nb_sectors)
@@ -546,7 +549,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,

    iscsi_co_init_iscsitask(iscsilun, &iTask);

-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        ret = -EINVAL;
        goto out;
    }
@@ -643,7 +646,7 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
    uint64_t lba;
    uint32_t num_sectors;

-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        return -EINVAL;
    }

@@ -658,7 +661,8 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
        int64_t ret;
        int pnum;
        BlockDriverState *file;
-        ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum, &file);
+        ret = iscsi_co_get_block_status(bs, sector_num,
+                                        BDRV_REQUEST_MAX_SECTORS, &pnum, &file);
        if (ret < 0) {
            return ret;
        }
@@ -766,6 +770,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
    acb->ioh->driver_status = 0;
    acb->ioh->host_status   = 0;
    acb->ioh->resid         = 0;
+    acb->ioh->status        = status;

 #define SG_ERR_DRIVER_SENSE    0x08

@@ -837,6 +842,13 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
        return &acb->common;
    }

+    if (acb->ioh->cmd_len > SCSI_CDB_MAX_SIZE) {
+        error_report("iSCSI: ioctl error CDB exceeds max size (%d > %d)",
+                     acb->ioh->cmd_len, SCSI_CDB_MAX_SIZE);
+        qemu_aio_unref(acb);
+        return NULL;
+    }
+
    acb->task = malloc(sizeof(struct scsi_task));
    if (acb->task == NULL) {
        error_report("iSCSI: Failed to allocate task for scsi command. %s",
@@ -923,7 +935,7 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
    struct IscsiTask iTask;
    struct unmap_list list;

-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
        return -EINVAL;
    }

@@ -974,8 +986,8 @@ retry:
 }

 static int
-coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
-                                   int nb_sectors, BdrvRequestFlags flags)
+coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                                    int count, BdrvRequestFlags flags)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct IscsiTask iTask;
@@ -983,8 +995,8 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
    uint32_t nb_blocks;
    bool use_16_for_ws = iscsilun->use_16_for_rw;

-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
-        return -EINVAL;
+    if (!is_byte_request_lun_aligned(offset, count, iscsilun)) {
+        return -ENOTSUP;
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
@@ -1005,8 +1017,8 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
        return -ENOTSUP;
    }

-    lba = sector_qemu2lun(sector_num, iscsilun);
-    nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);
+    lba = offset / iscsilun->block_size;
+    nb_blocks = count / iscsilun->block_size;

    if (iscsilun->zeroblock == NULL) {
        iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size);
@@ -1062,9 +1074,11 @@ retry:
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
-        iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
+        iscsi_allocationmap_clear(iscsilun, offset >> BDRV_SECTOR_BITS,
+                                  count >> BDRV_SECTOR_BITS);
    } else {
-        iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
+        iscsi_allocationmap_set(iscsilun, offset >> BDRV_SECTOR_BITS,
+                                count >> BDRV_SECTOR_BITS);
    }

    return 0;
@@ -1555,6 +1569,10 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
    task = NULL;

    iscsi_modesense_sync(iscsilun);
+    if (iscsilun->dpofua) {
+        bs->supported_write_flags = BDRV_REQ_FUA;
+    }
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;

    /* Check the write protect flag of the LUN if we want to write */
    if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
@@ -1704,15 +1722,19 @@ static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp)
        }
        bs->bl.discard_alignment =
            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
+    } else {
+        bs->bl.discard_alignment = iscsilun->block_size >> BDRV_SECTOR_BITS;
    }

-    if (iscsilun->bl.max_ws_len < 0xffffffff) {
-        bs->bl.max_write_zeroes =
-            sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun);
+    if (iscsilun->bl.max_ws_len < 0xffffffff / iscsilun->block_size) {
+        bs->bl.max_pwrite_zeroes =
+            iscsilun->bl.max_ws_len * iscsilun->block_size;
    }
    if (iscsilun->lbp.lbpws) {
-        bs->bl.write_zeroes_alignment =
-            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
+        bs->bl.pwrite_zeroes_alignment =
+            iscsilun->bl.opt_unmap_gran * iscsilun->block_size;
+    } else {
+        bs->bl.pwrite_zeroes_alignment = iscsilun->block_size;
    }
    bs->bl.opt_transfer_length =
        sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun);
@@ -1845,11 +1867,9 @@ static BlockDriver bdrv_iscsi = {

    .bdrv_co_get_block_status = iscsi_co_get_block_status,
    .bdrv_co_discard      = iscsi_co_discard,
-    .bdrv_co_write_zeroes = iscsi_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
    .bdrv_co_readv         = iscsi_co_readv,
-    .bdrv_co_writev        = iscsi_co_writev,
    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
-    .supported_write_flags = BDRV_REQ_FUA,
    .bdrv_co_flush_to_disk = iscsi_co_flush,

 #ifdef __linux__
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -11,8 +11,10 @@
 #include "qemu-common.h"
 #include "block/aio.h"
 #include "qemu/queue.h"
+#include "block/block.h"
 #include "block/raw-aio.h"
 #include "qemu/event_notifier.h"
+#include "qemu/coroutine.h"

 #include <libaio.h>

@@ -30,7 +32,8 @@

 struct qemu_laiocb {
    BlockAIOCB common;
-    struct qemu_laio_state *ctx;
+    Coroutine *co;
+    LinuxAioState *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
@@ -46,7 +49,7 @@ typedef struct {
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
 } LaioQueue;

-struct qemu_laio_state {
+struct LinuxAioState {
    io_context_t ctx;
    EventNotifier e;

@@ -60,7 +63,7 @@ struct qemu_laio_state {
    int event_max;
 };

-static void ioq_submit(struct qemu_laio_state *s);
+static void ioq_submit(LinuxAioState *s);

 static inline ssize_t io_event_ret(struct io_event *ev)
 {
@@ -70,8 +73,7 @@ static inline ssize_t io_event_ret(struct io_event *ev)
 /*
 * Completes an AIO request (calls the callback and frees the ACB).
 */
-static void qemu_laio_process_completion(struct qemu_laio_state *s,
-    struct qemu_laiocb *laiocb)
+static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
 {
    int ret;

@@ -89,9 +91,14 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
            }
        }
    }
-    laiocb->common.cb(laiocb->common.opaque, ret);

-    qemu_aio_unref(laiocb);
+    laiocb->ret = ret;
+    if (laiocb->co) {
+        qemu_coroutine_enter(laiocb->co, NULL);
+    } else {
+        laiocb->common.cb(laiocb->common.opaque, ret);
+        qemu_aio_unref(laiocb);
+    }
 }

 /* The completion BH fetches completed I/O requests and invokes their
@@ -99,7 +106,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
- * the completion events array and index are kept in qemu_laio_state.  The BH
+ * the completion events array and index are kept in LinuxAioState.  The BH
 * reschedules itself as long as there are completions pending so it will
 * either be called again in a nested event loop or will be called after all
 * events have been completed.  When there are no events left to complete, the
@@ -107,7 +114,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
 */
 static void qemu_laio_completion_bh(void *opaque)
 {
-    struct qemu_laio_state *s = opaque;
+    LinuxAioState *s = opaque;

    /* Fetch more completion events when empty */
    if (s->event_idx == s->event_max) {
@@ -136,20 +143,22 @@ static void qemu_laio_completion_bh(void *opaque)
        laiocb->ret = io_event_ret(&s->events[s->event_idx]);
        s->event_idx++;

-        qemu_laio_process_completion(s, laiocb);
+        qemu_laio_process_completion(laiocb);
    }

    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
+
+    qemu_bh_cancel(s->completion_bh);
 }

 static void qemu_laio_completion_cb(EventNotifier *e)
 {
-    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
+    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (event_notifier_test_and_clear(&s->e)) {
-        qemu_bh_schedule(s->completion_bh);
+        qemu_laio_completion_bh(s);
    }
 }

@@ -185,7 +194,7 @@ static void ioq_init(LaioQueue *io_q)
    io_q->blocked = false;
 }

-static void ioq_submit(struct qemu_laio_state *s)
+static void ioq_submit(LinuxAioState *s)
 {
    int ret, len;
    struct qemu_laiocb *aiocb;
@@ -216,45 +225,27 @@ static void ioq_submit(struct qemu_laio_state *s)
    s->io_q.blocked = (s->io_q.n > 0);
 }

-void laio_io_plug(BlockDriverState *bs, void *aio_ctx)
+void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
 {
-    struct qemu_laio_state *s = aio_ctx;
-
-    s->io_q.plugged++;
+    assert(!s->io_q.plugged);
+    s->io_q.plugged = 1;
 }

-void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug)
+void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
 {
-    struct qemu_laio_state *s = aio_ctx;
-
-    assert(s->io_q.plugged > 0 || !unplug);
-
-    if (unplug && --s->io_q.plugged > 0) {
-        return;
-    }
-
+    assert(s->io_q.plugged);
+    s->io_q.plugged = 0;
    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
 }

-BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque, int type)
+static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
+                          int type)
 {
-    struct qemu_laio_state *s = aio_ctx;
-    struct qemu_laiocb *laiocb;
-    struct iocb *iocbs;
-    off_t offset = sector_num * 512;
-
-    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
-    laiocb->nbytes = nb_sectors * 512;
-    laiocb->ctx = s;
-    laiocb->ret = -EINPROGRESS;
-    laiocb->is_read = (type == QEMU_AIO_READ);
-    laiocb->qiov = qiov;
-
-    iocbs = &laiocb->iocb;
+    LinuxAioState *s = laiocb->ctx;
+    struct iocb *iocbs = &laiocb->iocb;
+    QEMUIOVector *qiov = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
@@ -267,7 +258,7 @@ BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
-        goto out_free_aiocb;
+        return -EIO;
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

@@ -277,33 +268,71 @@ BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
        ioq_submit(s);
    }
-    return &laiocb->common;

-out_free_aiocb:
-    qemu_aio_unref(laiocb);
-    return NULL;
+    return 0;
 }

-void laio_detach_aio_context(void *s_, AioContext *old_context)
+int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
+                                uint64_t offset, QEMUIOVector *qiov, int type)
 {
-    struct qemu_laio_state *s = s_;
+    int ret;
+    struct qemu_laiocb laiocb = {
+        .co         = qemu_coroutine_self(),
+        .nbytes     = qiov->size,
+        .ctx        = s,
+        .is_read    = (type == QEMU_AIO_READ),
+        .qiov       = qiov,
+    };

+    ret = laio_do_submit(fd, &laiocb, offset, type);
+    if (ret < 0) {
+        return ret;
+    }
+
+    qemu_coroutine_yield();
+    return laiocb.ret;
+}
+
+BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockCompletionFunc *cb, void *opaque, int type)
+{
+    struct qemu_laiocb *laiocb;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;
+    int ret;
+
+    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
+    laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE;
+    laiocb->ctx = s;
+    laiocb->ret = -EINPROGRESS;
+    laiocb->is_read = (type == QEMU_AIO_READ);
+    laiocb->qiov = qiov;
+
+    ret = laio_do_submit(fd, laiocb, offset, type);
+    if (ret < 0) {
+        qemu_aio_unref(laiocb);
+        return NULL;
+    }
+
+    return &laiocb->common;
+}
+
+void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
+{
    aio_set_event_notifier(old_context, &s->e, false, NULL);
    qemu_bh_delete(s->completion_bh);
 }

-void laio_attach_aio_context(void *s_, AioContext *new_context)
+void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
 {
-    struct qemu_laio_state *s = s_;
-
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb);
 }

-void *laio_init(void)
+LinuxAioState *laio_init(void)
 {
-    struct qemu_laio_state *s;
+    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));
    if (event_notifier_init(&s->e, false) < 0) {
@@ -325,10 +354,8 @@ out_free_state:
    return NULL;
 }

-void laio_cleanup(void *s_)
+void laio_cleanup(LinuxAioState *s)
 {
-    struct qemu_laio_state *s = s_;
-
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -20,7 +20,6 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
-#include "qemu/error-report.h"

 #define SLICE_TIME    100000000ULL /* ns */
 #define MAX_IN_FLIGHT 16
@@ -36,7 +35,7 @@ typedef struct MirrorBuffer {
 typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
-    BlockDriverState *target;
+    BlockBackend *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
@@ -45,6 +44,7 @@ typedef struct MirrorBlockJob {
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
+    BlockMirrorBackingMode backing_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
@@ -80,11 +80,11 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
 {
    s->synced = false;
    if (read) {
-        return block_job_error_action(&s->common, s->common.bs,
-                                      s->on_source_error, true, error);
+        return block_job_error_action(&s->common, s->on_source_error,
+                                      true, error);
    } else {
-        return block_job_error_action(&s->common, s->target,
-                                      s->on_target_error, false, error);
+        return block_job_error_action(&s->common, s->on_target_error,
+                                      false, error);
    }
 }

@@ -157,8 +157,8 @@ static void mirror_read_complete(void *opaque, int ret)
        mirror_iteration_done(op, ret);
        return;
    }
-    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
-                    mirror_write_complete, op);
+    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+                    0, mirror_write_complete, op);
 }

 static inline void mirror_clip_sectors(MirrorBlockJob *s,
@@ -186,8 +186,9 @@ static int mirror_cow_align(MirrorBlockJob *s,
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
-        bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors,
-                               &align_sector_num, &align_nb_sectors);
+        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
+                                       *nb_sectors, &align_sector_num,
+                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
@@ -217,23 +218,29 @@ static inline void mirror_wait_for_io(MirrorBlockJob *s)
 }

 /* Submit async read while handling COW.
- * Returns: nb_sectors if no alignment is necessary, or
+ * Returns: The number of sectors copied after and including sector_num,
+ *          excluding any sectors copied prior to sector_num due to alignment.
+ *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
 static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
 {
-    BlockDriverState *source = s->common.bs;
+    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
-    int ret = nb_sectors;
+    int ret;
    MirrorOp *op;
+    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
+    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
+    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
@@ -274,7 +281,7 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

-    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
 }
@@ -296,10 +303,11 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
-        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
-                         mirror_write_complete, op);
+        blk_aio_discard(s->target, sector_num, op->nb_sectors,
+                        mirror_write_complete, op);
    } else {
-        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
+        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
+                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
@@ -307,7 +315,7 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,

 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
-    BlockDriverState *source = s->common.bs;
+    BlockDriverState *source = blk_bs(s->common.blk);
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
@@ -325,10 +333,12 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)

    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
-        trace_mirror_yield_in_flight(s, first_chunk, s->in_flight);
+        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

+    block_job_pause_point(&s->common);
+
    /* Find the number of consective dirty chunks following the first dirty
     * one, and wait for in flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
@@ -384,8 +394,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
-            bdrv_round_to_clusters(s->target, sector_num, io_sectors,
-                                   &target_sector_num, &target_nb_sectors);
+            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
+                                           io_sectors,  &target_sector_num,
+                                           &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
@@ -449,7 +460,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = s->common.bs;
+    BlockDriverState *src = blk_bs(s->common.blk);
+    BlockDriverState *target_bs = blk_bs(s->target);

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
@@ -461,26 +473,25 @@ static void mirror_exit(BlockJob *job, void *opaque)
    }

    if (s->should_complete && data->ret == 0) {
-        BlockDriverState *to_replace = s->common.bs;
+        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

-        /* This was checked in mirror_start_job(), but meanwhile one of the
-         * nodes could have been newly attached to a BlockBackend. */
-        if (to_replace->blk && s->target->blk) {
-            error_report("block job: Can't create node with two BlockBackends");
-            data->ret = -EINVAL;
-            goto out;
+        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
+            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

-        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
-            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
-        }
-        bdrv_replace_in_backing_chain(to_replace, s->target);
+        /* The mirror job has no requests in flight any more, but we need to
+         * drain potential other users of the BDS before changing the graph. */
+        bdrv_drained_begin(target_bs);
+        bdrv_replace_in_backing_chain(to_replace, target_bs);
+        bdrv_drained_end(target_bs);
+
+        /* We just changed the BDS the job BB refers to */
+        blk_remove_bs(job->blk);
+        blk_insert_bs(job->blk, src);
    }
-
-out:
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
@@ -490,8 +501,8 @@ out:
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
-    bdrv_op_unblock_all(s->target, s->common.blocker);
-    bdrv_unref(s->target);
+    bdrv_op_unblock_all(target_bs, s->common.blocker);
+    blk_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
@@ -505,7 +516,8 @@ static void coroutine_fn mirror_run(void *opaque)
 {
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
-    BlockDriverState *bs = s->common.bs;
+    BlockDriverState *bs = blk_bs(s->common.blk);
+    BlockDriverState *target_bs = blk_bs(s->target);
    int64_t sector_num, end, length;
    uint64_t last_pause_ns;
    BlockDriverInfo bdi;
@@ -541,18 +553,18 @@ static void coroutine_fn mirror_run(void *opaque)
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
-    bdrv_get_backing_filename(s->target, backing_filename,
+    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
-    if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) {
+    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
-    if (backing_filename[0] && !s->target->backing
+    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
-    s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov);
+    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
@@ -567,7 +579,7 @@ static void coroutine_fn mirror_run(void *opaque)
    if (!s->is_none_mode) {
        /* First part, loop on the sectors and initialize the dirty bitmap.  */
        BlockDriverState *base = s->base;
-        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target);
+        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(target_bs);

        for (sector_num = 0; sector_num < end; ) {
            /* Just to make sure we are not exceeding int limit. */
@@ -578,6 +590,8 @@ static void coroutine_fn mirror_run(void *opaque)
            if (now - last_pause_ns > SLICE_TIME) {
                last_pause_ns = now;
                block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
+            } else {
+                block_job_pause_point(&s->common);
            }

            if (block_job_is_cancelled(&s->common)) {
@@ -609,6 +623,8 @@ static void coroutine_fn mirror_run(void *opaque)
            goto immediate_exit;
        }

+        block_job_pause_point(&s->common);
+
        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
@@ -637,7 +653,7 @@ static void coroutine_fn mirror_run(void *opaque)
        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
-            ret = bdrv_flush(s->target);
+            ret = blk_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
@@ -710,15 +726,12 @@ immediate_exit:
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
-    if (s->target->blk) {
-        blk_iostatus_disable(s->target->blk);
-    }

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    /* Before we switch to target in mirror_exit, make sure data doesn't
     * change. */
-    bdrv_drained_begin(s->common.bs);
+    bdrv_drained_begin(bs);
    if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
        /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
         * above bdrv_drained_end isn't enough to quiesce it. This is ugly, we
@@ -739,32 +752,30 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }

-static void mirror_iostatus_reset(BlockJob *job)
-{
-    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-
-    if (s->target->blk) {
-        blk_iostatus_reset(s->target->blk);
-    }
-}
-
 static void mirror_complete(BlockJob *job, Error **errp)
 {
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-    Error *local_err = NULL;
-    int ret;
+    BlockDriverState *src, *target;
+
+    src = blk_bs(job->blk);
+    target = blk_bs(s->target);

-    ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
-        return;
-    }
    if (!s->synced) {
        error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id);
        return;
    }

-    /* check the target bs is not blocked and block all operations on it */
+    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
+        int ret;
+
+        assert(!target->backing);
+        ret = bdrv_open_backing_file(target, NULL, "backing", errp);
+        if (ret < 0) {
+            return;
+        }
+    }
+
+    /* block all operations on to_replace bs */
    if (s->replaces) {
        AioContext *replace_aio_context;

@@ -785,31 +796,57 @@ static void mirror_complete(BlockJob *job, Error **errp)
        aio_context_release(replace_aio_context);
    }

+    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
+        BlockDriverState *backing = s->is_none_mode ? src : s->base;
+        if (backing_bs(target) != backing) {
+            bdrv_set_backing_hd(target, backing);
+        }
+    }
+
    s->should_complete = true;
    block_job_enter(&s->common);
 }

+/* There is no matching mirror_resume() because mirror_run() will begin
+ * iterating again when the job is resumed.
+ */
+static void coroutine_fn mirror_pause(BlockJob *job)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    mirror_drain(s);
+}
+
+static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    blk_set_aio_context(s->target, new_context);
+}
+
 static const BlockJobDriver mirror_job_driver = {
-    .instance_size = sizeof(MirrorBlockJob),
-    .job_type      = BLOCK_JOB_TYPE_MIRROR,
-    .set_speed     = mirror_set_speed,
-    .iostatus_reset= mirror_iostatus_reset,
-    .complete      = mirror_complete,
+    .instance_size          = sizeof(MirrorBlockJob),
+    .job_type               = BLOCK_JOB_TYPE_MIRROR,
+    .set_speed              = mirror_set_speed,
+    .complete               = mirror_complete,
+    .pause                  = mirror_pause,
+    .attached_aio_context   = mirror_attached_aio_context,
 };

 static const BlockJobDriver commit_active_job_driver = {
-    .instance_size = sizeof(MirrorBlockJob),
-    .job_type      = BLOCK_JOB_TYPE_COMMIT,
-    .set_speed     = mirror_set_speed,
-    .iostatus_reset
-                   = mirror_iostatus_reset,
-    .complete      = mirror_complete,
+    .instance_size          = sizeof(MirrorBlockJob),
+    .job_type               = BLOCK_JOB_TYPE_COMMIT,
+    .set_speed              = mirror_set_speed,
+    .complete               = mirror_complete,
+    .pause                  = mirror_pause,
+    .attached_aio_context   = mirror_attached_aio_context,
 };

 static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
+                             BlockMirrorBackingMode backing_mode,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
@@ -819,7 +856,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             bool is_none_mode, BlockDriverState *base)
 {
    MirrorBlockJob *s;
-    BlockDriverState *replaced_bs;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
@@ -827,13 +863,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,

    assert ((granularity & (granularity - 1)) == 0);

-    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
-        return;
-    }
-
    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
@@ -843,31 +872,19 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

-    /* We can't support this case as long as the block layer can't handle
-     * multiple BlockBackends per BlockDriverState. */
-    if (replaces) {
-        replaced_bs = bdrv_lookup_bs(replaces, replaces, errp);
-        if (replaced_bs == NULL) {
-            return;
-        }
-    } else {
-        replaced_bs = bs;
-    }
-    if (replaced_bs->blk && target->blk) {
-        error_setg(errp, "Can't create node with two BlockBackends");
-        return;
-    }
-
    s = block_job_create(driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

+    s->target = blk_new();
+    blk_insert_bs(s->target, target);
+
    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
-    s->target = target;
    s->is_none_mode = is_none_mode;
+    s->backing_mode = backing_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
@@ -876,16 +893,13 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
+        blk_unref(s->target);
        block_job_unref(&s->common);
        return;
    }

-    bdrv_op_block_all(s->target, s->common.blocker);
+    bdrv_op_block_all(target, s->common.blocker);

-    if (s->target->blk) {
-        blk_set_on_error(s->target->blk, on_target_error, on_target_error);
-        blk_iostatus_enable(s->target->blk);
-    }
    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co, s);
@@ -894,7 +908,8 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
 void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                  const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
-                  MirrorSyncMode mode, BlockdevOnError on_source_error,
+                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
+                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
@@ -910,7 +925,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
    mirror_start_job(bs, target, replaces,
-                     speed, granularity, buf_size,
+                     speed, granularity, buf_size, backing_mode,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
                     &mirror_job_driver, is_none_mode, base);
 }
@@ -957,8 +972,7 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
        }
    }

-    bdrv_ref(base);
-    mirror_start_job(bs, base, NULL, speed, 0, 0,
+    mirror_start_job(bs, base, NULL, speed, 0, 0, MIRROR_LEAVE_BACKING_CHAIN,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
    if (local_err) {
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -243,15 +243,15 @@ static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,

 static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors, QEMUIOVector *qiov,
-                           int offset, int *flags)
+                           int offset, int flags)
 {
    NbdClientSession *client = nbd_get_client_session(bs);
    struct nbd_request request = { .type = NBD_CMD_WRITE };
    struct nbd_reply reply;
    ssize_t ret;

-    if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) {
-        *flags &= ~BDRV_REQ_FUA;
+    if (flags & BDRV_REQ_FUA) {
+        assert(client->nbdflags & NBD_FLAG_SEND_FUA);
        request.type |= NBD_CMD_FLAG_FUA;
    }

@@ -291,7 +291,7 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
 }

 int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
-                         int nb_sectors, QEMUIOVector *qiov, int *flags)
+                         int nb_sectors, QEMUIOVector *qiov, int flags)
 {
    int offset = 0;
    int ret;
@@ -414,6 +414,9 @@ int nbd_client_init(BlockDriverState *bs,
        logout("Failed to negotiate with the NBD server\n");
        return ret;
    }
+    if (client->nbdflags & NBD_FLAG_SEND_FUA) {
+        bs->supported_write_flags = BDRV_REQ_FUA;
+    }

    qemu_co_mutex_init(&client->send_mutex);
    qemu_co_mutex_init(&client->free_sema);
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -48,7 +48,7 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors);
 int nbd_client_co_flush(BlockDriverState *bs);
 int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
-                         int nb_sectors, QEMUIOVector *qiov, int *flags);
+                         int nb_sectors, QEMUIOVector *qiov, int flags);
 int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov);

--- a/block/nbd.c
+++ b/block/nbd.c
@@ -355,31 +355,6 @@ static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
    return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov);
 }

-static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num,
-                               int nb_sectors, QEMUIOVector *qiov, int flags)
-{
-    int ret;
-
-    ret = nbd_client_co_writev(bs, sector_num, nb_sectors, qiov, &flags);
-    if (ret < 0) {
-        return ret;
-    }
-
-    /* The flag wasn't sent to the server, so we need to emulate it with an
-     * explicit flush */
-    if (flags & BDRV_REQ_FUA) {
-        ret = nbd_client_co_flush(bs);
-    }
-
-    return ret;
-}
-
-static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
-                         int nb_sectors, QEMUIOVector *qiov)
-{
-    return nbd_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0);
-}
-
 static int nbd_co_flush(BlockDriverState *bs)
 {
    return nbd_client_co_flush(bs);
@@ -476,9 +451,7 @@ static BlockDriver bdrv_nbd = {
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
    .bdrv_co_readv              = nbd_co_readv,
-    .bdrv_co_writev             = nbd_co_writev,
-    .bdrv_co_writev_flags       = nbd_co_writev_flags,
-    .supported_write_flags      = BDRV_REQ_FUA,
+    .bdrv_co_writev_flags       = nbd_client_co_writev,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_discard            = nbd_co_discard,
@@ -496,9 +469,7 @@ static BlockDriver bdrv_nbd_tcp = {
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
    .bdrv_co_readv              = nbd_co_readv,
-    .bdrv_co_writev             = nbd_co_writev,
-    .bdrv_co_writev_flags       = nbd_co_writev_flags,
-    .supported_write_flags      = BDRV_REQ_FUA,
+    .bdrv_co_writev_flags       = nbd_client_co_writev,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_discard            = nbd_co_discard,
@@ -516,9 +487,7 @@ static BlockDriver bdrv_nbd_unix = {
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
    .bdrv_co_readv              = nbd_co_readv,
-    .bdrv_co_writev             = nbd_co_writev,
-    .bdrv_co_writev_flags       = nbd_co_writev_flags,
-    .supported_write_flags      = BDRV_REQ_FUA,
+    .bdrv_co_writev_flags       = nbd_client_co_writev,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_discard            = nbd_co_discard,
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -1,7 +1,7 @@
 /*
 * QEMU Block driver for native access to files on NFS shares
 *
- * Copyright (c) 2014 Peter Lieven <pl@kamp.de>
+ * Copyright (c) 2014-2016 Peter Lieven <pl@kamp.de>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -38,6 +38,7 @@
 #include <nfsc/libnfs.h>

 #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576
+#define QEMU_NFS_MAX_PAGECACHE_SIZE (8388608 / NFS_BLKSIZE)
 #define QEMU_NFS_MAX_DEBUG_LEVEL 2

 typedef struct NFSClient {
@@ -47,6 +48,7 @@ typedef struct NFSClient {
    bool has_zero_init;
    AioContext *aio_context;
    blkcnt_t st_blocks;
+    bool cache_used;
 } NFSClient;

 typedef struct NFSRPC {
@@ -278,7 +280,7 @@ static void nfs_file_close(BlockDriverState *bs)
 }

 static int64_t nfs_client_open(NFSClient *client, const char *filename,
-                               int flags, Error **errp)
+                               int flags, Error **errp, int open_flags)
 {
    int ret = -EINVAL, i;
    struct stat st;
@@ -330,12 +332,38 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
            nfs_set_tcp_syncnt(client->context, val);
 #ifdef LIBNFS_FEATURE_READAHEAD
        } else if (!strcmp(qp->p[i].name, "readahead")) {
+            if (open_flags & BDRV_O_NOCACHE) {
+                error_setg(errp, "Cannot enable NFS readahead "
+                                 "if cache.direct = on");
+                goto fail;
+            }
            if (val > QEMU_NFS_MAX_READAHEAD_SIZE) {
                error_report("NFS Warning: Truncating NFS readahead"
                             " size to %d", QEMU_NFS_MAX_READAHEAD_SIZE);
                val = QEMU_NFS_MAX_READAHEAD_SIZE;
            }
            nfs_set_readahead(client->context, val);
+#ifdef LIBNFS_FEATURE_PAGECACHE
+            nfs_set_pagecache_ttl(client->context, 0);
+#endif
+            client->cache_used = true;
+#endif
+#ifdef LIBNFS_FEATURE_PAGECACHE
+            nfs_set_pagecache_ttl(client->context, 0);
+        } else if (!strcmp(qp->p[i].name, "pagecache")) {
+            if (open_flags & BDRV_O_NOCACHE) {
+                error_setg(errp, "Cannot enable NFS pagecache "
+                                 "if cache.direct = on");
+                goto fail;
+            }
+            if (val > QEMU_NFS_MAX_PAGECACHE_SIZE) {
+                error_report("NFS Warning: Truncating NFS pagecache"
+                             " size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE);
+                val = QEMU_NFS_MAX_PAGECACHE_SIZE;
+            }
+            nfs_set_pagecache(client->context, val);
+            nfs_set_pagecache_ttl(client->context, 0);
+            client->cache_used = true;
 #endif
 #ifdef LIBNFS_FEATURE_DEBUG
        } else if (!strcmp(qp->p[i].name, "debug")) {
@@ -418,7 +446,7 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags,
    }
    ret = nfs_client_open(client, qemu_opt_get(opts, "filename"),
                          (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY,
-                          errp);
+                          errp, bs->open_flags);
    if (ret < 0) {
        goto out;
    }
@@ -454,7 +482,7 @@ static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);

-    ret = nfs_client_open(client, url, O_CREAT, errp);
+    ret = nfs_client_open(client, url, O_CREAT, errp, 0);
    if (ret < 0) {
        goto out;
    }
@@ -516,6 +544,12 @@ static int nfs_reopen_prepare(BDRVReopenState *state,
        return -EACCES;
    }

+    if ((state->flags & BDRV_O_NOCACHE) && client->cache_used) {
+        error_setg(errp, "Cannot disable cache if libnfs readahead or"
+                         " pagecache is enabled");
+        return -EINVAL;
+    }
+
    /* Update cache for read-only reopens */
    if (!(state->flags & BDRV_O_RDWR)) {
        ret = nfs_fstat(client->context, client->fh, &st);
@@ -530,6 +564,15 @@ static int nfs_reopen_prepare(BDRVReopenState *state,
    return 0;
 }

+#ifdef LIBNFS_FEATURE_PAGECACHE
+static void nfs_invalidate_cache(BlockDriverState *bs,
+                                 Error **errp)
+{
+    NFSClient *client = bs->opaque;
+    nfs_pagecache_invalidate(client->context, client->fh);
+}
+#endif
+
 static BlockDriver bdrv_nfs = {
    .format_name                    = "nfs",
    .protocol_name                  = "nfs",
@@ -553,6 +596,10 @@ static BlockDriver bdrv_nfs = {

    .bdrv_detach_aio_context        = nfs_detach_aio_context,
    .bdrv_attach_aio_context        = nfs_attach_aio_context,
+
+#ifdef LIBNFS_FEATURE_PAGECACHE
+    .bdrv_invalidate_cache          = nfs_invalidate_cache,
+#endif
 };

 static void nfs_block_init(void)
--- a/block/null.c
+++ b/block/null.c
@@ -12,6 +12,8 @@

 #include "qemu/osdep.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
 #include "block/block_int.h"

 #define NULL_OPT_LATENCY "latency-ns"
@@ -223,6 +225,20 @@ static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs,
    }
 }

+static void null_refresh_filename(BlockDriverState *bs, QDict *opts)
+{
+    QINCREF(opts);
+    qdict_del(opts, "filename");
+
+    if (!qdict_size(opts)) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename), "%s://",
+                 bs->drv->format_name);
+    }
+
+    qdict_put(opts, "driver", qstring_from_str(bs->drv->format_name));
+    bs->full_open_options = opts;
+}
+
 static BlockDriver bdrv_null_co = {
    .format_name            = "null-co",
    .protocol_name          = "null-co",
@@ -238,6 +254,8 @@ static BlockDriver bdrv_null_co = {
    .bdrv_reopen_prepare    = null_reopen_prepare,

    .bdrv_co_get_block_status   = null_co_get_block_status,
+
+    .bdrv_refresh_filename  = null_refresh_filename,
 };

 static BlockDriver bdrv_null_aio = {
@@ -255,6 +273,8 @@ static BlockDriver bdrv_null_aio = {
    .bdrv_reopen_prepare    = null_reopen_prepare,

    .bdrv_co_get_block_status   = null_co_get_block_status,
+
+    .bdrv_refresh_filename  = null_refresh_filename,
 };

 static void bdrv_null_init(void)
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -33,6 +33,7 @@
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "qemu/bitmap.h"
 #include "qapi/util.h"

@@ -203,13 +204,15 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
        return -EINVAL;
    }

-    to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx;
+    to_allocate = DIV_ROUND_UP(sector_num + *pnum, s->tracks) - idx;
    space = to_allocate * s->tracks;
    if (s->data_end + space > bdrv_getlength(bs->file->bs) >> BDRV_SECTOR_BITS) {
        int ret;
        space += s->prealloc_size;
        if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
-            ret = bdrv_write_zeroes(bs->file->bs, s->data_end, space, 0);
+            ret = bdrv_pwrite_zeroes(bs->file->bs,
+                                     s->data_end << BDRV_SECTOR_BITS,
+                                     space << BDRV_SECTOR_BITS, 0);
        } else {
            ret = bdrv_truncate(bs->file->bs,
                                (s->data_end + space) << BDRV_SECTOR_BITS);
@@ -512,11 +515,12 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    memset(tmp, 0, sizeof(tmp));
    memcpy(tmp, &header, sizeof(header));

-    ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE);
+    ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE, 0);
    if (ret < 0) {
        goto exit;
    }
-    ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0);
+    ret = blk_pwrite_zeroes(file, BDRV_SECTOR_SIZE,
+                            (bat_sectors - 1) << BDRV_SECTOR_BITS, 0);
    if (ret < 0) {
        goto exit;
    }
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -67,10 +67,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
    info->backing_file_depth = bdrv_get_backing_file_depth(bs);
    info->detect_zeroes = bs->detect_zeroes;

-    if (bs->throttle_state) {
+    if (blk && blk_get_public(blk)->throttle_state) {
        ThrottleConfig cfg;

-        throttle_group_get_config(bs, &cfg);
+        throttle_group_get_config(blk, &cfg);

        info->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
        info->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
@@ -118,7 +118,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
        info->iops_size = cfg.op_size;

        info->has_group = true;
-        info->group = g_strdup(throttle_group_get_name(bs));
+        info->group = g_strdup(throttle_group_get_name(blk));
    }

    info->write_threshold = bdrv_write_threshold_get(bs);
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -28,6 +28,7 @@
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include <zlib.h>
 #include "qapi/qmp/qerror.h"
 #include "crypto/cipher.h"
@@ -161,10 +162,16 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
    if (s->crypt_method_header) {
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
-            error_report("qcow built-in AES encryption is deprecated");
-            error_printf("Support for it will be removed in a future release.\n"
-                         "You can use 'qemu-img convert' to switch to an\n"
-                         "unencrypted qcow image, or a LUKS raw image.\n");
+            error_setg(errp,
+                       "Use of AES-CBC encrypted qcow images is no longer "
+                       "supported in system emulators");
+            error_append_hint(errp,
+                              "You can use 'qemu-img convert' to convert your "
+                              "image to an alternative supported format, such "
+                              "as unencrypted qcow, or raw with the LUKS "
+                              "format instead.\n");
+            ret = -ENOSYS;
+            goto fail;
        }

        bs->encrypted = 1;
@@ -853,24 +860,24 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    /* write all the data */
-    ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header));
+    ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header), 0);
    if (ret != sizeof(header)) {
        goto exit;
    }

    if (backing_file) {
        ret = blk_pwrite(qcow_blk, sizeof(header),
-            backing_file, backing_filename_len);
+                         backing_file, backing_filename_len, 0);
        if (ret != backing_filename_len) {
            goto exit;
        }
    }

    tmp = g_malloc0(BDRV_SECTOR_SIZE);
-    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
-        BDRV_SECTOR_SIZE); i++) {
-        ret = blk_pwrite(qcow_blk, header_size +
-            BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
+    for (i = 0; i < DIV_ROUND_UP(sizeof(uint64_t) * l1_size, BDRV_SECTOR_SIZE);
+         i++) {
+        ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i,
+                         tmp, BDRV_SECTOR_SIZE, 0);
        if (ret != BDRV_SECTOR_SIZE) {
            g_free(tmp);
            goto exit;
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -24,11 +24,6 @@

 /* Needed for CONFIG_MADVISE */
 #include "qemu/osdep.h"
-
-#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE)
-#include <sys/mman.h>
-#endif
-
 #include "block/block_int.h"
 #include "qemu-common.h"
 #include "qcow2.h"
@@ -226,7 +221,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
    return 0;
 }

-int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
+int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c)
 {
    BDRVQcow2State *s = bs->opaque;
    int result = 0;
@@ -242,8 +237,15 @@ int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
        }
    }

+    return result;
+}
+
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
+{
+    int result = qcow2_cache_write(bs, c);
+
    if (result == 0) {
-        ret = bdrv_flush(bs->file->bs);
+        int ret = bdrv_flush(bs->file->bs);
        if (ret < 0) {
            result = ret;
        }
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -29,6 +29,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "block/qcow2.h"
+#include "qemu/bswap.h"
 #include "trace.h"

 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
@@ -153,11 +154,9 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
    uint64_t **l2_table)
 {
    BDRVQcow2State *s = bs->opaque;
-    int ret;

-    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
-
-    return ret;
+    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+                           (void **)l2_table);
 }

 /*
@@ -389,22 +388,18 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
    return 0;
 }

-static int coroutine_fn copy_sectors(BlockDriverState *bs,
-                                     uint64_t start_sect,
-                                     uint64_t cluster_offset,
-                                     int n_start, int n_end)
+static int coroutine_fn do_perform_cow(BlockDriverState *bs,
+                                       uint64_t src_cluster_offset,
+                                       uint64_t cluster_offset,
+                                       int offset_in_cluster,
+                                       int bytes)
 {
    BDRVQcow2State *s = bs->opaque;
    QEMUIOVector qiov;
    struct iovec iov;
-    int n, ret;
+    int ret;

-    n = n_end - n_start;
-    if (n <= 0) {
-        return 0;
-    }
-
-    iov.iov_len = n * BDRV_SECTOR_SIZE;
+    iov.iov_len = bytes;
    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
    if (iov.iov_base == NULL) {
        return -ENOMEM;
@@ -423,17 +418,21 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
     * interface.  This avoids double I/O throttling and request tracking,
     * which can lead to deadlock when block layer copy-on-read is enabled.
     */
-    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
+    ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
+                                  bytes, &qiov, 0);
    if (ret < 0) {
        goto out;
    }

    if (bs->encrypted) {
        Error *err = NULL;
+        int64_t sector = (cluster_offset + offset_in_cluster)
+                         >> BDRV_SECTOR_BITS;
        assert(s->cipher);
-        if (qcow2_encrypt_sectors(s, start_sect + n_start,
-                                  iov.iov_base, iov.iov_base, n,
-                                  true, &err) < 0) {
+        assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
+        assert((bytes & ~BDRV_SECTOR_MASK) == 0);
+        if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
+                                  bytes >> BDRV_SECTOR_BITS, true, &err) < 0) {
            ret = -EIO;
            error_free(err);
            goto out;
@@ -441,14 +440,14 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
    }

    ret = qcow2_pre_write_overlap_check(bs, 0,
-            cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE);
+            cluster_offset + offset_in_cluster, bytes);
    if (ret < 0) {
        goto out;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
-    ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n,
-                         &qiov);
+    ret = bdrv_co_pwritev(bs->file->bs, cluster_offset + offset_in_cluster,
+                          bytes, &qiov, 0);
    if (ret < 0) {
        goto out;
    }
@@ -463,47 +462,44 @@ out:
 /*
 * get_cluster_offset
 *
- * For a given offset of the disk image, find the cluster offset in
- * qcow2 file. The offset is stored in *cluster_offset.
+ * For a given offset of the virtual disk, find the cluster type and offset in
+ * the qcow2 file. The offset is stored in *cluster_offset.
 *
- * on entry, *num is the number of contiguous sectors we'd like to
- * access following offset.
+ * On entry, *bytes is the maximum number of contiguous bytes starting at
+ * offset that we are interested in.
 *
- * on exit, *num is the number of contiguous sectors we can read.
+ * On exit, *bytes is the number of bytes starting at offset that have the same
+ * cluster type and (if applicable) are stored contiguously in the image file.
+ * Compressed clusters are always returned one by one.
 *
 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 * cases.
 */
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int *num, uint64_t *cluster_offset)
+                             unsigned int *bytes, uint64_t *cluster_offset)
 {
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset, *l2_table;
    int l1_bits, c;
-    unsigned int index_in_cluster, nb_clusters;
-    uint64_t nb_available, nb_needed;
+    unsigned int offset_in_cluster, nb_clusters;
+    uint64_t bytes_available, bytes_needed;
    int ret;

-    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
-    nb_needed = *num + index_in_cluster;
+    offset_in_cluster = offset_into_cluster(s, offset);
+    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

-    /* compute how many bytes there are between the offset and
-     * the end of the l1 entry
-     */
+    /* compute how many bytes there are between the start of the cluster
+     * containing offset and the end of the l1 entry */
+    bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
+                    + offset_in_cluster;

-    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
-
-    /* compute the number of available sectors */
-
-    nb_available = (nb_available >> 9) + index_in_cluster;
-
-    if (nb_needed > nb_available) {
-        nb_needed = nb_available;
+    if (bytes_needed > bytes_available) {
+        bytes_needed = bytes_available;
    }
-    assert(nb_needed <= INT_MAX);
+    assert(bytes_needed <= INT_MAX);

    *cluster_offset = 0;

@@ -541,7 +537,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
    *cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */
-    nb_clusters = size_to_clusters(s, nb_needed << 9);
+    nb_clusters = size_to_clusters(s, bytes_needed);

    ret = qcow2_get_cluster_type(*cluster_offset);
    switch (ret) {
@@ -588,13 +584,14 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,

    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);

-    nb_available = (c * s->cluster_sectors);
+    bytes_available = (c * s->cluster_size);

 out:
-    if (nb_available > nb_needed)
-        nb_available = nb_needed;
+    if (bytes_available > bytes_needed) {
+        bytes_available = bytes_needed;
+    }

-    *num = nb_available - index_in_cluster;
+    *bytes = bytes_available - offset_in_cluster;

    return ret;

@@ -740,14 +737,12 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
    BDRVQcow2State *s = bs->opaque;
    int ret;

-    if (r->nb_sectors == 0) {
+    if (r->nb_bytes == 0) {
        return 0;
    }

    qemu_co_mutex_unlock(&s->lock);
-    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
-                       r->offset / BDRV_SECTOR_SIZE,
-                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
+    ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes);
    qemu_co_mutex_lock(&s->lock);

    if (ret < 0) {
@@ -809,13 +804,14 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
    assert(l2_index + m->nb_clusters <= s->l2_size);
    for (i = 0; i < m->nb_clusters; i++) {
        /* if two concurrent writes happen to the same unallocated cluster
-	 * each write allocates separate cluster and writes data concurrently.
-	 * The first one to complete updates l2 table with pointer to its
-	 * cluster the second one has to do RMW (which is done above by
-	 * copy_sectors()), update l2 table with its cluster pointer and free
-	 * old cluster. This is what this loop does */
-        if(l2_table[l2_index + i] != 0)
+         * each write allocates separate cluster and writes data concurrently.
+         * The first one to complete updates l2 table with pointer to its
+         * cluster the second one has to do RMW (which is done above by
+         * perform_cow()), update l2 table with its cluster pointer and free
+         * old cluster. This is what this loop does */
+        if (l2_table[l2_index + i] != 0) {
            old_cluster[j++] = l2_table[l2_index + i];
+        }

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
@@ -1197,25 +1193,20 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    /*
     * Save info needed for meta data update.
     *
-     * requested_sectors: Number of sectors from the start of the first
+     * requested_bytes: Number of bytes from the start of the first
     * newly allocated cluster to the end of the (possibly shortened
     * before) write request.
     *
-     * avail_sectors: Number of sectors from the start of the first
+     * avail_bytes: Number of bytes from the start of the first
     * newly allocated to the end of the last newly allocated cluster.
     *
-     * nb_sectors: The number of sectors from the start of the first
+     * nb_bytes: The number of bytes from the start of the first
     * newly allocated cluster to the end of the area that the write
     * request actually writes to (excluding COW at the end)
     */
-    int requested_sectors =
-        (*bytes + offset_into_cluster(s, guest_offset))
-        >> BDRV_SECTOR_BITS;
-    int avail_sectors = nb_clusters
-                        << (s->cluster_bits - BDRV_SECTOR_BITS);
-    int alloc_n_start = offset_into_cluster(s, guest_offset)
-                        >> BDRV_SECTOR_BITS;
-    int nb_sectors = MIN(requested_sectors, avail_sectors);
+    uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
+    int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits);
+    int nb_bytes = MIN(requested_bytes, avail_bytes);
    QCowL2Meta *old_m = *m;

    *m = g_malloc0(sizeof(**m));
@@ -1226,23 +1217,21 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
        .alloc_offset   = alloc_cluster_offset,
        .offset         = start_of_cluster(s, guest_offset),
        .nb_clusters    = nb_clusters,
-        .nb_available   = nb_sectors,

        .cow_start = {
            .offset     = 0,
-            .nb_sectors = alloc_n_start,
+            .nb_bytes   = offset_into_cluster(s, guest_offset),
        },
        .cow_end = {
-            .offset     = nb_sectors * BDRV_SECTOR_SIZE,
-            .nb_sectors = avail_sectors - nb_sectors,
+            .offset     = nb_bytes,
+            .nb_bytes   = avail_bytes - nb_bytes,
        },
    };
    qemu_co_queue_init(&(*m)->dependent_requests);
    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);

    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
-    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
-                         - offset_into_cluster(s, guest_offset));
+    *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    return 1;
@@ -1274,7 +1263,8 @@ fail:
 * Return 0 on success and -errno in error cases
 */
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int *num, uint64_t *host_offset, QCowL2Meta **m)
+                               unsigned int *bytes, uint64_t *host_offset,
+                               QCowL2Meta **m)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, remaining;
@@ -1282,13 +1272,11 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
    uint64_t cur_bytes;
    int ret;

-    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num);
-
-    assert((offset & ~BDRV_SECTOR_MASK) == 0);
+    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);

 again:
    start = offset;
-    remaining = (uint64_t)*num << BDRV_SECTOR_BITS;
+    remaining = *bytes;
    cluster_offset = 0;
    *host_offset = 0;
    cur_bytes = 0;
@@ -1374,8 +1362,8 @@ again:
        }
    }

-    *num -= remaining >> BDRV_SECTOR_BITS;
-    assert(*num > 0);
+    *bytes -= remaining;
+    assert(*bytes > 0);
    assert(*host_offset != 0);

    return 0;
@@ -1764,8 +1752,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                goto fail;
            }

-            ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE,
-                                    s->cluster_sectors, 0);
+            ret = bdrv_pwrite_zeroes(bs->file->bs, offset, s->cluster_size, 0);
            if (ret < 0) {
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
@@ -1867,8 +1854,8 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs,
    }

    for (i = 0; i < s->nb_snapshots; i++) {
-        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
-                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
+        int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size *
+                                      sizeof(uint64_t), BDRV_SECTOR_SIZE);

        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);

--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -28,6 +28,7 @@
 #include "block/block_int.h"
 #include "block/qcow2.h"
 #include "qemu/range.h"
+#include "qemu/bswap.h"

 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
@@ -217,13 +218,10 @@ static int load_refcount_block(BlockDriverState *bs,
                               void **refcount_block)
 {
    BDRVQcow2State *s = bs->opaque;
-    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
-    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
-        refcount_block);
-
-    return ret;
+    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+                           refcount_block);
 }

 /*
@@ -489,14 +487,12 @@ static int alloc_refcount_block(BlockDriverState *bs,
        uint64_t table_clusters =
            size_to_clusters(s, table_size * sizeof(uint64_t));
        blocks_clusters = 1 +
-            ((table_clusters + s->refcount_block_size - 1)
-            / s->refcount_block_size);
+            DIV_ROUND_UP(table_clusters, s->refcount_block_size);
        uint64_t meta_clusters = table_clusters + blocks_clusters;

        last_table_size = table_size;
        table_size = next_refcount_table_size(s, blocks_used +
-            ((meta_clusters + s->refcount_block_size - 1)
-            / s->refcount_block_size));
+            DIV_ROUND_UP(meta_clusters, s->refcount_block_size));

    } while (last_table_size != table_size);

--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -26,6 +26,7 @@
 #include "qapi/error.h"
 #include "block/block_int.h"
 #include "block/qcow2.h"
+#include "qemu/bswap.h"
 #include "qemu/error-report.h"
 #include "qemu/cutils.h"

--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -36,6 +36,7 @@
 #include "trace.h"
 #include "qemu/option_int.h"
 #include "qemu/cutils.h"
+#include "qemu/bswap.h"

 /*
  Differences with QCOW:
@@ -967,13 +968,22 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
    if (s->crypt_method_header) {
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
-            error_report("qcow2 built-in AES encryption is deprecated");
-            error_printf("Support for it will be removed in a future release.\n"
-                         "You can use 'qemu-img convert' to switch to an\n"
-                         "unencrypted qcow2 image, or a LUKS raw image.\n");
+            error_setg(errp,
+                       "Use of AES-CBC encrypted qcow2 images is no longer "
+                       "supported in system emulators");
+            error_append_hint(errp,
+                              "You can use 'qemu-img convert' to convert your "
+                              "image to an alternative supported format, such "
+                              "as unencrypted qcow2, or raw with the LUKS "
+                              "format instead.\n");
+            ret = -ENOSYS;
+            goto fail;
        }

        bs->encrypted = 1;
+
+        /* Encryption works on a sector granularity */
+        bs->request_alignment = BDRV_SECTOR_SIZE;
    }

    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
@@ -1192,7 +1202,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVQcow2State *s = bs->opaque;

-    bs->bl.write_zeroes_alignment = s->cluster_sectors;
+    bs->bl.pwrite_zeroes_alignment = s->cluster_size;
 }

 static int qcow2_set_key(BlockDriverState *bs, const char *key)
@@ -1330,16 +1340,20 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_offset;
    int index_in_cluster, ret;
+    unsigned int bytes;
    int64_t status = 0;

-    *pnum = nb_sectors;
+    bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE);
    qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
+    ret = qcow2_get_cluster_offset(bs, sector_num << 9, &bytes,
+                                   &cluster_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        return ret;
    }

+    *pnum = bytes >> BDRV_SECTOR_BITS;
+
    if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
        !s->cipher) {
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
@@ -1357,28 +1371,34 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,

 /* handle reading after the end of the backing file */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                  int64_t sector_num, int nb_sectors)
+                        int64_t offset, int bytes)
 {
+    uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    int n1;
-    if ((sector_num + nb_sectors) <= bs->total_sectors)
-        return nb_sectors;
-    if (sector_num >= bs->total_sectors)
-        n1 = 0;
-    else
-        n1 = bs->total_sectors - sector_num;

-    qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
+    if ((offset + bytes) <= bs_size) {
+        return bytes;
+    }
+
+    if (offset >= bs_size) {
+        n1 = 0;
+    } else {
+        n1 = bs_size - offset;
+    }
+
+    qemu_iovec_memset(qiov, n1, 0, bytes - n1);

    return n1;
 }

-static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
-                          int remaining_sectors, QEMUIOVector *qiov)
+static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
+                                        uint64_t bytes, QEMUIOVector *qiov,
+                                        int flags)
 {
    BDRVQcow2State *s = bs->opaque;
-    int index_in_cluster, n1;
+    int offset_in_cluster, n1;
    int ret;
-    int cur_nr_sectors; /* number of sectors in current iteration */
+    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset = 0;
    uint64_t bytes_done = 0;
    QEMUIOVector hd_qiov;
@@ -1388,26 +1408,24 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,

    qemu_co_mutex_lock(&s->lock);

-    while (remaining_sectors != 0) {
+    while (bytes != 0) {

        /* prepare next request */
-        cur_nr_sectors = remaining_sectors;
+        cur_bytes = MIN(bytes, INT_MAX);
        if (s->cipher) {
-            cur_nr_sectors = MIN(cur_nr_sectors,
-                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+            cur_bytes = MIN(cur_bytes,
+                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
        }

-        ret = qcow2_get_cluster_offset(bs, sector_num << 9,
-            &cur_nr_sectors, &cluster_offset);
+        ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset);
        if (ret < 0) {
            goto fail;
        }

-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        offset_in_cluster = offset_into_cluster(s, offset);

        qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
-            cur_nr_sectors * 512);
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:
@@ -1415,18 +1433,17 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
            if (bs->backing) {
                /* read from the base image */
                n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
-                    sector_num, cur_nr_sectors);
+                                         offset, cur_bytes);
                if (n1 > 0) {
                    QEMUIOVector local_qiov;

                    qemu_iovec_init(&local_qiov, hd_qiov.niov);
-                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0,
-                                      n1 * BDRV_SECTOR_SIZE);
+                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);

                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                    qemu_co_mutex_unlock(&s->lock);
-                    ret = bdrv_co_readv(bs->backing->bs, sector_num,
-                                        n1, &local_qiov);
+                    ret = bdrv_co_preadv(bs->backing->bs, offset, n1,
+                                         &local_qiov, 0);
                    qemu_co_mutex_lock(&s->lock);

                    qemu_iovec_destroy(&local_qiov);
@@ -1437,12 +1454,12 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                }
            } else {
                /* Note: in this case, no need to wait */
-                qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+                qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
            }
            break;

        case QCOW2_CLUSTER_ZERO:
-            qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+            qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
            break;

        case QCOW2_CLUSTER_COMPRESSED:
@@ -1453,8 +1470,8 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
            }

            qemu_iovec_from_buf(&hd_qiov, 0,
-                s->cluster_cache + index_in_cluster * 512,
-                512 * cur_nr_sectors);
+                                s->cluster_cache + offset_in_cluster,
+                                cur_bytes);
            break;

        case QCOW2_CLUSTER_NORMAL:
@@ -1481,34 +1498,34 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                    }
                }

-                assert(cur_nr_sectors <=
-                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
                qemu_iovec_reset(&hd_qiov);
-                qemu_iovec_add(&hd_qiov, cluster_data,
-                    512 * cur_nr_sectors);
+                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
            }

            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
            qemu_co_mutex_unlock(&s->lock);
-            ret = bdrv_co_readv(bs->file->bs,
-                                (cluster_offset >> 9) + index_in_cluster,
-                                cur_nr_sectors, &hd_qiov);
+            ret = bdrv_co_preadv(bs->file->bs,
+                                 cluster_offset + offset_in_cluster,
+                                 cur_bytes, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
            if (bs->encrypted) {
                assert(s->cipher);
+                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
                Error *err = NULL;
-                if (qcow2_encrypt_sectors(s, sector_num,  cluster_data,
-                                          cluster_data, cur_nr_sectors, false,
-                                          &err) < 0) {
+                if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS,
+                                          cluster_data, cluster_data,
+                                          cur_bytes >> BDRV_SECTOR_BITS,
+                                          false, &err) < 0) {
                    error_free(err);
                    ret = -EIO;
                    goto fail;
                }
-                qemu_iovec_from_buf(qiov, bytes_done,
-                    cluster_data, 512 * cur_nr_sectors);
+                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
            }
            break;

@@ -1518,9 +1535,9 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
            goto fail;
        }

-        remaining_sectors -= cur_nr_sectors;
-        sector_num += cur_nr_sectors;
-        bytes_done += cur_nr_sectors * 512;
+        bytes -= cur_bytes;
+        offset += cur_bytes;
+        bytes_done += cur_bytes;
    }
    ret = 0;

@@ -1533,23 +1550,21 @@ fail:
    return ret;
 }

-static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
-                           int64_t sector_num,
-                           int remaining_sectors,
-                           QEMUIOVector *qiov)
+static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
+                                         uint64_t bytes, QEMUIOVector *qiov,
+                                         int flags)
 {
    BDRVQcow2State *s = bs->opaque;
-    int index_in_cluster;
+    int offset_in_cluster;
    int ret;
-    int cur_nr_sectors; /* number of sectors in current iteration */
+    unsigned int cur_bytes; /* number of sectors in current iteration */
    uint64_t cluster_offset;
    QEMUIOVector hd_qiov;
    uint64_t bytes_done = 0;
    uint8_t *cluster_data = NULL;
    QCowL2Meta *l2meta = NULL;

-    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
-                                 remaining_sectors);
+    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);

    qemu_iovec_init(&hd_qiov, qiov->niov);

@@ -1557,22 +1572,21 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,

    qemu_co_mutex_lock(&s->lock);

-    while (remaining_sectors != 0) {
+    while (bytes != 0) {

        l2meta = NULL;

        trace_qcow2_writev_start_part(qemu_coroutine_self());
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        cur_nr_sectors = remaining_sectors;
-        if (bs->encrypted &&
-            cur_nr_sectors >
-            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) {
-            cur_nr_sectors =
-                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster;
+        offset_in_cluster = offset_into_cluster(s, offset);
+        cur_bytes = MIN(bytes, INT_MAX);
+        if (bs->encrypted) {
+            cur_bytes = MIN(cur_bytes,
+                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
+                            - offset_in_cluster);
        }

-        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
-            &cur_nr_sectors, &cluster_offset, &l2meta);
+        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
+                                         &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }
@@ -1580,8 +1594,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
        assert((cluster_offset & 511) == 0);

        qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
-            cur_nr_sectors * 512);
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);

        if (bs->encrypted) {
            Error *err = NULL;
@@ -1600,8 +1613,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);

-            if (qcow2_encrypt_sectors(s, sector_num, cluster_data,
-                                      cluster_data, cur_nr_sectors,
+            if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS,
+                                      cluster_data, cluster_data,
+                                      cur_bytes >>BDRV_SECTOR_BITS,
                                      true, &err) < 0) {
                error_free(err);
                ret = -EIO;
@@ -1609,13 +1623,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
            }

            qemu_iovec_reset(&hd_qiov);
-            qemu_iovec_add(&hd_qiov, cluster_data,
-                cur_nr_sectors * 512);
+            qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
        }

        ret = qcow2_pre_write_overlap_check(bs, 0,
-                cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE,
-                cur_nr_sectors * BDRV_SECTOR_SIZE);
+                cluster_offset + offset_in_cluster, cur_bytes);
        if (ret < 0) {
            goto fail;
        }
@@ -1623,10 +1635,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
        qemu_co_mutex_unlock(&s->lock);
        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        trace_qcow2_writev_data(qemu_coroutine_self(),
-                                (cluster_offset >> 9) + index_in_cluster);
-        ret = bdrv_co_writev(bs->file->bs,
-                             (cluster_offset >> 9) + index_in_cluster,
-                             cur_nr_sectors, &hd_qiov);
+                                cluster_offset + offset_in_cluster);
+        ret = bdrv_co_pwritev(bs->file->bs,
+                              cluster_offset + offset_in_cluster,
+                              cur_bytes, &hd_qiov, 0);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto fail;
@@ -1652,10 +1664,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
            l2meta = next;
        }

-        remaining_sectors -= cur_nr_sectors;
-        sector_num += cur_nr_sectors;
-        bytes_done += cur_nr_sectors * 512;
-        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
+        bytes -= cur_bytes;
+        offset += cur_bytes;
+        bytes_done += cur_bytes;
+        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
    }
    ret = 0;

@@ -1757,13 +1769,6 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)

    qcow2_close(bs);

-    bdrv_invalidate_cache(bs->file->bs, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        bs->drv = NULL;
-        return;
-    }
-
    memset(s, 0, sizeof(BDRVQcow2State));
    options = qdict_clone_shallow(bs->options);

@@ -2004,19 +2009,19 @@ static int qcow2_change_backing_file(BlockDriverState *bs,

 static int preallocate(BlockDriverState *bs)
 {
-    uint64_t nb_sectors;
+    uint64_t bytes;
    uint64_t offset;
    uint64_t host_offset = 0;
-    int num;
+    unsigned int cur_bytes;
    int ret;
    QCowL2Meta *meta;

-    nb_sectors = bdrv_nb_sectors(bs);
+    bytes = bdrv_getlength(bs);
    offset = 0;

-    while (nb_sectors) {
-        num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS);
-        ret = qcow2_alloc_cluster_offset(bs, offset, &num,
+    while (bytes) {
+        cur_bytes = MIN(bytes, INT_MAX);
+        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
                                         &host_offset, &meta);
        if (ret < 0) {
            return ret;
@@ -2042,8 +2047,8 @@ static int preallocate(BlockDriverState *bs)

        /* TODO Preallocate data if requested */

-        nb_sectors -= num;
-        offset += num << BDRV_SECTOR_BITS;
+        bytes -= cur_bytes;
+        offset += cur_bytes;
    }

    /*
@@ -2052,11 +2057,9 @@ static int preallocate(BlockDriverState *bs)
     * EOF). Extend the image to the last allocated sector.
     */
    if (host_offset != 0) {
-        uint8_t buf[BDRV_SECTOR_SIZE];
-        memset(buf, 0, BDRV_SECTOR_SIZE);
-        ret = bdrv_write(bs->file->bs,
-                         (host_offset >> BDRV_SECTOR_BITS) + num - 1,
-                         buf, 1);
+        uint8_t data = 0;
+        ret = bdrv_pwrite(bs->file->bs, (host_offset + cur_bytes) - 1,
+                          &data, 1);
        if (ret < 0) {
            return ret;
        }
@@ -2207,7 +2210,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
    }

-    ret = blk_pwrite(blk, 0, header, cluster_size);
+    ret = blk_pwrite(blk, 0, header, cluster_size, 0);
    g_free(header);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
@@ -2217,7 +2220,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
    /* Write a refcount table with one refcount block */
    refcount_table = g_malloc0(2 * cluster_size);
    refcount_table[0] = cpu_to_be64(2 * cluster_size);
-    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size);
+    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0);
    g_free(refcount_table);

    if (ret < 0) {
@@ -2400,9 +2403,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
    ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
                        cluster_size, prealloc, opts, version, refcount_order,
                        &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-    }
+    error_propagate(errp, local_err);

 finish:
    g_free(backing_file);
@@ -2411,21 +2412,67 @@ finish:
    return ret;
 }

-static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+
+static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
+                            uint32_t count)
+{
+    int nr;
+    BlockDriverState *file;
+    int64_t res;
+
+    if (!count) {
+        return true;
+    }
+    res = bdrv_get_block_status_above(bs, NULL, start, count,
+                                      &nr, &file);
+    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
+}
+
+static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags)
 {
    int ret;
    BDRVQcow2State *s = bs->opaque;

-    /* Emulate misaligned zero writes */
-    if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
-        return -ENOTSUP;
+    uint32_t head = offset % s->cluster_size;
+    uint32_t tail = (offset + count) % s->cluster_size;
+
+    trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count);
+
+    if (head || tail) {
+        int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS;
+        uint64_t off;
+        unsigned int nr;
+
+        assert(head + count <= s->cluster_size);
+
+        /* check whether remainder of cluster already reads as zero */
+        if (!(is_zero_sectors(bs, cl_start,
+                              DIV_ROUND_UP(head, BDRV_SECTOR_SIZE)) &&
+              is_zero_sectors(bs, (offset + count) >> BDRV_SECTOR_BITS,
+                              DIV_ROUND_UP(-tail & (s->cluster_size - 1),
+                                           BDRV_SECTOR_SIZE)))) {
+            return -ENOTSUP;
+        }
+
+        qemu_co_mutex_lock(&s->lock);
+        /* We can have new write after previous check */
+        offset = cl_start << BDRV_SECTOR_BITS;
+        count = s->cluster_size;
+        nr = s->cluster_size;
+        ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
+        if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) {
+            qemu_co_mutex_unlock(&s->lock);
+            return -ENOTSUP;
+        }
+    } else {
+        qemu_co_mutex_lock(&s->lock);
    }

+    trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count);
+
    /* Whatever is left can use real zero clusters */
-    qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
-        nb_sectors);
+    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
@@ -2616,8 +2663,8 @@ static int make_completely_empty(BlockDriverState *bs)
    /* After this call, neither the in-memory nor the on-disk refcount
     * information accurately describe the actual references */

-    ret = bdrv_write_zeroes(bs->file->bs, s->l1_table_offset / BDRV_SECTOR_SIZE,
-                            l1_clusters * s->cluster_sectors, 0);
+    ret = bdrv_pwrite_zeroes(bs->file->bs, s->l1_table_offset,
+                             l1_clusters * s->cluster_size, 0);
    if (ret < 0) {
        goto fail_broken_refcounts;
    }
@@ -2630,9 +2677,8 @@ static int make_completely_empty(BlockDriverState *bs)
     * overwrite parts of the existing refcount and L1 table, which is not
     * an issue because the dirty flag is set, complete data loss is in fact
     * desired and partial data loss is consequently fine as well */
-    ret = bdrv_write_zeroes(bs->file->bs, s->cluster_size / BDRV_SECTOR_SIZE,
-                            (2 + l1_clusters) * s->cluster_size /
-                            BDRV_SECTOR_SIZE, 0);
+    ret = bdrv_pwrite_zeroes(bs->file->bs, s->cluster_size,
+                             (2 + l1_clusters) * s->cluster_size, 0);
    /* This call (even if it failed overall) may have overwritten on-disk
     * refcount structures; in that case, the in-memory refcount information
     * will probably differ from the on-disk information which makes the BDS
@@ -2774,14 +2820,14 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
    int ret;

    qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    ret = qcow2_cache_write(bs, s->l2_table_cache);
    if (ret < 0) {
        qemu_co_mutex_unlock(&s->lock);
        return ret;
    }

    if (qcow2_need_accurate_refcounts(s)) {
-        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+        ret = qcow2_cache_write(bs, s->refcount_block_cache);
        if (ret < 0) {
            qemu_co_mutex_unlock(&s->lock);
            return ret;
@@ -2861,36 +2907,20 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos)
 {
    BDRVQcow2State *s = bs->opaque;
-    int64_t total_sectors = bs->total_sectors;
-    bool zero_beyond_eof = bs->zero_beyond_eof;
-    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
-    bs->zero_beyond_eof = false;
-    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
-    bs->zero_beyond_eof = zero_beyond_eof;
-
-    /* bdrv_co_do_writev will have increased the total_sectors value to include
-     * the VM state - the VM state is however not an actual part of the block
-     * device, therefore, we need to restore the old value. */
-    bs->total_sectors = total_sectors;
-
-    return ret;
+    return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos,
+                                    qiov->size, qiov, 0);
 }

-static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
-                              int64_t pos, int size)
+static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+                              int64_t pos)
 {
    BDRVQcow2State *s = bs->opaque;
-    bool zero_beyond_eof = bs->zero_beyond_eof;
-    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
-    bs->zero_beyond_eof = false;
-    ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
-    bs->zero_beyond_eof = zero_beyond_eof;
-
-    return ret;
+    return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos,
+                                   qiov->size, qiov, 0);
 }

 /*
@@ -3329,11 +3359,11 @@ BlockDriver bdrv_qcow2 = {
    .bdrv_co_get_block_status = qcow2_co_get_block_status,
    .bdrv_set_key       = qcow2_set_key,

-    .bdrv_co_readv          = qcow2_co_readv,
-    .bdrv_co_writev         = qcow2_co_writev,
+    .bdrv_co_preadv         = qcow2_co_preadv,
+    .bdrv_co_pwritev        = qcow2_co_pwritev,
    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,

-    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
    .bdrv_co_discard        = qcow2_co_discard,
    .bdrv_truncate          = qcow2_truncate,
    .bdrv_write_compressed  = qcow2_write_compressed,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -302,8 +302,8 @@ typedef struct Qcow2COWRegion {
     */
    uint64_t    offset;

-    /** Number of sectors to copy */
-    int         nb_sectors;
+    /** Number of bytes to copy */
+    int         nb_bytes;
 } Qcow2COWRegion;

 /**
@@ -318,12 +318,6 @@ typedef struct QCowL2Meta
    /** Host offset of the first newly allocated cluster */
    uint64_t alloc_offset;

-    /**
-     * Number of sectors from the start of the first allocated cluster to
-     * the end of the (possibly shortened) request
-     */
-    int nb_available;
-
    /** Number of newly allocated clusters */
    int nb_clusters;

@@ -471,8 +465,7 @@ static inline uint64_t l2meta_cow_start(QCowL2Meta *m)

 static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
 {
-    return m->offset + m->cow_end.offset
-        + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
+    return m->offset + m->cow_end.offset + m->cow_end.nb_bytes;
 }

 static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)
@@ -544,9 +537,10 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
                          int nb_sectors, bool enc, Error **errp);

 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int *num, uint64_t *cluster_offset);
+                             unsigned int *bytes, uint64_t *cluster_offset);
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int *num, uint64_t *host_offset, QCowL2Meta **m);
+                               unsigned int *bytes, uint64_t *host_offset,
+                               QCowL2Meta **m);
 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                         uint64_t offset,
                                         int compressed_size);
@@ -583,6 +577,7 @@ int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
 void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
     void *table);
 int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
+int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c);
 int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
    Qcow2Cache *dependency);
 void qcow2_cache_depends_on_flush(Qcow2Cache *c);
--- a/block/qed-check.c
+++ b/block/qed-check.c
@@ -234,8 +234,7 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
    }

    check.result->bfi.total_clusters =
-        (s->header.image_size + s->header.cluster_size - 1) /
-            s->header.cluster_size;
+        DIV_ROUND_UP(s->header.image_size, s->header.cluster_size);
    ret = qed_check_l1_table(&check, s->l1_table);
    if (ret == 0) {
        /* Only check for leaks if entire image was scanned successfully */
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -16,6 +16,7 @@
 #include "trace.h"
 #include "qemu/sockets.h" /* for EINPROGRESS on Windows */
 #include "qed.h"
+#include "qemu/bswap.h"

 typedef struct {
    GenericCB gencb;
--- a/block/qed.c
+++ b/block/qed.c
@@ -15,6 +15,7 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/timer.h"
+#include "qemu/bswap.h"
 #include "trace.h"
 #include "qed.h"
 #include "qapi/qmp/qerror.h"
@@ -142,8 +143,7 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
     * them, and write back.
     */

-    int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
-                   BDRV_SECTOR_SIZE;
+    int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
    size_t len = nsectors * BDRV_SECTOR_SIZE;
    QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
                                                    cb, opaque);
@@ -517,7 +517,7 @@ static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVQEDState *s = bs->opaque;

-    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
+    bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
 }

 /* We have nothing to do for QED reopen, stubs just return
@@ -601,18 +601,18 @@ static int qed_create(const char *filename, uint32_t cluster_size,
    }

    qed_header_cpu_to_le(&header, &le_header);
-    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header));
+    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0);
    if (ret < 0) {
        goto out;
    }
    ret = blk_pwrite(blk, sizeof(le_header), backing_file,
-                     header.backing_filename_size);
+                     header.backing_filename_size, 0);
    if (ret < 0) {
        goto out;
    }

    l1_table = g_malloc0(l1_size);
-    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size);
+    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0);
    if (ret < 0) {
        goto out;
    }
@@ -1418,7 +1418,7 @@ typedef struct {
    bool done;
 } QEDWriteZeroesCB;

-static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
+static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
 {
    QEDWriteZeroesCB *cb = opaque;

@@ -1429,10 +1429,10 @@ static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
    }
 }

-static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
-                                                 int64_t sector_num,
-                                                 int nb_sectors,
-                                                 BdrvRequestFlags flags)
+static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
+                                                  int64_t offset,
+                                                  int count,
+                                                  BdrvRequestFlags flags)
 {
    BlockAIOCB *blockacb;
    BDRVQEDState *s = bs->opaque;
@@ -1440,25 +1440,22 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
    QEMUIOVector qiov;
    struct iovec iov;

-    /* Refuse if there are untouched backing file sectors */
-    if (bs->backing) {
-        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
-            return -ENOTSUP;
-        }
-        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
-            return -ENOTSUP;
-        }
+    /* Fall back if the request is not aligned */
+    if (qed_offset_into_cluster(s, offset) ||
+        qed_offset_into_cluster(s, count)) {
+        return -ENOTSUP;
    }

    /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
     * then it will be allocated during request processing.
     */
-    iov.iov_base = NULL,
-    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
+    iov.iov_base = NULL;
+    iov.iov_len = count;

    qemu_iovec_init_external(&qiov, &iov, 1);
-    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
-                             qed_co_write_zeroes_cb, &cb,
+    blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
+                             count >> BDRV_SECTOR_BITS,
+                             qed_co_pwrite_zeroes_cb, &cb,
                             QED_AIOCB_WRITE | QED_AIOCB_ZERO);
    if (!blockacb) {
        return -EIO;
@@ -1594,12 +1591,6 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)

    bdrv_qed_close(bs);

-    bdrv_invalidate_cache(bs->file->bs, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        return;
-    }
-
    memset(s, 0, sizeof(BDRVQEDState));
    ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err);
    if (local_err) {
@@ -1669,7 +1660,7 @@ static BlockDriver bdrv_qed = {
    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
    .bdrv_aio_readv           = bdrv_qed_aio_readv,
    .bdrv_aio_writev          = bdrv_qed_aio_writev,
-    .bdrv_co_write_zeroes     = bdrv_qed_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
    .bdrv_truncate            = bdrv_qed_truncate,
    .bdrv_getlength           = bdrv_qed_getlength,
    .bdrv_get_info            = bdrv_qed_get_info,
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -14,6 +14,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "block/block_int.h"
 #include "qapi/qmp/qbool.h"
 #include "qapi/qmp/qdict.h"
@@ -67,6 +68,9 @@ typedef struct QuorumVotes {
 typedef struct BDRVQuorumState {
    BdrvChild **children;  /* children BlockDriverStates */
    int num_children;      /* children count */
+    unsigned next_child_index;  /* the index of the next child that should
+                                 * be added
+                                 */
    int threshold;         /* if less than threshold children reads gave the
                            * same result a quorum error occurs.
                            */
@@ -747,21 +751,6 @@ static int64_t quorum_getlength(BlockDriverState *bs)
    return result;
 }

-static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp)
-{
-    BDRVQuorumState *s = bs->opaque;
-    Error *local_err = NULL;
-    int i;
-
-    for (i = 0; i < s->num_children; i++) {
-        bdrv_invalidate_cache(s->children[i]->bs, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            return;
-        }
-    }
-}
-
 static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
 {
    BDRVQuorumState *s = bs->opaque;
@@ -898,9 +887,9 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EINVAL;
        goto exit;
    }
-    if (s->num_children < 2) {
+    if (s->num_children < 1) {
        error_setg(&local_err,
-                   "Number of provided children must be greater than 1");
+                   "Number of provided children must be 1 or more");
        ret = -EINVAL;
        goto exit;
    }
@@ -964,6 +953,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,

        opened[i] = true;
    }
+    s->next_child_index = s->num_children;

    g_free(opened);
    goto exit;
@@ -981,9 +971,7 @@ close_exit:
 exit:
    qemu_opts_del(opts);
    /* propagate error */
-    if (local_err) {
-        error_propagate(errp, local_err);
-    }
+    error_propagate(errp, local_err);
    return ret;
 }

@@ -999,25 +987,70 @@ static void quorum_close(BlockDriverState *bs)
    g_free(s->children);
 }

-static void quorum_detach_aio_context(BlockDriverState *bs)
+static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
+                             Error **errp)
 {
    BDRVQuorumState *s = bs->opaque;
-    int i;
+    BdrvChild *child;
+    char indexstr[32];
+    int ret;

-    for (i = 0; i < s->num_children; i++) {
-        bdrv_detach_aio_context(s->children[i]->bs);
+    assert(s->num_children <= INT_MAX / sizeof(BdrvChild *));
+    if (s->num_children == INT_MAX / sizeof(BdrvChild *) ||
+        s->next_child_index == UINT_MAX) {
+        error_setg(errp, "Too many children");
+        return;
    }
+
+    ret = snprintf(indexstr, 32, "children.%u", s->next_child_index);
+    if (ret < 0 || ret >= 32) {
+        error_setg(errp, "cannot generate child name");
+        return;
+    }
+    s->next_child_index++;
+
+    bdrv_drained_begin(bs);
+
+    /* We can safely add the child now */
+    bdrv_ref(child_bs);
+    child = bdrv_attach_child(bs, child_bs, indexstr, &child_format);
+    s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
+    s->children[s->num_children++] = child;
+
+    bdrv_drained_end(bs);
 }

-static void quorum_attach_aio_context(BlockDriverState *bs,
-                                      AioContext *new_context)
+static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
+                             Error **errp)
 {
    BDRVQuorumState *s = bs->opaque;
    int i;

    for (i = 0; i < s->num_children; i++) {
-        bdrv_attach_aio_context(s->children[i]->bs, new_context);
+        if (s->children[i] == child) {
+            break;
+        }
    }
+
+    /* we have checked it in bdrv_del_child() */
+    assert(i < s->num_children);
+
+    if (s->num_children <= s->threshold) {
+        error_setg(errp,
+            "The number of children cannot be lower than the vote threshold %d",
+            s->threshold);
+        return;
+    }
+
+    bdrv_drained_begin(bs);
+
+    /* We can safely remove this child now */
+    memmove(&s->children[i], &s->children[i + 1],
+            (s->num_children - i - 1) * sizeof(BdrvChild *));
+    s->children = g_renew(BdrvChild *, s->children, --s->num_children);
+    bdrv_unref_child(bs, child);
+
+    bdrv_drained_end(bs);
 }

 static void quorum_refresh_filename(BlockDriverState *bs, QDict *options)
@@ -1070,10 +1103,9 @@ static BlockDriver bdrv_quorum = {

    .bdrv_aio_readv                     = quorum_aio_readv,
    .bdrv_aio_writev                    = quorum_aio_writev,
-    .bdrv_invalidate_cache              = quorum_invalidate_cache,

-    .bdrv_detach_aio_context            = quorum_detach_aio_context,
-    .bdrv_attach_aio_context            = quorum_attach_aio_context,
+    .bdrv_add_child                     = quorum_add_child,
+    .bdrv_del_child                     = quorum_del_child,

    .is_filter                          = true,
    .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -15,6 +15,7 @@
 #ifndef QEMU_RAW_AIO_H
 #define QEMU_RAW_AIO_H

+#include "qemu/coroutine.h"
 #include "qemu/iov.h"

 /* AIO request types */
@@ -35,15 +36,18 @@

 /* linux-aio.c - Linux native implementation */
 #ifdef CONFIG_LINUX_AIO
-void *laio_init(void);
-void laio_cleanup(void *s);
-BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+typedef struct LinuxAioState LinuxAioState;
+LinuxAioState *laio_init(void);
+void laio_cleanup(LinuxAioState *s);
+int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
+                                uint64_t offset, QEMUIOVector *qiov, int type);
+BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type);
-void laio_detach_aio_context(void *s, AioContext *old_context);
-void laio_attach_aio_context(void *s, AioContext *new_context);
-void laio_io_plug(BlockDriverState *bs, void *aio_ctx);
-void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug);
+void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
+void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
+void laio_io_plug(BlockDriverState *bs, LinuxAioState *s);
+void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s);
 #endif

 #ifdef _WIN32
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -139,7 +139,7 @@ typedef struct BDRVRawState {

 #ifdef CONFIG_LINUX_AIO
    int use_aio;
-    void *aio_ctx;
+    LinuxAioState *aio_ctx;
 #endif
 #ifdef CONFIG_XFS
    bool is_xfs:1;
@@ -398,7 +398,7 @@ static void raw_attach_aio_context(BlockDriverState *bs,
 }

 #ifdef CONFIG_LINUX_AIO
-static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
+static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags)
 {
    int ret = -1;
    assert(aio_ctx != NULL);
@@ -517,6 +517,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,

    s->has_discard = true;
    s->has_write_zeroes = true;
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
        s->needs_alignment = true;
    }
@@ -581,15 +582,9 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    BDRVRawState *s = bs->opaque;
-    Error *local_err = NULL;
-    int ret;

    s->type = FTYPE_FILE;
-    ret = raw_open_common(bs, options, flags, 0, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-    }
-    return ret;
+    return raw_open_common(bs, options, flags, 0, errp);
 }

 static int raw_reopen_prepare(BDRVReopenState *state,
@@ -728,9 +723,33 @@ static void raw_reopen_abort(BDRVReopenState *state)
    state->opaque = NULL;
 }

+static int hdev_get_max_transfer_length(int fd)
+{
+#ifdef BLKSECTGET
+    int max_sectors = 0;
+    if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
+        return max_sectors;
+    } else {
+        return -errno;
+    }
+#else
+    return -ENOSYS;
+#endif
+}
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    BDRVRawState *s = bs->opaque;
+    struct stat st;
+
+    if (!fstat(s->fd, &st)) {
+        if (S_ISBLK(st.st_mode)) {
+            int ret = hdev_get_max_transfer_length(s->fd);
+            if (ret >= 0) {
+                bs->bl.max_transfer_length = ret;
+            }
+        }
+    }

    raw_probe_alignment(bs, s->fd, errp);
    bs->bl.min_mem_alignment = s->buf_align;
@@ -1251,8 +1270,8 @@ static int aio_worker(void *arg)
 }

 static int paio_submit_co(BlockDriverState *bs, int fd,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        int type)
+                          int64_t offset, QEMUIOVector *qiov,
+                          int count, int type)
 {
    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
    ThreadPool *pool;
@@ -1261,16 +1280,16 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
    acb->aio_type = type;
    acb->aio_fildes = fd;

-    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
-    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
+    acb->aio_nbytes = count;
+    acb->aio_offset = offset;

    if (qiov) {
        acb->aio_iov = qiov->iov;
        acb->aio_niov = qiov->niov;
-        assert(qiov->size == acb->aio_nbytes);
+        assert(qiov->size == count);
    }

-    trace_paio_submit_co(sector_num, nb_sectors, type);
+    trace_paio_submit_co(offset, count, type);
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_co(pool, aio_worker, acb);
 }
@@ -1300,14 +1319,13 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }

-static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque, int type)
+static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
+                                   uint64_t bytes, QEMUIOVector *qiov, int type)
 {
    BDRVRawState *s = bs->opaque;

    if (fd_open(bs) < 0)
-        return NULL;
+        return -EIO;

    /*
     * Check if the underlying device requires requests to be aligned,
@@ -1320,14 +1338,28 @@ static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
            type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
        } else if (s->use_aio) {
-            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
-                               nb_sectors, cb, opaque, type);
+            assert(qiov->size == bytes);
+            return laio_co_submit(bs, s->aio_ctx, s->fd, offset, qiov, type);
 #endif
        }
    }

-    return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors,
-                       cb, opaque, type);
+    return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
+}
+
+static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
+                                      uint64_t bytes, QEMUIOVector *qiov,
+                                      int flags)
+{
+    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
+}
+
+static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
+                                       uint64_t bytes, QEMUIOVector *qiov,
+                                       int flags)
+{
+    assert(flags == 0);
+    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
 }

 static void raw_aio_plug(BlockDriverState *bs)
@@ -1345,37 +1377,11 @@ static void raw_aio_unplug(BlockDriverState *bs)
 #ifdef CONFIG_LINUX_AIO
    BDRVRawState *s = bs->opaque;
    if (s->use_aio) {
-        laio_io_unplug(bs, s->aio_ctx, true);
+        laio_io_unplug(bs, s->aio_ctx);
    }
 #endif
 }

-static void raw_aio_flush_io_queue(BlockDriverState *bs)
-{
-#ifdef CONFIG_LINUX_AIO
-    BDRVRawState *s = bs->opaque;
-    if (s->use_aio) {
-        laio_io_unplug(bs, s->aio_ctx, false);
-    }
-#endif
-}
-
-static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque)
-{
-    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
-                          cb, opaque, QEMU_AIO_READ);
-}
-
-static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque)
-{
-    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
-                          cb, opaque, QEMU_AIO_WRITE);
-}
-
 static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
 {
@@ -1877,17 +1883,17 @@ static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs,
                       cb, opaque, QEMU_AIO_DISCARD);
 }

-static int coroutine_fn raw_co_write_zeroes(
-    BlockDriverState *bs, int64_t sector_num,
-    int nb_sectors, BdrvRequestFlags flags)
+static int coroutine_fn raw_co_pwrite_zeroes(
+    BlockDriverState *bs, int64_t offset,
+    int count, BdrvRequestFlags flags)
 {
    BDRVRawState *s = bs->opaque;

    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
-        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+        return paio_submit_co(bs, s->fd, offset, NULL, count,
                              QEMU_AIO_WRITE_ZEROES);
    } else if (s->discard_zeroes) {
-        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+        return paio_submit_co(bs, s->fd, offset, NULL, count,
                              QEMU_AIO_DISCARD);
    }
    return -ENOTSUP;
@@ -1940,16 +1946,15 @@ BlockDriver bdrv_file = {
    .bdrv_create = raw_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = raw_co_get_block_status,
-    .bdrv_co_write_zeroes = raw_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,

-    .bdrv_aio_readv = raw_aio_readv,
-    .bdrv_aio_writev = raw_aio_writev,
+    .bdrv_co_preadv         = raw_co_preadv,
+    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_aio_flush = raw_aio_flush,
    .bdrv_aio_discard = raw_aio_discard,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
@@ -2225,9 +2230,7 @@ hdev_open_Mac_error:

    ret = raw_open_common(bs, options, flags, 0, &local_err);
    if (ret < 0) {
-        if (local_err) {
-            error_propagate(errp, local_err);
-        }
+        error_propagate(errp, local_err);
 #if defined(__APPLE__) && defined(__MACH__)
        if (*bsd_path) {
            filename = bsd_path;
@@ -2303,8 +2306,8 @@ static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
                       cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
 }

-static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags)
 {
    BDRVRawState *s = bs->opaque;
    int rc;
@@ -2314,10 +2317,10 @@ static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
        return rc;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
-        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+        return paio_submit_co(bs, s->fd, offset, NULL, count,
                              QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
    } else if (s->discard_zeroes) {
-        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+        return paio_submit_co(bs, s->fd, offset, NULL, count,
                              QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
    }
    return -ENOTSUP;
@@ -2389,16 +2392,15 @@ static BlockDriver bdrv_host_device = {
    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_create         = hdev_create,
    .create_opts         = &raw_create_opts,
-    .bdrv_co_write_zeroes = hdev_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,

-    .bdrv_aio_readv	= raw_aio_readv,
-    .bdrv_aio_writev	= raw_aio_writev,
+    .bdrv_co_preadv         = raw_co_preadv,
+    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_aio_flush	= raw_aio_flush,
    .bdrv_aio_discard   = hdev_aio_discard,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength	= raw_getlength,
@@ -2433,17 +2435,11 @@ static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
 {
    BDRVRawState *s = bs->opaque;
-    Error *local_err = NULL;
-    int ret;

    s->type = FTYPE_CD;

    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
-    ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-    }
-    return ret;
+    return raw_open_common(bs, options, flags, O_NONBLOCK, errp);
 }

 static int cdrom_probe_device(const char *filename)
@@ -2522,13 +2518,13 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_create         = hdev_create,
    .create_opts         = &raw_create_opts,

-    .bdrv_aio_readv     = raw_aio_readv,
-    .bdrv_aio_writev    = raw_aio_writev,
+
+    .bdrv_co_preadv         = raw_co_preadv,
+    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_aio_flush	= raw_aio_flush,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength      = raw_getlength,
@@ -2561,9 +2557,7 @@ static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,

    ret = raw_open_common(bs, options, flags, 0, &local_err);
    if (ret) {
-        if (local_err) {
-            error_propagate(errp, local_err);
-        }
+        error_propagate(errp, local_err);
        return ret;
    }

@@ -2658,13 +2652,12 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_create        = hdev_create,
    .create_opts        = &raw_create_opts,

-    .bdrv_aio_readv     = raw_aio_readv,
-    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_co_preadv         = raw_co_preadv,
+    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_aio_flush	= raw_aio_flush,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
-    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    .bdrv_truncate      = raw_truncate,
    .bdrv_getlength      = raw_getlength,
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -105,8 +105,8 @@ raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    }

    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-    ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE,
-                             nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);
+    ret = bdrv_co_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE,
+                          nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);

 fail:
    if (qiov == &local_qiov) {
@@ -116,13 +116,6 @@ fail:
    return ret;
 }

-static int coroutine_fn
-raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-              QEMUIOVector *qiov)
-{
-    return raw_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0);
-}
-
 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum,
@@ -134,11 +127,11 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
           (sector_num << BDRV_SECTOR_BITS);
 }

-static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs,
-                                            int64_t sector_num, int nb_sectors,
-                                            BdrvRequestFlags flags)
+static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
+                                             int64_t offset, int count,
+                                             BdrvRequestFlags flags)
 {
-    return bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags);
+    return bdrv_co_pwrite_zeroes(bs->file->bs, offset, count, flags);
 }

 static int coroutine_fn raw_co_discard(BlockDriverState *bs,
@@ -197,20 +190,15 @@ static int raw_has_zero_init(BlockDriverState *bs)

 static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
 {
-    Error *local_err = NULL;
-    int ret;
-
-    ret = bdrv_create_file(filename, opts, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-    }
-    return ret;
+    return bdrv_create_file(filename, opts, errp);
 }

 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
    bs->sg = bs->file->bs->sg;
+    bs->supported_write_flags = BDRV_REQ_FUA;
+    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP;

    if (bs->probed && !bdrv_is_read_only(bs)) {
        fprintf(stderr,
@@ -256,10 +244,8 @@ BlockDriver bdrv_raw = {
    .bdrv_close           = &raw_close,
    .bdrv_create          = &raw_create,
    .bdrv_co_readv        = &raw_co_readv,
-    .bdrv_co_writev       = &raw_co_writev,
    .bdrv_co_writev_flags = &raw_co_writev_flags,
-    .supported_write_flags = BDRV_REQ_FUA,
-    .bdrv_co_write_zeroes = &raw_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
    .bdrv_co_discard      = &raw_co_discard,
    .bdrv_co_get_block_status = &raw_co_get_block_status,
    .bdrv_truncate        = &raw_truncate,
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -290,7 +290,8 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
            if (only_read_conf_file) {
                ret = rados_conf_read_file(cluster, value);
                if (ret < 0) {
-                    error_setg(errp, "error reading conf file %s", value);
+                    error_setg_errno(errp, -ret, "error reading conf file %s",
+                                     value);
                    break;
                }
            }
@@ -299,7 +300,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
        } else if (!only_read_conf_file) {
            ret = rados_conf_set(cluster, name, value);
            if (ret < 0) {
-                error_setg(errp, "invalid conf option %s", name);
+                error_setg_errno(errp, -ret, "invalid conf option %s", name);
                ret = -EINVAL;
                break;
            }
@@ -354,9 +355,10 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
-    if (rados_create(&cluster, clientname) < 0) {
-        error_setg(errp, "error initializing");
-        return -EIO;
+    ret = rados_create(&cluster, clientname);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error initializing");
+        return ret;
    }

    if (strstr(conf, "conf=") == NULL) {
@@ -381,21 +383,27 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
        return -EIO;
    }

-    if (rados_connect(cluster) < 0) {
-        error_setg(errp, "error connecting");
+    ret = rados_connect(cluster);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error connecting");
        rados_shutdown(cluster);
-        return -EIO;
+        return ret;
    }

-    if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) {
-        error_setg(errp, "error opening pool %s", pool);
+    ret = rados_ioctx_create(cluster, pool, &io_ctx);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error opening pool %s", pool);
        rados_shutdown(cluster);
-        return -EIO;
+        return ret;
    }

    ret = rbd_create(io_ctx, name, bytes, &obj_order);
    rados_ioctx_destroy(io_ctx);
    rados_shutdown(cluster);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error rbd create");
+        return ret;
+    }

    return ret;
 }
@@ -500,7 +508,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
    r = rados_create(&s->cluster, clientname);
    if (r < 0) {
-        error_setg(errp, "error initializing");
+        error_setg_errno(errp, -r, "error initializing");
        goto failed_opts;
    }

@@ -546,19 +554,19 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,

    r = rados_connect(s->cluster);
    if (r < 0) {
-        error_setg(errp, "error connecting");
+        error_setg_errno(errp, -r, "error connecting");
        goto failed_shutdown;
    }

    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);
    if (r < 0) {
-        error_setg(errp, "error opening pool %s", pool);
+        error_setg_errno(errp, -r, "error opening pool %s", pool);
        goto failed_shutdown;
    }

    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
    if (r < 0) {
-        error_setg(errp, "error reading header from %s", s->name);
+        error_setg_errno(errp, -r, "error reading header from %s", s->name);
        goto failed_open;
    }

@@ -875,10 +883,8 @@ static int qemu_rbd_snap_rollback(BlockDriverState *bs,
                                  const char *snapshot_name)
 {
    BDRVRBDState *s = bs->opaque;
-    int r;

-    r = rbd_snap_rollback(s->image, snapshot_name);
-    return r;
+    return rbd_snap_rollback(s->image, snapshot_name);
 }

 static int qemu_rbd_snap_list(BlockDriverState *bs,
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode)

 #undef DPRINTF
 #ifdef DEBUG_SDOG
-#define DPRINTF(fmt, args...)                                       \
-    do {                                                            \
-        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
-    } while (0)
+#define DEBUG_SDOG_PRINT 1
 #else
-#define DPRINTF(fmt, args...)
+#define DEBUG_SDOG_PRINT 0
 #endif
+#define DPRINTF(fmt, args...)                                           \
+    do {                                                                \
+        if (DEBUG_SDOG_PRINT) {                                         \
+            fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
+        }                                                               \
+    } while (0)

 typedef struct SheepdogAIOCB SheepdogAIOCB;

@@ -1678,7 +1681,7 @@ static int sd_prealloc(const char *filename, Error **errp)
        if (ret < 0) {
            goto out;
        }
-        ret = blk_pwrite(blk, idx * buf_size, buf, buf_size);
+        ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0);
        if (ret < 0) {
            goto out;
        }
@@ -2781,12 +2784,19 @@ static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
    return ret;
 }

-static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
-                           int64_t pos, int size)
+static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+                           int64_t pos)
 {
    BDRVSheepdogState *s = bs->opaque;
+    void *buf;
+    int ret;

-    return do_load_save_vmstate(s, data, pos, size, 1);
+    buf = qemu_blockalign(bs, qiov->size);
+    ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1);
+    qemu_iovec_from_buf(qiov, 0, buf, qiov->size);
+    qemu_vfree(buf);
+
+    return ret;
 }


--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -358,9 +358,7 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
        ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err);
    }

-    if (local_err) {
-        error_propagate(errp, local_err);
-    }
+    error_propagate(errp, local_err);

    return ret;
 }
@@ -373,9 +371,10 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
 bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
 {
    bool ok = true;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (ok && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -383,8 +382,12 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
            ok = bdrv_can_snapshot(bs);
        }
        aio_context_release(ctx);
+        if (!ok) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return ok;
 }
@@ -393,10 +396,11 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
                             Error **err)
 {
    int ret = 0;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;
    QEMUSnapshotInfo sn1, *snapshot = &sn1;

-    while (ret == 0 && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -405,8 +409,12 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
            ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err);
        }
        aio_context_release(ctx);
+        if (ret < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return ret;
 }
@@ -415,9 +423,10 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
 int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
 {
    int err = 0;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (err == 0 && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -425,8 +434,12 @@ int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
            err = bdrv_snapshot_goto(bs, name);
        }
        aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return err;
 }
@@ -435,9 +448,10 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
 {
    QEMUSnapshotInfo sn;
    int err = 0;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (err == 0 && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -445,8 +459,12 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
            err = bdrv_snapshot_find(bs, &sn, name);
        }
        aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return err;
 }
@@ -457,9 +475,10 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
                             BlockDriverState **first_bad_bs)
 {
    int err = 0;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (err == 0 && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -471,23 +490,32 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
            err = bdrv_snapshot_create(bs, sn);
        }
        aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return err;
 }

 BlockDriverState *bdrv_all_find_vmstate_bs(void)
 {
-    bool not_found = true;
-    BlockDriverState *bs = NULL;
+    BlockDriverState *bs;
+    BdrvNextIterator it;

-    while (not_found && (bs = bdrv_next(bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);
+        bool found;

        aio_context_acquire(ctx);
-        not_found = !bdrv_can_snapshot(bs);
+        found = bdrv_can_snapshot(bs);
        aio_context_release(ctx);
+
+        if (found) {
+            break;
+        }
    }
    return bs;
 }
--- a/block/stream.c
+++ b/block/stream.c
@@ -39,7 +39,7 @@ typedef struct StreamBlockJob {
    char *backing_file_str;
 } StreamBlockJob;

-static int coroutine_fn stream_populate(BlockDriverState *bs,
+static int coroutine_fn stream_populate(BlockBackend *blk,
                                        int64_t sector_num, int nb_sectors,
                                        void *buf)
 {
@@ -52,7 +52,8 @@ static int coroutine_fn stream_populate(BlockDriverState *bs,
    qemu_iovec_init_external(&qiov, &iov, 1);

    /* Copy-on-read the unallocated clusters */
-    return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov);
+    return blk_co_preadv(blk, sector_num * BDRV_SECTOR_SIZE, qiov.size, &qiov,
+                         BDRV_REQ_COPY_ON_READ);
 }

 typedef struct {
@@ -64,6 +65,7 @@ static void stream_complete(BlockJob *job, void *opaque)
 {
    StreamBlockJob *s = container_of(job, StreamBlockJob, common);
    StreamCompleteData *data = opaque;
+    BlockDriverState *bs = blk_bs(job->blk);
    BlockDriverState *base = s->base;

    if (!block_job_is_cancelled(&s->common) && data->reached_end &&
@@ -75,8 +77,8 @@ static void stream_complete(BlockJob *job, void *opaque)
                base_fmt = base->drv->format_name;
            }
        }
-        data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt);
-        bdrv_set_backing_hd(job->bs, base);
+        data->ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+        bdrv_set_backing_hd(bs, base);
    }

    g_free(s->backing_file_str);
@@ -88,7 +90,8 @@ static void coroutine_fn stream_run(void *opaque)
 {
    StreamBlockJob *s = opaque;
    StreamCompleteData *data;
-    BlockDriverState *bs = s->common.bs;
+    BlockBackend *blk = s->common.blk;
+    BlockDriverState *bs = blk_bs(blk);
    BlockDriverState *base = s->base;
    int64_t sector_num = 0;
    int64_t end = -1;
@@ -159,12 +162,11 @@ wait:
                    goto wait;
                }
            }
-            ret = stream_populate(bs, sector_num, n, buf);
+            ret = stream_populate(blk, sector_num, n, buf);
        }
        if (ret < 0) {
            BlockErrorAction action =
-                block_job_error_action(&s->common, s->common.bs, s->on_error,
-                                       true, -ret);
+                block_job_error_action(&s->common, s->on_error, true, -ret);
            if (action == BLOCK_ERROR_ACTION_STOP) {
                n = 0;
                continue;
@@ -224,13 +226,6 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
 {
    StreamBlockJob *s;

-    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "on-error");
-        return;
-    }
-
    s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -23,13 +23,14 @@
 */

 #include "qemu/osdep.h"
+#include "sysemu/block-backend.h"
 #include "block/throttle-groups.h"
 #include "qemu/queue.h"
 #include "qemu/thread.h"
 #include "sysemu/qtest.h"

 /* The ThrottleGroup structure (with its ThrottleState) is shared
- * among different BlockDriverState and it's independent from
+ * among different BlockBackends and it's independent from
 * AioContext, so in order to use it from different threads it needs
 * its own locking.
 *
@@ -39,26 +40,26 @@
 * The whole ThrottleGroup structure is private and invisible to
 * outside users, that only use it through its ThrottleState.
 *
- * In addition to the ThrottleGroup structure, BlockDriverState has
+ * In addition to the ThrottleGroup structure, BlockBackendPublic has
 * fields that need to be accessed by other members of the group and
- * therefore also need to be protected by this lock. Once a BDS is
- * registered in a group those fields can be accessed by other threads
- * any time.
+ * therefore also need to be protected by this lock. Once a
+ * BlockBackend is registered in a group those fields can be accessed
+ * by other threads any time.
 *
 * Again, all this is handled internally and is mostly transparent to
 * the outside. The 'throttle_timers' field however has an additional
 * constraint because it may be temporarily invalid (see for example
 * bdrv_set_aio_context()). Therefore in this file a thread will
- * access some other BDS's timers only after verifying that that BDS
- * has throttled requests in the queue.
+ * access some other BlockBackend's timers only after verifying that
+ * that BlockBackend has throttled requests in the queue.
 */
 typedef struct ThrottleGroup {
    char *name; /* This is constant during the lifetime of the group */

    QemuMutex lock; /* This lock protects the following four fields */
    ThrottleState ts;
-    QLIST_HEAD(, BlockDriverState) head;
-    BlockDriverState *tokens[2];
+    QLIST_HEAD(, BlockBackendPublic) head;
+    BlockBackend *tokens[2];
    bool any_timer_armed[2];

    /* These two are protected by the global throttle_groups_lock */
@@ -132,93 +133,98 @@ void throttle_group_unref(ThrottleState *ts)
    qemu_mutex_unlock(&throttle_groups_lock);
 }

-/* Get the name from a BlockDriverState's ThrottleGroup. The name (and
- * the pointer) is guaranteed to remain constant during the lifetime
- * of the group.
+/* Get the name from a BlockBackend's ThrottleGroup. The name (and the pointer)
+ * is guaranteed to remain constant during the lifetime of the group.
 *
- * @bs:   a BlockDriverState that is member of a throttling group
+ * @blk:  a BlockBackend that is member of a throttling group
 * @ret:  the name of the group.
 */
-const char *throttle_group_get_name(BlockDriverState *bs)
+const char *throttle_group_get_name(BlockBackend *blk)
 {
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
    return tg->name;
 }

-/* Return the next BlockDriverState in the round-robin sequence,
- * simulating a circular list.
+/* Return the next BlockBackend in the round-robin sequence, simulating a
+ * circular list.
 *
 * This assumes that tg->lock is held.
 *
- * @bs:  the current BlockDriverState
- * @ret: the next BlockDriverState in the sequence
+ * @blk: the current BlockBackend
+ * @ret: the next BlockBackend in the sequence
 */
-static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs)
+static BlockBackend *throttle_group_next_blk(BlockBackend *blk)
 {
-    ThrottleState *ts = bs->throttle_state;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleState *ts = blkp->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
-    BlockDriverState *next = QLIST_NEXT(bs, round_robin);
+    BlockBackendPublic *next = QLIST_NEXT(blkp, round_robin);

    if (!next) {
-        return QLIST_FIRST(&tg->head);
+        next = QLIST_FIRST(&tg->head);
    }

-    return next;
+    return blk_by_public(next);
 }

-/* Return the next BlockDriverState in the round-robin sequence with
- * pending I/O requests.
+/* Return the next BlockBackend in the round-robin sequence with pending I/O
+ * requests.
 *
 * This assumes that tg->lock is held.
 *
- * @bs:        the current BlockDriverState
+ * @blk:       the current BlockBackend
 * @is_write:  the type of operation (read/write)
- * @ret:       the next BlockDriverState with pending requests, or bs
- *             if there is none.
+ * @ret:       the next BlockBackend with pending requests, or blk if there is
+ *             none.
 */
-static BlockDriverState *next_throttle_token(BlockDriverState *bs,
-                                             bool is_write)
+static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)
 {
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
-    BlockDriverState *token, *start;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
+    BlockBackend *token, *start;

    start = token = tg->tokens[is_write];

    /* get next bs round in round robin style */
-    token = throttle_group_next_bs(token);
-    while (token != start && !token->pending_reqs[is_write]) {
-        token = throttle_group_next_bs(token);
+    token = throttle_group_next_blk(token);
+    while (token != start && !blkp->pending_reqs[is_write]) {
+        token = throttle_group_next_blk(token);
    }

    /* If no IO are queued for scheduling on the next round robin token
     * then decide the token is the current bs because chances are
     * the current bs get the current request queued.
     */
-    if (token == start && !token->pending_reqs[is_write]) {
-        token = bs;
+    if (token == start && !blkp->pending_reqs[is_write]) {
+        token = blk;
    }

    return token;
 }

-/* Check if the next I/O request for a BlockDriverState needs to be
- * throttled or not. If there's no timer set in this group, set one
- * and update the token accordingly.
+/* Check if the next I/O request for a BlockBackend needs to be throttled or
+ * not. If there's no timer set in this group, set one and update the token
+ * accordingly.
 *
 * This assumes that tg->lock is held.
 *
- * @bs:         the current BlockDriverState
+ * @blk:        the current BlockBackend
 * @is_write:   the type of operation (read/write)
 * @ret:        whether the I/O request needs to be throttled or not
 */
-static bool throttle_group_schedule_timer(BlockDriverState *bs,
-                                          bool is_write)
+static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write)
 {
-    ThrottleState *ts = bs->throttle_state;
-    ThrottleTimers *tt = &bs->throttle_timers;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleState *ts = blkp->throttle_state;
+    ThrottleTimers *tt = &blkp->throttle_timers;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    bool must_wait;

+    if (blkp->io_limits_disabled) {
+        return false;
+    }
+
    /* Check if any of the timers in this group is already armed */
    if (tg->any_timer_armed[is_write]) {
        return true;
@@ -226,9 +232,9 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs,

    must_wait = throttle_schedule_timer(ts, tt, is_write);

-    /* If a timer just got armed, set bs as the current token */
+    /* If a timer just got armed, set blk as the current token */
    if (must_wait) {
-        tg->tokens[is_write] = bs;
+        tg->tokens[is_write] = blk;
        tg->any_timer_armed[is_write] = true;
    }

@@ -239,18 +245,19 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs,
 *
 * This assumes that tg->lock is held.
 *
- * @bs:        the current BlockDriverState
+ * @blk:       the current BlockBackend
 * @is_write:  the type of operation (read/write)
 */
-static void schedule_next_request(BlockDriverState *bs, bool is_write)
+static void schedule_next_request(BlockBackend *blk, bool is_write)
 {
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
    bool must_wait;
-    BlockDriverState *token;
+    BlockBackend *token;

    /* Check if there's any pending request to schedule next */
-    token = next_throttle_token(bs, is_write);
-    if (!token->pending_reqs[is_write]) {
+    token = next_throttle_token(blk, is_write);
+    if (!blkp->pending_reqs[is_write]) {
        return;
    }

@@ -259,12 +266,12 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write)

    /* If it doesn't have to wait, queue it for immediate execution */
    if (!must_wait) {
-        /* Give preference to requests from the current bs */
+        /* Give preference to requests from the current blk */
        if (qemu_in_coroutine() &&
-            qemu_co_queue_next(&bs->throttled_reqs[is_write])) {
-            token = bs;
+            qemu_co_queue_next(&blkp->throttled_reqs[is_write])) {
+            token = blk;
        } else {
-            ThrottleTimers *tt = &token->throttle_timers;
+            ThrottleTimers *tt = &blkp->throttle_timers;
            int64_t now = qemu_clock_get_ns(tt->clock_type);
            timer_mod(tt->timers[is_write], now + 1);
            tg->any_timer_armed[is_write] = true;
@@ -277,53 +284,67 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write)
 * if necessary, and schedule the next request using a round robin
 * algorithm.
 *
- * @bs:        the current BlockDriverState
+ * @blk:       the current BlockBackend
 * @bytes:     the number of bytes for this I/O
 * @is_write:  the type of operation (read/write)
 */
-void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs,
+void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
                                                        unsigned int bytes,
                                                        bool is_write)
 {
    bool must_wait;
-    BlockDriverState *token;
+    BlockBackend *token;

-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
    qemu_mutex_lock(&tg->lock);

    /* First we check if this I/O has to be throttled. */
-    token = next_throttle_token(bs, is_write);
+    token = next_throttle_token(blk, is_write);
    must_wait = throttle_group_schedule_timer(token, is_write);

    /* Wait if there's a timer set or queued requests of this type */
-    if (must_wait || bs->pending_reqs[is_write]) {
-        bs->pending_reqs[is_write]++;
+    if (must_wait || blkp->pending_reqs[is_write]) {
+        blkp->pending_reqs[is_write]++;
        qemu_mutex_unlock(&tg->lock);
-        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+        qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
        qemu_mutex_lock(&tg->lock);
-        bs->pending_reqs[is_write]--;
+        blkp->pending_reqs[is_write]--;
    }

    /* The I/O will be executed, so do the accounting */
-    throttle_account(bs->throttle_state, is_write, bytes);
+    throttle_account(blkp->throttle_state, is_write, bytes);

    /* Schedule the next request */
-    schedule_next_request(bs, is_write);
+    schedule_next_request(blk, is_write);

    qemu_mutex_unlock(&tg->lock);
 }

+void throttle_group_restart_blk(BlockBackend *blk)
+{
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        while (qemu_co_enter_next(&blkp->throttled_reqs[i])) {
+            ;
+        }
+    }
+}
+
 /* Update the throttle configuration for a particular group. Similar
 * to throttle_config(), but guarantees atomicity within the
 * throttling group.
 *
- * @bs:  a BlockDriverState that is member of the group
+ * @blk: a BlockBackend that is a member of the group
 * @cfg: the configuration to set
 */
-void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
+void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg)
 {
-    ThrottleTimers *tt = &bs->throttle_timers;
-    ThrottleState *ts = bs->throttle_state;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleTimers *tt = &blkp->throttle_timers;
+    ThrottleState *ts = blkp->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    qemu_mutex_lock(&tg->lock);
    /* throttle_config() cancels the timers */
@@ -335,18 +356,22 @@ void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
    }
    throttle_config(ts, tt, cfg);
    qemu_mutex_unlock(&tg->lock);
+
+    qemu_co_enter_next(&blkp->throttled_reqs[0]);
+    qemu_co_enter_next(&blkp->throttled_reqs[1]);
 }

 /* Get the throttle configuration from a particular group. Similar to
 * throttle_get_config(), but guarantees atomicity within the
 * throttling group.
 *
- * @bs:  a BlockDriverState that is member of the group
+ * @blk: a BlockBackend that is a member of the group
 * @cfg: the configuration will be written here
 */
-void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
+void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg)
 {
-    ThrottleState *ts = bs->throttle_state;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleState *ts = blkp->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    qemu_mutex_lock(&tg->lock);
    throttle_get_config(ts, cfg);
@@ -356,12 +381,13 @@ void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
 /* ThrottleTimers callback. This wakes up a request that was waiting
 * because it had been throttled.
 *
- * @bs:        the BlockDriverState whose request had been throttled
+ * @blk:       the BlockBackend whose request had been throttled
 * @is_write:  the type of operation (read/write)
 */
-static void timer_cb(BlockDriverState *bs, bool is_write)
+static void timer_cb(BlockBackend *blk, bool is_write)
 {
-    ThrottleState *ts = bs->throttle_state;
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleState *ts = blkp->throttle_state;
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    bool empty_queue;

@@ -371,13 +397,13 @@ static void timer_cb(BlockDriverState *bs, bool is_write)
    qemu_mutex_unlock(&tg->lock);

    /* Run the request that was waiting for this timer */
-    empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]);
+    empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);

    /* If the request queue was empty then we have to take care of
     * scheduling the next one */
    if (empty_queue) {
        qemu_mutex_lock(&tg->lock);
-        schedule_next_request(bs, is_write);
+        schedule_next_request(blk, is_write);
        qemu_mutex_unlock(&tg->lock);
    }
 }
@@ -392,17 +418,17 @@ static void write_timer_cb(void *opaque)
    timer_cb(opaque, true);
 }

-/* Register a BlockDriverState in the throttling group, also
- * initializing its timers and updating its throttle_state pointer to
- * point to it. If a throttling group with that name does not exist
- * yet, it will be created.
+/* Register a BlockBackend in the throttling group, also initializing its
+ * timers and updating its throttle_state pointer to point to it. If a
+ * throttling group with that name does not exist yet, it will be created.
 *
- * @bs:        the BlockDriverState to insert
+ * @blk:       the BlockBackend to insert
 * @groupname: the name of the group
 */
-void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
+void throttle_group_register_blk(BlockBackend *blk, const char *groupname)
 {
    int i;
+    BlockBackendPublic *blkp = blk_get_public(blk);
    ThrottleState *ts = throttle_group_incref(groupname);
    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
    int clock_type = QEMU_CLOCK_REALTIME;
@@ -412,67 +438,67 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
        clock_type = QEMU_CLOCK_VIRTUAL;
    }

-    bs->throttle_state = ts;
+    blkp->throttle_state = ts;

    qemu_mutex_lock(&tg->lock);
-    /* If the ThrottleGroup is new set this BlockDriverState as the token */
+    /* If the ThrottleGroup is new set this BlockBackend as the token */
    for (i = 0; i < 2; i++) {
        if (!tg->tokens[i]) {
-            tg->tokens[i] = bs;
+            tg->tokens[i] = blk;
        }
    }

-    QLIST_INSERT_HEAD(&tg->head, bs, round_robin);
+    QLIST_INSERT_HEAD(&tg->head, blkp, round_robin);

-    throttle_timers_init(&bs->throttle_timers,
-                         bdrv_get_aio_context(bs),
+    throttle_timers_init(&blkp->throttle_timers,
+                         blk_get_aio_context(blk),
                         clock_type,
                         read_timer_cb,
                         write_timer_cb,
-                         bs);
+                         blk);

    qemu_mutex_unlock(&tg->lock);
 }

-/* Unregister a BlockDriverState from its group, removing it from the
- * list, destroying the timers and setting the throttle_state pointer
- * to NULL.
+/* Unregister a BlockBackend from its group, removing it from the list,
+ * destroying the timers and setting the throttle_state pointer to NULL.
 *
- * The BlockDriverState must not have pending throttled requests, so
- * the caller has to drain them first.
+ * The BlockBackend must not have pending throttled requests, so the caller has
+ * to drain them first.
 *
 * The group will be destroyed if it's empty after this operation.
 *
- * @bs: the BlockDriverState to remove
+ * @blk: the BlockBackend to remove
 */
-void throttle_group_unregister_bs(BlockDriverState *bs)
+void throttle_group_unregister_blk(BlockBackend *blk)
 {
-    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockBackendPublic *blkp = blk_get_public(blk);
+    ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
    int i;

-    assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0);
-    assert(qemu_co_queue_empty(&bs->throttled_reqs[0]));
-    assert(qemu_co_queue_empty(&bs->throttled_reqs[1]));
+    assert(blkp->pending_reqs[0] == 0 && blkp->pending_reqs[1] == 0);
+    assert(qemu_co_queue_empty(&blkp->throttled_reqs[0]));
+    assert(qemu_co_queue_empty(&blkp->throttled_reqs[1]));

    qemu_mutex_lock(&tg->lock);
    for (i = 0; i < 2; i++) {
-        if (tg->tokens[i] == bs) {
-            BlockDriverState *token = throttle_group_next_bs(bs);
-            /* Take care of the case where this is the last bs in the group */
-            if (token == bs) {
+        if (tg->tokens[i] == blk) {
+            BlockBackend *token = throttle_group_next_blk(blk);
+            /* Take care of the case where this is the last blk in the group */
+            if (token == blk) {
                token = NULL;
            }
            tg->tokens[i] = token;
        }
    }

-    /* remove the current bs from the list */
-    QLIST_REMOVE(bs, round_robin);
-    throttle_timers_destroy(&bs->throttle_timers);
+    /* remove the current blk from the list */
+    QLIST_REMOVE(blkp, round_robin);
+    throttle_timers_destroy(&blkp->throttle_timers);
    qemu_mutex_unlock(&tg->lock);

    throttle_group_unref(&tg->ts);
-    bs->throttle_state = NULL;
+    blkp->throttle_state = NULL;
 }

 static void throttle_groups_init(void)
--- a/block/trace-events
+++ b/block/trace-events
@@ -0,0 +1,116 @@
+# See docs/trace-events.txt for syntax documentation.
+
+# block.c
+bdrv_open_common(void *bs, const char *filename, int flags, const char *format_name) "bs %p filename \"%s\" flags %#x format_name \"%s\""
+bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
+
+# block/block-backend.c
+blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
+blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
+
+# block/io.c
+bdrv_aio_discard(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
+bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x"
+bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %u"
+
+# block/stream.c
+stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
+stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p"
+
+# block/commit.c
+commit_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
+commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) "bs %p base %p top %p s %p co %p opaque %p"
+
+# block/mirror.c
+mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
+mirror_restart_iter(void *s, int64_t cnt) "s %p dirty count %"PRId64
+mirror_before_flush(void *s) "s %p"
+mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
+mirror_before_sleep(void *s, int64_t cnt, int synced, uint64_t delay_ns) "s %p dirty count %"PRId64" synced %d delay %"PRIu64"ns"
+mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
+mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors, int ret) "s %p sector_num %"PRId64" nb_sectors %d ret %d"
+mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d"
+mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d"
+mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
+mirror_break_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
+
+# block/backup.c
+backup_do_cow_enter(void *job, int64_t start, int64_t sector_num, int nb_sectors) "job %p start %"PRId64" sector_num %"PRId64" nb_sectors %d"
+backup_do_cow_return(void *job, int64_t sector_num, int nb_sectors, int ret) "job %p sector_num %"PRId64" nb_sectors %d ret %d"
+backup_do_cow_skip(void *job, int64_t start) "job %p start %"PRId64
+backup_do_cow_process(void *job, int64_t start) "job %p start %"PRId64
+backup_do_cow_read_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d"
+backup_do_cow_write_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d"
+
+# blockdev.c
+qmp_block_job_cancel(void *job) "job %p"
+qmp_block_job_pause(void *job) "job %p"
+qmp_block_job_resume(void *job) "job %p"
+qmp_block_job_complete(void *job) "job %p"
+block_job_cb(void *bs, void *job, int ret) "bs %p job %p ret %d"
+qmp_block_stream(void *bs, void *job) "bs %p job %p"
+
+# block/raw-win32.c
+# block/raw-posix.c
+paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d"
+paio_submit(void *acb, void *opaque, int64_t sector_num, int nb_sectors, int type) "acb %p opaque %p sector_num %"PRId64" nb_sectors %d type %d"
+
+# block/qcow2.c
+qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
+qcow2_writev_done_req(void *co, int ret) "co %p ret %d"
+qcow2_writev_start_part(void *co) "co %p"
+qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d"
+qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64
+qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
+qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
+
+# block/qcow2-cluster.c
+qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
+qcow2_handle_copied(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
+qcow2_handle_alloc(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
+qcow2_do_alloc_clusters_offset(void *co, uint64_t guest_offset, uint64_t host_offset, int nb_clusters) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " nb_clusters %d"
+qcow2_cluster_alloc_phys(void *co) "co %p"
+qcow2_cluster_link_l2(void *co, int nb_clusters) "co %p nb_clusters %d"
+
+qcow2_l2_allocate(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_get_empty(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_write_l2(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_write_l1(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_done(void *bs, int l1_index, int ret) "bs %p l1_index %d ret %d"
+
+# block/qcow2-cache.c
+qcow2_cache_get(void *co, int c, uint64_t offset, bool read_from_disk) "co %p is_l2_cache %d offset %" PRIx64 " read_from_disk %d"
+qcow2_cache_get_replace_entry(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+qcow2_cache_get_read(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+qcow2_cache_get_done(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+qcow2_cache_flush(void *co, int c) "co %p is_l2_cache %d"
+qcow2_cache_entry_flush(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+
+# block/qed-l2-cache.c
+qed_alloc_l2_cache_entry(void *l2_cache, void *entry) "l2_cache %p entry %p"
+qed_unref_l2_cache_entry(void *entry, int ref) "entry %p ref %d"
+qed_find_l2_cache_entry(void *l2_cache, void *entry, uint64_t offset, int ref) "l2_cache %p entry %p offset %"PRIu64" ref %d"
+
+# block/qed-table.c
+qed_read_table(void *s, uint64_t offset, void *table) "s %p offset %"PRIu64" table %p"
+qed_read_table_cb(void *s, void *table, int ret) "s %p table %p ret %d"
+qed_write_table(void *s, uint64_t offset, void *table, unsigned int index, unsigned int n) "s %p offset %"PRIu64" table %p index %u n %u"
+qed_write_table_cb(void *s, void *table, int flush, int ret) "s %p table %p flush %d ret %d"
+
+# block/qed.c
+qed_need_check_timer_cb(void *s) "s %p"
+qed_start_need_check_timer(void *s) "s %p"
+qed_cancel_need_check_timer(void *s) "s %p"
+qed_aio_complete(void *s, void *acb, int ret) "s %p acb %p ret %d"
+qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void *opaque, int flags) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p flags %#x"
+qed_aio_next_io(void *s, void *acb, int ret, uint64_t cur_pos) "s %p acb %p ret %d cur_pos %"PRIu64
+qed_aio_read_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
+qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
+qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
+qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
+qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -54,6 +54,7 @@
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "migration/migration.h"
 #include "qemu/coroutine.h"
 #include "qemu/cutils.h"
@@ -557,98 +558,109 @@ static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs,
    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 }

-static int vdi_co_read(BlockDriverState *bs,
-        int64_t sector_num, uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+              QEMUIOVector *qiov, int flags)
 {
    BDRVVdiState *s = bs->opaque;
+    QEMUIOVector local_qiov;
    uint32_t bmap_entry;
    uint32_t block_index;
-    uint32_t sector_in_block;
-    uint32_t n_sectors;
+    uint32_t offset_in_block;
+    uint32_t n_bytes;
+    uint64_t bytes_done = 0;
    int ret = 0;

    logout("\n");

-    while (ret >= 0 && nb_sectors > 0) {
-        block_index = sector_num / s->block_sectors;
-        sector_in_block = sector_num % s->block_sectors;
-        n_sectors = s->block_sectors - sector_in_block;
-        if (n_sectors > nb_sectors) {
-            n_sectors = nb_sectors;
-        }
+    qemu_iovec_init(&local_qiov, qiov->niov);

-        logout("will read %u sectors starting at sector %" PRIu64 "\n",
-               n_sectors, sector_num);
+    while (ret >= 0 && bytes > 0) {
+        block_index = offset / s->block_size;
+        offset_in_block = offset % s->block_size;
+        n_bytes = MIN(bytes, s->block_size - offset_in_block);
+
+        logout("will read %u bytes starting at offset %" PRIu64 "\n",
+               n_bytes, offset);

        /* prepare next AIO request */
        bmap_entry = le32_to_cpu(s->bmap[block_index]);
        if (!VDI_IS_ALLOCATED(bmap_entry)) {
            /* Block not allocated, return zeros, no need to wait. */
-            memset(buf, 0, n_sectors * SECTOR_SIZE);
+            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
            ret = 0;
        } else {
-            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
-                              (uint64_t)bmap_entry * s->block_sectors +
-                              sector_in_block;
-            ret = bdrv_read(bs->file->bs, offset, buf, n_sectors);
-        }
-        logout("%u sectors read\n", n_sectors);
+            uint64_t data_offset = s->header.offset_data +
+                                   (uint64_t)bmap_entry * s->block_size +
+                                   offset_in_block;

-        nb_sectors -= n_sectors;
-        sector_num += n_sectors;
-        buf += n_sectors * SECTOR_SIZE;
+            qemu_iovec_reset(&local_qiov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+            ret = bdrv_co_preadv(bs->file->bs, data_offset, n_bytes,
+                                 &local_qiov, 0);
+        }
+        logout("%u bytes read\n", n_bytes);
+
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;
    }

+    qemu_iovec_destroy(&local_qiov);
+
    return ret;
 }

-static int vdi_co_write(BlockDriverState *bs,
-        int64_t sector_num, const uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+               QEMUIOVector *qiov, int flags)
 {
    BDRVVdiState *s = bs->opaque;
+    QEMUIOVector local_qiov;
    uint32_t bmap_entry;
    uint32_t block_index;
-    uint32_t sector_in_block;
-    uint32_t n_sectors;
+    uint32_t offset_in_block;
+    uint32_t n_bytes;
    uint32_t bmap_first = VDI_UNALLOCATED;
    uint32_t bmap_last = VDI_UNALLOCATED;
    uint8_t *block = NULL;
+    uint64_t bytes_done = 0;
    int ret = 0;

    logout("\n");

-    while (ret >= 0 && nb_sectors > 0) {
-        block_index = sector_num / s->block_sectors;
-        sector_in_block = sector_num % s->block_sectors;
-        n_sectors = s->block_sectors - sector_in_block;
-        if (n_sectors > nb_sectors) {
-            n_sectors = nb_sectors;
-        }
+    qemu_iovec_init(&local_qiov, qiov->niov);

-        logout("will write %u sectors starting at sector %" PRIu64 "\n",
-               n_sectors, sector_num);
+    while (ret >= 0 && bytes > 0) {
+        block_index = offset / s->block_size;
+        offset_in_block = offset % s->block_size;
+        n_bytes = MIN(bytes, s->block_size - offset_in_block);
+
+        logout("will write %u bytes starting at offset %" PRIu64 "\n",
+               n_bytes, offset);

        /* prepare next AIO request */
        bmap_entry = le32_to_cpu(s->bmap[block_index]);
        if (!VDI_IS_ALLOCATED(bmap_entry)) {
            /* Allocate new block and write to it. */
-            uint64_t offset;
+            uint64_t data_offset;
            bmap_entry = s->header.blocks_allocated;
            s->bmap[block_index] = cpu_to_le32(bmap_entry);
            s->header.blocks_allocated++;
-            offset = s->header.offset_data / SECTOR_SIZE +
-                     (uint64_t)bmap_entry * s->block_sectors;
+            data_offset = s->header.offset_data +
+                          (uint64_t)bmap_entry * s->block_size;
            if (block == NULL) {
                block = g_malloc(s->block_size);
                bmap_first = block_index;
            }
            bmap_last = block_index;
            /* Copy data to be written to new block and zero unused parts. */
-            memset(block, 0, sector_in_block * SECTOR_SIZE);
-            memcpy(block + sector_in_block * SECTOR_SIZE,
-                   buf, n_sectors * SECTOR_SIZE);
-            memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
-                   (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
+            memset(block, 0, offset_in_block);
+            qemu_iovec_to_buf(qiov, bytes_done, block + offset_in_block,
+                              n_bytes);
+            memset(block + offset_in_block + n_bytes, 0,
+                   s->block_size - n_bytes - offset_in_block);

            /* Note that this coroutine does not yield anywhere from reading the
             * bmap entry until here, so in regards to all the coroutines trying
@@ -658,12 +670,12 @@ static int vdi_co_write(BlockDriverState *bs,
             * acquire the lock and thus the padded cluster is written before
             * the other coroutines can write to the affected area. */
            qemu_co_mutex_lock(&s->write_lock);
-            ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors);
+            ret = bdrv_pwrite(bs->file->bs, data_offset, block, s->block_size);
            qemu_co_mutex_unlock(&s->write_lock);
        } else {
-            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
-                              (uint64_t)bmap_entry * s->block_sectors +
-                              sector_in_block;
+            uint64_t data_offset = s->header.offset_data +
+                                   (uint64_t)bmap_entry * s->block_size +
+                                   offset_in_block;
            qemu_co_mutex_lock(&s->write_lock);
            /* This lock is only used to make sure the following write operation
             * is executed after the write issued by the coroutine allocating
@@ -674,16 +686,23 @@ static int vdi_co_write(BlockDriverState *bs,
             * that that write operation has returned (there may be other writes
             * in flight, but they do not concern this very operation). */
            qemu_co_mutex_unlock(&s->write_lock);
-            ret = bdrv_write(bs->file->bs, offset, buf, n_sectors);
+
+            qemu_iovec_reset(&local_qiov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+            ret = bdrv_co_pwritev(bs->file->bs, data_offset, n_bytes,
+                                  &local_qiov, 0);
        }

-        nb_sectors -= n_sectors;
-        sector_num += n_sectors;
-        buf += n_sectors * SECTOR_SIZE;
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;

-        logout("%u sectors written\n", n_sectors);
+        logout("%u bytes written\n", n_bytes);
    }

+    qemu_iovec_destroy(&local_qiov);
+
    logout("finished data write\n");
    if (ret < 0) {
        return ret;
@@ -694,6 +713,7 @@ static int vdi_co_write(BlockDriverState *bs,
        VdiHeader *header = (VdiHeader *) block;
        uint8_t *base;
        uint64_t offset;
+        uint32_t n_sectors;

        logout("now writing modified header\n");
        assert(VDI_IS_ALLOCATED(bmap_first));
@@ -808,7 +828,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
    vdi_header_print(&header);
 #endif
    vdi_header_to_le(&header);
-    ret = blk_pwrite(blk, offset, &header, sizeof(header));
+    ret = blk_pwrite(blk, offset, &header, sizeof(header), 0);
    if (ret < 0) {
        error_setg(errp, "Error writing header to %s", filename);
        goto exit;
@@ -829,7 +849,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
                bmap[i] = VDI_UNALLOCATED;
            }
        }
-        ret = blk_pwrite(blk, offset, bmap, bmap_size);
+        ret = blk_pwrite(blk, offset, bmap, bmap_size, 0);
        if (ret < 0) {
            error_setg(errp, "Error writing bmap to %s", filename);
            goto exit;
@@ -903,9 +923,9 @@ static BlockDriver bdrv_vdi = {
    .bdrv_co_get_block_status = vdi_co_get_block_status,
    .bdrv_make_empty = vdi_make_empty,

-    .bdrv_read = vdi_co_read,
+    .bdrv_co_preadv     = vdi_co_preadv,
 #if defined(CONFIG_VDI_WRITE)
-    .bdrv_write = vdi_co_write,
+    .bdrv_co_pwritev    = vdi_co_pwritev,
 #endif

    .bdrv_get_info = vdi_get_info,
--- a/block/vhdx-endian.c
+++ b/block/vhdx-endian.c
@@ -18,6 +18,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block_int.h"
+#include "qemu/bswap.h"
 #include "block/vhdx.h"

 #include <uuid/uuid.h>
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -23,6 +23,7 @@
 #include "block/block_int.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "block/vhdx.h"


--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -22,11 +22,11 @@
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "qemu/crc32c.h"
+#include "qemu/bswap.h"
 #include "block/vhdx.h"
 #include "migration/migration.h"

 #include <uuid/uuid.h>
-#include <glib.h>

 /* Options for VHDX creation */

@@ -1856,13 +1856,14 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
    creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL,
                              &creator_items, NULL);
    signature = cpu_to_le64(VHDX_FILE_SIGNATURE);
-    ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature));
+    ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature),
+                     0);
    if (ret < 0) {
        goto delete_and_exit;
    }
    if (creator) {
        ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature),
-                         creator, creator_items * sizeof(gunichar2));
+                         creator, creator_items * sizeof(gunichar2), 0);
        if (ret < 0) {
            goto delete_and_exit;
        }
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -30,10 +30,10 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "migration/migration.h"
 #include "qemu/cutils.h"
 #include <zlib.h>
-#include <glib.h>

 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
 #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
@@ -997,9 +997,9 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)

    for (i = 0; i < s->num_extents; i++) {
        if (!s->extents[i].flat) {
-            bs->bl.write_zeroes_alignment =
-                MAX(bs->bl.write_zeroes_alignment,
-                    s->extents[i].cluster_sectors);
+            bs->bl.pwrite_zeroes_alignment =
+                MAX(bs->bl.pwrite_zeroes_alignment,
+                    s->extents[i].cluster_sectors << BDRV_SECTOR_BITS);
        }
    }
 }
@@ -1016,27 +1016,26 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
 */
 static int get_whole_cluster(BlockDriverState *bs,
                             VmdkExtent *extent,
-                             uint64_t cluster_sector_num,
-                             uint64_t sector_num,
-                             uint64_t skip_start_sector,
-                             uint64_t skip_end_sector)
+                             uint64_t cluster_offset,
+                             uint64_t offset,
+                             uint64_t skip_start_bytes,
+                             uint64_t skip_end_bytes)
 {
    int ret = VMDK_OK;
    int64_t cluster_bytes;
    uint8_t *whole_grain;

    /* For COW, align request sector_num to cluster start */
-    sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
+    offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
    whole_grain = qemu_blockalign(bs, cluster_bytes);

    if (!bs->backing) {
-        memset(whole_grain, 0,  skip_start_sector << BDRV_SECTOR_BITS);
-        memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
-               cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
+        memset(whole_grain, 0, skip_start_bytes);
+        memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
    }

-    assert(skip_end_sector <= extent->cluster_sectors);
+    assert(skip_end_bytes <= cluster_bytes);
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
    if (bs->backing && !vmdk_is_cid_valid(bs)) {
@@ -1045,42 +1044,43 @@ static int get_whole_cluster(BlockDriverState *bs,
    }

    /* Read backing data before skip range */
-    if (skip_start_sector > 0) {
+    if (skip_start_bytes > 0) {
        if (bs->backing) {
-            ret = bdrv_read(bs->backing->bs, sector_num,
-                            whole_grain, skip_start_sector);
+            ret = bdrv_pread(bs->backing->bs, offset, whole_grain,
+                             skip_start_bytes);
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
-        ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain,
-                         skip_start_sector);
+        ret = bdrv_pwrite(extent->file->bs, cluster_offset, whole_grain,
+                          skip_start_bytes);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }
    /* Read backing data after skip range */
-    if (skip_end_sector < extent->cluster_sectors) {
+    if (skip_end_bytes < cluster_bytes) {
        if (bs->backing) {
-            ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector,
-                            whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
-                            extent->cluster_sectors - skip_end_sector);
+            ret = bdrv_pread(bs->backing->bs, offset + skip_end_bytes,
+                             whole_grain + skip_end_bytes,
+                             cluster_bytes - skip_end_bytes);
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
-        ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector,
-                         whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
-                         extent->cluster_sectors - skip_end_sector);
+        ret = bdrv_pwrite(extent->file->bs, cluster_offset + skip_end_bytes,
+                          whole_grain + skip_end_bytes,
+                          cluster_bytes - skip_end_bytes);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }

+    ret = VMDK_OK;
 exit:
    qemu_vfree(whole_grain);
    return ret;
@@ -1142,8 +1142,8 @@ static int get_cluster_offset(BlockDriverState *bs,
                              uint64_t offset,
                              bool allocate,
                              uint64_t *cluster_offset,
-                              uint64_t skip_start_sector,
-                              uint64_t skip_end_sector)
+                              uint64_t skip_start_bytes,
+                              uint64_t skip_end_bytes)
 {
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
@@ -1230,10 +1230,8 @@ static int get_cluster_offset(BlockDriverState *bs,
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
-        ret = get_whole_cluster(bs, extent,
-                                cluster_sector,
-                                offset >> BDRV_SECTOR_BITS,
-                                skip_start_sector, skip_end_sector);
+        ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
+                                offset, skip_start_bytes, skip_end_bytes);
        if (ret) {
            return ret;
        }
@@ -1259,15 +1257,24 @@ static VmdkExtent *find_extent(BDRVVmdkState *s,
    return NULL;
 }

+static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
+                                                   int64_t offset)
+{
+    uint64_t extent_begin_offset, extent_relative_offset;
+    uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
+
+    extent_begin_offset =
+        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
+    extent_relative_offset = offset - extent_begin_offset;
+    return extent_relative_offset % cluster_size;
+}
+
 static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
                                                  int64_t sector_num)
 {
-    uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num;
-
-    extent_begin_sector = extent->end_sector - extent->sectors;
-    extent_relative_sector_num = sector_num - extent_begin_sector;
-    index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
-    return index_in_cluster;
+    uint64_t offset;
+    offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE);
+    return offset / BDRV_SECTOR_SIZE;
 }

 static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
@@ -1319,38 +1326,57 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
 }

 static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
-                            int64_t offset_in_cluster, const uint8_t *buf,
-                            int nb_sectors, int64_t sector_num)
+                            int64_t offset_in_cluster, QEMUIOVector *qiov,
+                            uint64_t qiov_offset, uint64_t n_bytes,
+                            uint64_t offset)
 {
    int ret;
    VmdkGrainMarker *data = NULL;
    uLongf buf_len;
-    const uint8_t *write_buf = buf;
-    int write_len = nb_sectors * 512;
+    QEMUIOVector local_qiov;
+    struct iovec iov;
    int64_t write_offset;
    int64_t write_end_sector;

    if (extent->compressed) {
+        void *compressed_data;
+
        if (!extent->has_marker) {
            ret = -EINVAL;
            goto out;
        }
        buf_len = (extent->cluster_sectors << 9) * 2;
        data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
-        if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
-                buf_len == 0) {
+
+        compressed_data = g_malloc(n_bytes);
+        qemu_iovec_to_buf(qiov, qiov_offset, compressed_data, n_bytes);
+        ret = compress(data->data, &buf_len, compressed_data, n_bytes);
+        g_free(compressed_data);
+
+        if (ret != Z_OK || buf_len == 0) {
            ret = -EINVAL;
            goto out;
        }
-        data->lba = sector_num;
-        data->size = buf_len;
-        write_buf = (uint8_t *)data;
-        write_len = buf_len + sizeof(VmdkGrainMarker);
-    }
-    write_offset = cluster_offset + offset_in_cluster,
-    ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len);

-    write_end_sector = DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE);
+        data->lba = offset >> BDRV_SECTOR_BITS;
+        data->size = buf_len;
+
+        n_bytes = buf_len + sizeof(VmdkGrainMarker);
+        iov = (struct iovec) {
+            .iov_base   = data,
+            .iov_len    = n_bytes,
+        };
+        qemu_iovec_init_external(&local_qiov, &iov, 1);
+    } else {
+        qemu_iovec_init(&local_qiov, qiov->niov);
+        qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes);
+    }
+
+    write_offset = cluster_offset + offset_in_cluster,
+    ret = bdrv_co_pwritev(extent->file->bs, write_offset, n_bytes,
+                          &local_qiov, 0);
+
+    write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE);

    if (extent->compressed) {
        extent->next_cluster_sector = write_end_sector;
@@ -1359,19 +1385,21 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
                                          write_end_sector);
    }

-    if (ret != write_len) {
-        ret = ret < 0 ? ret : -EIO;
+    if (ret < 0) {
        goto out;
    }
    ret = 0;
 out:
    g_free(data);
+    if (!extent->compressed) {
+        qemu_iovec_destroy(&local_qiov);
+    }
    return ret;
 }

 static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
-                            int64_t offset_in_cluster, uint8_t *buf,
-                            int nb_sectors)
+                            int64_t offset_in_cluster, QEMUIOVector *qiov,
+                            int bytes)
 {
    int ret;
    int cluster_bytes, buf_bytes;
@@ -1383,14 +1411,13 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,


    if (!extent->compressed) {
-        ret = bdrv_pread(extent->file->bs,
-                          cluster_offset + offset_in_cluster,
-                          buf, nb_sectors * 512);
-        if (ret == nb_sectors * 512) {
-            return 0;
-        } else {
-            return -EIO;
+        ret = bdrv_co_preadv(extent->file->bs,
+                             cluster_offset + offset_in_cluster, bytes,
+                             qiov, 0);
+        if (ret < 0) {
+            return ret;
        }
+        return 0;
    }
    cluster_bytes = extent->cluster_sectors * 512;
    /* Read two clusters in case GrainMarker + compressed data > one cluster */
@@ -1422,11 +1449,11 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,

    }
    if (offset_in_cluster < 0 ||
-            offset_in_cluster + nb_sectors * 512 > buf_len) {
+            offset_in_cluster + bytes > buf_len) {
        ret = -EINVAL;
        goto out;
    }
-    memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
+    qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
    ret = 0;

 out:
@@ -1435,64 +1462,73 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
    return ret;
 }

-static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+               QEMUIOVector *qiov, int flags)
 {
    BDRVVmdkState *s = bs->opaque;
    int ret;
-    uint64_t n, index_in_cluster;
+    uint64_t n_bytes, offset_in_cluster;
    VmdkExtent *extent = NULL;
+    QEMUIOVector local_qiov;
    uint64_t cluster_offset;
+    uint64_t bytes_done = 0;

-    while (nb_sectors > 0) {
-        extent = find_extent(s, sector_num, extent);
+    qemu_iovec_init(&local_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+
+    while (bytes > 0) {
+        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
        if (!extent) {
-            return -EIO;
+            ret = -EIO;
+            goto fail;
        }
        ret = get_cluster_offset(bs, extent, NULL,
-                                 sector_num << 9, false, &cluster_offset,
-                                 0, 0);
-        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
-        n = extent->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors) {
-            n = nb_sectors;
-        }
+                                 offset, false, &cluster_offset, 0, 0);
+        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
+
+        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
+                             - offset_in_cluster);
+
        if (ret != VMDK_OK) {
            /* if not allocated, try to read from parent image, if exist */
            if (bs->backing && ret != VMDK_ZEROED) {
                if (!vmdk_is_cid_valid(bs)) {
-                    return -EINVAL;
+                    ret = -EINVAL;
+                    goto fail;
                }
-                ret = bdrv_read(bs->backing->bs, sector_num, buf, n);
+
+                qemu_iovec_reset(&local_qiov);
+                qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+                ret = bdrv_co_preadv(bs->backing->bs, offset, n_bytes,
+                                     &local_qiov, 0);
                if (ret < 0) {
-                    return ret;
+                    goto fail;
                }
            } else {
-                memset(buf, 0, 512 * n);
+                qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
            }
        } else {
-            ret = vmdk_read_extent(extent,
-                            cluster_offset, index_in_cluster * 512,
-                            buf, n);
+            qemu_iovec_reset(&local_qiov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+            ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster,
+                                   &local_qiov, n_bytes);
            if (ret) {
-                return ret;
+                goto fail;
            }
        }
-        nb_sectors -= n;
-        sector_num += n;
-        buf += n * 512;
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;
    }
-    return 0;
-}

-static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
-                                     uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVVmdkState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = vmdk_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&local_qiov);
+
    return ret;
 }

@@ -1506,38 +1542,38 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
 *
 * Returns: error code with 0 for success.
 */
-static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
-                      const uint8_t *buf, int nb_sectors,
-                      bool zeroed, bool zero_dry_run)
+static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
+                       uint64_t bytes, QEMUIOVector *qiov,
+                       bool zeroed, bool zero_dry_run)
 {
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int ret;
-    int64_t index_in_cluster, n;
+    int64_t offset_in_cluster, n_bytes;
    uint64_t cluster_offset;
+    uint64_t bytes_done = 0;
    VmdkMetaData m_data;

-    if (sector_num > bs->total_sectors) {
-        error_report("Wrong offset: sector_num=0x%" PRIx64
+    if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
+        error_report("Wrong offset: offset=0x%" PRIx64
                     " total_sectors=0x%" PRIx64,
-                     sector_num, bs->total_sectors);
+                     offset, bs->total_sectors);
        return -EIO;
    }

-    while (nb_sectors > 0) {
-        extent = find_extent(s, sector_num, extent);
+    while (bytes > 0) {
+        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
        if (!extent) {
            return -EIO;
        }
-        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
-        n = extent->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors) {
-            n = nb_sectors;
-        }
-        ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
+        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
+                             - offset_in_cluster);
+
+        ret = get_cluster_offset(bs, extent, &m_data, offset,
                                 !(extent->compressed || zeroed),
-                                 &cluster_offset,
-                                 index_in_cluster, index_in_cluster + n);
+                                 &cluster_offset, offset_in_cluster,
+                                 offset_in_cluster + n_bytes);
        if (extent->compressed) {
            if (ret == VMDK_OK) {
                /* Refuse write to allocated cluster for streamOptimized */
@@ -1546,7 +1582,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                return -EIO;
            } else {
                /* allocate */
-                ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                ret = get_cluster_offset(bs, extent, &m_data, offset,
                                         true, &cluster_offset, 0, 0);
            }
        }
@@ -1556,9 +1592,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
-                    index_in_cluster == 0 &&
-                    n >= extent->cluster_sectors) {
-                n = extent->cluster_sectors;
+                    offset_in_cluster == 0 &&
+                    n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
+                n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
                if (!zero_dry_run) {
                    /* update L2 tables */
                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
@@ -1570,9 +1606,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                return -ENOTSUP;
            }
        } else {
-            ret = vmdk_write_extent(extent,
-                            cluster_offset, index_in_cluster * 512,
-                            buf, n, sector_num);
+            ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
+                                    qiov, bytes_done, n_bytes, offset);
            if (ret) {
                return ret;
            }
@@ -1585,9 +1620,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                }
            }
        }
-        nb_sectors -= n;
-        sector_num += n;
-        buf += n * 512;
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;

        /* update CID on the first write every time the virtual disk is
         * opened */
@@ -1602,43 +1637,84 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
    return 0;
 }

-static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
-                                      const uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
    int ret;
    BDRVVmdkState *s = bs->opaque;
    qemu_co_mutex_lock(&s->lock);
-    ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
+    ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
 }

+typedef struct VmdkWriteCompressedCo {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    const uint8_t *buf;
+    int nb_sectors;
+    int ret;
+} VmdkWriteCompressedCo;
+
+static void vmdk_co_write_compressed(void *opaque)
+{
+    VmdkWriteCompressedCo *co = opaque;
+    QEMUIOVector local_qiov;
+    uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE;
+    uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE;
+
+    struct iovec iov = (struct iovec) {
+        .iov_base   = (uint8_t*) co->buf,
+        .iov_len    = bytes,
+    };
+    qemu_iovec_init_external(&local_qiov, &iov, 1);
+
+    co->ret = vmdk_pwritev(co->bs, offset, bytes, &local_qiov, false, false);
+}
+
 static int vmdk_write_compressed(BlockDriverState *bs,
                                 int64_t sector_num,
                                 const uint8_t *buf,
                                 int nb_sectors)
 {
    BDRVVmdkState *s = bs->opaque;
+
    if (s->num_extents == 1 && s->extents[0].compressed) {
-        return vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
+        Coroutine *co;
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+        VmdkWriteCompressedCo data = {
+            .bs         = bs,
+            .sector_num = sector_num,
+            .buf        = buf,
+            .nb_sectors = nb_sectors,
+            .ret        = -EINPROGRESS,
+        };
+        co = qemu_coroutine_create(vmdk_co_write_compressed);
+        qemu_coroutine_enter(co, &data);
+        while (data.ret == -EINPROGRESS) {
+            aio_poll(aio_context, true);
+        }
+        return data.ret;
    } else {
        return -ENOTSUP;
    }
 }

-static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
-                                             int64_t sector_num,
-                                             int nb_sectors,
-                                             BdrvRequestFlags flags)
+static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
+                                              int64_t offset,
+                                              int bytes,
+                                              BdrvRequestFlags flags)
 {
    int ret;
    BDRVVmdkState *s = bs->opaque;
+
    qemu_co_mutex_lock(&s->lock);
    /* write zeroes could fail if sectors not aligned to cluster, test it with
     * dry_run == true before really updating image */
-    ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
+    ret = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
    if (!ret) {
-        ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
+        ret = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
    }
    qemu_co_mutex_unlock(&s->lock);
    return ret;
@@ -1728,12 +1804,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
    header.check_bytes[3] = 0xa;

    /* write all the data */
-    ret = blk_pwrite(blk, 0, &magic, sizeof(magic));
+    ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }
-    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header));
+    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1753,7 +1829,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
        gd_buf[i] = cpu_to_le32(tmp);
    }
    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                     gd_buf, gd_buf_size);
+                     gd_buf, gd_buf_size, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1765,7 +1841,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
        gd_buf[i] = cpu_to_le32(tmp);
    }
    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                     gd_buf, gd_buf_size);
+                     gd_buf, gd_buf_size, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
@@ -1829,8 +1905,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
    int64_t total_size = 0, filesize;
    char *adapter_type = NULL;
    char *backing_file = NULL;
+    char *hw_version = NULL;
    char *fmt = NULL;
-    int flags = 0;
    int ret = 0;
    bool flat, split, compress;
    GString *ext_desc_lines;
@@ -1861,7 +1937,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
        "# The Disk Data Base\n"
        "#DDB\n"
        "\n"
-        "ddb.virtualHWVersion = \"%d\"\n"
+        "ddb.virtualHWVersion = \"%s\"\n"
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
        "ddb.geometry.heads = \"%" PRIu32 "\"\n"
        "ddb.geometry.sectors = \"63\"\n"
@@ -1878,8 +1954,20 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
+    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
-        flags |= BLOCK_FLAG_COMPAT6;
+        if (strcmp(hw_version, "undefined")) {
+            error_setg(errp,
+                       "compat6 cannot be enabled with hwversion set");
+            ret = -EINVAL;
+            goto exit;
+        }
+        g_free(hw_version);
+        hw_version = g_strdup("6");
+    }
+    if (strcmp(hw_version, "undefined") == 0) {
+        g_free(hw_version);
+        hw_version = g_strdup("4");
    }
    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) {
@@ -2001,7 +2089,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
                           fmt,
                           parent_desc_line,
                           ext_desc_lines->str,
-                           (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
+                           hw_version,
                           total_size /
                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                           number_heads,
@@ -2028,7 +2116,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)

    blk_set_allow_write_beyond_eof(new_blk, true);

-    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len);
+    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write description");
        goto exit;
@@ -2047,6 +2135,7 @@ exit:
    }
    g_free(adapter_type);
    g_free(backing_file);
+    g_free(hw_version);
    g_free(fmt);
    g_free(desc);
    g_free(path);
@@ -2250,27 +2339,6 @@ static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

-static void vmdk_detach_aio_context(BlockDriverState *bs)
-{
-    BDRVVmdkState *s = bs->opaque;
-    int i;
-
-    for (i = 0; i < s->num_extents; i++) {
-        bdrv_detach_aio_context(s->extents[i].file->bs);
-    }
-}
-
-static void vmdk_attach_aio_context(BlockDriverState *bs,
-                                    AioContext *new_context)
-{
-    BDRVVmdkState *s = bs->opaque;
-    int i;
-
-    for (i = 0; i < s->num_extents; i++) {
-        bdrv_attach_aio_context(s->extents[i].file->bs, new_context);
-    }
-}
-
 static QemuOptsList vmdk_create_opts = {
    .name = "vmdk-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
@@ -2297,6 +2365,12 @@ static QemuOptsList vmdk_create_opts = {
            .help = "VMDK version 6 image",
            .def_value_str = "off"
        },
+        {
+            .name = BLOCK_OPT_HWVERSION,
+            .type = QEMU_OPT_STRING,
+            .help = "VMDK hardware version",
+            .def_value_str = "undefined"
+        },
        {
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
@@ -2321,10 +2395,10 @@ static BlockDriver bdrv_vmdk = {
    .bdrv_open                    = vmdk_open,
    .bdrv_check                   = vmdk_check,
    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
-    .bdrv_read                    = vmdk_co_read,
-    .bdrv_write                   = vmdk_co_write,
+    .bdrv_co_preadv               = vmdk_co_preadv,
+    .bdrv_co_pwritev              = vmdk_co_pwritev,
    .bdrv_write_compressed        = vmdk_write_compressed,
-    .bdrv_co_write_zeroes         = vmdk_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes        = vmdk_co_pwrite_zeroes,
    .bdrv_close                   = vmdk_close,
    .bdrv_create                  = vmdk_create,
    .bdrv_co_flush_to_disk        = vmdk_co_flush,
@@ -2334,8 +2408,6 @@ static BlockDriver bdrv_vmdk = {
    .bdrv_get_specific_info       = vmdk_get_specific_info,
    .bdrv_refresh_limits          = vmdk_refresh_limits,
    .bdrv_get_info                = vmdk_get_info,
-    .bdrv_detach_aio_context      = vmdk_detach_aio_context,
-    .bdrv_attach_aio_context      = vmdk_attach_aio_context,

    .supports_backing             = true,
    .create_opts                  = &vmdk_create_opts,
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -29,6 +29,7 @@
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
+#include "qemu/bswap.h"
 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
 #endif
@@ -454,22 +455,21 @@ static int vpc_reopen_prepare(BDRVReopenState *state,
 * The parameter write must be 1 if the offset will be used for a write
 * operation (the block bitmaps is updated then), 0 otherwise.
 */
-static inline int64_t get_sector_offset(BlockDriverState *bs,
-    int64_t sector_num, int write)
+static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
+                                       bool write)
 {
    BDRVVPCState *s = bs->opaque;
-    uint64_t offset = sector_num * 512;
    uint64_t bitmap_offset, block_offset;
-    uint32_t pagetable_index, pageentry_index;
+    uint32_t pagetable_index, offset_in_block;

    pagetable_index = offset / s->block_size;
-    pageentry_index = (offset % s->block_size) / 512;
+    offset_in_block = offset % s->block_size;

    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
        return -1; /* not allocated */

    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
-    block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);
+    block_offset = bitmap_offset + s->bitmap_size + offset_in_block;

    /* We must ensure that we don't write to any sectors which are marked as
       unused in the bitmap. We get away with setting all bits in the block
@@ -487,6 +487,12 @@ static inline int64_t get_sector_offset(BlockDriverState *bs,
    return block_offset;
 }

+static inline int64_t get_sector_offset(BlockDriverState *bs,
+                                        int64_t sector_num, bool write)
+{
+    return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write);
+}
+
 /*
 * Writes the footer to the end of the image file. This is needed when the
 * file grows as it overwrites the old footer
@@ -513,7 +519,7 @@ static int rewrite_footer(BlockDriverState* bs)
 *
 * Returns the sectors' offset in the image file on success and < 0 on error
 */
-static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
+static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
 {
    BDRVVPCState *s = bs->opaque;
    int64_t bat_offset;
@@ -522,14 +528,13 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
    uint8_t bitmap[s->bitmap_size];

    /* Check if sector_num is valid */
-    if ((sector_num < 0) || (sector_num > bs->total_sectors))
-        return -1;
+    if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
+        return -EINVAL;
+    }

    /* Write entry into in-memory BAT */
-    index = (sector_num * 512) / s->block_size;
-    if (s->pagetable[index] != 0xFFFFFFFF)
-        return -1;
-
+    index = offset / s->block_size;
+    assert(s->pagetable[index] == 0xFFFFFFFF);
    s->pagetable[index] = s->free_data_block_offset / 512;

    /* Initialize the block's bitmap */
@@ -553,11 +558,11 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
    if (ret < 0)
        goto fail;

-    return get_sector_offset(bs, sector_num, 0);
+    return get_image_offset(bs, offset, false);

 fail:
    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
-    return -1;
+    return ret;
 }

 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
@@ -573,104 +578,105 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    return 0;
 }

-static int vpc_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+              QEMUIOVector *qiov, int flags)
 {
    BDRVVPCState *s = bs->opaque;
    int ret;
-    int64_t offset;
-    int64_t sectors, sectors_per_block;
+    int64_t image_offset;
+    int64_t n_bytes;
+    int64_t bytes_done = 0;
    VHDFooter *footer = (VHDFooter *) s->footer_buf;
+    QEMUIOVector local_qiov;

    if (be32_to_cpu(footer->type) == VHD_FIXED) {
-        return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors);
+        return bdrv_co_preadv(bs->file->bs, offset, bytes, qiov, 0);
    }
-    while (nb_sectors > 0) {
-        offset = get_sector_offset(bs, sector_num, 0);

-        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
-        sectors = sectors_per_block - (sector_num % sectors_per_block);
-        if (sectors > nb_sectors) {
-            sectors = nb_sectors;
-        }
+    qemu_co_mutex_lock(&s->lock);
+    qemu_iovec_init(&local_qiov, qiov->niov);

-        if (offset == -1) {
-            memset(buf, 0, sectors * BDRV_SECTOR_SIZE);
+    while (bytes > 0) {
+        image_offset = get_image_offset(bs, offset, false);
+        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
+
+        if (image_offset == -1) {
+            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
        } else {
-            ret = bdrv_pread(bs->file->bs, offset, buf,
-                sectors * BDRV_SECTOR_SIZE);
-            if (ret != sectors * BDRV_SECTOR_SIZE) {
-                return -1;
+            qemu_iovec_reset(&local_qiov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+            ret = bdrv_co_preadv(bs->file->bs, image_offset, n_bytes,
+                                 &local_qiov, 0);
+            if (ret < 0) {
+                goto fail;
            }
        }

-        nb_sectors -= sectors;
-        sector_num += sectors;
-        buf += sectors * BDRV_SECTOR_SIZE;
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;
    }
-    return 0;
-}

-static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num,
-                                    uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVVPCState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = vpc_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
+    qemu_iovec_destroy(&local_qiov);
    qemu_co_mutex_unlock(&s->lock);
+
    return ret;
 }

-static int vpc_write(BlockDriverState *bs, int64_t sector_num,
-    const uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+               QEMUIOVector *qiov, int flags)
 {
    BDRVVPCState *s = bs->opaque;
-    int64_t offset;
-    int64_t sectors, sectors_per_block;
+    int64_t image_offset;
+    int64_t n_bytes;
+    int64_t bytes_done = 0;
    int ret;
    VHDFooter *footer =  (VHDFooter *) s->footer_buf;
+    QEMUIOVector local_qiov;

    if (be32_to_cpu(footer->type) == VHD_FIXED) {
-        return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors);
-    }
-    while (nb_sectors > 0) {
-        offset = get_sector_offset(bs, sector_num, 1);
-
-        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
-        sectors = sectors_per_block - (sector_num % sectors_per_block);
-        if (sectors > nb_sectors) {
-            sectors = nb_sectors;
-        }
-
-        if (offset == -1) {
-            offset = alloc_block(bs, sector_num);
-            if (offset < 0)
-                return -1;
-        }
-
-        ret = bdrv_pwrite(bs->file->bs, offset, buf,
-                          sectors * BDRV_SECTOR_SIZE);
-        if (ret != sectors * BDRV_SECTOR_SIZE) {
-            return -1;
-        }
-
-        nb_sectors -= sectors;
-        sector_num += sectors;
-        buf += sectors * BDRV_SECTOR_SIZE;
+        return bdrv_co_pwritev(bs->file->bs, offset, bytes, qiov, 0);
    }

-    return 0;
-}
-
-static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
-                                     const uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVVPCState *s = bs->opaque;
    qemu_co_mutex_lock(&s->lock);
-    ret = vpc_write(bs, sector_num, buf, nb_sectors);
+    qemu_iovec_init(&local_qiov, qiov->niov);
+
+    while (bytes > 0) {
+        image_offset = get_image_offset(bs, offset, true);
+        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
+
+        if (image_offset == -1) {
+            image_offset = alloc_block(bs, offset);
+            if (image_offset < 0) {
+                ret = image_offset;
+                goto fail;
+            }
+        }
+
+        qemu_iovec_reset(&local_qiov);
+        qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
+
+        ret = bdrv_co_pwritev(bs->file->bs, image_offset, n_bytes,
+                              &local_qiov, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        bytes -= n_bytes;
+        offset += n_bytes;
+        bytes_done += n_bytes;
+    }
+
+    ret = 0;
+fail:
+    qemu_iovec_destroy(&local_qiov);
    qemu_co_mutex_unlock(&s->lock);
+
    return ret;
 }

@@ -783,13 +789,13 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
    block_size = 0x200000;
    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);

-    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
    if (ret < 0) {
        goto fail;
    }

    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
-    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
    if (ret < 0) {
        goto fail;
    }
@@ -799,7 +805,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,

    memset(buf, 0xFF, 512);
    for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
-        ret = blk_pwrite(blk, offset, buf, 512);
+        ret = blk_pwrite(blk, offset, buf, 512, 0);
        if (ret < 0) {
            goto fail;
        }
@@ -826,7 +832,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
    /* Write the header */
    offset = 512;

-    ret = blk_pwrite(blk, offset, buf, 1024);
+    ret = blk_pwrite(blk, offset, buf, 1024, 0);
    if (ret < 0) {
        goto fail;
    }
@@ -848,7 +854,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
        return ret;
    }

-    ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE);
+    ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
    if (ret < 0) {
        return ret;
    }
@@ -1056,8 +1062,8 @@ static BlockDriver bdrv_vpc = {
    .bdrv_reopen_prepare    = vpc_reopen_prepare,
    .bdrv_create            = vpc_create,

-    .bdrv_read                  = vpc_co_read,
-    .bdrv_write                 = vpc_co_write,
+    .bdrv_co_preadv             = vpc_co_preadv,
+    .bdrv_co_pwritev            = vpc_co_pwritev,
    .bdrv_co_get_block_status   = vpc_co_get_block_status,

    .bdrv_get_info          = vpc_get_info,
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -27,6 +27,7 @@
 #include "qapi/error.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include "migration/migration.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qbool.h"
@@ -113,15 +114,12 @@ static inline int array_ensure_allocated(array_t* array, int index)

 static inline void* array_get_next(array_t* array) {
    unsigned int next = array->next;
-    void* result;

    if (array_ensure_allocated(array, next) < 0)
 	return NULL;

    array->next = next + 1;
-    result = array_get(array, next);
-
-    return result;
+    return array_get(array, next);
 }

 static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) {
@@ -1179,6 +1177,7 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
        bs->read_only = 0;
    }

+    bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */
    bs->total_sectors = cyls * heads * secs;

    if (init_directories(s, dirname, heads, secs, errp)) {
@@ -1421,14 +1420,31 @@ DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
    return 0;
 }

-static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
    int ret;
    BDRVVVFATState *s = bs->opaque;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    void *buf;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    buf = g_try_malloc(bytes);
+    if (bytes && buf == NULL) {
+        return -ENOMEM;
+    }
+
    qemu_co_mutex_lock(&s->lock);
    ret = vvfat_read(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
+
+    qemu_iovec_from_buf(qiov, 0, buf, bytes);
+    g_free(buf);
+
    return ret;
 }

@@ -1941,8 +1957,7 @@ DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i))
 		/* check file size with FAT */
 		cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2);
 		if (cluster_count !=
-			(le32_to_cpu(direntries[i].size) + s->cluster_size
-			 - 1) / s->cluster_size) {
+            DIV_ROUND_UP(le32_to_cpu(direntries[i].size), s->cluster_size)) {
 		    DLOG(fprintf(stderr, "Cluster count mismatch\n"));
 		    goto fail;
 		}
@@ -2880,14 +2895,31 @@ DLOG(checkpoint());
    return 0;
 }

-static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num,
-                                       const uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                 QEMUIOVector *qiov, int flags)
 {
    int ret;
    BDRVVVFATState *s = bs->opaque;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    void *buf;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    buf = g_try_malloc(bytes);
+    if (bytes && buf == NULL) {
+        return -ENOMEM;
+    }
+    qemu_iovec_to_buf(qiov, 0, buf, bytes);
+
    qemu_co_mutex_lock(&s->lock);
    ret = vvfat_write(bs, sector_num, buf, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);
+
+    g_free(buf);
+
    return ret;
 }

@@ -2904,8 +2936,10 @@ static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs,
    return BDRV_BLOCK_DATA;
 }

-static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
-	const uint8_t* buffer, int nb_sectors) {
+static int coroutine_fn
+write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                    QEMUIOVector *qiov, int flags)
+{
    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
    return try_commit(s);
 }
@@ -2918,7 +2952,7 @@ static void write_target_close(BlockDriverState *bs) {

 static BlockDriver vvfat_write_target = {
    .format_name        = "vvfat_write_target",
-    .bdrv_write         = write_target_commit,
+    .bdrv_co_pwritev    = write_target_commit,
    .bdrv_close         = write_target_close,
 };

@@ -2960,12 +2994,12 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp)
        goto err;
    }

-    s->qcow = NULL;
    options = qdict_new();
    qdict_put(options, "driver", qstring_from_str("qcow"));
-    ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options,
-                    BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp);
-    if (ret < 0) {
+    s->qcow = bdrv_open(s->qcow_filename, NULL, options,
+                        BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp);
+    if (!s->qcow) {
+        ret = -EINVAL;
        goto err;
    }

@@ -3014,8 +3048,8 @@ static BlockDriver bdrv_vvfat = {
    .bdrv_file_open         = vvfat_open,
    .bdrv_close             = vvfat_close,

-    .bdrv_read              = vvfat_co_read,
-    .bdrv_write             = vvfat_co_write,
+    .bdrv_co_preadv         = vvfat_co_preadv,
+    .bdrv_co_pwritev        = vvfat_co_pwritev,
    .bdrv_co_get_block_status = vvfat_co_get_block_status,
 };

--- a/blockdev.c
+++ b/blockdev.c
@@ -56,6 +56,8 @@
 static QTAILQ_HEAD(, BlockDriverState) monitor_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(monitor_bdrv_states);

+static int do_open_tray(const char *device, bool force, Error **errp);
+
 static const char *const if_name[IF_COUNT] = {
    [IF_NONE] = "none",
    [IF_IDE] = "ide",
@@ -73,7 +75,7 @@ static int if_max_devs[IF_COUNT] = {
     * Do not change these numbers!  They govern how drive option
     * index maps to unit and bus.  That mapping is ABI.
     *
-     * All controllers used to imlement if=T drives need to support
+     * All controllers used to implement if=T drives need to support
     * if_max_devs[T] units, for any T with if_max_devs[T] != 0.
     * Otherwise, some index values map to "impossible" bus, unit
     * values.
@@ -567,25 +569,12 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
    if ((!file || !*file) && !qdict_size(bs_opts)) {
        BlockBackendRootState *blk_rs;

-        blk = blk_new(errp);
-        if (!blk) {
-            goto early_err;
-        }
-
+        blk = blk_new();
        blk_rs = blk_get_root_state(blk);
        blk_rs->open_flags    = bdrv_flags;
        blk_rs->read_only     = !(bdrv_flags & BDRV_O_RDWR);
        blk_rs->detect_zeroes = detect_zeroes;

-        if (throttle_enabled(&cfg)) {
-            if (!throttling_group) {
-                throttling_group = blk_name(blk);
-            }
-            blk_rs->throttle_group = g_strdup(throttling_group);
-            blk_rs->throttle_state = throttle_group_incref(throttling_group);
-            blk_rs->throttle_state->cfg = cfg;
-        }
-
        QDECREF(bs_opts);
    } else {
        if (file && !*file) {
@@ -611,15 +600,6 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,

        bs->detect_zeroes = detect_zeroes;

-        /* disk I/O throttling */
-        if (throttle_enabled(&cfg)) {
-            if (!throttling_group) {
-                throttling_group = blk_name(blk);
-            }
-            bdrv_io_limits_enable(bs, throttling_group);
-            bdrv_set_io_limits(bs, &cfg);
-        }
-
        if (bdrv_key_required(bs)) {
            autostart = 0;
        }
@@ -633,6 +613,15 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
        }
    }

+    /* disk I/O throttling */
+    if (throttle_enabled(&cfg)) {
+        if (!throttling_group) {
+            throttling_group = blk_name(blk);
+        }
+        blk_io_limits_enable(blk, throttling_group);
+        blk_set_io_limits(blk, &cfg);
+    }
+
    blk_set_enable_write_cache(blk, !writethrough);
    blk_set_on_error(blk, on_read_error, on_write_error);

@@ -666,7 +655,6 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
    QemuOpts *opts;
    Error *local_error = NULL;
    BlockdevDetectZeroesOptions detect_zeroes;
-    int ret;
    int bdrv_flags = 0;

    opts = qemu_opts_create(&qemu_root_bds_opts, NULL, 1, errp);
@@ -697,9 +685,8 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
        bdrv_flags |= BDRV_O_INACTIVE;
    }

-    bs = NULL;
-    ret = bdrv_open(&bs, NULL, NULL, bs_opts, bdrv_flags, errp);
-    if (ret < 0) {
+    bs = bdrv_open(NULL, NULL, bs_opts, bdrv_flags, errp);
+    if (!bs) {
        goto fail_no_bs_opts;
    }

@@ -1652,7 +1639,7 @@ typedef struct ExternalSnapshotState {
 static void external_snapshot_prepare(BlkActionState *common,
                                      Error **errp)
 {
-    int flags = 0, ret;
+    int flags = 0;
    QDict *options = NULL;
    Error *local_err = NULL;
    /* Device and node name of the image to generate the snapshot from */
@@ -1777,17 +1764,16 @@ static void external_snapshot_prepare(BlkActionState *common,
        flags |= BDRV_O_NO_BACKING;
    }

-    assert(state->new_bs == NULL);
-    ret = bdrv_open(&state->new_bs, new_image_file, snapshot_ref, options,
-                    flags, errp);
+    state->new_bs = bdrv_open(new_image_file, snapshot_ref, options, flags,
+                              errp);
    /* We will manually add the backing_hd field to the bs later */
-    if (ret != 0) {
+    if (!state->new_bs) {
        return;
    }

-    if (state->new_bs->blk != NULL) {
+    if (bdrv_has_blk(state->new_bs)) {
        error_setg(errp, "The snapshot is already in use by %s",
-                   blk_name(state->new_bs->blk));
+                   bdrv_get_parent_name(state->new_bs));
        return;
    }

@@ -2293,12 +2279,18 @@ exit:
 void qmp_eject(const char *device, bool has_force, bool force, Error **errp)
 {
    Error *local_err = NULL;
+    int rc;

-    qmp_blockdev_open_tray(device, has_force, force, &local_err);
-    if (local_err) {
+    if (!has_force) {
+        force = false;
+    }
+
+    rc = do_open_tray(device, force, &local_err);
+    if (rc && rc != -ENOSYS) {
        error_propagate(errp, local_err);
        return;
    }
+    error_free(local_err);

    qmp_x_blockdev_remove_medium(device, errp);
 }
@@ -2327,35 +2319,41 @@ void qmp_block_passwd(bool has_device, const char *device,
    aio_context_release(aio_context);
 }

-void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
-                            Error **errp)
+/*
+ * Attempt to open the tray of @device.
+ * If @force, ignore its tray lock.
+ * Else, if the tray is locked, don't open it, but ask the guest to open it.
+ * On error, store an error through @errp and return -errno.
+ * If @device does not exist, return -ENODEV.
+ * If it has no removable media, return -ENOTSUP.
+ * If it has no tray, return -ENOSYS.
+ * If the guest was asked to open the tray, return -EINPROGRESS.
+ * Else, return 0.
+ */
+static int do_open_tray(const char *device, bool force, Error **errp)
 {
    BlockBackend *blk;
    bool locked;

-    if (!has_force) {
-        force = false;
-    }
-
    blk = blk_by_name(device);
    if (!blk) {
        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
                  "Device '%s' not found", device);
-        return;
+        return -ENODEV;
    }

    if (!blk_dev_has_removable_media(blk)) {
        error_setg(errp, "Device '%s' is not removable", device);
-        return;
+        return -ENOTSUP;
    }

    if (!blk_dev_has_tray(blk)) {
-        /* Ignore this command on tray-less devices */
-        return;
+        error_setg(errp, "Device '%s' does not have a tray", device);
+        return -ENOSYS;
    }

    if (blk_dev_is_tray_open(blk)) {
-        return;
+        return 0;
    }

    locked = blk_dev_is_medium_locked(blk);
@@ -2366,6 +2364,31 @@ void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
    if (!locked || force) {
        blk_dev_change_media_cb(blk, false);
    }
+
+    if (locked && !force) {
+        error_setg(errp, "Device '%s' is locked and force was not specified, "
+                   "wait for tray to open and try again", device);
+        return -EINPROGRESS;
+    }
+
+    return 0;
+}
+
+void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
+                            Error **errp)
+{
+    Error *local_err = NULL;
+    int rc;
+
+    if (!has_force) {
+        force = false;
+    }
+    rc = do_open_tray(device, force, &local_err);
+    if (rc && rc != -ENOSYS && rc != -EINPROGRESS) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    error_free(local_err);
 }

 void qmp_blockdev_close_tray(const char *device, Error **errp)
@@ -2503,9 +2526,9 @@ void qmp_x_blockdev_insert_medium(const char *device, const char *node_name,
        return;
    }

-    if (bs->blk) {
+    if (bdrv_has_blk(bs)) {
        error_setg(errp, "Node '%s' is already in use by '%s'", node_name,
-                   blk_name(bs->blk));
+                   bdrv_get_parent_name(bs));
        return;
    }

@@ -2520,7 +2543,8 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
 {
    BlockBackend *blk;
    BlockDriverState *medium_bs = NULL;
-    int bdrv_flags, ret;
+    int bdrv_flags;
+    int rc;
    QDict *options = NULL;
    Error *err = NULL;

@@ -2564,25 +2588,24 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
        qdict_put(options, "driver", qstring_from_str(format));
    }

-    assert(!medium_bs);
-    ret = bdrv_open(&medium_bs, filename, NULL, options, bdrv_flags, errp);
-    if (ret < 0) {
+    medium_bs = bdrv_open(filename, NULL, options, bdrv_flags, errp);
+    if (!medium_bs) {
        goto fail;
    }

-    blk_apply_root_state(blk, medium_bs);
-
    bdrv_add_key(medium_bs, NULL, &err);
    if (err) {
        error_propagate(errp, err);
        goto fail;
    }

-    qmp_blockdev_open_tray(device, false, false, &err);
-    if (err) {
+    rc = do_open_tray(device, false, &err);
+    if (rc && rc != -ENOSYS) {
        error_propagate(errp, err);
        goto fail;
    }
+    error_free(err);
+    err = NULL;

    qmp_x_blockdev_remove_medium(device, &err);
    if (err) {
@@ -2596,6 +2619,8 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
        goto fail;
    }

+    blk_apply_root_state(blk, medium_bs);
+
    qmp_blockdev_close_tray(device, errp);

 fail:
@@ -2661,13 +2686,6 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
        goto out;
    }

-    /* The BlockBackend must be the only parent */
-    assert(QLIST_FIRST(&bs->parents));
-    if (QLIST_NEXT(QLIST_FIRST(&bs->parents), next_parent)) {
-        error_setg(errp, "Cannot throttle device with multiple parents");
-        goto out;
-    }
-
    throttle_config_init(&cfg);
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = bps;
    cfg.buckets[THROTTLE_BPS_READ].avg  = bps_rd;
@@ -2726,16 +2744,16 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
    if (throttle_enabled(&cfg)) {
        /* Enable I/O limits if they're not enabled yet, otherwise
         * just update the throttling group. */
-        if (!bs->throttle_state) {
-            bdrv_io_limits_enable(bs, has_group ? group : device);
+        if (!blk_get_public(blk)->throttle_state) {
+            blk_io_limits_enable(blk, has_group ? group : device);
        } else if (has_group) {
-            bdrv_io_limits_update_group(bs, group);
+            blk_io_limits_update_group(blk, group);
        }
        /* Set the new throttling configuration */
-        bdrv_set_io_limits(bs, &cfg);
-    } else if (bs->throttle_state) {
+        blk_set_io_limits(blk, &cfg);
+    } else if (blk_get_public(blk)->throttle_state) {
        /* If all throttling settings are set to 0, disable I/O limits */
-        bdrv_io_limits_disable(bs);
+        blk_io_limits_disable(blk);
    }

 out:
@@ -3186,7 +3204,6 @@ static void do_drive_backup(const char *device, const char *target,
    Error *local_err = NULL;
    int flags;
    int64_t size;
-    int ret;

    if (!has_speed) {
        speed = 0;
@@ -3270,10 +3287,8 @@ static void do_drive_backup(const char *device, const char *target,
        qdict_put(options, "driver", qstring_from_str(format));
    }

-    target_bs = NULL;
-    ret = bdrv_open(&target_bs, target, NULL, options, flags, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    target_bs = bdrv_open(target, NULL, options, flags, errp);
+    if (!target_bs) {
        goto out;
    }

@@ -3291,8 +3306,8 @@ static void do_drive_backup(const char *device, const char *target,
    backup_start(bs, target_bs, speed, sync, bmap,
                 on_source_error, on_target_error,
                 block_job_cb, bs, txn, &local_err);
+    bdrv_unref(target_bs);
    if (local_err != NULL) {
-        bdrv_unref(target_bs);
        error_propagate(errp, local_err);
        goto out;
    }
@@ -3333,7 +3348,7 @@ void do_blockdev_backup(const char *device, const char *target,
                         BlockdevOnError on_target_error,
                         BlockJobTxn *txn, Error **errp)
 {
-    BlockBackend *blk, *target_blk;
+    BlockBackend *blk;
    BlockDriverState *bs;
    BlockDriverState *target_bs;
    Error *local_err = NULL;
@@ -3364,24 +3379,25 @@ void do_blockdev_backup(const char *device, const char *target,
    }
    bs = blk_bs(blk);

-    target_blk = blk_by_name(target);
-    if (!target_blk) {
-        error_setg(errp, "Device '%s' not found", target);
+    target_bs = bdrv_lookup_bs(target, target, errp);
+    if (!target_bs) {
        goto out;
    }

-    if (!blk_is_available(target_blk)) {
-        error_setg(errp, "Device '%s' has no medium", target);
-        goto out;
+    if (bdrv_get_aio_context(target_bs) != aio_context) {
+        if (!bdrv_has_blk(target_bs)) {
+            /* The target BDS is not attached, we can safely move it to another
+             * AioContext. */
+            bdrv_set_aio_context(target_bs, aio_context);
+        } else {
+            error_setg(errp, "Target is attached to a different thread from "
+                             "source.");
+            goto out;
+        }
    }
-    target_bs = blk_bs(target_blk);
-
-    bdrv_ref(target_bs);
-    bdrv_set_aio_context(target_bs, aio_context);
    backup_start(bs, target_bs, speed, sync, NULL, on_source_error,
                 on_target_error, block_job_cb, bs, txn, &local_err);
    if (local_err != NULL) {
-        bdrv_unref(target_bs);
        error_propagate(errp, local_err);
    }
 out:
@@ -3410,6 +3426,7 @@ static void blockdev_mirror_common(BlockDriverState *bs,
                                   BlockDriverState *target,
                                   bool has_replaces, const char *replaces,
                                   enum MirrorSyncMode sync,
+                                   BlockMirrorBackingMode backing_mode,
                                   bool has_speed, int64_t speed,
                                   bool has_granularity, uint32_t granularity,
                                   bool has_buf_size, int64_t buf_size,
@@ -3457,10 +3474,6 @@ static void blockdev_mirror_common(BlockDriverState *bs,
    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_MIRROR_TARGET, errp)) {
        return;
    }
-    if (target->blk) {
-        error_setg(errp, "Cannot mirror to an attached block device");
-        return;
-    }

    if (!bs->backing && sync == MIRROR_SYNC_MODE_TOP) {
        sync = MIRROR_SYNC_MODE_FULL;
@@ -3471,7 +3484,7 @@ static void blockdev_mirror_common(BlockDriverState *bs,
     */
    mirror_start(bs, target,
                 has_replaces ? replaces : NULL,
-                 speed, granularity, buf_size, sync,
+                 speed, granularity, buf_size, sync, backing_mode,
                 on_source_error, on_target_error, unmap,
                 block_job_cb, bs, errp);
 }
@@ -3494,11 +3507,11 @@ void qmp_drive_mirror(const char *device, const char *target,
    BlockBackend *blk;
    BlockDriverState *source, *target_bs;
    AioContext *aio_context;
+    BlockMirrorBackingMode backing_mode;
    Error *local_err = NULL;
    QDict *options = NULL;
    int flags;
    int64_t size;
-    int ret;

    blk = blk_by_name(device);
    if (!blk) {
@@ -3568,6 +3581,12 @@ void qmp_drive_mirror(const char *device, const char *target,
        }
    }

+    if (mode == NEW_IMAGE_MODE_ABSOLUTE_PATHS) {
+        backing_mode = MIRROR_SOURCE_BACKING_CHAIN;
+    } else {
+        backing_mode = MIRROR_OPEN_BACKING_CHAIN;
+    }
+
    if ((sync == MIRROR_SYNC_MODE_FULL || !source)
        && mode != NEW_IMAGE_MODE_EXISTING)
    {
@@ -3607,18 +3626,16 @@ void qmp_drive_mirror(const char *device, const char *target,
    /* Mirroring takes care of copy-on-write using the source's backing
     * file.
     */
-    target_bs = NULL;
-    ret = bdrv_open(&target_bs, target, NULL, options,
-                    flags | BDRV_O_NO_BACKING, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    target_bs = bdrv_open(target, NULL, options, flags | BDRV_O_NO_BACKING,
+                          errp);
+    if (!target_bs) {
        goto out;
    }

    bdrv_set_aio_context(target_bs, aio_context);

    blockdev_mirror_common(bs, target_bs,
-                           has_replaces, replaces, sync,
+                           has_replaces, replaces, sync, backing_mode,
                           has_speed, speed,
                           has_granularity, granularity,
                           has_buf_size, buf_size,
@@ -3626,10 +3643,8 @@ void qmp_drive_mirror(const char *device, const char *target,
                           has_on_target_error, on_target_error,
                           has_unmap, unmap,
                           &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        bdrv_unref(target_bs);
-    }
+    bdrv_unref(target_bs);
+    error_propagate(errp, local_err);
 out:
    aio_context_release(aio_context);
 }
@@ -3650,6 +3665,7 @@ void qmp_blockdev_mirror(const char *device, const char *target,
    BlockBackend *blk;
    BlockDriverState *target_bs;
    AioContext *aio_context;
+    BlockMirrorBackingMode backing_mode = MIRROR_LEAVE_BACKING_CHAIN;
    Error *local_err = NULL;

    blk = blk_by_name(device);
@@ -3672,11 +3688,10 @@ void qmp_blockdev_mirror(const char *device, const char *target,
    aio_context = bdrv_get_aio_context(bs);
    aio_context_acquire(aio_context);

-    bdrv_ref(target_bs);
    bdrv_set_aio_context(target_bs, aio_context);

    blockdev_mirror_common(bs, target_bs,
-                           has_replaces, replaces, sync,
+                           has_replaces, replaces, sync, backing_mode,
                           has_speed, speed,
                           has_granularity, granularity,
                           has_buf_size, buf_size,
@@ -3684,10 +3699,7 @@ void qmp_blockdev_mirror(const char *device, const char *target,
                           has_on_target_error, on_target_error,
                           true, true,
                           &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        bdrv_unref(target_bs);
-    }
+    error_propagate(errp, local_err);

    aio_context_release(aio_context);
 }
@@ -3795,6 +3807,7 @@ void qmp_block_job_resume(const char *device, Error **errp)

    job->user_paused = false;
    trace_qmp_block_job_resume(job);
+    block_job_iostatus_reset(job);
    block_job_resume(job);
    aio_context_release(aio_context);
 }
@@ -3897,9 +3910,7 @@ void qmp_change_backing_file(const char *device,

    if (ro) {
        bdrv_reopen(image_bs, open_flags, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err); /* will preserve prior errp */
-        }
+        error_propagate(errp, local_err);
    }

 out:
@@ -4046,15 +4057,15 @@ void qmp_x_blockdev_del(bool has_id, const char *id,
        bs = blk_bs(blk);
        aio_context = blk_get_aio_context(blk);
    } else {
+        blk = NULL;
        bs = bdrv_find_node(node_name);
        if (!bs) {
            error_setg(errp, "Cannot find node %s", node_name);
            return;
        }
-        blk = bs->blk;
-        if (blk) {
+        if (bdrv_has_blk(bs)) {
            error_setg(errp, "Node %s is in use by %s",
-                       node_name, blk_name(blk));
+                       node_name, bdrv_get_parent_name(bs));
            return;
        }
        aio_context = bdrv_get_aio_context(bs);
@@ -4092,24 +4103,76 @@ out:
    aio_context_release(aio_context);
 }

+static BdrvChild *bdrv_find_child(BlockDriverState *parent_bs,
+                                  const char *child_name)
+{
+    BdrvChild *child;
+
+    QLIST_FOREACH(child, &parent_bs->children, next) {
+        if (strcmp(child->name, child_name) == 0) {
+            return child;
+        }
+    }
+
+    return NULL;
+}
+
+void qmp_x_blockdev_change(const char *parent, bool has_child,
+                           const char *child, bool has_node,
+                           const char *node, Error **errp)
+{
+    BlockDriverState *parent_bs, *new_bs = NULL;
+    BdrvChild *p_child;
+
+    parent_bs = bdrv_lookup_bs(parent, parent, errp);
+    if (!parent_bs) {
+        return;
+    }
+
+    if (has_child == has_node) {
+        if (has_child) {
+            error_setg(errp, "The parameters child and node are in conflict");
+        } else {
+            error_setg(errp, "Either child or node must be specified");
+        }
+        return;
+    }
+
+    if (has_child) {
+        p_child = bdrv_find_child(parent_bs, child);
+        if (!p_child) {
+            error_setg(errp, "Node '%s' does not have child '%s'",
+                       parent, child);
+            return;
+        }
+        bdrv_del_child(parent_bs, p_child, errp);
+    }
+
+    if (has_node) {
+        new_bs = bdrv_find_node(node);
+        if (!new_bs) {
+            error_setg(errp, "Node '%s' not found", node);
+            return;
+        }
+        bdrv_add_child(parent_bs, new_bs, errp);
+    }
+}
+
 BlockJobInfoList *qmp_query_block_jobs(Error **errp)
 {
    BlockJobInfoList *head = NULL, **p_next = &head;
-    BlockDriverState *bs;
+    BlockJob *job;

-    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
+    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
+        BlockJobInfoList *elem = g_new0(BlockJobInfoList, 1);
+        AioContext *aio_context = blk_get_aio_context(job->blk);

        aio_context_acquire(aio_context);
-
-        if (bs->job) {
-            BlockJobInfoList *elem = g_new0(BlockJobInfoList, 1);
-            elem->value = block_job_query(bs->job);
-            *p_next = elem;
-            p_next = &elem->next;
-        }
-
+        elem->value = block_job_query(job);
        aio_context_release(aio_context);
+
+        *p_next = elem;
+        p_next = &elem->next;
    }

    return head;
--- a/blockjob.c
+++ b/blockjob.c
@@ -50,17 +50,75 @@ struct BlockJobTxn {
    int refcnt;
 };

+static QLIST_HEAD(, BlockJob) block_jobs = QLIST_HEAD_INITIALIZER(block_jobs);
+
+BlockJob *block_job_next(BlockJob *job)
+{
+    if (!job) {
+        return QLIST_FIRST(&block_jobs);
+    }
+    return QLIST_NEXT(job, job_list);
+}
+
+/* Normally the job runs in its BlockBackend's AioContext.  The exception is
+ * block_job_defer_to_main_loop() where it runs in the QEMU main loop.  Code
+ * that supports both cases uses this helper function.
+ */
+static AioContext *block_job_get_aio_context(BlockJob *job)
+{
+    return job->deferred_to_main_loop ?
+           qemu_get_aio_context() :
+           blk_get_aio_context(job->blk);
+}
+
+static void block_job_attached_aio_context(AioContext *new_context,
+                                           void *opaque)
+{
+    BlockJob *job = opaque;
+
+    if (job->driver->attached_aio_context) {
+        job->driver->attached_aio_context(job, new_context);
+    }
+
+    block_job_resume(job);
+}
+
+static void block_job_detach_aio_context(void *opaque)
+{
+    BlockJob *job = opaque;
+
+    /* In case the job terminates during aio_poll()... */
+    block_job_ref(job);
+
+    block_job_pause(job);
+
+    if (!job->paused) {
+        /* If job is !job->busy this kicks it into the next pause point. */
+        block_job_enter(job);
+    }
+    while (!job->paused && !job->completed) {
+        aio_poll(block_job_get_aio_context(job), true);
+    }
+
+    block_job_unref(job);
+}
+
 void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
                       int64_t speed, BlockCompletionFunc *cb,
                       void *opaque, Error **errp)
 {
+    BlockBackend *blk;
    BlockJob *job;

+    assert(cb);
    if (bs->job) {
        error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
-    bdrv_ref(bs);
+
+    blk = blk_new();
+    blk_insert_bs(blk, bs);
+
    job = g_malloc0(driver->instance_size);
    error_setg(&job->blocker, "block device is in use by block job: %s",
               BlockJobType_lookup[driver->job_type]);
@@ -69,13 +127,18 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,

    job->driver        = driver;
    job->id            = g_strdup(bdrv_get_device_name(bs));
-    job->bs            = bs;
+    job->blk           = blk;
    job->cb            = cb;
    job->opaque        = opaque;
    job->busy          = true;
    job->refcnt        = 1;
    bs->job = job;

+    QLIST_INSERT_HEAD(&block_jobs, job, job_list);
+
+    blk_add_aio_context_notifier(blk, block_job_attached_aio_context,
+                                 block_job_detach_aio_context, job);
+
    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;
@@ -98,11 +161,16 @@ void block_job_ref(BlockJob *job)
 void block_job_unref(BlockJob *job)
 {
    if (--job->refcnt == 0) {
-        job->bs->job = NULL;
-        bdrv_op_unblock_all(job->bs, job->blocker);
-        bdrv_unref(job->bs);
+        BlockDriverState *bs = blk_bs(job->blk);
+        bs->job = NULL;
+        bdrv_op_unblock_all(bs, job->blocker);
+        blk_remove_aio_context_notifier(job->blk,
+                                        block_job_attached_aio_context,
+                                        block_job_detach_aio_context, job);
+        blk_unref(job->blk);
        error_free(job->blocker);
        g_free(job->id);
+        QLIST_REMOVE(job, job_list);
        g_free(job);
    }
 }
@@ -140,7 +208,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
    txn->aborting = true;
    /* We are the first failed job. Cancel other jobs. */
    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
        aio_context_acquire(ctx);
    }
    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
@@ -157,7 +225,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
        assert(other_job->completed);
    }
    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
        block_job_completed_single(other_job);
        aio_context_release(ctx);
    }
@@ -179,7 +247,7 @@ static void block_job_completed_txn_success(BlockJob *job)
    }
    /* We are the last completed job, commit the transaction. */
    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
        aio_context_acquire(ctx);
        assert(other_job->ret == 0);
        block_job_completed_single(other_job);
@@ -189,9 +257,7 @@ static void block_job_completed_txn_success(BlockJob *job)

 void block_job_completed(BlockJob *job, int ret)
 {
-    BlockDriverState *bs = job->bs;
-
-    assert(bs->job == job);
+    assert(blk_bs(job->blk)->job == job);
    assert(!job->completed);
    job->completed = true;
    job->ret = ret;
@@ -236,11 +302,37 @@ void block_job_pause(BlockJob *job)
    job->pause_count++;
 }

-bool block_job_is_paused(BlockJob *job)
+static bool block_job_should_pause(BlockJob *job)
 {
    return job->pause_count > 0;
 }

+void coroutine_fn block_job_pause_point(BlockJob *job)
+{
+    if (!block_job_should_pause(job)) {
+        return;
+    }
+    if (block_job_is_cancelled(job)) {
+        return;
+    }
+
+    if (job->driver->pause) {
+        job->driver->pause(job);
+    }
+
+    if (block_job_should_pause(job) && !block_job_is_cancelled(job)) {
+        job->paused = true;
+        job->busy = false;
+        qemu_coroutine_yield(); /* wait for block_job_resume() */
+        job->busy = true;
+        job->paused = false;
+    }
+
+    if (job->driver->resume) {
+        job->driver->resume(job);
+    }
+}
+
 void block_job_resume(BlockJob *job)
 {
    assert(job->pause_count > 0);
@@ -253,7 +345,6 @@ void block_job_resume(BlockJob *job)

 void block_job_enter(BlockJob *job)
 {
-    block_job_iostatus_reset(job);
    if (job->co && !job->busy) {
        qemu_coroutine_enter(job->co, NULL);
    }
@@ -262,6 +353,7 @@ void block_job_enter(BlockJob *job)
 void block_job_cancel(BlockJob *job)
 {
    job->cancelled = true;
+    block_job_iostatus_reset(job);
    block_job_enter(job);
 }

@@ -282,11 +374,10 @@ static int block_job_finish_sync(BlockJob *job,
                                 void (*finish)(BlockJob *, Error **errp),
                                 Error **errp)
 {
-    BlockDriverState *bs = job->bs;
    Error *local_err = NULL;
    int ret;

-    assert(bs->job == job);
+    assert(blk_bs(job->blk)->job == job);

    block_job_ref(job);
    finish(job, &local_err);
@@ -296,9 +387,7 @@ static int block_job_finish_sync(BlockJob *job,
        return -EBUSY;
    }
    while (!job->completed) {
-        aio_poll(job->deferred_to_main_loop ? qemu_get_aio_context() :
-                                              bdrv_get_aio_context(bs),
-                 true);
+        aio_poll(block_job_get_aio_context(job), true);
    }
    ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
    block_job_unref(job);
@@ -318,6 +407,19 @@ int block_job_cancel_sync(BlockJob *job)
    return block_job_finish_sync(job, &block_job_cancel_err, NULL);
 }

+void block_job_cancel_sync_all(void)
+{
+    BlockJob *job;
+    AioContext *aio_context;
+
+    while ((job = QLIST_FIRST(&block_jobs))) {
+        aio_context = blk_get_aio_context(job->blk);
+        aio_context_acquire(aio_context);
+        block_job_cancel_sync(job);
+        aio_context_release(aio_context);
+    }
+}
+
 int block_job_complete_sync(BlockJob *job, Error **errp)
 {
    return block_job_finish_sync(job, &block_job_complete, errp);
@@ -333,12 +435,12 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
    }

    job->busy = false;
-    if (block_job_is_paused(job)) {
-        qemu_coroutine_yield();
-    } else {
-        co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
+    if (!block_job_should_pause(job)) {
+        co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns);
    }
    job->busy = true;
+
+    block_job_pause_point(job);
 }

 void block_job_yield(BlockJob *job)
@@ -351,8 +453,12 @@ void block_job_yield(BlockJob *job)
    }

    job->busy = false;
-    qemu_coroutine_yield();
+    if (!block_job_should_pause(job)) {
+        qemu_coroutine_yield();
+    }
    job->busy = true;
+
+    block_job_pause_point(job);
 }

 BlockJobInfo *block_job_query(BlockJob *job)
@@ -411,8 +517,7 @@ void block_job_event_ready(BlockJob *job)
                                    job->speed, &error_abort);
 }

-BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
-                                        BlockdevOnError on_err,
+BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
                                        int is_read, int error)
 {
    BlockErrorAction action;
@@ -443,9 +548,6 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
        job->user_paused = true;
        block_job_pause(job);
        block_job_iostatus_set_err(job, error);
-        if (bs->blk && bs != job->bs) {
-            blk_iostatus_set_err(bs->blk, error);
-        }
    }
    return action;
 }
@@ -469,7 +571,7 @@ static void block_job_defer_to_main_loop_bh(void *opaque)
    aio_context_acquire(data->aio_context);

    /* Fetch BDS AioContext again, in case it has changed */
-    aio_context = bdrv_get_aio_context(data->job->bs);
+    aio_context = blk_get_aio_context(data->job->blk);
    aio_context_acquire(aio_context);

    data->job->deferred_to_main_loop = false;
@@ -489,7 +591,7 @@ void block_job_defer_to_main_loop(BlockJob *job,
    BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data));
    data->job = job;
    data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data);
-    data->aio_context = bdrv_get_aio_context(job->bs);
+    data->aio_context = blk_get_aio_context(job->blk);
    data->fn = fn;
    data->opaque = opaque;
    job->deferred_to_main_loop = true;
--- a/bootdevice.c
+++ b/bootdevice.c
@@ -28,6 +28,7 @@
 #include "qapi/visitor.h"
 #include "qemu/error-report.h"
 #include "hw/hw.h"
+#include "hw/qdev-core.h"

 typedef struct FWBootEntry FWBootEntry;

@@ -301,9 +302,7 @@ static void device_set_bootindex(Object *obj, Visitor *v, const char *name,
    add_boot_device_path(*prop->bootindex, prop->dev, prop->suffix);

 out:
-    if (local_err) {
-        error_propagate(errp, local_err);
-    }
+    error_propagate(errp, local_err);
 }

 static void property_release_bootindex(Object *obj, const char *name,
--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@@ -1,7 +1,6 @@
 /* This is the Linux kernel elf-loading code, ported into user space */

 #include "qemu/osdep.h"
-#include <sys/mman.h>

 #include "qemu.h"
 #include "disas/disas.h"
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -18,13 +18,14 @@
 */
 #include "qemu/osdep.h"
 #include <machine/trap.h>
-#include <sys/mman.h>

+#include "qapi/error.h"
 #include "qemu.h"
 #include "qemu/path.h"
 #include "qemu/help_option.h"
 /* For tb_lock */
 #include "cpu.h"
+#include "exec/exec-all.h"
 #include "tcg.h"
 #include "qemu/timer.h"
 #include "qemu/envlist.h"
@@ -752,9 +753,6 @@ int main(int argc, char **argv)
    }

    cpu_model = NULL;
-#if defined(cpudef_setup)
-    cpudef_setup(); /* parse cpu definitions in target config file (TBD) */
-#endif

    optind = 1;
    for(;;) {
@@ -849,7 +847,8 @@ int main(int argc, char **argv)
    }

    /* init debug */
-    qemu_set_log_filename(log_file);
+    qemu_log_needs_buffers();
+    qemu_set_log_filename(log_file, &error_fatal);
    if (log_mask) {
        int mask;

--- a/bsd-user/mmap.c
+++ b/bsd-user/mmap.c
@@ -17,7 +17,6 @@
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
 #include "qemu/osdep.h"
-#include <sys/mman.h>

 #include "qemu.h"
 #include "qemu-common.h"
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -19,6 +19,7 @@


 #include "cpu.h"
+#include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"

 #undef DEBUG_REMAP
--- a/bsd-user/syscall.c
+++ b/bsd-user/syscall.c
@@ -19,7 +19,6 @@
 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
 #include "qemu/path.h"
-#include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/param.h>
 #include <sys/sysctl.h>
@@ -316,12 +315,14 @@ abi_long do_freebsd_syscall(void *cpu_env, int num, abi_long arg1,
                            abi_long arg5, abi_long arg6, abi_long arg7,
                            abi_long arg8)
 {
+    CPUState *cpu = ENV_GET_CPU(cpu_env);
    abi_long ret;
    void *p;

 #ifdef DEBUG
    gemu_log("freebsd syscall %d\n", num);
 #endif
+    trace_guest_user_syscall(cpu, num, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8);
    if(do_strace)
        print_freebsd_syscall(num, arg1, arg2, arg3, arg4, arg5, arg6);

@@ -401,6 +402,7 @@ abi_long do_freebsd_syscall(void *cpu_env, int num, abi_long arg1,
 #endif
    if (do_strace)
        print_freebsd_syscall_ret(num, ret);
+    trace_guest_user_syscall_ret(cpu, num, ret);
    return ret;
 efault:
    ret = -TARGET_EFAULT;
@@ -411,12 +413,14 @@ abi_long do_netbsd_syscall(void *cpu_env, int num, abi_long arg1,
                           abi_long arg2, abi_long arg3, abi_long arg4,
                           abi_long arg5, abi_long arg6)
 {
+    CPUState *cpu = ENV_GET_CPU(cpu_env);
    abi_long ret;
    void *p;

 #ifdef DEBUG
    gemu_log("netbsd syscall %d\n", num);
 #endif
+    trace_guest_user_syscall(cpu, num, arg1, arg2, arg3, arg4, arg5, arg6, 0, 0);
    if(do_strace)
        print_netbsd_syscall(num, arg1, arg2, arg3, arg4, arg5, arg6);

@@ -473,6 +477,7 @@ abi_long do_netbsd_syscall(void *cpu_env, int num, abi_long arg1,
 #endif
    if (do_strace)
        print_netbsd_syscall_ret(num, ret);
+    trace_guest_user_syscall_ret(cpu, num, ret);
    return ret;
 efault:
    ret = -TARGET_EFAULT;
@@ -483,12 +488,14 @@ abi_long do_openbsd_syscall(void *cpu_env, int num, abi_long arg1,
                            abi_long arg2, abi_long arg3, abi_long arg4,
                            abi_long arg5, abi_long arg6)
 {
+    CPUState *cpu = ENV_GET_CPU(cpu_env);
    abi_long ret;
    void *p;

 #ifdef DEBUG
    gemu_log("openbsd syscall %d\n", num);
 #endif
+    trace_guest_user_syscall(cpu, num, arg1, arg2, arg3, arg4, arg5, arg6, 0, 0);
    if(do_strace)
        print_openbsd_syscall(num, arg1, arg2, arg3, arg4, arg5, arg6);

@@ -545,6 +552,7 @@ abi_long do_openbsd_syscall(void *cpu_env, int num, abi_long arg1,
 #endif
    if (do_strace)
        print_openbsd_syscall_ret(num, ret);
+    trace_guest_user_syscall_ret(cpu, num, ret);
    return ret;
 efault:
    ret = -TARGET_EFAULT;
--- a/366
+++ b/366
@@ -31,6 +31,7 @@ TMPCXX="${TMPDIR1}/${TMPB}.cxx"
 TMPL="${TMPDIR1}/${TMPB}.lo"
 TMPA="${TMPDIR1}/lib${TMPB}.la"
 TMPE="${TMPDIR1}/${TMPB}.exe"
+TMPMO="${TMPDIR1}/${TMPB}.mo"

 rm -f config.log

@@ -164,7 +165,7 @@ have_backend () {
 }

 # default parameters
-source_path=`dirname "$0"`
+source_path=$(dirname "$0")
 cpu=""
 iasl="iasl"
 interp_prefix="/usr/gnemul/qemu-%M"
@@ -207,7 +208,7 @@ fdt=""
 netmap="no"
 pixman=""
 sdl=""
-sdlabi="1.2"
+sdlabi=""
 virtfs=""
 vnc="yes"
 sparse="no"
@@ -269,7 +270,6 @@ aix="no"
 blobs="yes"
 pkgversion=""
 pie=""
-zero_malloc=""
 qom_cast_debug="yes"
 trace_backends="log"
 trace_file="trace"
@@ -323,7 +323,7 @@ jemalloc="no"

 # parse CC options first
 for opt do
-  optarg=`expr "x$opt" : 'x[^=]*=\(.*\)'`
+  optarg=$(expr "x$opt" : 'x[^=]*=\(.*\)')
  case "$opt" in
  --cross-prefix=*) cross_prefix="$optarg"
  ;;
@@ -398,7 +398,7 @@ if test "$debug_info" = "yes"; then
 fi

 # make source path absolute
-source_path=`cd "$source_path"; pwd`
+source_path=$(cd "$source_path"; pwd)

 # running configure in the source tree?
 # we know that's the case if configure is there.
@@ -443,7 +443,7 @@ elif check_define __sun__ ; then
 elif check_define __HAIKU__ ; then
  targetos='Haiku'
 else
-  targetos=`uname -s`
+  targetos=$(uname -s)
 fi

 # Some host OSes need non-standard checks for which CPU to use.
@@ -461,7 +461,7 @@ Darwin)
  fi
  ;;
 SunOS)
-  # `uname -m` returns i86pc even on an x86_64 box, so default based on isainfo
+  # $(uname -m) returns i86pc even on an x86_64 box, so default based on isainfo
  if test -z "$cpu" && test "$(isainfo -k)" = "amd64"; then
    cpu="x86_64"
  fi
@@ -507,7 +507,7 @@ elif check_define __aarch64__ ; then
 elif check_define __hppa__ ; then
  cpu="hppa"
 else
-  cpu=`uname -m`
+  cpu=$(uname -m)
 fi

 ARCH=
@@ -627,7 +627,7 @@ SunOS)
  ld="gld"
  smbd="${SMBD-/usr/sfw/sbin/smbd}"
  needs_libsunmath="no"
-  solarisrev=`uname -r | cut -f2 -d.`
+  solarisrev=$(uname -r | cut -f2 -d.)
  if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
    if test "$solarisrev" -le 9 ; then
      if test -f /opt/SUNWspro/prod/lib/libsunmath.so.1; then
@@ -722,7 +722,7 @@ fi
 werror=""

 for opt do
-  optarg=`expr "x$opt" : 'x[^=]*=\(.*\)'`
+  optarg=$(expr "x$opt" : 'x[^=]*=\(.*\)')
  case "$opt" in
  --help|-h) show_help=yes
  ;;
@@ -846,9 +846,9 @@ for opt do
  ;;
  --audio-drv-list=*) audio_drv_list="$optarg"
  ;;
-  --block-drv-rw-whitelist=*|--block-drv-whitelist=*) block_drv_rw_whitelist=`echo "$optarg" | sed -e 's/,/ /g'`
+  --block-drv-rw-whitelist=*|--block-drv-whitelist=*) block_drv_rw_whitelist=$(echo "$optarg" | sed -e 's/,/ /g')
  ;;
-  --block-drv-ro-whitelist=*) block_drv_ro_whitelist=`echo "$optarg" | sed -e 's/,/ /g'`
+  --block-drv-ro-whitelist=*) block_drv_ro_whitelist=$(echo "$optarg" | sed -e 's/,/ /g')
  ;;
  --enable-debug-tcg) debug_tcg="yes"
  ;;
@@ -943,7 +943,7 @@ for opt do
  ;;
  --enable-cocoa)
      cocoa="yes" ;
-      audio_drv_list="coreaudio `echo $audio_drv_list | sed s,coreaudio,,g`"
+      audio_drv_list="coreaudio $(echo $audio_drv_list | sed s,coreaudio,,g)"
  ;;
  --disable-system) softmmu="no"
  ;;
@@ -1216,6 +1216,13 @@ esac
 QEMU_CFLAGS="$CPU_CFLAGS $QEMU_CFLAGS"
 EXTRA_CFLAGS="$CPU_CFLAGS $EXTRA_CFLAGS"

+# For user-mode emulation the host arch has to be one we explicitly
+# support, even if we're using TCI.
+if [ "$ARCH" = "unknown" ]; then
+  bsd_user="no"
+  linux_user="no"
+fi
+
 default_target_list=""

 mak_wilds=""
@@ -1380,7 +1387,6 @@ fi
 if test "$ARCH" = "unknown"; then
    if test "$tcg_interpreter" = "yes" ; then
        echo "Unsupported CPU = $cpu, will use TCG with TCI (experimental)"
-        ARCH=tci
    else
        error_exit "Unsupported CPU = $cpu, try --enable-tcg-interpreter"
    fi
@@ -1388,11 +1394,9 @@ fi

 # Consult white-list to determine whether to enable werror
 # by default.  Only enable by default for git builds
-z_version=`cut -f3 -d. $source_path/VERSION`
-
 if test -z "$werror" ; then
    if test -d "$source_path/.git" -a \
-        "$linux" = "yes" ; then
+        \( "$linux" = "yes" -o "$mingw32" = "yes" \) ; then
        werror="yes"
    else
        werror="no"
@@ -1617,7 +1621,7 @@ if test "$solaris" = "yes" ; then
        "install fileutils from www.blastwave.org using pkg-get -i fileutils" \
        "to get ginstall which is used by default (which lives in /opt/csw/bin)"
  fi
-  if test "`path_of $install`" = "/usr/sbin/install" ; then
+  if test "$(path_of $install)" = "/usr/sbin/install" ; then
    error_exit "Solaris /usr/sbin/install is not an appropriate install program." \
        "try ginstall from the GNU fileutils available from www.blastwave.org" \
        "using pkg-get -i fileutils, or use --install=/usr/ucb/install"
@@ -1636,7 +1640,7 @@ fi
 if test -z "${target_list+xxx}" ; then
    target_list="$default_target_list"
 else
-    target_list=`echo "$target_list" | sed -e 's/,/ /g'`
+    target_list=$(echo "$target_list" | sed -e 's/,/ /g')
 fi

 # Check that we recognised the target name; this allows a more
@@ -1781,14 +1785,23 @@ fi
 # avx2 optimization requirement check

 cat > $TMPC << EOF
-static void bar(void) {}
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#include <cpuid.h>
+#include <immintrin.h>
+
+static int bar(void *a) {
+    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(*(__m256i *)a, (__m256i){0}));
+}
 static void *bar_ifunc(void) {return (void*) bar;}
-static void foo(void) __attribute__((ifunc("bar_ifunc")));
-int main(void) { foo(); return 0; }
+int foo(void *a) __attribute__((ifunc("bar_ifunc")));
+int main(int argc, char *argv[]) { return foo(argv[0]);}
 EOF
-if compile_prog "-mavx2" "" ; then
-    if readelf --syms $TMPE |grep "IFUNC.*foo" >/dev/null 2>&1; then
-        avx2_opt="yes"
+if compile_object "" ; then
+    if has readelf; then
+        if readelf --syms $TMPO 2>/dev/null |grep -q "IFUNC.*foo"; then
+            avx2_opt="yes"
+        fi
    fi
 fi

@@ -1879,6 +1892,9 @@ if test "$seccomp" != "no" ; then
    arm|aarch64)
        libseccomp_minver="2.2.3"
        ;;
+    ppc|ppc64)
+        libseccomp_minver="2.3.0"
+        ;;
    *)
        libseccomp_minver=""
        ;;
@@ -1886,8 +1902,8 @@ if test "$seccomp" != "no" ; then

    if test "$libseccomp_minver" != "" &&
       $pkg_config --atleast-version=$libseccomp_minver libseccomp ; then
-        libs_softmmu="$libs_softmmu `$pkg_config --libs libseccomp`"
-        QEMU_CFLAGS="$QEMU_CFLAGS `$pkg_config --cflags libseccomp`"
+        libs_softmmu="$libs_softmmu $($pkg_config --libs libseccomp)"
+        QEMU_CFLAGS="$QEMU_CFLAGS $($pkg_config --cflags libseccomp)"
        seccomp="yes"
    else
        if test "$seccomp" = "yes" ; then
@@ -2127,8 +2143,8 @@ fi
 x11_cflags=
 x11_libs=-lX11
 if $pkg_config --exists "x11"; then
-    x11_cflags=`$pkg_config --cflags x11`
-    x11_libs=`$pkg_config --libs x11`
+    x11_cflags=$($pkg_config --cflags x11)
+    x11_libs=$($pkg_config --libs x11)
 fi

 ##########################################
@@ -2155,8 +2171,9 @@ if test "$gtk" != "no"; then
      gtkversion="2.18.0"
    fi
    if $pkg_config --exists "$gtkpackage >= $gtkversion"; then
-        gtk_cflags=`$pkg_config --cflags $gtkpackage`
-        gtk_libs=`$pkg_config --libs $gtkpackage`
+        gtk_cflags=$($pkg_config --cflags $gtkpackage)
+        gtk_libs=$($pkg_config --libs $gtkpackage)
+        gtk_version=$($pkg_config --modversion $gtkpackage)
        if $pkg_config --exists "$gtkx11package >= $gtkversion"; then
            gtk_cflags="$gtk_cflags $x11_cflags"
            gtk_libs="$gtk_libs $x11_libs"
@@ -2194,8 +2211,8 @@ gnutls_gcrypt=no
 gnutls_nettle=no
 if test "$gnutls" != "no"; then
    if gnutls_works; then
-        gnutls_cflags=`$pkg_config --cflags gnutls`
-        gnutls_libs=`$pkg_config --libs gnutls`
+        gnutls_cflags=$($pkg_config --cflags gnutls)
+        gnutls_libs=$($pkg_config --libs gnutls)
        libs_softmmu="$gnutls_libs $libs_softmmu"
        libs_tools="$gnutls_libs $libs_tools"
 	QEMU_CFLAGS="$QEMU_CFLAGS $gnutls_cflags"
@@ -2219,7 +2236,7 @@ if test "$gnutls" != "no"; then
 	    gnutls_gcrypt=no
 	    gnutls_nettle=yes
 	elif $pkg_config --exists 'gnutls >= 2.12'; then
-	    case `$pkg_config --libs --static gnutls` in
+	    case $($pkg_config --libs --static gnutls) in
 		*gcrypt*)
 		    gnutls_gcrypt=yes
 		    gnutls_nettle=no
@@ -2280,7 +2297,7 @@ has_libgcrypt_config() {

    if test -n "$cross_prefix"
    then
-	host=`libgcrypt-config --host`
+	host=$(libgcrypt-config --host)
 	if test "$host-" != $cross_prefix
 	then
 	    return 1
@@ -2292,8 +2309,8 @@ has_libgcrypt_config() {

 if test "$gcrypt" != "no"; then
    if has_libgcrypt_config; then
-        gcrypt_cflags=`libgcrypt-config --cflags`
-        gcrypt_libs=`libgcrypt-config --libs`
+        gcrypt_cflags=$(libgcrypt-config --cflags)
+        gcrypt_libs=$(libgcrypt-config --libs)
        # Debian has remove -lgpg-error from libgcrypt-config
        # as it "spreads unnecessary dependencies" which in
        # turn breaks static builds...
@@ -2333,15 +2350,16 @@ fi

 if test "$nettle" != "no"; then
    if $pkg_config --exists "nettle"; then
-        nettle_cflags=`$pkg_config --cflags nettle`
-        nettle_libs=`$pkg_config --libs nettle`
-        nettle_version=`$pkg_config --modversion nettle`
+        nettle_cflags=$($pkg_config --cflags nettle)
+        nettle_libs=$($pkg_config --libs nettle)
+        nettle_version=$($pkg_config --modversion nettle)
        libs_softmmu="$nettle_libs $libs_softmmu"
        libs_tools="$nettle_libs $libs_tools"
        QEMU_CFLAGS="$QEMU_CFLAGS $nettle_cflags"
        nettle="yes"

        cat > $TMPC << EOF
+#include <stddef.h>
 #include <nettle/pbkdf2.h>
 int main(void) {
     pbkdf2_hmac_sha256(8, NULL, 1000, 8, NULL, 8, NULL);
@@ -2372,8 +2390,8 @@ tasn1=yes
 tasn1_cflags=""
 tasn1_libs=""
 if $pkg_config --exists "libtasn1"; then
-    tasn1_cflags=`$pkg_config --cflags libtasn1`
-    tasn1_libs=`$pkg_config --libs libtasn1`
+    tasn1_cflags=$($pkg_config --cflags libtasn1)
+    tasn1_libs=$($pkg_config --libs libtasn1)
 else
    tasn1=no
 fi
@@ -2392,20 +2410,25 @@ fi

 if test "$vte" != "no"; then
    if test "$gtkabi" = "3.0"; then
-      vtepackage="vte-2.90"
-      vteversion="0.32.0"
+      vteminversion="0.32.0"
+      if $pkg_config --exists "vte-2.91"; then
+        vtepackage="vte-2.91"
+      else
+        vtepackage="vte-2.90"
+      fi
    else
      vtepackage="vte"
-      vteversion="0.24.0"
+      vteminversion="0.24.0"
    fi
-    if $pkg_config --exists "$vtepackage >= $vteversion"; then
-        vte_cflags=`$pkg_config --cflags $vtepackage`
-        vte_libs=`$pkg_config --libs $vtepackage`
+    if $pkg_config --exists "$vtepackage >= $vteminversion"; then
+        vte_cflags=$($pkg_config --cflags $vtepackage)
+        vte_libs=$($pkg_config --libs $vtepackage)
+        vteversion=$($pkg_config --modversion $vtepackage)
        libs_softmmu="$vte_libs $libs_softmmu"
        vte="yes"
    elif test "$vte" = "yes"; then
        if test "$gtkabi" = "3.0"; then
-            feature_not_found "vte" "Install libvte-2.90 devel"
+            feature_not_found "vte" "Install libvte-2.90/2.91 devel"
        else
            feature_not_found "vte" "Install libvte devel"
        fi
@@ -2420,25 +2443,37 @@ fi
 # Look for sdl configuration program (pkg-config or sdl-config).  Try
 # sdl-config even without cross prefix, and favour pkg-config over sdl-config.

+if test "$sdlabi" = ""; then
+    if $pkg_config --exists "sdl"; then
+        sdlabi=1.2
+    elif $pkg_config --exists "sdl2"; then
+        sdlabi=2.0
+    else
+        sdlabi=1.2
+    fi
+fi
+
 if test $sdlabi = "2.0"; then
    sdl_config=$sdl2_config
    sdlname=sdl2
    sdlconfigname=sdl2_config
-else
+elif test $sdlabi = "1.2"; then
    sdlname=sdl
    sdlconfigname=sdl_config
+else
+    error_exit "Unknown sdlabi $sdlabi, must be 1.2 or 2.0"
 fi

-if test "`basename $sdl_config`" != $sdlconfigname && ! has ${sdl_config}; then
+if test "$(basename $sdl_config)" != $sdlconfigname && ! has ${sdl_config}; then
  sdl_config=$sdlconfigname
 fi

 if $pkg_config $sdlname --exists; then
  sdlconfig="$pkg_config $sdlname"
-  _sdlversion=`$sdlconfig --modversion 2>/dev/null | sed 's/[^0-9]//g'`
+  sdlversion=$($sdlconfig --modversion 2>/dev/null)
 elif has ${sdl_config}; then
  sdlconfig="$sdl_config"
-  _sdlversion=`$sdlconfig --version | sed 's/[^0-9]//g'`
+  sdlversion=$($sdlconfig --version)
 else
  if test "$sdl" = "yes" ; then
    feature_not_found "sdl" "Install SDL devel"
@@ -2456,14 +2491,14 @@ if test "$sdl" != "no" ; then
 #undef main /* We don't want SDL to override our main() */
 int main( void ) { return SDL_Init (SDL_INIT_VIDEO); }
 EOF
-  sdl_cflags=`$sdlconfig --cflags 2> /dev/null`
+  sdl_cflags=$($sdlconfig --cflags 2>/dev/null)
  if test "$static" = "yes" ; then
-    sdl_libs=`$sdlconfig --static-libs 2>/dev/null`
+    sdl_libs=$($sdlconfig --static-libs 2>/dev/null)
  else
-    sdl_libs=`$sdlconfig --libs 2> /dev/null`
+    sdl_libs=$($sdlconfig --libs 2>/dev/null)
  fi
  if compile_prog "$sdl_cflags" "$sdl_libs" ; then
-    if test "$_sdlversion" -lt 121 ; then
+    if test $(echo $sdlversion | sed 's/[^0-9]//g') -lt 121 ; then
      sdl_too_old=yes
    else
      sdl=yes
@@ -2472,8 +2507,8 @@ EOF
    # static link with sdl ? (note: sdl.pc's --static --libs is broken)
    if test "$sdl" = "yes" -a "$static" = "yes" ; then
      if test $? = 0 && echo $sdl_libs | grep -- -laa > /dev/null; then
-         sdl_libs="$sdl_libs `aalib-config --static-libs 2>/dev/null`"
-         sdl_cflags="$sdl_cflags `aalib-config --cflags 2>/dev/null`"
+         sdl_libs="$sdl_libs $(aalib-config --static-libs 2>/dev/null)"
+         sdl_cflags="$sdl_cflags $(aalib-config --cflags 2>/dev/null)"
      fi
      if compile_prog "$sdl_cflags" "$sdl_libs" ; then
 	:
@@ -2590,8 +2625,8 @@ int main(void) {
 }
 EOF
  if $pkg_config libpng --exists; then
-    vnc_png_cflags=`$pkg_config libpng --cflags`
-    vnc_png_libs=`$pkg_config libpng --libs`
+    vnc_png_cflags=$($pkg_config libpng --cflags)
+    vnc_png_libs=$($pkg_config libpng --libs)
  else
    vnc_png_cflags=""
    vnc_png_libs="-lpng"
@@ -2785,7 +2820,7 @@ EOF
    fi
 }

-audio_drv_list=`echo "$audio_drv_list" | sed -e 's/,/ /g'`
+audio_drv_list=$(echo "$audio_drv_list" | sed -e 's/,/ /g')
 for drv in $audio_drv_list; do
    case $drv in
    alsa)
@@ -2795,8 +2830,8 @@ for drv in $audio_drv_list; do
    ;;

    pa)
-    audio_drv_probe $drv pulse/mainloop.h "-lpulse" \
-        "pa_mainloop *m = 0; pa_mainloop_free (m); return 0;"
+    audio_drv_probe $drv pulse/pulseaudio.h "-lpulse" \
+        "pa_context_set_source_output_volume(NULL, 0, NULL, NULL, NULL); return 0;"
    libs_softmmu="-lpulse $libs_softmmu"
    audio_pt_int="yes"
    ;;
@@ -2897,8 +2932,8 @@ if test "$curl" != "no" ; then
 #include <curl/curl.h>
 int main(void) { curl_easy_init(); curl_multi_setopt(0, 0, 0); return 0; }
 EOF
-  curl_cflags=`$curlconfig --cflags 2>/dev/null`
-  curl_libs=`$curlconfig --libs 2>/dev/null`
+  curl_cflags=$($curlconfig --cflags 2>/dev/null)
+  curl_libs=$($curlconfig --libs 2>/dev/null)
  if compile_prog "$curl_cflags" "$curl_libs" ; then
    curl=yes
  else
@@ -2916,8 +2951,8 @@ if test "$bluez" != "no" ; then
 #include <bluetooth/bluetooth.h>
 int main(void) { return bt_error(0); }
 EOF
-  bluez_cflags=`$pkg_config --cflags bluez 2> /dev/null`
-  bluez_libs=`$pkg_config --libs bluez 2> /dev/null`
+  bluez_cflags=$($pkg_config --cflags bluez 2>/dev/null)
+  bluez_libs=$($pkg_config --libs bluez 2>/dev/null)
  if compile_prog "$bluez_cflags" "$bluez_libs" ; then
    bluez=yes
    libs_softmmu="$bluez_libs $libs_softmmu"
@@ -2940,8 +2975,8 @@ fi

 for i in $glib_modules; do
    if $pkg_config --atleast-version=$glib_req_ver $i; then
-        glib_cflags=`$pkg_config --cflags $i`
-        glib_libs=`$pkg_config --libs $i`
+        glib_cflags=$($pkg_config --cflags $i)
+        glib_libs=$($pkg_config --libs $i)
        CFLAGS="$glib_cflags $CFLAGS"
        LIBS="$glib_libs $LIBS"
        libs_qga="$glib_libs $libs_qga"
@@ -2967,7 +3002,7 @@ int main(void) {
 }
 EOF

-if ! compile_prog "-Werror $CFLAGS" "$LIBS" ; then
+if ! compile_prog "$CFLAGS" "$LIBS" ; then
    error_exit "sizeof(size_t) doesn't match GLIB_SIZEOF_SIZE_T."\
               "You probably need to set PKG_CONFIG_LIBDIR"\
 	       "to point to the right pkg-config files for your"\
@@ -3030,8 +3065,8 @@ if test "$pixman" = "none"; then
  pixman_libs=
 elif test "$pixman" = "system"; then
  # pixman version has been checked above
-  pixman_cflags=`$pkg_config --cflags pixman-1`
-  pixman_libs=`$pkg_config --libs pixman-1`
+  pixman_cflags=$($pkg_config --cflags pixman-1)
+  pixman_libs=$($pkg_config --libs pixman-1)
 else
  if test ! -d ${source_path}/pixman/pixman; then
    error_exit "pixman >= 0.21.8 not present. Your options:" \
@@ -3147,8 +3182,8 @@ fi
 min_libssh2_version=1.2.8
 if test "$libssh2" != "no" ; then
  if $pkg_config --atleast-version=$min_libssh2_version libssh2; then
-    libssh2_cflags=`$pkg_config libssh2 --cflags`
-    libssh2_libs=`$pkg_config libssh2 --libs`
+    libssh2_cflags=$($pkg_config libssh2 --cflags)
+    libssh2_libs=$($pkg_config libssh2 --libs)
    libssh2=yes
  else
    if test "$libssh2" = "yes" ; then
@@ -3399,8 +3434,8 @@ fi
 if test "$glusterfs" != "no" ; then
  if $pkg_config --atleast-version=3 glusterfs-api; then
    glusterfs="yes"
-    glusterfs_cflags=`$pkg_config --cflags glusterfs-api`
-    glusterfs_libs=`$pkg_config --libs glusterfs-api`
+    glusterfs_cflags=$($pkg_config --cflags glusterfs-api)
+    glusterfs_libs=$($pkg_config --libs glusterfs-api)
    if $pkg_config --atleast-version=4 glusterfs-api; then
      glusterfs_xlator_opt="yes"
    fi
@@ -3780,8 +3815,8 @@ if compile_prog "" "" ; then
  epoll=yes
 fi

-# epoll_create1 and epoll_pwait are later additions
-# so we must check separately for their presence
+# epoll_create1 is a later addition
+# so we must check separately for its presence
 epoll_create1=no
 cat > $TMPC << EOF
 #include <sys/epoll.h>
@@ -3803,20 +3838,6 @@ if compile_prog "" "" ; then
  epoll_create1=yes
 fi

-epoll_pwait=no
-cat > $TMPC << EOF
-#include <sys/epoll.h>
-
-int main(void)
-{
-    epoll_pwait(0, 0, 0, 0, 0);
-    return 0;
-}
-EOF
-if compile_prog "" "" ; then
-  epoll_pwait=yes
-fi
-
 # check for sendfile support
 sendfile=no
 cat > $TMPC << EOF
@@ -4171,24 +4192,6 @@ if compile_prog "" "" ; then
    posix_madvise=yes
 fi

-##########################################
-# check if we have usable SIGEV_THREAD_ID
-
-sigev_thread_id=no
-cat > $TMPC << EOF
-#include <signal.h>
-int main(void) {
-  struct sigevent ev;
-  ev.sigev_notify = SIGEV_THREAD_ID;
-  ev._sigev_un._tid = 0;
-  asm volatile("" : : "g"(&ev));
-  return 0;
-}
-EOF
-if compile_prog "" "" ; then
-    sigev_thread_id=yes
-fi
-
 ##########################################
 # check if trace backend exists

@@ -4207,12 +4210,12 @@ int main(void) { return 0; }
 EOF
  if compile_prog "" "" ; then
    if $pkg_config lttng-ust --exists; then
-      lttng_ust_libs=`$pkg_config --libs lttng-ust`
+      lttng_ust_libs=$($pkg_config --libs lttng-ust)
    else
      lttng_ust_libs="-llttng-ust"
    fi
    if $pkg_config liburcu-bp --exists; then
-      urcu_bp_libs=`$pkg_config --libs liburcu-bp`
+      urcu_bp_libs=$($pkg_config --libs liburcu-bp)
    else
      urcu_bp_libs="-lurcu-bp"
    fi
@@ -4508,6 +4511,38 @@ if compile_prog "" "" ; then
    have_fsxattr=yes
 fi

+##########################################
+# check if rtnetlink.h exists and is useful
+have_rtnetlink=no
+cat > $TMPC << EOF
+#include <linux/rtnetlink.h>
+int main(void) {
+  return IFLA_PROTO_DOWN;
+}
+EOF
+if compile_prog "" "" ; then
+    have_rtnetlink=yes
+fi
+
+#################################################
+# Sparc implicitly links with --relax, which is
+# incompatible with -r, so --no-relax should be
+# given. It does no harm to give it on other
+# platforms too.
+
+# Note: the prototype is needed since QEMU_CFLAGS
+#       contains -Wmissing-prototypes
+cat > $TMPC << EOF
+extern int foo(void);
+int foo(void) { return 0; }
+EOF
+if ! compile_object ""; then
+  error_exit "Failed to compile object file for LD_REL_FLAGS test"
+fi
+if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO $TMPO; then
+  LD_REL_FLAGS="-Wl,--no-relax"
+fi
+
 ##########################################
 # End of CC checks
 # After here, no more $cc or $ld runs
@@ -4536,16 +4571,6 @@ if test "$libnfs" != "no" ; then
  fi
 fi

-# Disable zero malloc errors for official releases unless explicitly told to
-# enable/disable
-if test -z "$zero_malloc" ; then
-    if test "$z_version" = "50" ; then
-	zero_malloc="no"
-    else
-	zero_malloc="yes"
-    fi
-fi
-
 # Now we've finished running tests it's OK to add -Werror to the compiler flags
 if test "$werror" = "yes"; then
    QEMU_CFLAGS="-Werror $QEMU_CFLAGS"
@@ -4593,7 +4618,7 @@ if test "$softmmu" = yes ; then
      tools="$tools fsdev/virtfs-proxy-helper\$(EXESUF)"
    else
      if test "$virtfs" = yes; then
-        error_exit "VirtFS is supported only on Linux and requires libcap-devel and libattr-devel"
+        error_exit "VirtFS is supported only on Linux and requires libcap devel and libattr devel"
      fi
      virtfs=no
    fi
@@ -4652,10 +4677,10 @@ if test "$guest_agent_msi" = "yes"; then
  fi

  if test "$QEMU_GA_VERSION" = ""; then
-      QEMU_GA_VERSION=`cat $source_path/VERSION`
+      QEMU_GA_VERSION=$(cat $source_path/VERSION)
  fi

-  QEMU_GA_MSI_MINGW_DLL_PATH="-D Mingw_dlls=`$pkg_config --variable=prefix glib-2.0`/bin"
+  QEMU_GA_MSI_MINGW_DLL_PATH="-D Mingw_dlls=$($pkg_config --variable=prefix glib-2.0)/bin"

  case "$cpu" in
  x86_64)
@@ -4686,7 +4711,7 @@ if test "$cpu" = "s390x" ; then
 fi

 # Probe for the need for relocating the user-only binary.
-if test "$pie" = "no" ; then
+if ( [ "$linux_user" = yes ] || [ "$bsd_user" = yes ] ) && [ "$pie" = no ]; then
  textseg_addr=
  case "$cpu" in
    arm | i386 | ppc* | s390* | sparc* | x86_64 | x32)
@@ -4708,6 +4733,16 @@ EOF
      # In case ld does not support -Ttext-segment, edit the default linker
      # script via sed to set the .text start addr.  This is needed on FreeBSD
      # at least.
+      if ! $ld --verbose >/dev/null 2>&1; then
+        error_exit \
+            "We need to link the QEMU user mode binaries at a" \
+            "specific text address. Unfortunately your linker" \
+            "doesn't support either the -Ttext-segment option or" \
+            "printing the default linker script with --verbose." \
+            "If you don't want the user mode binaries, pass the" \
+            "--disable-user option to configure."
+      fi
+
      $ld --verbose | sed \
        -e '1,/==================================================/d' \
        -e '/==================================================/,$d' \
@@ -4718,21 +4753,27 @@ EOF
  fi
 fi

+echo_version() {
+    if test "$1" = "yes" ; then
+        echo "($2)"
+    fi
+}
+
 # prepend pixman and ftd flags after all config tests are done
 QEMU_CFLAGS="$pixman_cflags $fdt_cflags $QEMU_CFLAGS"
 libs_softmmu="$pixman_libs $libs_softmmu"

 echo "Install prefix    $prefix"
-echo "BIOS directory    `eval echo $qemu_datadir`"
-echo "binary directory  `eval echo $bindir`"
-echo "library directory `eval echo $libdir`"
-echo "module directory  `eval echo $qemu_moddir`"
-echo "libexec directory `eval echo $libexecdir`"
-echo "include directory `eval echo $includedir`"
-echo "config directory  `eval echo $sysconfdir`"
+echo "BIOS directory    $(eval echo $qemu_datadir)"
+echo "binary directory  $(eval echo $bindir)"
+echo "library directory $(eval echo $libdir)"
+echo "module directory  $(eval echo $qemu_moddir)"
+echo "libexec directory $(eval echo $libexecdir)"
+echo "include directory $(eval echo $includedir)"
+echo "config directory  $(eval echo $sysconfdir)"
 if test "$mingw32" = "no" ; then
-echo "local state directory   `eval echo $local_statedir`"
-echo "Manual directory  `eval echo $mandir`"
+echo "local state directory   $(eval echo $local_statedir)"
+echo "Manual directory  $(eval echo $mandir)"
 echo "ELF interp prefix $interp_prefix"
 else
 echo "local state directory   queried at runtime"
@@ -4767,22 +4808,18 @@ if test "$darwin" = "yes" ; then
    echo "Cocoa support     $cocoa"
 fi
 echo "pixman            $pixman"
-echo "SDL support       $sdl"
-echo "GTK support       $gtk"
+echo "SDL support       $sdl $(echo_version $sdl $sdlversion)"
+echo "GTK support       $gtk $(echo_version $gtk $gtk_version)"
 echo "GTK GL support    $gtk_gl"
+echo "VTE support       $vte $(echo_version $vte $vteversion)"
 echo "GNUTLS support    $gnutls"
 echo "GNUTLS hash       $gnutls_hash"
 echo "GNUTLS rnd        $gnutls_rnd"
 echo "libgcrypt         $gcrypt"
 echo "libgcrypt kdf     $gcrypt_kdf"
-if test "$nettle" = "yes"; then
-    echo "nettle            $nettle ($nettle_version)"
-else
-    echo "nettle            $nettle"
-fi
+echo "nettle            $nettle $(echo_version $nettle $nettle_version)"
 echo "nettle kdf        $nettle_kdf"
 echo "libtasn1          $tasn1"
-echo "VTE support       $vte"
 echo "curses support    $curses"
 echo "virgl support     $virglrenderer"
 echo "curl support      $curl"
@@ -4822,7 +4859,6 @@ echo "preadv support    $preadv"
 echo "fdatasync         $fdatasync"
 echo "madvise           $madvise"
 echo "posix_madvise     $posix_madvise"
-echo "sigev_thread_id   $sigev_thread_id"
 echo "uuid support      $uuid"
 echo "libcap-ng support $cap_ng"
 echo "vhost-net support $vhost_net"
@@ -4831,11 +4867,7 @@ echo "Trace backends    $trace_backends"
 if have_backend "simple"; then
 echo "Trace output file $trace_file-<pid>"
 fi
-if test "$spice" = "yes"; then
-echo "spice support     $spice ($spice_protocol_version/$spice_server_version)"
-else
-echo "spice support     $spice"
-fi
+echo "spice support     $spice $(echo_version $spice $spice_protocol_version/$spice_server_version)"
 echo "rbd support       $rbd"
 echo "xfsctl support    $xfs"
 echo "smartcard support $smartcard"
@@ -4914,7 +4946,7 @@ if test "$bigendian" = "yes" ; then
 fi
 if test "$mingw32" = "yes" ; then
  echo "CONFIG_WIN32=y" >> $config_host_mak
-  rc_version=`cat $source_path/VERSION`
+  rc_version=$(cat $source_path/VERSION)
  version_major=${rc_version%%.*}
  rc_version=${rc_version#*.}
  version_minor=${rc_version%%.*}
@@ -4990,7 +5022,7 @@ if test "$cap_ng" = "yes" ; then
 fi
 echo "CONFIG_AUDIO_DRIVERS=$audio_drv_list" >> $config_host_mak
 for drv in $audio_drv_list; do
-    def=CONFIG_`echo $drv | LC_ALL=C tr '[a-z]' '[A-Z]'`
+    def=CONFIG_$(echo $drv | LC_ALL=C tr '[a-z]' '[A-Z]')
    echo "$def=y" >> $config_host_mak
 done
 if test "$audio_pt_int" = "yes" ; then
@@ -5022,7 +5054,7 @@ fi
 if test "$xfs" = "yes" ; then
  echo "CONFIG_XFS=y" >> $config_host_mak
 fi
-qemu_version=`head $source_path/VERSION`
+qemu_version=$(head $source_path/VERSION)
 echo "VERSION=$qemu_version" >>$config_host_mak
 echo "PKGVERSION=$pkgversion" >>$config_host_mak
 echo "SRC_PATH=$source_path" >> $config_host_mak
@@ -5033,7 +5065,7 @@ fi
 if test "$modules" = "yes"; then
  # $shacmd can generate a hash started with digit, which the compiler doesn't
  # like as an symbol. So prefix it with an underscore
-  echo "CONFIG_STAMP=_`(echo $qemu_version; echo $pkgversion; cat $0) | $shacmd - | cut -f1 -d\ `" >> $config_host_mak
+  echo "CONFIG_STAMP=_$( (echo $qemu_version; echo $pkgversion; cat $0) | $shacmd - | cut -f1 -d\ )" >> $config_host_mak
  echo "CONFIG_MODULES=y" >> $config_host_mak
 fi
 if test "$sdl" = "yes" ; then
@@ -5098,9 +5130,6 @@ fi
 if test "$epoll_create1" = "yes" ; then
  echo "CONFIG_EPOLL_CREATE1=y" >> $config_host_mak
 fi
-if test "$epoll_pwait" = "yes" ; then
-  echo "CONFIG_EPOLL_PWAIT=y" >> $config_host_mak
-fi
 if test "$sendfile" = "yes" ; then
  echo "CONFIG_SENDFILE=y" >> $config_host_mak
 fi
@@ -5244,9 +5273,6 @@ fi
 if test "$posix_madvise" = "yes" ; then
  echo "CONFIG_POSIX_MADVISE=y" >> $config_host_mak
 fi
-if test "$sigev_thread_id" = "yes" ; then
-  echo "CONFIG_SIGEV_THREAD_ID=y" >> $config_host_mak
-fi

 if test "$spice" = "yes" ; then
  echo "CONFIG_SPICE=y" >> $config_host_mak
@@ -5309,9 +5335,6 @@ if [ "$bsd" = "yes" ] ; then
  echo "CONFIG_BSD=y" >> $config_host_mak
 fi

-if test "$zero_malloc" = "yes" ; then
-  echo "CONFIG_ZERO_MALLOC=y" >> $config_host_mak
-fi
 if test "$localtime_r" = "yes" ; then
  echo "CONFIG_LOCALTIME_R=y" >> $config_host_mak
 fi
@@ -5445,6 +5468,10 @@ if test "$rdma" = "yes" ; then
  echo "CONFIG_RDMA=y" >> $config_host_mak
 fi

+if test "$have_rtnetlink" = "yes" ; then
+  echo "CONFIG_RTNETLINK=y" >> $config_host_mak
+fi
+
 # Hold two types of flag:
 #   CONFIG_THREAD_SETNAME_BYTHREAD  - we've got a way of setting the name on
 #                                     a thread we have a handle to
@@ -5513,6 +5540,7 @@ else
 fi
 echo "LDFLAGS=$LDFLAGS" >> $config_host_mak
 echo "LDFLAGS_NOPIE=$LDFLAGS_NOPIE" >> $config_host_mak
+echo "LD_REL_FLAGS=$LD_REL_FLAGS" >> $config_host_mak
 echo "LIBS+=$LIBS" >> $config_host_mak
 echo "LIBS_TOOLS+=$libs_tools" >> $config_host_mak
 echo "EXESUF=$EXESUF" >> $config_host_mak
@@ -5561,7 +5589,7 @@ fi
 for target in $target_list; do
 target_dir="$target"
 config_target_mak=$target_dir/config-target.mak
-target_name=`echo $target | cut -d '-' -f 1`
+target_name=$(echo $target | cut -d '-' -f 1)
 target_bigendian="no"

 case "$target_name" in
@@ -5601,7 +5629,7 @@ mkdir -p $target_dir
 echo "# Automatically generated by configure - do not modify" > $config_target_mak

 bflt="no"
-interp_prefix1=`echo "$interp_prefix" | sed "s/%M/$target_name/g"`
+interp_prefix1=$(echo "$interp_prefix" | sed "s/%M/$target_name/g")
 gdb_xml_files=""

 TARGET_ARCH="$target_name"
@@ -5727,7 +5755,7 @@ upper() {
    echo "$@"| LC_ALL=C tr '[a-z]' '[A-Z]'
 }

-target_arch_name="`upper $TARGET_ARCH`"
+target_arch_name="$(upper $TARGET_ARCH)"
 echo "TARGET_$target_arch_name=y" >> $config_target_mak
 echo "TARGET_NAME=$target_name" >> $config_target_mak
 echo "TARGET_BASE_ARCH=$TARGET_BASE_ARCH" >> $config_target_mak
@@ -5943,11 +5971,11 @@ for bios_file in \
    $source_path/pc-bios/u-boot.* \
    $source_path/pc-bios/palcode-*
 do
-    FILES="$FILES pc-bios/`basename $bios_file`"
+    FILES="$FILES pc-bios/$(basename $bios_file)"
 done
-for test_file in `find $source_path/tests/acpi-test-data -type f`
+for test_file in $(find $source_path/tests/acpi-test-data -type f)
 do
-    FILES="$FILES tests/acpi-test-data`echo $test_file | sed -e 's/.*acpi-test-data//'`"
+    FILES="$FILES tests/acpi-test-data$(echo $test_file | sed -e 's/.*acpi-test-data//')"
 done
 mkdir -p $DIRS
 for f in $FILES ; do
--- a/contrib/ivshmem-server/ivshmem-server.c
+++ b/contrib/ivshmem-server/ivshmem-server.c
@@ -7,9 +7,9 @@
 */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
+#include "qemu/host-utils.h"
 #include "qemu/sockets.h"

-#include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/un.h>

--- a/contrib/ivshmem-server/ivshmem-server.h
+++ b/contrib/ivshmem-server/ivshmem-server.h
@@ -15,7 +15,7 @@
 * unix socket. For each client, the server will create some eventfd
 * (see EVENTFD(2)), one per vector. These fd are transmitted to all
 * clients using the SCM_RIGHTS cmsg message. Therefore, each client is
- * able to send a notification to another client without beeing
+ * able to send a notification to another client without being
 * "profixied" by the server.
 *
 * We use this mechanism to send interruptions between guests.
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@@ -20,16 +20,14 @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "sysemu/cpus.h"
+#include "exec/exec-all.h"
 #include "exec/memory-internal.h"

 bool exit_request;
 CPUState *tcg_current_cpu;

-/* exit the current TB from a signal handler. The host registers are
-   restored in a state compatible with the CPU emulator
- */
-#if defined(CONFIG_SOFTMMU)
-void cpu_resume_from_signal(CPUState *cpu, void *puc)
+/* exit the current TB, but without causing any exception to be raised */
+void cpu_loop_exit_noexc(CPUState *cpu)
 {
    /* XXX: restore cpu registers saved in host registers */

@@ -37,6 +35,7 @@ void cpu_resume_from_signal(CPUState *cpu, void *puc)
    siglongjmp(cpu->jmp_env, 1);
 }

+#if defined(CONFIG_SOFTMMU)
 void cpu_reloading_memory_map(void)
 {
    if (qemu_in_vcpu_thread()) {
@@ -68,7 +67,6 @@ void cpu_reloading_memory_map(void)

 void cpu_loop_exit(CPUState *cpu)
 {
-    cpu->current_tb = NULL;
    siglongjmp(cpu->jmp_env, 1);
 }

@@ -77,6 +75,5 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
    if (pc) {
        cpu_restore_state(cpu, pc);
    }
-    cpu->current_tb = NULL;
    siglongjmp(cpu->jmp_env, 1);
 }
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -20,6 +20,7 @@
 #include "cpu.h"
 #include "trace.h"
 #include "disas/disas.h"
+#include "exec/exec-all.h"
 #include "tcg.h"
 #include "qemu/atomic.h"
 #include "sysemu/qtest.h"
@@ -136,7 +137,9 @@ static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
 static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
 {
    CPUArchState *env = cpu->env_ptr;
-    uintptr_t next_tb;
+    uintptr_t ret;
+    TranslationBlock *last_tb;
+    int tb_exit;
    uint8_t *tb_ptr = itb->tc_ptr;

    qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
@@ -160,118 +163,125 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
 #endif /* DEBUG_DISAS */

    cpu->can_do_io = !use_icount;
-    next_tb = tcg_qemu_tb_exec(env, tb_ptr);
+    ret = tcg_qemu_tb_exec(env, tb_ptr);
    cpu->can_do_io = 1;
-    trace_exec_tb_exit((void *) (next_tb & ~TB_EXIT_MASK),
-                       next_tb & TB_EXIT_MASK);
+    last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
+    tb_exit = ret & TB_EXIT_MASK;
+    trace_exec_tb_exit(last_tb, tb_exit);

-    if ((next_tb & TB_EXIT_MASK) > TB_EXIT_IDX1) {
+    if (tb_exit > TB_EXIT_IDX1) {
        /* We didn't start executing this TB (eg because the instruction
         * counter hit zero); we must restore the guest PC to the address
         * of the start of the TB.
         */
        CPUClass *cc = CPU_GET_CLASS(cpu);
-        TranslationBlock *tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
-        qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
+        qemu_log_mask_and_addr(CPU_LOG_EXEC, last_tb->pc,
                               "Stopped execution of TB chain before %p ["
                               TARGET_FMT_lx "] %s\n",
-                               itb->tc_ptr, itb->pc, lookup_symbol(itb->pc));
+                               last_tb->tc_ptr, last_tb->pc,
+                               lookup_symbol(last_tb->pc));
        if (cc->synchronize_from_tb) {
-            cc->synchronize_from_tb(cpu, tb);
+            cc->synchronize_from_tb(cpu, last_tb);
        } else {
            assert(cc->set_pc);
-            cc->set_pc(cpu, tb->pc);
+            cc->set_pc(cpu, last_tb->pc);
        }
    }
-    if ((next_tb & TB_EXIT_MASK) == TB_EXIT_REQUESTED) {
+    if (tb_exit == TB_EXIT_REQUESTED) {
        /* We were asked to stop executing TBs (probably a pending
         * interrupt. We've now stopped, so clear the flag.
         */
        cpu->tcg_exit_req = 0;
    }
-    return next_tb;
+    return ret;
 }

+#ifndef CONFIG_USER_ONLY
 /* Execute the code without caching the generated code. An interpreter
   could be used if available. */
 static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
                             TranslationBlock *orig_tb, bool ignore_icount)
 {
    TranslationBlock *tb;
+    bool old_tb_flushed;

    /* Should never happen.
       We only end up here when an existing TB is too long.  */
    if (max_cycles > CF_COUNT_MASK)
        max_cycles = CF_COUNT_MASK;

+    old_tb_flushed = cpu->tb_flushed;
+    cpu->tb_flushed = false;
    tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
                     max_cycles | CF_NOCACHE
                         | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
-    tb->orig_tb = tcg_ctx.tb_ctx.tb_invalidated_flag ? NULL : orig_tb;
-    cpu->current_tb = tb;
+    tb->orig_tb = cpu->tb_flushed ? NULL : orig_tb;
+    cpu->tb_flushed |= old_tb_flushed;
    /* execute the generated code */
    trace_exec_tb_nocache(tb, tb->pc);
    cpu_tb_exec(cpu, tb);
-    cpu->current_tb = NULL;
    tb_phys_invalidate(tb, -1);
    tb_free(tb);
 }
+#endif
+
+struct tb_desc {
+    target_ulong pc;
+    target_ulong cs_base;
+    CPUArchState *env;
+    tb_page_addr_t phys_page1;
+    uint32_t flags;
+};
+
+static bool tb_cmp(const void *p, const void *d)
+{
+    const TranslationBlock *tb = p;
+    const struct tb_desc *desc = d;
+
+    if (tb->pc == desc->pc &&
+        tb->page_addr[0] == desc->phys_page1 &&
+        tb->cs_base == desc->cs_base &&
+        tb->flags == desc->flags) {
+        /* check next page if needed */
+        if (tb->page_addr[1] == -1) {
+            return true;
+        } else {
+            tb_page_addr_t phys_page2;
+            target_ulong virt_page2;
+
+            virt_page2 = (desc->pc & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
+            phys_page2 = get_page_addr_code(desc->env, virt_page2);
+            if (tb->page_addr[1] == phys_page2) {
+                return true;
+            }
+        }
+    }
+    return false;
+}

 static TranslationBlock *tb_find_physical(CPUState *cpu,
                                          target_ulong pc,
                                          target_ulong cs_base,
-                                          uint64_t flags)
+                                          uint32_t flags)
 {
-    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
-    TranslationBlock *tb, **ptb1;
-    unsigned int h;
-    tb_page_addr_t phys_pc, phys_page1;
-    target_ulong virt_page2;
+    tb_page_addr_t phys_pc;
+    struct tb_desc desc;
+    uint32_t h;

-    tcg_ctx.tb_ctx.tb_invalidated_flag = 0;
-
-    /* find translated block using physical mappings */
-    phys_pc = get_page_addr_code(env, pc);
-    phys_page1 = phys_pc & TARGET_PAGE_MASK;
-    h = tb_phys_hash_func(phys_pc);
-    ptb1 = &tcg_ctx.tb_ctx.tb_phys_hash[h];
-    for(;;) {
-        tb = *ptb1;
-        if (!tb) {
-            return NULL;
-        }
-        if (tb->pc == pc &&
-            tb->page_addr[0] == phys_page1 &&
-            tb->cs_base == cs_base &&
-            tb->flags == flags) {
-            /* check next page if needed */
-            if (tb->page_addr[1] != -1) {
-                tb_page_addr_t phys_page2;
-
-                virt_page2 = (pc & TARGET_PAGE_MASK) +
-                    TARGET_PAGE_SIZE;
-                phys_page2 = get_page_addr_code(env, virt_page2);
-                if (tb->page_addr[1] == phys_page2) {
-                    break;
-                }
-            } else {
-                break;
-            }
-        }
-        ptb1 = &tb->phys_hash_next;
-    }
-
-    /* Move the TB to the head of the list */
-    *ptb1 = tb->phys_hash_next;
-    tb->phys_hash_next = tcg_ctx.tb_ctx.tb_phys_hash[h];
-    tcg_ctx.tb_ctx.tb_phys_hash[h] = tb;
-    return tb;
+    desc.env = (CPUArchState *)cpu->env_ptr;
+    desc.cs_base = cs_base;
+    desc.flags = flags;
+    desc.pc = pc;
+    phys_pc = get_page_addr_code(desc.env, pc);
+    desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
+    h = tb_hash_func(phys_pc, pc, flags);
+    return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h);
 }

 static TranslationBlock *tb_find_slow(CPUState *cpu,
                                      target_ulong pc,
                                      target_ulong cs_base,
-                                      uint64_t flags)
+                                      uint32_t flags)
 {
    TranslationBlock *tb;

@@ -309,26 +319,72 @@ found:
    return tb;
 }

-static inline TranslationBlock *tb_find_fast(CPUState *cpu)
+static inline TranslationBlock *tb_find_fast(CPUState *cpu,
+                                             TranslationBlock **last_tb,
+                                             int tb_exit)
 {
    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
    TranslationBlock *tb;
    target_ulong cs_base, pc;
-    int flags;
+    uint32_t flags;

    /* we record a subset of the CPU state. It will
       always be the same before a given translated block
       is executed. */
    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+    tb_lock();
    tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
    if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                 tb->flags != flags)) {
        tb = tb_find_slow(cpu, pc, cs_base, flags);
    }
+    if (cpu->tb_flushed) {
+        /* Ensure that no TB jump will be modified as the
+         * translation buffer has been flushed.
+         */
+        *last_tb = NULL;
+        cpu->tb_flushed = false;
+    }
+#ifndef CONFIG_USER_ONLY
+    /* We don't take care of direct jumps when address mapping changes in
+     * system emulation. So it's not safe to make a direct jump to a TB
+     * spanning two pages because the mapping for the second page can change.
+     */
+    if (tb->page_addr[1] != -1) {
+        *last_tb = NULL;
+    }
+#endif
+    /* See if we can patch the calling TB. */
+    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
+        tb_add_jump(*last_tb, tb_exit, tb);
+    }
+    tb_unlock();
    return tb;
 }

-static void cpu_handle_debug_exception(CPUState *cpu)
+static inline bool cpu_handle_halt(CPUState *cpu)
+{
+    if (cpu->halted) {
+#if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
+        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
+            && replay_interrupt()) {
+            X86CPU *x86_cpu = X86_CPU(cpu);
+            apic_poll_irq(x86_cpu->apic_state);
+            cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
+        }
+#endif
+        if (!cpu_has_work(cpu)) {
+            current_cpu = NULL;
+            return true;
+        }
+
+        cpu->halted = 0;
+    }
+
+    return false;
+}
+
+static inline void cpu_handle_debug_exception(CPUState *cpu)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUWatchpoint *wp;
@@ -342,37 +398,197 @@ static void cpu_handle_debug_exception(CPUState *cpu)
    cc->debug_excp_handler(cpu);
 }

+static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
+{
+    if (cpu->exception_index >= 0) {
+        if (cpu->exception_index >= EXCP_INTERRUPT) {
+            /* exit request from the cpu execution loop */
+            *ret = cpu->exception_index;
+            if (*ret == EXCP_DEBUG) {
+                cpu_handle_debug_exception(cpu);
+            }
+            cpu->exception_index = -1;
+            return true;
+        } else {
+#if defined(CONFIG_USER_ONLY)
+            /* if user mode only, we simulate a fake exception
+               which will be handled outside the cpu execution
+               loop */
+#if defined(TARGET_I386)
+            CPUClass *cc = CPU_GET_CLASS(cpu);
+            cc->do_interrupt(cpu);
+#endif
+            *ret = cpu->exception_index;
+            cpu->exception_index = -1;
+            return true;
+#else
+            if (replay_exception()) {
+                CPUClass *cc = CPU_GET_CLASS(cpu);
+                cc->do_interrupt(cpu);
+                cpu->exception_index = -1;
+            } else if (!replay_has_interrupt()) {
+                /* give a chance to iothread in replay mode */
+                *ret = EXCP_INTERRUPT;
+                return true;
+            }
+#endif
+        }
+#ifndef CONFIG_USER_ONLY
+    } else if (replay_has_exception()
+               && cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
+        /* try to cause an exception pending in the log */
+        TranslationBlock *last_tb = NULL; /* Avoid chaining TBs */
+        cpu_exec_nocache(cpu, 1, tb_find_fast(cpu, &last_tb, 0), true);
+        *ret = -1;
+        return true;
+#endif
+    }
+
+    return false;
+}
+
+static inline void cpu_handle_interrupt(CPUState *cpu,
+                                        TranslationBlock **last_tb)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+    int interrupt_request = cpu->interrupt_request;
+
+    if (unlikely(interrupt_request)) {
+        if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
+            /* Mask out external interrupts for this step. */
+            interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
+        }
+        if (interrupt_request & CPU_INTERRUPT_DEBUG) {
+            cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
+            cpu->exception_index = EXCP_DEBUG;
+            cpu_loop_exit(cpu);
+        }
+        if (replay_mode == REPLAY_MODE_PLAY && !replay_has_interrupt()) {
+            /* Do nothing */
+        } else if (interrupt_request & CPU_INTERRUPT_HALT) {
+            replay_interrupt();
+            cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
+            cpu->halted = 1;
+            cpu->exception_index = EXCP_HLT;
+            cpu_loop_exit(cpu);
+        }
+#if defined(TARGET_I386)
+        else if (interrupt_request & CPU_INTERRUPT_INIT) {
+            X86CPU *x86_cpu = X86_CPU(cpu);
+            CPUArchState *env = &x86_cpu->env;
+            replay_interrupt();
+            cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
+            do_cpu_init(x86_cpu);
+            cpu->exception_index = EXCP_HALTED;
+            cpu_loop_exit(cpu);
+        }
+#else
+        else if (interrupt_request & CPU_INTERRUPT_RESET) {
+            replay_interrupt();
+            cpu_reset(cpu);
+            cpu_loop_exit(cpu);
+        }
+#endif
+        /* The target hook has 3 exit conditions:
+           False when the interrupt isn't processed,
+           True when it is, and we should restart on a new TB,
+           and via longjmp via cpu_loop_exit.  */
+        else {
+            replay_interrupt();
+            if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
+                *last_tb = NULL;
+            }
+            /* The target hook may have updated the 'cpu->interrupt_request';
+             * reload the 'interrupt_request' value */
+            interrupt_request = cpu->interrupt_request;
+        }
+        if (interrupt_request & CPU_INTERRUPT_EXITTB) {
+            cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
+            /* ensure that no TB jump will be modified as
+               the program flow was changed */
+            *last_tb = NULL;
+        }
+    }
+    if (unlikely(cpu->exit_request || replay_has_interrupt())) {
+        cpu->exit_request = 0;
+        cpu->exception_index = EXCP_INTERRUPT;
+        cpu_loop_exit(cpu);
+    }
+}
+
+static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
+                                    TranslationBlock **last_tb, int *tb_exit,
+                                    SyncClocks *sc)
+{
+    uintptr_t ret;
+
+    if (unlikely(cpu->exit_request)) {
+        return;
+    }
+
+    trace_exec_tb(tb, tb->pc);
+    ret = cpu_tb_exec(cpu, tb);
+    *last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
+    *tb_exit = ret & TB_EXIT_MASK;
+    switch (*tb_exit) {
+    case TB_EXIT_REQUESTED:
+        /* Something asked us to stop executing
+         * chained TBs; just continue round the main
+         * loop. Whatever requested the exit will also
+         * have set something else (eg exit_request or
+         * interrupt_request) which we will handle
+         * next time around the loop.  But we need to
+         * ensure the tcg_exit_req read in generated code
+         * comes before the next read of cpu->exit_request
+         * or cpu->interrupt_request.
+         */
+        smp_rmb();
+        *last_tb = NULL;
+        break;
+    case TB_EXIT_ICOUNT_EXPIRED:
+    {
+        /* Instruction counter expired.  */
+#ifdef CONFIG_USER_ONLY
+        abort();
+#else
+        int insns_left = cpu->icount_decr.u32;
+        if (cpu->icount_extra && insns_left >= 0) {
+            /* Refill decrementer and continue execution.  */
+            cpu->icount_extra += insns_left;
+            insns_left = MIN(0xffff, cpu->icount_extra);
+            cpu->icount_extra -= insns_left;
+            cpu->icount_decr.u16.low = insns_left;
+        } else {
+            if (insns_left > 0) {
+                /* Execute remaining instructions.  */
+                cpu_exec_nocache(cpu, insns_left, *last_tb, false);
+                align_clocks(sc, cpu);
+            }
+            cpu->exception_index = EXCP_INTERRUPT;
+            *last_tb = NULL;
+            cpu_loop_exit(cpu);
+        }
+        break;
+#endif
+    }
+    default:
+        break;
+    }
+}
+
 /* main execution loop */

 int cpu_exec(CPUState *cpu)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
-#ifdef TARGET_I386
-    X86CPU *x86_cpu = X86_CPU(cpu);
-    CPUArchState *env = &x86_cpu->env;
-#endif
-    int ret, interrupt_request;
-    TranslationBlock *tb;
-    uintptr_t next_tb;
+    int ret;
    SyncClocks sc;

    /* replay_interrupt may need current_cpu */
    current_cpu = cpu;

-    if (cpu->halted) {
-#if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
-        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
-            && replay_interrupt()) {
-            apic_poll_irq(x86_cpu->apic_state);
-            cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
-        }
-#endif
-        if (!cpu_has_work(cpu)) {
-            current_cpu = NULL;
-            return EXCP_HALTED;
-        }
-
-        cpu->halted = 0;
+    if (cpu_handle_halt(cpu)) {
+        return EXCP_HALTED;
    }

    atomic_mb_set(&tcg_current_cpu, cpu);
@@ -391,185 +607,26 @@ int cpu_exec(CPUState *cpu)
     */
    init_delay_params(&sc, cpu);

-    /* prepare setjmp context for exception handling */
    for(;;) {
+        TranslationBlock *tb, *last_tb;
+        int tb_exit = 0;
+
+        /* prepare setjmp context for exception handling */
        if (sigsetjmp(cpu->jmp_env, 0) == 0) {
            /* if an exception is pending, we execute it here */
-            if (cpu->exception_index >= 0) {
-                if (cpu->exception_index >= EXCP_INTERRUPT) {
-                    /* exit request from the cpu execution loop */
-                    ret = cpu->exception_index;
-                    if (ret == EXCP_DEBUG) {
-                        cpu_handle_debug_exception(cpu);
-                    }
-                    cpu->exception_index = -1;
-                    break;
-                } else {
-#if defined(CONFIG_USER_ONLY)
-                    /* if user mode only, we simulate a fake exception
-                       which will be handled outside the cpu execution
-                       loop */
-#if defined(TARGET_I386)
-                    cc->do_interrupt(cpu);
-#endif
-                    ret = cpu->exception_index;
-                    cpu->exception_index = -1;
-                    break;
-#else
-                    if (replay_exception()) {
-                        cc->do_interrupt(cpu);
-                        cpu->exception_index = -1;
-                    } else if (!replay_has_interrupt()) {
-                        /* give a chance to iothread in replay mode */
-                        ret = EXCP_INTERRUPT;
-                        break;
-                    }
-#endif
-                }
-            } else if (replay_has_exception()
-                       && cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
-                /* try to cause an exception pending in the log */
-                cpu_exec_nocache(cpu, 1, tb_find_fast(cpu), true);
-                ret = -1;
+            if (cpu_handle_exception(cpu, &ret)) {
                break;
            }

-            next_tb = 0; /* force lookup of first TB */
+            last_tb = NULL; /* forget the last executed TB after exception */
+            cpu->tb_flushed = false; /* reset before first TB lookup */
            for(;;) {
-                interrupt_request = cpu->interrupt_request;
-                if (unlikely(interrupt_request)) {
-                    if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
-                        /* Mask out external interrupts for this step. */
-                        interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
-                    }
-                    if (interrupt_request & CPU_INTERRUPT_DEBUG) {
-                        cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
-                        cpu->exception_index = EXCP_DEBUG;
-                        cpu_loop_exit(cpu);
-                    }
-                    if (replay_mode == REPLAY_MODE_PLAY
-                        && !replay_has_interrupt()) {
-                        /* Do nothing */
-                    } else if (interrupt_request & CPU_INTERRUPT_HALT) {
-                        replay_interrupt();
-                        cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
-                        cpu->halted = 1;
-                        cpu->exception_index = EXCP_HLT;
-                        cpu_loop_exit(cpu);
-                    }
-#if defined(TARGET_I386)
-                    else if (interrupt_request & CPU_INTERRUPT_INIT) {
-                        replay_interrupt();
-                        cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
-                        do_cpu_init(x86_cpu);
-                        cpu->exception_index = EXCP_HALTED;
-                        cpu_loop_exit(cpu);
-                    }
-#else
-                    else if (interrupt_request & CPU_INTERRUPT_RESET) {
-                        replay_interrupt();
-                        cpu_reset(cpu);
-                        cpu_loop_exit(cpu);
-                    }
-#endif
-                    /* The target hook has 3 exit conditions:
-                       False when the interrupt isn't processed,
-                       True when it is, and we should restart on a new TB,
-                       and via longjmp via cpu_loop_exit.  */
-                    else {
-                        replay_interrupt();
-                        if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
-                            next_tb = 0;
-                        }
-                    }
-                    /* Don't use the cached interrupt_request value,
-                       do_interrupt may have updated the EXITTB flag. */
-                    if (cpu->interrupt_request & CPU_INTERRUPT_EXITTB) {
-                        cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
-                        /* ensure that no TB jump will be modified as
-                           the program flow was changed */
-                        next_tb = 0;
-                    }
-                }
-                if (unlikely(cpu->exit_request
-                             || replay_has_interrupt())) {
-                    cpu->exit_request = 0;
-                    cpu->exception_index = EXCP_INTERRUPT;
-                    cpu_loop_exit(cpu);
-                }
-                tb_lock();
-                tb = tb_find_fast(cpu);
-                /* Note: we do it here to avoid a gcc bug on Mac OS X when
-                   doing it in tb_find_slow */
-                if (tcg_ctx.tb_ctx.tb_invalidated_flag) {
-                    /* as some TB could have been invalidated because
-                       of memory exceptions while generating the code, we
-                       must recompute the hash index here */
-                    next_tb = 0;
-                    tcg_ctx.tb_ctx.tb_invalidated_flag = 0;
-                }
-                /* see if we can patch the calling TB. When the TB
-                   spans two pages, we cannot safely do a direct
-                   jump. */
-                if (next_tb != 0 && tb->page_addr[1] == -1
-                    && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-                    tb_add_jump((TranslationBlock *)(next_tb & ~TB_EXIT_MASK),
-                                next_tb & TB_EXIT_MASK, tb);
-                }
-                tb_unlock();
-                if (likely(!cpu->exit_request)) {
-                    trace_exec_tb(tb, tb->pc);
-                    /* execute the generated code */
-                    cpu->current_tb = tb;
-                    next_tb = cpu_tb_exec(cpu, tb);
-                    cpu->current_tb = NULL;
-                    switch (next_tb & TB_EXIT_MASK) {
-                    case TB_EXIT_REQUESTED:
-                        /* Something asked us to stop executing
-                         * chained TBs; just continue round the main
-                         * loop. Whatever requested the exit will also
-                         * have set something else (eg exit_request or
-                         * interrupt_request) which we will handle
-                         * next time around the loop.  But we need to
-                         * ensure the tcg_exit_req read in generated code
-                         * comes before the next read of cpu->exit_request
-                         * or cpu->interrupt_request.
-                         */
-                        smp_rmb();
-                        next_tb = 0;
-                        break;
-                    case TB_EXIT_ICOUNT_EXPIRED:
-                    {
-                        /* Instruction counter expired.  */
-                        int insns_left = cpu->icount_decr.u32;
-                        if (cpu->icount_extra && insns_left >= 0) {
-                            /* Refill decrementer and continue execution.  */
-                            cpu->icount_extra += insns_left;
-                            insns_left = MIN(0xffff, cpu->icount_extra);
-                            cpu->icount_extra -= insns_left;
-                            cpu->icount_decr.u16.low = insns_left;
-                        } else {
-                            if (insns_left > 0) {
-                                /* Execute remaining instructions.  */
-                                tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
-                                cpu_exec_nocache(cpu, insns_left, tb, false);
-                                align_clocks(&sc, cpu);
-                            }
-                            cpu->exception_index = EXCP_INTERRUPT;
-                            next_tb = 0;
-                            cpu_loop_exit(cpu);
-                        }
-                        break;
-                    }
-                    default:
-                        break;
-                    }
-                }
+                cpu_handle_interrupt(cpu, &last_tb);
+                tb = tb_find_fast(cpu, &last_tb, tb_exit);
+                cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit, &sc);
                /* Try to align the host and virtual clocks
                   if the guest is in advance */
                align_clocks(&sc, cpu);
-                /* reset soft MMU for next block (it can currently
-                   only be set by a memory fault) */
            } /* for(;;) */
        } else {
 #if defined(__clang__) || !QEMU_GNUC_PREREQ(4, 6)
@@ -579,18 +636,10 @@ int cpu_exec(CPUState *cpu)
             * Newer versions of gcc would complain about this code (-Wclobbered). */
            cpu = current_cpu;
            cc = CPU_GET_CLASS(cpu);
-#ifdef TARGET_I386
-            x86_cpu = X86_CPU(cpu);
-            env = &x86_cpu->env;
-#endif
 #else /* buggy compiler */
            /* Assert that the compiler does not smash local variables. */
            g_assert(cpu == current_cpu);
            g_assert(cc == CPU_GET_CLASS(cpu));
-#ifdef TARGET_I386
-            g_assert(x86_cpu == X86_CPU(cpu));
-            g_assert(env == &x86_cpu->env);
-#endif
 #endif /* buggy compiler */
            cpu->can_do_io = 1;
            tb_lock_reset();
--- a/cpus.c
+++ b/cpus.c
@@ -24,7 +24,8 @@

 /* Needed early for CONFIG_BSD etc. */
 #include "qemu/osdep.h"
-
+#include "qemu-common.h"
+#include "cpu.h"
 #include "monitor/monitor.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
@@ -34,6 +35,7 @@
 #include "sysemu/dma.h"
 #include "sysemu/kvm.h"
 #include "qmp-commands.h"
+#include "exec/exec-all.h"

 #include "qemu/thread.h"
 #include "sysemu/cpus.h"
@@ -247,13 +249,13 @@ int64_t cpu_get_clock(void)
 void cpu_enable_ticks(void)
 {
    /* Here, the really thing protected by seqlock is cpu_clock_offset. */
-    seqlock_write_lock(&timers_state.vm_clock_seqlock);
+    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
-    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+    seqlock_write_end(&timers_state.vm_clock_seqlock);
 }

 /* disable cpu_get_ticks() : the clock is stopped. You must not call
@@ -263,13 +265,13 @@ void cpu_enable_ticks(void)
 void cpu_disable_ticks(void)
 {
    /* Here, the really thing protected by seqlock is cpu_clock_offset. */
-    seqlock_write_lock(&timers_state.vm_clock_seqlock);
+    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
-    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+    seqlock_write_end(&timers_state.vm_clock_seqlock);
 }

 /* Correlation between real and virtual time is always going to be
@@ -292,7 +294,7 @@ static void icount_adjust(void)
        return;
    }

-    seqlock_write_lock(&timers_state.vm_clock_seqlock);
+    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

@@ -313,7 +315,7 @@ static void icount_adjust(void)
    last_delta = delta;
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
-    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+    seqlock_write_end(&timers_state.vm_clock_seqlock);
 }

 static void icount_adjust_rt(void *opaque)
@@ -353,7 +355,7 @@ static void icount_warp_rt(void)
        return;
    }

-    seqlock_write_lock(&timers_state.vm_clock_seqlock);
+    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
@@ -372,7 +374,7 @@ static void icount_warp_rt(void)
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
-    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
@@ -397,9 +399,9 @@ void qtest_clock_warp(int64_t dest)
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

-        seqlock_write_lock(&timers_state.vm_clock_seqlock);
+        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
-        seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+        seqlock_write_end(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
@@ -466,9 +468,9 @@ void qemu_start_warp_timer(void)
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
-            seqlock_write_lock(&timers_state.vm_clock_seqlock);
+            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
-            seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
@@ -479,11 +481,11 @@ void qemu_start_warp_timer(void)
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
-            seqlock_write_lock(&timers_state.vm_clock_seqlock);
+            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
-            seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
@@ -619,7 +621,7 @@ int cpu_throttle_get_percentage(void)

 void cpu_ticks_init(void)
 {
-    seqlock_init(&timers_state.vm_clock_seqlock, NULL);
+    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
@@ -778,7 +780,7 @@ static void sigbus_reraise(void)
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
-        sigprocmask(SIG_UNBLOCK, &set, NULL);
+        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
@@ -970,6 +972,18 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
    qemu_cpu_kick(cpu);
 }

+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
+{
+    if (kvm_destroy_vcpu(cpu) < 0) {
+        error_report("kvm_destroy_vcpu failed");
+        exit(EXIT_FAILURE);
+    }
+}
+
+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
+{
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
    struct qemu_work_item *wi;
@@ -1059,7 +1073,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

-    while (1) {
+    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
@@ -1067,8 +1081,12 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
            }
        }
        qemu_kvm_wait_io_event(cpu);
-    }
+    } while (!cpu->unplug || cpu_can_run(cpu));

+    qemu_kvm_destroy_vcpu(cpu);
+    cpu->created = false;
+    qemu_cond_signal(&qemu_cpu_cond);
+    qemu_mutex_unlock_iothread();
    return NULL;
 }

@@ -1122,6 +1140,7 @@ static void tcg_exec_all(void);
 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
    CPUState *cpu = arg;
+    CPUState *remove_cpu = NULL;

    rcu_register_thread();

@@ -1159,6 +1178,18 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
            }
        }
        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+        CPU_FOREACH(cpu) {
+            if (cpu->unplug && !cpu_can_run(cpu)) {
+                remove_cpu = cpu;
+                break;
+            }
+        }
+        if (remove_cpu) {
+            qemu_tcg_destroy_vcpu(remove_cpu);
+            cpu->created = false;
+            qemu_cond_signal(&qemu_cpu_cond);
+            remove_cpu = NULL;
+        }
    }

    return NULL;
@@ -1315,6 +1346,21 @@ void resume_all_vcpus(void)
    }
 }

+void cpu_remove(CPUState *cpu)
+{
+    cpu->stop = true;
+    cpu->unplug = true;
+    qemu_cpu_kick(cpu);
+}
+
+void cpu_remove_sync(CPUState *cpu)
+{
+    cpu_remove(cpu);
+    while (cpu->created) {
+        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
+    }
+}
+
 /* For temporary buffers for forming a name */
 #define VCPU_THREAD_NAME_SIZE 16

@@ -1531,6 +1577,9 @@ static void tcg_exec_all(void)
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
+            if (cpu->unplug) {
+                next_cpu = CPU_NEXT(cpu);
+            }
            break;
        }
    }
@@ -1691,21 +1740,7 @@ exit:

 void qmp_inject_nmi(Error **errp)
 {
-#if defined(TARGET_I386)
-    CPUState *cs;
-
-    CPU_FOREACH(cs) {
-        X86CPU *cpu = X86_CPU(cs);
-
-        if (!cpu->apic_state) {
-            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
-        } else {
-            apic_deliver_nmi(cpu->apic_state);
-        }
-    }
-#else
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
-#endif
 }

 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
--- a/cputlb.c
+++ b/cputlb.c
@@ -28,7 +28,10 @@

 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"
+#include "exec/exec-all.h"
 #include "tcg/tcg.h"
+#include "qemu/error-report.h"
+#include "exec/log.h"

 /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
 /* #define DEBUG_TLB */
@@ -76,10 +79,6 @@ void tlb_flush(CPUState *cpu, int flush_global)

    tlb_debug("(%d)\n", flush_global);

-    /* must reset current TB so that interrupts cannot modify the
-       links while we are modifying them */
-    cpu->current_tb = NULL;
-
    memset(env->tlb_table, -1, sizeof(env->tlb_table));
    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
@@ -95,9 +94,6 @@ static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
    CPUArchState *env = cpu->env_ptr;

    tlb_debug("start\n");
-    /* must reset current TB so that interrupts cannot modify the
-       links while we are modifying them */
-    cpu->current_tb = NULL;

    for (;;) {
        int mmu_idx = va_arg(argp, int);
@@ -152,9 +148,6 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
        tlb_flush(cpu, 1);
        return;
    }
-    /* must reset current TB so that interrupts cannot modify the
-       links while we are modifying them */
-    cpu->current_tb = NULL;

    addr &= TARGET_PAGE_MASK;
    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
@@ -193,9 +186,6 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
        va_end(argp);
        return;
    }
-    /* must reset current TB so that interrupts cannot modify the
-       links while we are modifying them */
-    cpu->current_tb = NULL;

    addr &= TARGET_PAGE_MASK;
    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
@@ -258,7 +248,8 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
 {
    ram_addr_t ram_addr;

-    if (qemu_ram_addr_from_host(ptr, &ram_addr) == NULL) {
+    ram_addr = qemu_ram_addr_from_host(ptr);
+    if (ram_addr == RAM_ADDR_INVALID) {
        fprintf(stderr, "Bad ram pointer %p\n", ptr);
        abort();
    }
@@ -438,6 +429,39 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
                            prot, mmu_idx, size);
 }

+static void report_bad_exec(CPUState *cpu, target_ulong addr)
+{
+    /* Accidentally executing outside RAM or ROM is quite common for
+     * several user-error situations, so report it in a way that
+     * makes it clear that this isn't a QEMU bug and provide suggestions
+     * about what a user could do to fix things.
+     */
+    error_report("Trying to execute code outside RAM or ROM at 0x"
+                 TARGET_FMT_lx, addr);
+    error_printf("This usually means one of the following happened:\n\n"
+                 "(1) You told QEMU to execute a kernel for the wrong machine "
+                 "type, and it crashed on startup (eg trying to run a "
+                 "raspberry pi kernel on a versatilepb QEMU machine)\n"
+                 "(2) You didn't give QEMU a kernel or BIOS filename at all, "
+                 "and QEMU executed a ROM full of no-op instructions until "
+                 "it fell off the end\n"
+                 "(3) Your guest kernel has a bug and crashed by jumping "
+                 "off into nowhere\n\n"
+                 "This is almost always one of the first two, so check your "
+                 "command line and that you are using the right type of kernel "
+                 "for this machine.\n"
+                 "If you think option (3) is likely then you can try debugging "
+                 "your guest with the -d debug options; in particular "
+                 "-d guest_errors will cause the log to include a dump of the "
+                 "guest register state at this point.\n\n"
+                 "Execution cannot continue; stopping here.\n\n");
+
+    /* Report also to the logs, with more detail including register dump */
+    qemu_log_mask(LOG_GUEST_ERROR, "qemu: fatal: Trying to execute code "
+                  "outside RAM or ROM at 0x" TARGET_FMT_lx "\n", addr);
+    log_cpu_state_mask(LOG_GUEST_ERROR, cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
+}
+
 /* NOTE: this function can trigger an exception */
 /* NOTE2: the returned address is not exactly the physical address: it
 * is actually a ram_addr_t (in system mode; the user mode emulation
@@ -466,8 +490,8 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
        if (cc->do_unassigned_access) {
            cc->do_unassigned_access(cpu, addr, false, true, 0, 4);
        } else {
-            cpu_abort(cpu, "Trying to execute code outside RAM or ROM at 0x"
-                      TARGET_FMT_lx "\n", addr);
+            report_bad_exec(cpu, addr);
+            exit(1);
        }
    }
    p = (void *)((uintptr_t)addr + env1->tlb_table[mmu_idx][page_index].addend);
--- a/crypto/afsplit.c
+++ b/crypto/afsplit.c
@@ -24,6 +24,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/bswap.h"
 #include "crypto/afsplit.h"
 #include "crypto/random.h"

--- a/crypto/block-luks.c
+++ b/crypto/block-luks.c
@@ -20,6 +20,7 @@

 #include "qemu/osdep.h"
 #include "qapi/error.h"
+#include "qemu/bswap.h"

 #include "crypto/block-luks.h"

@@ -1080,8 +1081,7 @@ qcrypto_block_luks_create(QCryptoBlock *block,
        luks->header.key_slots[i].key_offset =
            (QCRYPTO_BLOCK_LUKS_KEY_SLOT_OFFSET /
             QCRYPTO_BLOCK_LUKS_SECTOR_SIZE) +
-            (ROUND_UP(((splitkeylen + (QCRYPTO_BLOCK_LUKS_SECTOR_SIZE - 1)) /
-                       QCRYPTO_BLOCK_LUKS_SECTOR_SIZE),
+            (ROUND_UP(DIV_ROUND_UP(splitkeylen, QCRYPTO_BLOCK_LUKS_SECTOR_SIZE),
                      (QCRYPTO_BLOCK_LUKS_KEY_SLOT_OFFSET /
                       QCRYPTO_BLOCK_LUKS_SECTOR_SIZE)) * i);
    }
@@ -1181,8 +1181,7 @@ qcrypto_block_luks_create(QCryptoBlock *block,
    luks->header.payload_offset =
        (QCRYPTO_BLOCK_LUKS_KEY_SLOT_OFFSET /
         QCRYPTO_BLOCK_LUKS_SECTOR_SIZE) +
-        (ROUND_UP(((splitkeylen + (QCRYPTO_BLOCK_LUKS_SECTOR_SIZE - 1)) /
-                   QCRYPTO_BLOCK_LUKS_SECTOR_SIZE),
+        (ROUND_UP(DIV_ROUND_UP(splitkeylen, QCRYPTO_BLOCK_LUKS_SECTOR_SIZE),
                  (QCRYPTO_BLOCK_LUKS_KEY_SLOT_OFFSET /
                   QCRYPTO_BLOCK_LUKS_SECTOR_SIZE)) *
         QCRYPTO_BLOCK_LUKS_NUM_KEY_SLOTS);
--- a/crypto/hash.c
+++ b/crypto/hash.c
@@ -36,9 +36,7 @@ static size_t qcrypto_hash_alg_size[QCRYPTO_HASH_ALG__MAX] = {

 size_t qcrypto_hash_digest_len(QCryptoHashAlgorithm alg)
 {
-    if (alg >= G_N_ELEMENTS(qcrypto_hash_alg_size)) {
-        return 0;
-    }
+    assert(alg < G_N_ELEMENTS(qcrypto_hash_alg_size));
    return qcrypto_hash_alg_size[alg];
 }

--- a/crypto/tlscredsx509.c
+++ b/crypto/tlscredsx509.c
@@ -392,11 +392,14 @@ qcrypto_tls_creds_load_cert(QCryptoTLSCredsX509 *creds,
    gsize buflen;
    GError *gerr;
    int ret = -1;
+    int err;

    trace_qcrypto_tls_creds_x509_load_cert(creds, isServer, certFile);

-    if (gnutls_x509_crt_init(&cert) < 0) {
-        error_setg(errp, "Unable to initialize certificate");
+    err = gnutls_x509_crt_init(&cert);
+    if (err < 0) {
+        error_setg(errp, "Unable to initialize certificate: %s",
+                   gnutls_strerror(err));
        goto cleanup;
    }

@@ -410,11 +413,13 @@ qcrypto_tls_creds_load_cert(QCryptoTLSCredsX509 *creds,
    data.data = (unsigned char *)buf;
    data.size = strlen(buf);

-    if (gnutls_x509_crt_import(cert, &data, GNUTLS_X509_FMT_PEM) < 0) {
+    err = gnutls_x509_crt_import(cert, &data, GNUTLS_X509_FMT_PEM);
+    if (err < 0) {
        error_setg(errp, isServer ?
-                   "Unable to import server certificate %s" :
-                   "Unable to import client certificate %s",
-                   certFile);
+                   "Unable to import server certificate %s: %s" :
+                   "Unable to import client certificate %s: %s",
+                   certFile,
+                   gnutls_strerror(err));
        goto cleanup;
    }

--- a/crypto/trace-events
+++ b/crypto/trace-events
@@ -0,0 +1,19 @@
+# See docs/trace-events.txt for syntax documentation.
+
+# crypto/tlscreds.c
+qcrypto_tls_creds_load_dh(void *creds, const char *filename) "TLS creds load DH creds=%p filename=%s"
+qcrypto_tls_creds_get_path(void *creds, const char *filename, const char *path) "TLS creds path creds=%p filename=%s path=%s"
+
+# crypto/tlscredsanon.c
+qcrypto_tls_creds_anon_load(void *creds, const char *dir) "TLS creds anon load creds=%p dir=%s"
+
+# crypto/tlscredsx509.c
+qcrypto_tls_creds_x509_load(void *creds, const char *dir) "TLS creds x509 load creds=%p dir=%s"
+qcrypto_tls_creds_x509_check_basic_constraints(void *creds, const char *file, int status) "TLS creds x509 check basic constraints creds=%p file=%s status=%d"
+qcrypto_tls_creds_x509_check_key_usage(void *creds, const char *file, int status, int usage, int critical) "TLS creds x509 check key usage creds=%p file=%s status=%d usage=%d critical=%d"
+qcrypto_tls_creds_x509_check_key_purpose(void *creds, const char *file, int status, const char *usage, int critical) "TLS creds x509 check key usage creds=%p file=%s status=%d usage=%s critical=%d"
+qcrypto_tls_creds_x509_load_cert(void *creds, int isServer, const char *file) "TLS creds x509 load cert creds=%p isServer=%d file=%s"
+qcrypto_tls_creds_x509_load_cert_list(void *creds, const char *file) "TLS creds x509 load cert list creds=%p file=%s"
+
+# crypto/tlssession.c
+qcrypto_tls_session_new(void *session, void *creds, const char *hostname, const char *aclname, int endpoint) "TLS session new session=%p creds=%p hostname=%s aclname=%s endpoint=%d"
--- a/default-configs/aarch64-softmmu.mak
+++ b/default-configs/aarch64-softmmu.mak
@@ -3,4 +3,7 @@
 # We support all the 32 bit boards so need all their config
 include arm-softmmu.mak

+CONFIG_AUX=y
+CONFIG_DDC=y
+CONFIG_DPCD=y
 CONFIG_XLNX_ZYNQMP=y
--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -100,6 +100,7 @@ CONFIG_ALLWINNER_A10_PIT=y
 CONFIG_ALLWINNER_A10_PIC=y
 CONFIG_ALLWINNER_A10=y

+CONFIG_FSL_IMX6=y
 CONFIG_FSL_IMX31=y
 CONFIG_FSL_IMX25=y

--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -18,6 +18,7 @@ CONFIG_MEGASAS_SCSI_PCI=y
 CONFIG_MPTSAS_SCSI_PCI=y
 CONFIG_RTL8139_PCI=y
 CONFIG_E1000_PCI=y
+CONFIG_E1000E_PCI=y
 CONFIG_VMXNET3_PCI=y
 CONFIG_IDE_CORE=y
 CONFIG_IDE_QDEV=y
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -49,6 +49,7 @@ CONFIG_ETSEC=y
 CONFIG_LIBDECNUMBER=y
 # For pSeries
 CONFIG_XICS=$(CONFIG_PSERIES)
+CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
 CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
 # For PReP
 CONFIG_MC146818RTC=y
--- a/device_tree.c
+++ b/device_tree.c
@@ -20,6 +20,7 @@
 #include "qapi/error.h"
 #include "qemu-common.h"
 #include "qemu/error-report.h"
+#include "qemu/bswap.h"
 #include "sysemu/device_tree.h"
 #include "sysemu/sysemu.h"
 #include "hw/loader.h"
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -70,16 +70,17 @@ void qemu_sglist_destroy(QEMUSGList *qsg)

 typedef struct {
    BlockAIOCB common;
-    BlockBackend *blk;
+    AioContext *ctx;
    BlockAIOCB *acb;
    QEMUSGList *sg;
-    uint64_t sector_num;
+    uint64_t offset;
    DMADirection dir;
    int sg_cur_index;
    dma_addr_t sg_cur_byte;
    QEMUIOVector iov;
    QEMUBH *bh;
    DMAIOFunc *io_func;
+    void *io_func_opaque;
 } DMAAIOCB;

 static void dma_blk_cb(void *opaque, int ret);
@@ -130,7 +131,7 @@ static void dma_blk_cb(void *opaque, int ret)
    trace_dma_blk_cb(dbs, ret);

    dbs->acb = NULL;
-    dbs->sector_num += dbs->iov.size / 512;
+    dbs->offset += dbs->iov.size;

    if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
        dma_complete(dbs, ret);
@@ -154,8 +155,7 @@ static void dma_blk_cb(void *opaque, int ret)

    if (dbs->iov.size == 0) {
        trace_dma_map_wait(dbs);
-        dbs->bh = aio_bh_new(blk_get_aio_context(dbs->blk),
-                             reschedule_dma, dbs);
+        dbs->bh = aio_bh_new(dbs->ctx, reschedule_dma, dbs);
        cpu_register_map_client(dbs->bh);
        return;
    }
@@ -164,8 +164,8 @@ static void dma_blk_cb(void *opaque, int ret)
        qemu_iovec_discard_back(&dbs->iov, dbs->iov.size & ~BDRV_SECTOR_MASK);
    }

-    dbs->acb = dbs->io_func(dbs->blk, dbs->sector_num, &dbs->iov,
-                            dbs->iov.size / 512, dma_blk_cb, dbs);
+    dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
+                            dma_blk_cb, dbs, dbs->io_func_opaque);
    assert(dbs->acb);
 }

@@ -185,29 +185,38 @@ static void dma_aio_cancel(BlockAIOCB *acb)
    }
 }

+static AioContext *dma_get_aio_context(BlockAIOCB *acb)
+{
+    DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
+
+    return dbs->ctx;
+}

 static const AIOCBInfo dma_aiocb_info = {
    .aiocb_size         = sizeof(DMAAIOCB),
    .cancel_async       = dma_aio_cancel,
+    .get_aio_context    = dma_get_aio_context,
 };

-BlockAIOCB *dma_blk_io(
-    BlockBackend *blk, QEMUSGList *sg, uint64_t sector_num,
-    DMAIOFunc *io_func, BlockCompletionFunc *cb,
+BlockAIOCB *dma_blk_io(AioContext *ctx,
+    QEMUSGList *sg, uint64_t offset,
+    DMAIOFunc *io_func, void *io_func_opaque,
+    BlockCompletionFunc *cb,
    void *opaque, DMADirection dir)
 {
-    DMAAIOCB *dbs = blk_aio_get(&dma_aiocb_info, blk, cb, opaque);
+    DMAAIOCB *dbs = qemu_aio_get(&dma_aiocb_info, NULL, cb, opaque);

-    trace_dma_blk_io(dbs, blk, sector_num, (dir == DMA_DIRECTION_TO_DEVICE));
+    trace_dma_blk_io(dbs, io_func_opaque, offset, (dir == DMA_DIRECTION_TO_DEVICE));

    dbs->acb = NULL;
-    dbs->blk = blk;
    dbs->sg = sg;
-    dbs->sector_num = sector_num;
+    dbs->ctx = ctx;
+    dbs->offset = offset;
    dbs->sg_cur_index = 0;
    dbs->sg_cur_byte = 0;
    dbs->dir = dir;
    dbs->io_func = io_func;
+    dbs->io_func_opaque = io_func_opaque;
    dbs->bh = NULL;
    qemu_iovec_init(&dbs->iov, sg->nsg);
    dma_blk_cb(dbs, 0);
@@ -215,19 +224,39 @@ BlockAIOCB *dma_blk_io(
 }


+static
+BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov,
+                                 BlockCompletionFunc *cb, void *cb_opaque,
+                                 void *opaque)
+{
+    BlockBackend *blk = opaque;
+    return blk_aio_preadv(blk, offset, iov, 0, cb, cb_opaque);
+}
+
 BlockAIOCB *dma_blk_read(BlockBackend *blk,
-                         QEMUSGList *sg, uint64_t sector,
+                         QEMUSGList *sg, uint64_t offset,
                         void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk, sg, sector, blk_aio_readv, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk),
+                      sg, offset, dma_blk_read_io_func, blk, cb, opaque,
                      DMA_DIRECTION_FROM_DEVICE);
 }

+static
+BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov,
+                                  BlockCompletionFunc *cb, void *cb_opaque,
+                                  void *opaque)
+{
+    BlockBackend *blk = opaque;
+    return blk_aio_pwritev(blk, offset, iov, 0, cb, cb_opaque);
+}
+
 BlockAIOCB *dma_blk_write(BlockBackend *blk,
-                          QEMUSGList *sg, uint64_t sector,
+                          QEMUSGList *sg, uint64_t offset,
                          void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk, sg, sector, blk_aio_writev, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk),
+                      sg, offset, dma_blk_write_io_func, blk, cb, opaque,
                      DMA_DIRECTION_TO_DEVICE);
 }

--- a/docs/atomics.txt
+++ b/docs/atomics.txt
@@ -62,7 +62,7 @@ operations:
    typeof(*ptr) atomic_fetch_sub(ptr, val)
    typeof(*ptr) atomic_fetch_and(ptr, val)
    typeof(*ptr) atomic_fetch_or(ptr, val)
-    typeof(*ptr) atomic_xchg(ptr, val
+    typeof(*ptr) atomic_xchg(ptr, val)
    typeof(*ptr) atomic_cmpxchg(ptr, old, new)

 all of which return the old value of *ptr.  These operations are
@@ -326,21 +326,41 @@ and memory barriers, and the equivalents in QEMU:
  use a boxed atomic_t type; atomic operations in QEMU are polymorphic
  and use normal C types.

- atomic_read and atomic_set in Linux give no guarantee at all;
-  atomic_read and atomic_set in QEMU include a compiler barrier
-  (similar to the ACCESS_ONCE macro in Linux).
+- Originally, atomic_read and atomic_set in Linux gave no guarantee
+  at all. Linux 4.1 updated them to implement volatile
+  semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE).

- most atomic read-modify-write operations in Linux return void;
-  in QEMU, all of them return the old value of the variable.
+  QEMU's atomic_read/set implement, if the compiler supports it, C11
+  atomic relaxed semantics, and volatile semantics otherwise.
+  Both semantics prevent the compiler from doing certain transformations;
+  the difference is that atomic accesses are guaranteed to be atomic,
+  while volatile accesses aren't. Thus, in the volatile case we just cross
+  our fingers hoping that the compiler will generate atomic accesses,
+  since we assume the variables passed are machine-word sized and
+  properly aligned.
+  No barriers are implied by atomic_read/set in either Linux or QEMU.
+
+- atomic read-modify-write operations in Linux are of three kinds:
+
+         atomic_OP          returns void
+         atomic_OP_return   returns new value of the variable
+         atomic_fetch_OP    returns the old value of the variable
+         atomic_cmpxchg     returns the old value of the variable
+
+  In QEMU, the second kind does not exist.  Currently Linux has
+  atomic_fetch_or only.  QEMU provides and, or, inc, dec, add, sub.

 - different atomic read-modify-write operations in Linux imply
  a different set of memory barriers; in QEMU, all of them enforce
  sequential consistency, which means they imply full memory barriers
  before and after the operation.

- Linux does not have an equivalent of atomic_mb_read() and
-  atomic_mb_set().  In particular, note that set_mb() is a little
-  weaker than atomic_mb_set().
+- Linux does not have an equivalent of atomic_mb_set().  In particular,
+  note that smp_store_mb() is a little weaker than atomic_mb_set().
+  atomic_mb_read() compiles to the same instructions as Linux's
+  smp_load_acquire(), but this should be treated as an implementation
+  detail.  If required, QEMU might later add atomic_load_acquire() and
+  atomic_store_release() macros.


 SOURCES
--- a/docs/build-system.txt
+++ b/docs/build-system.txt
@@ -438,6 +438,11 @@ top level Makefile, so anything defined in this file will influence the
 entire build system. Care needs to be taken when writing rules for tests
 to ensure they only apply to the unit test execution / build.

+- tests/docker/Makefile.include
+
+Rules for Docker tests. Like tests/Makefile, this file is included
+directly by the top level Makefile, anything defined in this file will
+influence the entire build system.

 - po/Makefile

--- a/docs/igd-assign.txt
+++ b/docs/igd-assign.txt
@@ -0,0 +1,133 @@
+Intel Graphics Device (IGD) assignment with vfio-pci
+====================================================
+
+IGD has two different modes for assignment using vfio-pci:
+
+1) Universal Pass-Through (UPT) mode:
+
+   In this mode the IGD device is added as a *secondary* (ie. non-primary)
+   graphics device in combination with an emulated primary graphics device.
+   This mode *requires* guest driver support to remove the external
+   dependencies generally associated with IGD (see below).  Those guest
+   drivers only support this mode for Broadwell and newer IGD, according to
+   Intel.  Additionally, this mode by default, and as officially supported
+   by Intel, does not support direct video output.  The intention is to use
+   this mode either to provide hardware acceleration to the emulated graphics
+   or to use this mode in combination with guest-based remote access software,
+   for example VNC (see below for optional output support).  This mode
+   theoretically has no device specific handling dependencies on vfio-pci or
+   the VM firmware.
+
+2) "Legacy" mode:
+
+   In this mode the IGD device is intended to be the primary and exclusive
+   graphics device in the VM[1], as such QEMU does not facilitate any sort
+   of remote graphics to the VM in this mode.  A connected physical monitor
+   is the intended output device for IGD.  This mode includes several
+   requirements and restrictions:
+
+    * IGD must be given address 02.0 on the PCI root bus in the VM
+    * The host kernel must support vfio extensions for IGD (v4.6)
+    * vfio VGA support very likely needs to be enabled in the host kernel
+    * The VM firmware must support specific fw_cfg enablers for IGD
+    * The VM machine type must support a PCI host bridge at 00.0 (standard)
+    * The VM machine type must provide or allow to be created a special
+      ISA/LPC bridge device (vfio-pci-igd-lpc-bridge) on the root bus at
+      PCI address 1f.0.
+    * The IGD device must have a VGA ROM, either provided via the romfile
+      option or loaded automatically through vfio (standard).  rombar=0
+      will disable legacy mode support.
+    * Hotplug of the IGD device is not supported.
+    * The IGD device must be a SandyBridge or newer model device.
+
+For either mode, depending on the host kernel, the i915 driver in the host
+may generate faults and errors upon re-binding to an IGD device after it
+has been assigned to a VM.  It's therefore generally recommended to prevent
+such driver binding unless the host driver is known to work well for this.
+There are numerous ways to do this, i915 can be blacklisted on the host,
+the driver_override option can be used to ensure that only vfio-pci can bind
+to the device on the host[2], virsh nodedev-detach can be used to bind the
+device to vfio drivers and then managed='no' set in the VM xml to prevent
+re-binding to i915, etc.  Also note that IGD is also typically the primary
+graphics in the host and special options may be required beyond simply
+blacklisting i915 or using pci-stub/vfio-pci to take ownership of IGD as a
+PCI class device.  Lower level drivers exist that may still claim the device.
+It may therefore be necessary to use kernel boot options video=vesafb:off or
+video=efifb:off (depending on host BIOS/UEFI) or these can be combined to
+a catch-all, video=vesafb:off,efifb:off.  Error messages such as:
+
+    Failed to mmap 0000:00:02.0 BAR <>. Performance may be slow
+
+are a good indicator that such a problem exists.  The host files /proc/iomem
+and /proc/ioports are often useful for identifying drivers consuming ranges
+of the device to cause such conflicts.
+
+Additionally, IGD device are known to generate small numbers of DMAR faults
+when initially assigned.  It is believed that this is simply the IGD attempting
+to access the reserved GTT space after reset, which it no longer has access to
+when accessed from userspace.  So long as the DMAR faults are small in number
+and most importantly, not ongoing, these are not an indication of an error.
+
+Additionally++, analog VGA output (as opposed to digital outputs like HDMI,
+DVI, or DisplayPort) may be unsupported in some use cases.  In the author's
+experience, even DP to VGA adapters can be troublesome while adapters between
+digital formats work well.
+
+Usage
+=====
+The intention is for IGD assignment to be transparent for users and thus for
+management tools like libvirt.  To make use of legacy mode, simply remove all
+other graphics options and use "-nographic" and either "-vga none" or
+"-nodefaults", along with adding the device using vfio-pci:
+
+    -device vfio-pci,host=00:02.0,id=hostdev0,bus=pci.0,addr=0x2
+
+For UPT mode, retain the default emulated graphics and simply add the vfio-pci
+device making use of any other bus address other than 02.0.  libvirt will
+default to assigning the device a UPT compatible address while legacy mode
+users will need to manually edit the XML if using a tool like virt-manager
+where the VM device address is not expressly specified.
+
+An experimental vfio-pci option also exists to enable OpRegion, and thus
+external monitor support, for UPT mode.  This can be enabled by adding
+"x-igd-opregion=on" to the vfio-pci device options for the IGD device.  As
+with legacy mode, this requires the host to support features introduced in
+the v4.6 kernel.  If Intel chooses to embrace this support, the option may
+be made non-experimental in the future, opening it to libvirt support.
+
+Developer ABI
+=============
+Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware:
+
+1) "etc/igd-opregion"
+
+   This fw_cfg file exposes the OpRegion for the IGD device.  A reserved
+   region should be created below 4GB (recommended 4KB alignment), sized
+   sufficient for the fw_cfg file size, and the content of this file copied
+   to it.  The dword based address of this reserved memory region must also
+   be written to the ASLS register at offset 0xFC on the IGD device.  It is
+   recommended that firmware should make use of this fw_cfg entry for any
+   PCI class VGA device with Intel vendor ID.  Multiple of such devices
+   within a VM is undefined.
+
+2) "etc/igd-bdsm-size"
+
+   This fw_cfg file contains an 8-byte, little endian integer indicating
+   the size of the reserved memory region required for IGD stolen memory.
+   Firmware must allocate a reserved memory below 4GB with required 1MB
+   alignment equal to this size.  Additionally the base address of this
+   reserved region must be written to the dword BDSM register in PCI config
+   space of the IGD device at offset 0x5C.  As this support is related to
+   running the IGD ROM, which has other dependencies on the device appearing
+   at guest address 00:02.0, it's expected that this fw_cfg file is only
+   relevant to a single PCI class VGA device with Intel vendor ID, appearing
+   at PCI bus address 00:02.0.
+
+Footnotes
+=========
+[1] Nothing precludes adding additional emulated or assigned graphics devices
+    as non-primary, other than the combination typically not working.  I only
+    intend to set user expectations, others are welcome to find working
+    combinations or fix whatever issues prevent this from working in the common
+    case.
+[2] # echo "vfio-pci" > /sys/bus/pci/devices/0000:00:02.0/driver_override
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -403,8 +403,8 @@ listen thread:                     --- page -- page -- page -- page -- page --

 On receipt of CMD_PACKAGED (1)
   All the data associated with the package - the ( ... ) section in the
-diagram - is read into memory (into a QEMUSizedBuffer), and the main thread
-recurses into qemu_loadvm_state_main to process the contents of the package (2)
+diagram - is read into memory, and the main thread recurses into
+qemu_loadvm_state_main to process the contents of the package (2)
 which contains commands (3,6) and devices (4...)

 On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package)
--- a/docs/multi-thread-compression.txt
+++ b/docs/multi-thread-compression.txt
@@ -110,7 +110,7 @@ Usage
 =====
 1. Verify both the source and destination QEMU are able
 to support the multiple thread compression migration:
-    {qemu} info_migrate_capabilities
+    {qemu} info migrate_capabilities
    {qemu} ... compress: off ...

 2. Activate compression on the source:
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .6.0
 .6.50