Update version for v2.8.0-rc1 release

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Merge remote-tracking branch 'sstabellini/tags/xen-20161122-tag' into staging
2016-11-22 22:29:08 +00:00 · 2016-11-22 19:30:39 +00:00 · 2016-11-22 19:30:03 +00:00 · 2016-11-22 19:29:30 +00:00 · 2016-11-22 10:29:41 -08:00 · 2016-11-22 10:29:39 -08:00
985 changed files with 45446 additions and 17591 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -39,9 +39,7 @@
 /qmp-introspect.[ch]
 /qmp-marshal.c
 /qemu-doc.html
-/qemu-tech.html
 /qemu-doc.info
-/qemu-tech.info
 /qemu-img
 /qemu-nbd
 /qemu-options.def
@@ -55,6 +53,7 @@
 /qemu-monitor-info.texi
 /qemu-version.h
 /qemu-version.h.tmp
+/module_block.h
 /vscclient
 /fsdev/virtfs-proxy-helper
 *.[1-9]
--- a/.gitmodules
+++ b/.gitmodules
@@ -31,3 +31,6 @@
 [submodule "roms/u-boot"]
 	path = roms/u-boot
 	url = git://git.qemu-project.org/u-boot.git
+[submodule "roms/skiboot"]
+	path = roms/skiboot
+	url = git://git.qemu.org/skiboot.git
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,6 +9,7 @@ cache: ccache
 addons:
  apt:
    packages:
+      # Build dependencies
      - libaio-dev
      - libattr1-dev
      - libbrlapi-dev
@@ -89,6 +90,7 @@ matrix:
    - env: CONFIG=""
      os: osx
      compiler: clang
+    # Plain Trusty Build
    - env: CONFIG=""
      sudo: required
      addons:
@@ -99,3 +101,46 @@ matrix:
        - sudo apt-get build-dep -qq qemu
        - wget -O - http://people.linaro.org/~alex.bennee/qemu-submodule-git-seed.tar.xz | tar -xvJ
        - git submodule update --init --recursive
+    # Using newer GCC with sanitizers
+    - addons:
+        apt:
+          sources:
+            # PPAs for newer toolchains
+            - ubuntu-toolchain-r-test
+          packages:
+            # Extra toolchains
+            - gcc-5
+            - g++-5
+            # Build dependencies
+            - libaio-dev
+            - libattr1-dev
+            - libbrlapi-dev
+            - libcap-ng-dev
+            - libgnutls-dev
+            - libgtk-3-dev
+            - libiscsi-dev
+            - liblttng-ust-dev
+            - libnfs-dev
+            - libncurses5-dev
+            - libnss3-dev
+            - libpixman-1-dev
+            - libpng12-dev
+            - librados-dev
+            - libsdl1.2-dev
+            - libseccomp-dev
+            - libspice-protocol-dev
+            - libspice-server-dev
+            - libssh2-1-dev
+            - liburcu-dev
+            - libusb-1.0-0-dev
+            - libvte-2.90-dev
+            - sparse
+            - uuid-dev
+      language: generic
+      compiler: none
+      env:
+        - COMPILER_NAME=gcc CXX=g++-5 CC=gcc-5
+        - CONFIG="--cc=gcc-5 --cxx=g++-5 --disable-pie --disable-linux-user --with-coroutine=gthread"
+        - TEST_CMD=""
+      before_script:
+        - ./configure ${CONFIG} --extra-cflags="-g3 -O0 -fsanitize=thread -fuse-ld=gold" || cat config.log
--- a/CODING_STYLE
+++ b/CODING_STYLE
@@ -9,7 +9,7 @@ patches before submitting.
 Of course, the most important aspect in any coding style is whitespace.
 Crusty old coders who have trouble spotting the glasses on their noses
 can tell the difference between a tab and eight spaces from a distance
-of approximately fifteen parsecs.  Many a flamewar have been fought and
+of approximately fifteen parsecs.  Many a flamewar has been fought and
 lost on this issue.

 QEMU indents are four spaces.  Tabs are never used, except in Makefiles
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -63,6 +63,17 @@ W: http://wiki.qemu.org/SecurityProcess
 M: Michael S. Tsirkin <mst@redhat.com>
 L: secalert@redhat.com

+Trivial patches
+---------------
+Trivial patches
+M: Michael Tokarev <mjt@tls.msk.ru>
+M: Laurent Vivier <laurent@vivier.eu>
+S: Maintained
+L: qemu-trivial@nongnu.org
+K: ^Subject:.*(?i)trivial
+T: git git://git.corpit.ru/qemu.git trivial-patches
+T: git git://github.com/vivier/qemu.git trivial-patches
+
 Guest CPU cores (TCG):
 ----------------------
 Overall
@@ -107,6 +118,7 @@ S: Maintained
 F: target-arm/
 F: hw/arm/
 F: hw/cpu/a*mpcore.c
+F: include/hw/cpu/a*mpcore.h
 F: disas/arm.c
 F: disas/arm-a64.cc
 F: disas/libvixl/
@@ -116,6 +128,7 @@ M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
 S: Maintained
 F: target-cris/
 F: hw/cris/
+F: include/hw/cris/
 F: tests/tcg/cris/
 F: disas/cris.c

@@ -132,9 +145,10 @@ F: include/hw/lm32/
 F: tests/tcg/lm32/

 M68K
-S: Orphan
+M: Laurent Vivier <laurent@vivier.eu>
+S: Maintained
 F: target-m68k/
-F: hw/m68k/
+F: disas/m68k.c

 MicroBlaze
 M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
@@ -145,10 +159,17 @@ F: disas/microblaze.c

 MIPS
 M: Aurelien Jarno <aurelien@aurel32.net>
-M: Leon Alrae <leon.alrae@imgtec.com>
+M: Yongbok Kim <yongbok.kim@imgtec.com>
 S: Maintained
 F: target-mips/
 F: hw/mips/
+F: hw/misc/mips_*
+F: hw/intc/mips_gic.c
+F: hw/timer/mips_gictimer.c
+F: include/hw/mips/
+F: include/hw/misc/mips_*
+F: include/hw/intc/mips_gic.h
+F: include/hw/timer/mips_gictimer.h
 F: tests/tcg/mips/
 F: disas/mips.c

@@ -157,6 +178,8 @@ M: Anthony Green <green@moxielogic.com>
 S: Maintained
 F: target-moxie/
 F: disas/moxie.c
+F: hw/moxie/
+F: default-configs/moxie-softmmu.mak

 OpenRISC
 M: Jia Liu <proljc@gmail.com>
@@ -319,6 +342,9 @@ L: qemu-devel@nongnu.org
 M: Stefan Weil <sw@weilnetz.de>
 S: Maintained
 F: *win32*
+F: */*win32*
+F: include/*/*win32*
+X: qga/*win32*
 F: qemu.nsi

 ARM Machines
@@ -395,6 +421,7 @@ M: Peter Chubb <peter.chubb@nicta.com.au>
 L: qemu-arm@nongnu.org
 S: Odd fixes
 F: hw/*/imx*
+F: include/hw/*/imx*
 F: hw/arm/kzm.c
 F: include/hw/arm/fsl-imx31.h

@@ -403,6 +430,7 @@ M: Peter Maydell <peter.maydell@linaro.org>
 L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/integratorcp.c
+F: hw/misc/arm_integrator_debug.c

 Musicpal
 M: Jan Kiszka <jan.kiszka@web.de>
@@ -427,6 +455,7 @@ M: Peter Maydell <peter.maydell@linaro.org>
 L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/arm/realview*
+F: hw/cpu/realview_mpcore.c
 F: hw/intc/realview_gic.c
 F: include/hw/intc/realview_gic.h

@@ -439,6 +468,7 @@ F: hw/arm/spitz.c
 F: hw/arm/tosa.c
 F: hw/arm/z2.c
 F: hw/*/pxa2xx*
+F: hw/misc/mst_fpga.c
 F: include/hw/arm/pxa.h

 Stellaris
@@ -460,7 +490,8 @@ L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/xilinx_*
 F: hw/*/cadence_*
-F: hw/misc/zynq_slcr.c
+F: hw/misc/zynq*
+F: include/hw/misc/zynq*
 X: hw/ssi/xilinx_*

 Xilinx ZynqMP
@@ -479,6 +510,21 @@ S: Maintained
 F: hw/arm/virt-acpi-build.c
 F: include/hw/arm/virt-acpi-build.h

+STM32F205
+M: Alistair Francis <alistair@alistair23.me>
+S: Maintained
+F: hw/arm/stm32f205_soc.c
+F: hw/misc/stm32f2xx_syscfg.c
+F: hw/char/stm32f2xx_usart.c
+F: hw/timer/stm32f2xx_timer.c
+F: hw/adc/*
+F: hw/ssi/stm32f2xx_spi.c
+
+Netduino 2
+M: Alistair Francis <alistair@alistair23.me>
+S: Maintained
+F: hw/arm/netduino2.c
+
 CRIS Machines
 -------------
 Axis Dev88
@@ -504,6 +550,7 @@ M68K Machines
 an5206
 S: Orphan
 F: hw/m68k/an5206.c
+F: hw/m68k/mcf5206.c

 dummy_m68k
 S: Orphan
@@ -512,6 +559,9 @@ F: hw/m68k/dummy_m68k.c
 mcf5208
 S: Orphan
 F: hw/m68k/mcf5208.c
+F: hw/m68k/mcf_intc.c
+F: hw/char/mcf_uart.c
+F: hw/net/mcf_fec.c

 MicroBlaze Machines
 -------------------
@@ -605,6 +655,7 @@ S: Maintained
 F: hw/ppc/mac_oldworld.c
 F: hw/pci-host/grackle.c
 F: hw/misc/macio/
+F: hw/intc/heathrow_pic.c

 PReP
 L: qemu-devel@nongnu.org
@@ -613,6 +664,7 @@ S: Odd Fixes
 F: hw/ppc/prep.c
 F: hw/pci-host/prep.[hc]
 F: hw/isa/pc87312.[hc]
+F: pc-bios/ppc_rom.bin

 sPAPR
 M: David Gibson <david@gibson.dropbear.id.au>
@@ -626,6 +678,7 @@ F: include/hw/*/xics*
 F: pc-bios/spapr-rtas/*
 F: pc-bios/spapr-rtas.bin
 F: pc-bios/slof.bin
+F: pc-bios/skiboot.lid
 F: docs/specs/ppc-spapr-hcalls.txt
 F: docs/specs/ppc-spapr-hotplug.txt
 F: tests/spapr*
@@ -645,31 +698,40 @@ R2D
 M: Magnus Damm <magnus.damm@gmail.com>
 S: Maintained
 F: hw/sh4/r2d.c
+F: hw/intc/sh_intc.c
+F: hw/timer/sh_timer.c

 Shix
 M: Magnus Damm <magnus.damm@gmail.com>
-S: Orphan
+S: Odd Fixes
 F: hw/sh4/shix.c

 SPARC Machines
 --------------
 Sun4m
-M: Blue Swirl <blauwirbel@gmail.com>
 M: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
 S: Maintained
 F: hw/sparc/sun4m.c
+F: hw/dma/sparc32_dma.c
+F: hw/dma/sun4m_iommu.c
+F: hw/misc/eccmemctl.c
+F: hw/misc/slavio_misc.c
+F: include/hw/sparc/sparc32_dma.h
+F: include/hw/sparc/sun4m.h
+F: pc-bios/openbios-sparc32

 Sun4u
-M: Blue Swirl <blauwirbel@gmail.com>
 M: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
 S: Maintained
 F: hw/sparc64/sun4u.c
+F: pc-bios/openbios-sparc64

 Leon3
 M: Fabien Chouteau <chouteau@adacore.com>
 S: Maintained
 F: hw/sparc/leon3.c
 F: hw/*/grlib*
+F: include/hw/sparc/grlib.h

 S390 Machines
 -------------
@@ -772,6 +834,7 @@ M: John Snow <jsnow@redhat.com>
 L: qemu-block@nongnu.org
 S: Supported
 F: include/hw/ide.h
+F: include/hw/ide/
 F: hw/ide/
 F: hw/block/block.c
 F: hw/block/cdrom.c
@@ -908,6 +971,8 @@ virtio
 M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
 F: hw/*/virtio*
+F: hw/virtio/Makefile.objs
+F: hw/virtio/trace-events
 F: net/vhost-user.c
 F: include/hw/virtio/
 F: tests/virtio-balloon-test.c
@@ -963,6 +1028,13 @@ F: include/sysemu/rng*.h
 F: backends/rng*.c
 F: tests/virtio-rng-test.c

+virtio-crypto
+M: Gonglei <arei.gonglei@huawei.com>
+S: Supported
+F: hw/virtio/virtio-crypto.c
+F: hw/virtio/virtio-crypto-pci.c
+F: include/hw/virtio/virtio-crypto.h
+
 nvme
 M: Keith Busch <keith.busch@intel.com>
 L: qemu-block@nongnu.org
@@ -995,6 +1067,8 @@ Rocker
 M: Jiri Pirko <jiri@resnulli.us>
 S: Maintained
 F: hw/net/rocker/
+F: tests/rocker/
+F: docs/specs/rocker.txt

 NVDIMM
 M: Xiao Guangrong <guangrong.xiao@linux.intel.com>
@@ -1013,6 +1087,19 @@ M: Dmitry Fleytman <dmitry@daynix.com>
 S: Maintained
 F: hw/net/e1000e*

+Generic Loader
+M: Alistair Francis <alistair.francis@xilinx.com>
+S: Maintained
+F: hw/core/generic-loader.c
+F: include/hw/core/generic-loader.h
+
+CHRP NVRAM
+M: Thomas Huth <thuth@redhat.com>
+S: Maintained
+F: hw/nvram/chrp_nvram.c
+F: include/hw/nvram/chrp_nvram.h
+F: tests/prom-env-test.c
+
 Subsystems
 ----------
 Audio
@@ -1020,6 +1107,7 @@ M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
 F: audio/
 F: hw/audio/
+F: include/hw/audio/
 F: tests/ac97-test.c
 F: tests/es1370-test.c
 F: tests/intel-hda-test.c
@@ -1070,6 +1158,20 @@ F: block/qapi.c
 F: qapi/block*.json
 T: git git://repo.or.cz/qemu/armbru.git block-next

+Dirty Bitmaps
+M: Fam Zheng <famz@redhat.com>
+M: John Snow <jsnow@redhat.com>
+L: qemu-block@nongnu.org
+S: Supported
+F: util/hbitmap.c
+F: block/dirty-bitmap.c
+F: include/qemu/hbitmap.h
+F: include/block/dirty-bitmap.h
+F: tests/test-hbitmap.c
+F: docs/bitmaps.md
+T: git git://github.com/famz/qemu.git bitmaps
+T: git git://github.com/jnsnow/qemu.git bitmaps
+
 Character device backends
 M: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
@@ -1154,12 +1256,12 @@ F: qemu-timer.c
 F: vl.c

 Human Monitor (HMP)
-M: Luiz Capitulino <lcapitulino@redhat.com>
+M: Dr. David Alan Gilbert <dgilbert@redhat.com>
 S: Maintained
 F: monitor.c
-F: hmp.c
-F: hmp-commands.hx
-T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp
+F: hmp.[ch]
+F: hmp-commands*.hx
+F: include/monitor/hmp-target.h

 Network device backends
 M: Jason Wang <jasowang@redhat.com>
@@ -1199,6 +1301,12 @@ S: Maintained
 F: backends/hostmem*.c
 F: include/sysemu/hostmem.h

+Cryptodev Backends
+M: Gonglei <arei.gonglei@huawei.com>
+S: Maintained
+F: include/sysemu/cryptodev*.h
+F: backends/cryptodev*.c
+
 QAPI
 M: Markus Armbruster <armbru@redhat.com>
 M: Michael Roth <mdroth@linux.vnet.ibm.com>
@@ -1224,8 +1332,8 @@ F: qapi/*.json
 T: git git://repo.or.cz/qemu/armbru.git qapi-next

 QObject
-M: Luiz Capitulino <lcapitulino@redhat.com>
-S: Maintained
+M: Markus Armbruster <armbru@redhat.com>
+S: Supported
 F: qobject/
 F: include/qapi/qmp/
 X: include/qapi/qmp/dispatch.h
@@ -1235,7 +1343,7 @@ F: tests/check-qint.c
 F: tests/check-qjson.c
 F: tests/check-qlist.c
 F: tests/check-qstring.c
-T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp
+T: git git://repo.or.cz/qemu/armbru.git qapi-next

 QEMU Guest Agent
 M: Michael Roth <mdroth@linux.vnet.ibm.com>
@@ -1364,6 +1472,23 @@ F: util/uuid.c
 F: include/qemu/uuid.h
 F: tests/test-uuid.c

+COLO Framework
+M: zhanghailiang <zhang.zhanghailiang@huawei.com>
+S: Maintained
+F: migration/colo*
+F: include/migration/colo.h
+F: include/migration/failover.h
+F: docs/COLO-FT.txt
+
+COLO Proxy
+M: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+M: Li Zhijian <lizhijian@cn.fujitsu.com>
+S: Supported
+F: docs/colo-proxy.txt
+F: net/colo*
+F: net/filter-rewriter.c
+F: net/filter-mirror.c
+
 Usermode Emulation
 ------------------
 Overall
@@ -1375,11 +1500,13 @@ F: user-exec.c
 BSD user
 S: Orphan
 F: bsd-user/
+F: default-configs/*-bsd-user.mak

 Linux user
 M: Riku Voipio <riku.voipio@iki.fi>
 S: Maintained
 F: linux-user/
+F: default-configs/*-linux-user.mak

 Tiny Code Generator (TCG)
 -------------------------
@@ -1423,8 +1550,8 @@ F: tcg/mips/
 F: disas/mips.c

 PPC
-M: Vassili Karpov (malc) <av1474@comtv.ru>
-S: Maintained
+M: Richard Henderson <rth@twiddle.net>
+S: Odd Fixes
 F: tcg/ppc/
 F: disas/ppc.c

@@ -1447,28 +1574,6 @@ F: tcg/tci/
 F: tci.c
 F: disas/tci.c

-Stable branches
---------------
-Stable 1.0
-L: qemu-stable@nongnu.org
-T: git git://git.qemu-project.org/qemu-stable-1.0.git
-S: Orphan
-
-Stable 0.15
-L: qemu-stable@nongnu.org
-T: git git://git.qemu-project.org/qemu-stable-0.15.git
-S: Orphan
-
-Stable 0.14
-L: qemu-stable@nongnu.org
-T: git git://git.qemu-project.org/qemu-stable-0.14.git
-S: Orphan
-
-Stable 0.10
-L: qemu-stable@nongnu.org
-T: git git://git.qemu-project.org/qemu-stable-0.10.git
-S: Orphan
-
 Block drivers
 -------------
 VMDK
--- a/Makefile
+++ b/Makefile
@@ -56,9 +56,6 @@ GENERATED_SOURCES += qmp-marshal.c qapi-types.c qapi-visit.c qapi-event.c
 GENERATED_HEADERS += qmp-introspect.h
 GENERATED_SOURCES += qmp-introspect.c

-GENERATED_HEADERS += trace/generated-events.h
-GENERATED_SOURCES += trace/generated-events.c
-
 GENERATED_HEADERS += trace/generated-tracers.h
 ifeq ($(findstring dtrace,$(TRACE_BACKENDS)),dtrace)
 GENERATED_HEADERS += trace/generated-tracers-dtrace.h
@@ -93,7 +90,7 @@ LIBS+=-lz $(LIBS_TOOLS)
 HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF)

 ifdef BUILD_DOCS
-DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8
+DOCS=qemu-doc.html qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8
 ifdef CONFIG_VIRTFS
 DOCS+=fsdev/virtfs-proxy-helper.1
 endif
@@ -107,20 +104,20 @@ SUBDIR_DEVICES_MAK_DEP=$(patsubst %, %-config-devices.mak.d, $(TARGET_DIRS))

 ifeq ($(SUBDIR_DEVICES_MAK),)
 config-all-devices.mak:
-	$(call quiet-command,echo '# no devices' > $@,"  GEN   $@")
+	$(call quiet-command,echo '# no devices' > $@,"GEN","$@")
 else
 config-all-devices.mak: $(SUBDIR_DEVICES_MAK)
 	$(call quiet-command, sed -n \
             's|^\([^=]*\)=\(.*\)$$|\1:=$$(findstring y,$$(\1)\2)|p' \
             $(SUBDIR_DEVICES_MAK) | sort -u > $@, \
-             "  GEN   $@")
+             "GEN","$@")
 endif

 -include $(SUBDIR_DEVICES_MAK_DEP)

 %/config-devices.mak: default-configs/%.mak $(SRC_PATH)/scripts/make_device_config.sh
 	$(call quiet-command, \
-            $(SHELL) $(SRC_PATH)/scripts/make_device_config.sh $< $*-config-devices.mak.d $@ > $@.tmp, "  GEN   $@.tmp")
+            $(SHELL) $(SRC_PATH)/scripts/make_device_config.sh $< $*-config-devices.mak.d $@ > $@.tmp,"GEN","$@.tmp")
 	$(call quiet-command, if test -f $@; then \
 	  if cmp -s $@.old $@; then \
 	    mv $@.tmp $@; \
@@ -137,7 +134,7 @@ endif
 	 else \
 	  mv $@.tmp $@; \
 	  cp -p $@ $@.old; \
-	 fi, "  GEN   $@");
+	 fi,"GEN","$@");

 defconfig:
 	rm -f config-all-devices.mak $(SUBDIR_DEVICES_MAK)
@@ -191,7 +188,7 @@ qemu-version.h: FORCE
 config-host.h: config-host.h-timestamp
 config-host.h-timestamp: config-host.mak
 qemu-options.def: $(SRC_PATH)/qemu-options.hx $(SRC_PATH)/scripts/hxtool
-	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $@")
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"GEN","$@")

 SUBDIR_RULES=$(patsubst %,subdir-%, $(TARGET_DIRS))
 SOFTMMU_SUBDIR_RULES=$(filter %-softmmu,$(SUBDIR_RULES))
@@ -235,9 +232,9 @@ ALL_SUBDIRS=$(TARGET_DIRS) $(patsubst %,pc-bios/%, $(ROMS))
 recurse-all: $(SUBDIR_RULES) $(ROMSUBDIR_RULES)

 $(BUILD_DIR)/version.o: $(SRC_PATH)/version.rc config-host.h | $(BUILD_DIR)/version.lo
-	$(call quiet-command,$(WINDRES) -I$(BUILD_DIR) -o $@ $<,"  RC    version.o")
+	$(call quiet-command,$(WINDRES) -I$(BUILD_DIR) -o $@ $<,"RC","version.o")
 $(BUILD_DIR)/version.lo: $(SRC_PATH)/version.rc config-host.h
-	$(call quiet-command,$(WINDRES) -I$(BUILD_DIR) -o $@ $<,"  RC    version.lo")
+	$(call quiet-command,$(WINDRES) -I$(BUILD_DIR) -o $@ $<,"RC","version.lo")

 Makefile: $(version-obj-y) $(version-lobj-y)

@@ -261,7 +258,7 @@ fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal
 fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap

 qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool
-	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $@")
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"GEN","$@")

 qemu-ga$(EXESUF): LIBS = $(LIBS_QGA)
 qemu-ga$(EXESUF): QEMU_CFLAGS += -I qga/qapi-generated
@@ -274,17 +271,17 @@ qga/qapi-generated/qga-qapi-types.c qga/qapi-generated/qga-qapi-types.h :\
 $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py \
 		$(gen-out-type) -o qga/qapi-generated -p "qga-" $<, \
-		"  GEN   $@")
+		"GEN","$@")
 qga/qapi-generated/qga-qapi-visit.c qga/qapi-generated/qga-qapi-visit.h :\
 $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-visit.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-visit.py \
 		$(gen-out-type) -o qga/qapi-generated -p "qga-" $<, \
-		"  GEN   $@")
+		"GEN","$@")
 qga/qapi-generated/qga-qmp-commands.h qga/qapi-generated/qga-qmp-marshal.c :\
 $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py \
 		$(gen-out-type) -o qga/qapi-generated -p "qga-" $<, \
-		"  GEN   $@")
+		"GEN","$@")

 qapi-modules = $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/qapi/common.json \
               $(SRC_PATH)/qapi/block.json $(SRC_PATH)/qapi/block-core.json \
@@ -296,27 +293,27 @@ qapi-types.c qapi-types.h :\
 $(qapi-modules) $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py \
 		$(gen-out-type) -o "." -b $<, \
-		"  GEN   $@")
+		"GEN","$@")
 qapi-visit.c qapi-visit.h :\
 $(qapi-modules) $(SRC_PATH)/scripts/qapi-visit.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-visit.py \
 		$(gen-out-type) -o "." -b $<, \
-		"  GEN   $@")
+		"GEN","$@")
 qapi-event.c qapi-event.h :\
 $(qapi-modules) $(SRC_PATH)/scripts/qapi-event.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-event.py \
 		$(gen-out-type) -o "." $<, \
-		"  GEN   $@")
+		"GEN","$@")
 qmp-commands.h qmp-marshal.c :\
 $(qapi-modules) $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py \
 		$(gen-out-type) -o "." $<, \
-		"  GEN   $@")
+		"GEN","$@")
 qmp-introspect.h qmp-introspect.c :\
 $(qapi-modules) $(SRC_PATH)/scripts/qapi-introspect.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-introspect.py \
 		$(gen-out-type) -o "." $<, \
-		"  GEN   $@")
+		"GEN","$@")

 QGALIB_GEN=$(addprefix qga/qapi-generated/, qga-qapi-types.h qga-qapi-visit.h qga-qmp-commands.h)
 $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)
@@ -335,7 +332,7 @@ $(QEMU_GA_MSI): config-host.mak

 $(QEMU_GA_MSI):  $(SRC_PATH)/qga/installer/qemu-ga.wxs
 	$(call quiet-command,QEMU_GA_VERSION="$(QEMU_GA_VERSION)" QEMU_GA_MANUFACTURER="$(QEMU_GA_MANUFACTURER)" QEMU_GA_DISTRO="$(QEMU_GA_DISTRO)" BUILD_DIR="$(BUILD_DIR)" \
-	wixl -o $@ $(QEMU_GA_MSI_ARCH) $(QEMU_GA_MSI_WITH_VSS) $(QEMU_GA_MSI_MINGW_DLL_PATH) $<, "  WIXL  $@")
+	wixl -o $@ $(QEMU_GA_MSI_ARCH) $(QEMU_GA_MSI_WITH_VSS) $(QEMU_GA_MSI_MINGW_DLL_PATH) $<,"WIXL","$@")
 else
 msi:
 	@echo "MSI build not configured or dependency resolution failed (reconfigure with --enable-guest-agent-msi option)"
@@ -354,7 +351,7 @@ ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) libqemuutil.a libqemustub.a
 module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak
 	$(call quiet-command,$(PYTHON) $< $@ \
 	$(addprefix $(SRC_PATH)/,$(patsubst %.mo,%.c,$(block-obj-m))), \
-	"  GEN   $@")
+	"GEN","$@")

 clean:
 # avoid old build problems by removing potentially incorrect old files
@@ -398,7 +395,6 @@ distclean: clean
 	rm -f qemu-doc.vr
 	rm -f config.log
 	rm -f linux-headers/asm
-	rm -f qemu-tech.info qemu-tech.aux qemu-tech.cp qemu-tech.dvi qemu-tech.fn qemu-tech.info qemu-tech.ky qemu-tech.log qemu-tech.pdf qemu-tech.pg qemu-tech.toc qemu-tech.tp qemu-tech.vr
 	for d in $(TARGET_DIRS); do \
 	rm -rf $$d || exit 1 ; \
        done
@@ -425,7 +421,7 @@ qemu-icon.bmp qemu_logo_no_text.svg \
 bamboo.dtb petalogix-s3adsp1800.dtb petalogix-ml605.dtb \
 multiboot.bin linuxboot.bin linuxboot_dma.bin kvmvapic.bin \
 s390-ccw.img \
-spapr-rtas.bin slof.bin \
+spapr-rtas.bin slof.bin skiboot.lid \
 palcode-clipper \
 u-boot.e500
 else
@@ -434,7 +430,7 @@ endif

 install-doc: $(DOCS)
 	$(INSTALL_DIR) "$(DESTDIR)$(qemu_docdir)"
-	$(INSTALL_DATA) qemu-doc.html  qemu-tech.html "$(DESTDIR)$(qemu_docdir)"
+	$(INSTALL_DATA) qemu-doc.html "$(DESTDIR)$(qemu_docdir)"
 	$(INSTALL_DATA) $(SRC_PATH)/docs/qmp-commands.txt "$(DESTDIR)$(qemu_docdir)"
 ifdef CONFIG_POSIX
 	$(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1"
@@ -521,13 +517,13 @@ ui/shader/%-vert.h: $(SRC_PATH)/ui/shader/%.vert $(SRC_PATH)/scripts/shaderinclu
 	@mkdir -p $(dir $@)
 	$(call quiet-command,\
 		perl $(SRC_PATH)/scripts/shaderinclude.pl $< > $@,\
-		"  VERT  $@")
+		"VERT","$@")

 ui/shader/%-frag.h: $(SRC_PATH)/ui/shader/%.frag $(SRC_PATH)/scripts/shaderinclude.pl
 	@mkdir -p $(dir $@)
 	$(call quiet-command,\
 		perl $(SRC_PATH)/scripts/shaderinclude.pl $< > $@,\
-		"  FRAG  $@")
+		"FRAG","$@")

 ui/console-gl.o: $(SRC_PATH)/ui/console-gl.c \
 	ui/shader/texture-blit-vert.h ui/shader/texture-blit-frag.h
@@ -537,65 +533,65 @@ MAKEINFO=makeinfo
 MAKEINFOFLAGS=--no-headers --no-split --number-sections
 TEXIFLAG=$(if $(V),,--quiet)
 %.dvi: %.texi
-	$(call quiet-command,texi2dvi $(TEXIFLAG) -I . $<,"  GEN   $@")
+	$(call quiet-command,texi2dvi $(TEXIFLAG) -I . $<,"GEN","$@")

 %.html: %.texi
 	$(call quiet-command,LC_ALL=C $(MAKEINFO) $(MAKEINFOFLAGS) --html $< -o $@, \
-	"  GEN   $@")
+	"GEN","$@")

 %.info: %.texi
-	$(call quiet-command,$(MAKEINFO) $< -o $@,"  GEN   $@")
+	$(call quiet-command,$(MAKEINFO) $< -o $@,"GEN","$@")

 %.pdf: %.texi
-	$(call quiet-command,texi2pdf $(TEXIFLAG) -I . $<,"  GEN   $@")
+	$(call quiet-command,texi2pdf $(TEXIFLAG) -I . $<,"GEN","$@")

 qemu-options.texi: $(SRC_PATH)/qemu-options.hx $(SRC_PATH)/scripts/hxtool
-	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")

 qemu-monitor.texi: $(SRC_PATH)/hmp-commands.hx $(SRC_PATH)/scripts/hxtool
-	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")

 qemu-monitor-info.texi: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxtool
-	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")

 qemu-img-cmds.texi: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool
-	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"  GEN   $@")
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")

 qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu.pod && \
 	  $(POD2MAN) --section=1 --center=" " --release=" " qemu.pod > $@, \
-	  "  GEN   $@")
+	  "GEN","$@")
 qemu.1: qemu-option-trace.texi

 qemu-img.1: qemu-img.texi qemu-option-trace.texi qemu-img-cmds.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu-img.pod && \
 	  $(POD2MAN) --section=1 --center=" " --release=" " qemu-img.pod > $@, \
-	  "  GEN   $@")
+	  "GEN","$@")

 fsdev/virtfs-proxy-helper.1: fsdev/virtfs-proxy-helper.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< fsdev/virtfs-proxy-helper.pod && \
 	  $(POD2MAN) --section=1 --center=" " --release=" " fsdev/virtfs-proxy-helper.pod > $@, \
-	  "  GEN   $@")
+	  "GEN","$@")

 qemu-nbd.8: qemu-nbd.texi qemu-option-trace.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu-nbd.pod && \
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-nbd.pod > $@, \
-	  "  GEN   $@")
+	  "GEN","$@")

 qemu-ga.8: qemu-ga.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu-ga.pod && \
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-ga.pod > $@, \
-	  "  GEN   $@")
+	  "GEN","$@")

-dvi: qemu-doc.dvi qemu-tech.dvi
-html: qemu-doc.html qemu-tech.html
-info: qemu-doc.info qemu-tech.info
-pdf: qemu-doc.pdf qemu-tech.pdf
+dvi: qemu-doc.dvi
+html: qemu-doc.html
+info: qemu-doc.info
+pdf: qemu-doc.pdf

 qemu-doc.dvi qemu-doc.html qemu-doc.info qemu-doc.pdf: \
 	qemu-img.texi qemu-nbd.texi qemu-options.texi qemu-option-trace.texi \
@@ -699,7 +695,7 @@ help:
 	@echo  ''
 ifdef CONFIG_WIN32
 	@echo  'Windows targets:'
-	@echo  '  installer       - Build NSIS-based installer for qemu-ga'
+	@echo  '  installer       - Build NSIS-based installer for QEMU'
 ifdef QEMU_GA_MSI_ENABLED
 	@echo  '  msi             - Build MSI-based installer for qemu-ga'
 endif
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -89,7 +89,7 @@ endif

 #######################################################################
 # Target-independent parts used in system and user emulation
-common-obj-y += tcg-runtime.o
+common-obj-y += cpus-common.o
 common-obj-y += hw/
 common-obj-y += qom/
 common-obj-y += disas/
@@ -142,6 +142,7 @@ trace-events-y += hw/dma/trace-events
 trace-events-y += hw/sparc/trace-events
 trace-events-y += hw/sd/trace-events
 trace-events-y += hw/isa/trace-events
+trace-events-y += hw/mem/trace-events
 trace-events-y += hw/i386/trace-events
 trace-events-y += hw/9pfs/trace-events
 trace-events-y += hw/ppc/trace-events
@@ -154,9 +155,11 @@ trace-events-y += hw/alpha/trace-events
 trace-events-y += ui/trace-events
 trace-events-y += audio/trace-events
 trace-events-y += net/trace-events
+trace-events-y += target-arm/trace-events
 trace-events-y += target-i386/trace-events
 trace-events-y += target-sparc/trace-events
 trace-events-y += target-s390x/trace-events
 trace-events-y += target-ppc/trace-events
 trace-events-y += qom/trace-events
 trace-events-y += linux-user/trace-events
+trace-events-y += qapi/trace-events
--- a/Makefile.target
+++ b/Makefile.target
@@ -26,7 +26,7 @@ ifneq (,$(findstring -mwindows,$(libs_softmmu)))
 # Terminate program name with a 'w' because the linker builds a windows executable.
 QEMU_PROGW=qemu-system-$(TARGET_NAME)w$(EXESUF)
 $(QEMU_PROG): $(QEMU_PROGW)
-	$(call quiet-command,$(OBJCOPY) --subsystem console $(QEMU_PROGW) $(QEMU_PROG),"  GEN   $(TARGET_DIR)$(QEMU_PROG)")
+	$(call quiet-command,$(OBJCOPY) --subsystem console $(QEMU_PROGW) $(QEMU_PROG),"GEN","$(TARGET_DIR)$(QEMU_PROG)")
 QEMU_PROG_BUILD = $(QEMU_PROGW)
 else
 QEMU_PROG_BUILD = $(QEMU_PROG)
@@ -55,7 +55,7 @@ $(QEMU_PROG).stp-installed: $(BUILD_DIR)/trace-events-all
 		--binary=$(bindir)/$(QEMU_PROG) \
 		--target-name=$(TARGET_NAME) \
 		--target-type=$(TARGET_TYPE) \
-		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG).stp-installed")
+		$< > $@,"GEN","$(TARGET_DIR)$(QEMU_PROG).stp-installed")

 $(QEMU_PROG).stp: $(BUILD_DIR)/trace-events-all
 	$(call quiet-command,$(TRACETOOL) \
@@ -64,14 +64,14 @@ $(QEMU_PROG).stp: $(BUILD_DIR)/trace-events-all
 		--binary=$(realpath .)/$(QEMU_PROG) \
 		--target-name=$(TARGET_NAME) \
 		--target-type=$(TARGET_TYPE) \
-		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG).stp")
+		$< > $@,"GEN","$(TARGET_DIR)$(QEMU_PROG).stp")

 $(QEMU_PROG)-simpletrace.stp: $(BUILD_DIR)/trace-events-all
 	$(call quiet-command,$(TRACETOOL) \
 		--format=simpletrace-stap \
 		--backends=$(TRACE_BACKENDS) \
 		--probe-prefix=qemu.$(TARGET_TYPE).$(TARGET_NAME) \
-		< $< > $@,"  GEN   $(TARGET_DIR)$(QEMU_PROG)-simpletrace.stp")
+		$< > $@,"GEN","$(TARGET_DIR)$(QEMU_PROG)-simpletrace.stp")

 else
 stap:
@@ -94,6 +94,7 @@ obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
 obj-y += fpu/softfloat.o
 obj-y += target-$(TARGET_BASE_ARCH)/
 obj-y += disas.o
+obj-y += tcg-runtime.o
 obj-$(call notempty,$(TARGET_XML_FILES)) += gdbstub-xml.o
 obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o

@@ -196,18 +197,18 @@ $(QEMU_PROG_BUILD): config-devices.mak
 $(QEMU_PROG_BUILD): $(all-obj-y) ../libqemuutil.a ../libqemustub.a
 	$(call LINK, $(filter-out %.mak, $^))
 ifdef CONFIG_DARWIN
-	$(call quiet-command,Rez -append $(SRC_PATH)/pc-bios/qemu.rsrc -o $@,"  REZ   $(TARGET_DIR)$@")
-	$(call quiet-command,SetFile -a C $@,"  SETFILE $(TARGET_DIR)$@")
+	$(call quiet-command,Rez -append $(SRC_PATH)/pc-bios/qemu.rsrc -o $@,"REZ","$(TARGET_DIR)$@")
+	$(call quiet-command,SetFile -a C $@,"SETFILE","$(TARGET_DIR)$@")
 endif

 gdbstub-xml.c: $(TARGET_XML_FILES) $(SRC_PATH)/scripts/feature_to_c.sh
-	$(call quiet-command,rm -f $@ && $(SHELL) $(SRC_PATH)/scripts/feature_to_c.sh $@ $(TARGET_XML_FILES),"  GEN   $(TARGET_DIR)$@")
+	$(call quiet-command,rm -f $@ && $(SHELL) $(SRC_PATH)/scripts/feature_to_c.sh $@ $(TARGET_XML_FILES),"GEN","$(TARGET_DIR)$@")

 hmp-commands.h: $(SRC_PATH)/hmp-commands.hx $(SRC_PATH)/scripts/hxtool
-	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $(TARGET_DIR)$@")
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"GEN","$(TARGET_DIR)$@")

 hmp-commands-info.h: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxtool
-	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $(TARGET_DIR)$@")
+	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"GEN","$(TARGET_DIR)$@")

 clean: clean-target
 	rm -f *.a *~ $(PROGS)
--- a/README
+++ b/README
@@ -42,8 +42,6 @@ of other UNIX targets. The simple steps to build QEMU are:
  ../configure
  make

-Complete details of the process for building and configuring QEMU for
-all supported host platforms can be found in the qemu-tech.html file.
 Additional information can also be found online via the QEMU website:

  http://qemu-project.org/Hosts/Linux
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.7.50
+2.7.91
--- a/accel.c
+++ b/accel.c
@@ -33,7 +33,6 @@
 #include "sysemu/qtest.h"
 #include "hw/xen/xen.h"
 #include "qom/object.h"
-#include "hw/boards.h"

 int tcg_tb_size;
 static bool tcg_allowed = true;
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -81,29 +81,22 @@ static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
 {
    struct epoll_event event;
    int r;
+    int ctl;

    if (!ctx->epoll_enabled) {
        return;
    }
    if (!node->pfd.events) {
-        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
-        if (r) {
-            aio_epoll_disable(ctx);
-        }
+        ctl = EPOLL_CTL_DEL;
    } else {
        event.data.ptr = node;
        event.events = epoll_events_from_pfd(node->pfd.events);
-        if (is_new) {
-            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
-            if (r) {
-                aio_epoll_disable(ctx);
-            }
-        } else {
-            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, node->pfd.fd, &event);
-            if (r) {
-                aio_epoll_disable(ctx);
-            }
-        }
+        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+    }
+
+    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
+    if (r) {
+        aio_epoll_disable(ctx);
    }
 }

@@ -217,21 +210,23 @@ void aio_set_fd_handler(AioContext *ctx,

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write) {
-        if (node) {
-            g_source_remove_poll(&ctx->source, &node->pfd);
+        if (node == NULL) {
+            return;
+        }

-            /* If the lock is held, just mark the node as deleted */
-            if (ctx->walking_handlers) {
-                node->deleted = 1;
-                node->pfd.revents = 0;
-            } else {
-                /* Otherwise, delete it for real.  We can't just mark it as
-                 * deleted because deleted nodes are only cleaned up after
-                 * releasing the walking_handlers lock.
-                 */
-                QLIST_REMOVE(node, node);
-                deleted = true;
-            }
+        g_source_remove_poll(&ctx->source, &node->pfd);
+
+        /* If the lock is held, just mark the node as deleted */
+        if (ctx->walking_handlers) {
+            node->deleted = 1;
+            node->pfd.revents = 0;
+        } else {
+            /* Otherwise, delete it for real.  We can't just mark it as
+             * deleted because deleted nodes are only cleaned up after
+             * releasing the walking_handlers lock.
+             */
+            QLIST_REMOVE(node, node);
+            deleted = true;
        }
    } else {
        if (node == NULL) {
@@ -431,11 +426,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
    assert(npfd == 0);

    /* fill pollfds */
-    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
-        if (!node->deleted && node->pfd.events
-            && !aio_epoll_enabled(ctx)
-            && aio_node_check(ctx, node->is_external)) {
-            add_pollfd(node);
+
+    if (!aio_epoll_enabled(ctx)) {
+        QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+            if (!node->deleted && node->pfd.events
+                && aio_node_check(ctx, node->is_external)) {
+                add_pollfd(node);
+            }
        }
    }

--- a/async.c
+++ b/async.c
@@ -44,6 +44,26 @@ struct QEMUBH {
    bool deleted;
 };

+void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
+{
+    QEMUBH *oneshot = g_new(QEMUBH, 1);
+    /* scheduled + deleted: aio_bh_poll() runs it once, then frees it */
+    *oneshot = (QEMUBH){
+        .ctx = ctx,
+        .cb = cb,
+        .opaque = opaque,
+        .scheduled = 1,
+        .deleted = 1,
+    };
+    qemu_mutex_lock(&ctx->bh_lock);
+    oneshot->next = ctx->first_bh;
+    /* All members must be visible before the bh is linked into the list */
+    smp_wmb();
+    ctx->first_bh = oneshot;
+    qemu_mutex_unlock(&ctx->bh_lock);
+    aio_notify(ctx);
+}
+
 QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
 {
    QEMUBH *bh;
@@ -86,9 +106,9 @@ int aio_bh_poll(AioContext *ctx)
         * thread sees the zero before bh->cb has run, and thus will call
         * aio_notify again if necessary.
         */
-        if (!bh->deleted && atomic_xchg(&bh->scheduled, 0)) {
-            /* Idle BHs and the notify BH don't count as progress */
-            if (!bh->idle && bh != ctx->notify_dummy_bh) {
+        if (atomic_xchg(&bh->scheduled, 0)) {
+            /* Idle BHs don't count as progress */
+            if (!bh->idle) {
                ret = 1;
            }
            bh->idle = 0;
@@ -104,7 +124,7 @@ int aio_bh_poll(AioContext *ctx)
        bhp = &ctx->first_bh;
        while (*bhp) {
            bh = *bhp;
-            if (bh->deleted) {
+            if (bh->deleted && !bh->scheduled) {
                *bhp = bh->next;
                g_free(bh);
            } else {
@@ -168,7 +188,7 @@ aio_compute_timeout(AioContext *ctx)
    QEMUBH *bh;

    for (bh = ctx->first_bh; bh; bh = bh->next) {
-        if (!bh->deleted && bh->scheduled) {
+        if (bh->scheduled) {
            if (bh->idle) {
                /* idle bottom halves will be polled at least
                 * every 10ms */
@@ -216,7 +236,7 @@ aio_ctx_check(GSource *source)
    aio_notify_accept(ctx);

    for (bh = ctx->first_bh; bh; bh = bh->next) {
-        if (!bh->deleted && bh->scheduled) {
+        if (bh->scheduled) {
            return true;
        }
    }
@@ -240,7 +260,6 @@ aio_ctx_finalize(GSource     *source)
 {
    AioContext *ctx = (AioContext *) source;

-    qemu_bh_delete(ctx->notify_dummy_bh);
    thread_pool_free(ctx->thread_pool);

 #ifdef CONFIG_LINUX_AIO
@@ -265,7 +284,7 @@ aio_ctx_finalize(GSource     *source)

    aio_set_event_notifier(ctx, &ctx->notifier, false, NULL);
    event_notifier_cleanup(&ctx->notifier);
-    rfifolock_destroy(&ctx->lock);
+    qemu_rec_mutex_destroy(&ctx->lock);
    qemu_mutex_destroy(&ctx->bh_lock);
    timerlistgroup_deinit(&ctx->tlg);
 }
@@ -326,19 +345,6 @@ static void aio_timerlist_notify(void *opaque)
    aio_notify(opaque);
 }

-static void aio_rfifolock_cb(void *opaque)
-{
-    AioContext *ctx = opaque;
-
-    /* Kick owner thread in case they are blocked in aio_poll() */
-    qemu_bh_schedule(ctx->notify_dummy_bh);
-}
-
-static void notify_dummy_bh(void *opaque)
-{
-    /* Do nothing, we were invoked just to force the event loop to iterate */
-}
-
 static void event_notifier_dummy_cb(EventNotifier *e)
 {
 }
@@ -366,11 +372,9 @@ AioContext *aio_context_new(Error **errp)
 #endif
    ctx->thread_pool = NULL;
    qemu_mutex_init(&ctx->bh_lock);
-    rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
+    qemu_rec_mutex_init(&ctx->lock);
    timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);

-    ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);
-
    return ctx;
 fail:
    g_source_destroy(&ctx->source);
@@ -389,10 +393,10 @@ void aio_context_unref(AioContext *ctx)

 void aio_context_acquire(AioContext *ctx)
 {
-    rfifolock_lock(&ctx->lock);
+    qemu_rec_mutex_lock(&ctx->lock); /* ctx->lock is a qemu_rec_mutex now (was RFifoLock) */
 }

 void aio_context_release(AioContext *ctx)
 {
-    rfifolock_unlock(&ctx->lock);
+    qemu_rec_mutex_unlock(&ctx->lock); /* pairs with aio_context_acquire() */
 }
--- a/atomic_template.h
+++ b/atomic_template.h
@@ -0,0 +1,215 @@
+/*
+ * Atomic helper templates
+ * Included from tcg-runtime.c and cputlb.c.
+ *
+ * Copyright (c) 2016 Red Hat, Inc
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if DATA_SIZE == 16
+# define SUFFIX     o
+# define DATA_TYPE  Int128
+# define BSWAP      bswap128
+#elif DATA_SIZE == 8
+# define SUFFIX     q
+# define DATA_TYPE  uint64_t
+# define BSWAP      bswap64
+#elif DATA_SIZE == 4
+# define SUFFIX     l
+# define DATA_TYPE  uint32_t
+# define BSWAP      bswap32
+#elif DATA_SIZE == 2
+# define SUFFIX     w
+# define DATA_TYPE  uint16_t
+# define BSWAP      bswap16
+#elif DATA_SIZE == 1
+# define SUFFIX     b
+# define DATA_TYPE  uint8_t
+# define BSWAP
+#else
+# error unsupported data size
+#endif
+
+#if DATA_SIZE >= 4
+# define ABI_TYPE  DATA_TYPE
+#else
+# define ABI_TYPE  uint32_t
+#endif
+
+/* Define host-endian atomic operations.  Note that END is used within
+   the ATOMIC_NAME macro, and redefined below.  */
+#if DATA_SIZE == 1
+# define END
+#elif defined(HOST_WORDS_BIGENDIAN)
+# define END  _be
+#else
+# define END  _le
+#endif
+
+ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
+                              ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
+{
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; /* macro not defined here; presumably set by the includer */
+    return atomic_cmpxchg__nocheck(haddr, cmpv, newv); /* host-endian variant: no BSWAP */
+}
+
+#if DATA_SIZE >= 16 /* 16-byte case: only ld/st helpers, using relaxed __atomic builtins */
+ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
+{
+    DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
+    __atomic_load(haddr, &val, __ATOMIC_RELAXED);
+    return val;
+}
+
+void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
+                     ABI_TYPE val EXTRA_ARGS)
+{
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
+    __atomic_store(haddr, &val, __ATOMIC_RELAXED);
+}
+#else /* DATA_SIZE < 16: xchg plus the fetch_op / op_fetch helpers */
+ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
+                           ABI_TYPE val EXTRA_ARGS)
+{
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
+    return atomic_xchg__nocheck(haddr, val);
+}
+
+#define GEN_ATOMIC_HELPER(X)                                        \
+ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
+                 ABI_TYPE val EXTRA_ARGS)                           \
+{                                                                   \
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
+    return atomic_##X(haddr, val);                                  \
+}                                                                   \
+
+GEN_ATOMIC_HELPER(fetch_add) /* each line expands to one helper calling atomic_<name>() */
+GEN_ATOMIC_HELPER(fetch_and)
+GEN_ATOMIC_HELPER(fetch_or)
+GEN_ATOMIC_HELPER(fetch_xor)
+GEN_ATOMIC_HELPER(add_fetch)
+GEN_ATOMIC_HELPER(and_fetch)
+GEN_ATOMIC_HELPER(or_fetch)
+GEN_ATOMIC_HELPER(xor_fetch)
+
+#undef GEN_ATOMIC_HELPER
+#endif /* DATA_SIZE >= 16 */
+
+#undef END
+
+#if DATA_SIZE > 1
+
+/* Define reverse-host-endian atomic operations.  Note that END is used
+   within the ATOMIC_NAME macro.  */
+#ifdef HOST_WORDS_BIGENDIAN
+# define END  _le
+#else
+# define END  _be
+#endif
+
+ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
+                              ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
+{
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; /* macro not defined here; presumably set by the includer */
+    return BSWAP(atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv))); /* swap operands in, old value out */
+}
+
+#if DATA_SIZE >= 16 /* 16-byte case: only ld/st helpers, byte-swapped around relaxed accesses */
+ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
+{
+    DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
+    __atomic_load(haddr, &val, __ATOMIC_RELAXED);
+    return BSWAP(val);
+}
+
+void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
+                     ABI_TYPE val EXTRA_ARGS)
+{
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
+    val = BSWAP(val); /* swap first so the store itself is a single atomic op */
+    __atomic_store(haddr, &val, __ATOMIC_RELAXED);
+}
+#else /* DATA_SIZE < 16: xchg, bitwise helpers via BSWAP, add helpers via cmpxchg loop */
+ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
+                           ABI_TYPE val EXTRA_ARGS)
+{
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
+    return BSWAP(atomic_xchg__nocheck(haddr, BSWAP(val)));
+}
+
+#define GEN_ATOMIC_HELPER(X)                                        \
+ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
+                 ABI_TYPE val EXTRA_ARGS)                           \
+{                                                                   \
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
+    return BSWAP(atomic_##X(haddr, BSWAP(val)));                    \
+}
+
+GEN_ATOMIC_HELPER(fetch_and) /* bitwise ops only: they work per bit, so swapped operands are fine */
+GEN_ATOMIC_HELPER(fetch_or)
+GEN_ATOMIC_HELPER(fetch_xor)
+GEN_ATOMIC_HELPER(and_fetch)
+GEN_ATOMIC_HELPER(or_fetch)
+GEN_ATOMIC_HELPER(xor_fetch)
+
+#undef GEN_ATOMIC_HELPER
+
+/* Addition carries across byte boundaries, so it cannot be done on swapped
+   operands like the bitwise ops above; use a cmpxchg retry loop instead.  */
+ABI_TYPE ATOMIC_NAME(fetch_add)(CPUArchState *env, target_ulong addr,
+                         ABI_TYPE val EXTRA_ARGS)
+{
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
+    DATA_TYPE ldo, ldn, ret, sto;
+
+    ldo = atomic_read__nocheck(haddr); /* raw value as stored in memory */
+    while (1) {
+        ret = BSWAP(ldo); /* old value in host order: this is the result */
+        sto = BSWAP(ret + val); /* sum, swapped back to memory order */
+        ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto);
+        if (ldn == ldo) { /* memory still held ldo: the store took effect */
+            return ret;
+        }
+        ldo = ldn; /* lost a race: retry from the value now in memory */
+    }
+}
+
+ABI_TYPE ATOMIC_NAME(add_fetch)(CPUArchState *env, target_ulong addr,
+                         ABI_TYPE val EXTRA_ARGS)
+{
+    DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
+    DATA_TYPE ldo, ldn, ret, sto;
+
+    ldo = atomic_read__nocheck(haddr); /* raw value as stored in memory */
+    while (1) {
+        ret = BSWAP(ldo) + val; /* new value in host order: this is the result */
+        sto = BSWAP(ret); /* swapped back to memory order */
+        ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto);
+        if (ldn == ldo) { /* memory still held ldo: the store took effect */
+            return ret;
+        }
+        ldo = ldn; /* lost a race: retry from the value now in memory */
+    }
+}
+#endif /* DATA_SIZE >= 16 */
+
+#undef END
+#endif /* DATA_SIZE > 1 */
+
+#undef BSWAP
+#undef ABI_TYPE
+#undef DATA_TYPE
+#undef SUFFIX
+#undef DATA_SIZE
--- a/backends/Makefile.objs
+++ b/backends/Makefile.objs
@@ -9,3 +9,6 @@ common-obj-$(CONFIG_TPM) += tpm.o

 common-obj-y += hostmem.o hostmem-ram.o
 common-obj-$(CONFIG_LINUX) += hostmem-file.o
+
+common-obj-y += cryptodev.o
+common-obj-y += cryptodev-builtin.o
--- a/backends/baum.c
+++ b/backends/baum.c
@@ -1,7 +1,7 @@
 /*
 * QEMU Baum Braille Device
 *
- * Copyright (c) 2008 Samuel Thibault
+ * Copyright (c) 2008, 2010-2011, 2016 Samuel Thibault
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -92,6 +92,7 @@ typedef struct {
    brlapi_handle_t *brlapi;
    int brlapi_fd;
    unsigned int x, y;
+    bool deferred_init;

    uint8_t in_buf[BUF_SIZE];
    uint8_t in_buf_used;
@@ -102,8 +103,11 @@ typedef struct {
 } BaumDriverState;

 /* Let's assume NABCC by default */
-static const uint8_t nabcc_translation[256] = {
-    [0] = ' ',
+enum way {
+    DOTS2ASCII, /* first index of nabcc_translation: dot pattern -> ASCII */
+    ASCII2DOTS  /* first index of nabcc_translation: ASCII -> dot pattern */
+};
+static const uint8_t nabcc_translation[2][256] = {
 #ifndef BRLAPI_DOTS
 #define BRLAPI_DOTS(d1,d2,d3,d4,d5,d6,d7,d8) \
    ((d1?BRLAPI_DOT1:0)|\
@@ -115,107 +119,154 @@ static const uint8_t nabcc_translation[256] = {
     (d7?BRLAPI_DOT7:0)|\
     (d8?BRLAPI_DOT8:0))
 #endif
-    [BRLAPI_DOTS(1,0,0,0,0,0,0,0)] = 'a',
-    [BRLAPI_DOTS(1,1,0,0,0,0,0,0)] = 'b',
-    [BRLAPI_DOTS(1,0,0,1,0,0,0,0)] = 'c',
-    [BRLAPI_DOTS(1,0,0,1,1,0,0,0)] = 'd',
-    [BRLAPI_DOTS(1,0,0,0,1,0,0,0)] = 'e',
-    [BRLAPI_DOTS(1,1,0,1,0,0,0,0)] = 'f',
-    [BRLAPI_DOTS(1,1,0,1,1,0,0,0)] = 'g',
-    [BRLAPI_DOTS(1,1,0,0,1,0,0,0)] = 'h',
-    [BRLAPI_DOTS(0,1,0,1,0,0,0,0)] = 'i',
-    [BRLAPI_DOTS(0,1,0,1,1,0,0,0)] = 'j',
-    [BRLAPI_DOTS(1,0,1,0,0,0,0,0)] = 'k',
-    [BRLAPI_DOTS(1,1,1,0,0,0,0,0)] = 'l',
-    [BRLAPI_DOTS(1,0,1,1,0,0,0,0)] = 'm',
-    [BRLAPI_DOTS(1,0,1,1,1,0,0,0)] = 'n',
-    [BRLAPI_DOTS(1,0,1,0,1,0,0,0)] = 'o',
-    [BRLAPI_DOTS(1,1,1,1,0,0,0,0)] = 'p',
-    [BRLAPI_DOTS(1,1,1,1,1,0,0,0)] = 'q',
-    [BRLAPI_DOTS(1,1,1,0,1,0,0,0)] = 'r',
-    [BRLAPI_DOTS(0,1,1,1,0,0,0,0)] = 's',
-    [BRLAPI_DOTS(0,1,1,1,1,0,0,0)] = 't',
-    [BRLAPI_DOTS(1,0,1,0,0,1,0,0)] = 'u',
-    [BRLAPI_DOTS(1,1,1,0,0,1,0,0)] = 'v',
-    [BRLAPI_DOTS(0,1,0,1,1,1,0,0)] = 'w',
-    [BRLAPI_DOTS(1,0,1,1,0,1,0,0)] = 'x',
-    [BRLAPI_DOTS(1,0,1,1,1,1,0,0)] = 'y',
-    [BRLAPI_DOTS(1,0,1,0,1,1,0,0)] = 'z',
+#define DO(dots, ascii) \
+    [DOTS2ASCII][dots] = ascii, \
+    [ASCII2DOTS][ascii] = dots
+    DO(0, ' '),
+    DO(BRLAPI_DOTS(1, 0, 0, 0, 0, 0, 0, 0), 'a'),
+    DO(BRLAPI_DOTS(1, 1, 0, 0, 0, 0, 0, 0), 'b'),
+    DO(BRLAPI_DOTS(1, 0, 0, 1, 0, 0, 0, 0), 'c'),
+    DO(BRLAPI_DOTS(1, 0, 0, 1, 1, 0, 0, 0), 'd'),
+    DO(BRLAPI_DOTS(1, 0, 0, 0, 1, 0, 0, 0), 'e'),
+    DO(BRLAPI_DOTS(1, 1, 0, 1, 0, 0, 0, 0), 'f'),
+    DO(BRLAPI_DOTS(1, 1, 0, 1, 1, 0, 0, 0), 'g'),
+    DO(BRLAPI_DOTS(1, 1, 0, 0, 1, 0, 0, 0), 'h'),
+    DO(BRLAPI_DOTS(0, 1, 0, 1, 0, 0, 0, 0), 'i'),
+    DO(BRLAPI_DOTS(0, 1, 0, 1, 1, 0, 0, 0), 'j'),
+    DO(BRLAPI_DOTS(1, 0, 1, 0, 0, 0, 0, 0), 'k'),
+    DO(BRLAPI_DOTS(1, 1, 1, 0, 0, 0, 0, 0), 'l'),
+    DO(BRLAPI_DOTS(1, 0, 1, 1, 0, 0, 0, 0), 'm'),
+    DO(BRLAPI_DOTS(1, 0, 1, 1, 1, 0, 0, 0), 'n'),
+    DO(BRLAPI_DOTS(1, 0, 1, 0, 1, 0, 0, 0), 'o'),
+    DO(BRLAPI_DOTS(1, 1, 1, 1, 0, 0, 0, 0), 'p'),
+    DO(BRLAPI_DOTS(1, 1, 1, 1, 1, 0, 0, 0), 'q'),
+    DO(BRLAPI_DOTS(1, 1, 1, 0, 1, 0, 0, 0), 'r'),
+    DO(BRLAPI_DOTS(0, 1, 1, 1, 0, 0, 0, 0), 's'),
+    DO(BRLAPI_DOTS(0, 1, 1, 1, 1, 0, 0, 0), 't'),
+    DO(BRLAPI_DOTS(1, 0, 1, 0, 0, 1, 0, 0), 'u'),
+    DO(BRLAPI_DOTS(1, 1, 1, 0, 0, 1, 0, 0), 'v'),
+    DO(BRLAPI_DOTS(0, 1, 0, 1, 1, 1, 0, 0), 'w'),
+    DO(BRLAPI_DOTS(1, 0, 1, 1, 0, 1, 0, 0), 'x'),
+    DO(BRLAPI_DOTS(1, 0, 1, 1, 1, 1, 0, 0), 'y'),
+    DO(BRLAPI_DOTS(1, 0, 1, 0, 1, 1, 0, 0), 'z'),

-    [BRLAPI_DOTS(1,0,0,0,0,0,1,0)] = 'A',
-    [BRLAPI_DOTS(1,1,0,0,0,0,1,0)] = 'B',
-    [BRLAPI_DOTS(1,0,0,1,0,0,1,0)] = 'C',
-    [BRLAPI_DOTS(1,0,0,1,1,0,1,0)] = 'D',
-    [BRLAPI_DOTS(1,0,0,0,1,0,1,0)] = 'E',
-    [BRLAPI_DOTS(1,1,0,1,0,0,1,0)] = 'F',
-    [BRLAPI_DOTS(1,1,0,1,1,0,1,0)] = 'G',
-    [BRLAPI_DOTS(1,1,0,0,1,0,1,0)] = 'H',
-    [BRLAPI_DOTS(0,1,0,1,0,0,1,0)] = 'I',
-    [BRLAPI_DOTS(0,1,0,1,1,0,1,0)] = 'J',
-    [BRLAPI_DOTS(1,0,1,0,0,0,1,0)] = 'K',
-    [BRLAPI_DOTS(1,1,1,0,0,0,1,0)] = 'L',
-    [BRLAPI_DOTS(1,0,1,1,0,0,1,0)] = 'M',
-    [BRLAPI_DOTS(1,0,1,1,1,0,1,0)] = 'N',
-    [BRLAPI_DOTS(1,0,1,0,1,0,1,0)] = 'O',
-    [BRLAPI_DOTS(1,1,1,1,0,0,1,0)] = 'P',
-    [BRLAPI_DOTS(1,1,1,1,1,0,1,0)] = 'Q',
-    [BRLAPI_DOTS(1,1,1,0,1,0,1,0)] = 'R',
-    [BRLAPI_DOTS(0,1,1,1,0,0,1,0)] = 'S',
-    [BRLAPI_DOTS(0,1,1,1,1,0,1,0)] = 'T',
-    [BRLAPI_DOTS(1,0,1,0,0,1,1,0)] = 'U',
-    [BRLAPI_DOTS(1,1,1,0,0,1,1,0)] = 'V',
-    [BRLAPI_DOTS(0,1,0,1,1,1,1,0)] = 'W',
-    [BRLAPI_DOTS(1,0,1,1,0,1,1,0)] = 'X',
-    [BRLAPI_DOTS(1,0,1,1,1,1,1,0)] = 'Y',
-    [BRLAPI_DOTS(1,0,1,0,1,1,1,0)] = 'Z',
+    DO(BRLAPI_DOTS(1, 0, 0, 0, 0, 0, 1, 0), 'A'),
+    DO(BRLAPI_DOTS(1, 1, 0, 0, 0, 0, 1, 0), 'B'),
+    DO(BRLAPI_DOTS(1, 0, 0, 1, 0, 0, 1, 0), 'C'),
+    DO(BRLAPI_DOTS(1, 0, 0, 1, 1, 0, 1, 0), 'D'),
+    DO(BRLAPI_DOTS(1, 0, 0, 0, 1, 0, 1, 0), 'E'),
+    DO(BRLAPI_DOTS(1, 1, 0, 1, 0, 0, 1, 0), 'F'),
+    DO(BRLAPI_DOTS(1, 1, 0, 1, 1, 0, 1, 0), 'G'),
+    DO(BRLAPI_DOTS(1, 1, 0, 0, 1, 0, 1, 0), 'H'),
+    DO(BRLAPI_DOTS(0, 1, 0, 1, 0, 0, 1, 0), 'I'),
+    DO(BRLAPI_DOTS(0, 1, 0, 1, 1, 0, 1, 0), 'J'),
+    DO(BRLAPI_DOTS(1, 0, 1, 0, 0, 0, 1, 0), 'K'),
+    DO(BRLAPI_DOTS(1, 1, 1, 0, 0, 0, 1, 0), 'L'),
+    DO(BRLAPI_DOTS(1, 0, 1, 1, 0, 0, 1, 0), 'M'),
+    DO(BRLAPI_DOTS(1, 0, 1, 1, 1, 0, 1, 0), 'N'),
+    DO(BRLAPI_DOTS(1, 0, 1, 0, 1, 0, 1, 0), 'O'),
+    DO(BRLAPI_DOTS(1, 1, 1, 1, 0, 0, 1, 0), 'P'),
+    DO(BRLAPI_DOTS(1, 1, 1, 1, 1, 0, 1, 0), 'Q'),
+    DO(BRLAPI_DOTS(1, 1, 1, 0, 1, 0, 1, 0), 'R'),
+    DO(BRLAPI_DOTS(0, 1, 1, 1, 0, 0, 1, 0), 'S'),
+    DO(BRLAPI_DOTS(0, 1, 1, 1, 1, 0, 1, 0), 'T'),
+    DO(BRLAPI_DOTS(1, 0, 1, 0, 0, 1, 1, 0), 'U'),
+    DO(BRLAPI_DOTS(1, 1, 1, 0, 0, 1, 1, 0), 'V'),
+    DO(BRLAPI_DOTS(0, 1, 0, 1, 1, 1, 1, 0), 'W'),
+    DO(BRLAPI_DOTS(1, 0, 1, 1, 0, 1, 1, 0), 'X'),
+    DO(BRLAPI_DOTS(1, 0, 1, 1, 1, 1, 1, 0), 'Y'),
+    DO(BRLAPI_DOTS(1, 0, 1, 0, 1, 1, 1, 0), 'Z'),

-    [BRLAPI_DOTS(0,0,1,0,1,1,0,0)] = '0',
-    [BRLAPI_DOTS(0,1,0,0,0,0,0,0)] = '1',
-    [BRLAPI_DOTS(0,1,1,0,0,0,0,0)] = '2',
-    [BRLAPI_DOTS(0,1,0,0,1,0,0,0)] = '3',
-    [BRLAPI_DOTS(0,1,0,0,1,1,0,0)] = '4',
-    [BRLAPI_DOTS(0,1,0,0,0,1,0,0)] = '5',
-    [BRLAPI_DOTS(0,1,1,0,1,0,0,0)] = '6',
-    [BRLAPI_DOTS(0,1,1,0,1,1,0,0)] = '7',
-    [BRLAPI_DOTS(0,1,1,0,0,1,0,0)] = '8',
-    [BRLAPI_DOTS(0,0,1,0,1,0,0,0)] = '9',
+    DO(BRLAPI_DOTS(0, 0, 1, 0, 1, 1, 0, 0), '0'),
+    DO(BRLAPI_DOTS(0, 1, 0, 0, 0, 0, 0, 0), '1'),
+    DO(BRLAPI_DOTS(0, 1, 1, 0, 0, 0, 0, 0), '2'),
+    DO(BRLAPI_DOTS(0, 1, 0, 0, 1, 0, 0, 0), '3'),
+    DO(BRLAPI_DOTS(0, 1, 0, 0, 1, 1, 0, 0), '4'),
+    DO(BRLAPI_DOTS(0, 1, 0, 0, 0, 1, 0, 0), '5'),
+    DO(BRLAPI_DOTS(0, 1, 1, 0, 1, 0, 0, 0), '6'),
+    DO(BRLAPI_DOTS(0, 1, 1, 0, 1, 1, 0, 0), '7'),
+    DO(BRLAPI_DOTS(0, 1, 1, 0, 0, 1, 0, 0), '8'),
+    DO(BRLAPI_DOTS(0, 0, 1, 0, 1, 0, 0, 0), '9'),

-    [BRLAPI_DOTS(0,0,0,1,0,1,0,0)] = '.',
-    [BRLAPI_DOTS(0,0,1,1,0,1,0,0)] = '+',
-    [BRLAPI_DOTS(0,0,1,0,0,1,0,0)] = '-',
-    [BRLAPI_DOTS(1,0,0,0,0,1,0,0)] = '*',
-    [BRLAPI_DOTS(0,0,1,1,0,0,0,0)] = '/',
-    [BRLAPI_DOTS(1,1,1,0,1,1,0,0)] = '(',
-    [BRLAPI_DOTS(0,1,1,1,1,1,0,0)] = ')',
+    DO(BRLAPI_DOTS(0, 0, 0, 1, 0, 1, 0, 0), '.'),
+    DO(BRLAPI_DOTS(0, 0, 1, 1, 0, 1, 0, 0), '+'),
+    DO(BRLAPI_DOTS(0, 0, 1, 0, 0, 1, 0, 0), '-'),
+    DO(BRLAPI_DOTS(1, 0, 0, 0, 0, 1, 0, 0), '*'),
+    DO(BRLAPI_DOTS(0, 0, 1, 1, 0, 0, 0, 0), '/'),
+    DO(BRLAPI_DOTS(1, 1, 1, 0, 1, 1, 0, 0), '('),
+    DO(BRLAPI_DOTS(0, 1, 1, 1, 1, 1, 0, 0), ')'),

-    [BRLAPI_DOTS(1,1,1,1,0,1,0,0)] = '&',
-    [BRLAPI_DOTS(0,0,1,1,1,1,0,0)] = '#',
+    DO(BRLAPI_DOTS(1, 1, 1, 1, 0, 1, 0, 0), '&'),
+    DO(BRLAPI_DOTS(0, 0, 1, 1, 1, 1, 0, 0), '#'),

-    [BRLAPI_DOTS(0,0,0,0,0,1,0,0)] = ',',
-    [BRLAPI_DOTS(0,0,0,0,1,1,0,0)] = ';',
-    [BRLAPI_DOTS(1,0,0,0,1,1,0,0)] = ':',
-    [BRLAPI_DOTS(0,1,1,1,0,1,0,0)] = '!',
-    [BRLAPI_DOTS(1,0,0,1,1,1,0,0)] = '?',
-    [BRLAPI_DOTS(0,0,0,0,1,0,0,0)] = '"',
-    [BRLAPI_DOTS(0,0,1,0,0,0,0,0)] ='\'',
-    [BRLAPI_DOTS(0,0,0,1,0,0,0,0)] = '`',
-    [BRLAPI_DOTS(0,0,0,1,1,0,1,0)] = '^',
-    [BRLAPI_DOTS(0,0,0,1,1,0,0,0)] = '~',
-    [BRLAPI_DOTS(0,1,0,1,0,1,1,0)] = '[',
-    [BRLAPI_DOTS(1,1,0,1,1,1,1,0)] = ']',
-    [BRLAPI_DOTS(0,1,0,1,0,1,0,0)] = '{',
-    [BRLAPI_DOTS(1,1,0,1,1,1,0,0)] = '}',
-    [BRLAPI_DOTS(1,1,1,1,1,1,0,0)] = '=',
-    [BRLAPI_DOTS(1,1,0,0,0,1,0,0)] = '<',
-    [BRLAPI_DOTS(0,0,1,1,1,0,0,0)] = '>',
-    [BRLAPI_DOTS(1,1,0,1,0,1,0,0)] = '$',
-    [BRLAPI_DOTS(1,0,0,1,0,1,0,0)] = '%',
-    [BRLAPI_DOTS(0,0,0,1,0,0,1,0)] = '@',
-    [BRLAPI_DOTS(1,1,0,0,1,1,0,0)] = '|',
-    [BRLAPI_DOTS(1,1,0,0,1,1,1,0)] ='\\',
-    [BRLAPI_DOTS(0,0,0,1,1,1,0,0)] = '_',
+    DO(BRLAPI_DOTS(0, 0, 0, 0, 0, 1, 0, 0), ','),
+    DO(BRLAPI_DOTS(0, 0, 0, 0, 1, 1, 0, 0), ';'),
+    DO(BRLAPI_DOTS(1, 0, 0, 0, 1, 1, 0, 0), ':'),
+    DO(BRLAPI_DOTS(0, 1, 1, 1, 0, 1, 0, 0), '!'),
+    DO(BRLAPI_DOTS(1, 0, 0, 1, 1, 1, 0, 0), '?'),
+    DO(BRLAPI_DOTS(0, 0, 0, 0, 1, 0, 0, 0), '"'),
+    DO(BRLAPI_DOTS(0, 0, 1, 0, 0, 0, 0, 0), '\''),
+    DO(BRLAPI_DOTS(0, 0, 0, 1, 0, 0, 0, 0), '`'),
+    DO(BRLAPI_DOTS(0, 0, 0, 1, 1, 0, 1, 0), '^'),
+    DO(BRLAPI_DOTS(0, 0, 0, 1, 1, 0, 0, 0), '~'),
+    DO(BRLAPI_DOTS(0, 1, 0, 1, 0, 1, 1, 0), '['),
+    DO(BRLAPI_DOTS(1, 1, 0, 1, 1, 1, 1, 0), ']'),
+    DO(BRLAPI_DOTS(0, 1, 0, 1, 0, 1, 0, 0), '{'),
+    DO(BRLAPI_DOTS(1, 1, 0, 1, 1, 1, 0, 0), '}'),
+    DO(BRLAPI_DOTS(1, 1, 1, 1, 1, 1, 0, 0), '='),
+    DO(BRLAPI_DOTS(1, 1, 0, 0, 0, 1, 0, 0), '<'),
+    DO(BRLAPI_DOTS(0, 0, 1, 1, 1, 0, 0, 0), '>'),
+    DO(BRLAPI_DOTS(1, 1, 0, 1, 0, 1, 0, 0), '$'),
+    DO(BRLAPI_DOTS(1, 0, 0, 1, 0, 1, 0, 0), '%'),
+    DO(BRLAPI_DOTS(0, 0, 0, 1, 0, 0, 1, 0), '@'),
+    DO(BRLAPI_DOTS(1, 1, 0, 0, 1, 1, 0, 0), '|'),
+    DO(BRLAPI_DOTS(1, 1, 0, 0, 1, 1, 1, 0), '\\'),
+    DO(BRLAPI_DOTS(0, 0, 0, 1, 1, 1, 0, 0), '_'),
 };

+/* Guest is talking to us: finish BrlAPI init; returns 1 if ready, 0 if not */
+static int baum_deferred_init(BaumDriverState *baum)
+{
+#if defined(CONFIG_SDL)
+#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
+    SDL_SysWMinfo info;
+#endif /* SDL < 2.0 */
+#endif /* CONFIG_SDL */
+    int tty;
+
+    if (baum->deferred_init) { /* already completed on an earlier call */
+        return 1;
+    }
+
+    if (brlapi__getDisplaySize(baum->brlapi, &baum->x, &baum->y) == -1) {
+        brlapi_perror("baum: brlapi__getDisplaySize");
+        return 0; /* flag stays clear, so the next call tries again */
+    }
+
+#if defined(CONFIG_SDL)
+#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
+    memset(&info, 0, sizeof(info));
+    SDL_VERSION(&info.version);
+    if (SDL_GetWMInfo(&info)) {
+        tty = info.info.x11.wmwindow; /* SDL 1.x: hand the X11 WM window to BrlAPI */
+    } else {
+#endif /* SDL < 2.0 */
+#endif /* CONFIG_SDL */
+        tty = BRLAPI_TTY_DEFAULT; /* else-branch under SDL 1.x, unconditional otherwise */
+#if defined(CONFIG_SDL)
+#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
+    } /* closes the else opened inside the SDL 1.x block above */
+#endif /* SDL < 2.0 */
+#endif /* CONFIG_SDL */
+
+    if (brlapi__enterTtyMode(baum->brlapi, tty, NULL) == -1) {
+        brlapi_perror("baum: brlapi__enterTtyMode");
+        return 0; /* flag stays clear, so the next call tries again */
+    }
+    baum->deferred_init = 1; /* set only once both BrlAPI calls have succeeded */
+    return 1;
+}
+
 /* The serial port can receive more of our data */
 static void baum_accept_input(struct CharDriverState *chr)
 {
@@ -346,8 +397,10 @@ static int baum_eat_packet(BaumDriverState *baum, const uint8_t *buf, int len)
                cursor = i + 1;
                c &= ~(BRLAPI_DOT7|BRLAPI_DOT8);
            }
-            if (!(c = nabcc_translation[c]))
+            c = nabcc_translation[DOTS2ASCII][c];
+            if (!c) {
                c = '?';
+            }
            text[i] = c;
        }
        timer_del(baum->cellCount_timer);
@@ -440,6 +493,8 @@ static int baum_write(CharDriverState *chr, const uint8_t *buf, int len)
        return 0;
    if (!baum->brlapi)
        return len;
+    if (!baum_deferred_init(baum))
+        return len;

    while (len) {
        /* Complete our buffer as much as possible */
@@ -476,6 +531,13 @@ static void baum_send_key(BaumDriverState *baum, uint8_t type, uint8_t value) {
    baum_write_packet(baum, packet, sizeof(packet));
 }

+static void baum_send_key2(BaumDriverState *baum, uint8_t type, uint8_t value,
+                           uint8_t value2) {
+    uint8_t packet[] = { type, value, value2 }; /* packet type + two payload bytes */
+    DPRINTF("writing key %x %x %x\n", type, value, value2); /* trace every byte sent */
+    baum_write_packet(baum, packet, sizeof(packet));
+}
+
 /* We got some data on the BrlAPI socket */
 static void baum_chr_read(void *opaque)
 {
@@ -484,6 +546,8 @@ static void baum_chr_read(void *opaque)
    int ret;
    if (!baum->brlapi)
        return;
+    if (!baum_deferred_init(baum))
+        return;
    while ((ret = brlapi__readKey(baum->brlapi, 0, &code)) == 1) {
        DPRINTF("got key %"BRLAPI_PRIxKEYCODE"\n", code);
        /* Emulate */
@@ -540,7 +604,17 @@ static void baum_chr_read(void *opaque)
            }
            break;
        case BRLAPI_KEY_TYPE_SYM:
-            break;
+            {
+                brlapi_keyCode_t keysym = code & BRLAPI_KEY_CODE_MASK;
+                if (keysym < 0x100) {
+                    uint8_t dots = nabcc_translation[ASCII2DOTS][keysym];
+                    if (dots) {
+                        baum_send_key2(baum, BAUM_RSP_EntryKeys, 0, dots);
+                        baum_send_key2(baum, BAUM_RSP_EntryKeys, 0, 0);
+                    }
+                }
+                break;
+            }
        }
    }
    if (ret == -1 && (brlapi_errno != BRLAPI_ERROR_LIBCERR || errno != EINTR)) {
@@ -551,7 +625,7 @@ static void baum_chr_read(void *opaque)
    }
 }

-static void baum_close(struct CharDriverState *chr)
+static void baum_free(struct CharDriverState *chr)
 {
    BaumDriverState *baum = chr->opaque;

@@ -566,18 +640,13 @@ static void baum_close(struct CharDriverState *chr)
 static CharDriverState *chr_baum_init(const char *id,
                                      ChardevBackend *backend,
                                      ChardevReturn *ret,
+                                      bool *be_opened,
                                      Error **errp)
 {
    ChardevCommon *common = backend->u.braille.data;
    BaumDriverState *baum;
    CharDriverState *chr;
    brlapi_handle_t *handle;
-#if defined(CONFIG_SDL)
-#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
-    SDL_SysWMinfo info;
-#endif
-#endif
-    int tty;

    chr = qemu_chr_alloc(common, errp);
    if (!chr) {
@@ -589,7 +658,7 @@ static CharDriverState *chr_baum_init(const char *id,
    chr->opaque = baum;
    chr->chr_write = baum_write;
    chr->chr_accept_input = baum_accept_input;
-    chr->chr_close = baum_close;
+    chr->chr_free = baum_free;

    handle = g_malloc0(brlapi_getHandleSize());
    baum->brlapi = handle;
@@ -600,39 +669,14 @@ static CharDriverState *chr_baum_init(const char *id,
                   brlapi_strerror(brlapi_error_location()));
        goto fail_handle;
    }
+    baum->deferred_init = 0;

    baum->cellCount_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, baum_cellCount_timer_cb, baum);

-    if (brlapi__getDisplaySize(handle, &baum->x, &baum->y) == -1) {
-        error_setg(errp, "brlapi__getDisplaySize: %s",
-                   brlapi_strerror(brlapi_error_location()));
-        goto fail;
-    }
-
-#if defined(CONFIG_SDL)
-#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0)
-    memset(&info, 0, sizeof(info));
-    SDL_VERSION(&info.version);
-    if (SDL_GetWMInfo(&info))
-        tty = info.info.x11.wmwindow;
-    else
-#endif
-#endif
-        tty = BRLAPI_TTY_DEFAULT;
-
-    if (brlapi__enterTtyMode(handle, tty, NULL) == -1) {
-        error_setg(errp, "brlapi__enterTtyMode: %s",
-                   brlapi_strerror(brlapi_error_location()));
-        goto fail;
-    }
-
    qemu_set_fd_handler(baum->brlapi_fd, baum_chr_read, NULL, baum);

    return chr;

-fail:
-    timer_free(baum->cellCount_timer);
-    brlapi__closeConnection(handle);
 fail_handle:
    g_free(handle);
    g_free(chr);
--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -0,0 +1,361 @@
+/*
+ * QEMU Cryptodev backend for QEMU cipher APIs
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ *
+ * Authors:
+ *    Gonglei <arei.gonglei@huawei.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/cryptodev.h"
+#include "hw/boards.h"
+#include "qapi/error.h"
+#include "standard-headers/linux/virtio_crypto.h"
+#include "crypto/cipher.h"
+
+
+/**
+ * @TYPE_CRYPTODEV_BACKEND_BUILTIN:
+ * name of backend that uses QEMU cipher API
+ */
+#define TYPE_CRYPTODEV_BACKEND_BUILTIN "cryptodev-backend-builtin"
+
+#define CRYPTODEV_BACKEND_BUILTIN(obj) \
+    OBJECT_CHECK(CryptoDevBackendBuiltin, \
+                 (obj), TYPE_CRYPTODEV_BACKEND_BUILTIN)
+
+typedef struct CryptoDevBackendBuiltin
+                         CryptoDevBackendBuiltin;
+
+/*
+ * State of one session of the builtin backend.  An instance is
+ * allocated when a cipher session is created and is stored in the
+ * owning backend's sessions[] table until the session is closed.
+ */
+typedef struct CryptoDevBackendBuiltinSession {
+    /* Cipher context obtained from qcrypto_cipher_new() */
+    QCryptoCipher *cipher;
+    uint8_t direction; /* encryption or decryption */
+    uint8_t type; /* cipher? hash? aead? */
+    /* List linkage; not referenced by the code in this file */
+    QTAILQ_ENTRY(CryptoDevBackendBuiltinSession) next;
+} CryptoDevBackendBuiltinSession;
+
+/* Max number of symmetric sessions (size of the sessions[] table) */
+#define MAX_NUM_SESSIONS 256
+
+/*
+ * Key length limits reported through conf.max_auth_key_len and
+ * conf.max_cipher_key_len (presumably in bytes -- TODO confirm).
+ */
+#define CRYPTODEV_BUITLIN_MAX_AUTH_KEY_LEN    512
+#define CRYPTODEV_BUITLIN_MAX_CIPHER_KEY_LEN  64
+
+/*
+ * Instance state of the builtin cryptodev backend.
+ */
+struct CryptoDevBackendBuiltin {
+    CryptoDevBackend parent_obj;
+
+    /*
+     * Session table indexed by session id; a NULL entry marks a
+     * free slot.
+     */
+    CryptoDevBackendBuiltinSession *sessions[MAX_NUM_SESSIONS];
+};
+
+/**
+ * cryptodev_builtin_init:
+ * @backend: the cryptodev backend being initialized
+ * @errp: location to store an error in
+ *
+ * Backend init hook.  Creates the client for queue 0 and fills in
+ * the capabilities stored in @backend->conf.  Sets @errp and returns
+ * without touching @backend when the configured queue count is not 1.
+ */
+static void cryptodev_builtin_init(
+             CryptoDevBackend *backend, Error **errp)
+{
+    /* Only support one queue */
+    int queues = backend->conf.peers.queues;
+    CryptoDevBackendClient *cc;
+
+    if (queues != 1) {
+        error_setg(errp,
+                  "Only support one queue in cryptodev-builtin backend");
+        return;
+    }
+
+    cc = cryptodev_backend_new_client(
+              "cryptodev-builtin", NULL);
+    /* Constant string, no formatting needed */
+    cc->info_str = g_strdup("cryptodev-builtin0");
+    cc->queue_index = 0;
+    backend->conf.peers.ccs[0] = cc;
+
+    backend->conf.crypto_services =
+                         1u << VIRTIO_CRYPTO_SERVICE_CIPHER |
+                         1u << VIRTIO_CRYPTO_SERVICE_HASH |
+                         1u << VIRTIO_CRYPTO_SERVICE_MAC;
+    backend->conf.cipher_algo_l = 1u << VIRTIO_CRYPTO_CIPHER_AES_CBC;
+    backend->conf.hash_algo = 1u << VIRTIO_CRYPTO_HASH_SHA1;
+    /*
+     * Maximum length of a crypto request.  The size of
+     * CryptoDevBackendSymOpInfo is subtracted so that adding it back
+     * when allocating memory for a request cannot overflow.
+     */
+    backend->conf.max_size = LONG_MAX - sizeof(CryptoDevBackendSymOpInfo);
+    backend->conf.max_cipher_key_len = CRYPTODEV_BUITLIN_MAX_CIPHER_KEY_LEN;
+    backend->conf.max_auth_key_len = CRYPTODEV_BUITLIN_MAX_AUTH_KEY_LEN;
+}
+
+/*
+ * Look for the lowest-numbered free slot of the session table.
+ *
+ * Returns the slot index, or -1 when all MAX_NUM_SESSIONS slots are
+ * in use.
+ */
+static int
+cryptodev_builtin_get_unused_session_index(
+                 CryptoDevBackendBuiltin *builtin)
+{
+    int slot = 0;
+
+    while (slot < MAX_NUM_SESSIONS) {
+        if (!builtin->sessions[slot]) {
+            return slot;
+        }
+        slot++;
+    }
+
+    return -1;
+}
+
+/*
+ * Map an AES key length given in bytes to the matching
+ * QCRYPTO_CIPHER_ALG_AES_* value.
+ *
+ * Returns the algorithm, or -1 with @errp set when @key_len is not
+ * 16, 24 or 32.
+ */
+static int
+cryptodev_builtin_get_aes_algo(uint32_t key_len, Error **errp)
+{
+    switch (key_len) {
+    case 128 / 8:
+        return QCRYPTO_CIPHER_ALG_AES_128;
+    case 192 / 8:
+        return QCRYPTO_CIPHER_ALG_AES_192;
+    case 256 / 8:
+        return QCRYPTO_CIPHER_ALG_AES_256;
+    default:
+        error_setg(errp, "Unsupported key length :%u", key_len);
+        return -1;
+    }
+}
+
+/*
+ * Create a cipher session described by @sess_info and store it in
+ * the first free slot of @builtin's session table.
+ *
+ * Returns the slot index (used as session id) on success, or -1 with
+ * @errp set when the operation type or algorithm is unsupported, the
+ * table is full, the AES key length is invalid or the cipher context
+ * cannot be created.
+ */
+static int cryptodev_builtin_create_cipher_session(
+                    CryptoDevBackendBuiltin *builtin,
+                    CryptoDevBackendSymSessionInfo *sess_info,
+                    Error **errp)
+{
+    CryptoDevBackendBuiltinSession *session;
+    QCryptoCipher *cipher;
+    bool is_aes = true;
+    int algo = -1;
+    int mode;
+    int slot;
+
+    if (sess_info->op_type != VIRTIO_CRYPTO_SYM_OP_CIPHER) {
+        error_setg(errp, "Unsupported optype :%u", sess_info->op_type);
+        return -1;
+    }
+
+    slot = cryptodev_builtin_get_unused_session_index(builtin);
+    if (slot < 0) {
+        error_setg(errp, "Total number of sessions created exceeds %u",
+                  MAX_NUM_SESSIONS);
+        return -1;
+    }
+
+    /* Pick the block mode; AES variants get their algo from the key size */
+    switch (sess_info->cipher_alg) {
+    case VIRTIO_CRYPTO_CIPHER_AES_ECB:
+        mode = QCRYPTO_CIPHER_MODE_ECB;
+        break;
+    case VIRTIO_CRYPTO_CIPHER_AES_CBC:
+        mode = QCRYPTO_CIPHER_MODE_CBC;
+        break;
+    case VIRTIO_CRYPTO_CIPHER_AES_CTR:
+        mode = QCRYPTO_CIPHER_MODE_CTR;
+        break;
+    case VIRTIO_CRYPTO_CIPHER_DES_ECB:
+        is_aes = false;
+        algo = QCRYPTO_CIPHER_ALG_DES_RFB;
+        mode = QCRYPTO_CIPHER_MODE_ECB;
+        break;
+    default:
+        error_setg(errp, "Unsupported cipher alg :%u",
+                   sess_info->cipher_alg);
+        return -1;
+    }
+
+    if (is_aes) {
+        algo = cryptodev_builtin_get_aes_algo(sess_info->key_len, errp);
+        if (algo < 0) {
+            return -1;
+        }
+    }
+
+    cipher = qcrypto_cipher_new(algo, mode,
+                               sess_info->cipher_key,
+                               sess_info->key_len,
+                               errp);
+    if (cipher == NULL) {
+        return -1;
+    }
+
+    session = g_new0(CryptoDevBackendBuiltinSession, 1);
+    session->type = sess_info->op_type;
+    session->direction = sess_info->direction;
+    session->cipher = cipher;
+
+    builtin->sessions[slot] = session;
+
+    return slot;
+}
+
+/*
+ * Backend create_session hook.  Only the cipher session opcode is
+ * handled; every other opcode is rejected.
+ *
+ * Returns the new session id, or a negative value with @errp set on
+ * failure.
+ */
+static int64_t cryptodev_builtin_sym_create_session(
+           CryptoDevBackend *backend,
+           CryptoDevBackendSymSessionInfo *sess_info,
+           uint32_t queue_index, Error **errp)
+{
+    CryptoDevBackendBuiltin *builtin =
+                      CRYPTODEV_BACKEND_BUILTIN(backend);
+
+    /* Hash and MAC session opcodes are rejected like unknown ones */
+    if (sess_info->op_code != VIRTIO_CRYPTO_CIPHER_CREATE_SESSION) {
+        error_setg(errp, "Unsupported opcode :%" PRIu32 "",
+                   sess_info->op_code);
+        return -1;
+    }
+
+    /* The int result (slot index or negative error) widens unchanged */
+    return cryptodev_builtin_create_cipher_session(builtin, sess_info, errp);
+}
+
+/*
+ * Backend close_session hook: free the session stored under
+ * @session_id and mark its slot as unused.
+ *
+ * Returns 0 on success, or -1 with @errp set when @session_id is out
+ * of range or refers to an empty slot.
+ */
+static int cryptodev_builtin_sym_close_session(
+           CryptoDevBackend *backend,
+           uint64_t session_id,
+           uint32_t queue_index, Error **errp)
+{
+    CryptoDevBackendBuiltin *builtin =
+                      CRYPTODEV_BACKEND_BUILTIN(backend);
+    CryptoDevBackendBuiltinSession *sess = NULL;
+
+    /* Only index the table once the id is known to be in range */
+    if (session_id < MAX_NUM_SESSIONS) {
+        sess = builtin->sessions[session_id];
+    }
+    if (!sess) {
+        error_setg(errp, "Cannot find a valid session id: %" PRIu64 "",
+                      session_id);
+        return -1;
+    }
+
+    builtin->sessions[session_id] = NULL;
+    qcrypto_cipher_free(sess->cipher);
+    g_free(sess);
+    return 0;
+}
+
+/*
+ * Backend do_sym_op hook: run one cipher request on the session
+ * named by @op_info->session_id, encrypting or decrypting
+ * @op_info->src into @op_info->dst according to the direction stored
+ * in the session.
+ *
+ * Returns VIRTIO_CRYPTO_OK on success, otherwise a negated
+ * VIRTIO_CRYPTO_* status with @errp set:
+ *   -VIRTIO_CRYPTO_INVSESS  unknown or unused session id
+ *   -VIRTIO_CRYPTO_NOTSUPP  algorithm chaining requested
+ *   -VIRTIO_CRYPTO_ERR      the cipher layer reported a failure
+ */
+static int cryptodev_builtin_sym_operation(
+                 CryptoDevBackend *backend,
+                 CryptoDevBackendSymOpInfo *op_info,
+                 uint32_t queue_index, Error **errp)
+{
+    CryptoDevBackendBuiltin *builtin =
+                      CRYPTODEV_BACKEND_BUILTIN(backend);
+    CryptoDevBackendBuiltinSession *sess;
+    int ret;
+
+    if (op_info->session_id >= MAX_NUM_SESSIONS ||
+              builtin->sessions[op_info->session_id] == NULL) {
+        error_setg(errp, "Cannot find a valid session id: %" PRIu64 "",
+                   op_info->session_id);
+        return -VIRTIO_CRYPTO_INVSESS;
+    }
+
+    if (op_info->op_type == VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING) {
+        error_setg(errp,
+               "Algorithm chain is unsupported for cryptodev-builtin");
+        return -VIRTIO_CRYPTO_NOTSUPP;
+    }
+
+    sess = builtin->sessions[op_info->session_id];
+
+    /*
+     * Only program an IV when the request carries one.  ECB sessions
+     * have no IV, and handing a zero-length IV to the cipher layer
+     * would make every such request fail.
+     */
+    if (op_info->iv_len > 0) {
+        ret = qcrypto_cipher_setiv(sess->cipher, op_info->iv,
+                                   op_info->iv_len, errp);
+        if (ret < 0) {
+            return -VIRTIO_CRYPTO_ERR;
+        }
+    }
+
+    if (sess->direction == VIRTIO_CRYPTO_OP_ENCRYPT) {
+        ret = qcrypto_cipher_encrypt(sess->cipher, op_info->src,
+                                     op_info->dst, op_info->src_len, errp);
+    } else {
+        ret = qcrypto_cipher_decrypt(sess->cipher, op_info->src,
+                                     op_info->dst, op_info->src_len, errp);
+    }
+    if (ret < 0) {
+        return -VIRTIO_CRYPTO_ERR;
+    }
+
+    return VIRTIO_CRYPTO_OK;
+}
+
+/*
+ * Backend cleanup hook: close every session that is still open and
+ * release the client that was created for the queue.
+ */
+static void cryptodev_builtin_cleanup(
+             CryptoDevBackend *backend,
+             Error **errp)
+{
+    CryptoDevBackendBuiltin *builtin = CRYPTODEV_BACKEND_BUILTIN(backend);
+    int queues = backend->conf.peers.queues;
+    size_t idx;
+
+    for (idx = 0; idx < MAX_NUM_SESSIONS; idx++) {
+        if (builtin->sessions[idx] == NULL) {
+            continue;
+        }
+        cryptodev_builtin_sym_close_session(backend, idx, 0, errp);
+    }
+
+    /* The init hook refuses any other queue count */
+    assert(queues == 1);
+
+    for (idx = 0; idx < queues; idx++) {
+        CryptoDevBackendClient *client = backend->conf.peers.ccs[idx];
+
+        if (client == NULL) {
+            continue;
+        }
+        cryptodev_backend_free_client(client);
+        backend->conf.peers.ccs[idx] = NULL;
+    }
+}
+
+/* Install the builtin implementations of the backend class hooks */
+static void
+cryptodev_builtin_class_init(ObjectClass *oc, void *data)
+{
+    CryptoDevBackendClass *klass = CRYPTODEV_BACKEND_CLASS(oc);
+
+    /* Backend life cycle */
+    klass->init = cryptodev_builtin_init;
+    klass->cleanup = cryptodev_builtin_cleanup;
+
+    /* Session management and request processing */
+    klass->create_session = cryptodev_builtin_sym_create_session;
+    klass->close_session = cryptodev_builtin_sym_close_session;
+    klass->do_sym_op = cryptodev_builtin_sym_operation;
+}
+
+/* QOM type description of the builtin cryptodev backend */
+static const TypeInfo cryptodev_builtin_info = {
+    .parent = TYPE_CRYPTODEV_BACKEND,
+    .name = TYPE_CRYPTODEV_BACKEND_BUILTIN,
+    .instance_size = sizeof(CryptoDevBackendBuiltin),
+    .class_init = cryptodev_builtin_class_init,
+};
+
+/* Register the builtin cryptodev backend type */
+static void cryptodev_builtin_register_types(void)
+{
+    type_register_static(&cryptodev_builtin_info);
+}
+
+type_init(cryptodev_builtin_register_types);
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -0,0 +1,245 @@
+/*
+ * QEMU Crypto Device Implementation
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ *
+ * Authors:
+ *    Gonglei <arei.gonglei@huawei.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/cryptodev.h"
+#include "hw/boards.h"
+#include "qapi/error.h"
+#include "qapi/visitor.h"
+#include "qapi-types.h"
+#include "qapi-visit.h"
+#include "qemu/config-file.h"
+#include "qom/object_interfaces.h"
+#include "hw/virtio/virtio-crypto.h"
+
+
+/*
+ * List of all clients created by cryptodev_backend_new_client().
+ * The head is initialized in cryptodev_backend_class_init().
+ */
+static QTAILQ_HEAD(, CryptoDevBackendClient) crypto_clients;
+
+
+/*
+ * Allocate a zero-initialized client, give it copies of @model and
+ * of @name (which may be NULL) and append it to the list of clients.
+ *
+ * Returns the new client; it is released with
+ * cryptodev_backend_free_client().
+ */
+CryptoDevBackendClient *
+cryptodev_backend_new_client(const char *model,
+                                    const char *name)
+{
+    CryptoDevBackendClient *client = g_new0(CryptoDevBackendClient, 1);
+
+    client->model = g_strdup(model);
+    /* g_strdup() maps NULL to NULL, so an absent name stays NULL */
+    client->name = g_strdup(name);
+
+    QTAILQ_INSERT_TAIL(&crypto_clients, client, next);
+
+    return client;
+}
+
+/*
+ * Unlink @cc from the list of clients and free it together with the
+ * strings it owns.
+ */
+void cryptodev_backend_free_client(
+                  CryptoDevBackendClient *cc)
+{
+    /* Take it off the list before any of its memory goes away */
+    QTAILQ_REMOVE(&crypto_clients, cc, next);
+
+    g_free(cc->info_str);
+    g_free(cc->model);
+    g_free(cc->name);
+    g_free(cc);
+}
+
+/*
+ * Run the cleanup hook of @backend's class, if it has one, and mark
+ * the backend as not ready.
+ */
+void cryptodev_backend_cleanup(
+             CryptoDevBackend *backend,
+             Error **errp)
+{
+    CryptoDevBackendClass *klass = CRYPTODEV_BACKEND_GET_CLASS(backend);
+
+    if (klass->cleanup != NULL) {
+        klass->cleanup(backend, errp);
+    }
+
+    /* Cleared whether or not a hook was run */
+    backend->ready = false;
+}
+
+/*
+ * Forward a session creation request to the create_session hook of
+ * @backend's class.
+ *
+ * Returns whatever the hook returns, or -1 when the class provides
+ * no such hook (@errp is left untouched in that case).
+ */
+int64_t cryptodev_backend_sym_create_session(
+           CryptoDevBackend *backend,
+           CryptoDevBackendSymSessionInfo *sess_info,
+           uint32_t queue_index, Error **errp)
+{
+    CryptoDevBackendClass *klass = CRYPTODEV_BACKEND_GET_CLASS(backend);
+
+    if (klass->create_session == NULL) {
+        return -1;
+    }
+
+    return klass->create_session(backend, sess_info, queue_index, errp);
+}
+
+/*
+ * Forward a session close request to the close_session hook of
+ * @backend's class.
+ *
+ * Returns whatever the hook returns, or -1 when the class provides
+ * no such hook (@errp is left untouched in that case).
+ */
+int cryptodev_backend_sym_close_session(
+           CryptoDevBackend *backend,
+           uint64_t session_id,
+           uint32_t queue_index, Error **errp)
+{
+    CryptoDevBackendClass *klass = CRYPTODEV_BACKEND_GET_CLASS(backend);
+
+    if (klass->close_session == NULL) {
+        return -1;
+    }
+
+    return klass->close_session(backend, session_id, queue_index, errp);
+}
+
+/*
+ * Forward a symmetric crypto request to the do_sym_op hook of
+ * @backend's class.
+ *
+ * Returns whatever the hook returns, or -VIRTIO_CRYPTO_ERR when the
+ * class provides no such hook (@errp is left untouched in that case).
+ */
+static int cryptodev_backend_sym_operation(
+                 CryptoDevBackend *backend,
+                 CryptoDevBackendSymOpInfo *op_info,
+                 uint32_t queue_index, Error **errp)
+{
+    CryptoDevBackendClass *klass = CRYPTODEV_BACKEND_GET_CLASS(backend);
+
+    if (klass->do_sym_op == NULL) {
+        return -VIRTIO_CRYPTO_ERR;
+    }
+
+    return klass->do_sym_op(backend, op_info, queue_index, errp);
+}
+
+/*
+ * Dispatch the crypto request @opaque (a VirtIOCryptoReq) to
+ * @backend.  Only symmetric algorithm requests are supported.
+ *
+ * Returns the result of the symmetric operation, or
+ * -VIRTIO_CRYPTO_NOTSUPP with @errp set for any other request type.
+ */
+int cryptodev_backend_crypto_operation(
+                 CryptoDevBackend *backend,
+                 void *opaque,
+                 uint32_t queue_index, Error **errp)
+{
+    VirtIOCryptoReq *req = opaque;
+
+    if (req->flags != CRYPTODEV_BACKEND_ALG_SYM) {
+        error_setg(errp, "Unsupported cryptodev alg type: %" PRIu32 "",
+                   req->flags);
+        return -VIRTIO_CRYPTO_NOTSUPP;
+    }
+
+    return cryptodev_backend_sym_operation(backend, req->u.sym_op_info,
+                                           queue_index, errp);
+}
+
+/* Getter of the "queues" property: visits the configured queue count */
+static void
+cryptodev_backend_get_queues(Object *obj, Visitor *v, const char *name,
+                             void *opaque, Error **errp)
+{
+    CryptoDevBackend *backend = CRYPTODEV_BACKEND(obj);
+    uint32_t queues;
+
+    /* Visit a local copy rather than the field itself */
+    queues = backend->conf.peers.queues;
+    visit_type_uint32(v, name, &queues, errp);
+}
+
+/*
+ * Setter of the "queues" property.  A value of 0 is rejected with an
+ * error; any other value is stored in the backend configuration.
+ */
+static void
+cryptodev_backend_set_queues(Object *obj, Visitor *v, const char *name,
+                             void *opaque, Error **errp)
+{
+    CryptoDevBackend *backend = CRYPTODEV_BACKEND(obj);
+    Error *err = NULL;
+    uint32_t queues;
+
+    visit_type_uint32(v, name, &queues, &err);
+    if (err == NULL) {
+        if (queues != 0) {
+            backend->conf.peers.queues = queues;
+        } else {
+            error_setg(&err, "Property '%s.%s' doesn't take value '%"
+                       PRIu32 "'", object_get_typename(obj), name, queues);
+        }
+    }
+
+    /* Called on every path, as the original did; err is NULL on success */
+    error_propagate(errp, err);
+}
+
+/*
+ * UserCreatable complete hook: run the init hook of the backend
+ * class, if any, and record in backend->ready whether it succeeded.
+ * A failure of the hook is propagated to @errp.
+ */
+static void
+cryptodev_backend_complete(UserCreatable *uc, Error **errp)
+{
+    CryptoDevBackend *backend = CRYPTODEV_BACKEND(uc);
+    CryptoDevBackendClass *klass = CRYPTODEV_BACKEND_GET_CLASS(uc);
+    Error *err = NULL;
+
+    if (klass->init != NULL) {
+        klass->init(backend, &err);
+    }
+
+    /* Ready unless the hook reported an error (no hook counts as ok) */
+    backend->ready = (err == NULL);
+    if (err != NULL) {
+        error_propagate(errp, err);
+    }
+}
+
+/*
+ * Instance init: add the "queues" property to @obj and give it an
+ * initial value of 1.  Errors from either call are ignored.
+ */
+static void cryptodev_backend_instance_init(Object *obj)
+{
+    object_property_add(obj, "queues", "int",
+                        cryptodev_backend_get_queues,
+                        cryptodev_backend_set_queues,
+                        NULL, NULL,
+                        NULL);
+
+    /* Goes through the setter installed just above */
+    object_property_set_int(obj, 1, "queues",
+                            NULL);
+}
+
+/* Instance finalizer; intentionally does nothing */
+static void cryptodev_backend_finalize(Object *obj)
+{
+}
+
+/*
+ * Class init of the abstract cryptodev backend: set up the list of
+ * clients and install the UserCreatable complete hook.
+ */
+static void
+cryptodev_backend_class_init(ObjectClass *oc, void *data)
+{
+    UserCreatableClass *user_creatable = USER_CREATABLE_CLASS(oc);
+
+    QTAILQ_INIT(&crypto_clients);
+
+    user_creatable->complete = cryptodev_backend_complete;
+}
+
+/* QOM type description of the base cryptodev backend */
+static const TypeInfo cryptodev_backend_info = {
+    .parent = TYPE_OBJECT,
+    .name = TYPE_CRYPTODEV_BACKEND,
+    .class_size = sizeof(CryptoDevBackendClass),
+    .class_init = cryptodev_backend_class_init,
+    .instance_size = sizeof(CryptoDevBackend),
+    .instance_init = cryptodev_backend_instance_init,
+    .instance_finalize = cryptodev_backend_finalize,
+    /* Implements the UserCreatable interface */
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_USER_CREATABLE },
+        { }
+    }
+};
+
+/* Register the base cryptodev backend type */
+static void cryptodev_backend_register_types(void)
+{
+    type_register_static(&cryptodev_backend_info);
+}
+
+type_init(cryptodev_backend_register_types);
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -64,14 +64,6 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
 #endif
 }

-static void
-file_backend_class_init(ObjectClass *oc, void *data)
-{
-    HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc);
-
-    bc->alloc = file_backend_memory_alloc;
-}
-
 static char *get_mem_path(Object *o, Error **errp)
 {
    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
@@ -112,13 +104,18 @@ static void file_memory_backend_set_share(Object *o, bool value, Error **errp)
 }

 static void
-file_backend_instance_init(Object *o)
+file_backend_class_init(ObjectClass *oc, void *data)
 {
-    object_property_add_bool(o, "share",
-                        file_memory_backend_get_share,
-                        file_memory_backend_set_share, NULL);
-    object_property_add_str(o, "mem-path", get_mem_path,
-                            set_mem_path, NULL);
+    HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc);
+
+    bc->alloc = file_backend_memory_alloc;
+
+    object_class_property_add_bool(oc, "share",
+        file_memory_backend_get_share, file_memory_backend_set_share,
+        &error_abort);
+    object_class_property_add_str(oc, "mem-path",
+        get_mem_path, set_mem_path,
+        &error_abort);
 }

 static void file_backend_instance_finalize(Object *o)
@@ -132,7 +129,6 @@ static const TypeInfo file_backend_info = {
    .name = TYPE_MEMORY_BACKEND_FILE,
    .parent = TYPE_MEMORY_BACKEND,
    .class_init = file_backend_class_init,
-    .instance_init = file_backend_instance_init,
    .instance_finalize = file_backend_instance_finalize,
    .instance_size = sizeof(HostMemoryBackendFile),
 };
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -241,26 +241,6 @@ static void host_memory_backend_init(Object *obj)
    backend->merge = machine_mem_merge(machine);
    backend->dump = machine_dump_guest_core(machine);
    backend->prealloc = mem_prealloc;
-
-    object_property_add_bool(obj, "merge",
-                        host_memory_backend_get_merge,
-                        host_memory_backend_set_merge, NULL);
-    object_property_add_bool(obj, "dump",
-                        host_memory_backend_get_dump,
-                        host_memory_backend_set_dump, NULL);
-    object_property_add_bool(obj, "prealloc",
-                        host_memory_backend_get_prealloc,
-                        host_memory_backend_set_prealloc, NULL);
-    object_property_add(obj, "size", "int",
-                        host_memory_backend_get_size,
-                        host_memory_backend_set_size, NULL, NULL, NULL);
-    object_property_add(obj, "host-nodes", "int",
-                        host_memory_backend_get_host_nodes,
-                        host_memory_backend_set_host_nodes, NULL, NULL, NULL);
-    object_property_add_enum(obj, "policy", "HostMemPolicy",
-                             HostMemPolicy_lookup,
-                             host_memory_backend_get_policy,
-                             host_memory_backend_set_policy, NULL);
 }

 MemoryRegion *
@@ -375,6 +355,28 @@ host_memory_backend_class_init(ObjectClass *oc, void *data)

    ucc->complete = host_memory_backend_memory_complete;
    ucc->can_be_deleted = host_memory_backend_can_be_deleted;
+
+    object_class_property_add_bool(oc, "merge",
+        host_memory_backend_get_merge,
+        host_memory_backend_set_merge, &error_abort);
+    object_class_property_add_bool(oc, "dump",
+        host_memory_backend_get_dump,
+        host_memory_backend_set_dump, &error_abort);
+    object_class_property_add_bool(oc, "prealloc",
+        host_memory_backend_get_prealloc,
+        host_memory_backend_set_prealloc, &error_abort);
+    object_class_property_add(oc, "size", "int",
+        host_memory_backend_get_size,
+        host_memory_backend_set_size,
+        NULL, NULL, &error_abort);
+    object_class_property_add(oc, "host-nodes", "int",
+        host_memory_backend_get_host_nodes,
+        host_memory_backend_set_host_nodes,
+        NULL, NULL, &error_abort);
+    object_class_property_add_enum(oc, "policy", "HostMemPolicy",
+        HostMemPolicy_lookup,
+        host_memory_backend_get_policy,
+        host_memory_backend_set_policy, &error_abort);
 }

 static const TypeInfo host_memory_backend_info = {
--- a/backends/msmouse.c
+++ b/backends/msmouse.c
@@ -133,7 +133,7 @@ static int msmouse_chr_write (struct CharDriverState *s, const uint8_t *buf, int
    return len;
 }

-static void msmouse_chr_close (struct CharDriverState *chr)
+static void msmouse_chr_free(struct CharDriverState *chr)
 {
    MouseState *mouse = chr->opaque;

@@ -151,6 +151,7 @@ static QemuInputHandler msmouse_handler = {
 static CharDriverState *qemu_chr_open_msmouse(const char *id,
                                              ChardevBackend *backend,
                                              ChardevReturn *ret,
+                                              bool *be_opened,
                                              Error **errp)
 {
    ChardevCommon *common = backend->u.msmouse.data;
@@ -162,9 +163,9 @@ static CharDriverState *qemu_chr_open_msmouse(const char *id,
        return NULL;
    }
    chr->chr_write = msmouse_chr_write;
-    chr->chr_close = msmouse_chr_close;
+    chr->chr_free = msmouse_chr_free;
    chr->chr_accept_input = msmouse_chr_accept_input;
-    chr->explicit_be_open = true;
+    *be_opened = false;

    mouse = g_new0(MouseState, 1);
    mouse->hs = qemu_input_handler_register((DeviceState *)mouse,
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@@ -15,7 +15,6 @@
 #include "sysemu/char.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
-#include "hw/qdev.h" /* just for DEFINE_PROP_CHR */

 #define TYPE_RNG_EGD "rng-egd"
 #define RNG_EGD(obj) OBJECT_CHECK(RngEgd, (obj), TYPE_RNG_EGD)
@@ -24,7 +23,7 @@ typedef struct RngEgd
 {
    RngBackend parent;

-    CharDriverState *chr;
+    CharBackend chr;
    char *chr_name;
 } RngEgd;

@@ -43,7 +42,7 @@ static void rng_egd_request_entropy(RngBackend *b, RngRequest *req)

        /* XXX this blocks entire thread. Rewrite to use
         * qemu_chr_fe_write and background I/O callbacks */
-        qemu_chr_fe_write_all(s->chr, header, sizeof(header));
+        qemu_chr_fe_write_all(&s->chr, header, sizeof(header));

        size -= len;
    }
@@ -87,6 +86,7 @@ static void rng_egd_chr_read(void *opaque, const uint8_t *buf, int size)
 static void rng_egd_opened(RngBackend *b, Error **errp)
 {
    RngEgd *s = RNG_EGD(b);
+    CharDriverState *chr;

    if (s->chr_name == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
@@ -94,21 +94,19 @@ static void rng_egd_opened(RngBackend *b, Error **errp)
        return;
    }

-    s->chr = qemu_chr_find(s->chr_name);
-    if (s->chr == NULL) {
+    chr = qemu_chr_find(s->chr_name);
+    if (chr == NULL) {
        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
                  "Device '%s' not found", s->chr_name);
        return;
    }
-
-    if (qemu_chr_fe_claim(s->chr) != 0) {
-        error_setg(errp, QERR_DEVICE_IN_USE, s->chr_name);
+    if (!qemu_chr_fe_init(&s->chr, chr, errp)) {
        return;
    }

    /* FIXME we should resubmit pending requests when the CDS reconnects. */
-    qemu_chr_add_handlers(s->chr, rng_egd_chr_can_read, rng_egd_chr_read,
-                          NULL, s);
+    qemu_chr_fe_set_handlers(&s->chr, rng_egd_chr_can_read,
+                             rng_egd_chr_read, NULL, s, NULL, true);
 }

 static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp)
@@ -127,9 +125,10 @@ static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp)
 static char *rng_egd_get_chardev(Object *obj, Error **errp)
 {
    RngEgd *s = RNG_EGD(obj);
+    CharDriverState *chr = qemu_chr_fe_get_driver(&s->chr);

-    if (s->chr && s->chr->label) {
-        return g_strdup(s->chr->label);
+    if (chr && chr->label) {
+        return g_strdup(chr->label);
    }

    return NULL;
@@ -146,11 +145,7 @@ static void rng_egd_finalize(Object *obj)
 {
    RngEgd *s = RNG_EGD(obj);

-    if (s->chr) {
-        qemu_chr_add_handlers(s->chr, NULL, NULL, NULL, NULL);
-        qemu_chr_fe_release(s->chr);
-    }
-
+    qemu_chr_fe_deinit(&s->chr);
    g_free(s->chr_name);
 }

--- a/backends/testdev.c
+++ b/backends/testdev.c
@@ -102,7 +102,7 @@ static int testdev_write(CharDriverState *chr, const uint8_t *buf, int len)
    return orig_len;
 }

-static void testdev_close(struct CharDriverState *chr)
+static void testdev_free(struct CharDriverState *chr)
 {
    TestdevCharState *testdev = chr->opaque;

@@ -112,6 +112,7 @@ static void testdev_close(struct CharDriverState *chr)
 static CharDriverState *chr_testdev_init(const char *id,
                                         ChardevBackend *backend,
                                         ChardevReturn *ret,
+                                         bool *be_opened,
                                         Error **errp)
 {
    TestdevCharState *testdev;
@@ -122,7 +123,7 @@ static CharDriverState *chr_testdev_init(const char *id,

    chr->opaque = testdev;
    chr->chr_write = testdev_write;
-    chr->chr_close = testdev_close;
+    chr->chr_free = testdev_free;

    return chr;
 }
--- a/block.c
+++ b/block.c
@@ -42,6 +42,7 @@
 #include "qapi-event.h"
 #include "qemu/cutils.h"
 #include "qemu/id.h"
+#include "qapi/util.h"

 #ifdef CONFIG_BSD
 #include <sys/ioctl.h>
@@ -764,7 +765,7 @@ static void bdrv_inherited_options(int *child_flags, QDict *child_options,
    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can default to enable both on lower layers regardless of the
     * corresponding parent options. */
-    flags |= BDRV_O_UNMAP;
+    qdict_set_default_str(child_options, BDRV_OPT_DISCARD, "unmap");

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ |
@@ -925,7 +926,7 @@ out:
    g_free(gen_node_name);
 }

-static QemuOptsList bdrv_runtime_opts = {
+QemuOptsList bdrv_runtime_opts = {
    .name = "bdrv_common",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
    .desc = {
@@ -954,6 +955,16 @@ static QemuOptsList bdrv_runtime_opts = {
            .type = QEMU_OPT_BOOL,
            .help = "Node is opened in read-only mode",
        },
+        {
+            .name = "detect-zeroes",
+            .type = QEMU_OPT_STRING,
+            .help = "try to optimize zero writes (off, on, unmap)",
+        },
+        {
+            .name = "discard",
+            .type = QEMU_OPT_STRING,
+            .help = "discard operation (ignore/off, unmap/on)",
+        },
        { /* end of list */ }
    },
 };
@@ -970,6 +981,8 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
    const char *filename;
    const char *driver_name = NULL;
    const char *node_name = NULL;
+    const char *discard;
+    const char *detect_zeroes;
    QemuOpts *opts;
    BlockDriver *drv;
    Error *local_err = NULL;
@@ -1038,6 +1051,41 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
        }
    }

+    discard = qemu_opt_get(opts, "discard");
+    if (discard != NULL) {
+        if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
+            error_setg(errp, "Invalid discard option");
+            ret = -EINVAL;
+            goto fail_opts;
+        }
+    }
+
+    detect_zeroes = qemu_opt_get(opts, "detect-zeroes");
+    if (detect_zeroes) {
+        BlockdevDetectZeroesOptions value =
+            qapi_enum_parse(BlockdevDetectZeroesOptions_lookup,
+                            detect_zeroes,
+                            BLOCKDEV_DETECT_ZEROES_OPTIONS__MAX,
+                            BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF,
+                            &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            ret = -EINVAL;
+            goto fail_opts;
+        }
+
+        if (value == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
+            !(bs->open_flags & BDRV_O_UNMAP))
+        {
+            error_setg(errp, "setting detect-zeroes to unmap is not allowed "
+                             "without setting discard operation to unmap");
+            ret = -EINVAL;
+            goto fail_opts;
+        }
+
+        bs->detect_zeroes = value;
+    }
+
    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
@@ -1380,9 +1428,11 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(backing_hd, bs->backing_blocker);
-    /* Otherwise we won't be able to commit due to check in bdrv_commit */
+    /* Otherwise we won't be able to commit or stream */
    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
                    bs->backing_blocker);
+    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
+                    bs->backing_blocker);
    /*
     * We do backup in 3 ways:
     * 1. drive backup
@@ -2034,7 +2084,7 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
 * to all devices.
 *
 */
-int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
+int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
 {
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
@@ -2042,7 +2092,9 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)

    assert(bs_queue != NULL);

-    bdrv_drain_all();
+    aio_context_release(ctx);
+    bdrv_drain_all_begin();
+    aio_context_acquire(ctx);

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
@@ -2072,6 +2124,9 @@ cleanup:
        g_free(bs_entry);
    }
    g_free(bs_queue);
+
+    bdrv_drain_all_end();
+
    return ret;
 }

@@ -2083,7 +2138,7 @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);

-    ret = bdrv_reopen_multiple(queue, &local_err);
+    ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
@@ -2741,7 +2796,7 @@ const char *bdrv_get_format_name(BlockDriverState *bs)

 static int qsort_strcmp(const void *a, const void *b)
 {
-    return strcmp(a, b);
+    /* qsort() hands the comparator pointers to the array elements. The
+     * array sorted with this callback holds string pointers, so each
+     * argument must be dereferenced once before it is given to strcmp(). */
+    return strcmp(*(char *const *)a, *(char *const *)b);
 }

 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
@@ -2767,6 +2822,24 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
        }
    }

+    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); i++) {
+        const char *format_name = block_driver_modules[i].format_name;
+
+        if (format_name) {
+            bool found = false;
+            int j = count;
+
+            while (formats && j && !found) {
+                found = !strcmp(formats[--j], format_name);
+            }
+
+            if (!found) {
+                formats = g_renew(const char *, formats, count + 1);
+                formats[count++] = format_name;
+            }
+        }
+    }
+
    qsort(formats, count, sizeof(formats[0]), qsort_strcmp);

    for (i = 0; i < count; i++) {
@@ -3312,17 +3385,10 @@ int bdrv_media_changed(BlockDriverState *bs)
 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
 {
    BlockDriver *drv = bs->drv;
-    const char *device_name;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }
-
-    device_name = bdrv_get_device_name(bs);
-    if (device_name[0] != '\0') {
-        qapi_event_send_device_tray_moved(device_name,
-                                          eject_flag, &error_abort);
-    }
 }

 /**
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -41,6 +41,7 @@ gluster.o-libs     := $(GLUSTERFS_LIBS)
 ssh.o-cflags       := $(LIBSSH2_CFLAGS)
 ssh.o-libs         := $(LIBSSH2_LIBS)
 archipelago.o-libs := $(ARCHIPELAGO_LIBS)
-dmg.o-libs         := $(BZIP2_LIBS)
+block-obj-$(if $(CONFIG_BZIP2),m,n) += dmg-bz2.o
+dmg-bz2.o-libs     := $(BZIP2_LIBS)
 qcow.o-libs        := -lz
 linux-aio.o-libs   := -laio
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -87,7 +87,6 @@ typedef enum {

 typedef struct ArchipelagoAIOCB {
    BlockAIOCB common;
-    QEMUBH *bh;
    struct BDRVArchipelagoState *s;
    QEMUIOVector *qiov;
    ARCHIPCmd cmd;
@@ -154,11 +153,10 @@ static void archipelago_finish_aiocb(AIORequestData *reqdata)
    } else if (reqdata->aio_cb->ret == reqdata->segreq->total) {
        reqdata->aio_cb->ret = 0;
    }
-    reqdata->aio_cb->bh = aio_bh_new(
+    aio_bh_schedule_oneshot(
                        bdrv_get_aio_context(reqdata->aio_cb->common.bs),
                        qemu_archipelago_complete_aio, reqdata
                        );
-    qemu_bh_schedule(reqdata->aio_cb->bh);
 }

 static int wait_reply(struct xseg *xseg, xport srcport, struct xseg_port *port,
@@ -313,7 +311,6 @@ static void qemu_archipelago_complete_aio(void *opaque)
    AIORequestData *reqdata = (AIORequestData *) opaque;
    ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;

-    qemu_bh_delete(aio_cb->bh);
    aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
    aio_cb->status = 0;

--- a/block/backup.c
+++ b/block/backup.c
@@ -16,7 +16,7 @@
 #include "trace.h"
 #include "block/block.h"
 #include "block/block_int.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "block/block_backup.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
@@ -242,6 +242,14 @@ static void backup_abort(BlockJob *job)
    }
 }

+/*
+ * Drop the reference the backup job holds on its target BlockBackend and
+ * clear the pointer. Installed as the .clean callback of backup_job_driver
+ * and also called from the error path of backup_job_create(). The target
+ * must still be set when this runs (asserted).
+ */
+static void backup_clean(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    assert(s->target);
+    blk_unref(s->target);
+    s->target = NULL;
+}
+
 static void backup_attached_aio_context(BlockJob *job, AioContext *aio_context)
 {
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
@@ -300,14 +308,20 @@ void backup_cow_request_end(CowRequest *req)
    cow_request_end(req);
 }

-static const BlockJobDriver backup_job_driver = {
-    .instance_size          = sizeof(BackupBlockJob),
-    .job_type               = BLOCK_JOB_TYPE_BACKUP,
-    .set_speed              = backup_set_speed,
-    .commit                 = backup_commit,
-    .abort                  = backup_abort,
-    .attached_aio_context   = backup_attached_aio_context,
-};
+/*
+ * .drain callback of backup_job_driver: calls blk_drain() on the job's
+ * target BlockBackend. Does nothing when the target pointer has already
+ * been cleared.
+ */
+static void backup_drain(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    /* Need to keep a reference in case blk_drain triggers execution
+     * of backup_complete...
+     */
+    if (s->target) {
+        BlockBackend *target = s->target;
+        blk_ref(target);
+        blk_drain(target);
+        blk_unref(target);
+    }
+}

 static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
@@ -327,11 +341,8 @@ typedef struct {

 static void backup_complete(BlockJob *job, void *opaque)
 {
-    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    BackupCompleteData *data = opaque;

-    blk_unref(s->target);
-
    block_job_completed(job, data->ret);
    g_free(data);
 }
@@ -372,14 +383,14 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
    int64_t end;
    int64_t last_cluster = -1;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
-    HBitmapIter hbi;
+    BdrvDirtyBitmapIter *dbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
    clusters_per_iter = MAX((granularity / job->cluster_size), 1);
-    bdrv_dirty_iter_init(job->sync_bitmap, &hbi);
+    dbi = bdrv_dirty_iter_new(job->sync_bitmap, 0);

    /* Find the next dirty sector(s) */
-    while ((sector = hbitmap_iter_next(&hbi)) != -1) {
+    while ((sector = bdrv_dirty_iter_next(dbi)) != -1) {
        cluster = sector / sectors_per_cluster;

        /* Fake progress updates for any clusters we skipped */
@@ -391,7 +402,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
        for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
            do {
                if (yield_and_check(job)) {
-                    return ret;
+                    goto out;
                }
                ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                    sectors_per_cluster, &error_is_read,
@@ -399,7 +410,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                if ((ret < 0) &&
                    backup_error_action(job, error_is_read, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
-                    return ret;
+                    goto out;
                }
            } while (ret < 0);
        }
@@ -407,7 +418,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
        /* If the bitmap granularity is smaller than the backup granularity,
         * we need to advance the iterator pointer to the next cluster. */
        if (granularity < job->cluster_size) {
-            bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster);
+            bdrv_set_dirty_iter(dbi, cluster * sectors_per_cluster);
        }

        last_cluster = cluster - 1;
@@ -419,6 +430,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
        job->common.offset += ((end - last_cluster - 1) * job->cluster_size);
    }

+out:
+    bdrv_dirty_iter_free(dbi);
    return ret;
 }

@@ -427,7 +440,6 @@ static void coroutine_fn backup_run(void *opaque)
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
    BlockDriverState *bs = blk_bs(job->common.blk);
-    BlockBackend *target = job->target;
    int64_t start, end;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int ret = 0;
@@ -514,19 +526,30 @@ static void coroutine_fn backup_run(void *opaque)
    qemu_co_rwlock_unlock(&job->flush_rwlock);
    g_free(job->done_bitmap);

-    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);
-
    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&job->common, backup_complete, data);
 }

-void backup_start(const char *job_id, BlockDriverState *bs,
+/*
+ * Callback table for backup jobs. It is placed below backup_run(), which it
+ * references as the .start entry point, and is handed to block_job_create()
+ * by backup_job_create().
+ */
+static const BlockJobDriver backup_job_driver = {
+    .instance_size          = sizeof(BackupBlockJob),
+    .job_type               = BLOCK_JOB_TYPE_BACKUP,
+    .start                  = backup_run,
+    .set_speed              = backup_set_speed,
+    .commit                 = backup_commit,
+    .abort                  = backup_abort,
+    .clean                  = backup_clean,
+    .attached_aio_context   = backup_attached_aio_context,
+    .drain                  = backup_drain,
+};
+
+BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, int64_t speed,
                  MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
                  bool compress,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
+                  int creation_flags,
                  BlockCompletionFunc *cb, void *opaque,
                  BlockJobTxn *txn, Error **errp)
 {
@@ -540,52 +563,52 @@ void backup_start(const char *job_id, BlockDriverState *bs,

    if (bs == target) {
        error_setg(errp, "Source and target cannot be the same");
-        return;
+        return NULL;
    }

    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
-        return;
+        return NULL;
    }

    if (!bdrv_is_inserted(target)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(target));
-        return;
+        return NULL;
    }

    if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
        error_setg(errp, "Compression is not supported for this drive %s",
                   bdrv_get_device_name(target));
-        return;
+        return NULL;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
-        return;
+        return NULL;
    }

    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
-        return;
+        return NULL;
    }

    if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        if (!sync_bitmap) {
            error_setg(errp, "must provide a valid bitmap name for "
                             "\"incremental\" sync mode");
-            return;
+            return NULL;
        }

        /* Create a new bitmap, and freeze/disable this one. */
        if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
-            return;
+            return NULL;
        }
    } else if (sync_bitmap) {
        error_setg(errp,
                   "a sync_bitmap was provided to backup_run, "
                   "but received an incompatible sync_mode (%s)",
                   MirrorSyncMode_lookup[sync_mode]);
-        return;
+        return NULL;
    }

    len = bdrv_getlength(bs);
@@ -596,7 +619,7 @@ void backup_start(const char *job_id, BlockDriverState *bs,
    }

    job = block_job_create(job_id, &backup_job_driver, bs, speed,
-                           cb, opaque, errp);
+                           creation_flags, cb, opaque, errp);
    if (!job) {
        goto error;
    }
@@ -629,19 +652,20 @@ void backup_start(const char *job_id, BlockDriverState *bs,
        job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
    }

-    bdrv_op_block_all(target, job->common.blocker);
+    block_job_add_bdrv(&job->common, target);
    job->common.len = len;
-    job->common.co = qemu_coroutine_create(backup_run, job);
    block_job_txn_add_job(txn, &job->common);
-    qemu_coroutine_enter(job->common.co);
-    return;
+
+    return &job->common;

 error:
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
    if (job) {
-        blk_unref(job->target);
+        backup_clean(&job->common);
        block_job_unref(&job->common);
    }
+
+    return NULL;
 }
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -49,7 +49,6 @@ typedef struct BDRVBlkdebugState {

 typedef struct BlkdebugAIOCB {
    BlockAIOCB common;
-    QEMUBH *bh;
    int ret;
 } BlkdebugAIOCB;

@@ -410,7 +409,6 @@ out:
 static void error_callback_bh(void *opaque)
 {
    struct BlkdebugAIOCB *acb = opaque;
-    qemu_bh_delete(acb->bh);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_unref(acb);
 }
@@ -421,7 +419,6 @@ static BlockAIOCB *inject_error(BlockDriverState *bs,
    BDRVBlkdebugState *s = bs->opaque;
    int error = rule->options.inject.error;
    struct BlkdebugAIOCB *acb;
-    QEMUBH *bh;
    bool immediately = rule->options.inject.immediately;

    if (rule->options.inject.once) {
@@ -436,9 +433,7 @@ static BlockAIOCB *inject_error(BlockDriverState *bs,
    acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque);
    acb->ret = -error;

-    bh = aio_bh_new(bdrv_get_aio_context(bs), error_callback_bh, acb);
-    acb->bh = bh;
-    qemu_bh_schedule(bh);
+    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh, acb);

    return &acb->common;
 }
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -20,11 +20,6 @@ typedef struct Request {
    QEMUBH *bh;
 } Request;

-/* Next request id.
-   This counter is global, because requests from different
-   block devices should not get overlapping ids. */
-static uint64_t request_id;
-
 static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp)
 {
@@ -84,7 +79,7 @@ static void block_request_create(uint64_t reqid, BlockDriverState *bs,
 static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
    int ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();
@@ -95,7 +90,7 @@ static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs,
 static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
    int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();
@@ -106,7 +101,7 @@ static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs,
 static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int count, BdrvRequestFlags flags)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
    int ret = bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();
@@ -117,7 +112,7 @@ static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
 static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs,
                                              int64_t offset, int count)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
    int ret = bdrv_co_pdiscard(bs->file->bs, offset, count);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();
@@ -127,7 +122,7 @@ static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs,

 static int coroutine_fn blkreplay_co_flush(BlockDriverState *bs)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
    int ret = bdrv_co_flush(bs->file->bs);
    block_request_create(reqid, bs, qemu_coroutine_self());
    qemu_coroutine_yield();
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -22,7 +22,6 @@ typedef struct {
 typedef struct BlkverifyAIOCB BlkverifyAIOCB;
 struct BlkverifyAIOCB {
    BlockAIOCB common;
-    QEMUBH *bh;

    /* Request metadata */
    bool is_write;
@@ -175,7 +174,6 @@ static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
 {
    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque);

-    acb->bh = NULL;
    acb->is_write = is_write;
    acb->sector_num = sector_num;
    acb->nb_sectors = nb_sectors;
@@ -191,7 +189,6 @@ static void blkverify_aio_bh(void *opaque)
 {
    BlkverifyAIOCB *acb = opaque;

-    qemu_bh_delete(acb->bh);
    if (acb->buf) {
        qemu_iovec_destroy(&acb->raw_qiov);
        qemu_vfree(acb->buf);
@@ -218,9 +215,8 @@ static void blkverify_aio_cb(void *opaque, int ret)
            acb->verify(acb);
        }

-        acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs),
-                             blkverify_aio_bh, acb);
-        qemu_bh_schedule(acb->bh);
+        aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
+                                blkverify_aio_bh, acb);
        break;
    }
 }
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -38,6 +38,7 @@ struct BlockBackend {
    BlockBackendPublic public;

    void *dev;                  /* attached device model, if any */
+    bool legacy_dev;            /* true if dev is not a DeviceState */
    /* TODO change to DeviceState when all users are qdevified */
    const BlockDevOps *dev_ops;
    void *dev_opaque;
@@ -65,7 +66,6 @@ struct BlockBackend {

 typedef struct BlockBackendAIOCB {
    BlockAIOCB common;
-    QEMUBH *bh;
    BlockBackend *blk;
    int ret;
 } BlockBackendAIOCB;
@@ -507,32 +507,38 @@ void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
    }
 }

-/*
- * Attach device model @dev to @blk.
- * Return 0 on success, -EBUSY when a device model is attached already.
- */
-int blk_attach_dev(BlockBackend *blk, void *dev)
-/* TODO change to DeviceState *dev when all users are qdevified */
+/*
+ * Common helper behind blk_attach_dev() and blk_attach_dev_legacy().
+ * Takes a reference on @blk, records @dev as its attached device model
+ * (flagged as non-legacy; blk_attach_dev_legacy() sets the flag afterwards)
+ * and resets the I/O status.
+ * Return 0 on success, -EBUSY when a device model is attached already.
+ */
+static int blk_do_attach_dev(BlockBackend *blk, void *dev)
 {
    if (blk->dev) {
        return -EBUSY;
    }
    blk_ref(blk);
    blk->dev = dev;
+    blk->legacy_dev = false;
    blk_iostatus_reset(blk);
    return 0;
 }

+/*
+ * Attach device model @dev to @blk.
+ * Return 0 on success, -EBUSY when a device model is attached already.
+ */
+int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
+{
+    return blk_do_attach_dev(blk, dev);
+}
+
 /*
 * Attach device model @dev to @blk.
 * @blk must not have a device model attached already.
 * TODO qdevified devices don't use this, remove when devices are qdevified
 */
-void blk_attach_dev_nofail(BlockBackend *blk, void *dev)
+void blk_attach_dev_legacy(BlockBackend *blk, void *dev)
 {
-    if (blk_attach_dev(blk, dev) < 0) {
+    if (blk_do_attach_dev(blk, dev) < 0) {
        abort();
    }
+    blk->legacy_dev = true;
 }

 /*
@@ -559,6 +565,23 @@ void *blk_get_attached_dev(BlockBackend *blk)
    return blk->dev;
 }

+/* Return the qdev ID, or if no ID is assigned the QOM path, of the block
+ * device attached to the BlockBackend. An empty string is returned when no
+ * device is attached. The callers in this file release the returned string
+ * with g_free(). Must not be used on a BlockBackend with a legacy device
+ * (asserted), because blk->dev is treated as a DeviceState here. */
+static char *blk_get_attached_dev_id(BlockBackend *blk)
+{
+    DeviceState *dev;
+
+    assert(!blk->legacy_dev);
+    dev = blk->dev;
+
+    if (!dev) {
+        return g_strdup("");
+    } else if (dev->id) {
+        return g_strdup(dev->id);
+    }
+    return object_get_canonical_path(OBJECT(dev));
+}
+
 /*
 * Return the BlockBackend which has the device model @dev attached if it
 * exists, else null.
@@ -586,6 +609,11 @@ BlockBackend *blk_by_dev(void *dev)
 void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
                     void *opaque)
 {
+    /* All drivers that use blk_set_dev_ops() are qdevified and we want to keep
+     * it that way, so we can assume blk->dev is a DeviceState if blk->dev_ops
+     * is set. */
+    assert(!blk->legacy_dev);
+
    blk->dev_ops = ops;
    blk->dev_opaque = opaque;
 }
@@ -601,13 +629,17 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load)
    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
        bool tray_was_open, tray_is_open;

+        assert(!blk->legacy_dev);
+
        tray_was_open = blk_dev_is_tray_open(blk);
        blk->dev_ops->change_media_cb(blk->dev_opaque, load);
        tray_is_open = blk_dev_is_tray_open(blk);

        if (tray_was_open != tray_is_open) {
-            qapi_event_send_device_tray_moved(blk_name(blk), tray_is_open,
+            char *id = blk_get_attached_dev_id(blk);
+            qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open,
                                              &error_abort);
+            g_free(id);
        }
    }
 }
@@ -767,20 +799,25 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                               BdrvRequestFlags flags)
 {
    int ret;
+    BlockDriverState *bs = blk_bs(blk);

-    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+    trace_blk_co_preadv(blk, bs, offset, bytes, flags);

    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

+    bdrv_inc_in_flight(bs);
+
    /* throttling disk I/O */
    if (blk->public.throttle_state) {
        throttle_group_co_io_limits_intercept(blk, bytes, false);
    }

-    return bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
+    ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
+    bdrv_dec_in_flight(bs);
+    return ret;
 }

 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
@@ -788,14 +825,17 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                BdrvRequestFlags flags)
 {
    int ret;
+    BlockDriverState *bs = blk_bs(blk);

-    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+    trace_blk_co_pwritev(blk, bs, offset, bytes, flags);

    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

+    bdrv_inc_in_flight(bs);
+
    /* throttling disk I/O */
    if (blk->public.throttle_state) {
        throttle_group_co_io_limits_intercept(blk, bytes, true);
@@ -805,7 +845,9 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
        flags |= BDRV_REQ_FUA;
    }

-    return bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+    ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+    bdrv_dec_in_flight(bs);
+    return ret;
 }

 typedef struct BlkRwCo {
@@ -836,7 +878,6 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
                   int64_t bytes, CoroutineEntry co_entry,
                   BdrvRequestFlags flags)
 {
-    AioContext *aio_context;
    QEMUIOVector qiov;
    struct iovec iov;
    Coroutine *co;
@@ -858,11 +899,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,

    co = qemu_coroutine_create(co_entry, &rwco);
    qemu_coroutine_enter(co);
-
-    aio_context = blk_get_aio_context(blk);
-    while (rwco.ret == NOT_DONE) {
-        aio_poll(aio_context, true);
-    }
+    BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);

    return rwco.ret;
 }
@@ -898,7 +935,8 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 static void error_callback_bh(void *opaque)
 {
    struct BlockBackendAIOCB *acb = opaque;
-    qemu_bh_delete(acb->bh);
+
+    bdrv_dec_in_flight(acb->common.bs);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_unref(acb);
 }
@@ -908,16 +946,13 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
                                  void *opaque, int ret)
 {
    struct BlockBackendAIOCB *acb;
-    QEMUBH *bh;

+    bdrv_inc_in_flight(blk_bs(blk));
    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
    acb->blk = blk;
    acb->ret = ret;

-    bh = aio_bh_new(blk_get_aio_context(blk), error_callback_bh, acb);
-    acb->bh = bh;
-    qemu_bh_schedule(bh);
-
+    aio_bh_schedule_oneshot(blk_get_aio_context(blk), error_callback_bh, acb);
    return &acb->common;
 }

@@ -926,7 +961,6 @@ typedef struct BlkAioEmAIOCB {
    BlkRwCo rwco;
    int bytes;
    bool has_returned;
-    QEMUBH* bh;
 } BlkAioEmAIOCB;

 static const AIOCBInfo blk_aio_em_aiocb_info = {
@@ -935,11 +969,8 @@ static const AIOCBInfo blk_aio_em_aiocb_info = {

 static void blk_aio_complete(BlkAioEmAIOCB *acb)
 {
-    if (acb->bh) {
-        assert(acb->has_returned);
-        qemu_bh_delete(acb->bh);
-    }
    if (acb->has_returned) {
+        bdrv_dec_in_flight(acb->common.bs);
        acb->common.cb(acb->common.opaque, acb->rwco.ret);
        qemu_aio_unref(acb);
    }
@@ -947,7 +978,10 @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)

 static void blk_aio_complete_bh(void *opaque)
 {
-    blk_aio_complete(opaque);
+    BlkAioEmAIOCB *acb = opaque;
+
+    /* blk_aio_prwv() schedules this bottom half only after it has set
+     * has_returned, so the flag must be set by the time we run. */
+    assert(acb->has_returned);
+    blk_aio_complete(acb);
 }

 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
@@ -958,6 +992,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
    BlkAioEmAIOCB *acb;
    Coroutine *co;

+    bdrv_inc_in_flight(blk_bs(blk));
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
    acb->rwco = (BlkRwCo) {
        .blk    = blk,
@@ -967,7 +1002,6 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
        .ret    = NOT_DONE,
    };
    acb->bytes = bytes;
-    acb->bh = NULL;
    acb->has_returned = false;

    co = qemu_coroutine_create(co_entry, acb);
@@ -975,8 +1009,8 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,

    acb->has_returned = true;
    if (acb->rwco.ret != NOT_DONE) {
-        acb->bh = aio_bh_new(blk_get_aio_context(blk), blk_aio_complete_bh, acb);
-        qemu_bh_schedule(acb->bh);
+        aio_bh_schedule_oneshot(blk_get_aio_context(blk),
+                                blk_aio_complete_bh, acb);
    }

    return &acb->common;
@@ -1075,26 +1109,36 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
                        blk_aio_write_entry, flags, cb, opaque);
 }

+/* Entry function passed to blk_aio_prwv() by blk_aio_flush(): runs
+ * blk_co_flush() on the request's BlockBackend, stores the result in the
+ * request and hands it to blk_aio_complete(). */
+static void blk_aio_flush_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_flush(rwco->blk);
+    blk_aio_complete(acb);
+}
+
 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
                          BlockCompletionFunc *cb, void *opaque)
 {
-    if (!blk_is_available(blk)) {
-        return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
-    }
+    return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
+}

-    return bdrv_aio_flush(blk_bs(blk), cb, opaque);
+/* Entry function passed to blk_aio_prwv() by blk_aio_pdiscard(): discards
+ * acb->bytes bytes starting at rwco->offset via blk_co_pdiscard(), stores
+ * the result in the request and hands it to blk_aio_complete(). */
+static void blk_aio_pdiscard_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes);
+    blk_aio_complete(acb);
 }

 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
                             int64_t offset, int count,
                             BlockCompletionFunc *cb, void *opaque)
 {
-    int ret = blk_check_byte_request(blk, offset, count);
-    if (ret < 0) {
-        return blk_abort_aio_request(blk, cb, opaque, ret);
-    }
-
-    return bdrv_aio_pdiscard(blk_bs(blk), offset, count, cb, opaque);
+    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_pdiscard_entry, 0,
+                        cb, opaque);
 }

 void blk_aio_cancel(BlockAIOCB *acb)
@@ -1107,23 +1151,50 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
    bdrv_aio_cancel_async(acb);
 }

-int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
+int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
    if (!blk_is_available(blk)) {
        return -ENOMEDIUM;
    }

-    return bdrv_ioctl(blk_bs(blk), req, buf);
+    return bdrv_co_ioctl(blk_bs(blk), req, buf);
+}
+
+/* Entry function passed to blk_prw() by blk_ioctl(). BlkRwCo has no
+ * ioctl-specific fields, so the request code is read back from rwco->offset
+ * and the argument buffer from the base pointer of the first iovec. */
+static void blk_ioctl_entry(void *opaque)
+{
+    BlkRwCo *rwco = opaque;
+    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
+                             rwco->qiov->iov[0].iov_base);
+}
+
+/* Synchronous ioctl on @blk, implemented on top of blk_prw(): @req travels
+ * in blk_prw()'s offset argument and @buf as the data buffer with a length
+ * of 0; blk_ioctl_entry() unpacks both and calls blk_co_ioctl(). */
+int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
+{
+    return blk_prw(blk, req, buf, 0, blk_ioctl_entry, 0);
+}
+
+/* Entry function passed to blk_aio_prwv() by blk_aio_ioctl(). The request
+ * code is read from rwco->offset and the argument buffer from the base
+ * pointer of the first iovec; the result is stored in the request, which is
+ * then handed to blk_aio_complete(). */
+static void blk_aio_ioctl_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
+                             rwco->qiov->iov[0].iov_base);
+    blk_aio_complete(acb);
 }

 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
                          BlockCompletionFunc *cb, void *opaque)
 {
-    if (!blk_is_available(blk)) {
-        return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
-    }
+    QEMUIOVector qiov;
+    struct iovec iov;

-    return bdrv_aio_ioctl(blk_bs(blk), req, buf, cb, opaque);
+    iov = (struct iovec) {
+        .iov_base = buf,
+        .iov_len = 0,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    return blk_aio_prwv(blk, req, 0, &qiov, blk_aio_ioctl_entry, 0, cb, opaque);
 }

 int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count)
@@ -1145,13 +1216,15 @@ int blk_co_flush(BlockBackend *blk)
    return bdrv_co_flush(blk_bs(blk));
 }

+/* Entry function passed to blk_prw() by blk_flush(): runs blk_co_flush() on
+ * the request's BlockBackend and stores the result in rwco->ret. */
+static void blk_flush_entry(void *opaque)
+{
+    BlkRwCo *rwco = opaque;
+    rwco->ret = blk_co_flush(rwco->blk);
+}
+
 int blk_flush(BlockBackend *blk)
 {
-    if (!blk_is_available(blk)) {
-        return -ENOMEDIUM;
-    }
-
-    return bdrv_flush(blk_bs(blk));
+    return blk_prw(blk, 0, NULL, 0, blk_flush_entry, 0);
 }

 void blk_drain(BlockBackend *blk)
@@ -1206,8 +1279,9 @@ static void send_qmp_error_event(BlockBackend *blk,
    IoOperationType optype;

    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
-    qapi_event_send_block_io_error(blk_name(blk), optype, action,
-                                   blk_iostatus_is_enabled(blk),
+    qapi_event_send_block_io_error(blk_name(blk),
+                                   bdrv_get_node_name(blk_bs(blk)), optype,
+                                   action, blk_iostatus_is_enabled(blk),
                                   error == ENOSPC, strerror(error),
                                   &error_abort);
 }
@@ -1312,10 +1386,21 @@ void blk_lock_medium(BlockBackend *blk, bool locked)
 void blk_eject(BlockBackend *blk, bool eject_flag)
 {
    BlockDriverState *bs = blk_bs(blk);
+    char *id;
+
+    /* blk_eject is only called by qdevified devices */
+    assert(!blk->legacy_dev);

    if (bs) {
        bdrv_eject(bs, eject_flag);
    }
+
+    /* Whether or not we ejected on the backend,
+     * the frontend experienced a tray event. */
+    id = blk_get_attached_dev_id(blk);
+    qapi_event_send_device_tray_moved(blk_name(blk), id,
+                                      eject_flag, &error_abort);
+    g_free(id);
 }

 int blk_get_flags(BlockBackend *blk)
@@ -1520,14 +1605,15 @@ int blk_truncate(BlockBackend *blk, int64_t offset)
    return bdrv_truncate(blk_bs(blk), offset);
 }

+/* Entry function passed to blk_prw() by blk_pdiscard(). blk_pdiscard()
+ * passes no data buffer, only a byte count, so the number of bytes to
+ * discard is taken from the size of the request's qiov. */
+static void blk_pdiscard_entry(void *opaque)
+{
+    BlkRwCo *rwco = opaque;
+    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, rwco->qiov->size);
+}
+
 int blk_pdiscard(BlockBackend *blk, int64_t offset, int count)
 {
-    int ret = blk_check_byte_request(blk, offset, count);
-    if (ret < 0) {
-        return ret;
-    }
-
-    return bdrv_pdiscard(blk_bs(blk), offset, count);
+    return blk_prw(blk, offset, NULL, count, blk_pdiscard_entry, 0);
 }

 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
@@ -1592,13 +1678,12 @@ void blk_update_root_state(BlockBackend *blk)
 }

 /*
- * Applies the information in the root state to the given BlockDriverState. This
- * does not include the flags which have to be specified for bdrv_open(), use
- * blk_get_open_flags_from_root_state() to inquire them.
+ * Returns the detect-zeroes setting to be used for bdrv_open() of a
+ * BlockDriverState which is supposed to inherit the root state.
 */
-void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
+bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
 {
-    bs->detect_zeroes = blk->root_state.detect_zeroes;
+    return blk->root_state.detect_zeroes;
 }

 /*
@@ -1640,28 +1725,6 @@ int blk_commit_all(void)
    return 0;
 }

-int blk_flush_all(void)
-{
-    BlockBackend *blk = NULL;
-    int result = 0;
-
-    while ((blk = blk_all_next(blk)) != NULL) {
-        AioContext *aio_context = blk_get_aio_context(blk);
-        int ret;
-
-        aio_context_acquire(aio_context);
-        if (blk_is_inserted(blk)) {
-            ret = blk_flush(blk);
-            if (ret < 0 && !result) {
-                result = ret;
-            }
-        }
-        aio_context_release(aio_context);
-    }
-
-    return result;
-}
-

 /* throttling disk I/O limits */
 void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
--- a/block/commit.c
+++ b/block/commit.c
@@ -15,7 +15,7 @@
 #include "qemu/osdep.h"
 #include "trace.h"
 #include "block/block_int.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
@@ -205,17 +205,19 @@ static const BlockJobDriver commit_job_driver = {
    .instance_size = sizeof(CommitBlockJob),
    .job_type      = BLOCK_JOB_TYPE_COMMIT,
    .set_speed     = commit_set_speed,
+    .start         = commit_run,
 };

 void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
-                  BlockdevOnError on_error, BlockCompletionFunc *cb,
-                  void *opaque, const char *backing_file_str, Error **errp)
+                  BlockdevOnError on_error, const char *backing_file_str,
+                  Error **errp)
 {
    CommitBlockJob *s;
    BlockReopenQueue *reopen_queue = NULL;
    int orig_overlay_flags;
    int orig_base_flags;
+    BlockDriverState *iter;
    BlockDriverState *overlay_bs;
    Error *local_err = NULL;

@@ -233,7 +235,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
    }

    s = block_job_create(job_id, &commit_job_driver, bs, speed,
-                         cb, opaque, errp);
+                         BLOCK_JOB_DEFAULT, NULL, NULL, errp);
    if (!s) {
        return;
    }
@@ -251,7 +253,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
                                         orig_overlay_flags | BDRV_O_RDWR);
    }
    if (reopen_queue) {
-        bdrv_reopen_multiple(reopen_queue, &local_err);
+        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
        if (local_err != NULL) {
            error_propagate(errp, local_err);
            block_job_unref(&s->common);
@@ -260,6 +262,19 @@ void commit_start(const char *job_id, BlockDriverState *bs,
    }


+    /* Block all nodes between top and base, because they will
+     * disappear from the chain after this operation. */
+    assert(bdrv_chain_contains(top, base));
+    for (iter = top; iter != backing_bs(base); iter = backing_bs(iter)) {
+        block_job_add_bdrv(&s->common, iter);
+    }
+    /* overlay_bs must be blocked because it needs to be modified to
+     * update the backing image string, but if it's the root node then
+     * don't block it again */
+    if (bs != overlay_bs) {
+        block_job_add_bdrv(&s->common, overlay_bs);
+    }
+
    s->base = blk_new();
    blk_insert_bs(s->base, base);

@@ -274,10 +289,9 @@ void commit_start(const char *job_id, BlockDriverState *bs,
    s->backing_file_str = g_strdup(backing_file_str);

    s->on_error = on_error;
-    s->common.co = qemu_coroutine_create(commit_run, s);

-    trace_commit_start(bs, base, top, s, s->common.co, opaque);
-    qemu_coroutine_enter(s->common.co);
+    trace_commit_start(bs, base, top, s);
+    block_job_start(&s->common);
 }


--- a/block/curl.c
+++ b/block/curl.c
@@ -68,12 +68,10 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #endif

 #define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \
-                   CURLPROTO_FTP | CURLPROTO_FTPS | \
-                   CURLPROTO_TFTP)
+                   CURLPROTO_FTP | CURLPROTO_FTPS)

 #define CURL_NUM_STATES 8
 #define CURL_NUM_ACB    8
-#define SECTOR_SIZE     512
 #define READ_AHEAD_DEFAULT (256 * 1024)
 #define CURL_TIMEOUT_DEFAULT 5
 #define CURL_TIMEOUT_MAX 10000
@@ -96,7 +94,6 @@ struct BDRVCURLState;

 typedef struct CURLAIOCB {
    BlockAIOCB common;
-    QEMUBH *bh;
    QEMUIOVector *qiov;

    int64_t sector_num;
@@ -106,12 +103,17 @@ typedef struct CURLAIOCB {
    size_t end;
 } CURLAIOCB;

+/* One socket descriptor tracked for a CURLState; entries are linked into
+ * that state's "sockets" list. */
+typedef struct CURLSocket {
+    int fd;                         /* descriptor passed to curl_sock_cb() */
+    QLIST_ENTRY(CURLSocket) next;   /* linkage in CURLState.sockets */
+} CURLSocket;
+
 typedef struct CURLState
 {
    struct BDRVCURLState *s;
    CURLAIOCB *acb[CURL_NUM_ACB];
    CURL *curl;
-    curl_socket_t sock_fd;
+    QLIST_HEAD(, CURLSocket) sockets;
    char *orig_buf;
    size_t buf_start;
    size_t buf_off;
@@ -165,10 +167,27 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
 {
    BDRVCURLState *s;
    CURLState *state = NULL;
+    CURLSocket *socket;
+
    curl_easy_getinfo(curl, CURLINFO_PRIVATE, (char **)&state);
-    state->sock_fd = fd;
    s = state->s;

+    QLIST_FOREACH(socket, &state->sockets, next) {
+        if (socket->fd == fd) {
+            if (action == CURL_POLL_REMOVE) {
+                QLIST_REMOVE(socket, next);
+                g_free(socket);
+            }
+            break;
+        }
+    }
+    if (!socket) {
+        socket = g_new0(CURLSocket, 1);
+        socket->fd = fd;
+        QLIST_INSERT_HEAD(&state->sockets, socket, next);
+    }
+    socket = NULL;
+
    DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, (int)fd);
    switch (action) {
        case CURL_POLL_IN:
@@ -214,12 +233,13 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)

    DPRINTF("CURL: Just reading %zd bytes\n", realsize);

-    if (!s || !s->orig_buf)
-        return 0;
+    if (!s || !s->orig_buf) {
+        goto read_end;
+    }

    if (s->buf_off >= s->buf_len) {
        /* buffer full, read nothing */
-        return 0;
+        goto read_end;
    }
    realsize = MIN(realsize, s->buf_len - s->buf_off);
    memcpy(s->orig_buf + s->buf_off, ptr, realsize);
@@ -232,15 +252,26 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
            continue;

        if ((s->buf_off >= acb->end)) {
+            size_t request_length = acb->nb_sectors * BDRV_SECTOR_SIZE;
+
            qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start,
                                acb->end - acb->start);
+
+            if (acb->end - acb->start < request_length) {
+                size_t offset = acb->end - acb->start;
+                qemu_iovec_memset(acb->qiov, offset, 0,
+                                  request_length - offset);
+            }
+
            acb->common.cb(acb->common.opaque, 0);
            qemu_aio_unref(acb);
            s->acb[i] = NULL;
        }
    }

-    return realsize;
+read_end:
+    /* curl will error out if we do not return this value */
+    return size * nmemb;
 }

 static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
@@ -248,6 +279,8 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
 {
    int i;
    size_t end = start + len;
+    size_t clamped_end = MIN(end, s->len);
+    size_t clamped_len = clamped_end - start;

    for (i=0; i<CURL_NUM_STATES; i++) {
        CURLState *state = &s->states[i];
@@ -262,12 +295,15 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
        // Does the existing buffer cover our section?
        if ((start >= state->buf_start) &&
            (start <= buf_end) &&
-            (end >= state->buf_start) &&
-            (end <= buf_end))
+            (clamped_end >= state->buf_start) &&
+            (clamped_end <= buf_end))
        {
            char *buf = state->orig_buf + (start - state->buf_start);

-            qemu_iovec_from_buf(acb->qiov, 0, buf, len);
+            qemu_iovec_from_buf(acb->qiov, 0, buf, clamped_len);
+            if (clamped_len < len) {
+                qemu_iovec_memset(acb->qiov, clamped_len, 0, len - clamped_len);
+            }
            acb->common.cb(acb->common.opaque, 0);

            return FIND_RET_OK;
@@ -277,13 +313,13 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
        if (state->in_use &&
            (start >= state->buf_start) &&
            (start <= buf_fend) &&
-            (end >= state->buf_start) &&
-            (end <= buf_fend))
+            (clamped_end >= state->buf_start) &&
+            (clamped_end <= buf_fend))
        {
            int j;

            acb->start = start - state->buf_start;
-            acb->end = acb->start + len;
+            acb->end = acb->start + clamped_len;

            for (j=0; j<CURL_NUM_ACB; j++) {
                if (!state->acb[j]) {
@@ -353,6 +389,7 @@ static void curl_multi_check_completion(BDRVCURLState *s)
 static void curl_multi_do(void *arg)
 {
    CURLState *s = (CURLState *)arg;
+    CURLSocket *socket, *next_socket;
    int running;
    int r;

@@ -360,10 +397,13 @@ static void curl_multi_do(void *arg)
        return;
    }

-    do {
-        r = curl_multi_socket_action(s->s->multi, s->sock_fd, 0, &running);
-    } while(r == CURLM_CALL_MULTI_PERFORM);
-
+    /* Need to use _SAFE because curl_multi_socket_action() may trigger
+     * curl_sock_cb() which might modify this list */
+    QLIST_FOREACH_SAFE(socket, &s->sockets, next, next_socket) {
+        do {
+            r = curl_multi_socket_action(s->s->multi, socket->fd, 0, &running);
+        } while (r == CURLM_CALL_MULTI_PERFORM);
+    }
 }

 static void curl_multi_read(void *arg)
@@ -467,6 +507,7 @@ static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s)
 #endif
    }

+    QLIST_INIT(&state->sockets);
    state->s = s;

    return state;
@@ -476,6 +517,14 @@ static void curl_clean_state(CURLState *s)
 {
    if (s->s->multi)
        curl_multi_remove_handle(s->s->multi, s->curl);
+
+    while (!QLIST_EMPTY(&s->sockets)) {
+        CURLSocket *socket = QLIST_FIRST(&s->sockets);
+
+        QLIST_REMOVE(socket, next);
+        g_free(socket);
+    }
+
    s->in_use = 0;
 }

@@ -739,15 +788,12 @@ static void curl_readv_bh_cb(void *p)
    CURLAIOCB *acb = p;
    BDRVCURLState *s = acb->common.bs->opaque;

-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-
-    size_t start = acb->sector_num * SECTOR_SIZE;
+    size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
    size_t end;

    // In case we have the requested data already (e.g. read-ahead),
    // we can just call the callback and be done.
-    switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) {
+    switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
        case FIND_RET_OK:
            qemu_aio_unref(acb);
            // fall through
@@ -766,13 +812,13 @@ static void curl_readv_bh_cb(void *p)
    }

    acb->start = 0;
-    acb->end = (acb->nb_sectors * SECTOR_SIZE);
+    acb->end = MIN(acb->nb_sectors * BDRV_SECTOR_SIZE, s->len - start);

    state->buf_off = 0;
    g_free(state->orig_buf);
    state->buf_start = start;
-    state->buf_len = acb->end + s->readahead_size;
-    end = MIN(start + state->buf_len, s->len) - 1;
+    state->buf_len = MIN(acb->end + s->readahead_size, s->len - start);
+    end = start + state->buf_len - 1;
    state->orig_buf = g_try_malloc(state->buf_len);
    if (state->buf_len && state->orig_buf == NULL) {
        curl_clean_state(state);
@@ -783,8 +829,8 @@ static void curl_readv_bh_cb(void *p)
    state->acb[0] = acb;

    snprintf(state->range, 127, "%zd-%zd", start, end);
-    DPRINTF("CURL (AIO): Reading %d at %zd (%s)\n",
-            (acb->nb_sectors * SECTOR_SIZE), start, state->range);
+    DPRINTF("CURL (AIO): Reading %llu at %zd (%s)\n",
+            (acb->nb_sectors * BDRV_SECTOR_SIZE), start, state->range);
    curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range);

    curl_multi_add_handle(s->multi, state->curl);
@@ -805,8 +851,7 @@ static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
    acb->sector_num = sector_num;
    acb->nb_sectors = nb_sectors;

-    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), curl_readv_bh_cb, acb);
-    qemu_bh_schedule(acb->bh);
+    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), curl_readv_bh_cb, acb);
    return &acb->common;
 }

@@ -891,29 +936,12 @@ static BlockDriver bdrv_ftps = {
    .bdrv_attach_aio_context    = curl_attach_aio_context,
 };

-static BlockDriver bdrv_tftp = {
-    .format_name                = "tftp",
-    .protocol_name              = "tftp",
-
-    .instance_size              = sizeof(BDRVCURLState),
-    .bdrv_parse_filename        = curl_parse_filename,
-    .bdrv_file_open             = curl_open,
-    .bdrv_close                 = curl_close,
-    .bdrv_getlength             = curl_getlength,
-
-    .bdrv_aio_readv             = curl_aio_readv,
-
-    .bdrv_detach_aio_context    = curl_detach_aio_context,
-    .bdrv_attach_aio_context    = curl_attach_aio_context,
-};
-
 static void curl_block_init(void)
 {
    bdrv_register(&bdrv_http);
    bdrv_register(&bdrv_https);
    bdrv_register(&bdrv_ftp);
    bdrv_register(&bdrv_ftps);
-    bdrv_register(&bdrv_tftp);
 }

 block_init(curl_block_init);
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -38,13 +38,20 @@
 */
 struct BdrvDirtyBitmap {
    HBitmap *bitmap;            /* Dirty sector bitmap implementation */
+    HBitmap *meta;              /* Meta dirty bitmap */
    BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
    char *name;                 /* Optional non-empty unique ID */
    int64_t size;               /* Size of the bitmap (Number of sectors) */
    bool disabled;              /* Bitmap is read-only */
+    int active_iterators;       /* How many iterators are active */
    QLIST_ENTRY(BdrvDirtyBitmap) list;
 };

+/* Iterator handle created by bdrv_dirty_iter_new() and
+ * bdrv_dirty_meta_iter_new(), released with bdrv_dirty_iter_free(). */
+struct BdrvDirtyBitmapIter {
+    HBitmapIter hbi;            /* underlying HBitmap iterator state */
+    BdrvDirtyBitmap *bitmap;    /* bitmap whose active_iterators count this
+                                 * iterator is included in */
+};
+
 BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
 {
    BdrvDirtyBitmap *bm;
@@ -97,6 +104,66 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
    return bitmap;
 }

+/* bdrv_create_meta_dirty_bitmap
+ *
+ * Attach a meta dirty bitmap to @bitmap.  The meta bitmap records which
+ * parts of @bitmap have had a dirty bit flipped, in either direction (from
+ * reset to set, or from set to reset).
+ *
+ * @bitmap: the block dirty bitmap to track; it must not already have a meta
+ * bitmap attached.
+ * @chunk_size: number of bytes of bitmap data covered by each bit of the
+ * meta bitmap.
+ */
+void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+                                   int chunk_size)
+{
+    HBitmap *meta;
+
+    assert(!bitmap->meta);
+
+    /* hbitmap_create_meta() takes the chunk size in bits, not bytes. */
+    meta = hbitmap_create_meta(bitmap->bitmap, chunk_size * BITS_PER_BYTE);
+    bitmap->meta = meta;
+}
+
+/* Release the meta dirty bitmap attached to @bitmap and clear the pointer to
+ * it.  A meta bitmap must currently be attached. */
+void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+    assert(bitmap->meta);
+    hbitmap_free_meta(bitmap->bitmap);
+    bitmap->meta = NULL;
+}
+
+/* bdrv_dirty_bitmap_get_meta
+ *
+ * Check whether any meta dirty bit covering the sector range
+ * [@sector, @sector + @nb_sectors) is set.
+ *
+ * @bs: not used.
+ * @bitmap: dirty bitmap whose meta bitmap is queried.
+ * @sector: first sector of the range.
+ * @nb_sectors: number of sectors in the range; an empty range is never
+ * reported as dirty.
+ *
+ * Returns: true if at least one covering meta bit is set, false otherwise.
+ */
+int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
+                               BdrvDirtyBitmap *bitmap, int64_t sector,
+                               int nb_sectors)
+{
+    uint64_t i, start, end;
+    int sectors_per_bit = 1 << hbitmap_granularity(bitmap->meta);
+
+    if (nb_sectors <= 0) {
+        return false;
+    }
+
+    /* Begin at the start of the chunk that contains @sector.  Stepping by
+     * sectors_per_bit from an unaligned @sector would stop short of the last
+     * chunk whenever the range straddles a chunk boundary. */
+    start = sector - sector % sectors_per_bit;
+    end = sector + nb_sectors;
+
+    /* To optimize: hbitmap could check the range internally at a coarse
+     * level, or at least do it word by word. */
+    for (i = start; i < end; i += sectors_per_bit) {
+        if (hbitmap_get(bitmap->meta, i)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Reset the meta dirty bits for @nb_sectors sectors starting at @sector by
+ * calling hbitmap_reset() on @bitmap's meta bitmap.  @bs is not used. */
+void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
+                                  BdrvDirtyBitmap *bitmap, int64_t sector,
+                                  int nb_sectors)
+{
+    hbitmap_reset(bitmap->meta, sector, nb_sectors);
+}
+
+/* Return @bitmap's size field, which the struct documents as a number of
+ * sectors. */
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap)
+{
+    return bitmap->size;
+}
+
+/* Return @bitmap's name field.  The name is optional, so the result is
+ * presumably NULL for an unnamed bitmap -- confirm with callers.  The string
+ * is not copied. */
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap)
+{
+    return bitmap->name;
+}
+
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
 {
    return bitmap->successor;
@@ -212,6 +279,7 @@ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)

    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        assert(!bdrv_dirty_bitmap_frozen(bitmap));
+        assert(!bitmap->active_iterators);
        hbitmap_truncate(bitmap->bitmap, size);
        bitmap->size = size;
    }
@@ -224,7 +292,9 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
+            assert(!bm->active_iterators);
            assert(!bdrv_dirty_bitmap_frozen(bm));
+            assert(!bm->meta);
            QLIST_REMOVE(bm, list);
            hbitmap_free(bm->bitmap);
            g_free(bm->name);
@@ -235,6 +305,9 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
            }
        }
    }
+    if (bitmap) {
+        abort();
+    }
 }

 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
@@ -320,9 +393,43 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
 }

-void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
+uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap)
 {
-    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
+    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->meta);
+}
+
+/* Allocate an iterator over @bitmap's dirty bits, initialised at
+ * @first_sector.  The bitmap's active_iterators counter is incremented;
+ * bdrv_dirty_iter_free() decrements it again and frees the iterator. */
+BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
+                                         uint64_t first_sector)
+{
+    BdrvDirtyBitmapIter *it;
+
+    it = g_new(BdrvDirtyBitmapIter, 1);
+    it->bitmap = bitmap;
+    hbitmap_iter_init(&it->hbi, bitmap->bitmap, first_sector);
+    bitmap->active_iterators += 1;
+
+    return it;
+}
+
+/* Allocate an iterator over @bitmap's meta bitmap, initialised at offset 0.
+ * The bitmap's active_iterators counter is incremented;
+ * bdrv_dirty_iter_free() decrements it again and frees the iterator. */
+BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap)
+{
+    BdrvDirtyBitmapIter *it;
+
+    it = g_new(BdrvDirtyBitmapIter, 1);
+    it->bitmap = bitmap;
+    hbitmap_iter_init(&it->hbi, bitmap->meta, 0);
+    bitmap->active_iterators += 1;
+
+    return it;
+}
+
+/* Free @iter and decrement the active_iterators counter of the bitmap it
+ * was created for.  A NULL @iter is ignored. */
+void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter)
+{
+    if (iter) {
+        BdrvDirtyBitmap *owner = iter->bitmap;
+
+        assert(owner->active_iterators > 0);
+        owner->active_iterators -= 1;
+        g_free(iter);
+    }
+}
+
+int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter)
+{
+    return hbitmap_iter_next(&iter->hbi);
 }

 void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
@@ -360,6 +467,43 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
    hbitmap_free(tmp);
 }

+/* Forward @start and @count to hbitmap_serialization_size() for @bitmap's
+ * underlying HBitmap and return its result. */
+uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
+                                              uint64_t start, uint64_t count)
+{
+    return hbitmap_serialization_size(bitmap->bitmap, start, count);
+}
+
+/* Return hbitmap_serialization_granularity() of @bitmap's underlying
+ * HBitmap. */
+uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap)
+{
+    return hbitmap_serialization_granularity(bitmap->bitmap);
+}
+
+/* Forward @buf, @start and @count to hbitmap_serialize_part() for @bitmap's
+ * underlying HBitmap. */
+void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap,
+                                      uint8_t *buf, uint64_t start,
+                                      uint64_t count)
+{
+    hbitmap_serialize_part(bitmap->bitmap, buf, start, count);
+}
+
+/* Forward @buf, @start, @count and @finish to hbitmap_deserialize_part() for
+ * @bitmap's underlying HBitmap. */
+void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap,
+                                        uint8_t *buf, uint64_t start,
+                                        uint64_t count, bool finish)
+{
+    hbitmap_deserialize_part(bitmap->bitmap, buf, start, count, finish);
+}
+
+/* Forward @start, @count and @finish to hbitmap_deserialize_zeroes() for
+ * @bitmap's underlying HBitmap. */
+void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap,
+                                          uint64_t start, uint64_t count,
+                                          bool finish)
+{
+    hbitmap_deserialize_zeroes(bitmap->bitmap, start, count, finish);
+}
+
+/* Call hbitmap_deserialize_finish() on @bitmap's underlying HBitmap. */
+void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap)
+{
+    hbitmap_deserialize_finish(bitmap->bitmap);
+}
+
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int64_t nr_sectors)
 {
@@ -373,15 +517,19 @@ void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
 }

 /**
- * Advance an HBitmapIter to an arbitrary offset.
+ * Advance a BdrvDirtyBitmapIter to an arbitrary offset.
 */
-void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
+void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *iter, int64_t sector_num)
 {
-    assert(hbi->hb);
-    hbitmap_iter_init(hbi, hbi->hb, offset);
+    hbitmap_iter_init(&iter->hbi, iter->hbi.hb, sector_num);
 }

 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
 {
    return hbitmap_count(bitmap->bitmap);
 }
+
+/* Return hbitmap_count() of @bitmap's meta bitmap. */
+int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap)
+{
+    return hbitmap_count(bitmap->meta);
+}
--- a/block/dmg-bz2.c
+++ b/block/dmg-bz2.c
@@ -0,0 +1,61 @@
+/*
+ * DMG bzip2 uncompression
+ *
+ * Copyright (c) 2004 Johannes E. Schindelin
+ * Copyright (c) 2016 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "dmg.h"
+#include <bzlib.h>
+
+/*
+ * Decompress the bzip2 data of @avail_in bytes at @next_in into the
+ * @avail_out byte buffer at @next_out, using a single BZ2_bzDecompress()
+ * call.
+ *
+ * Returns 0 only if the stream ended and produced exactly @avail_out bytes.
+ * Returns -1 if initialisation fails, the stream does not end, or the
+ * amount of output differs from @avail_out.
+ */
+static int dmg_uncompress_bz2_do(char *next_in, unsigned int avail_in,
+                                 char *next_out, unsigned int avail_out)
+{
+    bz_stream strm = {};
+    uint64_t produced;
+    int status;
+
+    if (BZ2_bzDecompressInit(&strm, 0, 0) != BZ_OK) {
+        return -1;
+    }
+
+    strm.next_in = next_in;
+    strm.avail_in = avail_in;
+    strm.next_out = next_out;
+    strm.avail_out = avail_out;
+
+    status = BZ2_bzDecompress(&strm);
+    /* Read the 64-bit output counter before the stream is torn down. */
+    produced = ((uint64_t)strm.total_out_hi32 << 32) + strm.total_out_lo32;
+    BZ2_bzDecompressEnd(&strm);
+
+    if (status == BZ_STREAM_END && produced == avail_out) {
+        return 0;
+    }
+    return -1;
+}
+
+/* Constructor: publishes dmg_uncompress_bz2_do through the
+ * dmg_uncompress_bz2 function pointer.  The pointer must not have been set
+ * already. */
+__attribute__((constructor))
+static void dmg_bz2_init(void)
+{
+    assert(!dmg_uncompress_bz2);
+    dmg_uncompress_bz2 = dmg_uncompress_bz2_do;
+}
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -28,10 +28,10 @@
 #include "qemu/bswap.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
-#include <zlib.h>
-#ifdef CONFIG_BZIP2
-#include <bzlib.h>
-#endif
+#include "dmg.h"
+
+int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in,
+                          char *next_out, unsigned int avail_out);

 enum {
    /* Limit chunk sizes to prevent unreasonable amounts of memory being used
@@ -41,31 +41,6 @@ enum {
    DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512,
 };

-typedef struct BDRVDMGState {
-    CoMutex lock;
-    /* each chunk contains a certain number of sectors,
-     * offsets[i] is the offset in the .dmg file,
-     * lengths[i] is the length of the compressed chunk,
-     * sectors[i] is the sector beginning at offsets[i],
-     * sectorcounts[i] is the number of sectors in that chunk,
-     * the sectors array is ordered
-     * 0<=i<n_chunks */
-
-    uint32_t n_chunks;
-    uint32_t* types;
-    uint64_t* offsets;
-    uint64_t* lengths;
-    uint64_t* sectors;
-    uint64_t* sectorcounts;
-    uint32_t current_chunk;
-    uint8_t *compressed_chunk;
-    uint8_t *uncompressed_chunk;
-    z_stream zstream;
-#ifdef CONFIG_BZIP2
-    bz_stream bzstream;
-#endif
-} BDRVDMGState;
-
 static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
    int len;
@@ -210,10 +185,9 @@ static bool dmg_is_known_block_type(uint32_t entry_type)
    case 0x00000001:    /* uncompressed */
    case 0x00000002:    /* zeroes */
    case 0x80000005:    /* zlib */
-#ifdef CONFIG_BZIP2
-    case 0x80000006:    /* bzip2 */
-#endif
        return true;
+    case 0x80000006:    /* bzip2 */
+        return !!dmg_uncompress_bz2;
    default:
        return false;
    }
@@ -439,6 +413,7 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
    int64_t offset;
    int ret;

+    block_module_load_one("dmg-bz2");
    bs->read_only = true;

    s->n_chunks = 0;
@@ -587,9 +562,6 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
    if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) {
        int ret;
        uint32_t chunk = search_chunk(s, sector_num);
-#ifdef CONFIG_BZIP2
-        uint64_t total_out;
-#endif

        if (chunk >= s->n_chunks) {
            return -1;
@@ -620,8 +592,10 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
                return -1;
            }
            break; }
-#ifdef CONFIG_BZIP2
        case 0x80000006: /* bzip2 compressed */
+            if (!dmg_uncompress_bz2) {
+                break;
+            }
            /* we need to buffer, because only the chunk as whole can be
             * inflated. */
            ret = bdrv_pread(bs->file, s->offsets[chunk],
@@ -630,24 +604,15 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
                return -1;
            }

-            ret = BZ2_bzDecompressInit(&s->bzstream, 0, 0);
-            if (ret != BZ_OK) {
-                return -1;
-            }
-            s->bzstream.next_in = (char *)s->compressed_chunk;
-            s->bzstream.avail_in = (unsigned int) s->lengths[chunk];
-            s->bzstream.next_out = (char *)s->uncompressed_chunk;
-            s->bzstream.avail_out = (unsigned int) 512 * s->sectorcounts[chunk];
-            ret = BZ2_bzDecompress(&s->bzstream);
-            total_out = ((uint64_t)s->bzstream.total_out_hi32 << 32) +
-                        s->bzstream.total_out_lo32;
-            BZ2_bzDecompressEnd(&s->bzstream);
-            if (ret != BZ_STREAM_END ||
-                total_out != 512 * s->sectorcounts[chunk]) {
-                return -1;
+            ret = dmg_uncompress_bz2((char *)s->compressed_chunk,
+                                     (unsigned int) s->lengths[chunk],
+                                     (char *)s->uncompressed_chunk,
+                                     (unsigned int)
+                                         (512 * s->sectorcounts[chunk]));
+            if (ret < 0) {
+                return ret;
            }
            break;
-#endif /* CONFIG_BZIP2 */
        case 1: /* copy */
            ret = bdrv_pread(bs->file, s->offsets[chunk],
                             s->uncompressed_chunk, s->lengths[chunk]);
--- a/block/dmg.h
+++ b/block/dmg.h
@@ -0,0 +1,59 @@
+/*
+ * Header for DMG driver
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ * Copyright (c) 2016 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCK_DMG_H
+#define BLOCK_DMG_H
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include <zlib.h>
+
+/* State kept by the DMG driver for one image. */
+typedef struct BDRVDMGState {
+    CoMutex lock;
+    /* each chunk contains a certain number of sectors,
+     * offsets[i] is the offset in the .dmg file,
+     * lengths[i] is the length of the compressed chunk,
+     * sectors[i] is the sector beginning at offsets[i],
+     * sectorcounts[i] is the number of sectors in that chunk,
+     * the sectors array is ordered
+     * 0<=i<n_chunks */
+
+    uint32_t n_chunks;
+    uint32_t *types;    /* presumably the block type of each chunk -- confirm */
+    uint64_t *offsets;
+    uint64_t *lengths;
+    uint64_t *sectors;
+    uint64_t *sectorcounts;
+    uint32_t current_chunk;
+    uint8_t *compressed_chunk;      /* input passed to the decompressor */
+    uint8_t *uncompressed_chunk;    /* output of the decompressor */
+    z_stream zstream;
+} BDRVDMGState;
+
+/* Hook for bzip2 decompression.  It is assigned by the constructor in
+ * block/dmg-bz2.c; the DMG driver checks it for NULL before treating bzip2
+ * chunks as supported.  The implementation returns 0 on success and -1 on
+ * failure. */
+extern int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in,
+                                 char *next_out, unsigned int avail_out);
+
+#endif
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -14,6 +14,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/uri.h"
 #include "qemu/error-report.h"
+#include "qemu/cutils.h"

 #define GLUSTER_OPT_FILENAME        "filename"
 #define GLUSTER_OPT_VOLUME          "volume"
@@ -38,7 +39,6 @@
 typedef struct GlusterAIOCB {
    int64_t size;
    int ret;
-    QEMUBH *bh;
    Coroutine *coroutine;
    AioContext *aio_context;
 } GlusterAIOCB;
@@ -57,6 +57,19 @@ typedef struct BDRVGlusterReopenState {
 } BDRVGlusterReopenState;


+/* A glfs_t that has already been created for a volume, together with the
+ * number of users currently sharing it. */
+typedef struct GlfsPreopened {
+    char *volume;   /* volume name; a private copy made with g_strdup() */
+    glfs_t *fs;     /* the shared glfs object */
+    int ref;        /* reference count; glfs_fini() runs when it hits zero */
+} GlfsPreopened;
+
+/* Node of glfs_list, wrapping one GlfsPreopened record. */
+typedef struct ListElement {
+    QLIST_ENTRY(ListElement) list;  /* linkage in glfs_list */
+    GlfsPreopened saved;            /* the cached volume/glfs pair */
+} ListElement;
+
+static QLIST_HEAD(glfs_list, ListElement) glfs_list;
+
 static QemuOptsList qemu_gluster_create_opts = {
    .name = "qemu-gluster-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
@@ -173,7 +186,7 @@ static QemuOptsList runtime_tcp_opts = {
        },
        {
            .name = GLUSTER_OPT_PORT,
-            .type = QEMU_OPT_NUMBER,
+            .type = QEMU_OPT_STRING,
            .help = "port number on which glusterd is listening (default 24007)",
        },
        {
@@ -195,6 +208,58 @@ static QemuOptsList runtime_tcp_opts = {
    },
 };

+/* Cache the new connection @fs under @volume with an initial reference. */
+static void glfs_set_preopened(const char *volume, glfs_t *fs)
+{
+    ListElement *elem = g_new(ListElement, 1);
+
+    /* The cache owns a private copy of the name (see glfs_clear_preopened). */
+    elem->saved.volume = g_strdup(volume);
+    elem->saved.fs = fs;
+    /* The creator of @fs holds the first reference. */
+    elem->saved.ref = 1;
+
+    QLIST_INSERT_HEAD(&glfs_list, elem, list);
+}
+
+/* Find @volume in the cache; a hit takes a reference. NULL if not cached. */
+static glfs_t *glfs_find_preopened(const char *volume)
+{
+    ListElement *cur;
+
+    QLIST_FOREACH(cur, &glfs_list, list) {
+        if (!strcmp(cur->saved.volume, volume)) {
+            cur->saved.ref += 1;
+            return cur->saved.fs;
+        }
+    }
+    return NULL;
+}
+
+/* Drop one reference on @fs; the last one unlists and glfs_fini()s it. */
+static void glfs_clear_preopened(glfs_t *fs)
+{
+    ListElement *cur, *following;
+
+    if (!fs) {
+        return;
+    }
+
+    QLIST_FOREACH_SAFE(cur, &glfs_list, list, following) {
+        if (cur->saved.fs != fs) {
+            continue;
+        }
+        if (--cur->saved.ref != 0) {
+            /* Other openers still share this connection. */
+            return;
+        }
+        QLIST_REMOVE(cur, list);
+        glfs_fini(cur->saved.fs);
+        g_free(cur->saved.volume);
+        g_free(cur);
+    }
+}
+
 static int parse_volume_options(BlockdevOptionsGluster *gconf, char *path)
 {
    char *p, *q;
@@ -331,22 +396,37 @@ static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
    int ret;
    int old_errno;
    GlusterServerList *server;
+    unsigned long long port;
+
+    glfs = glfs_find_preopened(gconf->volume);
+    if (glfs) {
+        return glfs;
+    }

    glfs = glfs_new(gconf->volume);
    if (!glfs) {
        goto out;
    }

+    glfs_set_preopened(gconf->volume, glfs);
+
    for (server = gconf->server; server; server = server->next) {
        if (server->value->type  == GLUSTER_TRANSPORT_UNIX) {
            ret = glfs_set_volfile_server(glfs,
                                   GlusterTransport_lookup[server->value->type],
                                   server->value->u.q_unix.path, 0);
        } else {
+            if (parse_uint_full(server->value->u.tcp.port, &port, 10) < 0 ||
+                port > 65535) {
+                error_setg(errp, "'%s' is not a valid port number",
+                           server->value->u.tcp.port);
+                errno = EINVAL;
+                goto out;
+            }
            ret = glfs_set_volfile_server(glfs,
                                   GlusterTransport_lookup[server->value->type],
                                   server->value->u.tcp.host,
-                                   atoi(server->value->u.tcp.port));
+                                   (int)port);
        }

        if (ret < 0) {
@@ -388,7 +468,7 @@ static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
 out:
    if (glfs) {
        old_errno = errno;
-        glfs_fini(glfs);
+        glfs_clear_preopened(glfs);
        errno = old_errno;
    }
    return NULL;
@@ -622,8 +702,6 @@ static void qemu_gluster_complete_aio(void *opaque)
 {
    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;

-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
    qemu_coroutine_enter(acb->coroutine);
 }

@@ -642,8 +720,7 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
        acb->ret = -EIO; /* Partial read/write - fail it */
    }

-    acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb);
-    qemu_bh_schedule(acb->bh);
+    aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
 }

 static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
@@ -672,7 +749,10 @@ static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
 */
 static bool qemu_gluster_test_seek(struct glfs_fd *fd)
 {
-    off_t ret, eof;
+    off_t ret = 0;
+
+#if defined SEEK_HOLE && defined SEEK_DATA
+    off_t eof;

    eof = glfs_lseek(fd, 0, SEEK_END);
    if (eof < 0) {
@@ -682,6 +762,8 @@ static bool qemu_gluster_test_seek(struct glfs_fd *fd)

    /* this should always fail with ENXIO if SEEK_DATA is supported */
    ret = glfs_lseek(fd, eof, SEEK_DATA);
+#endif
+
    return (ret < 0) && (errno == ENXIO);
 }

@@ -766,9 +848,9 @@ out:
    if (s->fd) {
        glfs_close(s->fd);
    }
-    if (s->glfs) {
-        glfs_fini(s->glfs);
-    }
+
+    glfs_clear_preopened(s->glfs);
+
    return ret;
 }

@@ -835,9 +917,8 @@ static void qemu_gluster_reopen_commit(BDRVReopenState *state)
    if (s->fd) {
        glfs_close(s->fd);
    }
-    if (s->glfs) {
-        glfs_fini(s->glfs);
-    }
+
+    glfs_clear_preopened(s->glfs);

    /* use the newly opened image / connection */
    s->fd         = reop_s->fd;
@@ -862,9 +943,7 @@ static void qemu_gluster_reopen_abort(BDRVReopenState *state)
        glfs_close(reop_s->fd);
    }

-    if (reop_s->glfs) {
-        glfs_fini(reop_s->glfs);
-    }
+    glfs_clear_preopened(reop_s->glfs);

    g_free(state->opaque);
    state->opaque = NULL;
@@ -988,9 +1067,7 @@ static int qemu_gluster_create(const char *filename,
 out:
    g_free(tmp);
    qapi_free_BlockdevOptionsGluster(gconf);
-    if (glfs) {
-        glfs_fini(glfs);
-    }
+    glfs_clear_preopened(glfs);
    return ret;
 }

@@ -1063,7 +1140,7 @@ static void qemu_gluster_close(BlockDriverState *bs)
        glfs_close(s->fd);
        s->fd = NULL;
    }
-    glfs_fini(s->glfs);
+    glfs_clear_preopened(s->glfs);
 }

 static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
@@ -1182,12 +1259,14 @@ static int find_allocation(BlockDriverState *bs, off_t start,
                           off_t *data, off_t *hole)
 {
    BDRVGlusterState *s = bs->opaque;
-    off_t offs;

    if (!s->supports_seek_data) {
-        return -ENOTSUP;
+        goto exit;
    }

+#if defined SEEK_HOLE && defined SEEK_DATA
+    off_t offs;
+
    /*
     * SEEK_DATA cases:
     * D1. offs == start: start is in data
@@ -1251,6 +1330,10 @@ static int find_allocation(BlockDriverState *bs, off_t start,

    /* D1 and H1 */
    return -EBUSY;
+#endif
+
+exit:
+    return -ENOTSUP;
 }

 /*
--- a/block/io.c
+++ b/block/io.c
@@ -143,7 +143,7 @@ bool bdrv_requests_pending(BlockDriverState *bs)
 {
    BdrvChild *child;

-    if (!QLIST_EMPTY(&bs->tracked_requests)) {
+    if (atomic_read(&bs->in_flight)) {
        return true;
    }

@@ -156,43 +156,38 @@ bool bdrv_requests_pending(BlockDriverState *bs)
    return false;
 }

-static void bdrv_drain_recurse(BlockDriverState *bs)
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
    BdrvChild *child;
+    bool waited;
+
+    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

    if (bs->drv && bs->drv->bdrv_drain) {
        bs->drv->bdrv_drain(bs);
    }
+
    QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_drain_recurse(child->bs);
+        waited |= bdrv_drain_recurse(child->bs);
    }
+
+    return waited;
 }

 typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
-    QEMUBH *bh;
    bool done;
 } BdrvCoDrainData;

-static void bdrv_drain_poll(BlockDriverState *bs)
-{
-    bool busy = true;
-
-    while (busy) {
-        /* Keep iterating */
-        busy = bdrv_requests_pending(bs);
-        busy |= aio_poll(bdrv_get_aio_context(bs), busy);
-    }
-}
-
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
+    BlockDriverState *bs = data->bs;

-    qemu_bh_delete(data->bh);
-    bdrv_drain_poll(data->bs);
+    bdrv_dec_in_flight(bs);
+    bdrv_drained_begin(bs);
    data->done = true;
    qemu_coroutine_enter(co);
 }
@@ -210,9 +205,10 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
-        .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data),
    };
-    qemu_bh_schedule(data.bh);
+    bdrv_inc_in_flight(bs);
+    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
+                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
@@ -222,6 +218,11 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)

 void bdrv_drained_begin(BlockDriverState *bs)
 {
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(bs);
+        return;
+    }
+
    if (!bs->quiesce_counter++) {
        aio_disable_external(bdrv_get_aio_context(bs));
        bdrv_parent_drained_begin(bs);
@@ -229,11 +230,6 @@ void bdrv_drained_begin(BlockDriverState *bs)

    bdrv_io_unplugged_begin(bs);
    bdrv_drain_recurse(bs);
-    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs);
-    } else {
-        bdrv_drain_poll(bs);
-    }
    bdrv_io_unplugged_end(bs);
 }

@@ -277,11 +273,17 @@ void bdrv_drain(BlockDriverState *bs)
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
+ *
+ * This pauses all block jobs and disables external clients. It must
+ * be paired with bdrv_drain_all_end().
+ *
+ * NOTE: no new block jobs or BlockDriverStates can be created between
+ * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
-void bdrv_drain_all(void)
+void bdrv_drain_all_begin(void)
 {
    /* Always run first iteration so any pending completion BHs run */
-    bool busy = true;
+    bool waited = true;
    BlockDriverState *bs;
    BdrvNextIterator it;
    BlockJob *job = NULL;
@@ -301,7 +303,7 @@ void bdrv_drain_all(void)
        aio_context_acquire(aio_context);
        bdrv_parent_drained_begin(bs);
        bdrv_io_unplugged_begin(bs);
-        bdrv_drain_recurse(bs);
+        aio_disable_external(aio_context);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -315,8 +317,8 @@ void bdrv_drain_all(void)
     * request completion.  Therefore we must keep looping until there was no
     * more activity rather than simply draining each device independently.
     */
-    while (busy) {
-        busy = false;
+    while (waited) {
+        waited = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;
@@ -324,28 +326,32 @@ void bdrv_drain_all(void)
            aio_context_acquire(aio_context);
            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                if (aio_context == bdrv_get_aio_context(bs)) {
-                    if (bdrv_requests_pending(bs)) {
-                        busy = true;
-                        aio_poll(aio_context, busy);
-                    }
+                    waited |= bdrv_drain_recurse(bs);
                }
            }
-            busy |= aio_poll(aio_context, false);
            aio_context_release(aio_context);
        }
    }

+    g_slist_free(aio_ctxs);
+}
+
+void bdrv_drain_all_end(void)
+{
+    BlockDriverState *bs;
+    BdrvNextIterator it;
+    BlockJob *job = NULL;
+
    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
+        aio_enable_external(aio_context);
        bdrv_io_unplugged_end(bs);
        bdrv_parent_drained_end(bs);
        aio_context_release(aio_context);
    }
-    g_slist_free(aio_ctxs);

-    job = NULL;
    while ((job = block_job_next(job))) {
        AioContext *aio_context = blk_get_aio_context(job->blk);

@@ -355,6 +361,12 @@ void bdrv_drain_all(void)
    }
 }

+void bdrv_drain_all(void)
+{
+    bdrv_drain_all_begin(); /* enter the drained section and wait */
+    bdrv_drain_all_end();   /* leave it again straight away */
+}
+
 /**
 * Remove an active request from the tracked requests list
 *
@@ -478,6 +490,28 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req,
    return true;
 }

+void bdrv_inc_in_flight(BlockDriverState *bs)
+{
+    atomic_inc(&bs->in_flight); /* balanced by bdrv_dec_in_flight() */
+}
+
+static void dummy_bh_cb(void *opaque)
+{ /* intentionally empty; scheduled by bdrv_wakeup() */
+}
+
+void bdrv_wakeup(BlockDriverState *bs)
+{
+    if (bs->wakeup) { /* queue a no-op BH on qemu_get_aio_context() */
+        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
+    }
+}
+
+void bdrv_dec_in_flight(BlockDriverState *bs)
+{
+    atomic_dec(&bs->in_flight); /* balances bdrv_inc_in_flight() */
+    bdrv_wakeup(bs);            /* notify only after the count dropped */
+}
+
 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 {
    BlockDriverState *bs = self->bs;
@@ -585,13 +619,9 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
-        AioContext *aio_context = bdrv_get_aio_context(child->bs);
-
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        qemu_coroutine_enter(co);
-        while (rwco.ret == NOT_DONE) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
 }
@@ -1099,6 +1129,8 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
        return ret;
    }

+    bdrv_inc_in_flight(bs);
+
    /* Don't do copy-on-read if we read data before write operation */
    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
@@ -1134,6 +1166,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
@@ -1181,6 +1214,8 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
+    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
+                                    MAX_WRITE_ZEROES_BOUNCE_BUFFER);

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
@@ -1196,9 +1231,12 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
         * boundaries.
         */
        if (head) {
-            /* Make a small request up to the first aligned sector.  */
-            num = MIN(count, alignment - head);
-            head = 0;
+            /* Make a small request up to the first aligned sector. For
+             * convenience, limit this request to max_transfer even if
+             * we don't need to fall back to writes.  */
+            num = MIN(MIN(count, max_transfer), alignment - head);
+            head = (head + num) % alignment;
+            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.  */
            num -= tail;
@@ -1224,8 +1262,6 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
-            int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
-                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
@@ -1482,6 +1518,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
        return ret;
    }

+    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
@@ -1583,6 +1620,7 @@ fail:
    qemu_vfree(tail_buf);
 out:
    tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
    return ret;
 }

@@ -1619,6 +1657,31 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                           BDRV_REQ_ZERO_WRITE | flags);
 }

+/* Flush ALL BDSes, whether or not they are reachable via a BlockBackend.
+ * Returns 0 on success, otherwise the first bdrv_flush() error seen.
+ */
+int bdrv_flush_all(void)
+{
+    BlockDriverState *bs;
+    BdrvNextIterator it;
+    int first_err = 0;
+
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+        int rc;
+
+        aio_context_acquire(ctx);
+        rc = bdrv_flush(bs);
+        aio_context_release(ctx);
+        if (first_err == 0 && rc < 0) {
+            first_err = rc;
+        }
+    }
+
+    return first_err;
+}
+
+
 typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
@@ -1682,17 +1745,19 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
    }

    *file = NULL;
+    bdrv_inc_in_flight(bs);
    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
                                            file);
    if (ret < 0) {
        *pnum = 0;
-        return ret;
+        goto out;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
-        return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
-                                     *pnum, pnum, file);
+        ret = bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
+                                    *pnum, pnum, file);
+        goto out;
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
@@ -1734,6 +1799,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
        }
    }

+out:
+    bdrv_dec_in_flight(bs);
    return ret;
 }

@@ -1799,14 +1866,10 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs,
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_above_co_entry(&data);
    } else {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
                                   &data);
        qemu_coroutine_enter(co);
-        while (!data.done) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(bs, !data.done);
    }
    return data.ret;
 }
@@ -2070,7 +2133,6 @@ typedef struct BlockAIOCBCoroutine {
    bool is_write;
    bool need_bh;
    bool *done;
-    QEMUBH* bh;
 } BlockAIOCBCoroutine;

 static const AIOCBInfo bdrv_em_co_aiocb_info = {
@@ -2080,6 +2142,7 @@ static const AIOCBInfo bdrv_em_co_aiocb_info = {
 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 {
    if (!acb->need_bh) {
+        bdrv_dec_in_flight(acb->common.bs);
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
@@ -2090,7 +2153,6 @@ static void bdrv_co_em_bh(void *opaque)
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
-    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
 }

@@ -2100,8 +2162,7 @@ static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

-        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
-        qemu_bh_schedule(acb->bh);
+        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    }
 }

@@ -2132,6 +2193,9 @@ static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

+    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
+    bdrv_inc_in_flight(child->bs);
+
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
    acb->child = child;
    acb->need_bh = true;
@@ -2165,6 +2229,9 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

+    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
+    bdrv_inc_in_flight(bs);
+
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
@@ -2176,35 +2243,6 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
    return &acb->common;
 }

-static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque)
-{
-    BlockAIOCBCoroutine *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
-
-    acb->req.error = bdrv_co_pdiscard(bs, acb->req.offset, acb->req.bytes);
-    bdrv_co_complete(acb);
-}
-
-BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count,
-                              BlockCompletionFunc *cb, void *opaque)
-{
-    Coroutine *co;
-    BlockAIOCBCoroutine *acb;
-
-    trace_bdrv_aio_pdiscard(bs, offset, count, opaque);
-
-    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
-    acb->need_bh = true;
-    acb->req.error = -EINPROGRESS;
-    acb->req.offset = offset;
-    acb->req.bytes = count;
-    co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb);
-    qemu_coroutine_enter(co);
-
-    bdrv_co_maybe_schedule_bh(acb);
-    return &acb->common;
-}
-
 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
 {
@@ -2253,23 +2291,22 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
    int ret;
-    BdrvTrackedRequest req;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

-    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
+    bdrv_inc_in_flight(bs);

    int current_gen = bs->write_gen;

    /* Wait until any previous flushes are completed */
-    while (bs->active_flush_req != NULL) {
+    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue);
    }

-    bs->active_flush_req = &req;
+    bs->active_flush_req = true;

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
@@ -2338,12 +2375,14 @@ flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
 out:
    /* Notify any pending flushes that we have completed */
-    bs->flushed_gen = current_gen;
-    bs->active_flush_req = NULL;
+    if (ret == 0) {
+        bs->flushed_gen = current_gen;
+    }
+    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);

-    tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
    return ret;
 }

@@ -2359,13 +2398,9 @@ int bdrv_flush(BlockDriverState *bs)
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        qemu_coroutine_enter(co);
-        while (flush_co.ret == NOT_DONE) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
@@ -2389,7 +2424,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
 {
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
-    int head, align;
+    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
@@ -2412,20 +2447,17 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
        return 0;
    }

-    /* Discard is advisory, so ignore any unaligned head or tail */
+    /* Discard is advisory, but some devices track and coalesce
+     * unaligned requests, so we must pass everything down rather than
+     * round here.  Still, most devices will just silently ignore
+     * unaligned requests (by returning -ENOTSUP), so we must fragment
+     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
-    if (head) {
-        head = MIN(count, align - head);
-        count -= head;
-        offset += head;
-    }
-    count = QEMU_ALIGN_DOWN(count, align);
-    if (!count) {
-        return 0;
-    }
+    tail = (offset + count) % align;

+    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
@@ -2435,11 +2467,34 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
-    assert(max_pdiscard);
+    assert(max_pdiscard >= bs->bl.request_alignment);

    while (count > 0) {
        int ret;
-        int num = MIN(count, max_pdiscard);
+        int num = count;
+
+        if (head) {
+            /* Make small requests to get to alignment boundaries. */
+            num = MIN(count, align - head);
+            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
+                num %= bs->bl.request_alignment;
+            }
+            head = (head + num) % align;
+            assert(num < max_pdiscard);
+        } else if (tail) {
+            if (num > align) {
+                /* Shorten the request to the last aligned cluster.  */
+                num -= tail;
+            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
+                       tail > bs->bl.request_alignment) {
+                tail %= bs->bl.request_alignment;
+                num -= tail;
+            }
+        }
+        /* limit request size */
+        if (num > max_pdiscard) {
+            num = max_pdiscard;
+        }

        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
@@ -2472,6 +2527,7 @@ out:
    bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
                   req.bytes >> BDRV_SECTOR_BITS);
    tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
    return ret;
 }

@@ -2489,106 +2545,41 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        qemu_coroutine_enter(co);
-        while (rwco.ret == NOT_DONE) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
 }

-static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
+int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
 {
    BlockDriver *drv = bs->drv;
-    BdrvTrackedRequest tracked_req;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

-    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
-    if (!drv || !drv->bdrv_aio_ioctl) {
+    bdrv_inc_in_flight(bs);
+    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

-    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
-    if (!acb) {
-        co.ret = -ENOTSUP;
-        goto out;
-    }
-    qemu_coroutine_yield();
-out:
-    tracked_request_end(&tracked_req);
-    return co.ret;
-}
-
-typedef struct {
-    BlockDriverState *bs;
-    int req;
-    void *buf;
-    int ret;
-} BdrvIoctlCoData;
-
-static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
-{
-    BdrvIoctlCoData *data = opaque;
-    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
-}
-
-/* needed for generic scsi interface */
-int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    BdrvIoctlCoData data = {
-        .bs = bs,
-        .req = req,
-        .buf = buf,
-        .ret = -EINPROGRESS,
-    };
-
-    if (qemu_in_coroutine()) {
-        /* Fast-path if already in coroutine context */
-        bdrv_co_ioctl_entry(&data);
+    if (drv->bdrv_co_ioctl) {
+        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
-        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry, &data);
-
-        qemu_coroutine_enter(co);
-        while (data.ret == -EINPROGRESS) {
-            aio_poll(bdrv_get_aio_context(bs), true);
+        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
+        if (!acb) {
+            co.ret = -ENOTSUP;
+            goto out;
        }
+        qemu_coroutine_yield();
    }
-    return data.ret;
-}
-
-static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
-{
-    BlockAIOCBCoroutine *acb = opaque;
-    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
-                                      acb->req.req, acb->req.buf);
-    bdrv_co_complete(acb);
-}
-
-BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
-        unsigned long int req, void *buf,
-        BlockCompletionFunc *cb, void *opaque)
-{
-    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
-                                            bs, cb, opaque);
-    Coroutine *co;
-
-    acb->need_bh = true;
-    acb->req.error = -EINPROGRESS;
-    acb->req.req = req;
-    acb->req.buf = buf;
-    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry, acb);
-    qemu_coroutine_enter(co);
-
-    bdrv_co_maybe_schedule_bh(acb);
-    return &acb->common;
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
 }

 void *qemu_blockalign(BlockDriverState *bs, size_t size)
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -95,7 +95,6 @@ typedef struct IscsiTask {
    int do_retry;
    struct scsi_task *task;
    Coroutine *co;
-    QEMUBH *bh;
    IscsiLun *iscsilun;
    QEMUTimer retry_timer;
    int err_code;
@@ -167,7 +166,6 @@ static void iscsi_co_generic_bh_cb(void *opaque)
 {
    struct IscsiTask *iTask = opaque;
    iTask->complete = 1;
-    qemu_bh_delete(iTask->bh);
    qemu_coroutine_enter(iTask->co);
 }

@@ -204,6 +202,10 @@ static inline unsigned exp_random(double mean)
 #define SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR        0x1a00
 #endif

+#ifndef LIBISCSI_API_VERSION
+#define LIBISCSI_API_VERSION 20130701
+#endif
+
 static int iscsi_translate_sense(struct scsi_sense *sense)
 {
    int ret;
@@ -299,9 +301,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,

 out:
    if (iTask->co) {
-        iTask->bh = aio_bh_new(iTask->iscsilun->aio_context,
-                               iscsi_co_generic_bh_cb, iTask);
-        qemu_bh_schedule(iTask->bh);
+        aio_bh_schedule_oneshot(iTask->iscsilun->aio_context,
+                                 iscsi_co_generic_bh_cb, iTask);
    } else {
        iTask->complete = 1;
    }
@@ -595,6 +596,20 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
    if (iscsilun->use_16_for_rw) {
+#if LIBISCSI_API_VERSION >= (20160603)
+        iTask.task = iscsi_write16_iov_task(iscsilun->iscsi, iscsilun->lun, lba,
+                                            NULL, num_sectors * iscsilun->block_size,
+                                            iscsilun->block_size, 0, 0, fua, 0, 0,
+                                            iscsi_co_generic_cb, &iTask,
+                                            (struct scsi_iovec *)iov->iov, iov->niov);
+    } else {
+        iTask.task = iscsi_write10_iov_task(iscsilun->iscsi, iscsilun->lun, lba,
+                                            NULL, num_sectors * iscsilun->block_size,
+                                            iscsilun->block_size, 0, 0, fua, 0, 0,
+                                            iscsi_co_generic_cb, &iTask,
+                                            (struct scsi_iovec *)iov->iov, iov->niov);
+    }
+#else
        iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
                                        NULL, num_sectors * iscsilun->block_size,
                                        iscsilun->block_size, 0, 0, fua, 0, 0,
@@ -605,11 +620,14 @@ retry:
                                        iscsilun->block_size, 0, 0, fua, 0, 0,
                                        iscsi_co_generic_cb, &iTask);
    }
+#endif
    if (iTask.task == NULL) {
        return -ENOMEM;
    }
+#if LIBISCSI_API_VERSION < (20160603)
    scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov,
                          iov->niov);
+#endif
    while (!iTask.complete) {
        iscsi_set_events(iscsilun);
        qemu_coroutine_yield();
@@ -792,6 +810,21 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
    iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
    if (iscsilun->use_16_for_rw) {
+#if LIBISCSI_API_VERSION >= (20160603)
+        iTask.task = iscsi_read16_iov_task(iscsilun->iscsi, iscsilun->lun, lba,
+                                           num_sectors * iscsilun->block_size,
+                                           iscsilun->block_size, 0, 0, 0, 0, 0,
+                                           iscsi_co_generic_cb, &iTask,
+                                           (struct scsi_iovec *)iov->iov, iov->niov);
+    } else {
+        iTask.task = iscsi_read10_iov_task(iscsilun->iscsi, iscsilun->lun, lba,
+                                           num_sectors * iscsilun->block_size,
+                                           iscsilun->block_size,
+                                           0, 0, 0, 0, 0,
+                                           iscsi_co_generic_cb, &iTask,
+                                           (struct scsi_iovec *)iov->iov, iov->niov);
+    }
+#else
        iTask.task = iscsi_read16_task(iscsilun->iscsi, iscsilun->lun, lba,
                                       num_sectors * iscsilun->block_size,
                                       iscsilun->block_size, 0, 0, 0, 0, 0,
@@ -803,11 +836,13 @@ retry:
                                       0, 0, 0, 0, 0,
                                       iscsi_co_generic_cb, &iTask);
    }
+#endif
    if (iTask.task == NULL) {
        return -ENOMEM;
    }
+#if LIBISCSI_API_VERSION < (20160603)
    scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov);
-
+#endif
    while (!iTask.complete) {
        iscsi_set_events(iscsilun);
        qemu_coroutine_yield();
@@ -1048,7 +1083,9 @@ coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
    struct IscsiTask iTask;
    struct unmap_list list;

-    assert(is_byte_request_lun_aligned(offset, count, iscsilun));
+    if (!is_byte_request_lun_aligned(offset, count, iscsilun)) {
+        return -ENOTSUP;
+    }

    if (!iscsilun->lbp.lbpu) {
        /* UNMAP is not supported by the target */
@@ -1609,7 +1646,13 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -ENOMEM;
        goto out;
    }
-
+#if LIBISCSI_API_VERSION >= (20160603)
+    if (iscsi_init_transport(iscsi, iscsi_url->transport)) {
+        error_setg(errp, ("Error initializing transport."));
+        ret = -EINVAL;
+        goto out;
+    }
+#endif
    if (iscsi_set_targetname(iscsi, iscsi_url->target)) {
        error_setg(errp, "iSCSI: Failed to set target name.");
        ret = -EINVAL;
@@ -1652,7 +1695,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,

    /* timeout handling is broken in libiscsi before 1.15.0 */
    timeout = parse_timeout(iscsi_url->target);
-#if defined(LIBISCSI_API_VERSION) && LIBISCSI_API_VERSION >= 20150621
+#if LIBISCSI_API_VERSION >= 20150621
    iscsi_set_timeout(iscsi, timeout);
 #else
    if (timeout) {
@@ -2013,9 +2056,48 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_attach_aio_context = iscsi_attach_aio_context,
 };

+#if LIBISCSI_API_VERSION >= (20160603)
+static BlockDriver bdrv_iser = {
+    .format_name     = "iser",
+    .protocol_name   = "iser",
+    /* Every hook is an iscsi_* function.  Lifecycle and reopen: */
+    .instance_size   = sizeof(IscsiLun),
+    .bdrv_needs_filename = true,
+    .bdrv_file_open  = iscsi_open,
+    .bdrv_close      = iscsi_close,
+    .bdrv_create     = iscsi_create,
+    .create_opts     = &iscsi_create_opts,
+    .bdrv_reopen_prepare   = iscsi_reopen_prepare,
+    .bdrv_reopen_commit    = iscsi_reopen_commit,
+    .bdrv_invalidate_cache = iscsi_invalidate_cache,
+    /* Length, info, truncate and request limits: */
+    .bdrv_getlength  = iscsi_getlength,
+    .bdrv_get_info   = iscsi_get_info,
+    .bdrv_truncate   = iscsi_truncate,
+    .bdrv_refresh_limits = iscsi_refresh_limits,
+    /* Coroutine (bdrv_co_*) request callbacks: */
+    .bdrv_co_get_block_status = iscsi_co_get_block_status,
+    .bdrv_co_pdiscard      = iscsi_co_pdiscard,
+    .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
+    .bdrv_co_readv         = iscsi_co_readv,
+    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
+    .bdrv_co_flush_to_disk = iscsi_co_flush,
+    /* Asynchronous ioctl hook, Linux builds only: */
+#ifdef __linux__
+    .bdrv_aio_ioctl   = iscsi_aio_ioctl,
+#endif
+    /* AioContext detach/attach: */
+    .bdrv_detach_aio_context = iscsi_detach_aio_context,
+    .bdrv_attach_aio_context = iscsi_attach_aio_context,
+};
+#endif
+
 static void iscsi_block_init(void)
 {
    bdrv_register(&bdrv_iscsi);
+#if LIBISCSI_API_VERSION >= (20160603)
+    bdrv_register(&bdrv_iser);
+#endif
 }

 block_init(iscsi_block_init);
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -94,9 +94,12 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)

    laiocb->ret = ret;
    if (laiocb->co) {
-        /* Jump and continue completion for foreign requests, don't do
-         * anything for current request, it will be completed shortly. */
-        if (laiocb->co != qemu_coroutine_self()) {
+        /* If the coroutine is already entered it must be in ioq_submit() and
+         * will notice laiocb->ret has been filled in when it eventually runs
+         * later.  Coroutines cannot be entered recursively so avoid doing
+         * that!
+         */
+        if (!qemu_coroutine_entered(laiocb->co)) {
            qemu_coroutine_enter(laiocb->co);
        }
    } else {
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -13,7 +13,7 @@

 #include "qemu/osdep.h"
 #include "trace.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
@@ -55,7 +55,7 @@ typedef struct MirrorBlockJob {
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
-    HBitmapIter hbi;
+    BdrvDirtyBitmapIter *dbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;
@@ -330,10 +330,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT,
                             MAX_IO_SECTORS);

-    sector_num = hbitmap_iter_next(&s->hbi);
+    sector_num = bdrv_dirty_iter_next(s->dbi);
    if (sector_num < 0) {
-        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
-        sector_num = hbitmap_iter_next(&s->hbi);
+        bdrv_set_dirty_iter(s->dbi, 0);
+        sector_num = bdrv_dirty_iter_next(s->dbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }
@@ -349,7 +349,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    /* Find the number of consective dirty chunks following the first dirty
     * one, and wait for in flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
-        int64_t hbitmap_next;
+        int64_t next_dirty;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
@@ -360,13 +360,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
            break;
        }

-        hbitmap_next = hbitmap_iter_next(&s->hbi);
-        if (hbitmap_next > next_sector || hbitmap_next < 0) {
+        next_dirty = bdrv_dirty_iter_next(s->dbi);
+        if (next_dirty > next_sector || next_dirty < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
-            bdrv_set_dirty_iter(&s->hbi, next_sector);
-            hbitmap_next = hbitmap_iter_next(&s->hbi);
+            bdrv_set_dirty_iter(s->dbi, next_sector);
+            next_dirty = bdrv_dirty_iter_next(s->dbi);
        }
-        assert(hbitmap_next == next_sector);
+        assert(next_dirty == next_sector);
        nb_chunks++;
    }

@@ -469,7 +469,11 @@ static void mirror_free_init(MirrorBlockJob *s)
    }
 }

-static void mirror_drain(MirrorBlockJob *s)
+/* This is also used for the .pause callback. There is no matching
+ * mirror_resume() because mirror_run() will begin iterating again
+ * when the job is resumed.
+ */
+static void mirror_wait_for_all_io(MirrorBlockJob *s)
 {
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
@@ -526,8 +530,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
-    bdrv_op_unblock_all(target_bs, s->common.blocker);
    blk_unref(s->target);
+    s->target = NULL;
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
@@ -582,7 +586,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
            sector_num += nb_sectors;
        }

-        mirror_drain(s);
+        mirror_wait_for_all_io(s);
    }

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
@@ -611,12 +615,27 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
    return 0;
 }

+/* Flush the target: used when leaving the streaming phase and once more
+ * just before completing.  On failure, s->ret is set only if the error
+ * action is BLOCK_ERROR_ACTION_REPORT.  Returns blk_flush()'s result. */
+static int mirror_flush(MirrorBlockJob *s)
+{
+    int rc = blk_flush(s->target);
+
+    if (rc < 0 &&
+        mirror_error_action(s, false, -rc) == BLOCK_ERROR_ACTION_REPORT) {
+        s->ret = rc;
+    }
+    return rc;
+}
+
 static void coroutine_fn mirror_run(void *opaque)
 {
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
+    bool need_drain = true;
    int64_t length;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
@@ -679,7 +698,8 @@ static void coroutine_fn mirror_run(void *opaque)
        }
    }

-    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
+    assert(!s->dbi);
+    s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap, 0);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
@@ -721,27 +741,23 @@ static void coroutine_fn mirror_run(void *opaque)
        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
-            ret = blk_flush(s->target);
-            if (ret < 0) {
-                if (mirror_error_action(s, false, -ret) ==
-                    BLOCK_ERROR_ACTION_REPORT) {
-                    goto immediate_exit;
+            if (!s->synced) {
+                if (mirror_flush(s) < 0) {
+                    /* Go check s->ret.  */
+                    continue;
                }
-            } else {
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
-                if (!s->synced) {
-                    block_job_event_ready(&s->common);
-                    s->synced = true;
-                }
-
-                should_complete = s->should_complete ||
-                    block_job_is_cancelled(&s->common);
-                cnt = bdrv_get_dirty_count(s->dirty_bitmap);
+                block_job_event_ready(&s->common);
+                s->synced = true;
            }
+
+            should_complete = s->should_complete ||
+                block_job_is_cancelled(&s->common);
+            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        if (cnt == 0 && should_complete) {
@@ -751,11 +767,26 @@ static void coroutine_fn mirror_run(void *opaque)
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
-             * mirror_populate runs.
+             * mirror_populate runs, so pause it now.  Before deciding
+             * whether to switch to target, check one last time if I/O has
+             * arrived in the meantime and, if not, flush the data to disk.
             */
            trace_mirror_before_drain(s, cnt);
-            bdrv_co_drain(bs);
+
+            bdrv_drained_begin(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
+            if (cnt > 0 || mirror_flush(s) < 0) {
+                bdrv_drained_end(bs);
+                continue;
+            }
+
+            /* The two disks are in sync.  Exit and report successful
+             * completion.
+             */
+            assert(QLIST_EMPTY(&bs->tracked_requests));
+            s->common.cancelled = false;
+            need_drain = false;
+            break;
        }

        ret = 0;
@@ -768,13 +799,6 @@ static void coroutine_fn mirror_run(void *opaque)
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
-        } else if (cnt == 0) {
-            /* The two disks are in sync.  Exit and report successful
-             * completion.
-             */
-            assert(QLIST_EMPTY(&bs->tracked_requests));
-            s->common.cancelled = false;
-            break;
        }
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }
@@ -786,20 +810,23 @@ immediate_exit:
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
-        mirror_drain(s);
+        assert(need_drain);
+        mirror_wait_for_all_io(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
+    bdrv_dirty_iter_free(s->dbi);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
-    /* Before we switch to target in mirror_exit, make sure data doesn't
-     * change. */
-    bdrv_drained_begin(bs);
+
+    if (need_drain) {
+        bdrv_drained_begin(bs);
+    }
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
 }

@@ -870,14 +897,11 @@ static void mirror_complete(BlockJob *job, Error **errp)
    block_job_enter(&s->common);
 }

-/* There is no matching mirror_resume() because mirror_run() will begin
- * iterating again when the job is resumed.
- */
-static void coroutine_fn mirror_pause(BlockJob *job)
+static void mirror_pause(BlockJob *job)
 {
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

-    mirror_drain(s);
+    mirror_wait_for_all_io(s);
 }

 static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
@@ -887,28 +911,47 @@ static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
    blk_set_aio_context(s->target, new_context);
 }

+static void mirror_drain(BlockJob *job)
+{
+    MirrorBlockJob *mirror = container_of(job, MirrorBlockJob, common);
+    BlockBackend *blk = mirror->target;
+
+    if (!blk) {
+        return;     /* target pointer is unset: nothing to drain */
+    }
+    /* Hold our own reference across the drain, in case blk_drain
+     * triggers execution of mirror_complete. */
+    blk_ref(blk);
+    blk_drain(blk);
+    blk_unref(blk);
+}
+
 static const BlockJobDriver mirror_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
+    .start                  = mirror_run,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
+    .drain                  = mirror_drain,
 };

 static const BlockJobDriver commit_active_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
+    .start                  = mirror_run,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
+    .drain                  = mirror_drain,
 };

 static void mirror_start_job(const char *job_id, BlockDriverState *bs,
-                             BlockDriverState *target, const char *replaces,
-                             int64_t speed, uint32_t granularity,
-                             int64_t buf_size,
+                             int creation_flags, BlockDriverState *target,
+                             const char *replaces, int64_t speed,
+                             uint32_t granularity, int64_t buf_size,
                             BlockMirrorBackingMode backing_mode,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
@@ -936,7 +979,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

-    s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp);
+    s = block_job_create(job_id, driver, bs, speed, creation_flags,
+                         cb, opaque, errp);
    if (!s) {
        return;
    }
@@ -965,11 +1009,18 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
        return;
    }

-    bdrv_op_block_all(target, s->common.blocker);
+    block_job_add_bdrv(&s->common, target);
+    /* In commit_active_start() all intermediate nodes disappear, so
+     * any jobs in them must be blocked */
+    if (bdrv_chain_contains(bs, target)) {
+        BlockDriverState *iter;
+        for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
+            block_job_add_bdrv(&s->common, iter);
+        }
+    }

-    s->common.co = qemu_coroutine_create(mirror_run, s);
-    trace_mirror_start(bs, s, s->common.co, opaque);
-    qemu_coroutine_enter(s->common.co);
+    trace_mirror_start(bs, s, opaque);
+    block_job_start(&s->common);
 }

 void mirror_start(const char *job_id, BlockDriverState *bs,
@@ -978,9 +1029,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
-                  bool unmap,
-                  BlockCompletionFunc *cb,
-                  void *opaque, Error **errp)
+                  bool unmap, Error **errp)
 {
    bool is_none_mode;
    BlockDriverState *base;
@@ -991,17 +1040,16 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
-    mirror_start_job(job_id, bs, target, replaces,
+    mirror_start_job(job_id, bs, BLOCK_JOB_DEFAULT, target, replaces,
                     speed, granularity, buf_size, backing_mode,
-                     on_source_error, on_target_error, unmap, cb, opaque, errp,
+                     on_source_error, on_target_error, unmap, NULL, NULL, errp,
                     &mirror_job_driver, is_none_mode, base, false);
 }

 void commit_active_start(const char *job_id, BlockDriverState *bs,
-                         BlockDriverState *base, int64_t speed,
-                         BlockdevOnError on_error,
-                         BlockCompletionFunc *cb,
-                         void *opaque, Error **errp,
+                         BlockDriverState *base, int creation_flags,
+                         int64_t speed, BlockdevOnError on_error,
+                         BlockCompletionFunc *cb, void *opaque, Error **errp,
                         bool auto_complete)
 {
    int64_t length, base_length;
@@ -1040,9 +1088,9 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
        }
    }

-    mirror_start_job(job_id, bs, base, NULL, speed, 0, 0,
+    mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
                     MIRROR_LEAVE_BACKING_CHAIN,
-                     on_error, on_error, false, cb, opaque, &local_err,
+                     on_error, on_error, true, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base, auto_complete);
    if (local_err) {
        error_propagate(errp, local_err);
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -1,6 +1,7 @@
 /*
 * QEMU Block driver for  NBD
 *
+ * Copyright (C) 2016 Red Hat, Inc.
 * Copyright (C) 2008 Bull S.A.S.
 *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
 *
@@ -32,7 +33,7 @@
 #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
 #define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))

-static void nbd_recv_coroutines_enter_all(NbdClientSession *s)
+static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
 {
    int i;

@@ -45,7 +46,7 @@ static void nbd_recv_coroutines_enter_all(NbdClientSession *s)

 static void nbd_teardown_connection(BlockDriverState *bs)
 {
-    NbdClientSession *client = nbd_get_client_session(bs);
+    NBDClientSession *client = nbd_get_client_session(bs);

    if (!client->ioc) { /* Already closed */
        return;
@@ -67,7 +68,7 @@ static void nbd_teardown_connection(BlockDriverState *bs)
 static void nbd_reply_ready(void *opaque)
 {
    BlockDriverState *bs = opaque;
-    NbdClientSession *s = nbd_get_client_session(bs);
+    NBDClientSession *s = nbd_get_client_session(bs);
    uint64_t i;
    int ret;

@@ -115,10 +116,10 @@ static void nbd_restart_write(void *opaque)
 }

 static int nbd_co_send_request(BlockDriverState *bs,
-                               struct nbd_request *request,
+                               NBDRequest *request,
                               QEMUIOVector *qiov)
 {
-    NbdClientSession *s = nbd_get_client_session(bs);
+    NBDClientSession *s = nbd_get_client_session(bs);
    AioContext *aio_context;
    int rc, ret, i;

@@ -166,9 +167,9 @@ static int nbd_co_send_request(BlockDriverState *bs,
    return rc;
 }

-static void nbd_co_receive_reply(NbdClientSession *s,
-                                 struct nbd_request *request,
-                                 struct nbd_reply *reply,
+static void nbd_co_receive_reply(NBDClientSession *s,
+                                 NBDRequest *request,
+                                 NBDReply *reply,
                                 QEMUIOVector *qiov)
 {
    int ret;
@@ -194,13 +195,13 @@ static void nbd_co_receive_reply(NbdClientSession *s,
    }
 }

-static void nbd_coroutine_start(NbdClientSession *s,
-   struct nbd_request *request)
+static void nbd_coroutine_start(NBDClientSession *s,
+                                NBDRequest *request)
 {
    /* Poor man semaphore.  The free_sema is locked when no other request
     * can be accepted, and unlocked after receiving one reply.  */
-    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
-        qemu_co_mutex_lock(&s->free_sema);
+    if (s->in_flight == MAX_NBD_REQUESTS) {
+        qemu_co_queue_wait(&s->free_sema);
        assert(s->in_flight < MAX_NBD_REQUESTS);
    }
    s->in_flight++;
@@ -208,26 +209,26 @@ static void nbd_coroutine_start(NbdClientSession *s,
    /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
 }

-static void nbd_coroutine_end(NbdClientSession *s,
-    struct nbd_request *request)
+static void nbd_coroutine_end(NBDClientSession *s,
+                              NBDRequest *request)
 {
    int i = HANDLE_TO_INDEX(s, request->handle);
    s->recv_coroutine[i] = NULL;
    if (s->in_flight-- == MAX_NBD_REQUESTS) {
-        qemu_co_mutex_unlock(&s->free_sema);
+        qemu_co_queue_next(&s->free_sema);
    }
 }

 int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
                         uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
-    NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = {
+    NBDClientSession *client = nbd_get_client_session(bs);
+    NBDRequest request = {
        .type = NBD_CMD_READ,
        .from = offset,
        .len = bytes,
    };
-    struct nbd_reply reply;
+    NBDReply reply;
    ssize_t ret;

    assert(bytes <= NBD_MAX_BUFFER_SIZE);
@@ -247,18 +248,18 @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
 int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
                          uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
-    NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = {
+    NBDClientSession *client = nbd_get_client_session(bs);
+    NBDRequest request = {
        .type = NBD_CMD_WRITE,
        .from = offset,
        .len = bytes,
    };
-    struct nbd_reply reply;
+    NBDReply reply;
    ssize_t ret;

    if (flags & BDRV_REQ_FUA) {
        assert(client->nbdflags & NBD_FLAG_SEND_FUA);
-        request.type |= NBD_CMD_FLAG_FUA;
+        request.flags |= NBD_CMD_FLAG_FUA;
    }

    assert(bytes <= NBD_MAX_BUFFER_SIZE);
@@ -274,11 +275,46 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
    return -reply.error;
 }

+/* Issue NBD_CMD_WRITE_ZEROES for @count bytes at @offset; -ENOTSUP if the
+ * session lacks NBD_FLAG_SEND_WRITE_ZEROES, else the negated reply error. */
+int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                                int count, BdrvRequestFlags flags)
+{
+    NBDClientSession *session = nbd_get_client_session(bs);
+    NBDRequest req = {
+        .type = NBD_CMD_WRITE_ZEROES,
+        .from = offset,
+        .len = count,
+    };
+    NBDReply reply;
+    ssize_t rc;
+
+    if (!(session->nbdflags & NBD_FLAG_SEND_WRITE_ZEROES)) {
+        return -ENOTSUP;
+    }
+    if (flags & BDRV_REQ_FUA) {
+        assert(session->nbdflags & NBD_FLAG_SEND_FUA);
+        req.flags |= NBD_CMD_FLAG_FUA;
+    }
+    if ((flags & BDRV_REQ_MAY_UNMAP) == 0) {
+        req.flags |= NBD_CMD_FLAG_NO_HOLE;
+    }
+    nbd_coroutine_start(session, &req);
+    rc = nbd_co_send_request(bs, &req, NULL);
+    if (rc >= 0) {
+        nbd_co_receive_reply(session, &req, &reply, NULL);
+    } else {
+        reply.error = -rc;      /* send failed: use its error as the result */
+    }
+    nbd_coroutine_end(session, &req);
+    return -reply.error;
+}
+
 int nbd_client_co_flush(BlockDriverState *bs)
 {
-    NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = { .type = NBD_CMD_FLUSH };
-    struct nbd_reply reply;
+    NBDClientSession *client = nbd_get_client_session(bs);
+    NBDRequest request = { .type = NBD_CMD_FLUSH };
+    NBDReply reply;
    ssize_t ret;

    if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) {
@@ -301,13 +337,13 @@ int nbd_client_co_flush(BlockDriverState *bs)

 int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 {
-    NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = {
+    NBDClientSession *client = nbd_get_client_session(bs);
+    NBDRequest request = {
        .type = NBD_CMD_TRIM,
        .from = offset,
        .len = count,
    };
-    struct nbd_reply reply;
+    NBDReply reply;
    ssize_t ret;

    if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) {
@@ -342,12 +378,8 @@ void nbd_client_attach_aio_context(BlockDriverState *bs,

 void nbd_client_close(BlockDriverState *bs)
 {
-    NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = {
-        .type = NBD_CMD_DISC,
-        .from = 0,
-        .len = 0
-    };
+    NBDClientSession *client = nbd_get_client_session(bs);
+    NBDRequest request = { .type = NBD_CMD_DISC };

    if (client->ioc == NULL) {
        return;
@@ -365,7 +397,7 @@ int nbd_client_init(BlockDriverState *bs,
                    const char *hostname,
                    Error **errp)
 {
-    NbdClientSession *client = nbd_get_client_session(bs);
+    NBDClientSession *client = nbd_get_client_session(bs);
    int ret;

    /* NBD handshake */
@@ -386,7 +418,7 @@ int nbd_client_init(BlockDriverState *bs,
    }

    qemu_co_mutex_init(&client->send_mutex);
-    qemu_co_mutex_init(&client->free_sema);
+    qemu_co_queue_init(&client->free_sema);
    client->sioc = sioc;
    object_ref(OBJECT(client->sioc));

--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -17,24 +17,24 @@

 #define MAX_NBD_REQUESTS    16

-typedef struct NbdClientSession {
+typedef struct NBDClientSession {
    QIOChannelSocket *sioc; /* The master data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
    uint16_t nbdflags;
    off_t size;

    CoMutex send_mutex;
-    CoMutex free_sema;
+    CoQueue free_sema;
    Coroutine *send_coroutine;
    int in_flight;

    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
-    struct nbd_reply reply;
+    NBDReply reply;

    bool is_unix;
-} NbdClientSession;
+} NBDClientSession;

-NbdClientSession *nbd_get_client_session(BlockDriverState *bs);
+NBDClientSession *nbd_get_client_session(BlockDriverState *bs);

 int nbd_client_init(BlockDriverState *bs,
                    QIOChannelSocket *sock,
@@ -48,6 +48,8 @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int nbd_client_co_flush(BlockDriverState *bs);
 int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
                          uint64_t bytes, QEMUIOVector *qiov, int flags);
+int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                                int count, BdrvRequestFlags flags);
 int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
                         uint64_t bytes, QEMUIOVector *qiov, int flags);

--- a/block/nbd.c
+++ b/block/nbd.c
@@ -32,6 +32,9 @@
 #include "qemu/uri.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qapi-visit.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qapi/qobject-output-visitor.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qjson.h"
 #include "qapi/qmp/qint.h"
@@ -41,10 +44,11 @@
 #define EN_OPTSTR ":exportname="

 typedef struct BDRVNBDState {
-    NbdClientSession client;
+    NBDClientSession client;

    /* For nbd_refresh_filename() */
-    char *path, *host, *port, *export, *tlscredsid;
+    SocketAddress *saddr;
+    char *export, *tlscredsid;
 } BDRVNBDState;

 static int nbd_parse_uri(const char *filename, QDict *options)
@@ -90,9 +94,13 @@ static int nbd_parse_uri(const char *filename, QDict *options)
            ret = -EINVAL;
            goto out;
        }
-        qdict_put(options, "path", qstring_from_str(qp->p[0].value));
+        qdict_put(options, "server.type", qstring_from_str("unix"));
+        qdict_put(options, "server.data.path",
+                  qstring_from_str(qp->p[0].value));
    } else {
        QString *host;
+        char *port_str;
+
        /* nbd[+tcp]://host[:port]/export */
        if (!uri->server) {
            ret = -EINVAL;
@@ -107,12 +115,12 @@ static int nbd_parse_uri(const char *filename, QDict *options)
            host = qstring_from_str(uri->server);
        }

-        qdict_put(options, "host", host);
-        if (uri->port) {
-            char* port_str = g_strdup_printf("%d", uri->port);
-            qdict_put(options, "port", qstring_from_str(port_str));
-            g_free(port_str);
-        }
+        qdict_put(options, "server.type", qstring_from_str("inet"));
+        qdict_put(options, "server.data.host", host);
+
+        port_str = g_strdup_printf("%d", uri->port ?: NBD_DEFAULT_PORT);
+        qdict_put(options, "server.data.port", qstring_from_str(port_str));
+        g_free(port_str);
    }

 out:
@@ -123,6 +131,26 @@ out:
    return ret;
 }

+/* Set @errp and return true on the first key in @options that may not be
+ * combined with a file name: host, port, path, export or any "server.*". */
+static bool nbd_has_filename_options_conflict(QDict *options, Error **errp)
+{
+    const QDictEntry *entry = qdict_first(options);
+
+    for (; entry; entry = qdict_next(options, entry)) {
+        const char *key = entry->key;
+
+        if (strcmp(key, "host") == 0 || strcmp(key, "port") == 0 ||
+            strcmp(key, "path") == 0 || strcmp(key, "export") == 0 ||
+            strstart(key, "server.", NULL)) {
+            error_setg(errp,
+                       "Option '%s' cannot be used with a file name", key);
+            return true;
+        }
+    }
+    return false;
+}
+
 static void nbd_parse_filename(const char *filename, QDict *options,
                               Error **errp)
 {
@@ -131,12 +159,7 @@ static void nbd_parse_filename(const char *filename, QDict *options,
    const char *host_spec;
    const char *unixpath;

-    if (qdict_haskey(options, "host")
-        || qdict_haskey(options, "port")
-        || qdict_haskey(options, "path"))
-    {
-        error_setg(errp, "host/port/path and a file name may not be specified "
-                         "at the same time");
+    if (nbd_has_filename_options_conflict(options, errp)) {
        return;
    }

@@ -173,7 +196,8 @@ static void nbd_parse_filename(const char *filename, QDict *options,

    /* are we a UNIX or TCP socket? */
    if (strstart(host_spec, "unix:", &unixpath)) {
-        qdict_put(options, "path", qstring_from_str(unixpath));
+        qdict_put(options, "server.type", qstring_from_str("unix"));
+        qdict_put(options, "server.data.path", qstring_from_str(unixpath));
    } else {
        InetSocketAddress *addr = NULL;

@@ -182,8 +206,9 @@ static void nbd_parse_filename(const char *filename, QDict *options,
            goto out;
        }

-        qdict_put(options, "host", qstring_from_str(addr->host));
-        qdict_put(options, "port", qstring_from_str(addr->port));
+        qdict_put(options, "server.type", qstring_from_str("inet"));
+        qdict_put(options, "server.data.host", qstring_from_str(addr->host));
+        qdict_put(options, "server.data.port", qstring_from_str(addr->port));
        qapi_free_InetSocketAddress(addr);
    }

@@ -191,51 +216,85 @@ out:
    g_free(file);
 }

-static SocketAddress *nbd_config(BDRVNBDState *s, QemuOpts *opts, Error **errp)
+static bool nbd_process_legacy_socket_options(QDict *output_options,
+                                              QemuOpts *legacy_opts,
+                                              Error **errp)
 {
-    SocketAddress *saddr;
+    const char *path = qemu_opt_get(legacy_opts, "path");
+    const char *host = qemu_opt_get(legacy_opts, "host");
+    const char *port = qemu_opt_get(legacy_opts, "port");
+    const QDictEntry *e;

-    s->path = g_strdup(qemu_opt_get(opts, "path"));
-    s->host = g_strdup(qemu_opt_get(opts, "host"));
-
-    if (!s->path == !s->host) {
-        if (s->path) {
-            error_setg(errp, "path and host may not be used at the same time.");
-        } else {
-            error_setg(errp, "one of path and host must be specified.");
-        }
-        return NULL;
+    if (!path && !host && !port) {
+        return true;
    }

-    saddr = g_new0(SocketAddress, 1);
-
-    if (s->path) {
-        UnixSocketAddress *q_unix;
-        saddr->type = SOCKET_ADDRESS_KIND_UNIX;
-        q_unix = saddr->u.q_unix.data = g_new0(UnixSocketAddress, 1);
-        q_unix->path = g_strdup(s->path);
-    } else {
-        InetSocketAddress *inet;
-
-        s->port = g_strdup(qemu_opt_get(opts, "port"));
-
-        saddr->type = SOCKET_ADDRESS_KIND_INET;
-        inet = saddr->u.inet.data = g_new0(InetSocketAddress, 1);
-        inet->host = g_strdup(s->host);
-        inet->port = g_strdup(s->port);
-        if (!inet->port) {
-            inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT);
+    for (e = qdict_first(output_options); e; e = qdict_next(output_options, e))
+    {
+        if (strstart(e->key, "server.", NULL)) {
+            error_setg(errp, "Cannot use 'server' and path/host/port at the "
+                       "same time");
+            return false;
        }
    }

+    if (path && host) {
+        error_setg(errp, "path and host may not be used at the same time");
+        return false;
+    } else if (path) {
+        if (port) {
+            error_setg(errp, "port may not be used without host");
+            return false;
+        }
+
+        qdict_put(output_options, "server.type", qstring_from_str("unix"));
+        qdict_put(output_options, "server.data.path", qstring_from_str(path));
+    } else if (host) {
+        qdict_put(output_options, "server.type", qstring_from_str("inet"));
+        qdict_put(output_options, "server.data.host", qstring_from_str(host));
+        qdict_put(output_options, "server.data.port",
+                  qstring_from_str(port ?: stringify(NBD_DEFAULT_PORT)));
+    }
+
+    return true;
+}
+
+static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, Error **errp)
+{
+    SocketAddress *saddr = NULL;
+    QDict *addr = NULL;
+    QObject *crumpled_addr = NULL;
+    Visitor *iv = NULL;
+    Error *local_err = NULL;
+
+    qdict_extract_subqdict(options, &addr, "server.");
+    if (!qdict_size(addr)) {
+        error_setg(errp, "NBD server address missing");
+        goto done;
+    }
+
+    crumpled_addr = qdict_crumple(addr, errp);
+    if (!crumpled_addr) {
+        goto done;
+    }
+
+    iv = qobject_input_visitor_new(crumpled_addr, true);
+    visit_type_SocketAddress(iv, NULL, &saddr, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        goto done;
+    }
+
    s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX;

-    s->export = g_strdup(qemu_opt_get(opts, "export"));
-
+done:
+    QDECREF(addr);
+    qobject_decref(crumpled_addr);
+    visit_free(iv);
    return saddr;
 }

-NbdClientSession *nbd_get_client_session(BlockDriverState *bs)
+NBDClientSession *nbd_get_client_session(BlockDriverState *bs)
 {
    BDRVNBDState *s = bs->opaque;
    return &s->client;
@@ -248,6 +307,7 @@ static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr,
    Error *local_err = NULL;

    sioc = qio_channel_socket_new();
+    qio_channel_set_name(QIO_CHANNEL(sioc), "nbd-client");

    qio_channel_socket_connect_sync(sioc,
                                    saddr,
@@ -332,7 +392,6 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
    QemuOpts *opts = NULL;
    Error *local_err = NULL;
    QIOChannelSocket *sioc = NULL;
-    SocketAddress *saddr = NULL;
    QCryptoTLSCreds *tlscreds = NULL;
    const char *hostname = NULL;
    int ret = -EINVAL;
@@ -344,12 +403,19 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
        goto error;
    }

-    /* Pop the config into our state object. Exit if invalid. */
-    saddr = nbd_config(s, opts, errp);
-    if (!saddr) {
+    /* Translate @host, @port, and @path to a SocketAddress */
+    if (!nbd_process_legacy_socket_options(options, opts, errp)) {
        goto error;
    }

+    /* Pop the config into our state object. Exit if invalid. */
+    s->saddr = nbd_config(s, options, errp);
+    if (!s->saddr) {
+        goto error;
+    }
+
+    s->export = g_strdup(qemu_opt_get(opts, "export"));
+
    s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds"));
    if (s->tlscredsid) {
        tlscreds = nbd_get_tls_creds(s->tlscredsid, errp);
@@ -357,17 +423,17 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
            goto error;
        }

-        if (saddr->type != SOCKET_ADDRESS_KIND_INET) {
+        if (s->saddr->type != SOCKET_ADDRESS_KIND_INET) {
            error_setg(errp, "TLS only supported over IP sockets");
            goto error;
        }
-        hostname = saddr->u.inet.data->host;
+        hostname = s->saddr->u.inet.data->host;
    }

    /* establish TCP connection, return error if it fails
     * TODO: Configurable retry-until-timeout behaviour.
     */
-    sioc = nbd_establish_connection(saddr, errp);
+    sioc = nbd_establish_connection(s->saddr, errp);
    if (!sioc) {
        ret = -ECONNREFUSED;
        goto error;
@@ -384,13 +450,10 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
        object_unref(OBJECT(tlscreds));
    }
    if (ret < 0) {
-        g_free(s->path);
-        g_free(s->host);
-        g_free(s->port);
+        qapi_free_SocketAddress(s->saddr);
        g_free(s->export);
        g_free(s->tlscredsid);
    }
-    qapi_free_SocketAddress(saddr);
    qemu_opts_del(opts);
    return ret;
 }
@@ -403,6 +466,7 @@ static int nbd_co_flush(BlockDriverState *bs)
 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    bs->bl.max_pdiscard = NBD_MAX_BUFFER_SIZE;
+    bs->bl.max_pwrite_zeroes = NBD_MAX_BUFFER_SIZE;
    bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE;
 }

@@ -412,9 +476,7 @@ static void nbd_close(BlockDriverState *bs)

    nbd_client_close(bs);

-    g_free(s->path);
-    g_free(s->host);
-    g_free(s->port);
+    qapi_free_SocketAddress(s->saddr);
    g_free(s->export);
    g_free(s->tlscredsid);
 }
@@ -441,45 +503,52 @@ static void nbd_refresh_filename(BlockDriverState *bs, QDict *options)
 {
    BDRVNBDState *s = bs->opaque;
    QDict *opts = qdict_new();
+    QObject *saddr_qdict;
+    Visitor *ov;
+    const char *host = NULL, *port = NULL, *path = NULL;

-    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd")));
-
-    if (s->path && s->export) {
-        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd+unix:///%s?socket=%s", s->export, s->path);
-    } else if (s->path && !s->export) {
-        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd+unix://?socket=%s", s->path);
-    } else if (!s->path && s->export && s->port) {
-        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd://%s:%s/%s", s->host, s->port, s->export);
-    } else if (!s->path && s->export && !s->port) {
-        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd://%s/%s", s->host, s->export);
-    } else if (!s->path && !s->export && s->port) {
-        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd://%s:%s", s->host, s->port);
-    } else if (!s->path && !s->export && !s->port) {
-        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "nbd://%s", s->host);
+    if (s->saddr->type == SOCKET_ADDRESS_KIND_INET) {
+        const InetSocketAddress *inet = s->saddr->u.inet.data;
+        if (!inet->has_ipv4 && !inet->has_ipv6 && !inet->has_to) {
+            host = inet->host;
+            port = inet->port;
+        }
+    } else if (s->saddr->type == SOCKET_ADDRESS_KIND_UNIX) {
+        path = s->saddr->u.q_unix.data->path;
    }

-    if (s->path) {
-        qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(s->path)));
-    } else if (s->port) {
-        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(s->host)));
-        qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(s->port)));
-    } else {
-        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(s->host)));
+    qdict_put(opts, "driver", qstring_from_str("nbd"));
+
+    if (path && s->export) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd+unix:///%s?socket=%s", s->export, path);
+    } else if (path && !s->export) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd+unix://?socket=%s", path);
+    } else if (host && s->export) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s:%s/%s", host, port, s->export);
+    } else if (host && !s->export) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nbd://%s:%s", host, port);
    }
+
+    ov = qobject_output_visitor_new(&saddr_qdict);
+    visit_type_SocketAddress(ov, NULL, &s->saddr, &error_abort);
+    visit_complete(ov, &saddr_qdict);
+    visit_free(ov);
+    assert(qobject_type(saddr_qdict) == QTYPE_QDICT);
+
+    qdict_put_obj(opts, "server", saddr_qdict);
+
    if (s->export) {
-        qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(s->export)));
+        qdict_put(opts, "export", qstring_from_str(s->export));
    }
    if (s->tlscredsid) {
-        qdict_put_obj(opts, "tls-creds",
-                      QOBJECT(qstring_from_str(s->tlscredsid)));
+        qdict_put(opts, "tls-creds", qstring_from_str(s->tlscredsid));
    }

+    qdict_flatten(opts);
    bs->full_open_options = opts;
 }

@@ -491,6 +560,7 @@ static BlockDriver bdrv_nbd = {
    .bdrv_file_open             = nbd_open,
    .bdrv_co_preadv             = nbd_client_co_preadv,
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
+    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
@@ -509,6 +579,7 @@ static BlockDriver bdrv_nbd_tcp = {
    .bdrv_file_open             = nbd_open,
    .bdrv_co_preadv             = nbd_client_co_preadv,
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
+    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
@@ -527,6 +598,7 @@ static BlockDriver bdrv_nbd_unix = {
    .bdrv_file_open             = nbd_open,
    .bdrv_co_preadv             = nbd_client_co_preadv,
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
+    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -35,8 +35,15 @@
 #include "qemu/uri.h"
 #include "qemu/cutils.h"
 #include "sysemu/sysemu.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qint.h"
+#include "qapi/qmp/qstring.h"
+#include "qapi-visit.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qapi/qobject-output-visitor.h"
 #include <nfsc/libnfs.h>

+
 #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576
 #define QEMU_NFS_MAX_PAGECACHE_SIZE (8388608 / NFS_BLKSIZE)
 #define QEMU_NFS_MAX_DEBUG_LEVEL 2
@@ -49,18 +56,137 @@ typedef struct NFSClient {
    AioContext *aio_context;
    blkcnt_t st_blocks;
    bool cache_used;
+    NFSServer *server;
+    char *path;
+    int64_t uid, gid, tcp_syncnt, readahead, pagecache, debug;
 } NFSClient;

 typedef struct NFSRPC {
+    BlockDriverState *bs;
    int ret;
    int complete;
    QEMUIOVector *iov;
    struct stat *st;
    Coroutine *co;
-    QEMUBH *bh;
    NFSClient *client;
 } NFSRPC;

+/*
+ * Translate an nfs:// URI into flat entries of @options: "server.host",
+ * "server.type", "path" and one entry per query parameter, keyed by the
+ * parameter name itself because that is the name runtime_opts declares.
+ * Returns 0 on success, -EINVAL with @errp set otherwise.
+ */
+static int nfs_parse_uri(const char *filename, QDict *options, Error **errp)
+{
+    URI *uri = NULL;
+    QueryParams *qp = NULL;
+    int ret = -EINVAL, i;
+
+    uri = uri_parse(filename);
+    if (!uri) {
+        error_setg(errp, "Invalid URI specified");
+        goto out;
+    }
+    /* g_strcmp0() copes with a URI that has no scheme (NULL) */
+    if (g_strcmp0(uri->scheme, "nfs") != 0) {
+        error_setg(errp, "URI scheme must be 'nfs'");
+        goto out;
+    }
+
+    if (!uri->server) {
+        error_setg(errp, "missing hostname in URI");
+        goto out;
+    }
+
+    if (!uri->path) {
+        error_setg(errp, "missing file path in URI");
+        goto out;
+    }
+
+    qp = query_params_parse(uri->query);
+    if (!qp) {
+        error_setg(errp, "could not parse query parameters");
+        goto out;
+    }
+
+    qdict_put(options, "server.host", qstring_from_str(uri->server));
+    qdict_put(options, "server.type", qstring_from_str("inet"));
+    qdict_put(options, "path", qstring_from_str(uri->path));
+
+    for (i = 0; i < qp->n; i++) {
+        const char *name = qp->p[i].name;
+        const char *value = qp->p[i].value;
+        /* parse_uint_full() writes its result: give it a real target */
+        unsigned long long val;
+
+        if (!value) {
+            error_setg(errp, "Value for NFS parameter expected: %s", name);
+            goto out;
+        }
+        if (parse_uint_full(value, &val, 0)) {
+            error_setg(errp, "Illegal value for NFS parameter: %s", name);
+            goto out;
+        }
+        if (strcmp(name, "uid") && strcmp(name, "gid") &&
+            strcmp(name, "tcp-syncnt") && strcmp(name, "readahead") &&
+            strcmp(name, "pagecache") && strcmp(name, "debug")) {
+            error_setg(errp, "Unknown NFS parameter name: %s", name);
+            goto out;
+        }
+        /*
+         * Store the value under the URI parameter name: it is identical
+         * to the option name nfs_client_open() looks up in runtime_opts,
+         * so the entry is consumed when the options are absorbed.
+         */
+        qdict_put(options, name, qstring_from_str(value));
+    }
+
+    ret = 0;
+out:
+    if (qp) {
+        query_params_free(qp);
+    }
+    if (uri) {
+        uri_free(uri);
+    }
+    return ret;
+}
+
+/* True, with @errp set, if @options has a key the file name also defines. */
+static bool nfs_has_filename_options_conflict(QDict *options, Error **errp)
+{
+    const QDictEntry *qe;
+
+    for (qe = qdict_first(options); qe; qe = qdict_next(options, qe)) {
+        const char *key = qe->key;
+
+        /* long spellings first, then the names runtime_opts declares */
+        if (!strcmp(key, "host") || !strcmp(key, "path") ||
+            !strcmp(key, "user") || !strcmp(key, "group") ||
+            !strcmp(key, "tcp-syn-count") || !strcmp(key, "readahead-size") ||
+            !strcmp(key, "page-cache-size") || !strcmp(key, "debug-level") ||
+            !strcmp(key, "uid") || !strcmp(key, "gid") ||
+            !strcmp(key, "tcp-syncnt") || !strcmp(key, "readahead") ||
+            !strcmp(key, "pagecache") || !strcmp(key, "debug") ||
+            strstart(key, "server.", NULL)) {
+            error_setg(errp, "Option %s cannot be used with a filename", key);
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Fill @options from @filename unless a clashing option is already set. */
+static void nfs_parse_filename(const char *filename, QDict *options,
+                               Error **errp)
+{
+    if (!nfs_has_filename_options_conflict(options, errp)) {
+        /* the int result is dropped here; failures travel through @errp */
+        nfs_parse_uri(filename, options, errp);
+    }
+}
+
 static void nfs_process_read(void *arg);
 static void nfs_process_write(void *arg);

@@ -91,11 +217,12 @@ static void nfs_process_write(void *arg)
    nfs_set_events(client);
 }

-static void nfs_co_init_task(NFSClient *client, NFSRPC *task)
+static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
 {
    *task = (NFSRPC) {
        .co             = qemu_coroutine_self(),
-        .client         = client,
+        .bs             = bs,
+        .client         = bs->opaque,
    };
 }

@@ -103,7 +230,6 @@ static void nfs_co_generic_bh_cb(void *opaque)
 {
    NFSRPC *task = opaque;
    task->complete = 1;
-    qemu_bh_delete(task->bh);
    qemu_coroutine_enter(task->co);
 }

@@ -113,6 +239,7 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data,
 {
    NFSRPC *task = private_data;
    task->ret = ret;
+    assert(!task->st);
    if (task->ret > 0 && task->iov) {
        if (task->ret <= task->iov->size) {
            qemu_iovec_from_buf(task->iov, 0, data, task->ret);
@@ -120,19 +247,11 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data,
            task->ret = -EIO;
        }
    }
-    if (task->ret == 0 && task->st) {
-        memcpy(task->st, data, sizeof(struct stat));
-    }
    if (task->ret < 0) {
        error_report("NFS Error: %s", nfs_get_error(nfs));
    }
-    if (task->co) {
-        task->bh = aio_bh_new(task->client->aio_context,
-                              nfs_co_generic_bh_cb, task);
-        qemu_bh_schedule(task->bh);
-    } else {
-        task->complete = 1;
-    }
+    aio_bh_schedule_oneshot(task->client->aio_context,
+                            nfs_co_generic_bh_cb, task);
 }

 static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
@@ -142,7 +261,7 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
    NFSClient *client = bs->opaque;
    NFSRPC task;

-    nfs_co_init_task(client, &task);
+    nfs_co_init_task(bs, &task);
    task.iov = iov;

    if (nfs_pread_async(client->context, client->fh,
@@ -152,8 +271,8 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
        return -ENOMEM;
    }

+    nfs_set_events(client);
    while (!task.complete) {
-        nfs_set_events(client);
        qemu_coroutine_yield();
    }

@@ -177,7 +296,7 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
    NFSRPC task;
    char *buf = NULL;

-    nfs_co_init_task(client, &task);
+    nfs_co_init_task(bs, &task);

    buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE);
    if (nb_sectors && buf == NULL) {
@@ -194,8 +313,8 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
        return -ENOMEM;
    }

+    nfs_set_events(client);
    while (!task.complete) {
-        nfs_set_events(client);
        qemu_coroutine_yield();
    }

@@ -213,30 +332,59 @@ static int coroutine_fn nfs_co_flush(BlockDriverState *bs)
    NFSClient *client = bs->opaque;
    NFSRPC task;

-    nfs_co_init_task(client, &task);
+    nfs_co_init_task(bs, &task);

    if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb,
                        &task) != 0) {
        return -ENOMEM;
    }

+    nfs_set_events(client);
    while (!task.complete) {
-        nfs_set_events(client);
        qemu_coroutine_yield();
    }

    return task.ret;
 }

-/* TODO Convert to fine grained options */
 static QemuOptsList runtime_opts = {
    .name = "nfs",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
-            .name = "filename",
+            .name = "path",
            .type = QEMU_OPT_STRING,
-            .help = "URL to the NFS file",
+            .help = "Path of the image on the host",
+        },
+        {
+            .name = "uid",
+            .type = QEMU_OPT_NUMBER,
+            .help = "UID value to use when talking to the server",
+        },
+        {
+            .name = "gid",
+            .type = QEMU_OPT_NUMBER,
+            .help = "GID value to use when talking to the server",
+        },
+        {
+            .name = "tcp-syncnt",
+            .type = QEMU_OPT_NUMBER,
+            .help = "Number of SYNs to send during the session establish",
+        },
+        {
+            .name = "readahead",
+            .type = QEMU_OPT_NUMBER,
+            .help = "Set the readahead size in bytes",
+        },
+        {
+            .name = "pagecache",
+            .type = QEMU_OPT_NUMBER,
+            .help = "Set the pagecache size in bytes",
+        },
+        {
+            .name = "debug",
+            .type = QEMU_OPT_NUMBER,
+            .help = "Set the NFS debug level (max 2)",
        },
        { /* end of list */ }
    },
@@ -279,25 +427,65 @@ static void nfs_file_close(BlockDriverState *bs)
    nfs_client_close(client);
 }

-static int64_t nfs_client_open(NFSClient *client, const char *filename,
+/*
+ * Turn the "server.*" entries extracted from @options into an NFSServer.
+ * Yields NULL, with @errp set, if the address is missing or won't crumple.
+ */
+static NFSServer *nfs_config(QDict *options, Error **errp)
+{
+    NFSServer *result = NULL;
+    QDict *server_opts = NULL;
+    QObject *nested = NULL;
+    Visitor *v = NULL;
+    Error *err = NULL;
+
+    qdict_extract_subqdict(options, &server_opts, "server.");
+    if (qdict_size(server_opts) == 0) {
+        error_setg(errp, "NFS server address missing");
+    } else {
+        nested = qdict_crumple(server_opts, errp);
+    }
+
+    if (nested) {
+        v = qobject_input_visitor_new(nested, true);
+        visit_type_NFSServer(v, NULL, &result, &err);
+        if (err) {
+            error_propagate(errp, err);
+        }
+    }
+
+    QDECREF(server_opts);
+    qobject_decref(nested);
+    visit_free(v);
+    return result;
+}
+
+
+static int64_t nfs_client_open(NFSClient *client, QDict *options,
                               int flags, Error **errp, int open_flags)
 {
-    int ret = -EINVAL, i;
+    int ret = -EINVAL;
+    QemuOpts *opts = NULL;
+    Error *local_err = NULL;
    struct stat st;
-    URI *uri;
-    QueryParams *qp = NULL;
    char *file = NULL, *strp = NULL;

-    uri = uri_parse(filename);
-    if (!uri) {
-        error_setg(errp, "Invalid URL specified");
+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
        goto fail;
    }
-    if (!uri->server) {
-        error_setg(errp, "Invalid URL specified");
+
+    client->path = g_strdup(qemu_opt_get(opts, "path"));
+    if (!client->path) {
+        ret = -EINVAL;
+        error_setg(errp, "No path was specified");
        goto fail;
    }
-    strp = strrchr(uri->path, '/');
+
+    strp = strrchr(client->path, '/');
    if (strp == NULL) {
        error_setg(errp, "Invalid URL specified");
        goto fail;
@@ -305,85 +493,89 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
    file = g_strdup(strp);
    *strp = 0;

+    /* Pop the config into our state object, Exit if invalid */
+    client->server = nfs_config(options, errp);
+    if (!client->server) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
    client->context = nfs_init_context();
    if (client->context == NULL) {
        error_setg(errp, "Failed to init NFS context");
        goto fail;
    }

-    qp = query_params_parse(uri->query);
-    for (i = 0; i < qp->n; i++) {
-        unsigned long long val;
-        if (!qp->p[i].value) {
-            error_setg(errp, "Value for NFS parameter expected: %s",
-                       qp->p[i].name);
-            goto fail;
-        }
-        if (parse_uint_full(qp->p[i].value, &val, 0)) {
-            error_setg(errp, "Illegal value for NFS parameter: %s",
-                       qp->p[i].name);
-            goto fail;
-        }
-        if (!strcmp(qp->p[i].name, "uid")) {
-            nfs_set_uid(client->context, val);
-        } else if (!strcmp(qp->p[i].name, "gid")) {
-            nfs_set_gid(client->context, val);
-        } else if (!strcmp(qp->p[i].name, "tcp-syncnt")) {
-            nfs_set_tcp_syncnt(client->context, val);
-#ifdef LIBNFS_FEATURE_READAHEAD
-        } else if (!strcmp(qp->p[i].name, "readahead")) {
-            if (open_flags & BDRV_O_NOCACHE) {
-                error_setg(errp, "Cannot enable NFS readahead "
-                                 "if cache.direct = on");
-                goto fail;
-            }
-            if (val > QEMU_NFS_MAX_READAHEAD_SIZE) {
-                error_report("NFS Warning: Truncating NFS readahead"
-                             " size to %d", QEMU_NFS_MAX_READAHEAD_SIZE);
-                val = QEMU_NFS_MAX_READAHEAD_SIZE;
-            }
-            nfs_set_readahead(client->context, val);
-#ifdef LIBNFS_FEATURE_PAGECACHE
-            nfs_set_pagecache_ttl(client->context, 0);
-#endif
-            client->cache_used = true;
-#endif
-#ifdef LIBNFS_FEATURE_PAGECACHE
-            nfs_set_pagecache_ttl(client->context, 0);
-        } else if (!strcmp(qp->p[i].name, "pagecache")) {
-            if (open_flags & BDRV_O_NOCACHE) {
-                error_setg(errp, "Cannot enable NFS pagecache "
-                                 "if cache.direct = on");
-                goto fail;
-            }
-            if (val > QEMU_NFS_MAX_PAGECACHE_SIZE) {
-                error_report("NFS Warning: Truncating NFS pagecache"
-                             " size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE);
-                val = QEMU_NFS_MAX_PAGECACHE_SIZE;
-            }
-            nfs_set_pagecache(client->context, val);
-            nfs_set_pagecache_ttl(client->context, 0);
-            client->cache_used = true;
-#endif
-#ifdef LIBNFS_FEATURE_DEBUG
-        } else if (!strcmp(qp->p[i].name, "debug")) {
-            /* limit the maximum debug level to avoid potential flooding
-             * of our log files. */
-            if (val > QEMU_NFS_MAX_DEBUG_LEVEL) {
-                error_report("NFS Warning: Limiting NFS debug level"
-                             " to %d", QEMU_NFS_MAX_DEBUG_LEVEL);
-                val = QEMU_NFS_MAX_DEBUG_LEVEL;
-            }
-            nfs_set_debug(client->context, val);
-#endif
-        } else {
-            error_setg(errp, "Unknown NFS parameter name: %s",
-                       qp->p[i].name);
-            goto fail;
-        }
+    if (qemu_opt_get(opts, "uid")) {
+        client->uid = qemu_opt_get_number(opts, "uid", 0);
+        nfs_set_uid(client->context, client->uid);
    }

-    ret = nfs_mount(client->context, uri->server, uri->path);
+    if (qemu_opt_get(opts, "gid")) {
+        client->gid = qemu_opt_get_number(opts, "gid", 0);
+        nfs_set_gid(client->context, client->gid);
+    }
+
+    if (qemu_opt_get(opts, "tcp-syncnt")) {
+        client->tcp_syncnt = qemu_opt_get_number(opts, "tcp-syncnt", 0);
+        nfs_set_tcp_syncnt(client->context, client->tcp_syncnt);
+    }
+
+#ifdef LIBNFS_FEATURE_READAHEAD
+    if (qemu_opt_get(opts, "readahead")) {
+        if (open_flags & BDRV_O_NOCACHE) {
+            error_setg(errp, "Cannot enable NFS readahead "
+                             "if cache.direct = on");
+            goto fail;
+        }
+        client->readahead = qemu_opt_get_number(opts, "readahead", 0);
+        if (client->readahead > QEMU_NFS_MAX_READAHEAD_SIZE) {
+            error_report("NFS Warning: Truncating NFS readahead "
+                         "size to %d", QEMU_NFS_MAX_READAHEAD_SIZE);
+            client->readahead = QEMU_NFS_MAX_READAHEAD_SIZE;
+        }
+        nfs_set_readahead(client->context, client->readahead);
+#ifdef LIBNFS_FEATURE_PAGECACHE
+        nfs_set_pagecache_ttl(client->context, 0);
+#endif
+        client->cache_used = true;
+    }
+#endif
+
+#ifdef LIBNFS_FEATURE_PAGECACHE
+    if (qemu_opt_get(opts, "pagecache")) {
+        if (open_flags & BDRV_O_NOCACHE) {
+            error_setg(errp, "Cannot enable NFS pagecache "
+                             "if cache.direct = on");
+            goto fail;
+        }
+        client->pagecache = qemu_opt_get_number(opts, "pagecache", 0);
+        if (client->pagecache > QEMU_NFS_MAX_PAGECACHE_SIZE) {
+            error_report("NFS Warning: Truncating NFS pagecache "
+                         "size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE);
+            client->pagecache = QEMU_NFS_MAX_PAGECACHE_SIZE;
+        }
+        nfs_set_pagecache(client->context, client->pagecache);
+        nfs_set_pagecache_ttl(client->context, 0);
+        client->cache_used = true;
+    }
+#endif
+
+#ifdef LIBNFS_FEATURE_DEBUG
+    if (qemu_opt_get(opts, "debug")) {
+        client->debug = qemu_opt_get_number(opts, "debug", 0);
+        /* limit the maximum debug level to avoid potential flooding
+         * of our log files. */
+        if (client->debug > QEMU_NFS_MAX_DEBUG_LEVEL) {
+            error_report("NFS Warning: Limiting NFS debug level "
+                         "to %d", QEMU_NFS_MAX_DEBUG_LEVEL);
+            client->debug = QEMU_NFS_MAX_DEBUG_LEVEL;
+        }
+        nfs_set_debug(client->context, client->debug);
+    }
+#endif
+
+    ret = nfs_mount(client->context, client->server->host, client->path);
    if (ret < 0) {
        error_setg(errp, "Failed to mount nfs share: %s",
                   nfs_get_error(client->context));
@@ -416,14 +608,13 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
    ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE);
    client->st_blocks = st.st_blocks;
    client->has_zero_init = S_ISREG(st.st_mode);
+    *strp = '/';
    goto out;
+
 fail:
    nfs_client_close(client);
 out:
-    if (qp) {
-        query_params_free(qp);
-    }
-    uri_free(uri);
+    qemu_opts_del(opts);
    g_free(file);
    return ret;
 }
@@ -432,28 +623,17 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp) {
    NFSClient *client = bs->opaque;
    int64_t ret;
-    QemuOpts *opts;
-    Error *local_err = NULL;

    client->aio_context = bdrv_get_aio_context(bs);

-    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
-    qemu_opts_absorb_qdict(opts, options, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        ret = -EINVAL;
-        goto out;
-    }
-    ret = nfs_client_open(client, qemu_opt_get(opts, "filename"),
+    ret = nfs_client_open(client, options,
                          (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY,
                          errp, bs->open_flags);
    if (ret < 0) {
-        goto out;
+        return ret;
    }
    bs->total_sectors = ret;
    ret = 0;
-out:
-    qemu_opts_del(opts);
    return ret;
 }

@@ -475,6 +655,7 @@ static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
    int ret = 0;
    int64_t total_size = 0;
    NFSClient *client = g_new0(NFSClient, 1);
+    QDict *options = NULL;

    client->aio_context = qemu_get_aio_context();

@@ -482,13 +663,20 @@ static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);

-    ret = nfs_client_open(client, url, O_CREAT, errp, 0);
+    options = qdict_new();
+    ret = nfs_parse_uri(url, options, errp);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = nfs_client_open(client, options, O_CREAT, errp, 0);
    if (ret < 0) {
        goto out;
    }
    ret = nfs_ftruncate(client->context, client->fh, total_size);
    nfs_client_close(client);
 out:
+    QDECREF(options);
    g_free(client);
    return ret;
 }
@@ -499,6 +687,22 @@ static int nfs_has_zero_init(BlockDriverState *bs)
    return client->has_zero_init;
 }

+static void /* callback handed to nfs_fstat_async() for the size query */
+nfs_get_allocated_file_size_cb(int ret, struct nfs_context *nfs, void *data,
+                               void *private_data)
+{
+    NFSRPC *task = private_data; /* request state supplied by the caller */
+    task->ret = ret; /* publish the status for the waiting caller */
+    if (task->ret == 0) {
+        memcpy(task->st, data, sizeof(struct stat)); /* keep the result */
+    }
+    if (task->ret < 0) {
+        error_report("NFS Error: %s", nfs_get_error(nfs));
+    }
+    task->complete = 1; /* the caller polls on !task.complete */
+    bdrv_wakeup(task->bs); /* presumably kicks the BDRV_POLL_WHILE() waiter */
+}
+
 static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
 {
    NFSClient *client = bs->opaque;
@@ -510,16 +714,15 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
        return client->st_blocks * 512;
    }

+    task.bs = bs;
    task.st = &st;
-    if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
+    if (nfs_fstat_async(client->context, client->fh, nfs_get_allocated_file_size_cb,
                        &task) != 0) {
        return -ENOMEM;
    }

-    while (!task.complete) {
-        nfs_set_events(client);
-        aio_poll(client->aio_context, true);
-    }
+    nfs_set_events(client);
+    BDRV_POLL_WHILE(bs, !task.complete);

    return (task.ret < 0 ? task.ret : st.st_blocks * 512);
 }
@@ -564,6 +767,67 @@ static int nfs_reopen_prepare(BDRVReopenState *state,
    return 0;
 }

+static void nfs_refresh_filename(BlockDriverState *bs, QDict *options)
+{ /* Rebuilds bs->exact_filename and bs->full_open_options from client */
+    NFSClient *client = bs->opaque; /* note: 'options' is never read */
+    QDict *opts = qdict_new(); /* ends up as bs->full_open_options */
+    QObject *server_qdict;
+    Visitor *ov;
+
+    qdict_put(opts, "driver", qstring_from_str("nfs"));
+
+    if (client->uid && !client->gid) { /* only nonzero ids reach the URL */
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nfs://%s%s?uid=%" PRId64, client->server->host, client->path,
+                 client->uid);
+    } else if (!client->uid && client->gid) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nfs://%s%s?gid=%" PRId64, client->server->host, client->path,
+                 client->gid);
+    } else if (client->uid && client->gid) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nfs://%s%s?uid=%" PRId64 "&gid=%" PRId64,
+                 client->server->host, client->path, client->uid, client->gid);
+    } else { /* both ids are zero: no query string */
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nfs://%s%s", client->server->host, client->path);
+    }
+
+    ov = qobject_output_visitor_new(&server_qdict); /* server -> QObject */
+    visit_type_NFSServer(ov, NULL, &client->server, &error_abort);
+    visit_complete(ov, &server_qdict);
+    assert(qobject_type(server_qdict) == QTYPE_QDICT);
+
+    qdict_put_obj(opts, "server", server_qdict);
+    qdict_put(opts, "path", qstring_from_str(client->path));
+
+    if (client->uid) { /* from here on, zero-valued settings are omitted */
+        qdict_put(opts, "uid", qint_from_int(client->uid));
+    }
+    if (client->gid) {
+        qdict_put(opts, "gid", qint_from_int(client->gid));
+    }
+    if (client->tcp_syncnt) {
+        qdict_put(opts, "tcp-syncnt",
+                      qint_from_int(client->tcp_syncnt));
+    }
+    if (client->readahead) {
+        qdict_put(opts, "readahead",
+                      qint_from_int(client->readahead));
+    }
+    if (client->pagecache) {
+        qdict_put(opts, "pagecache",
+                      qint_from_int(client->pagecache));
+    }
+    if (client->debug) {
+        qdict_put(opts, "debug", qint_from_int(client->debug));
+    }
+
+    visit_free(ov);
+    qdict_flatten(opts); /* presumably turns "server" into dotted keys */
+    bs->full_open_options = opts;
+}
+
 #ifdef LIBNFS_FEATURE_PAGECACHE
 static void nfs_invalidate_cache(BlockDriverState *bs,
                                 Error **errp)
@@ -578,7 +842,7 @@ static BlockDriver bdrv_nfs = {
    .protocol_name                  = "nfs",

    .instance_size                  = sizeof(NFSClient),
-    .bdrv_needs_filename            = true,
+    .bdrv_parse_filename            = nfs_parse_filename,
    .create_opts                    = &nfs_create_opts,

    .bdrv_has_zero_init             = nfs_has_zero_init,
@@ -596,6 +860,7 @@ static BlockDriver bdrv_nfs = {

    .bdrv_detach_aio_context        = nfs_detach_aio_context,
    .bdrv_attach_aio_context        = nfs_attach_aio_context,
+    .bdrv_refresh_filename          = nfs_refresh_filename,

 #ifdef LIBNFS_FEATURE_PAGECACHE
    .bdrv_invalidate_cache          = nfs_invalidate_cache,
--- a/block/null.c
+++ b/block/null.c
@@ -124,7 +124,6 @@ static coroutine_fn int null_co_flush(BlockDriverState *bs)

 typedef struct {
    BlockAIOCB common;
-    QEMUBH *bh;
    QEMUTimer timer;
 } NullAIOCB;

@@ -136,7 +135,6 @@ static void null_bh_cb(void *opaque)
 {
    NullAIOCB *acb = opaque;
    acb->common.cb(acb->common.opaque, 0);
-    qemu_bh_delete(acb->bh);
    qemu_aio_unref(acb);
 }

@@ -164,8 +162,7 @@ static inline BlockAIOCB *null_aio_common(BlockDriverState *bs,
        timer_mod_ns(&acb->timer,
                     qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns);
    } else {
-        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb);
-        qemu_bh_schedule(acb->bh);
+        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), null_bh_cb, acb);
    }
    return &acb->common;
 }
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -29,7 +29,7 @@
 #include "block/write-threshold.h"
 #include "qmp-commands.h"
 #include "qapi-visit.h"
-#include "qapi/qmp-output-visitor.h"
+#include "qapi/qobject-output-visitor.h"
 #include "qapi/qmp/types.h"
 #include "sysemu/block-backend.h"
 #include "qemu/cutils.h"
@@ -691,13 +691,14 @@ void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f,
                                   ImageInfoSpecific *info_spec)
 {
    QObject *obj, *data;
-    Visitor *v = qmp_output_visitor_new(&obj);
+    Visitor *v = qobject_output_visitor_new(&obj);

    visit_type_ImageInfoSpecific(v, NULL, &info_spec, &error_abort);
    visit_complete(v, &obj);
    assert(qobject_type(obj) == QTYPE_QDICT);
    data = qdict_get(qobject_to_qdict(obj), "data");
    dump_qobject(func_fprintf, f, 1, data);
+    qobject_decref(obj);
    visit_free(v);
 }

--- a/block/qcow.c
+++ b/block/qcow.c
@@ -153,7 +153,8 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EINVAL;
        goto fail;
    }
-    if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) {
+    if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128,
+                                 QCRYPTO_CIPHER_MODE_CBC)) {
        error_setg(errp, "AES cipher not available");
        ret = -EINVAL;
        goto fail;
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1558,7 +1558,7 @@ fail:
 * clusters.
 */
 static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
-                          uint64_t nb_clusters)
+                          uint64_t nb_clusters, int flags)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
@@ -1582,7 +1582,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,

        /* Update L2 entries */
        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
-        if (old_offset & QCOW_OFLAG_COMPRESSED) {
+        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
        } else {
@@ -1595,7 +1595,8 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
    return nb_clusters;
 }

-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
+                        int flags)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t nb_clusters;
@@ -1612,7 +1613,7 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
    s->cache_discards = true;

    while (nb_clusters > 0) {
-        ret = zero_single_l2(bs, offset, nb_clusters);
+        ret = zero_single_l2(bs, offset, nb_clusters, flags);
        if (ret < 0) {
            goto fail;
        }
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -959,7 +959,8 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
        ret = -EINVAL;
        goto fail;
    }
-    if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) {
+    if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128,
+                                 QCRYPTO_CIPHER_MODE_CBC)) {
        error_setg(errp, "AES cipher not available");
        ret = -EINVAL;
        goto fail;
@@ -1154,6 +1155,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,

    /* Initialise locks */
    qemu_co_mutex_init(&s->lock);
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;

    /* Repair image if dirty */
    if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
@@ -1204,6 +1206,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
        bs->bl.request_alignment = BDRV_SECTOR_SIZE;
    }
    bs->bl.pwrite_zeroes_alignment = s->cluster_size;
+    bs->bl.pdiscard_alignment = s->cluster_size;
 }

 static int qcow2_set_key(BlockDriverState *bs, const char *key)
@@ -2476,7 +2479,7 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
    trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count);

    /* Whatever is left can use real zero clusters */
-    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS);
+    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS, flags);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
@@ -2488,6 +2491,11 @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
    int ret;
    BDRVQcow2State *s = bs->opaque;

+    if (!QEMU_IS_ALIGNED(offset | count, s->cluster_size)) {
+        assert(count < s->cluster_size);
+        return -ENOTSUP;
+    }
+
    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS,
                                 QCOW2_DISCARD_REQUEST, false);
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -473,8 +473,6 @@ static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)
    return r1 > r2 ? r1 - r2 : r2 - r1;
 }

-// FIXME Need qcow2_ prefix to global functions
-
 /* qcow2.c functions */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
                  int64_t sector_num, int nb_sectors);
@@ -547,7 +545,8 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
 int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
    int nb_sectors, enum qcow2_discard_type type, bool full_discard);
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
+                        int flags);

 int qcow2_expand_zero_clusters(BlockDriverState *bs,
                               BlockDriverAmendStatusCB *status_cb,
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -174,9 +174,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s)

    qed_read_table(s, s->header.l1_table_offset,
                   s->l1_table, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);

    return ret;
 }
@@ -195,9 +193,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
    int ret = -EINPROGRESS;

    qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);

    return ret;
 }
@@ -268,9 +264,7 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset
    int ret = -EINPROGRESS;

    qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);

    return ret;
 }
@@ -290,9 +284,7 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
    int ret = -EINPROGRESS;

    qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);

    return ret;
 }
--- a/block/qed.c
+++ b/block/qed.c
@@ -336,7 +336,7 @@ static void qed_need_check_timer_cb(void *opaque)
    qed_plug_allocating_write_reqs(s);

    /* Ensure writes are on disk before clearing flag */
-    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+    bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
 }

 static void qed_start_need_check_timer(BDRVQEDState *s)
@@ -378,6 +378,19 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
    }
 }

+static void bdrv_qed_drain(BlockDriverState *bs)
+{ /* .bdrv_drain hook: run a pending need-check timer right away */
+    BDRVQEDState *s = bs->opaque;
+
+    /* Fire the timer immediately in order to start doing I/O as soon as the
+     * header is flushed.
+     */
+    if (s->need_check_timer && timer_pending(s->need_check_timer)) {
+        qed_cancel_need_check_timer(s); /* presumably so it won't fire later */
+        qed_need_check_timer_cb(s); /* invoke the timer callback by hand */
+    }
+}
+
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
 {
@@ -909,7 +922,6 @@ static void qed_aio_complete_bh(void *opaque)
    void *user_opaque = acb->common.opaque;
    int ret = acb->bh_ret;

-    qemu_bh_delete(acb->bh);
    qemu_aio_unref(acb);

    /* Invoke callback */
@@ -934,9 +946,8 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)

    /* Arrange for a bh to invoke the completion function */
    acb->bh_ret = ret;
-    acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs),
-                         qed_aio_complete_bh, acb);
-    qemu_bh_schedule(acb->bh);
+    aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
+                            qed_aio_complete_bh, acb);

    /* Start next allocating write request waiting behind this one.  Note that
     * requests enqueue themselves when they first hit an unallocated cluster
@@ -1670,6 +1681,7 @@ static BlockDriver bdrv_qed = {
    .bdrv_check               = bdrv_qed_check,
    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
+    .bdrv_drain               = bdrv_qed_drain,
 };

 static void bdrv_qed_init(void)
--- a/block/qed.h
+++ b/block/qed.h
@@ -130,7 +130,6 @@ enum {

 typedef struct QEDAIOCB {
    BlockAIOCB common;
-    QEMUBH *bh;
    int bh_ret;                     /* final return status for completion bh */
    QSIMPLEQ_ENTRY(QEDAIOCB) next;  /* next request */
    int flags;                      /* QED_AIOCB_* bits ORed together */
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -130,7 +130,7 @@ struct QuorumAIOCB {

    bool is_read;
    int vote_ret;
-    int child_iter;             /* which child to read in fifo pattern */
+    int children_read;          /* how many children have been read from */
 };

 static bool quorum_vote(QuorumAIOCB *acb);
@@ -156,22 +156,7 @@ static AIOCBInfo quorum_aiocb_info = {

 static void quorum_aio_finalize(QuorumAIOCB *acb)
 {
-    int i, ret = 0;
-
-    if (acb->vote_ret) {
-        ret = acb->vote_ret;
-    }
-
-    acb->common.cb(acb->common.opaque, ret);
-
-    if (acb->is_read) {
-        /* on the quorum case acb->child_iter == s->num_children - 1 */
-        for (i = 0; i <= acb->child_iter; i++) {
-            qemu_vfree(acb->qcrs[i].buf);
-            qemu_iovec_destroy(&acb->qcrs[i].qiov);
-        }
-    }
-
+    acb->common.cb(acb->common.opaque, acb->vote_ret);
    g_free(acb->qcrs);
    qemu_aio_unref(acb);
 }
@@ -283,39 +268,52 @@ static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
    }
 }

+static void quorum_report_bad_acb(QuorumChildRequest *sacb, int ret)
+{ /* Reports a failed child request through quorum_report_bad() */
+    QuorumAIOCB *acb = sacb->parent; /* the request this child belongs to */
+    QuorumOpType type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
+    quorum_report_bad(type, acb->sector_num, acb->nb_sectors,
+                      sacb->aiocb->bs->node_name, ret); /* child's node */
+}
+
+static void quorum_fifo_aio_cb(void *opaque, int ret)
+{ /* Completion callback for one child read in the FIFO read pattern */
+    QuorumChildRequest *sacb = opaque;
+    QuorumAIOCB *acb = sacb->parent;
+    BDRVQuorumState *s = acb->common.bs->opaque;
+
+    assert(acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO);
+
+    if (ret < 0) {
+        quorum_report_bad_acb(sacb, ret);
+
+        /* We try to read next child in FIFO order if we fail to read */
+        if (acb->children_read < s->num_children) {
+            read_fifo_child(acb); /* bumps children_read and resubmits */
+            return; /* finalization is left to a later completion */
+        }
+    }
+
+    acb->vote_ret = ret; /* an error only if no child is left to try */
+
+    /* FIXME: rewrite failed children if acb->children_read > 1? */
+    quorum_aio_finalize(acb);
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
    QuorumChildRequest *sacb = opaque;
    QuorumAIOCB *acb = sacb->parent;
    BDRVQuorumState *s = acb->common.bs->opaque;
    bool rewrite = false;
+    int i;

+    sacb->ret = ret;
    if (ret == 0) {
        acb->success_count++;
    } else {
-        QuorumOpType type;
-        type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
-        quorum_report_bad(type, acb->sector_num, acb->nb_sectors,
-                          sacb->aiocb->bs->node_name, ret);
+        quorum_report_bad_acb(sacb, ret);
    }
-
-    if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
-        /* We try to read next child in FIFO order if we fail to read */
-        if (ret < 0 && (acb->child_iter + 1) < s->num_children) {
-            acb->child_iter++;
-            read_fifo_child(acb);
-            return;
-        }
-
-        if (ret == 0) {
-            quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov);
-        }
-        acb->vote_ret = ret;
-        quorum_aio_finalize(acb);
-        return;
-    }
-
-    sacb->ret = ret;
    acb->count++;
    assert(acb->count <= s->num_children);
    assert(acb->success_count <= s->num_children);
@@ -326,6 +324,10 @@ static void quorum_aio_cb(void *opaque, int ret)
    /* Do the vote on read */
    if (acb->is_read) {
        rewrite = quorum_vote(acb);
+        for (i = 0; i < s->num_children; i++) {
+            qemu_vfree(acb->qcrs[i].buf);
+            qemu_iovec_destroy(&acb->qcrs[i].qiov);
+        }
    } else {
        quorum_has_too_much_io_failed(acb);
    }
@@ -653,6 +655,7 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
    BDRVQuorumState *s = acb->common.bs->opaque;
    int i;

+    acb->children_read = s->num_children;
    for (i = 0; i < s->num_children; i++) {
        acb->qcrs[i].buf = qemu_blockalign(s->children[i]->bs, acb->qiov->size);
        qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
@@ -671,16 +674,11 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
 static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb)
 {
    BDRVQuorumState *s = acb->common.bs->opaque;
+    int n = acb->children_read++;

-    acb->qcrs[acb->child_iter].buf =
-        qemu_blockalign(s->children[acb->child_iter]->bs, acb->qiov->size);
-    qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov);
-    qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
-                     acb->qcrs[acb->child_iter].buf);
-    acb->qcrs[acb->child_iter].aiocb =
-        bdrv_aio_readv(s->children[acb->child_iter], acb->sector_num,
-                       &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
-                       quorum_aio_cb, &acb->qcrs[acb->child_iter]);
+    acb->qcrs[n].aiocb = bdrv_aio_readv(s->children[n], acb->sector_num,
+                                        acb->qiov, acb->nb_sectors,
+                                        quorum_fifo_aio_cb, &acb->qcrs[n]);

    return &acb->common;
 }
@@ -696,13 +694,12 @@ static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs,
    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
                                      nb_sectors, cb, opaque);
    acb->is_read = true;
+    acb->children_read = 0;

    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
-        acb->child_iter = s->num_children - 1;
        return read_quorum_children(acb);
    }

-    acb->child_iter = 0;
    return read_fifo_child(acb);
 }

--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -143,6 +143,7 @@ typedef struct BDRVRawState {
    bool has_discard:1;
    bool has_write_zeroes:1;
    bool discard_zeroes:1;
+    bool use_linux_aio:1;
    bool has_fallocate;
    bool needs_alignment;
 } BDRVRawState;
@@ -367,18 +368,6 @@ static void raw_parse_flags(int bdrv_flags, int *open_flags)
    }
 }

-#ifdef CONFIG_LINUX_AIO
-static bool raw_use_aio(int bdrv_flags)
-{
-    /*
-     * Currently Linux do AIO only for files opened with O_DIRECT
-     * specified so check NOCACHE flag too
-     */
-    return (bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
-                         (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO);
-}
-#endif
-
 static void raw_parse_filename(const char *filename, QDict *options,
                               Error **errp)
 {
@@ -399,6 +388,11 @@ static QemuOptsList raw_runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
+        {
+            .name = "aio",
+            .type = QEMU_OPT_STRING,
+            .help = "host AIO implementation (threads, native)",
+        },
        { /* end of list */ }
    },
 };
@@ -410,6 +404,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename = NULL;
+    BlockdevAioOptions aio, aio_default;
    int fd, ret;
    struct stat st;

@@ -429,6 +424,18 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
        goto fail;
    }

+    aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
+                  ? BLOCKDEV_AIO_OPTIONS_NATIVE
+                  : BLOCKDEV_AIO_OPTIONS_THREADS;
+    aio = qapi_enum_parse(BlockdevAioOptions_lookup, qemu_opt_get(opts, "aio"),
+                          BLOCKDEV_AIO_OPTIONS__MAX, aio_default, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
+
    s->open_flags = open_flags;
    raw_parse_flags(bdrv_flags, &s->open_flags);

@@ -436,6 +443,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
    fd = qemu_open(filename, s->open_flags, 0644);
    if (fd < 0) {
        ret = -errno;
+        error_setg_errno(errp, errno, "Could not open '%s'", filename);
        if (ret == -EROFS) {
            ret = -EACCES;
        }
@@ -444,14 +452,15 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
    s->fd = fd;

 #ifdef CONFIG_LINUX_AIO
-    if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
+     /* Currently Linux does AIO only for files opened with O_DIRECT */
+    if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
        error_setg(errp, "aio=native was specified, but it requires "
                         "cache.direct=on, which was not specified.");
        ret = -EINVAL;
        goto fail;
    }
 #else
-    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
+    if (s->use_linux_aio) {
        error_setg(errp, "aio=native was specified, but is not supported "
                         "in this build.");
        ret = -EINVAL;
@@ -533,7 +542,7 @@ static int raw_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
 {
    BDRVRawState *s;
-    BDRVRawReopenState *raw_s;
+    BDRVRawReopenState *rs;
    int ret = 0;
    Error *local_err = NULL;

@@ -543,15 +552,15 @@ static int raw_reopen_prepare(BDRVReopenState *state,
    s = state->bs->opaque;

    state->opaque = g_new0(BDRVRawReopenState, 1);
-    raw_s = state->opaque;
+    rs = state->opaque;

    if (s->type == FTYPE_CD) {
-        raw_s->open_flags |= O_NONBLOCK;
+        rs->open_flags |= O_NONBLOCK;
    }

-    raw_parse_flags(state->flags, &raw_s->open_flags);
+    raw_parse_flags(state->flags, &rs->open_flags);

-    raw_s->fd = -1;
+    rs->fd = -1;

    int fcntl_flags = O_APPEND | O_NONBLOCK;
 #ifdef O_NOATIME
@@ -560,35 +569,35 @@ static int raw_reopen_prepare(BDRVReopenState *state,

 #ifdef O_ASYNC
    /* Not all operating systems have O_ASYNC, and those that don't
-     * will not let us track the state into raw_s->open_flags (typically
+     * will not let us track the state into rs->open_flags (typically
     * you achieve the same effect with an ioctl, for example I_SETSIG
     * on Solaris). But we do not use O_ASYNC, so that's fine.
     */
    assert((s->open_flags & O_ASYNC) == 0);
 #endif

-    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
+    if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
        /* dup the original fd */
-        raw_s->fd = qemu_dup(s->fd);
-        if (raw_s->fd >= 0) {
-            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
+        rs->fd = qemu_dup(s->fd);
+        if (rs->fd >= 0) {
+            ret = fcntl_setfl(rs->fd, rs->open_flags);
            if (ret) {
-                qemu_close(raw_s->fd);
-                raw_s->fd = -1;
+                qemu_close(rs->fd);
+                rs->fd = -1;
            }
        }
    }

    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
-    if (raw_s->fd == -1) {
+    if (rs->fd == -1) {
        const char *normalized_filename = state->bs->filename;
        ret = raw_normalize_devicepath(&normalized_filename);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not normalize device path");
        } else {
-            assert(!(raw_s->open_flags & O_CREAT));
-            raw_s->fd = qemu_open(normalized_filename, raw_s->open_flags);
-            if (raw_s->fd == -1) {
+            assert(!(rs->open_flags & O_CREAT));
+            rs->fd = qemu_open(normalized_filename, rs->open_flags);
+            if (rs->fd == -1) {
                error_setg_errno(errp, errno, "Could not reopen file");
                ret = -1;
            }
@@ -597,11 +606,11 @@ static int raw_reopen_prepare(BDRVReopenState *state,

    /* Fail already reopen_prepare() if we can't get a working O_DIRECT
     * alignment with the new fd. */
-    if (raw_s->fd != -1) {
-        raw_probe_alignment(state->bs, raw_s->fd, &local_err);
+    if (rs->fd != -1) {
+        raw_probe_alignment(state->bs, rs->fd, &local_err);
        if (local_err) {
-            qemu_close(raw_s->fd);
-            raw_s->fd = -1;
+            qemu_close(rs->fd);
+            rs->fd = -1;
            error_propagate(errp, local_err);
            ret = -EINVAL;
        }
@@ -612,13 +621,13 @@ static int raw_reopen_prepare(BDRVReopenState *state,

 static void raw_reopen_commit(BDRVReopenState *state)
 {
-    BDRVRawReopenState *raw_s = state->opaque;
+    BDRVRawReopenState *rs = state->opaque;
    BDRVRawState *s = state->bs->opaque;

-    s->open_flags = raw_s->open_flags;
+    s->open_flags = rs->open_flags;

    qemu_close(s->fd);
-    s->fd = raw_s->fd;
+    s->fd = rs->fd;

    g_free(state->opaque);
    state->opaque = NULL;
@@ -627,16 +636,16 @@ static void raw_reopen_commit(BDRVReopenState *state)

 static void raw_reopen_abort(BDRVReopenState *state)
 {
-    BDRVRawReopenState *raw_s = state->opaque;
+    BDRVRawReopenState *rs = state->opaque;

     /* nothing to do if NULL, we didn't get far enough */
-    if (raw_s == NULL) {
+    if (rs == NULL) {
        return;
    }

-    if (raw_s->fd >= 0) {
-        qemu_close(raw_s->fd);
-        raw_s->fd = -1;
+    if (rs->fd >= 0) {
+        qemu_close(rs->fd);
+        rs->fd = -1;
    }
    g_free(state->opaque);
    state->opaque = NULL;
@@ -1256,7 +1265,7 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
        if (!bdrv_qiov_is_aligned(bs, qiov)) {
            type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
-        } else if (bs->open_flags & BDRV_O_NATIVE_AIO) {
+        } else if (s->use_linux_aio) {
            LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
            assert(qiov->size == bytes);
            return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
@@ -1285,7 +1294,8 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
 static void raw_aio_plug(BlockDriverState *bs)
 {
 #ifdef CONFIG_LINUX_AIO
-    if (bs->open_flags & BDRV_O_NATIVE_AIO) {
+    BDRVRawState *s = bs->opaque;
+    if (s->use_linux_aio) {
        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
        laio_io_plug(bs, aio);
    }
@@ -1295,7 +1305,8 @@ static void raw_aio_plug(BlockDriverState *bs)
 static void raw_aio_unplug(BlockDriverState *bs)
 {
 #ifdef CONFIG_LINUX_AIO
-    if (bs->open_flags & BDRV_O_NATIVE_AIO) {
+    BDRVRawState *s = bs->opaque;
+    if (s->use_linux_aio) {
        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
        laio_io_unplug(bs, aio);
    }
@@ -2058,13 +2069,23 @@ static bool hdev_is_sg(BlockDriverState *bs)

 #if defined(__linux__)

+    BDRVRawState *s = bs->opaque;
    struct stat st;
    struct sg_scsi_id scsiid;
    int sg_version;
+    int ret;

-    if (stat(bs->filename, &st) >= 0 && S_ISCHR(st.st_mode) &&
-        !bdrv_ioctl(bs, SG_GET_VERSION_NUM, &sg_version) &&
-        !bdrv_ioctl(bs, SG_GET_SCSI_ID, &scsiid)) {
+    if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
+        return false;
+    }
+
+    ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
+    if (ret < 0) {
+        return false;
+    }
+
+    ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
+    if (ret >= 0) {
        DPRINTF("SG device found: type=%d, version=%d\n",
            scsiid.scsi_type, sg_version);
        return true;
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -32,6 +32,7 @@
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
 #include "qapi/qmp/qstring.h"
+#include "qapi/util.h"
 #include <windows.h>
 #include <winioctl.h>

@@ -252,7 +253,8 @@ static void raw_probe_alignment(BlockDriverState *bs, Error **errp)
    }
 }

-static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
+static void raw_parse_flags(int flags, bool use_aio, int *access_flags,
+                            DWORD *overlapped)
 {
    assert(access_flags != NULL);
    assert(overlapped != NULL);
@@ -264,7 +266,7 @@ static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
    }

    *overlapped = FILE_ATTRIBUTE_NORMAL;
-    if (flags & BDRV_O_NATIVE_AIO) {
+    if (use_aio) {
        *overlapped |= FILE_FLAG_OVERLAPPED;
    }
    if (flags & BDRV_O_NOCACHE) {
@@ -292,10 +294,35 @@ static QemuOptsList raw_runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
+        {
+            .name = "aio",
+            .type = QEMU_OPT_STRING,
+            .help = "host AIO implementation (threads, native)",
+        },
        { /* end of list */ }
    },
 };

+static bool get_aio_option(QemuOpts *opts, int flags, Error **errp)
+{
+    BlockdevAioOptions aio, aio_default;
+
+    aio_default = (flags & BDRV_O_NATIVE_AIO) ? BLOCKDEV_AIO_OPTIONS_NATIVE
+                                              : BLOCKDEV_AIO_OPTIONS_THREADS;
+    aio = qapi_enum_parse(BlockdevAioOptions_lookup, qemu_opt_get(opts, "aio"),
+                          BLOCKDEV_AIO_OPTIONS__MAX, aio_default, errp);
+
+    switch (aio) {
+    case BLOCKDEV_AIO_OPTIONS_NATIVE:
+        return true;
+    case BLOCKDEV_AIO_OPTIONS_THREADS:
+        return false;
+    default:
+        error_setg(errp, "Invalid AIO option");
+    }
+    return false;
+}
+
 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
@@ -305,6 +332,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;
+    bool use_aio;
    int ret;

    s->type = FTYPE_FILE;
@@ -319,7 +347,14 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,

    filename = qemu_opt_get(opts, "filename");

-    raw_parse_flags(flags, &access_flags, &overlapped);
+    use_aio = get_aio_option(opts, flags, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    raw_parse_flags(flags, use_aio, &access_flags, &overlapped);

    if (filename[0] && filename[1] == ':') {
        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]);
@@ -338,6 +373,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
    if (s->hfile == INVALID_HANDLE_VALUE) {
        int err = GetLastError();

+        error_setg_win32(errp, err, "Could not open '%s'", filename);
        if (err == ERROR_ACCESS_DENIED) {
            ret = -EACCES;
        } else {
@@ -346,7 +382,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
        goto fail;
    }

-    if (flags & BDRV_O_NATIVE_AIO) {
+    if (use_aio) {
        s->aio = win32_aio_init();
        if (s->aio == NULL) {
            CloseHandle(s->hfile);
@@ -647,6 +683,7 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,

    Error *local_err = NULL;
    const char *filename;
+    bool use_aio;

    QemuOpts *opts = qemu_opts_create(&raw_runtime_opts, NULL, 0,
                                      &error_abort);
@@ -659,6 +696,16 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,

    filename = qemu_opt_get(opts, "filename");

+    use_aio = get_aio_option(opts, flags, &local_err);
+    if (!local_err && use_aio) {
+        error_setg(&local_err, "AIO is not supported on Windows host devices");
+    }
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto done;
+    }
+
    if (strstart(filename, "/dev/cdrom", NULL)) {
        if (find_cdrom(device_name, sizeof(device_name)) < 0) {
            error_setg(errp, "Could not open CD-ROM drive");
@@ -677,7 +724,7 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
    }
    s->type = find_device_type(bs, filename);

-    raw_parse_flags(flags, &access_flags, &overlapped);
+    raw_parse_flags(flags, use_aio, &access_flags, &overlapped);

    create_flags = OPEN_EXISTING;

--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -31,6 +31,30 @@
 #include "qapi/error.h"
 #include "qemu/option.h"

+typedef struct BDRVRawState {
+    uint64_t offset;
+    uint64_t size;
+    bool has_size;
+} BDRVRawState;
+
+static QemuOptsList raw_runtime_opts = {
+    .name = "raw",
+    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
+    .desc = {
+        {
+            .name = "offset",
+            .type = QEMU_OPT_SIZE,
+            .help = "offset in the disk where the image starts",
+        },
+        {
+            .name = "size",
+            .type = QEMU_OPT_SIZE,
+            .help = "virtual disk size",
+        },
+        { /* end of list */ }
+    },
+};
+
 static QemuOptsList raw_create_opts = {
    .name = "raw-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
@@ -44,16 +68,116 @@ static QemuOptsList raw_create_opts = {
    }
 };

+static int raw_read_options(QDict *options, BlockDriverState *bs,
+    BDRVRawState *s, Error **errp)
+{
+    Error *local_err = NULL;
+    QemuOpts *opts = NULL;
+    int64_t real_size = 0;
+    int ret;
+
+    real_size = bdrv_getlength(bs->file->bs);
+    if (real_size < 0) {
+        error_setg_errno(errp, -real_size, "Could not get image size");
+        return real_size;
+    }
+
+    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto end;
+    }
+
+    s->offset = qemu_opt_get_size(opts, "offset", 0);
+    if (s->offset > real_size) {
+        error_setg(errp, "Offset (%" PRIu64 ") cannot be greater than "
+            "size of the containing file (%" PRId64 ")",
+            s->offset, real_size);
+        ret = -EINVAL;
+        goto end;
+    }
+
+    if (qemu_opt_find(opts, "size") != NULL) {
+        s->size = qemu_opt_get_size(opts, "size", 0);
+        s->has_size = true;
+    } else {
+        s->has_size = false;
+        s->size = real_size - s->offset;
+    }
+
+    /* Check size and offset */
+    if ((real_size - s->offset) < s->size) {
+        error_setg(errp, "The sum of offset (%" PRIu64 ") and size "
+            "(%" PRIu64 ") has to be smaller or equal to the "
+            " actual size of the containing file (%" PRId64 ")",
+            s->offset, s->size, real_size);
+        ret = -EINVAL;
+        goto end;
+    }
+
+    /* Make sure size is multiple of BDRV_SECTOR_SIZE to prevent rounding
+     * up and leaking out of the specified area. */
+    if (s->has_size && !QEMU_IS_ALIGNED(s->size, BDRV_SECTOR_SIZE)) {
+        error_setg(errp, "Specified size is not multiple of %llu",
+            BDRV_SECTOR_SIZE);
+        ret = -EINVAL;
+        goto end;
+    }
+
+    ret = 0;
+
+end:
+
+    qemu_opts_del(opts);
+
+    return ret;
+}
+
 static int raw_reopen_prepare(BDRVReopenState *reopen_state,
                              BlockReopenQueue *queue, Error **errp)
 {
-    return 0;
+    assert(reopen_state != NULL);
+    assert(reopen_state->bs != NULL);
+
+    reopen_state->opaque = g_new0(BDRVRawState, 1);
+
+    return raw_read_options(
+        reopen_state->options,
+        reopen_state->bs,
+        reopen_state->opaque,
+        errp);
+}
+
+static void raw_reopen_commit(BDRVReopenState *state)
+{
+    BDRVRawState *new_s = state->opaque;
+    BDRVRawState *s = state->bs->opaque;
+
+    memcpy(s, new_s, sizeof(BDRVRawState));
+
+    g_free(state->opaque);
+    state->opaque = NULL;
+}
+
+static void raw_reopen_abort(BDRVReopenState *state)
+{
+    g_free(state->opaque);
+    state->opaque = NULL;
 }

 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
                                      uint64_t bytes, QEMUIOVector *qiov,
                                      int flags)
 {
+    BDRVRawState *s = bs->opaque;
+
+    if (offset > UINT64_MAX - s->offset) {
+        return -EINVAL;
+    }
+    offset += s->offset;
+
    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
    return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
@@ -62,11 +186,23 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                       uint64_t bytes, QEMUIOVector *qiov,
                                       int flags)
 {
+    BDRVRawState *s = bs->opaque;
    void *buf = NULL;
    BlockDriver *drv;
    QEMUIOVector local_qiov;
    int ret;

+    if (s->has_size && (offset > s->size || bytes > (s->size - offset))) {
+        /* There's not enough space for the data. Don't write anything and just
+         * fail to prevent leaking out of the size specified in options. */
+        return -ENOSPC;
+    }
+
+    if (offset > UINT64_MAX - s->offset) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
    if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) {
        /* Handling partial writes would be a pain - so we just
         * require that guests have 512-byte request alignment if
@@ -101,6 +237,8 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
        qiov = &local_qiov;
    }

+    offset += s->offset;
+
    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
    ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);

@@ -117,8 +255,10 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                            int nb_sectors, int *pnum,
                                            BlockDriverState **file)
 {
+    BDRVRawState *s = bs->opaque;
    *pnum = nb_sectors;
    *file = bs->file->bs;
+    sector_num += s->offset / BDRV_SECTOR_SIZE;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
           (sector_num << BDRV_SECTOR_BITS);
 }
@@ -127,18 +267,49 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
                                             int64_t offset, int count,
                                             BdrvRequestFlags flags)
 {
+    BDRVRawState *s = bs->opaque;
+    if (offset > UINT64_MAX - s->offset) {
+        return -EINVAL;
+    }
+    offset += s->offset;
    return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
 }

 static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
                                        int64_t offset, int count)
 {
+    BDRVRawState *s = bs->opaque;
+    if (offset > UINT64_MAX - s->offset) {
+        return -EINVAL;
+    }
+    offset += s->offset;
    return bdrv_co_pdiscard(bs->file->bs, offset, count);
 }

 static int64_t raw_getlength(BlockDriverState *bs)
 {
-    return bdrv_getlength(bs->file->bs);
+    int64_t len;
+    BDRVRawState *s = bs->opaque;
+
+    /* Update size. It should not change unless the file was externally
+     * modified. */
+    len = bdrv_getlength(bs->file->bs);
+    if (len < 0) {
+        return len;
+    }
+
+    if (len < s->offset) {
+        s->size = 0;
+    } else {
+        if (s->has_size) {
+            /* Try to honour the size */
+            s->size = MIN(s->size, len - s->offset);
+        } else {
+            s->size = len - s->offset;
+        }
+    }
+
+    return s->size;
 }

 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
@@ -158,6 +329,18 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)

 static int raw_truncate(BlockDriverState *bs, int64_t offset)
 {
+    BDRVRawState *s = bs->opaque;
+
+    if (s->has_size) {
+        return -ENOTSUP;
+    }
+
+    if (INT64_MAX - offset < s->offset) {
+        return -EINVAL;
+    }
+
+    s->size = offset;
+    offset += s->offset;
    return bdrv_truncate(bs->file->bs, offset);
 }

@@ -176,12 +359,13 @@ static void raw_lock_medium(BlockDriverState *bs, bool locked)
    bdrv_lock_medium(bs->file->bs, locked);
 }

-static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs,
-                                 unsigned long int req, void *buf,
-                                 BlockCompletionFunc *cb,
-                                 void *opaque)
+static int raw_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
 {
-    return bdrv_aio_ioctl(bs->file->bs, req, buf, cb, opaque);
+    BDRVRawState *s = bs->opaque;
+    if (s->offset || s->has_size) {
+        return -ENOTSUP;
+    }
+    return bdrv_co_ioctl(bs->file->bs, req, buf);
 }

 static int raw_has_zero_init(BlockDriverState *bs)
@@ -197,6 +381,9 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
 {
+    BDRVRawState *s = bs->opaque;
+    int ret;
+
    bs->sg = bs->file->bs->sg;
    bs->supported_write_flags = BDRV_REQ_FUA &
        bs->file->bs->supported_write_flags;
@@ -214,6 +401,16 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                bs->file->bs->filename);
    }

+    ret = raw_read_options(options, bs, s, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (bs->sg && (s->offset || s->has_size)) {
+        error_setg(errp, "Cannot use offset/size with SCSI generic devices");
+        return -EINVAL;
+    }
+
    return 0;
 }

@@ -231,18 +428,37 @@ static int raw_probe(const uint8_t *buf, int buf_size, const char *filename)

 static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 {
-    return bdrv_probe_blocksizes(bs->file->bs, bsz);
+    BDRVRawState *s = bs->opaque;
+    int ret;
+
+    ret = bdrv_probe_blocksizes(bs->file->bs, bsz);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (!QEMU_IS_ALIGNED(s->offset, MAX(bsz->log, bsz->phys))) {
+        return -ENOTSUP;
+    }
+
+    return 0;
 }

 static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 {
+    BDRVRawState *s = bs->opaque;
+    if (s->offset || s->has_size) {
+        return -ENOTSUP;
+    }
    return bdrv_probe_geometry(bs->file->bs, geo);
 }

 BlockDriver bdrv_raw = {
    .format_name          = "raw",
+    .instance_size        = sizeof(BDRVRawState),
    .bdrv_probe           = &raw_probe,
    .bdrv_reopen_prepare  = &raw_reopen_prepare,
+    .bdrv_reopen_commit   = &raw_reopen_commit,
+    .bdrv_reopen_abort    = &raw_reopen_abort,
    .bdrv_open            = &raw_open,
    .bdrv_close           = &raw_close,
    .bdrv_create          = &raw_create,
@@ -261,7 +477,7 @@ BlockDriver bdrv_raw = {
    .bdrv_media_changed   = &raw_media_changed,
    .bdrv_eject           = &raw_eject,
    .bdrv_lock_medium     = &raw_lock_medium,
-    .bdrv_aio_ioctl       = &raw_aio_ioctl,
+    .bdrv_co_ioctl        = &raw_co_ioctl,
    .create_opts          = &raw_create_opts,
    .bdrv_has_zero_init   = &raw_has_zero_init
 };
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -71,7 +71,6 @@ typedef enum {

 typedef struct RBDAIOCB {
    BlockAIOCB common;
-    QEMUBH *bh;
    int64_t ret;
    QEMUIOVector *qiov;
    char *bounce;
@@ -366,45 +365,44 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
        rados_conf_read_file(cluster, NULL);
    } else if (conf[0] != '\0' &&
               qemu_rbd_set_conf(cluster, conf, true, &local_err) < 0) {
-        rados_shutdown(cluster);
        error_propagate(errp, local_err);
-        return -EIO;
+        ret = -EIO;
+        goto shutdown;
    }

    if (conf[0] != '\0' &&
        qemu_rbd_set_conf(cluster, conf, false, &local_err) < 0) {
-        rados_shutdown(cluster);
        error_propagate(errp, local_err);
-        return -EIO;
+        ret = -EIO;
+        goto shutdown;
    }

    if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) {
-        rados_shutdown(cluster);
-        return -EIO;
+        ret = -EIO;
+        goto shutdown;
    }

    ret = rados_connect(cluster);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "error connecting");
-        rados_shutdown(cluster);
-        return ret;
+        goto shutdown;
    }

    ret = rados_ioctx_create(cluster, pool, &io_ctx);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "error opening pool %s", pool);
-        rados_shutdown(cluster);
-        return ret;
+        goto shutdown;
    }

    ret = rbd_create(io_ctx, name, bytes, &obj_order);
-    rados_ioctx_destroy(io_ctx);
-    rados_shutdown(cluster);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "error rbd create");
-        return ret;
    }

+    rados_ioctx_destroy(io_ctx);
+
+shutdown:
+    rados_shutdown(cluster);
    return ret;
 }

@@ -602,7 +600,6 @@ static const AIOCBInfo rbd_aiocb_info = {
 static void rbd_finish_bh(void *opaque)
 {
    RADOSCB *rcb = opaque;
-    qemu_bh_delete(rcb->acb->bh);
    qemu_rbd_complete_aio(rcb);
 }

@@ -621,9 +618,8 @@ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
    rcb->ret = rbd_aio_get_return_value(c);
    rbd_aio_release(c);

-    acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs),
-                         rbd_finish_bh, rcb);
-    qemu_bh_schedule(acb->bh);
+    aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
+                            rbd_finish_bh, rcb);
 }

 static int rbd_aio_discard_wrapper(rbd_image_t image,
@@ -679,7 +675,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
    acb->ret = 0;
    acb->error = 0;
    acb->s = s;
-    acb->bh = NULL;

    if (cmd == RBD_AIO_WRITE) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
@@ -737,7 +732,7 @@ static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
                                      void *opaque)
 {
    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
-                         nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
+                         (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
                         RBD_AIO_READ);
 }

@@ -749,7 +744,7 @@ static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
                                       void *opaque)
 {
    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
-                         nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
+                         (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
                         RBD_AIO_WRITE);
 }

--- a/block/replication.c
+++ b/block/replication.c
@@ -101,6 +101,11 @@ static int replication_open(BlockDriverState *bs, QDict *options,

    if (!strcmp(mode, "primary")) {
        s->mode = REPLICATION_MODE_PRIMARY;
+        top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
+        if (top_id) {
+            error_setg(&local_err, "The primary side does not support option top-id");
+            goto fail;
+        }
    } else if (!strcmp(mode, "secondary")) {
        s->mode = REPLICATION_MODE_SECONDARY;
        top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
@@ -133,6 +138,9 @@ static void replication_close(BlockDriverState *bs)
    if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
        replication_stop(s->rs, false, NULL);
    }
+    if (s->replication_state == BLOCK_REPLICATION_FAILOVER) {
+        block_job_cancel_sync(s->active_disk->bs->job);
+    }

    if (s->mode == REPLICATION_MODE_SECONDARY) {
        g_free(s->top_id);
@@ -314,9 +322,10 @@ static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
    }
 }

-static void reopen_backing_file(BDRVReplicationState *s, bool writable,
+static void reopen_backing_file(BlockDriverState *bs, bool writable,
                                Error **errp)
 {
+    BDRVReplicationState *s = bs->opaque;
    BlockReopenQueue *reopen_queue = NULL;
    int orig_hidden_flags, orig_secondary_flags;
    int new_hidden_flags, new_secondary_flags;
@@ -351,13 +360,15 @@ static void reopen_backing_file(BDRVReplicationState *s, bool writable,
    }

    if (reopen_queue) {
-        bdrv_reopen_multiple(reopen_queue, &local_err);
+        bdrv_reopen_multiple(bdrv_get_aio_context(bs),
+                             reopen_queue, &local_err);
        error_propagate(errp, local_err);
    }
 }

-static void backup_job_cleanup(BDRVReplicationState *s)
+static void backup_job_cleanup(BlockDriverState *bs)
 {
+    BDRVReplicationState *s = bs->opaque;
    BlockDriverState *top_bs;

    top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
@@ -366,19 +377,20 @@ static void backup_job_cleanup(BDRVReplicationState *s)
    }
    bdrv_op_unblock_all(top_bs, s->blocker);
    error_free(s->blocker);
-    reopen_backing_file(s, false, NULL);
+    reopen_backing_file(bs, false, NULL);
 }

 static void backup_job_completed(void *opaque, int ret)
 {
-    BDRVReplicationState *s = opaque;
+    BlockDriverState *bs = opaque;
+    BDRVReplicationState *s = bs->opaque;

    if (s->replication_state != BLOCK_REPLICATION_FAILOVER) {
        /* The backup job is cancelled unexpectedly */
        s->error = -EIO;
    }

-    backup_job_cleanup(s);
+    backup_job_cleanup(bs);
 }

 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
@@ -409,6 +421,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
    int64_t active_length, hidden_length, disk_length;
    AioContext *aio_context;
    Error *local_err = NULL;
+    BlockJob *job;

    aio_context = bdrv_get_aio_context(bs);
    aio_context_acquire(aio_context);
@@ -474,7 +487,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
        }

        /* reopen the backing file in r/w mode */
-        reopen_backing_file(s, true, &local_err);
+        reopen_backing_file(bs, true, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            aio_context_release(aio_context);
@@ -489,23 +502,25 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
        if (!top_bs || !bdrv_is_root_node(top_bs) ||
            !check_top_bs(top_bs, bs)) {
            error_setg(errp, "No top_bs or it is invalid");
-            reopen_backing_file(s, false, NULL);
+            reopen_backing_file(bs, false, NULL);
            aio_context_release(aio_context);
            return;
        }
        bdrv_op_block_all(top_bs, s->blocker);
        bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);

-        backup_start("replication-backup", s->secondary_disk->bs,
-                     s->hidden_disk->bs, 0, MIRROR_SYNC_MODE_NONE, NULL, false,
-                     BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
-                     backup_job_completed, s, NULL, &local_err);
+        job = backup_job_create(NULL, s->secondary_disk->bs, s->hidden_disk->bs,
+                                0, MIRROR_SYNC_MODE_NONE, NULL, false,
+                                BLOCKDEV_ON_ERROR_REPORT,
+                                BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
+                                backup_job_completed, bs, NULL, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
-            backup_job_cleanup(s);
+            backup_job_cleanup(bs);
            aio_context_release(aio_context);
            return;
        }
+        block_job_start(job);
        break;
    default:
        aio_context_release(aio_context);
@@ -621,10 +636,9 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
        }

        s->replication_state = BLOCK_REPLICATION_FAILOVER;
-        commit_active_start("replication-commit", s->active_disk->bs,
-                            s->secondary_disk->bs, 0, BLOCKDEV_ON_ERROR_REPORT,
-                            replication_done,
-                            bs, errp, true);
+        commit_active_start(NULL, s->active_disk->bs, s->secondary_disk->bs,
+                            BLOCK_JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
+                            replication_done, bs, errp, true);
        break;
    default:
        aio_context_release(aio_context);
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -641,6 +641,7 @@ static void restart_co_req(void *opaque)

 typedef struct SheepdogReqCo {
    int sockfd;
+    BlockDriverState *bs;
    AioContext *aio_context;
    SheepdogReq *hdr;
    void *data;
@@ -701,6 +702,9 @@ out:

    srco->ret = ret;
    srco->finished = true;
+    if (srco->bs) {
+        bdrv_wakeup(srco->bs);
+    }
 }

 /*
@@ -708,13 +712,14 @@ out:
 *
 * Return 0 on success, -errno in case of error.
 */
-static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
+static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
                  void *data, unsigned int *wlen, unsigned int *rlen)
 {
    Coroutine *co;
    SheepdogReqCo srco = {
        .sockfd = sockfd,
-        .aio_context = aio_context,
+        .aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
+        .bs = bs,
        .hdr = hdr,
        .data = data,
        .wlen = wlen,
@@ -727,9 +732,14 @@ static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
        do_co_req(&srco);
    } else {
        co = qemu_coroutine_create(do_co_req, &srco);
-        qemu_coroutine_enter(co);
-        while (!srco.finished) {
-            aio_poll(aio_context, true);
+        if (bs) {
+            qemu_coroutine_enter(co);
+            BDRV_POLL_WHILE(bs, !srco.finished);
+        } else {
+            qemu_coroutine_enter(co);
+            while (!srco.finished) {
+                aio_poll(qemu_get_aio_context(), true);
+            }
        }
    }

@@ -1125,7 +1135,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
    hdr.snapid = snapid;
    hdr.flags = SD_FLAG_CMD_WRITE;

-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
    if (ret) {
        error_setg_errno(errp, -ret, "cannot get vdi info");
        goto out;
@@ -1240,7 +1250,7 @@ out:
    qemu_co_mutex_unlock(&s->lock);
 }

-static int read_write_object(int fd, AioContext *aio_context, char *buf,
+static int read_write_object(int fd, BlockDriverState *bs, char *buf,
                             uint64_t oid, uint8_t copies,
                             unsigned int datalen, uint64_t offset,
                             bool write, bool create, uint32_t cache_flags)
@@ -1274,7 +1284,7 @@ static int read_write_object(int fd, AioContext *aio_context, char *buf,
    hdr.offset = offset;
    hdr.copies = copies;

-    ret = do_req(fd, aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
    if (ret) {
        error_report("failed to send a request to the sheep");
        return ret;
@@ -1289,22 +1299,22 @@ static int read_write_object(int fd, AioContext *aio_context, char *buf,
    }
 }

-static int read_object(int fd, AioContext *aio_context, char *buf,
+static int read_object(int fd, BlockDriverState *bs, char *buf,
                       uint64_t oid, uint8_t copies,
                       unsigned int datalen, uint64_t offset,
                       uint32_t cache_flags)
 {
-    return read_write_object(fd, aio_context, buf, oid, copies,
+    return read_write_object(fd, bs, buf, oid, copies,
                             datalen, offset, false,
                             false, cache_flags);
 }

-static int write_object(int fd, AioContext *aio_context, char *buf,
+static int write_object(int fd, BlockDriverState *bs, char *buf,
                        uint64_t oid, uint8_t copies,
                        unsigned int datalen, uint64_t offset, bool create,
                        uint32_t cache_flags)
 {
-    return read_write_object(fd, aio_context, buf, oid, copies,
+    return read_write_object(fd, bs, buf, oid, copies,
                             datalen, offset, true,
                             create, cache_flags);
 }
@@ -1331,7 +1341,7 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
        goto out;
    }

-    ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid),
+    ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid),
                      s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
                      s->cache_flags);
    if (ret < 0) {
@@ -1489,7 +1499,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
    }

    buf = g_malloc(SD_INODE_SIZE);
-    ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid),
+    ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
                      0, SD_INODE_SIZE, 0, s->cache_flags);

    closesocket(fd);
@@ -1618,7 +1628,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
    hdr.copies = s->inode.nr_copies;
    hdr.block_size_shift = s->inode.block_size_shift;

-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    ret = do_req(fd, NULL, (SheepdogReq *)&hdr, buf, &wlen, &rlen);

    closesocket(fd);

@@ -1886,7 +1896,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
        hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
        hdr.proto_ver = SD_PROTO_VER;

-        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+        ret = do_req(fd, NULL, (SheepdogReq *)&hdr,
                     NULL, &wlen, &rlen);
        closesocket(fd);
        if (ret) {
@@ -1951,7 +1961,7 @@ static void sd_close(BlockDriverState *bs)
    hdr.data_length = wlen;
    hdr.flags = SD_FLAG_CMD_WRITE;

-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                 s->name, &wlen, &rlen);

    closesocket(fd);
@@ -2000,7 +2010,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
    s->inode.vdi_size = offset;
-    ret = write_object(fd, s->aio_context, (char *)&s->inode,
+    ret = write_object(fd, s->bs, (char *)&s->inode,
                       vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
                       datalen, 0, false, s->cache_flags);
    close(fd);
@@ -2070,7 +2080,7 @@ static bool sd_delete(BDRVSheepdogState *s)
        return false;
    }

-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                 s->name, &wlen, &rlen);
    closesocket(fd);
    if (ret) {
@@ -2126,7 +2136,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
        goto out;
    }

-    ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid),
+    ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
                      s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);

    closesocket(fd);
@@ -2411,7 +2421,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
        goto cleanup;
    }

-    ret = write_object(fd, s->aio_context, (char *)&s->inode,
+    ret = write_object(fd, s->bs, (char *)&s->inode,
                       vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
                       datalen, 0, false, s->cache_flags);
    if (ret < 0) {
@@ -2426,7 +2436,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
        goto cleanup;
    }

-    ret = read_object(fd, s->aio_context, (char *)inode,
+    ret = read_object(fd, s->bs, (char *)inode,
                      vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
                      s->cache_flags);

@@ -2528,7 +2538,7 @@ static bool remove_objects(BDRVSheepdogState *s)
            i++;
        }

-        ret = write_object(fd, s->aio_context,
+        ret = write_object(fd, s->bs,
                           (char *)&inode->data_vdi_id[start_idx],
                           vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
                           (i - start_idx) * sizeof(uint32_t),
@@ -2600,7 +2610,7 @@ static int sd_snapshot_delete(BlockDriverState *bs,
        return -1;
    }

-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                 buf, &wlen, &rlen);
    closesocket(fd);
    if (ret) {
@@ -2652,8 +2662,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
    req.opcode = SD_OP_READ_VDIS;
    req.data_length = max;

-    ret = do_req(fd, s->aio_context, &req,
-                 vdi_inuse, &wlen, &rlen);
+    ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen);

    closesocket(fd);
    if (ret) {
@@ -2679,7 +2688,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
        }

        /* we don't need to read entire object */
-        ret = read_object(fd, s->aio_context, (char *)&inode,
+        ret = read_object(fd, s->bs, (char *)&inode,
                          vid_to_vdi_oid(vid),
                          0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
                          s->cache_flags);
@@ -2745,11 +2754,11 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,

        create = (offset == 0);
        if (load) {
-            ret = read_object(fd, s->aio_context, (char *)data, vmstate_oid,
+            ret = read_object(fd, s->bs, (char *)data, vmstate_oid,
                              s->inode.nr_copies, data_len, offset,
                              s->cache_flags);
        } else {
-            ret = write_object(fd, s->aio_context, (char *)data, vmstate_oid,
+            ret = write_object(fd, s->bs, (char *)data, vmstate_oid,
                               s->inode.nr_copies, data_len, offset, create,
                               s->cache_flags);
        }
@@ -2820,8 +2829,9 @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
    iov.iov_len = sizeof(zero);
    discard_iov.iov = &iov;
    discard_iov.niov = 1;
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((count & (BDRV_SECTOR_SIZE - 1)) == 0);
+    if (!QEMU_IS_ALIGNED(offset | count, BDRV_SECTOR_SIZE)) {
+        return -ENOTSUP;
+    }
    acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS,
                       count >> BDRV_SECTOR_BITS);
    acb->aiocb_type = AIOCB_DISCARD_OBJ;
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -30,10 +30,14 @@
 #include "block/block_int.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
+#include "qemu/cutils.h"
 #include "qemu/sockets.h"
 #include "qemu/uri.h"
+#include "qapi-visit.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qstring.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qapi/qobject-output-visitor.h"

 /* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in
 * this block driver code.
@@ -74,8 +78,9 @@ typedef struct BDRVSSHState {
     */
    LIBSSH2_SFTP_ATTRIBUTES attrs;

+    InetSocketAddress *inet;
+
    /* Used to warn if 'flush' is not supported. */
-    char *hostport;
    bool unsafe_flush_warning;
 } BDRVSSHState;

@@ -89,7 +94,6 @@ static void ssh_state_init(BDRVSSHState *s)

 static void ssh_state_free(BDRVSSHState *s)
 {
-    g_free(s->hostport);
    if (s->sftp_handle) {
        libssh2_sftp_close(s->sftp_handle);
    }
@@ -193,6 +197,7 @@ static int parse_uri(const char *filename, QDict *options, Error **errp)
 {
    URI *uri = NULL;
    QueryParams *qp;
+    char *port_str;
    int i;

    uri = uri_parse(filename);
@@ -225,11 +230,11 @@ static int parse_uri(const char *filename, QDict *options, Error **errp)
        qdict_put(options, "user", qstring_from_str(uri->user));
    }

-    qdict_put(options, "host", qstring_from_str(uri->server));
+    qdict_put(options, "server.host", qstring_from_str(uri->server));

-    if (uri->port) {
-        qdict_put(options, "port", qint_from_int(uri->port));
-    }
+    port_str = g_strdup_printf("%d", uri->port ?: 22);
+    qdict_put(options, "server.port", qstring_from_str(port_str));
+    g_free(port_str);

    qdict_put(options, "path", qstring_from_str(uri->path));

@@ -254,15 +259,31 @@ static int parse_uri(const char *filename, QDict *options, Error **errp)
    return -EINVAL;
 }

+static bool ssh_has_filename_options_conflict(QDict *options, Error **errp)
+{
+    const QDictEntry *qe;
+
+    for (qe = qdict_first(options); qe; qe = qdict_next(options, qe)) {
+        if (!strcmp(qe->key, "host") ||
+            !strcmp(qe->key, "port") ||
+            !strcmp(qe->key, "path") ||
+            !strcmp(qe->key, "user") ||
+            !strcmp(qe->key, "host_key_check") ||
+            strstart(qe->key, "server.", NULL))
+        {
+            error_setg(errp, "Option '%s' cannot be used with a file name",
+                       qe->key);
+            return true;
+        }
+    }
+
+    return false;
+}
+
 static void ssh_parse_filename(const char *filename, QDict *options,
                               Error **errp)
 {
-    if (qdict_haskey(options, "user") ||
-        qdict_haskey(options, "host") ||
-        qdict_haskey(options, "port") ||
-        qdict_haskey(options, "path") ||
-        qdict_haskey(options, "host_key_check")) {
-        error_setg(errp, "user, host, port, path, host_key_check cannot be used at the same time as a file option");
+    if (ssh_has_filename_options_conflict(options, errp)) {
        return;
    }

@@ -540,14 +561,68 @@ static QemuOptsList ssh_runtime_opts = {
    },
 };

+static bool ssh_process_legacy_socket_options(QDict *output_opts,
+                                              QemuOpts *legacy_opts,
+                                              Error **errp)
+{
+    const char *host = qemu_opt_get(legacy_opts, "host");
+    const char *port = qemu_opt_get(legacy_opts, "port");
+
+    if (!host && port) {
+        error_setg(errp, "port may not be used without host");
+        return false;
+    }
+
+    if (host) {
+        qdict_put(output_opts, "server.host", qstring_from_str(host));
+        qdict_put(output_opts, "server.port",
+                  qstring_from_str(port ?: stringify(22)));
+    }
+
+    return true;
+}
+
+static InetSocketAddress *ssh_config(QDict *options, Error **errp)
+{
+    InetSocketAddress *inet = NULL;
+    QDict *addr = NULL;
+    QObject *crumpled_addr = NULL;
+    Visitor *iv = NULL;
+    Error *local_error = NULL;
+
+    qdict_extract_subqdict(options, &addr, "server.");
+    if (!qdict_size(addr)) {
+        error_setg(errp, "SSH server address missing");
+        goto out;
+    }
+
+    crumpled_addr = qdict_crumple(addr, errp);
+    if (!crumpled_addr) {
+        goto out;
+    }
+
+    iv = qobject_input_visitor_new(crumpled_addr, true);
+    visit_type_InetSocketAddress(iv, NULL, &inet, &local_error);
+    if (local_error) {
+        error_propagate(errp, local_error);
+        goto out;
+    }
+
+out:
+    QDECREF(addr);
+    qobject_decref(crumpled_addr);
+    visit_free(iv);
+    return inet;
+}
+
 static int connect_to_ssh(BDRVSSHState *s, QDict *options,
                          int ssh_flags, int creat_mode, Error **errp)
 {
    int r, ret;
    QemuOpts *opts = NULL;
    Error *local_err = NULL;
-    const char *host, *user, *path, *host_key_check;
-    int port;
+    const char *user, *path, *host_key_check;
+    long port = 0;

    opts = qemu_opts_create(&ssh_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -557,15 +632,11 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
        goto err;
    }

-    host = qemu_opt_get(opts, "host");
-    if (!host) {
+    if (!ssh_process_legacy_socket_options(options, opts, errp)) {
        ret = -EINVAL;
-        error_setg(errp, "No hostname was specified");
        goto err;
    }

-    port = qemu_opt_get_number(opts, "port", 22);
-
    path = qemu_opt_get(opts, "path");
    if (!path) {
        ret = -EINVAL;
@@ -588,12 +659,21 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
        host_key_check = "yes";
    }

-    /* Construct the host:port name for inet_connect. */
-    g_free(s->hostport);
-    s->hostport = g_strdup_printf("%s:%d", host, port);
+    /* Pop the config into our state object, Exit if invalid */
+    s->inet = ssh_config(options, errp);
+    if (!s->inet) {
+        ret = -EINVAL;
+        goto err;
+    }
+
+    if (qemu_strtol(s->inet->port, NULL, 10, &port) < 0) {
+        error_setg(errp, "Use only numeric port value");
+        ret = -EINVAL;
+        goto err;
+    }

    /* Open the socket and connect. */
-    s->sock = inet_connect(s->hostport, errp);
+    s->sock = inet_connect_saddr(s->inet, errp, NULL, NULL);
    if (s->sock < 0) {
        ret = -EIO;
        goto err;
@@ -619,7 +699,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
    }

    /* Check the remote host's key against known_hosts. */
-    ret = check_host_key(s, host, port, host_key_check, errp);
+    ret = check_host_key(s, s->inet->host, port, host_key_check,
+                         errp);
    if (ret < 0) {
        goto err;
    }
@@ -1040,7 +1121,7 @@ static void unsafe_flush_warning(BDRVSSHState *s, const char *what)
 {
    if (!s->unsafe_flush_warning) {
        error_report("warning: ssh server %s does not support fsync",
-                     s->hostport);
+                     s->inet->host);
        if (what) {
            error_report("to support fsync, you need %s", what);
        }
--- a/block/stream.c
+++ b/block/stream.c
@@ -14,7 +14,7 @@
 #include "qemu/osdep.h"
 #include "trace.h"
 #include "block/block_int.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
@@ -37,6 +37,7 @@ typedef struct StreamBlockJob {
    BlockDriverState *base;
    BlockdevOnError on_error;
    char *backing_file_str;
+    int bs_flags;
 } StreamBlockJob;

 static int coroutine_fn stream_populate(BlockBackend *blk,
@@ -81,6 +82,11 @@ static void stream_complete(BlockJob *job, void *opaque)
        bdrv_set_backing_hd(bs, base);
    }

+    /* Reopen the image back in read-only mode if necessary */
+    if (s->bs_flags != bdrv_get_flags(bs)) {
+        bdrv_reopen(bs, s->bs_flags, NULL);
+    }
+
    g_free(s->backing_file_str);
    block_job_completed(&s->common, data->ret);
    g_free(data);
@@ -212,26 +218,43 @@ static const BlockJobDriver stream_job_driver = {
    .instance_size = sizeof(StreamBlockJob),
    .job_type      = BLOCK_JOB_TYPE_STREAM,
    .set_speed     = stream_set_speed,
+    .start         = stream_run,
 };

 void stream_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, const char *backing_file_str,
-                  int64_t speed, BlockdevOnError on_error,
-                  BlockCompletionFunc *cb, void *opaque, Error **errp)
+                  int64_t speed, BlockdevOnError on_error, Error **errp)
 {
    StreamBlockJob *s;
+    BlockDriverState *iter;
+    int orig_bs_flags;

    s = block_job_create(job_id, &stream_job_driver, bs, speed,
-                         cb, opaque, errp);
+                         BLOCK_JOB_DEFAULT, NULL, NULL, errp);
    if (!s) {
        return;
    }

+    /* Make sure that the image is opened in read-write mode */
+    orig_bs_flags = bdrv_get_flags(bs);
+    if (!(orig_bs_flags & BDRV_O_RDWR)) {
+        if (bdrv_reopen(bs, orig_bs_flags | BDRV_O_RDWR, errp) != 0) {
+            block_job_unref(&s->common);
+            return;
+        }
+    }
+
+    /* Block all intermediate nodes between bs and base, because they
+     * will disappear from the chain after this operation */
+    for (iter = backing_bs(bs); iter && iter != base; iter = backing_bs(iter)) {
+        block_job_add_bdrv(&s->common, iter);
+    }
+
    s->base = base;
    s->backing_file_str = g_strdup(backing_file_str);
+    s->bs_flags = orig_bs_flags;

    s->on_error = on_error;
-    s->common.co = qemu_coroutine_create(stream_run, s);
-    trace_stream_start(bs, base, s, s->common.co, opaque);
-    qemu_coroutine_enter(s->common.co);
+    trace_stream_start(bs, base, s);
+    block_job_start(&s->common);
 }
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -168,6 +168,22 @@ static BlockBackend *throttle_group_next_blk(BlockBackend *blk)
    return blk_by_public(next);
 }

+/*
+ * Return whether a BlockBackend has pending requests.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @blk: the BlockBackend
+ * @is_write:  the type of operation (read/write)
+ * @ret:       whether the BlockBackend has pending requests.
+ */
+static inline bool blk_has_pending_reqs(BlockBackend *blk,
+                                        bool is_write)
+{
+    const BlockBackendPublic *blkp = blk_get_public(blk);
+    return blkp->pending_reqs[is_write];
+}
+
 /* Return the next BlockBackend in the round-robin sequence with pending I/O
 * requests.
 *
@@ -188,7 +204,7 @@ static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)

    /* get next bs round in round robin style */
    token = throttle_group_next_blk(token);
-    while (token != start && !blkp->pending_reqs[is_write]) {
+    while (token != start && !blk_has_pending_reqs(token, is_write)) {
        token = throttle_group_next_blk(token);
    }

@@ -196,10 +212,13 @@ static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)
     * then decide the token is the current bs because chances are
     * the current bs get the current request queued.
     */
-    if (token == start && !blkp->pending_reqs[is_write]) {
+    if (token == start && !blk_has_pending_reqs(token, is_write)) {
        token = blk;
    }

+    /* Either we return the original BB, or one with pending requests */
+    assert(token == blk || blk_has_pending_reqs(token, is_write));
+
    return token;
 }

@@ -257,7 +276,7 @@ static void schedule_next_request(BlockBackend *blk, bool is_write)

    /* Check if there's any pending request to schedule next */
    token = next_throttle_token(blk, is_write);
-    if (!blkp->pending_reqs[is_write]) {
+    if (!blk_has_pending_reqs(token, is_write)) {
        return;
    }

@@ -271,7 +290,7 @@ static void schedule_next_request(BlockBackend *blk, bool is_write)
            qemu_co_queue_next(&blkp->throttled_reqs[is_write])) {
            token = blk;
        } else {
-            ThrottleTimers *tt = &blkp->throttle_timers;
+            ThrottleTimers *tt = &blk_get_public(token)->throttle_timers;
            int64_t now = qemu_clock_get_ns(tt->clock_type);
            timer_mod(tt->timers[is_write], now + 1);
            tg->any_timer_armed[is_write] = true;
--- a/block/trace-events
+++ b/block/trace-events
@@ -9,7 +9,6 @@ blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags
 blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"

 # block/io.c
-bdrv_aio_pdiscard(void *bs, int64_t offset, int count, void *opaque) "bs %p offset %"PRId64" count %d opaque %p"
 bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
 bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
@@ -20,14 +19,14 @@ bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t c

 # block/stream.c
 stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
-stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p"
+stream_start(void *bs, void *base, void *s) "bs %p base %p s %p"

 # block/commit.c
 commit_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
-commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) "bs %p base %p top %p s %p co %p opaque %p"
+commit_start(void *bs, void *base, void *top, void *s) "bs %p base %p top %p s %p"

 # block/mirror.c
-mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
+mirror_start(void *bs, void *s, void *opaque) "bs %p s %p opaque %p"
 mirror_restart_iter(void *s, int64_t cnt) "s %p dirty count %"PRId64
 mirror_before_flush(void *s) "s %p"
 mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
@@ -52,7 +51,6 @@ qmp_block_job_cancel(void *job) "job %p"
 qmp_block_job_pause(void *job) "job %p"
 qmp_block_job_resume(void *job) "job %p"
 qmp_block_job_complete(void *job) "job %p"
-block_job_cb(void *bs, void *job, int ret) "bs %p job %p ret %d"
 qmp_block_stream(void *bs, void *job) "bs %p job %p"

 # block/raw-win32.c
--- a/block/write-threshold.c
+++ b/block/write-threshold.c
@@ -76,8 +76,7 @@ static int coroutine_fn before_write_notify(NotifierWithReturn *notifier,
 static void write_threshold_register_notifier(BlockDriverState *bs)
 {
    bs->write_threshold_notifier.notify = before_write_notify;
-    notifier_with_return_list_add(&bs->before_write_notifiers,
-                                  &bs->write_threshold_notifier);
+    bdrv_add_before_write_notifier(bs, &bs->write_threshold_notifier);
 }

 static void write_threshold_update(BlockDriverState *bs,
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -44,6 +44,7 @@ static gboolean nbd_accept(QIOChannel *ioc, GIOCondition condition,
        return TRUE;
    }

+    qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server");
    nbd_client_new(NULL, cioc,
                   nbd_server->tlscreds, NULL,
                   nbd_client_put);
@@ -111,6 +112,8 @@ void qmp_nbd_server_start(SocketAddress *addr,
    nbd_server = g_new0(NBDServerData, 1);
    nbd_server->watch = -1;
    nbd_server->listen_ioc = qio_channel_socket_new();
+    qio_channel_set_name(QIO_CHANNEL(nbd_server->listen_ioc),
+                         "nbd-listener");
    if (qio_channel_socket_listen_sync(
            nbd_server->listen_ioc, addr, errp) < 0) {
        goto error;
--- a/blockdev.c
+++ b/blockdev.c
@@ -43,7 +43,7 @@
 #include "qapi/qmp/types.h"
 #include "qapi-visit.h"
 #include "qapi/qmp/qerror.h"
-#include "qapi/qmp-output-visitor.h"
+#include "qapi/qobject-output-visitor.h"
 #include "qapi/util.h"
 #include "sysemu/sysemu.h"
 #include "block/block_int.h"
@@ -356,7 +356,6 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags,
    const char **throttling_group, ThrottleConfig *throttle_cfg,
    BlockdevDetectZeroesOptions *detect_zeroes, Error **errp)
 {
-    const char *discard;
    Error *local_error = NULL;
    const char *aio;

@@ -365,13 +364,6 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags,
            *bdrv_flags |= BDRV_O_COPY_ON_READ;
        }

-        if ((discard = qemu_opt_get(opts, "discard")) != NULL) {
-            if (bdrv_parse_discard_flags(discard, bdrv_flags) != 0) {
-                error_setg(errp, "Invalid discard option");
-                return;
-            }
-        }
-
        if ((aio = qemu_opt_get(opts, "aio")) != NULL) {
            if (!strcmp(aio, "native")) {
                *bdrv_flags |= BDRV_O_NATIVE_AIO;
@@ -449,15 +441,6 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags,
            error_propagate(errp, local_error);
            return;
        }
-
-        if (bdrv_flags &&
-            *detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
-            !(*bdrv_flags & BDRV_O_UNMAP))
-        {
-            error_setg(errp, "setting detect-zeroes to unmap is not allowed "
-                             "without setting discard operation to unmap");
-            return;
-        }
    }
 }

@@ -650,35 +633,11 @@ err_no_opts:
    return NULL;
 }

-static QemuOptsList qemu_root_bds_opts;
-
 /* Takes the ownership of bs_opts */
 static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
 {
-    BlockDriverState *bs;
-    QemuOpts *opts;
-    Error *local_error = NULL;
-    BlockdevDetectZeroesOptions detect_zeroes;
    int bdrv_flags = 0;

-    opts = qemu_opts_create(&qemu_root_bds_opts, NULL, 1, errp);
-    if (!opts) {
-        goto fail;
-    }
-
-    qemu_opts_absorb_qdict(opts, bs_opts, &local_error);
-    if (local_error) {
-        error_propagate(errp, local_error);
-        goto fail;
-    }
-
-    extract_common_blockdev_options(opts, &bdrv_flags, NULL, NULL,
-                                    &detect_zeroes, &local_error);
-    if (local_error) {
-        error_propagate(errp, local_error);
-        goto fail;
-    }
-
    /* bdrv_open() defaults to the values in bdrv_flags (for compatibility
     * with other callers) rather than what we want as the real defaults.
     * Apply the defaults here instead. */
@@ -690,21 +649,7 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
        bdrv_flags |= BDRV_O_INACTIVE;
    }

-    bs = bdrv_open(NULL, NULL, bs_opts, bdrv_flags, errp);
-    if (!bs) {
-        goto fail_no_bs_opts;
-    }
-
-    bs->detect_zeroes = detect_zeroes;
-
-fail_no_bs_opts:
-    qemu_opts_del(opts);
-    return bs;
-
-fail:
-    qemu_opts_del(opts);
-    QDECREF(bs_opts);
-    return NULL;
+    return bdrv_open(NULL, NULL, bs_opts, bdrv_flags, errp);
 }

 void blockdev_close_all_bdrv_states(void)
@@ -1866,7 +1811,7 @@ typedef struct DriveBackupState {
    BlockJob *job;
 } DriveBackupState;

-static void do_drive_backup(DriveBackup *backup, BlockJobTxn *txn,
+static BlockJob *do_drive_backup(DriveBackup *backup, BlockJobTxn *txn,
                            Error **errp);

 static void drive_backup_prepare(BlkActionState *common, Error **errp)
@@ -1890,23 +1835,26 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp)
    bdrv_drained_begin(bs);
    state->bs = bs;

-    do_drive_backup(backup, common->block_job_txn, &local_err);
+    state->job = do_drive_backup(backup, common->block_job_txn, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }
+}

-    state->job = state->bs->job;
+static void drive_backup_commit(BlkActionState *common)
+{
+    DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);
+    assert(state->job);
+    block_job_start(state->job);
 }

 static void drive_backup_abort(BlkActionState *common)
 {
    DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);
-    BlockDriverState *bs = state->bs;

-    /* Only cancel if it's the job we started */
-    if (bs && bs->job && bs->job == state->job) {
-        block_job_cancel_sync(bs->job);
+    if (state->job) {
+        block_job_cancel_sync(state->job);
    }
 }

@@ -1927,8 +1875,8 @@ typedef struct BlockdevBackupState {
    AioContext *aio_context;
 } BlockdevBackupState;

-static void do_blockdev_backup(BlockdevBackup *backup, BlockJobTxn *txn,
-                               Error **errp);
+static BlockJob *do_blockdev_backup(BlockdevBackup *backup, BlockJobTxn *txn,
+                                    Error **errp);

 static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
 {
@@ -1961,23 +1909,26 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
    state->bs = bs;
    bdrv_drained_begin(state->bs);

-    do_blockdev_backup(backup, common->block_job_txn, &local_err);
+    state->job = do_blockdev_backup(backup, common->block_job_txn, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }
+}

-    state->job = state->bs->job;
+static void blockdev_backup_commit(BlkActionState *common)
+{
+    BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common);
+    assert(state->job);
+    block_job_start(state->job);
 }

 static void blockdev_backup_abort(BlkActionState *common)
 {
    BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common);
-    BlockDriverState *bs = state->bs;

-    /* Only cancel if it's the job we started */
-    if (bs && bs->job && bs->job == state->job) {
-        block_job_cancel_sync(bs->job);
+    if (state->job) {
+        block_job_cancel_sync(state->job);
    }
 }

@@ -2127,12 +2078,14 @@ static const BlkActionOps actions[] = {
    [TRANSACTION_ACTION_KIND_DRIVE_BACKUP] = {
        .instance_size = sizeof(DriveBackupState),
        .prepare = drive_backup_prepare,
+        .commit = drive_backup_commit,
        .abort = drive_backup_abort,
        .clean = drive_backup_clean,
    },
    [TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP] = {
        .instance_size = sizeof(BlockdevBackupState),
        .prepare = blockdev_backup_prepare,
+        .commit = blockdev_backup_commit,
        .abort = blockdev_backup_abort,
        .clean = blockdev_backup_clean,
    },
@@ -2549,6 +2502,7 @@ void qmp_blockdev_change_medium(bool has_device, const char *device,
    BlockBackend *blk;
    BlockDriverState *medium_bs = NULL;
    int bdrv_flags;
+    bool detect_zeroes;
    int rc;
    QDict *options = NULL;
    Error *err = NULL;
@@ -2588,8 +2542,12 @@ void qmp_blockdev_change_medium(bool has_device, const char *device,
        abort();
    }

+    options = qdict_new();
+    detect_zeroes = blk_get_detect_zeroes_from_root_state(blk);
+    qdict_put(options, "detect-zeroes",
+              qstring_from_str(detect_zeroes ? "on" : "off"));
+
    if (has_format) {
-        options = qdict_new();
        qdict_put(options, "driver", qstring_from_str(format));
    }

@@ -2614,7 +2572,7 @@ void qmp_blockdev_change_medium(bool has_device, const char *device,
    error_free(err);
    err = NULL;

-    qmp_x_blockdev_remove_medium(has_device, device, has_id, id, errp);
+    qmp_x_blockdev_remove_medium(has_device, device, has_id, id, &err);
    if (err) {
        error_propagate(errp, err);
        goto fail;
@@ -2626,8 +2584,6 @@ void qmp_blockdev_change_medium(bool has_device, const char *device,
        goto fail;
    }

-    blk_apply_root_state(blk, medium_bs);
-
    qmp_blockdev_close_tray(has_device, device, has_id, id, errp);

 fail:
@@ -2957,39 +2913,15 @@ out:
    aio_context_release(aio_context);
 }

-static void block_job_cb(void *opaque, int ret)
-{
-    /* Note that this function may be executed from another AioContext besides
-     * the QEMU main loop.  If you need to access anything that assumes the
-     * QEMU global mutex, use a BH or introduce a mutex.
-     */
-
-    BlockDriverState *bs = opaque;
-    const char *msg = NULL;
-
-    trace_block_job_cb(bs, bs->job, ret);
-
-    assert(bs->job);
-
-    if (ret < 0) {
-        msg = strerror(-ret);
-    }
-
-    if (block_job_is_cancelled(bs->job)) {
-        block_job_event_cancelled(bs->job);
-    } else {
-        block_job_event_completed(bs->job, msg);
-    }
-}
-
 void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
                      bool has_base, const char *base,
+                      bool has_base_node, const char *base_node,
                      bool has_backing_file, const char *backing_file,
                      bool has_speed, int64_t speed,
                      bool has_on_error, BlockdevOnError on_error,
                      Error **errp)
 {
-    BlockDriverState *bs;
+    BlockDriverState *bs, *iter;
    BlockDriverState *base_bs = NULL;
    AioContext *aio_context;
    Error *local_err = NULL;
@@ -2999,7 +2931,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
        on_error = BLOCKDEV_ON_ERROR_REPORT;
    }

-    bs = qmp_get_root_bs(device, errp);
+    bs = bdrv_lookup_bs(device, device, errp);
    if (!bs) {
        return;
    }
@@ -3007,7 +2939,9 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
    aio_context = bdrv_get_aio_context(bs);
    aio_context_acquire(aio_context);

-    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_STREAM, errp)) {
+    if (has_base && has_base_node) {
+        error_setg(errp, "'base' and 'base-node' cannot be specified "
+                   "at the same time");
        goto out;
    }

@@ -3021,6 +2955,27 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
        base_name = base;
    }

+    if (has_base_node) {
+        base_bs = bdrv_lookup_bs(NULL, base_node, errp);
+        if (!base_bs) {
+            goto out;
+        }
+        if (bs == base_bs || !bdrv_chain_contains(bs, base_bs)) {
+            error_setg(errp, "Node '%s' is not a backing image of '%s'",
+                       base_node, device);
+            goto out;
+        }
+        assert(bdrv_get_aio_context(base_bs) == aio_context);
+        base_name = base_bs->filename;
+    }
+
+    /* Check for op blockers in the whole chain between bs and base */
+    for (iter = bs; iter && iter != base_bs; iter = backing_bs(iter)) {
+        if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_STREAM, errp)) {
+            goto out;
+        }
+    }
+
    /* if we are streaming the entire chain, the result will have no backing
     * file, and specifying one is therefore an error */
    if (base_bs == NULL && has_backing_file) {
@@ -3033,7 +2988,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
    base_name = has_backing_file ? backing_file : base_name;

    stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name,
-                 has_speed ? speed : 0, on_error, block_job_cb, bs, &local_err);
+                 has_speed ? speed : 0, on_error, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto out;
@@ -3053,6 +3008,7 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
                      Error **errp)
 {
    BlockDriverState *bs;
+    BlockDriverState *iter;
    BlockDriverState *base_bs, *top_bs;
    AioContext *aio_context;
    Error *local_err = NULL;
@@ -3119,8 +3075,10 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,

    assert(bdrv_get_aio_context(base_bs) == aio_context);

-    if (bdrv_op_is_blocked(base_bs, BLOCK_OP_TYPE_COMMIT_TARGET, errp)) {
-        goto out;
+    for (iter = top_bs; iter != backing_bs(base_bs); iter = backing_bs(iter)) {
+        if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_COMMIT_TARGET, errp)) {
+            goto out;
+        }
    }

    /* Do not allow attempts to commit an image into itself */
@@ -3135,12 +3093,17 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
                             " but 'top' is the active layer");
            goto out;
        }
-        commit_active_start(has_job_id ? job_id : NULL, bs, base_bs, speed,
-                            on_error, block_job_cb, bs, &local_err, false);
+        commit_active_start(has_job_id ? job_id : NULL, bs, base_bs,
+                            BLOCK_JOB_DEFAULT, speed, on_error, NULL, NULL,
+                            &local_err, false);
    } else {
+        BlockDriverState *overlay_bs = bdrv_find_overlay(bs, top_bs);
+        if (bdrv_op_is_blocked(overlay_bs, BLOCK_OP_TYPE_COMMIT_TARGET, errp)) {
+            goto out;
+        }
        commit_start(has_job_id ? job_id : NULL, bs, base_bs, top_bs, speed,
-                     on_error, block_job_cb, bs,
-                     has_backing_file ? backing_file : NULL, &local_err);
+                     on_error, has_backing_file ? backing_file : NULL,
+                     &local_err);
    }
    if (local_err != NULL) {
        error_propagate(errp, local_err);
@@ -3151,11 +3114,13 @@ out:
    aio_context_release(aio_context);
 }

-static void do_drive_backup(DriveBackup *backup, BlockJobTxn *txn, Error **errp)
+static BlockJob *do_drive_backup(DriveBackup *backup, BlockJobTxn *txn,
+                                 Error **errp)
 {
    BlockDriverState *bs;
    BlockDriverState *target_bs;
    BlockDriverState *source = NULL;
+    BlockJob *job = NULL;
    BdrvDirtyBitmap *bmap = NULL;
    AioContext *aio_context;
    QDict *options = NULL;
@@ -3184,7 +3149,7 @@ static void do_drive_backup(DriveBackup *backup, BlockJobTxn *txn, Error **errp)

    bs = qmp_get_root_bs(backup->device, errp);
    if (!bs) {
-        return;
+        return NULL;
    }

    aio_context = bdrv_get_aio_context(bs);
@@ -3258,9 +3223,10 @@ static void do_drive_backup(DriveBackup *backup, BlockJobTxn *txn, Error **errp)
        }
    }

-    backup_start(backup->job_id, bs, target_bs, backup->speed, backup->sync,
-                 bmap, backup->compress, backup->on_source_error,
-                 backup->on_target_error, block_job_cb, bs, txn, &local_err);
+    job = backup_job_create(backup->job_id, bs, target_bs, backup->speed,
+                            backup->sync, bmap, backup->compress,
+                            backup->on_source_error, backup->on_target_error,
+                            BLOCK_JOB_DEFAULT, NULL, NULL, txn, &local_err);
    bdrv_unref(target_bs);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
@@ -3269,11 +3235,17 @@ static void do_drive_backup(DriveBackup *backup, BlockJobTxn *txn, Error **errp)

 out:
    aio_context_release(aio_context);
+    return job;
 }

 void qmp_drive_backup(DriveBackup *arg, Error **errp)
 {
-    return do_drive_backup(arg, NULL, errp);
+    /* Create the job without a transaction; start it if one was returned. */
+    BlockJob *job;
+    job = do_drive_backup(arg, NULL, errp);
+    if (job) {
+        block_job_start(job);
+    }
 }

 BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp)
@@ -3281,12 +3253,14 @@ BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp)
    return bdrv_named_nodes_list(errp);
 }

-void do_blockdev_backup(BlockdevBackup *backup, BlockJobTxn *txn, Error **errp)
+BlockJob *do_blockdev_backup(BlockdevBackup *backup, BlockJobTxn *txn,
+                             Error **errp)
 {
    BlockDriverState *bs;
    BlockDriverState *target_bs;
    Error *local_err = NULL;
    AioContext *aio_context;
+    BlockJob *job = NULL;

    if (!backup->has_speed) {
        backup->speed = 0;
@@ -3306,7 +3280,7 @@ void do_blockdev_backup(BlockdevBackup *backup, BlockJobTxn *txn, Error **errp)

    bs = qmp_get_root_bs(backup->device, errp);
    if (!bs) {
-        return;
+        return NULL;
    }

    aio_context = bdrv_get_aio_context(bs);
@@ -3328,19 +3302,25 @@ void do_blockdev_backup(BlockdevBackup *backup, BlockJobTxn *txn, Error **errp)
            goto out;
        }
    }
-    backup_start(backup->job_id, bs, target_bs, backup->speed, backup->sync,
-                 NULL, backup->compress, backup->on_source_error,
-                 backup->on_target_error, block_job_cb, bs, txn, &local_err);
+    job = backup_job_create(backup->job_id, bs, target_bs, backup->speed,
+                            backup->sync, NULL, backup->compress,
+                            backup->on_source_error, backup->on_target_error,
+                            BLOCK_JOB_DEFAULT, NULL, NULL, txn, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
 out:
    aio_context_release(aio_context);
+    return job;
 }

 void qmp_blockdev_backup(BlockdevBackup *arg, Error **errp)
 {
-    do_blockdev_backup(arg, NULL, errp);
+    BlockJob *job;
+    job = do_blockdev_backup(arg, NULL, errp); /* NULL: no transaction */
+    if (job) {
+        block_job_start(job); /* run the job that was just created */
+    }
 }

 /* Parameter check and block job starting for drive mirroring.
@@ -3409,8 +3389,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
    mirror_start(job_id, bs, target,
                 has_replaces ? replaces : NULL,
                 speed, granularity, buf_size, sync, backing_mode,
-                 on_source_error, on_target_error, unmap,
-                 block_job_cb, bs, errp);
+                 on_source_error, on_target_error, unmap, errp);
 }

 void qmp_drive_mirror(DriveMirror *arg, Error **errp)
@@ -3654,7 +3633,7 @@ void qmp_block_job_cancel(const char *device,
        force = false;
    }

-    if (job->user_paused && !force) {
+    if (block_job_user_paused(job) && !force) {
        error_setg(errp, "The block job for device '%s' is currently paused",
                   device);
        goto out;
@@ -3671,13 +3650,12 @@ void qmp_block_job_pause(const char *device, Error **errp)
    AioContext *aio_context;
    BlockJob *job = find_block_job(device, &aio_context, errp);

-    if (!job || job->user_paused) {
+    if (!job || block_job_user_paused(job)) {
        return;
    }

-    job->user_paused = true;
    trace_qmp_block_job_pause(job);
-    block_job_pause(job);
+    block_job_user_pause(job);
    aio_context_release(aio_context);
 }

@@ -3686,14 +3664,13 @@ void qmp_block_job_resume(const char *device, Error **errp)
    AioContext *aio_context;
    BlockJob *job = find_block_job(device, &aio_context, errp);

-    if (!job || !job->user_paused) {
+    if (!job || !block_job_user_paused(job)) {
        return;
    }

-    job->user_paused = false;
    trace_qmp_block_job_resume(job);
    block_job_iostatus_reset(job);
-    block_job_resume(job);
+    block_job_user_resume(job);
    aio_context_release(aio_context);
 }

@@ -3828,25 +3805,10 @@ void qmp_blockdev_add(BlockdevOptions *options, Error **errp)
 {
    BlockDriverState *bs;
    QObject *obj;
-    Visitor *v = qmp_output_visitor_new(&obj);
+    Visitor *v = qobject_output_visitor_new(&obj);
    QDict *qdict;
    Error *local_err = NULL;

-    /* TODO Sort it out in raw-posix and drive_new(): Reject aio=native with
-     * cache.direct=false instead of silently switching to aio=threads, except
-     * when called from drive_new().
-     *
-     * For now, simply forbidding the combination for all drivers will do. */
-    if (options->has_aio && options->aio == BLOCKDEV_AIO_OPTIONS_NATIVE) {
-        bool direct = options->has_cache &&
-                      options->cache->has_direct &&
-                      options->cache->direct;
-        if (!direct) {
-            error_setg(errp, "aio=native requires cache.direct=true");
-            goto fail;
-        }
-    }
-
    visit_type_BlockdevOptions(v, NULL, &options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
@@ -3982,13 +3944,22 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
    BlockJob *job;

    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
-        BlockJobInfoList *elem = g_new0(BlockJobInfoList, 1);
-        AioContext *aio_context = blk_get_aio_context(job->blk);
+        BlockJobInfoList *elem;
+        AioContext *aio_context;

+        if (block_job_is_internal(job)) {
+            continue;
+        }
+        elem = g_new0(BlockJobInfoList, 1);
+        aio_context = blk_get_aio_context(job->blk);
        aio_context_acquire(aio_context);
-        elem->value = block_job_query(job);
+        elem->value = block_job_query(job, errp);
        aio_context_release(aio_context);
-
+        if (!elem->value) {
+            g_free(elem);
+            qapi_free_BlockJobInfoList(head);
+            return NULL;
+        }
        *p_next = elem;
        p_next = &elem->next;
    }
@@ -4004,10 +3975,6 @@ QemuOptsList qemu_common_drive_opts = {
            .name = "snapshot",
            .type = QEMU_OPT_BOOL,
            .help = "enable/disable snapshot mode",
-        },{
-            .name = "discard",
-            .type = QEMU_OPT_STRING,
-            .help = "discard operation (ignore/off, unmap/on)",
        },{
            .name = "aio",
            .type = QEMU_OPT_STRING,
@@ -4135,31 +4102,6 @@ QemuOptsList qemu_common_drive_opts = {
    },
 };

-static QemuOptsList qemu_root_bds_opts = {
-    .name = "root-bds",
-    .head = QTAILQ_HEAD_INITIALIZER(qemu_root_bds_opts.head),
-    .desc = {
-        {
-            .name = "discard",
-            .type = QEMU_OPT_STRING,
-            .help = "discard operation (ignore/off, unmap/on)",
-        },{
-            .name = "aio",
-            .type = QEMU_OPT_STRING,
-            .help = "host AIO implementation (threads, native)",
-        },{
-            .name = "copy-on-read",
-            .type = QEMU_OPT_BOOL,
-            .help = "copy read data from backing file into image file",
-        },{
-            .name = "detect-zeroes",
-            .type = QEMU_OPT_STRING,
-            .help = "try to optimize zero writes (off, on, unmap)",
-        },
-        { /* end of list */ }
-    },
-};
-
 QemuOptsList qemu_drive_opts = {
    .name = "drive",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_drive_opts.head),
--- a/blockjob.c
+++ b/blockjob.c
@@ -27,7 +27,7 @@
 #include "qemu-common.h"
 #include "trace.h"
 #include "block/block.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qapi/qmp/qerror.h"
@@ -38,6 +38,9 @@
 #include "qemu/timer.h"
 #include "qapi-event.h"

+static void block_job_event_cancelled(BlockJob *job);
+static void block_job_event_completed(BlockJob *job, const char *msg);
+
 /* Transactional group of block jobs */
 struct BlockJobTxn {

@@ -66,7 +69,7 @@ BlockJob *block_job_get(const char *id)
    BlockJob *job;

    QLIST_FOREACH(job, &block_jobs, job_list) {
-        if (!strcmp(id, job->id)) {
+        if (job->id && !strcmp(id, job->id)) {
            return job;
        }
    }
@@ -74,17 +77,6 @@ BlockJob *block_job_get(const char *id)
    return NULL;
 }

-/* Normally the job runs in its BlockBackend's AioContext.  The exception is
- * block_job_defer_to_main_loop() where it runs in the QEMU main loop.  Code
- * that supports both cases uses this helper function.
- */
-static AioContext *block_job_get_aio_context(BlockJob *job)
-{
-    return job->deferred_to_main_loop ?
-           qemu_get_aio_context() :
-           blk_get_aio_context(job->blk);
-}
-
 static void block_job_attached_aio_context(AioContext *new_context,
                                           void *opaque)
 {
@@ -97,6 +89,17 @@ static void block_job_attached_aio_context(AioContext *new_context,
    block_job_resume(job);
 }

+static void block_job_drain(BlockJob *job)
+{
+    /* If job is !job->busy this kicks it into the next pause point. */
+    block_job_enter(job);
+
+    blk_drain(job->blk); /* drain the job's own BlockBackend */
+    if (job->driver->drain) { /* the driver's drain callback is optional */
+        job->driver->drain(job);
+    }
+}
+
 static void block_job_detach_aio_context(void *opaque)
 {
    BlockJob *job = opaque;
@@ -106,31 +109,33 @@ static void block_job_detach_aio_context(void *opaque)

    block_job_pause(job);

-    if (!job->paused) {
-        /* If job is !job->busy this kicks it into the next pause point. */
-        block_job_enter(job);
-    }
    while (!job->paused && !job->completed) {
-        aio_poll(block_job_get_aio_context(job), true);
+        block_job_drain(job);
    }

    block_job_unref(job);
 }

+void block_job_add_bdrv(BlockJob *job, BlockDriverState *bs)
+{
+    job->nodes = g_slist_prepend(job->nodes, bs); /* track the node */
+    bdrv_ref(bs); /* released in block_job_unref() */
+    bdrv_op_block_all(bs, job->blocker); /* undone in block_job_unref() */
+}
+
 void *block_job_create(const char *job_id, const BlockJobDriver *driver,
-                       BlockDriverState *bs, int64_t speed,
+                       BlockDriverState *bs, int64_t speed, int flags,
                       BlockCompletionFunc *cb, void *opaque, Error **errp)
 {
    BlockBackend *blk;
    BlockJob *job;

-    assert(cb);
    if (bs->job) {
        error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }

-    if (job_id == NULL) {
+    if (job_id == NULL && !(flags & BLOCK_JOB_INTERNAL)) {
        job_id = bdrv_get_device_name(bs);
        if (!*job_id) {
            error_setg(errp, "An explicit job ID is required for this node");
@@ -138,14 +143,21 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
        }
    }

-    if (!id_wellformed(job_id)) {
-        error_setg(errp, "Invalid job ID '%s'", job_id);
-        return NULL;
-    }
+    if (job_id) {
+        if (flags & BLOCK_JOB_INTERNAL) {
+            error_setg(errp, "Cannot specify job ID for internal block job");
+            return NULL;
+        }

-    if (block_job_get(job_id)) {
-        error_setg(errp, "Job ID '%s' already in use", job_id);
-        return NULL;
+        if (!id_wellformed(job_id)) {
+            error_setg(errp, "Invalid job ID '%s'", job_id);
+            return NULL;
+        }
+
+        if (block_job_get(job_id)) {
+            error_setg(errp, "Job ID '%s' already in use", job_id);
+            return NULL;
+        }
    }

    blk = blk_new();
@@ -154,7 +166,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
    job = g_malloc0(driver->instance_size);
    error_setg(&job->blocker, "block device is in use by block job: %s",
               BlockJobType_lookup[driver->job_type]);
-    bdrv_op_block_all(bs, job->blocker);
+    block_job_add_bdrv(job, bs);
    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);

    job->driver        = driver;
@@ -162,7 +174,9 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
    job->blk           = blk;
    job->cb            = cb;
    job->opaque        = opaque;
-    job->busy          = true;
+    job->busy          = false;
+    job->paused        = true;
+    job->pause_count   = 1;
    job->refcnt        = 1;
    bs->job = job;

@@ -185,6 +199,28 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
    return job;
 }

+bool block_job_is_internal(BlockJob *job)
+{
+    return !job->id; /* internal jobs are created without an ID */
+}
+
+static bool block_job_started(BlockJob *job)
+{
+    return job->co != NULL; /* co is set by block_job_start() */
+}
+
+void block_job_start(BlockJob *job)
+{
+    assert(job && !block_job_started(job) && job->paused &&
+           !job->busy && job->driver->start); /* fresh, unstarted job */
+    job->co = qemu_coroutine_create(job->driver->start, job);
+    if (--job->pause_count == 0) { /* undo the pause set at creation */
+        job->paused = false;
+        job->busy = true;
+        qemu_coroutine_enter(job->co);
+    } /* else: stays paused; coroutine is created but not entered */
+}
+
 void block_job_ref(BlockJob *job)
 {
    ++job->refcnt;
@@ -193,9 +229,15 @@ void block_job_ref(BlockJob *job)
 void block_job_unref(BlockJob *job)
 {
    if (--job->refcnt == 0) {
+        GSList *l;
        BlockDriverState *bs = blk_bs(job->blk);
        bs->job = NULL;
-        bdrv_op_unblock_all(bs, job->blocker);
+        for (l = job->nodes; l; l = l->next) {
+            bs = l->data;
+            bdrv_op_unblock_all(bs, job->blocker);
+            bdrv_unref(bs);
+        }
+        g_slist_free(job->nodes);
        blk_remove_aio_context_notifier(job->blk,
                                        block_job_attached_aio_context,
                                        block_job_detach_aio_context, job);
@@ -218,8 +260,29 @@ static void block_job_completed_single(BlockJob *job)
            job->driver->abort(job);
        }
    }
-    job->cb(job->opaque, job->ret);
+    if (job->driver->clean) {
+        job->driver->clean(job);
+    }
+
+    if (job->cb) {
+        job->cb(job->opaque, job->ret);
+    }
+
+    /* Emit events only if we actually started */
+    if (block_job_started(job)) {
+        if (block_job_is_cancelled(job)) {
+            block_job_event_cancelled(job);
+        } else {
+            const char *msg = NULL;
+            if (job->ret < 0) {
+                msg = strerror(-job->ret);
+            }
+            block_job_event_completed(job, msg);
+        }
+    }
+
    if (job->txn) {
+        QLIST_REMOVE(job, txn_list);
        block_job_txn_unref(job->txn);
    }
    block_job_unref(job);
@@ -321,7 +384,10 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)

 void block_job_complete(BlockJob *job, Error **errp)
 {
-    if (job->pause_count || job->cancelled || !job->driver->complete) {
+    /* Should not be reachable via external interface for internal jobs */
+    assert(job->id);
+    if (job->pause_count || job->cancelled ||
+        !block_job_started(job) || !job->driver->complete) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
@@ -335,13 +401,26 @@ void block_job_pause(BlockJob *job)
    job->pause_count++;
 }

+void block_job_user_pause(BlockJob *job)
+{
+    job->user_paused = true; /* mark this pause as user-visible */
+    block_job_pause(job); /* then take the regular pause reference */
+}
+
 static bool block_job_should_pause(BlockJob *job)
 {
    return job->pause_count > 0;
 }

+bool block_job_user_paused(BlockJob *job)
+{
+    return job ? job->user_paused : false; /* NULL job counts as not paused */
+}
+
 void coroutine_fn block_job_pause_point(BlockJob *job)
 {
+    assert(job && block_job_started(job));
+
    if (!block_job_should_pause(job)) {
        return;
    }
@@ -376,6 +455,14 @@ void block_job_resume(BlockJob *job)
    block_job_enter(job);
 }

+void block_job_user_resume(BlockJob *job)
+{
+    if (job && job->user_paused && job->pause_count > 0) {
+        job->user_paused = false; /* clear the flag before resuming */
+        block_job_resume(job);
+    } /* otherwise there is nothing to resume */
+}
+
 void block_job_enter(BlockJob *job)
 {
    if (job->co && !job->busy) {
@@ -385,9 +472,13 @@ void block_job_enter(BlockJob *job)

 void block_job_cancel(BlockJob *job)
 {
-    job->cancelled = true;
-    block_job_iostatus_reset(job);
-    block_job_enter(job);
+    if (block_job_started(job)) { /* started: flag it and re-enter the job */
+        job->cancelled = true;
+        block_job_iostatus_reset(job);
+        block_job_enter(job);
+    } else { /* never started: complete it directly as cancelled */
+        block_job_completed(job, -ECANCELED);
+    }
 }

 bool block_job_is_cancelled(BlockJob *job)
@@ -413,14 +504,21 @@ static int block_job_finish_sync(BlockJob *job,
    assert(blk_bs(job->blk)->job == job);

    block_job_ref(job);
+
    finish(job, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        block_job_unref(job);
        return -EBUSY;
    }
+    /* block_job_drain calls block_job_enter, and it should be enough to
+     * induce progress until the job completes or moves to the main thread.
+     */
+    while (!job->deferred_to_main_loop && !job->completed) {
+        block_job_drain(job);
+    }
    while (!job->completed) {
-        aio_poll(block_job_get_aio_context(job), true);
+        aio_poll(qemu_get_aio_context(), true);
    }
    ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
    block_job_unref(job);
@@ -494,9 +592,15 @@ void block_job_yield(BlockJob *job)
    block_job_pause_point(job);
 }

-BlockJobInfo *block_job_query(BlockJob *job)
+BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
 {
-    BlockJobInfo *info = g_new0(BlockJobInfo, 1);
+    BlockJobInfo *info;
+
+    if (block_job_is_internal(job)) {
+        error_setg(errp, "Cannot query QEMU internal jobs");
+        return NULL;
+    }
+    info = g_new0(BlockJobInfo, 1);
    info->type      = g_strdup(BlockJobType_lookup[job->driver->job_type]);
    info->device    = g_strdup(job->id);
    info->len       = job->len;
@@ -517,8 +621,12 @@ static void block_job_iostatus_set_err(BlockJob *job, int error)
    }
 }

-void block_job_event_cancelled(BlockJob *job)
+static void block_job_event_cancelled(BlockJob *job)
 {
+    if (block_job_is_internal(job)) {
+        return;
+    }
+
    qapi_event_send_block_job_cancelled(job->driver->job_type,
                                        job->id,
                                        job->len,
@@ -527,8 +635,12 @@ void block_job_event_cancelled(BlockJob *job)
                                        &error_abort);
 }

-void block_job_event_completed(BlockJob *job, const char *msg)
+static void block_job_event_completed(BlockJob *job, const char *msg)
 {
+    if (block_job_is_internal(job)) {
+        return;
+    }
+
    qapi_event_send_block_job_completed(job->driver->job_type,
                                        job->id,
                                        job->len,
@@ -543,6 +655,10 @@ void block_job_event_ready(BlockJob *job)
 {
    job->ready = true;

+    if (block_job_is_internal(job)) {
+        return;
+    }
+
    qapi_event_send_block_job_ready(job->driver->job_type,
                                    job->id,
                                    job->len,
@@ -573,14 +689,15 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
    default:
        abort();
    }
-    qapi_event_send_block_job_error(job->id,
-                                    is_read ? IO_OPERATION_TYPE_READ :
-                                    IO_OPERATION_TYPE_WRITE,
-                                    action, &error_abort);
+    if (!block_job_is_internal(job)) {
+        qapi_event_send_block_job_error(job->id,
+                                        is_read ? IO_OPERATION_TYPE_READ :
+                                        IO_OPERATION_TYPE_WRITE,
+                                        action, &error_abort);
+    }
    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* make the pause user visible, which will be resumed from QMP. */
-        job->user_paused = true;
-        block_job_pause(job);
+        block_job_user_pause(job);
        block_job_iostatus_set_err(job, error);
    }
    return action;
@@ -588,7 +705,6 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,

 typedef struct {
    BlockJob *job;
-    QEMUBH *bh;
    AioContext *aio_context;
    BlockJobDeferToMainLoopFn *fn;
    void *opaque;
@@ -599,8 +715,6 @@ static void block_job_defer_to_main_loop_bh(void *opaque)
    BlockJobDeferToMainLoopData *data = opaque;
    AioContext *aio_context;

-    qemu_bh_delete(data->bh);
-
    /* Prevent race with block_job_defer_to_main_loop() */
    aio_context_acquire(data->aio_context);

@@ -624,13 +738,13 @@ void block_job_defer_to_main_loop(BlockJob *job,
 {
    BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data));
    data->job = job;
-    data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data);
    data->aio_context = blk_get_aio_context(job->blk);
    data->fn = fn;
    data->opaque = opaque;
    job->deferred_to_main_loop = true;

-    qemu_bh_schedule(data->bh);
+    aio_bh_schedule_oneshot(qemu_get_aio_context(),
+                            block_job_defer_to_main_loop_bh, data);
 }

 BlockJobTxn *block_job_txn_new(void)
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -67,23 +67,6 @@ int cpu_get_pic_interrupt(CPUX86State *env)
 }
 #endif

-/* These are no-ops because we are not threadsafe.  */
-static inline void cpu_exec_start(CPUArchState *env)
-{
-}
-
-static inline void cpu_exec_end(CPUArchState *env)
-{
-}
-
-static inline void start_exclusive(void)
-{
-}
-
-static inline void end_exclusive(void)
-{
-}
-
 void fork_start(void)
 {
 }
@@ -95,14 +78,6 @@ void fork_end(int child)
    }
 }

-void cpu_list_lock(void)
-{
-}
-
-void cpu_list_unlock(void)
-{
-}
-
 #ifdef TARGET_I386
 /***********************************************************/
 /* CPUX86 core interface */
@@ -172,7 +147,11 @@ void cpu_loop(CPUX86State *env)
    //target_siginfo_t info;

    for(;;) {
+        cpu_exec_start(cs);
        trapnr = cpu_exec(cs);
+        cpu_exec_end(cs);
+        process_queued_cpu_work(cs);
+
        switch(trapnr) {
        case 0x80:
            /* syscall from int $0x80 */
@@ -513,7 +492,10 @@ void cpu_loop(CPUSPARCState *env)
    //target_siginfo_t info;

    while (1) {
+        cpu_exec_start(cs);
        trapnr = cpu_exec(cs);
+        cpu_exec_end(cs);
+        process_queued_cpu_work(cs);

        switch (trapnr) {
 #ifndef TARGET_SPARC64
@@ -669,7 +651,7 @@ void cpu_loop(CPUSPARCState *env)
 static void usage(void)
 {
    printf("qemu-" TARGET_NAME " version " QEMU_VERSION QEMU_PKGVERSION
-           ", " QEMU_COPYRIGHT "\n"
+           "\n" QEMU_COPYRIGHT "\n"
           "usage: qemu-" TARGET_NAME " [options] program [arguments...]\n"
           "BSD CPU emulator (compiled for %s emulation)\n"
           "\n"
@@ -713,6 +695,16 @@ static void usage(void)

 THREAD CPUState *thread_cpu;

+bool qemu_cpu_is_self(CPUState *cpu)
+{
+    return cpu == thread_cpu; /* compare with this file's thread_cpu */
+}
+
+void qemu_cpu_kick(CPUState *cpu)
+{
+    cpu_exit(cpu); /* in bsd-user a kick is just a cpu_exit() request */
+}
+
 /* Assumes contents are already zeroed.  */
 void init_task_state(TaskState *ts)
 {
@@ -748,6 +740,8 @@ int main(int argc, char **argv)
    if (argc <= 1)
        usage();

+    module_call_init(MODULE_INIT_TRACE);
+    qemu_init_cpu_list();
    module_call_init(MODULE_INIT_QOM);

    if ((envlist = envlist_create()) == NULL) {
@@ -1133,7 +1127,6 @@ int main(int argc, char **argv)
        gdbserver_start (gdbstub_port);
        gdb_handlesig(cpu, 0);
    }
-    trace_init_vcpu_events();
    cpu_loop(env);
    /* never exits */
    return 0;
--- a/bsd-user/mmap.c
+++ b/bsd-user/mmap.c
@@ -42,6 +42,11 @@ void mmap_unlock(void)
    }
 }

+bool have_mmap_lock(void)
+{
+    return mmap_lock_count > 0; /* true while mmap_lock_count is positive */
+}
+
 /* Grab lock to make sure things are in a consistent state after fork().  */
 void mmap_fork_start(void)
 {
--- a/configure
+++ b/configure
@@ -230,6 +230,7 @@ vhost_net="no"
 vhost_scsi="no"
 vhost_vsock="no"
 kvm="no"
+colo="yes"
 rdma=""
 gprof="no"
 debug_tcg="no"
@@ -296,6 +297,7 @@ libiscsi=""
 libnfs=""
 coroutine=""
 coroutine_pool=""
+debug_stack_usage="no"
 seccomp=""
 glusterfs=""
 glusterfs_xlator_opt="no"
@@ -580,6 +582,8 @@ FreeBSD)
  audio_possible_drivers="oss sdl pa"
  # needed for kinfo_getvmmap(3) in libutil.h
  LIBS="-lutil $LIBS"
+  # needed for kinfo_getproc
+  libs_qga="-lutil $libs_qga"
  netmap=""  # enable netmap autodetect
  HOST_VARIANT_DIR="freebsd"
 ;;
@@ -917,6 +921,10 @@ for opt do
  ;;
  --enable-kvm) kvm="yes"
  ;;
+  --disable-colo) colo="no"
+  ;;
+  --enable-colo) colo="yes"
+  ;;
  --disable-tcg-interpreter) tcg_interpreter="no"
  ;;
  --enable-tcg-interpreter) tcg_interpreter="yes"
@@ -1004,6 +1012,8 @@ for opt do
  ;;
  --enable-coroutine-pool) coroutine_pool="yes"
  ;;
+  --enable-debug-stack-usage) debug_stack_usage="yes"
+  ;;
  --disable-docs) docs="no"
  ;;
  --enable-docs) docs="yes"
@@ -1213,7 +1223,10 @@ case "$cpu" in
           cc_i386='$(CC) -m32'
           ;;
    x86_64)
-           CPU_CFLAGS="-m64"
+           # ??? Only extremely old AMD cpus do not have cmpxchg16b.
+           # If we truly care, we should simply detect this case at
+           # runtime and generate the fallback to serial emulation.
+           CPU_CFLAGS="-m64 -mcx16"
           LDFLAGS="-m64 $LDFLAGS"
           cc_i386='$(CC) -m32'
           ;;
@@ -1360,6 +1373,7 @@ disabled with --disable-FEATURE, default is enabled if available:
  fdt             fdt device tree
  bluez           bluez stack connectivity
  kvm             KVM acceleration support
+  colo            COarse-grain LOck-stepping VM for Non-stop Service
  rdma            RDMA-based migration support
  vde             support for vde network
  netmap          support for netmap network
@@ -1722,6 +1736,19 @@ if test "$cocoa" = "yes"; then
    sdl=no
 fi

+# Some versions of Mac OS X incorrectly define SIZE_MAX
+cat > $TMPC << EOF
+#include <stdint.h>
+#include <stdio.h>
+int main(int argc, char *argv[]) {
+    return printf("%zu", SIZE_MAX);
+}
+EOF
+have_broken_size_max=no
+if ! compile_object -Werror ; then  # presumably a %zu mismatch warning is fatal
+    have_broken_size_max=yes
+fi
+
 ##########################################
 # L2TPV3 probe

@@ -1952,6 +1979,61 @@ EOF
  # Xen unstable
  elif
      cat > $TMPC <<EOF &&
+/*
+ * If we have stable libs then we don't want the libxc compat
+ * layers, regardless of what CFLAGS we may have been given.
+ *
+ * Also, check if xengnttab_grant_copy_segment_t is defined and
+ * grant copy operation is implemented.
+ */
+#undef XC_WANT_COMPAT_EVTCHN_API
+#undef XC_WANT_COMPAT_GNTTAB_API
+#undef XC_WANT_COMPAT_MAP_FOREIGN_API
+#include <xenctrl.h>
+#include <xenstore.h>
+#include <xenevtchn.h>
+#include <xengnttab.h>
+#include <xenforeignmemory.h>
+#include <stdint.h>
+#include <xen/hvm/hvm_info_table.h>
+#if !defined(HVM_MAX_VCPUS)
+# error HVM_MAX_VCPUS not defined
+#endif
+int main(void) {
+  xc_interface *xc = NULL;
+  xenforeignmemory_handle *xfmem;
+  xenevtchn_handle *xe;
+  xengnttab_handle *xg;
+  xen_domain_handle_t handle;
+  xengnttab_grant_copy_segment_t* seg = NULL;
+
+  xs_daemon_open();
+
+  xc = xc_interface_open(0, 0, 0);
+  xc_hvm_set_mem_type(0, 0, HVMMEM_ram_ro, 0, 0);
+  xc_domain_add_to_physmap(0, 0, XENMAPSPACE_gmfn, 0, 0);
+  xc_hvm_inject_msi(xc, 0, 0xf0000000, 0x00000000);
+  xc_hvm_create_ioreq_server(xc, 0, HVM_IOREQSRV_BUFIOREQ_ATOMIC, NULL);
+  xc_domain_create(xc, 0, handle, 0, NULL, NULL);
+
+  xfmem = xenforeignmemory_open(0, 0);
+  xenforeignmemory_map(xfmem, 0, 0, 0, 0, 0);
+
+  xe = xenevtchn_open(0, 0);
+  xenevtchn_fd(xe);
+
+  xg = xengnttab_open(0, 0);
+  xengnttab_grant_copy(xg, 0, seg);
+
+  return 0;
+}
+EOF
+      compile_prog "" "$xen_libs $xen_stable_libs"
+    then
+    xen_ctrl_version=480
+    xen=yes
+  elif
+      cat > $TMPC <<EOF &&
 /*
 * If we have stable libs the we don't want the libxc compat
 * layers, regardless of what CFLAGS we may have been given.
@@ -2843,25 +2925,41 @@ fi
 # curses probe
 if test "$curses" != "no" ; then
  if test "$mingw32" = "yes" ; then
-    curses_list="$($pkg_config --libs ncurses 2>/dev/null):-lpdcurses"
+    curses_inc_list="$($pkg_config --cflags ncurses 2>/dev/null):"
+    curses_lib_list="$($pkg_config --libs ncurses 2>/dev/null):-lpdcurses"
  else
-    curses_list="$($pkg_config --libs ncurses 2>/dev/null):-lncurses:-lcurses"
+    curses_inc_list="$($pkg_config --cflags ncursesw 2>/dev/null):-I/usr/include/ncursesw:"
+    curses_lib_list="$($pkg_config --libs ncursesw 2>/dev/null):-lncursesw:-lcursesw"
  fi
  curses_found=no
  cat > $TMPC << EOF
+#include <locale.h>
 #include <curses.h>
+#include <wchar.h>
 int main(void) {
  const char *s = curses_version();
+  wchar_t wch = L'w';
+  setlocale(LC_ALL, "");
  resize_term(0, 0);
+  addwstr(L"wide chars\n");
+  addnwstr(&wch, 1);
+  add_wch(WACS_DEGREE);
  return s != 0;
 }
 EOF
  IFS=:
-  for curses_lib in $curses_list; do
-    unset IFS
-    if compile_prog "" "$curses_lib" ; then
-      curses_found=yes
-      libs_softmmu="$curses_lib $libs_softmmu"
+  for curses_inc in $curses_inc_list; do
+    IFS=:
+    for curses_lib in $curses_lib_list; do
+      unset IFS
+      if compile_prog "$curses_inc" "$curses_lib" ; then
+        curses_found=yes
+        QEMU_CFLAGS="$curses_inc $QEMU_CFLAGS"
+        libs_softmmu="$curses_lib $libs_softmmu"
+        break
+      fi
+    done
+    if test "$curses_found" = yes ; then
      break
    fi
  done
@@ -2933,7 +3031,7 @@ for i in $glib_modules; do
    if $pkg_config --atleast-version=$glib_req_ver $i; then
        glib_cflags=$($pkg_config --cflags $i)
        glib_libs=$($pkg_config --libs $i)
-        CFLAGS="$glib_cflags $CFLAGS"
+        QEMU_CFLAGS="$glib_cflags $QEMU_CFLAGS"
        LIBS="$glib_libs $LIBS"
        libs_qga="$glib_libs $libs_qga"
    else
@@ -3840,6 +3938,36 @@ if compile_prog "" "" ; then
  setns=yes
 fi

+# clock_adjtime probe
+clock_adjtime=no
+cat > $TMPC <<EOF
+#include <time.h>
+
+int main(void)
+{
+    return clock_adjtime(0, 0);
+}
+EOF
+# clock_adjtime stays "no" unless the test program above builds
+if compile_prog "" "" ; then
+  clock_adjtime=yes
+fi
+
+# syncfs probe
+syncfs=no
+cat > $TMPC <<EOF
+#include <unistd.h>
+
+int main(void)
+{
+    return syncfs(0);
+}
+EOF
+# syncfs stays "no" unless the test program above builds
+if compile_prog "" "" ; then
+  syncfs=yes
+fi
+
 # Check if tools are available to build documentation.
 if test "$docs" != "no" ; then
  if has makeinfo && has pod2man; then
@@ -4276,6 +4404,17 @@ if test "$coroutine" = "gthread" -a "$coroutine_pool" = "yes"; then
  error_exit "'gthread' coroutine backend does not support pool (use --disable-coroutine-pool)"
 fi

+if test "$debug_stack_usage" = "yes"; then
+  if test "$cpu" = "ia64" -o "$cpu" = "hppa"; then
+    error_exit "stack usage debugging is not supported for $cpu"
+  fi
+  if test "$coroutine_pool" = "yes"; then
+    echo "WARN: disabling coroutine pool for stack usage debugging"
+    coroutine_pool=no
+  fi
+fi
+
+
 ##########################################
 # check if we have open_by_handle_at

@@ -4409,6 +4548,55 @@ if compile_prog "" "" ; then
    int128=yes
 fi

+#########################################
+# See if 128-bit atomic operations are supported.
+
+atomic128=no
+if test "$int128" = "yes"; then
+  cat > $TMPC << EOF
+int main(void)
+{
+  unsigned __int128 x = 0, y = 0;
+  y = __atomic_load_16(&x, 0);
+  __atomic_store_16(&x, y, 0);
+  __atomic_compare_exchange_16(&x, &y, x, 0, 0, 0);
+  return 0;
+}
+EOF
+  if compile_prog "" "" ; then
+    atomic128=yes
+  fi
+fi
+
+#########################################
+# See if 64-bit atomic operations are supported.
+# Note that without __atomic builtins, we can only
+# assume atomic loads/stores max at pointer size.
+atomic64=no
+cat > $TMPC << EOF
+#include <stdint.h>
+int main(void)
+{
+  uint64_t x = 0, y = 0;
+#ifdef __ATOMIC_RELAXED
+  y = __atomic_load_8(&x, 0);
+  __atomic_store_8(&x, y, 0);
+  __atomic_compare_exchange_8(&x, &y, x, 0, 0, 0);
+  __atomic_exchange_8(&x, y, 0);
+  __atomic_fetch_add_8(&x, y, 0);
+#else
+  typedef char is_host64[sizeof(void *) >= sizeof(uint64_t) ? 1 : -1];
+  __sync_lock_test_and_set(&x, y);
+  __sync_val_compare_and_swap(&x, y, 0);
+  __sync_fetch_and_add(&x, y);
+#endif
+  return 0;
+}
+EOF
+if compile_prog "" "" ; then
+  atomic64=yes
+fi
+
 ########################################
 # check if getauxval is available.

@@ -4493,6 +4681,33 @@ if compile_prog "" "" ; then
    have_rtnetlink=yes
 fi

+##########################################
+# check for usable AF_VSOCK environment
+have_af_vsock=no
+cat > $TMPC << EOF
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#if !defined(AF_VSOCK)
+# error missing AF_VSOCK flag
+#endif
+#include <linux/vm_sockets.h>
+int main(void) {
+    int sock, ret;
+    struct sockaddr_vm svm;
+    socklen_t len = sizeof(svm);
+    sock = socket(AF_VSOCK, SOCK_STREAM, 0);
+    ret = getpeername(sock, (struct sockaddr *)&svm, &len);
+    if ((ret == -1) && (errno == ENOTCONN)) {
+        return 0;
+    }
+    return -1;
+}
+EOF
+if compile_prog "" "" ; then
+    have_af_vsock=yes
+fi
+
 #################################################
 # Sparc implicitly links with --relax, which is
 # incompatible with -r, so --no-relax should be
@@ -4829,6 +5044,7 @@ echo "Linux AIO support $linux_aio"
 echo "ATTR/XATTR support $attr"
 echo "Install blobs     $blobs"
 echo "KVM support       $kvm"
+echo "COLO support      $colo"
 echo "RDMA support      $rdma"
 echo "TCG interpreter   $tcg_interpreter"
 echo "fdt support       $fdt"
@@ -4861,6 +5077,7 @@ echo "QGA MSI support   $guest_agent_msi"
 echo "seccomp support   $seccomp"
 echo "coroutine backend $coroutine"
 echo "coroutine pool    $coroutine_pool"
+echo "debug stack usage $debug_stack_usage"
 echo "GlusterFS support $glusterfs"
 echo "Archipelago support $archipelago"
 echo "gcov              $gcov_tool"
@@ -5113,6 +5330,12 @@ fi
 if test "$setns" = "yes" ; then
  echo "CONFIG_SETNS=y" >> $config_host_mak
 fi
+if test "$clock_adjtime" = "yes" ; then
+  echo "CONFIG_CLOCK_ADJTIME=y" >> $config_host_mak
+fi
+if test "$syncfs" = "yes" ; then
+  echo "CONFIG_SYNCFS=y" >> $config_host_mak
+fi
 if test "$inotify" = "yes" ; then
  echo "CONFIG_INOTIFY=y" >> $config_host_mak
 fi
@@ -5140,7 +5363,6 @@ fi
 if test "$glib_subprocess" = "yes" ; then
  echo "CONFIG_HAS_GLIB_SUBPROCESS_TESTS=y" >> $config_host_mak
 fi
-echo "GLIB_CFLAGS=$glib_cflags" >> $config_host_mak
 if test "$gtk" = "yes" ; then
  echo "CONFIG_GTK=y" >> $config_host_mak
  echo "CONFIG_GTKABI=$gtkabi" >> $config_host_mak
@@ -5176,6 +5398,9 @@ fi
 if test "$have_ifaddrs_h" = "yes" ; then
    echo "HAVE_IFADDRS_H=y" >> $config_host_mak
 fi
+if test "$have_broken_size_max" = "yes" ; then
+    echo "HAVE_BROKEN_SIZE_MAX=y" >> $config_host_mak
+fi

 # Work around a system header bug with some kernel/XFS header
 # versions where they both try to define 'struct fsxattr':
@@ -5330,6 +5555,10 @@ else
  echo "CONFIG_COROUTINE_POOL=0" >> $config_host_mak
 fi

+if test "$debug_stack_usage" = "yes" ; then
+  echo "CONFIG_DEBUG_STACK_USAGE=y" >> $config_host_mak
+fi
+
 if test "$open_by_handle_at" = "yes" ; then
  echo "CONFIG_OPEN_BY_HANDLE=y" >> $config_host_mak
 fi
@@ -5358,6 +5587,14 @@ if test "$int128" = "yes" ; then
  echo "CONFIG_INT128=y" >> $config_host_mak
 fi

+if test "$atomic128" = "yes" ; then
+  echo "CONFIG_ATOMIC128=y" >> $config_host_mak
+fi
+
+if test "$atomic64" = "yes" ; then
+  echo "CONFIG_ATOMIC64=y" >> $config_host_mak
+fi
+
 if test "$getauxval" = "yes" ; then
  echo "CONFIG_GETAUXVAL=y" >> $config_host_mak
 fi
@@ -5443,6 +5680,10 @@ if have_backend "syslog"; then
 fi
 echo "CONFIG_TRACE_FILE=$trace_file" >> $config_host_mak

+if test "$colo" = "yes"; then
+  echo "CONFIG_COLO=y" >> $config_host_mak
+fi
+
 if test "$rdma" = "yes" ; then
  echo "CONFIG_RDMA=y" >> $config_host_mak
 fi
@@ -5455,6 +5696,10 @@ if test "$replication" = "yes" ; then
  echo "CONFIG_REPLICATION=y" >> $config_host_mak
 fi

+if test "$have_af_vsock" = "yes" ; then
+  echo "CONFIG_AF_VSOCK=y" >> $config_host_mak
+fi
+
 # Hold two types of flag:
 #   CONFIG_THREAD_SETNAME_BYTHREAD  - we've got a way of setting the name on
 #                                     a thread we have a handle to
@@ -5946,6 +6191,7 @@ FILES="$FILES roms/seabios/Makefile roms/vgabios/Makefile"
 FILES="$FILES pc-bios/qemu-icon.bmp"
 for bios_file in \
    $source_path/pc-bios/*.bin \
+    $source_path/pc-bios/*.lid \
    $source_path/pc-bios/*.aml \
    $source_path/pc-bios/*.rom \
    $source_path/pc-bios/*.dtb \
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@@ -77,3 +77,9 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
    }
    siglongjmp(cpu->jmp_env, 1);
 }
+
+void cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc)
+{
+    cpu->exception_index = EXCP_ATOMIC;
+    cpu_loop_exit_restore(cpu, pc);
+}
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -143,23 +143,20 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
    uint8_t *tb_ptr = itb->tc_ptr;

    qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
-                           "Trace %p [" TARGET_FMT_lx "] %s\n",
-                           itb->tc_ptr, itb->pc, lookup_symbol(itb->pc));
+                           "Trace %p [%d: " TARGET_FMT_lx "] %s\n",
+                           itb->tc_ptr, cpu->cpu_index, itb->pc,
+                           lookup_symbol(itb->pc));

 #if defined(DEBUG_DISAS)
    if (qemu_loglevel_mask(CPU_LOG_TB_CPU)
        && qemu_log_in_addr_range(itb->pc)) {
+        qemu_log_lock();
 #if defined(TARGET_I386)
        log_cpu_state(cpu, CPU_DUMP_CCOP);
-#elif defined(TARGET_M68K)
-        /* ??? Should not modify env state for dumping.  */
-        cpu_m68k_flush_flags(env, env->cc_op);
-        env->cc_op = CC_OP_FLAGS;
-        env->sr = (env->sr & 0xffe0) | env->cc_dest | (env->cc_x << 4);
-        log_cpu_state(cpu, 0);
 #else
        log_cpu_state(cpu, 0);
 #endif
+        qemu_log_unlock();
    }
 #endif /* DEBUG_DISAS */

@@ -192,7 +189,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
        /* We were asked to stop executing TBs (probably a pending
         * interrupt. We've now stopped, so clear the flag.
         */
-        cpu->tcg_exit_req = 0;
+        atomic_set(&cpu->tcg_exit_req, 0);
    }
    return ret;
 }
@@ -204,27 +201,59 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
                             TranslationBlock *orig_tb, bool ignore_icount)
 {
    TranslationBlock *tb;
-    bool old_tb_flushed;

    /* Should never happen.
       We only end up here when an existing TB is too long.  */
    if (max_cycles > CF_COUNT_MASK)
        max_cycles = CF_COUNT_MASK;

-    old_tb_flushed = cpu->tb_flushed;
-    cpu->tb_flushed = false;
+    tb_lock();
    tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
                     max_cycles | CF_NOCACHE
                         | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
-    tb->orig_tb = cpu->tb_flushed ? NULL : orig_tb;
-    cpu->tb_flushed |= old_tb_flushed;
+    tb->orig_tb = orig_tb;
+    tb_unlock();
+
    /* execute the generated code */
    trace_exec_tb_nocache(tb, tb->pc);
    cpu_tb_exec(cpu, tb);
+
+    tb_lock();
+    tb_phys_invalidate(tb, -1);
+    tb_free(tb);
+    tb_unlock();
+}
+#endif
+
+static void cpu_exec_step(CPUState *cpu)
+{
+    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
+    TranslationBlock *tb;
+    target_ulong cs_base, pc;
+    uint32_t flags;
+
+    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+    tb = tb_gen_code(cpu, pc, cs_base, flags,
+                     1 | CF_NOCACHE | CF_IGNORE_ICOUNT);
+    tb->orig_tb = NULL;
+    /* execute the generated code */
+    trace_exec_tb_nocache(tb, pc);
+    cpu_tb_exec(cpu, tb);
    tb_phys_invalidate(tb, -1);
    tb_free(tb);
 }
-#endif
+
+void cpu_exec_step_atomic(CPUState *cpu)
+{
+    start_exclusive();
+
+    /* Since we got here, we know that parallel_cpus must be true.  */
+    parallel_cpus = false;
+    cpu_exec_step(cpu);
+    parallel_cpus = true;
+
+    end_exclusive();
+}

 struct tb_desc {
    target_ulong pc;
@@ -338,10 +367,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
            tb_lock();
            have_tb_lock = true;
        }
-        /* Check if translation buffer has been flushed */
-        if (cpu->tb_flushed) {
-            cpu->tb_flushed = false;
-        } else if (!tb->invalid) {
+        if (!tb->invalid) {
            tb_add_jump(last_tb, tb_exit, tb);
        }
    }
@@ -497,8 +523,8 @@ static inline void cpu_handle_interrupt(CPUState *cpu,
            *last_tb = NULL;
        }
    }
-    if (unlikely(cpu->exit_request || replay_has_interrupt())) {
-        cpu->exit_request = 0;
+    if (unlikely(atomic_read(&cpu->exit_request) || replay_has_interrupt())) {
+        atomic_set(&cpu->exit_request, 0);
        cpu->exception_index = EXCP_INTERRUPT;
        cpu_loop_exit(cpu);
    }
@@ -510,7 +536,7 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
 {
    uintptr_t ret;

-    if (unlikely(cpu->exit_request)) {
+    if (unlikely(atomic_read(&cpu->exit_request))) {
        return;
    }

@@ -606,7 +632,6 @@ int cpu_exec(CPUState *cpu)
                break;
            }

-            atomic_mb_set(&cpu->tb_flushed, false); /* reset before first TB lookup */
            for(;;) {
                cpu_handle_interrupt(cpu, &last_tb);
                tb = tb_find(cpu, last_tb, tb_exit);
--- a/cpus-common.c
+++ b/cpus-common.c
@@ -0,0 +1,353 @@
+/*
+ * CPU thread main loop - common bits for user and system mode emulation
+ *
+ *  Copyright (c) 2003-2005 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "exec/cpu-common.h"
+#include "qom/cpu.h"
+#include "sysemu/cpus.h"
+
+static QemuMutex qemu_cpu_list_lock;
+static QemuCond exclusive_cond;
+static QemuCond exclusive_resume;
+static QemuCond qemu_work_cond;
+
+/* >= 1 if a thread is inside start_exclusive/end_exclusive.  Written
+ * under qemu_cpu_list_lock, read with atomic operations.
+ */
+static int pending_cpus;
+
+void qemu_init_cpu_list(void)
+{
+    /* This is needed because qemu_init_cpu_list is also called by the
+     * child process in a fork.  */
+    pending_cpus = 0;
+
+    qemu_mutex_init(&qemu_cpu_list_lock);
+    qemu_cond_init(&exclusive_cond);
+    qemu_cond_init(&exclusive_resume);
+    qemu_cond_init(&qemu_work_cond);
+}
+
+void cpu_list_lock(void)
+{
+    qemu_mutex_lock(&qemu_cpu_list_lock);
+}
+
+void cpu_list_unlock(void)
+{
+    qemu_mutex_unlock(&qemu_cpu_list_lock);
+}
+
+static bool cpu_index_auto_assigned;
+
+static int cpu_get_free_index(void)
+{
+    CPUState *some_cpu;
+    int cpu_index = 0;
+    /* Record auto-assignment; cpu_list_add/cpu_list_remove assert on it. */
+    cpu_index_auto_assigned = true;
+    CPU_FOREACH(some_cpu) {
+        cpu_index++;            /* one per CPU already on the list */
+    }
+    return cpu_index;           /* free index == current list length */
+}
+
+static void finish_safe_work(CPUState *cpu)
+{
+    cpu_exec_start(cpu);    /* may wait for a pending exclusive section */
+    cpu_exec_end(cpu);      /* then immediately mark the CPU not running */
+}
+
+void cpu_list_add(CPUState *cpu)
+{
+    qemu_mutex_lock(&qemu_cpu_list_lock);
+    if (cpu->cpu_index == UNASSIGNED_CPU_INDEX) {
+        cpu->cpu_index = cpu_get_free_index();
+        assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
+    } else {
+        assert(!cpu_index_auto_assigned);
+    }
+    QTAILQ_INSERT_TAIL(&cpus, cpu, node);
+    qemu_mutex_unlock(&qemu_cpu_list_lock);
+
+    finish_safe_work(cpu);
+}
+
+void cpu_list_remove(CPUState *cpu)
+{
+    qemu_mutex_lock(&qemu_cpu_list_lock);
+    if (!QTAILQ_IN_USE(cpu, node)) {
+        /* there is nothing to undo since cpu_exec_init() hasn't been called */
+        qemu_mutex_unlock(&qemu_cpu_list_lock);
+        return;
+    }
+    /* Auto-assigned indices equal the list length: only the tail may go. */
+    assert(!(cpu_index_auto_assigned && cpu != QTAILQ_LAST(&cpus, CPUTailQ)));
+
+    QTAILQ_REMOVE(&cpus, cpu, node);
+    cpu->cpu_index = UNASSIGNED_CPU_INDEX;  /* allow a later re-add */
+    qemu_mutex_unlock(&qemu_cpu_list_lock);
+}
+
+struct qemu_work_item {
+    struct qemu_work_item *next;
+    run_on_cpu_func func;
+    run_on_cpu_data data;
+    bool free, exclusive, done;
+};
+
+static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
+{
+    qemu_mutex_lock(&cpu->work_mutex);      /* guards the per-CPU work list */
+    if (cpu->queued_work_first == NULL) {
+        cpu->queued_work_first = wi;        /* list was empty */
+    } else {
+        cpu->queued_work_last->next = wi;   /* append after the current tail */
+    }
+    cpu->queued_work_last = wi;
+    wi->next = NULL;
+    wi->done = false;
+    qemu_mutex_unlock(&cpu->work_mutex);
+
+    qemu_cpu_kick(cpu); /* NOTE(review): presumably wakes the vCPU thread */
+}
+
+void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data,
+                   QemuMutex *mutex)
+{
+    struct qemu_work_item wi;   /* on the stack: must outlive its queue entry */
+
+    if (qemu_cpu_is_self(cpu)) {    /* already on the target: run inline */
+        func(cpu, data);
+        return;
+    }
+
+    wi.func = func;
+    wi.data = data;
+    wi.done = false;
+    wi.free = false;            /* stack item: the worker must not g_free() it */
+    wi.exclusive = false;
+
+    queue_work_on_cpu(cpu, &wi);
+    while (!atomic_mb_read(&wi.done)) {     /* set by process_queued_cpu_work */
+        CPUState *self_cpu = current_cpu;
+
+        qemu_cond_wait(&qemu_work_cond, mutex); /* caller must hold 'mutex' */
+        current_cpu = self_cpu; /* NOTE(review): may change during the wait? */
+    }
+}
+
+void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
+{
+    struct qemu_work_item *wi;
+
+    wi = g_malloc0(sizeof(struct qemu_work_item)); /* zeroed: not exclusive */
+    wi->func = func;
+    wi->data = data;
+    wi->free = true;    /* process_queued_cpu_work() g_free()s it after running */
+
+    queue_work_on_cpu(cpu, wi);     /* returns without waiting for func */
+}
+
+/* Wait for pending exclusive operations to complete.  The CPU list lock
+   must be held.  */
+static inline void exclusive_idle(void)
+{
+    while (pending_cpus) {
+        qemu_cond_wait(&exclusive_resume, &qemu_cpu_list_lock);
+    }
+}
+
+/* Start an exclusive operation.
+   Must only be called from outside cpu_exec.  */
+void start_exclusive(void)
+{
+    CPUState *other_cpu;
+    int running_cpus;
+
+    qemu_mutex_lock(&qemu_cpu_list_lock);
+    exclusive_idle();
+
+    /* Make all other cpus stop executing.  */
+    atomic_set(&pending_cpus, 1);
+
+    /* Write pending_cpus before reading other_cpu->running.  */
+    smp_mb();
+    running_cpus = 0;
+    CPU_FOREACH(other_cpu) {
+        if (atomic_read(&other_cpu->running)) {
+            other_cpu->has_waiter = true;
+            running_cpus++;
+            qemu_cpu_kick(other_cpu);
+        }
+    }
+
+    atomic_set(&pending_cpus, running_cpus + 1);
+    while (pending_cpus > 1) {
+        qemu_cond_wait(&exclusive_cond, &qemu_cpu_list_lock);
+    }
+
+    /* Can release mutex, no one will enter another exclusive
+     * section until end_exclusive resets pending_cpus to 0.
+     */
+    qemu_mutex_unlock(&qemu_cpu_list_lock);
+}
+
+/* Finish an exclusive operation.  */
+void end_exclusive(void)
+{
+    qemu_mutex_lock(&qemu_cpu_list_lock);
+    atomic_set(&pending_cpus, 0);
+    qemu_cond_broadcast(&exclusive_resume);
+    qemu_mutex_unlock(&qemu_cpu_list_lock);
+}
+
+/* Wait for exclusive ops to finish, and begin cpu execution.  */
+void cpu_exec_start(CPUState *cpu)
+{
+    atomic_set(&cpu->running, true);
+
+    /* Write cpu->running before reading pending_cpus.  */
+    smp_mb();
+
+    /* 1. start_exclusive saw cpu->running == true and pending_cpus >= 1.
+     * After taking the lock we'll see cpu->has_waiter == true and run---not
+     * for long because start_exclusive kicked us.  cpu_exec_end will
+     * decrement pending_cpus and signal the waiter.
+     *
+     * 2. start_exclusive saw cpu->running == false but pending_cpus >= 1.
+     * This includes the case when an exclusive item is running now.
+     * Then we'll see cpu->has_waiter == false and wait for the item to
+     * complete.
+     *
+     * 3. pending_cpus == 0.  Then start_exclusive is definitely going to
+     * see cpu->running == true, and it will kick the CPU.
+     */
+    if (unlikely(atomic_read(&pending_cpus))) {
+        qemu_mutex_lock(&qemu_cpu_list_lock);
+        if (!cpu->has_waiter) {
+            /* Not counted in pending_cpus, let the exclusive item run:
+             * clear cpu->running while we wait.  Since we have the lock, just
+             * set it back to true while holding it; no need to recheck.
+             */
+            atomic_set(&cpu->running, false);
+            exclusive_idle();
+            /* Now pending_cpus is zero.  */
+            atomic_set(&cpu->running, true);
+        } else {
+            /* Counted in pending_cpus, go ahead and release the
+             * waiter at cpu_exec_end.
+             */
+        }
+        qemu_mutex_unlock(&qemu_cpu_list_lock);
+    }
+}
+
+/* Mark cpu as not executing, and release pending exclusive ops.  */
+void cpu_exec_end(CPUState *cpu)
+{
+    atomic_set(&cpu->running, false);
+
+    /* Write cpu->running before reading pending_cpus.  */
+    smp_mb();
+
+    /* 1. start_exclusive saw cpu->running == true.  Then it will increment
+     * pending_cpus and wait for exclusive_cond.  After taking the lock
+     * we'll see cpu->has_waiter == true.
+     *
+     * 2. start_exclusive saw cpu->running == false but here pending_cpus >= 1.
+     * This includes the case when an exclusive item started after setting
+     * cpu->running to false and before we read pending_cpus.  Then we'll see
+     * cpu->has_waiter == false and not touch pending_cpus.  The next call to
+     * cpu_exec_start will run exclusive_idle if still necessary, thus waiting
+     * for the item to complete.
+     *
+     * 3. pending_cpus == 0.  Then start_exclusive is definitely going to
+     * see cpu->running == false, and it can ignore this CPU until the
+     * next cpu_exec_start.
+     */
+    if (unlikely(atomic_read(&pending_cpus))) {
+        qemu_mutex_lock(&qemu_cpu_list_lock);
+        if (cpu->has_waiter) {
+            cpu->has_waiter = false;
+            atomic_set(&pending_cpus, pending_cpus - 1);
+            if (pending_cpus == 1) {
+                qemu_cond_signal(&exclusive_cond);
+            }
+        }
+        qemu_mutex_unlock(&qemu_cpu_list_lock);
+    }
+}
+
+void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func,
+                           run_on_cpu_data data)
+{
+    struct qemu_work_item *wi;
+
+    wi = g_malloc0(sizeof(struct qemu_work_item));
+    wi->func = func;
+    wi->data = data;
+    wi->free = true;        /* heap item, g_free()d by process_queued_cpu_work */
+    wi->exclusive = true;   /* run inside start_exclusive()/end_exclusive() */
+
+    queue_work_on_cpu(cpu, wi);
+}
+
+void process_queued_cpu_work(CPUState *cpu)
+{
+    struct qemu_work_item *wi;
+
+    if (cpu->queued_work_first == NULL) {   /* unlocked fast-path check */
+        return;
+    }
+
+    qemu_mutex_lock(&cpu->work_mutex);
+    while (cpu->queued_work_first != NULL) {
+        wi = cpu->queued_work_first;        /* pop the head of the list */
+        cpu->queued_work_first = wi->next;
+        if (!cpu->queued_work_first) {
+            cpu->queued_work_last = NULL;
+        }
+        qemu_mutex_unlock(&cpu->work_mutex);    /* run item without the lock */
+        if (wi->exclusive) {
+            /* Running work items outside the BQL avoids the following deadlock:
+             * 1) start_exclusive() is called with the BQL taken while another
+             * CPU is running; 2) cpu_exec in the other CPU tries to take the
+             * BQL, so it goes to sleep; start_exclusive() is sleeping too, so
+             * neither CPU can proceed.
+             */
+            qemu_mutex_unlock_iothread();
+            start_exclusive();
+            wi->func(cpu, wi->data);
+            end_exclusive();
+            qemu_mutex_lock_iothread();
+        } else {
+            wi->func(cpu, wi->data);
+        }
+        qemu_mutex_lock(&cpu->work_mutex);
+        if (wi->free) {
+            g_free(wi);                     /* heap item from async_*run_on_cpu */
+        } else {
+            atomic_mb_set(&wi->done, true); /* ends the do_run_on_cpu wait loop */
+        }
+    }
+    qemu_mutex_unlock(&cpu->work_mutex);
+    qemu_cond_broadcast(&qemu_work_cond);   /* wake waiters in do_run_on_cpu */
+}
--- a/cpus.c
+++ b/cpus.c
@@ -69,7 +69,6 @@

 #endif /* CONFIG_LINUX */

-static CPUState *next_cpu;
 int64_t max_delay;
 int64_t max_advance;

@@ -557,9 +556,8 @@ static const VMStateDescription vmstate_timers = {
    }
 };

-static void cpu_throttle_thread(void *opaque)
+static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 {
-    CPUState *cpu = opaque;
    double pct;
    double throttle_ratio;
    long sleeptime_ns;
@@ -589,7 +587,8 @@ static void cpu_throttle_timer_tick(void *opaque)
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
-            async_run_on_cpu(cpu, cpu_throttle_thread, cpu);
+            async_run_on_cpu(cpu, cpu_throttle_thread,
+                             RUN_ON_CPU_NULL);
        }
    }

@@ -751,7 +750,8 @@ static int do_vm_stop(RunState state)
    }

    bdrv_drain_all();
-    ret = blk_flush_all();
+    replay_disable_events();
+    ret = bdrv_flush_all();

    return ret;
 }
@@ -903,79 +903,21 @@ static QemuThread io_thread;
 static QemuCond qemu_cpu_cond;
 /* system init */
 static QemuCond qemu_pause_cond;
-static QemuCond qemu_work_cond;

 void qemu_init_cpu_loop(void)
 {
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
-    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
 }

-void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
+void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
 {
-    struct qemu_work_item wi;
-
-    if (qemu_cpu_is_self(cpu)) {
-        func(data);
-        return;
-    }
-
-    wi.func = func;
-    wi.data = data;
-    wi.free = false;
-
-    qemu_mutex_lock(&cpu->work_mutex);
-    if (cpu->queued_work_first == NULL) {
-        cpu->queued_work_first = &wi;
-    } else {
-        cpu->queued_work_last->next = &wi;
-    }
-    cpu->queued_work_last = &wi;
-    wi.next = NULL;
-    wi.done = false;
-    qemu_mutex_unlock(&cpu->work_mutex);
-
-    qemu_cpu_kick(cpu);
-    while (!atomic_mb_read(&wi.done)) {
-        CPUState *self_cpu = current_cpu;
-
-        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
-        current_cpu = self_cpu;
-    }
-}
-
-void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
-{
-    struct qemu_work_item *wi;
-
-    if (qemu_cpu_is_self(cpu)) {
-        func(data);
-        return;
-    }
-
-    wi = g_malloc0(sizeof(struct qemu_work_item));
-    wi->func = func;
-    wi->data = data;
-    wi->free = true;
-
-    qemu_mutex_lock(&cpu->work_mutex);
-    if (cpu->queued_work_first == NULL) {
-        cpu->queued_work_first = wi;
-    } else {
-        cpu->queued_work_last->next = wi;
-    }
-    cpu->queued_work_last = wi;
-    wi->next = NULL;
-    wi->done = false;
-    qemu_mutex_unlock(&cpu->work_mutex);
-
-    qemu_cpu_kick(cpu);
+    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
 }

 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
@@ -990,34 +932,6 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu)
 {
 }

-static void flush_queued_work(CPUState *cpu)
-{
-    struct qemu_work_item *wi;
-
-    if (cpu->queued_work_first == NULL) {
-        return;
-    }
-
-    qemu_mutex_lock(&cpu->work_mutex);
-    while (cpu->queued_work_first != NULL) {
-        wi = cpu->queued_work_first;
-        cpu->queued_work_first = wi->next;
-        if (!cpu->queued_work_first) {
-            cpu->queued_work_last = NULL;
-        }
-        qemu_mutex_unlock(&cpu->work_mutex);
-        wi->func(wi->data);
-        qemu_mutex_lock(&cpu->work_mutex);
-        if (wi->free) {
-            g_free(wi);
-        } else {
-            atomic_mb_set(&wi->done, true);
-        }
-    }
-    qemu_mutex_unlock(&cpu->work_mutex);
-    qemu_cond_broadcast(&qemu_work_cond);
-}
-
 static void qemu_wait_io_event_common(CPUState *cpu)
 {
    if (cpu->stop) {
@@ -1025,7 +939,7 @@ static void qemu_wait_io_event_common(CPUState *cpu)
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
-    flush_queued_work(cpu);
+    process_queued_cpu_work(cpu);
    cpu->thread_kicked = false;
 }

@@ -1141,12 +1055,102 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
 #endif
 }

-static void tcg_exec_all(void);
+static int64_t tcg_get_icount_limit(void)
+{
+    int64_t deadline;
+
+    if (replay_mode != REPLAY_MODE_PLAY) {
+        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+
+        /* Maintain prior (possibly buggy) behaviour where if no deadline
+         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
+         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
+         * nanoseconds.
+         */
+        if ((deadline < 0) || (deadline > INT32_MAX)) {
+            deadline = INT32_MAX;
+        }
+
+        return qemu_icount_round(deadline);
+    } else {
+        return replay_get_instructions();
+    }
+}
+
+static void handle_icount_deadline(void)
+{
+    if (use_icount) {
+        int64_t deadline =
+            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+
+        if (deadline == 0) {
+            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+        }
+    }
+}
+
+static int tcg_cpu_exec(CPUState *cpu)
+{
+    int ret;
+#ifdef CONFIG_PROFILER
+    int64_t ti;
+#endif
+
+#ifdef CONFIG_PROFILER
+    ti = profile_getclock();
+#endif
+    if (use_icount) {
+        int64_t count;
+        int decr;
+        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+                                    + cpu->icount_extra);
+        cpu->icount_decr.u16.low = 0;
+        cpu->icount_extra = 0;
+        count = tcg_get_icount_limit();
+        timers_state.qemu_icount += count;
+        decr = (count > 0xffff) ? 0xffff : count;
+        count -= decr;
+        cpu->icount_decr.u16.low = decr;
+        cpu->icount_extra = count;
+    }
+    cpu_exec_start(cpu);
+    ret = cpu_exec(cpu);
+    cpu_exec_end(cpu);
+#ifdef CONFIG_PROFILER
+    tcg_time += profile_getclock() - ti;
+#endif
+    if (use_icount) {
+        /* Fold pending instructions back into the
+           instruction counter, and clear the interrupt flag.  */
+        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+                        + cpu->icount_extra);
+        cpu->icount_decr.u32 = 0;
+        cpu->icount_extra = 0;
+        replay_account_executed_instructions();
+    }
+    return ret;
+}
+
+/* Destroy any remaining vCPUs which have been unplugged and have
+ * finished running
+ */
+static void deal_with_unplugged_cpus(void)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        if (cpu->unplug && !cpu_can_run(cpu)) {
+            qemu_tcg_destroy_vcpu(cpu);
+            cpu->created = false;
+            qemu_cond_signal(&qemu_cpu_cond);
+            break;
+        }
+    }
+}

 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
    CPUState *cpu = arg;
-    CPUState *remove_cpu = NULL;

    rcu_register_thread();

@@ -1173,29 +1177,44 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
    /* process any pending work */
    atomic_mb_set(&exit_request, 1);

+    cpu = first_cpu;
+
    while (1) {
-        tcg_exec_all();
+        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
+        qemu_account_warp_timer();

-        if (use_icount) {
-            int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
-
-            if (deadline == 0) {
-                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
-            }
+        if (!cpu) {
+            cpu = first_cpu;
        }
-        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
-        CPU_FOREACH(cpu) {
-            if (cpu->unplug && !cpu_can_run(cpu)) {
-                remove_cpu = cpu;
+
+        for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {
+
+            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
+                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
+
+            if (cpu_can_run(cpu)) {
+                int r;
+                r = tcg_cpu_exec(cpu);
+                if (r == EXCP_DEBUG) {
+                    cpu_handle_guest_debug(cpu);
+                    break;
+                }
+            } else if (cpu->stop || cpu->stopped) {
+                if (cpu->unplug) {
+                    cpu = CPU_NEXT(cpu);
+                }
                break;
            }
-        }
-        if (remove_cpu) {
-            qemu_tcg_destroy_vcpu(remove_cpu);
-            cpu->created = false;
-            qemu_cond_signal(&qemu_cpu_cond);
-            remove_cpu = NULL;
-        }
+
+        } /* for cpu.. */
+
+        /* Pairs with smp_wmb in qemu_cpu_kick.  */
+        atomic_mb_set(&exit_request, 0);
+
+        handle_icount_deadline();
+
+        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+        deal_with_unplugged_cpus();
    }

    return NULL;
@@ -1293,17 +1312,17 @@ void qemu_mutex_unlock_iothread(void)
    qemu_mutex_unlock(&qemu_global_mutex);
 }

-static int all_vcpus_paused(void)
+static bool all_vcpus_paused(void)
 {
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
-            return 0;
+            return false;
        }
    }

-    return 1;
+    return true;
 }

 void pause_all_vcpus(void)
@@ -1494,106 +1513,10 @@ int vm_stop_force_state(RunState state)
        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
-        return blk_flush_all();
+        return bdrv_flush_all();
    }
 }

-static int64_t tcg_get_icount_limit(void)
-{
-    int64_t deadline;
-
-    if (replay_mode != REPLAY_MODE_PLAY) {
-        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
-
-        /* Maintain prior (possibly buggy) behaviour where if no deadline
-         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
-         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
-         * nanoseconds.
-         */
-        if ((deadline < 0) || (deadline > INT32_MAX)) {
-            deadline = INT32_MAX;
-        }
-
-        return qemu_icount_round(deadline);
-    } else {
-        return replay_get_instructions();
-    }
-}
-
-static int tcg_cpu_exec(CPUState *cpu)
-{
-    int ret;
-#ifdef CONFIG_PROFILER
-    int64_t ti;
-#endif
-
-#ifdef CONFIG_PROFILER
-    ti = profile_getclock();
-#endif
-    if (use_icount) {
-        int64_t count;
-        int decr;
-        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
-                                    + cpu->icount_extra);
-        cpu->icount_decr.u16.low = 0;
-        cpu->icount_extra = 0;
-        count = tcg_get_icount_limit();
-        timers_state.qemu_icount += count;
-        decr = (count > 0xffff) ? 0xffff : count;
-        count -= decr;
-        cpu->icount_decr.u16.low = decr;
-        cpu->icount_extra = count;
-    }
-    ret = cpu_exec(cpu);
-#ifdef CONFIG_PROFILER
-    tcg_time += profile_getclock() - ti;
-#endif
-    if (use_icount) {
-        /* Fold pending instructions back into the
-           instruction counter, and clear the interrupt flag.  */
-        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
-                        + cpu->icount_extra);
-        cpu->icount_decr.u32 = 0;
-        cpu->icount_extra = 0;
-        replay_account_executed_instructions();
-    }
-    return ret;
-}
-
-static void tcg_exec_all(void)
-{
-    int r;
-
-    /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
-    qemu_account_warp_timer();
-
-    if (next_cpu == NULL) {
-        next_cpu = first_cpu;
-    }
-    for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
-        CPUState *cpu = next_cpu;
-
-        qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
-                          (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
-
-        if (cpu_can_run(cpu)) {
-            r = tcg_cpu_exec(cpu);
-            if (r == EXCP_DEBUG) {
-                cpu_handle_guest_debug(cpu);
-                break;
-            }
-        } else if (cpu->stop || cpu->stopped) {
-            if (cpu->unplug) {
-                next_cpu = CPU_NEXT(cpu);
-            }
-            break;
-        }
-    }
-
-    /* Pairs with smp_wmb in qemu_cpu_kick.  */
-    atomic_mb_set(&exit_request, 0);
-}
-
 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
 {
    /* XXX: implement xxx_cpu_list for targets that still miss it */
--- a/cputlb.c
+++ b/cputlb.c
@@ -23,15 +23,14 @@
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
 #include "exec/cpu_ldst.h"
-
 #include "exec/cputlb.h"
-
 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"
-#include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "qemu/error-report.h"
 #include "exec/log.h"
+#include "exec/helper-proto.h"
+#include "qemu/atomic.h"

 /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
 /* #define DEBUG_TLB */
@@ -498,6 +497,43 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
    return qemu_ram_addr_from_host_nofail(p);
 }

+static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
+                         target_ulong addr, uintptr_t retaddr, int size)
+{
+    CPUState *cpu = ENV_GET_CPU(env);
+    hwaddr physaddr = iotlbentry->addr;
+    MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
+    uint64_t val;
+
+    physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
+    cpu->mem_io_pc = retaddr;
+    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
+        cpu_io_recompile(cpu, retaddr);
+    }
+
+    cpu->mem_io_vaddr = addr;
+    memory_region_dispatch_read(mr, physaddr, &val, size, iotlbentry->attrs);
+    return val;
+}
+
+static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
+                      uint64_t val, target_ulong addr,
+                      uintptr_t retaddr, int size)
+{
+    CPUState *cpu = ENV_GET_CPU(env);
+    hwaddr physaddr = iotlbentry->addr;
+    MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
+
+    physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
+    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
+        cpu_io_recompile(cpu, retaddr);
+    }
+
+    cpu->mem_io_vaddr = addr;
+    cpu->mem_io_pc = retaddr;
+    memory_region_dispatch_write(mr, physaddr, val, size, iotlbentry->attrs);
+}
+
 /* Return true if ADDR is present in the victim tlb, and has been copied
   back to the main tlb.  */
 static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
@@ -527,34 +563,178 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
  victim_tlb_hit(env, mmu_idx, index, offsetof(CPUTLBEntry, TY), \
                 (ADDR) & TARGET_PAGE_MASK)

+/* Probe for whether the specified guest write access is permitted.
+ * If it is not permitted then an exception will be taken in the same
+ * way as if this were a real write access (and we will not return).
+ * Otherwise the function will return, and there will be a valid
+ * entry in the TLB for this access.
+ */
+void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
+                 uintptr_t retaddr)
+{
+    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+
+    if ((addr & TARGET_PAGE_MASK)
+        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+        /* TLB entry is for a different page */
+        if (!VICTIM_TLB_HIT(addr_write, addr)) {
+            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+        }
+    }
+}
+
+/* Probe for a read-modify-write atomic operation.  Do not allow unaligned
+ * operations, or io operations to proceed.  Return the host address.  */
+static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
+                               TCGMemOpIdx oi, uintptr_t retaddr)
+{
+    size_t mmu_idx = get_mmuidx(oi);
+    size_t index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    CPUTLBEntry *tlbe = &env->tlb_table[mmu_idx][index];
+    target_ulong tlb_addr = tlbe->addr_write;
+    TCGMemOp mop = get_memop(oi);
+    int a_bits = get_alignment_bits(mop);
+    int s_bits = mop & MO_SIZE;
+
+    /* Adjust the given return address.  */
+    retaddr -= GETPC_ADJ;
+
+    /* Enforce guest required alignment.  */
+    if (unlikely(a_bits > 0 && (addr & ((1 << a_bits) - 1)))) {
+        /* ??? Maybe indicate atomic op to cpu_unaligned_access */
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                             mmu_idx, retaddr);
+    }
+
+    /* Enforce qemu required alignment.  */
+    if (unlikely(addr & ((1 << s_bits) - 1))) {
+        /* We get here if guest alignment was not requested,
+           or was not enforced by cpu_unaligned_access above.
+           We might widen the access and emulate, but for now
+           mark an exception and exit the cpu loop.  */
+        goto stop_the_world;
+    }
+
+    /* Check TLB entry and enforce page permissions.  */
+    if ((addr & TARGET_PAGE_MASK)
+        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+        if (!VICTIM_TLB_HIT(addr_write, addr)) {
+            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+        }
+        tlb_addr = tlbe->addr_write;
+    }
+
+    /* Notice an IO access, or a notdirty page.  */
+    if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
+        /* There's really nothing that can be done to
+           support this apart from stop-the-world.  */
+        goto stop_the_world;
+    }
+
+    /* Let the guest notice RMW on a write-only page.  */
+    if (unlikely(tlbe->addr_read != tlb_addr)) {
+        tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_LOAD, mmu_idx, retaddr);
+        /* Since we don't support reads and writes to different addresses,
+           and we do have the proper page loaded for write, this shouldn't
+           ever return.  But just in case, handle via stop-the-world.  */
+        goto stop_the_world;
+    }
+
+    return (void *)((uintptr_t)addr + tlbe->addend);
+
+ stop_the_world:
+    cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);
+}
+
+#ifdef TARGET_WORDS_BIGENDIAN
+# define TGT_BE(X)  (X)
+# define TGT_LE(X)  BSWAP(X)
+#else
+# define TGT_BE(X)  BSWAP(X)
+# define TGT_LE(X)  (X)
+#endif
+
 #define MMUSUFFIX _mmu

-#define SHIFT 0
+#define DATA_SIZE 1
 #include "softmmu_template.h"

-#define SHIFT 1
+#define DATA_SIZE 2
 #include "softmmu_template.h"

-#define SHIFT 2
+#define DATA_SIZE 4
 #include "softmmu_template.h"

-#define SHIFT 3
+#define DATA_SIZE 8
 #include "softmmu_template.h"
+
+/* First set of helpers allows passing in of OI and RETADDR.  This makes
+   them callable from other helpers.  */
+
+#define EXTRA_ARGS     , TCGMemOpIdx oi, uintptr_t retaddr
+#define ATOMIC_NAME(X) \
+    HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
+#define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, oi, retaddr)
+
+#define DATA_SIZE 1
+#include "atomic_template.h"
+
+#define DATA_SIZE 2
+#include "atomic_template.h"
+
+#define DATA_SIZE 4
+#include "atomic_template.h"
+
+#ifdef CONFIG_ATOMIC64
+#define DATA_SIZE 8
+#include "atomic_template.h"
+#endif
+
+#ifdef CONFIG_ATOMIC128
+#define DATA_SIZE 16
+#include "atomic_template.h"
+#endif
+
+/* Second set of helpers are directly callable from TCG as helpers.  */
+
+#undef EXTRA_ARGS
+#undef ATOMIC_NAME
+#undef ATOMIC_MMU_LOOKUP
+#define EXTRA_ARGS         , TCGMemOpIdx oi
+#define ATOMIC_NAME(X)     HELPER(glue(glue(atomic_ ## X, SUFFIX), END))
+#define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, oi, GETPC())
+
+#define DATA_SIZE 1
+#include "atomic_template.h"
+
+#define DATA_SIZE 2
+#include "atomic_template.h"
+
+#define DATA_SIZE 4
+#include "atomic_template.h"
+
+#ifdef CONFIG_ATOMIC64
+#define DATA_SIZE 8
+#include "atomic_template.h"
+#endif
+
+/* Code access functions.  */
+
 #undef MMUSUFFIX
-
 #define MMUSUFFIX _cmmu
 #undef GETPC
 #define GETPC() ((uintptr_t)0)
 #define SOFTMMU_CODE_ACCESS

-#define SHIFT 0
+#define DATA_SIZE 1
 #include "softmmu_template.h"

-#define SHIFT 1
+#define DATA_SIZE 2
 #include "softmmu_template.h"

-#define SHIFT 2
+#define DATA_SIZE 4
 #include "softmmu_template.h"

-#define SHIFT 3
+#define DATA_SIZE 8
 #include "softmmu_template.h"
--- a/crypto/cipher-builtin.c
+++ b/crypto/cipher-builtin.c
@@ -400,14 +400,26 @@ static int qcrypto_cipher_init_des_rfb(QCryptoCipher *cipher,
 }


-bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
+bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg,
+                             QCryptoCipherMode mode)
 {
    switch (alg) {
    case QCRYPTO_CIPHER_ALG_DES_RFB:
    case QCRYPTO_CIPHER_ALG_AES_128:
    case QCRYPTO_CIPHER_ALG_AES_192:
    case QCRYPTO_CIPHER_ALG_AES_256:
+        break;
+    default:
+        return false;
+    }
+
+    switch (mode) {
+    case QCRYPTO_CIPHER_MODE_ECB:
+    case QCRYPTO_CIPHER_MODE_CBC:
+    case QCRYPTO_CIPHER_MODE_XTS:
        return true;
+    case QCRYPTO_CIPHER_MODE_CTR:
+        return false;
    default:
        return false;
    }
@@ -421,6 +433,17 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
 {
    QCryptoCipher *cipher;

+    switch (mode) {
+    case QCRYPTO_CIPHER_MODE_ECB:
+    case QCRYPTO_CIPHER_MODE_CBC:
+    case QCRYPTO_CIPHER_MODE_XTS:
+        break;
+    default:
+        error_setg(errp, "Unsupported cipher mode %s",
+                   QCryptoCipherMode_lookup[mode]);
+        return NULL;
+    }
+
    cipher = g_new0(QCryptoCipher, 1);
    cipher->alg = alg;
    cipher->mode = mode;
--- a/crypto/cipher-gcrypt.c
+++ b/crypto/cipher-gcrypt.c
@@ -24,7 +24,8 @@
 #include <gcrypt.h>


-bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
+bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg,
+                             QCryptoCipherMode mode)
 {
    switch (alg) {
    case QCRYPTO_CIPHER_ALG_DES_RFB:
@@ -37,6 +38,16 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
    case QCRYPTO_CIPHER_ALG_SERPENT_256:
    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
+        break;
+    default:
+        return false;
+    }
+
+    switch (mode) {
+    case QCRYPTO_CIPHER_MODE_ECB:
+    case QCRYPTO_CIPHER_MODE_CBC:
+    case QCRYPTO_CIPHER_MODE_XTS:
+    case QCRYPTO_CIPHER_MODE_CTR:
        return true;
    default:
        return false;
@@ -48,6 +59,7 @@ struct QCryptoCipherGcrypt {
    gcry_cipher_hd_t handle;
    gcry_cipher_hd_t tweakhandle;
    size_t blocksize;
+    /* Initialization vector or Counter */
    uint8_t *iv;
 };

@@ -69,6 +81,9 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    case QCRYPTO_CIPHER_MODE_CBC:
        gcrymode = GCRY_CIPHER_MODE_CBC;
        break;
+    case QCRYPTO_CIPHER_MODE_CTR:
+        gcrymode = GCRY_CIPHER_MODE_CTR;
+        break;
    default:
        error_setg(errp, "Unsupported cipher mode %s",
                   QCryptoCipherMode_lookup[mode]);
@@ -339,12 +354,21 @@ int qcrypto_cipher_setiv(QCryptoCipher *cipher,
    if (ctx->iv) {
        memcpy(ctx->iv, iv, niv);
    } else {
-        gcry_cipher_reset(ctx->handle);
-        err = gcry_cipher_setiv(ctx->handle, iv, niv);
-        if (err != 0) {
-            error_setg(errp, "Cannot set IV: %s",
-                   gcry_strerror(err));
-            return -1;
+        if (cipher->mode == QCRYPTO_CIPHER_MODE_CTR) {
+            err = gcry_cipher_setctr(ctx->handle, iv, niv);
+            if (err != 0) {
+                error_setg(errp, "Cannot set Counter: %s",
+                       gcry_strerror(err));
+                return -1;
+            }
+        } else {
+            gcry_cipher_reset(ctx->handle);
+            err = gcry_cipher_setiv(ctx->handle, iv, niv);
+            if (err != 0) {
+                error_setg(errp, "Cannot set IV: %s",
+                       gcry_strerror(err));
+                return -1;
+            }
        }
    }

--- a/crypto/cipher-nettle.c
+++ b/crypto/cipher-nettle.c
@@ -28,6 +28,7 @@
 #include <nettle/cast128.h>
 #include <nettle/serpent.h>
 #include <nettle/twofish.h>
+#include <nettle/ctr.h>

 typedef void (*QCryptoCipherNettleFuncWrapper)(const void *ctx,
                                               size_t length,
@@ -186,12 +187,13 @@ struct QCryptoCipherNettle {
    QCryptoCipherNettleFuncNative alg_decrypt_native;
    QCryptoCipherNettleFuncWrapper alg_encrypt_wrapper;
    QCryptoCipherNettleFuncWrapper alg_decrypt_wrapper;
-
+    /* Initialization vector or Counter */
    uint8_t *iv;
    size_t blocksize;
 };

-bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
+bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg,
+                             QCryptoCipherMode mode)
 {
    switch (alg) {
    case QCRYPTO_CIPHER_ALG_DES_RFB:
@@ -205,6 +207,16 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
    case QCRYPTO_CIPHER_ALG_TWOFISH_128:
    case QCRYPTO_CIPHER_ALG_TWOFISH_192:
    case QCRYPTO_CIPHER_ALG_TWOFISH_256:
+        break;
+    default:
+        return false;
+    }
+
+    switch (mode) {
+    case QCRYPTO_CIPHER_MODE_ECB:
+    case QCRYPTO_CIPHER_MODE_CBC:
+    case QCRYPTO_CIPHER_MODE_XTS:
+    case QCRYPTO_CIPHER_MODE_CTR:
        return true;
    default:
        return false;
@@ -225,6 +237,7 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm alg,
    case QCRYPTO_CIPHER_MODE_ECB:
    case QCRYPTO_CIPHER_MODE_CBC:
    case QCRYPTO_CIPHER_MODE_XTS:
+    case QCRYPTO_CIPHER_MODE_CTR:
        break;
    default:
        error_setg(errp, "Unsupported cipher mode %s",
@@ -430,6 +443,12 @@ int qcrypto_cipher_encrypt(QCryptoCipher *cipher,
                    ctx->iv, len, out, in);
        break;

+    case QCRYPTO_CIPHER_MODE_CTR:
+        ctr_crypt(ctx->ctx, ctx->alg_encrypt_native,
+                    ctx->blocksize, ctx->iv,
+                    len, out, in);
+        break;
+
    default:
        error_setg(errp, "Unsupported cipher mode %s",
                   QCryptoCipherMode_lookup[cipher->mode]);
@@ -469,6 +488,11 @@ int qcrypto_cipher_decrypt(QCryptoCipher *cipher,
                    ctx->alg_encrypt_wrapper, ctx->alg_decrypt_wrapper,
                    ctx->iv, len, out, in);
        break;
+    case QCRYPTO_CIPHER_MODE_CTR:
+        ctr_crypt(ctx->ctx, ctx->alg_encrypt_native,
+                    ctx->blocksize, ctx->iv,
+                    len, out, in);
+        break;

    default:
        error_setg(errp, "Unsupported cipher mode %s",
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -55,6 +55,7 @@ static bool mode_need_iv[QCRYPTO_CIPHER_MODE__MAX] = {
    [QCRYPTO_CIPHER_MODE_ECB] = false,
    [QCRYPTO_CIPHER_MODE_CBC] = true,
    [QCRYPTO_CIPHER_MODE_XTS] = true,
+    [QCRYPTO_CIPHER_MODE_CTR] = true,
 };


--- a/crypto/init.c
+++ b/crypto/init.c
@@ -119,6 +119,10 @@ static struct gcry_thread_cbs qcrypto_gcrypt_thread_impl = {

 int qcrypto_init(Error **errp)
 {
+#ifdef QCRYPTO_INIT_GCRYPT_THREADS
+    gcry_control(GCRYCTL_SET_THREAD_CBS, &qcrypto_gcrypt_thread_impl);
+#endif /* QCRYPTO_INIT_GCRYPT_THREADS */
+
 #ifdef CONFIG_GNUTLS
    int ret;
    ret = gnutls_global_init();
@@ -139,9 +143,6 @@ int qcrypto_init(Error **errp)
        error_setg(errp, "Unable to initialize gcrypt");
        return -1;
    }
-#ifdef QCRYPTO_INIT_GCRYPT_THREADS
-    gcry_control(GCRYCTL_SET_THREAD_CBS, &qcrypto_gcrypt_thread_impl);
-#endif /* QCRYPTO_INIT_GCRYPT_THREADS */
    gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0);
 #endif

--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -86,6 +86,8 @@ CONFIG_ZYNQ=y
 CONFIG_STM32F2XX_TIMER=y
 CONFIG_STM32F2XX_USART=y
 CONFIG_STM32F2XX_SYSCFG=y
+CONFIG_STM32F2XX_ADC=y
+CONFIG_STM32F2XX_SPI=y
 CONFIG_STM32F205_SOC=y

 CONFIG_VERSATILE_PCI=y
--- a/default-configs/mips-softmmu-common.mak
+++ b/default-configs/mips-softmmu-common.mak
@@ -17,6 +17,7 @@ CONFIG_FDC=y
 CONFIG_ACPI=y
 CONFIG_ACPI_X86=y
 CONFIG_ACPI_MEMORY_HOTPLUG=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_ACPI_CPU_HOTPLUG=y
 CONFIG_APM=y
 CONFIG_I8257=y
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -39,6 +39,7 @@ CONFIG_I8259=y
 CONFIG_XILINX=y
 CONFIG_XILINX_ETHLITE=y
 CONFIG_PSERIES=y
+CONFIG_POWERNV=y
 CONFIG_PREP=y
 CONFIG_MAC=y
 CONFIG_E500=y
--- a/default-configs/unicore32-linux-user.mak
+++ b/default-configs/unicore32-linux-user.mak
@@ -1 +0,0 @@
-# Default configuration for unicore32-linux-user
--- a/disas/ppc.c
+++ b/disas/ppc.c
@@ -2286,6 +2286,10 @@ const struct powerpc_opcode powerpc_opcodes[] = {
 { "vrlh",      VX(4,   68), VX_MASK,	PPCVEC,		{ VD, VA, VB } },
 { "vrlw",      VX(4,  132), VX_MASK,	PPCVEC,		{ VD, VA, VB } },
 { "vrsqrtefp", VX(4,  330), VX_MASK,	PPCVEC,		{ VD, VB } },
+{ "vrldmi",    VX(4,  197), VX_MASK,    PPCVEC,         { VD, VA, VB } },
+{ "vrldnm",    VX(4,  453), VX_MASK,    PPCVEC,         { VD, VA, VB } },
+{ "vrlwmi",    VX(4,  133), VX_MASK,    PPCVEC,         { VD, VA, VB} },
+{ "vrlwnm",    VX(4,  389), VX_MASK,    PPCVEC,         { VD, VA, VB } },
 { "vsel",      VXA(4,  42), VXA_MASK,	PPCVEC,		{ VD, VA, VB, VC } },
 { "vsl",       VX(4,  452), VX_MASK,	PPCVEC,		{ VD, VA, VB } },
 { "vslb",      VX(4,  260), VX_MASK,	PPCVEC,		{ VD, VA, VB } },
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -73,6 +73,7 @@ typedef struct {
    AioContext *ctx;
    BlockAIOCB *acb;
    QEMUSGList *sg;
+    uint32_t align;
    uint64_t offset;
    DMADirection dir;
    int sg_cur_index;
@@ -160,8 +161,9 @@ static void dma_blk_cb(void *opaque, int ret)
        return;
    }

-    if (dbs->iov.size & ~BDRV_SECTOR_MASK) {
-        qemu_iovec_discard_back(&dbs->iov, dbs->iov.size & ~BDRV_SECTOR_MASK);
+    if (!QEMU_IS_ALIGNED(dbs->iov.size, dbs->align)) {
+        qemu_iovec_discard_back(&dbs->iov,
+                                QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
    }

    dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
@@ -199,7 +201,7 @@ static const AIOCBInfo dma_aiocb_info = {
 };

 BlockAIOCB *dma_blk_io(AioContext *ctx,
-    QEMUSGList *sg, uint64_t offset,
+    QEMUSGList *sg, uint64_t offset, uint32_t align,
    DMAIOFunc *io_func, void *io_func_opaque,
    BlockCompletionFunc *cb,
    void *opaque, DMADirection dir)
@@ -212,6 +214,7 @@ BlockAIOCB *dma_blk_io(AioContext *ctx,
    dbs->sg = sg;
    dbs->ctx = ctx;
    dbs->offset = offset;
+    dbs->align = align;
    dbs->sg_cur_index = 0;
    dbs->sg_cur_byte = 0;
    dbs->dir = dir;
@@ -234,11 +237,11 @@ BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov,
 }

 BlockAIOCB *dma_blk_read(BlockBackend *blk,
-                         QEMUSGList *sg, uint64_t offset,
+                         QEMUSGList *sg, uint64_t offset, uint32_t align,
                         void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk_get_aio_context(blk),
-                      sg, offset, dma_blk_read_io_func, blk, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+                      dma_blk_read_io_func, blk, cb, opaque,
                      DMA_DIRECTION_FROM_DEVICE);
 }

@@ -252,11 +255,11 @@ BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov,
 }

 BlockAIOCB *dma_blk_write(BlockBackend *blk,
-                          QEMUSGList *sg, uint64_t offset,
+                          QEMUSGList *sg, uint64_t offset, uint32_t align,
                          void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk_get_aio_context(blk),
-                      sg, offset, dma_blk_write_io_func, blk, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+                      dma_blk_write_io_func, blk, cb, opaque,
                      DMA_DIRECTION_TO_DEVICE);
 }

--- a/docs/COLO-FT.txt
+++ b/docs/COLO-FT.txt
@@ -0,0 +1,191 @@
+COarse-grained LOck-stepping Virtual Machines for Non-stop Service
+----------------------------------------
+Copyright (c) 2016 Intel Corporation
+Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+Copyright (c) 2016 Fujitsu, Corp.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+This document gives an overview of COLO's design and how to use it.
+
+== Background ==
+Virtual machine (VM) replication is a well known technique for providing
+application-agnostic software-implemented hardware fault tolerance,
+also known as "non-stop service".
+
+COLO (COarse-grained LOck-stepping) is a high availability solution.
+Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the
+same request from client, and generate response in parallel too.
+If the response packets from PVM and SVM are identical, they are released
+immediately. Otherwise, a VM checkpoint (on demand) is conducted.
+
+== Architecture ==
+
+The architecture of COLO is shown in the diagram below.
+It consists of a pair of networked physical nodes:
+The primary node running the PVM, and the secondary node running the SVM
+to maintain a valid replica of the PVM.
+PVM and SVM execute in parallel and generate output of response packets for
+client requests according to the application semantics.
+
+The incoming packets from the client or external network are received by the
+primary node, and then forwarded to the secondary node, so that both the PVM
+and the SVM are stimulated with the same requests.
+
+COLO receives the outbound packets from both the PVM and SVM and compares them
+before allowing the output to be sent to clients.
+
+The SVM is qualified as a valid replica of the PVM, as long as it generates
+identical responses to all client requests. Once the differences in the outputs
+are detected between the PVM and SVM, COLO withholds transmission of the
+outbound packets until it has successfully synchronized the PVM state to the SVM.
+
+  Primary Node                                                            Secondary Node
+------------+  +-----------------------+       +------------------------+  +------------+
+|            |  |       HeartBeat       +<----->+       HeartBeat        |  |            |
+| Primary VM |  +-----------+-----------+       +-----------+------------+  |Secondary VM|
+|            |              |                               |               |            |
+|            |  +-----------|-----------+       +-----------|------------+  |            |
+|            |  |QEMU   +---v----+      |       |QEMU  +----v---+        |  |            |
+|            |  |       |Failover|      |       |      |Failover|        |  |            |
+|            |  |       +--------+      |       |      +--------+        |  |            |
+|            |  |   +---------------+   |       |   +---------------+    |  |            |
+|            |  |   | VM Checkpoint +-------------->+ VM Checkpoint |    |  |            |
+|            |  |   +---------------+   |       |   +---------------+    |  |            |
+|Requests<--------------------------\ /-----------------\ /--------------------->Requests|
+|            |  |                   ^ ^ |       |       | |              |  |            |
+|Responses+---------------------\ /-|-|------------\ /-------------------------+Responses|
+|            |  |               | | | | |       |  | |  | |              |  |            |
+|            |  | +-----------+ | | | | |       |  | |  | | +----------+ |  |            |
+|            |  | | COLO disk | | | | | |       |  | |  | | | COLO disk| |  |            |
+|            |  | |   Manager +---------------------------->| Manager  | |  |            |
+|            |  | ++----------+ v v | | |       |  | v  v | +---------++ |  |            |
+|            |  |  |+-----------+-+-+-++|       | ++-+--+-+---------+ |  |  |            |
+|            |  |  ||   COLO Proxy     ||       | |   COLO Proxy    | |  |  |            |
+|            |  |  || (compare packet  ||       | |(adjust sequence | |  |  |            |
+|            |  |  ||and mirror packet)||       | |    and ACK)     | |  |  |            |
+|            |  |  |+------------+---+-+|       | +-----------------+ |  |  |            |
+------------+  +-----------------------+       +------------------------+  +------------+
+------------+     |             |   |                                |     +------------+
+| VM Monitor |     |             |   |                                |     | VM Monitor |
+------------+     |             |   |                                |     +------------+
+---------------------------------------+       +----------------------------------------+
+|   Kernel         |             |   |  |       |   Kernel            |                  |
+---------------------------------------+       +----------------------------------------+
+                   |             |   |                                |
+    +--------------v+  +---------v---+--+       +------------------+ +v-------------+
+    |   Storage     |  |External Network|       | External Network | |   Storage    |
+    +---------------+  +----------------+       +------------------+ +--------------+
+
+
+== Components introduction ==
+
+You can see there are several components in COLO's diagram of architecture.
+Their functions are described below.
+
+HeartBeat:
+Runs on both the primary and secondary nodes, to periodically check platform
+availability. When the primary node suffers a hardware fail-stop failure,
+the heartbeat stops responding, and the secondary node triggers a failover
+as soon as it detects the absence.
+
+COLO disk Manager:
+When the primary VM writes data to its image, the COLO disk manager captures
+this data and sends it to the secondary VM, which makes sure the content of the
+secondary VM's image is consistent with the content of the primary VM's image.
+For more details, please refer to docs/block-replication.txt.
+
+Checkpoint/Failover Controller:
+Modifications of save/restore flow to realize continuous migration,
+to make sure the state of VM in Secondary side is always consistent with VM in
+Primary side.
+
+COLO Proxy:
+Delivers packets to both Primary and Secondary, and then compares the responses
+from both sides. It then decides whether to start a checkpoint according to
+some rules. Please refer to docs/colo-proxy.txt for more information.
+
+Note:
+HeartBeat has not been implemented yet, so you need to trigger failover process
+by using 'x-colo-lost-heartbeat' command.
+
+== Test procedure ==
+1. Startup qemu
+Primary:
+# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name primary \
+  -device piix3-usb-uhci \
+  -device usb-tablet -netdev tap,id=hn0,vhost=off \
+  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
+  -drive if=virtio,id=primary-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
+         children.0.file.filename=1.raw,\
+         children.0.driver=raw -S
+Secondary:
+# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name secondary \
+  -device piix3-usb-uhci \
+  -device usb-tablet -netdev tap,id=hn0,vhost=off \
+  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
+  -drive if=none,id=secondary-disk0,file.filename=1.raw,driver=raw,node-name=node0 \
+  -drive if=virtio,id=active-disk0,driver=replication,mode=secondary,\
+         file.driver=qcow2,top-id=active-disk0,\
+         file.file.filename=/mnt/ramfs/active_disk.img,\
+         file.backing.driver=qcow2,\
+         file.backing.file.filename=/mnt/ramfs/hidden_disk.img,\
+         file.backing.backing=secondary-disk0 \
+  -incoming tcp:0:8888
+
+2. On Secondary VM's QEMU monitor, issue command
+{'execute':'qmp_capabilities'}
+{ 'execute': 'nbd-server-start',
+  'arguments': {'addr': {'type': 'inet', 'data': {'host': 'xx.xx.xx.xx', 'port': '8889'} } }
+}
+{'execute': 'nbd-server-add', 'arguments': {'device': 'secondary-disk0', 'writable': true } }
+
+Note:
+  a. The qmp command nbd-server-start and nbd-server-add must be run
+     before running the qmp command migrate on primary QEMU
+  b. Active disk, hidden disk and nbd target's length should be the
+     same.
+  c. It is better to put active disk and hidden disk in ramdisk.
+
+3. On Primary VM's QEMU monitor, issue command:
+{'execute':'qmp_capabilities'}
+{ 'execute': 'human-monitor-command',
+  'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=xx.xx.xx.xx,file.port=8889,file.export=secondary-disk0,node-name=nbd_client0'}}
+{ 'execute':'x-blockdev-change', 'arguments':{'parent': 'primary-disk0', 'node': 'nbd_client0' } }
+{ 'execute': 'migrate-set-capabilities',
+      'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
+{ 'execute': 'migrate', 'arguments': {'uri': 'tcp:xx.xx.xx.xx:8888' } }
+
+  Note:
+  a. There should be only one NBD Client for each primary disk.
+  b. xx.xx.xx.xx is the secondary physical machine's hostname or IP
+  c. The qmp command line must be run after running qmp command line in
+     secondary qemu.
+
+4. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced.
+You can issue command '{ "execute": "migrate-set-parameters" , "arguments":{ "x-checkpoint-delay": 2000 } }'
+to change the checkpoint period time
+
+5. Failover test
+You can kill Primary VM and run 'x_colo_lost_heartbeat' in Secondary VM's
+monitor at the same time, then SVM will failover and client will not detect this
+change.
+
+Before issuing '{ "execute": "x-colo-lost-heartbeat" }' command, we have to
+issue block related command to stop block replication.
+Primary:
+  Remove the nbd child from the quorum:
+  { 'execute': 'x-blockdev-change', 'arguments': {'parent': 'colo-disk0', 'child': 'children.1'}}
+  { 'execute': 'human-monitor-command','arguments': {'command-line': 'drive_del blk-buddy0'}}
+  Note: there is no qmp command to remove the blockdev now
+
+Secondary:
+  The primary host is down, so we should do the following thing:
+  { 'execute': 'nbd-server-stop' }
+
+== TODO ==
+1. Support continuous VM replication.
+2. Support shared storage.
+3. Develop the heartbeat part.
+4. Reduce checkpoint VM’s downtime while doing checkpoint.
--- a/docs/atomics.txt
+++ b/docs/atomics.txt
@@ -15,7 +15,8 @@ Macros defined by qemu/atomic.h fall in three camps:
 - compiler barriers: barrier();

 - weak atomic access and manual memory barriers: atomic_read(),
-  atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_read_barrier_depends();
+  atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_mb_acquire(),
+  smp_mb_release(), smp_read_barrier_depends();

 - sequentially consistent atomic access: everything else.

@@ -111,8 +112,8 @@ consistent primitives.

 When using this model, variables are accessed with atomic_read() and
 atomic_set(), and restrictions to the ordering of accesses is enforced
-using the smp_rmb(), smp_wmb(), smp_mb() and smp_read_barrier_depends()
-memory barriers.
+using the memory barrier macros: smp_rmb(), smp_wmb(), smp_mb(),
+smp_mb_acquire(), smp_mb_release(), smp_read_barrier_depends().

 atomic_read() and atomic_set() prevents the compiler from using
 optimizations that might otherwise optimize accesses out of existence
@@ -124,7 +125,7 @@ other threads, and which are local to the current thread or protected
 by other, more mundane means.

 Memory barriers control the order of references to shared memory.
-They come in four kinds:
+They come in six kinds:

 - smp_rmb() guarantees that all the LOAD operations specified before
  the barrier will appear to happen before all the LOAD operations
@@ -142,6 +143,16 @@ They come in four kinds:
  In other words, smp_wmb() puts a partial ordering on stores, but is not
  required to have any effect on loads.

+- smp_mb_acquire() guarantees that all the LOAD operations specified before
+  the barrier will appear to happen before all the LOAD or STORE operations
+  specified after the barrier with respect to the other components of
+  the system.
+
+- smp_mb_release() guarantees that all the STORE operations specified *after*
+  the barrier will appear to happen after all the LOAD or STORE operations
+  specified *before* the barrier with respect to the other components of
+  the system.
+
 - smp_mb() guarantees that all the LOAD and STORE operations specified
  before the barrier will appear to happen before all the LOAD and
  STORE operations specified after the barrier with respect to the other
@@ -149,8 +160,9 @@ They come in four kinds:

  smp_mb() puts a partial ordering on both loads and stores.  It is
  stronger than both a read and a write memory barrier; it implies both
-  smp_rmb() and smp_wmb(), but it also prevents STOREs coming before the
-  barrier from overtaking LOADs coming after the barrier and vice versa.
+  smp_mb_acquire() and smp_mb_release(), but it also prevents STOREs
+  coming before the barrier from overtaking LOADs coming after the
+  barrier and vice versa.

 - smp_read_barrier_depends() is a weaker kind of read barrier.  On
  most processors, whenever two loads are performed such that the
@@ -173,24 +185,21 @@ They come in four kinds:
 This is the set of barriers that is required *between* two atomic_read()
 and atomic_set() operations to achieve sequential consistency:

-                    |               2nd operation             |
-                    |-----------------------------------------|
-     1st operation  | (after last) | atomic_read | atomic_set |
-     ---------------+--------------+-------------+------------|
-     (before first) |              | none        | smp_wmb()  |
-     ---------------+--------------+-------------+------------|
-     atomic_read    | smp_rmb()    | smp_rmb()*  | **         |
-     ---------------+--------------+-------------+------------|
-     atomic_set     | none         | smp_mb()*** | smp_wmb()  |
-     ---------------+--------------+-------------+------------|
+                    |               2nd operation                   |
+                    |-----------------------------------------------|
+     1st operation  | (after last)   | atomic_read | atomic_set     |
+     ---------------+----------------+-------------+----------------|
+     (before first) |                | none        | smp_mb_release |
+     ---------------+----------------+-------------+----------------|
+     atomic_read    | smp_mb_acquire | smp_rmb*    | **             |
+     ---------------+----------------+-------------+----------------|
+     atomic_set     | none           | smp_mb()*** | smp_wmb()      |
+     ---------------+----------------+-------------+----------------|

       * Or smp_read_barrier_depends().

-      ** This requires a load-store barrier.  How to achieve this varies
-         depending on the machine, but in practice smp_rmb()+smp_wmb()
-         should have the desired effect.  For example, on PowerPC the
-         lwsync instruction is a combined load-load, load-store and
-         store-store barrier.
+      ** This requires a load-store barrier.  This is achieved by
+         either smp_mb_acquire() or smp_mb_release().

     *** This requires a store-load barrier.  On most machines, the only
         way to achieve this is a full barrier.
@@ -199,11 +208,11 @@ and atomic_set() operations to achieve sequential consistency:
 You can see that the two possible definitions of atomic_mb_read()
 and atomic_mb_set() are the following:

-    1) atomic_mb_read(p)   = atomic_read(p); smp_rmb()
-       atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v); smp_mb()
+    1) atomic_mb_read(p)   = atomic_read(p); smp_mb_acquire()
+       atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); smp_mb()

-    2) atomic_mb_read(p)   = smp_mb() atomic_read(p); smp_rmb()
-       atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v);
+    2) atomic_mb_read(p)   = smp_mb(); atomic_read(p); smp_mb_acquire()
+       atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v);

 Usually the former is used, because smp_mb() is expensive and a program
 normally has more reads than writes.  Therefore it makes more sense to
@@ -222,7 +231,7 @@ place barriers instead:
     thread 1                                thread 1
     -------------------------               ------------------------
     (other writes)
-                                             smp_wmb()
+                                             smp_mb_release()
     atomic_mb_set(&a, x)                    atomic_set(&a, x)
                                             smp_wmb()
     atomic_mb_set(&b, y)                    atomic_set(&b, y)
@@ -233,7 +242,13 @@ place barriers instead:
     y = atomic_mb_read(&b)                  y = atomic_read(&b)
                                             smp_rmb()
     x = atomic_mb_read(&a)                  x = atomic_read(&a)
-                                             smp_rmb()
+                                             smp_mb_acquire()
+
+  Note that the barrier between the stores in thread 1, and between
+  the loads in thread 2, has been optimized here to a write or a
+  read memory barrier respectively.  On some architectures, notably
+  ARMv7, smp_mb_acquire and smp_mb_release are just as expensive as
+  smp_mb, but smp_rmb and/or smp_wmb are more efficient.

 - sometimes, a thread is accessing many variables that are otherwise
  unrelated to each other (for example because, apart from the current
@@ -246,12 +261,12 @@ place barriers instead:
     n = 0;                                  n = 0;
     for (i = 0; i < 10; i++)          =>    for (i = 0; i < 10; i++)
       n += atomic_mb_read(&a[i]);             n += atomic_read(&a[i]);
-                                             smp_rmb();
+                                             smp_mb_acquire();

  Similarly, atomic_mb_set() can be transformed as follows:
  smp_mb():

-                                             smp_wmb();
+                                             smp_mb_release();
     for (i = 0; i < 10; i++)          =>    for (i = 0; i < 10; i++)
       atomic_mb_set(&a[i], false);            atomic_set(&a[i], false);
                                             smp_mb();
@@ -261,7 +276,7 @@ The two tricks can be combined.  In this case, splitting a loop in
 two lets you hoist the barriers out of the loops _and_ eliminate the
 expensive smp_mb():

-                                             smp_wmb();
+                                             smp_mb_release();
     for (i = 0; i < 10; i++) {        =>    for (i = 0; i < 10; i++)
       atomic_mb_set(&a[i], false);            atomic_set(&a[i], false);
       atomic_mb_set(&b[i], false);          smb_wmb();
@@ -312,8 +327,8 @@ access and for data dependency barriers:
                             smp_read_barrier_depends();
                             z = b[y];

-smp_wmb() also pairs with atomic_mb_read(), and smp_rmb() also pairs
-with atomic_mb_set().
+smp_wmb() also pairs with atomic_mb_read() and smp_mb_acquire(),
+and smp_rmb() also pairs with atomic_mb_set() and smp_mb_release().


 COMPARISON WITH LINUX KERNEL MEMORY BARRIERS
@@ -359,8 +374,9 @@ and memory barriers, and the equivalents in QEMU:
  note that smp_store_mb() is a little weaker than atomic_mb_set().
  atomic_mb_read() compiles to the same instructions as Linux's
  smp_load_acquire(), but this should be treated as an implementation
-  detail.  If required, QEMU might later add atomic_load_acquire() and
-  atomic_store_release() macros.
+  detail.  QEMU does have atomic_load_acquire() and atomic_store_release()
+  macros, but for now they are only used within atomic.h.  This may
+  change in the future.


 SOURCES
--- a/docs/colo-proxy.txt
+++ b/docs/colo-proxy.txt
@@ -0,0 +1,188 @@
+COLO-proxy
+----------
+Copyright (c) 2016 Intel Corporation
+Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+Copyright (c) 2016 Fujitsu, Corp.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+This document gives an overview of COLO proxy's design.
+
+== Background ==
+COLO-proxy is a part of the COLO project. It is used
+to compare network packets to help COLO decide
+whether to do a checkpoint. With COLO-proxy's help,
+COLO greatly improves performance.
+
+The filter-redirector, filter-mirror, colo-compare
+and filter-rewriter compose the COLO-proxy.
+
+== Architecture ==
+
+COLO-Proxy is based on qemu netfilter and it's a plugin for qemu netfilter
+(except colo-compare). It keeps the Secondary VM connected normally to the
+client and compares packets sent by the PVM with those sent by the SVM.
+If the packets differ, it notifies COLO-frame to do a checkpoint and sends
+all queued primary packets. Otherwise it just sends the queued primary
+packets and drops the queued secondary packets.
+
+Below is a COLO proxy ascii figure:
+
+ Primary qemu                                                           Secondary qemu
+--------------------------------------------------------------+       +----------------------------------------------------------------+
+| +----------------------------------------------------------+ |       |  +-----------------------------------------------------------+ |
+| |                                                          | |       |  |                                                           | |
+| |                        guest                             | |       |  |                        guest                              | |
+| |                                                          | |       |  |                                                           | |
+| +-------^--------------------------+-----------------------+ |       |  +---------------------+--------+----------------------------+ |
+|         |                          |                         |       |                        ^        |                              |
+|         |                          |                         |       |                        |        |                              |
+|         |  +------------------------------------------------------+  |                        |        |                              |
+|netfilter|  |                       |                         |    |  |   netfilter            |        |                              |
+| +----------+ +----------------------------+                  |    |  |  +-----------------------------------------------------------+ |
+| |       |  |                       |      |        out       |    |  |  |                     |        |  filter execute order      | |
+| |       |  |          +-----------------------------+        |    |  |  |                     |        | +------------------->      | |
+| |       |  |          |            |      |         |        |    |  |  |                     |        |   TCP                      | |
+| | +-----+--+-+  +-----v----+ +-----v----+ |pri +----+----+sec|    |  |  | +------------+  +---+----+---v+rewriter++  +------------+ | |
+| | |          |  |          | |          | |in  |         |in |    |  |  | |            |  |        |              |  |            | | |
+| | |  filter  |  |  filter  | |  filter  +------>  colo   <------+ +-------->  filter   +--> adjust |   adjust     +-->   filter   | | |
+| | |  mirror  |  |redirector| |redirector| |    | compare |   |  |    |  | | redirector |  | ack    |   seq        |  | redirector | | |
+| | |          |  |          | |          | |    |         |   |  |    |  | |            |  |        |              |  |            | | |
+| | +----^-----+  +----+-----+ +----------+ |    +---------+   |  |    |  | +------------+  +--------+--------------+  +---+--------+ | |
+| |      |   tx        |   rx           rx  |                  |  |    |  |            tx                        all       |  rx      | |
+| |      |             |                    |                  |  |    |  +-----------------------------------------------------------+ |
+| |      |             +--------------+     |                  |  |    |                                                   |            |
+| |      |   filter execute order     |     |                  |  |    |                                                   |            |
+| |      |  +---------------->        |     |                  |  +--------------------------------------------------------+            |
+| +-----------------------------------------+                  |       |                                                                |
+|        |                            |                        |       |                                                                |
+--------------------------------------------------------------+       +----------------------------------------------------------------+
+         |guest receive               | guest send
+         |                            |
+--------+----------------------------v------------------------+
+|                                                              |                          NOTE: filter direction is rx/tx/all
+|                         tap                                  |                          rx:receive packets sent to the netdev
+|                                                              |                          tx:receive packets sent by the netdev
+--------------------------------------------------------------+
+
+1.Guest receive packet route:
+
+Primary:
+
+Tap --> Mirror Client Filter
+The mirror client sends the packet to the guest and,
+at the same time, copies and forwards the packet to
+the secondary mirror server.
+
+Secondary:
+
+Mirror Server Filter --> TCP Rewriter
+If the received packet is a TCP packet, we adjust the ack
+and update the TCP checksum, then send it to the secondary
+guest. Otherwise we send it directly to the guest.
+
+2.Guest send packet route:
+
+Primary:
+
+Guest --> Redirect Server Filter
+The redirect server filter receives the primary guest packet
+but does nothing; it just passes it to the next filter.
+
+Redirect Server Filter --> COLO-Compare
+COLO-compare receives the primary guest packet, then
+waits for the secondary redirect packet to compare it with.
+If the packets are the same, send the queued primary packet and
+clear the queued secondary packet. Otherwise send the primary
+packet and do a checkpoint.
+
+COLO-Compare --> Another Redirector Filter
+The redirector gets the packet from colo-compare by using
+a chardev socket.
+
+Redirector Filter --> Tap
+Send the packet.
+
+Secondary:
+
+Guest --> TCP Rewriter Filter
+If the packet is a TCP packet, we adjust the seq
+and update the TCP checksum, then send it to the
+redirect client filter. Otherwise we send it directly to
+the redirect client filter.
+
+Redirect Client Filter --> Redirect Server Filter
+Forward packet to primary.
+
+== Components introduction ==
+
+Filter-mirror is a netfilter plugin.
+It gives qemu the ability to mirror
+packets to a chardev.
+
+Filter-redirector is a netfilter plugin.
+It gives qemu the ability to redirect net packet.
+Redirector can redirect filter's net packet to outdev,
+and redirect indev's packet to filter.
+
+                    filter
+                      +
+          redirector  |
+             +--------------+
+             |        |     |
+             |        |     |
+             |        |     |
+  indev +---------+   +---------->  outdev
+             |    |         |
+             |    |         |
+             |    |         |
+             +--------------+
+                  |
+                  v
+                filter
+
+COLO-compare does the packet comparing job.
+Packets coming from the primary char indev will be sent to outdev.
+Packets coming from the secondary char dev will be dropped after comparing.
+COLO-compare needs two input chardevs and one output chardev:
+primary_in=chardev1-id (source: primary send packet)
+secondary_in=chardev2-id (source: secondary send packet)
+outdev=chardev3-id
+
+Filter-rewriter will rewrite some of the secondary packets so that the
+secondary guest's tcp connection is established successfully.
+In this module we rewrite the ack of tcp packets sent to the secondary
+from the primary, and rewrite the seq of tcp packets sent to the primary
+from the secondary.
+
+== Usage ==
+
+Here, we use a demo ip and port to describe the usage more clearly.
+Primary(ip:3.3.3.3):
+-netdev tap,id=hn0,vhost=off,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown
+-device e1000,id=e0,netdev=hn0,mac=52:a4:00:12:78:66
+-chardev socket,id=mirror0,host=3.3.3.3,port=9003,server,nowait
+-chardev socket,id=compare1,host=3.3.3.3,port=9004,server,nowait
+-chardev socket,id=compare0,host=3.3.3.3,port=9001,server,nowait
+-chardev socket,id=compare0-0,host=3.3.3.3,port=9001
+-chardev socket,id=compare_out,host=3.3.3.3,port=9005,server,nowait
+-chardev socket,id=compare_out0,host=3.3.3.3,port=9005
+-object filter-mirror,id=m0,netdev=hn0,queue=tx,outdev=mirror0
+-object filter-redirector,netdev=hn0,id=redire0,queue=rx,indev=compare_out
+-object filter-redirector,netdev=hn0,id=redire1,queue=rx,outdev=compare0
+-object colo-compare,id=comp0,primary_in=compare0-0,secondary_in=compare1,outdev=compare_out0
+
+Secondary(ip:3.3.3.8):
+-netdev tap,id=hn0,vhost=off,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown
+-device e1000,netdev=hn0,mac=52:a4:00:12:78:66
+-chardev socket,id=red0,host=3.3.3.3,port=9003
+-chardev socket,id=red1,host=3.3.3.3,port=9004
+-object filter-redirector,id=f1,netdev=hn0,queue=tx,indev=red0
+-object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1
+
+Note:
+  a. COLO-proxy must work with COLO-frame and Block-replication.
+  b. Primary COLO must be started first, because COLO-proxy needs
+     the chardev socket servers running before the secondary is started.
+  c. Filter-rewriter only rewrites tcp packets.
--- a/docs/generic-loader.txt
+++ b/docs/generic-loader.txt
@@ -0,0 +1,92 @@
+Copyright (c) 2016 Xilinx Inc.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.  See
+the COPYING file in the top-level directory.
+
+
+The 'loader' device allows the user to load multiple images or values into
+QEMU at startup.
+
+Loading Data into Memory Values
+-------------------------------
+The loader device allows memory values to be set from the command line. This
+can be done by following the syntax below:
+
+     -device loader,addr=<addr>,data=<data>,data-len=<data-len>
+                   [,data-be=<data-be>][,cpu-num=<cpu-num>]
+
+    <addr>      - The address to store the data in.
+    <data>      - The value to be written to the address. The maximum size of
+                  the data is 8 bytes.
+    <data-len>  - The length of the data in bytes. This argument must be
+                  included if the data argument is.
+    <data-be>   - Set to true if the data to be stored on the guest should be
+                  written as big endian data. The default is to write little
+                  endian data.
+    <cpu-num>   - The number of the CPU's address space where the data should
+                  be loaded. If not specified the address space of the first
+                  CPU is used.
+
+All values are parsed using the standard QemuOpts parsing. This allows the user
+to specify any values in any format supported. By default the values
+will be parsed as decimal. To use hex values the user should prefix the number
+with a '0x'.
+
+An example of loading value 0x8000000e to address 0xfd1a0104 is:
+    -device loader,addr=0xfd1a0104,data=0x8000000e,data-len=4
+
+Setting a CPU's Program Counter
+-------------------------------
+The loader device allows the CPU's PC to be set from the command line. This
+can be done by following the syntax below:
+
+     -device loader,addr=<addr>,cpu-num=<cpu-num>
+
+    <addr>      - The value to use as the CPU's PC.
+    <cpu-num>   - The number of the CPU whose PC should be set to the
+                  specified value.
+
+All values are parsed using the standard QemuOpts parsing. This allows the user
+to specify any values in any format supported. By default the values
+will be parsed as decimal. To use hex values the user should prefix the number
+with a '0x'.
+
+An example of setting CPU 0's PC to 0x8000 is:
+    -device loader,addr=0x8000,cpu-num=0
+
+Loading Files
+-------------
+The loader device also allows files to be loaded into memory. It can load raw
+files and ELF executable files.  Raw files are loaded verbatim.  ELF executable
+files are loaded by an ELF loader.  The syntax is shown below:
+
+    -device loader,file=<file>[,addr=<addr>][,cpu-num=<cpu-num>][,force-raw=<raw>]
+
+    <file>      - A file to be loaded into memory
+    <addr>      - The addr in memory that the file should be loaded. This is
+                  ignored if you are using an ELF (unless force-raw is true).
+                  This is required if you aren't loading an ELF.
+    <cpu-num>   - This specifies the CPU that should be used. This is an
+                  optional argument and will cause the CPU's PC to be set to
+                  where the image is stored or in the case of an ELF file to
+                  the value in the header. This option should only be used
+                  for the boot image.
+                  This will also cause the image to be written to the specified
+                  CPU's address space. If not specified, the default is CPU 0.
+    <force-raw> - Setting force-raw=on forces the file to be treated as a raw
+                  image.  This can be used to load ELF files as if they were raw.
+
+All values are parsed using the standard QemuOpts parsing. This allows the user
+to specify any values in any format supported. By default the values
+will be parsed as decimal. To use hex values the user should prefix the number
+with a '0x'.
+
+An example of loading an ELF file which CPU0 will boot is shown below:
+    -device loader,file=./images/boot.elf,cpu-num=0
+
+Restrictions and ToDos
+----------------------
+ - At the moment it is just assumed that if you specify a cpu-num then you
+   want to set the PC as well. This might not always be the case. In future
+   the internal state 'set_pc' (which exists in the generic loader now) should
+   be exposed to the user so that they can choose if the PC is set or not.
--- a/docs/live-block-ops.txt
+++ b/docs/live-block-ops.txt
@@ -4,15 +4,20 @@ LIVE BLOCK OPERATIONS
 High level description of live block operations. Note these are not
 supported for use with the raw format at the moment.

+Note also that this document is incomplete and it currently only
+covers the 'stream' operation. Other operations supported by QEMU such
+as 'commit', 'mirror' and 'backup' are not described here yet. Please
+refer to the qapi/block-core.json file for an overview of those.
+
 Snapshot live merge
 ===================

 Given a snapshot chain, described in this document in the following
 format:

-[A] -> [B] -> [C] -> [D]
+[A] <- [B] <- [C] <- [D] <- [E]

-Where the rightmost object ([D] in the example) described is the current
+Where the rightmost object ([E] in the example) described is the current
 image which the guest OS has write access to. To the left of it is its base
 image, and so on accordingly until the leftmost image, which has no
 base.
@@ -21,11 +26,14 @@ The snapshot live merge operation transforms such a chain into a
 smaller one with fewer elements, such as this transformation relative
 to the first example:

-[A] -> [D]
+[A] <- [E]

-Currently only forward merge with target being the active image is
-supported, that is, data copy is performed in the right direction with
-destination being the rightmost image.
+Data is copied in the right direction with destination being the
+rightmost image, but any other intermediate image can be specified
+instead. In this example data is copied from [C] into [D], so [D] can
+be backed by [B]:
+
+[A] <- [B] <- [D] <- [E]

 The operation is implemented in QEMU through image streaming facilities.

@@ -35,14 +43,20 @@ streaming operation completes it raises a QMP event. 'block_stream'
 copies data from the backing file(s) into the active image. When finished,
 it adjusts the backing file pointer.

-The 'base' parameter specifies an image which data need not be streamed from.
-This image will be used as the backing file for the active image when the
-operation is finished.
+The 'base' parameter specifies an image which data need not be
+streamed from. This image will be used as the backing file for the
+destination image when the operation is finished.

-In the example above, the command would be:
+In the first example above, the command would be:

-(qemu) block_stream virtio0 A
+(qemu) block_stream virtio0 file-A.img

+In order to specify a destination image different from the active
+(rightmost) one we can use its node name instead.
+
+In the second example above, the command would be:
+
+(qemu) block_stream node-D file-B.img

 Live block copy
 ===============
--- a/docs/multiple-iothreads.txt
+++ b/docs/multiple-iothreads.txt
@@ -105,13 +105,10 @@ a BH in the target AioContext beforehand and then call qemu_bh_schedule().  No
 acquire/release or locking is needed for the qemu_bh_schedule() call.  But be
 sure to acquire the AioContext for aio_bh_new() if necessary.

-The relationship between AioContext and the block layer
-------------------------------------------------------
-The AioContext originates from the QEMU block layer because it provides a
-scoped way of running event loop iterations until all work is done.  This
-feature is used to complete all in-flight block I/O requests (see
-bdrv_drain_all()).  Nowadays AioContext is a generic event loop that can be
-used by any QEMU subsystem.
+AioContext and the block layer
+------------------------------
+The AioContext originates from the QEMU block layer, even though nowadays
+AioContext is a generic event loop that can be used by any QEMU subsystem.

 The block layer has support for AioContext integrated.  Each BlockDriverState
 is associated with an AioContext using bdrv_set_aio_context() and
@@ -122,13 +119,22 @@ Block layer code must therefore expect to run in an IOThread and avoid using
 old APIs that implicitly use the main loop.  See the "How to program for
 IOThreads" above for information on how to do that.

-If main loop code such as a QMP function wishes to access a BlockDriverState it
-must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the
-IOThread does not run in parallel.
+If main loop code such as a QMP function wishes to access a BlockDriverState
+it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
+that callbacks in the IOThread do not run in parallel.

-Long-running jobs (usually in the form of coroutines) are best scheduled in the
-BlockDriverState's AioContext to avoid the need to acquire/release around each
-bdrv_*() call.  Be aware that there is currently no mechanism to get notified
-when bdrv_set_aio_context() moves this BlockDriverState to a different
-AioContext (see bdrv_detach_aio_context()/bdrv_attach_aio_context()), so you
-may need to add this if you want to support long-running jobs.
+Code running in the monitor typically needs to ensure that past
+requests from the guest are completed.  When a block device is running
+in an IOThread, the IOThread can also process requests from the guest
+(via ioeventfd).  To achieve both objectives, wrap the code between
+bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained
+section".  The functions must be called between aio_context_acquire()
+and aio_context_release().  You can freely release and re-acquire the
+AioContext within a drained section.
+
+Long-running jobs (usually in the form of coroutines) are best scheduled in
+the BlockDriverState's AioContext to avoid the need to acquire/release around
+each bdrv_*() call.  The functions bdrv_add/remove_aio_context_notifier,
+or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends,
+can be used to get a notification whenever bdrv_set_aio_context() moves a
+BlockDriverState to a different AioContext.
--- a/docs/pcie.txt
+++ b/docs/pcie.txt
@@ -0,0 +1,310 @@
+PCI EXPRESS GUIDELINES
+======================
+
+1. Introduction
+================
+The doc proposes best practices on how to use PCI Express/PCI devices
+in PCI Express based machines and explains the reasoning behind them.
+
+The following presentations accompany this document:
+ (1) Q35 overview.
+     http://wiki.qemu.org/images/4/4e/Q35.pdf
+ (2) A comparison between PCI and PCI Express technologies.
+     http://wiki.qemu.org/images/f/f6/PCIvsPCIe.pdf
+
+Note: The usage examples are not intended to replace the full
+documentation, please use QEMU help to retrieve all options.
+
+2. Device placement strategy
+============================
+QEMU does not have a clear socket-device matching mechanism
+and allows any PCI/PCI Express device to be plugged into any
+PCI/PCI Express slot.
+Plugging a PCI device into a PCI Express slot might not always work and
+is weird anyway since it cannot be done for "bare metal".
+Plugging a PCI Express device into a PCI slot will hide the Extended
+Configuration Space and is thus also not recommended.
+
+The recommendation is to separate the PCI Express and PCI hierarchies.
+PCI Express devices should be plugged only into PCI Express Root Ports and
+PCI Express Downstream ports.
+
+2.1 Root Bus (pcie.0)
+=====================
+Place only the following kinds of devices directly on the Root Complex:
+    (1) PCI Devices (e.g. network card, graphics card, IDE controller),
+        not controllers. Place only legacy PCI devices on
+        the Root Complex. These will be considered Integrated Endpoints.
+        Note: Integrated Endpoints are not hot-pluggable.
+
+        Although the PCI Express spec does not forbid PCI Express devices as
+        Integrated Endpoints, existing hardware mostly integrates legacy PCI
+        devices with the Root Complex. Guest OSes are suspected to behave
+        strangely when PCI Express devices are integrated
+        with the Root Complex.
+
+    (2) PCI Express Root Ports (ioh3420), for starting exclusively PCI Express
+        hierarchies.
+
+    (3) DMI-PCI Bridges (i82801b11-bridge), for starting legacy PCI
+        hierarchies.
+
+    (4) Extra Root Complexes (pxb-pcie), if multiple PCI Express Root Buses
+        are needed.
+
+   pcie.0 bus
+   ----------------------------------------------------------------------------
+        |                |                    |                  |
+   -----------   ------------------   ------------------   --------------
+   | PCI Dev |   | PCIe Root Port |   | DMI-PCI Bridge |   |  pxb-pcie  |
+   -----------   ------------------   ------------------   --------------
+
+2.1.1 To plug a device into pcie.0 as a Root Complex Integrated Endpoint use:
+          -device <dev>[,bus=pcie.0]
+2.1.2 To expose a new PCI Express Root Bus use:
+          -device pxb-pcie,id=pcie.1,bus_nr=x[,numa_node=y][,addr=z]
+      Only PCI Express Root Ports and DMI-PCI bridges can be connected
+      to the pcie.1 bus:
+          -device ioh3420,id=root_port1[,bus=pcie.1][,chassis=x][,slot=y][,addr=z]                                     \
+          -device i82801b11-bridge,id=dmi_pci_bridge1,bus=pcie.1
+
+
+2.2 PCI Express only hierarchy
+==============================
+Always use PCI Express Root Ports to start PCI Express hierarchies.
+
+A PCI Express Root bus supports up to 32 devices. Since each
+PCI Express Root Port is a function and a multi-function
+device may support up to 8 functions, the maximum possible
+number of PCI Express Root Ports per PCI Express Root Bus is 256.
+
+Prefer grouping PCI Express Root Ports into multi-function devices
+to keep a simple flat hierarchy that is enough for most scenarios.
+Only use PCI Express Switches (x3130-upstream, xio3130-downstream)
+if there is no more room for PCI Express Root Ports.
+Please see section 4. for further justifications.
+
+Plug only PCI Express devices into PCI Express Ports.
+
+
+   pcie.0 bus
+   ----------------------------------------------------------------------------------
+        |                 |                                    |
+   -------------    -------------                        -------------
+   | Root Port |    | Root Port |                        | Root Port |
+   -------------    -------------                        -------------
+         |                            -------------------------|------------------------
+    ------------                      |                 -----------------              |
+    | PCIe Dev |                      |    PCI Express  | Upstream Port |              |
+    ------------                      |      Switch     -----------------              |
+                                      |                  |            |                |
+                                      |    -------------------    -------------------  |
+                                      |    | Downstream Port |    | Downstream Port |  |
+                                      |    -------------------    -------------------  |
+                                      -------------|-----------------------|------------
+                                             ------------
+                                             | PCIe Dev |
+                                             ------------
+
+2.2.1 Plugging a PCI Express device into a PCI Express Root Port:
+          -device ioh3420,id=root_port1,chassis=x,slot=y[,bus=pcie.0][,addr=z]  \
+          -device <dev>,bus=root_port1
+2.2.2 Using multi-function PCI Express Root Ports:
+      -device ioh3420,id=root_port1,multifunction=on,chassis=x,slot=y[,bus=pcie.0][,addr=z.0] \
+      -device ioh3420,id=root_port2,chassis=x1,slot=y1[,bus=pcie.0][,addr=z.1] \
+      -device ioh3420,id=root_port3,chassis=x2,slot=y2[,bus=pcie.0][,addr=z.2]
+2.2.3 Plugging a PCI Express device into a Switch:
+      -device ioh3420,id=root_port1,chassis=x,slot=y[,bus=pcie.0][,addr=z]  \
+      -device x3130-upstream,id=upstream_port1,bus=root_port1[,addr=x]          \
+      -device xio3130-downstream,id=downstream_port1,bus=upstream_port1,chassis=x1,slot=y1[,addr=z1] \
+      -device <dev>,bus=downstream_port1
+
+Notes:
+  - (slot, chassis) pair is mandatory and must be
+     unique for each PCI Express Root Port.
+  - 'addr' parameter can be 0 for all the examples above.
+
+
+2.3 PCI only hierarchy
+======================
+Legacy PCI devices can be plugged into pcie.0 as Integrated Endpoints,
+but, as mentioned in section 5, doing so means the legacy PCI
+device in question will be incapable of hot-unplugging.
+Besides that use DMI-PCI Bridges (i82801b11-bridge) in combination
+with PCI-PCI Bridges (pci-bridge) to start PCI hierarchies.
+
+Prefer flat hierarchies. For most scenarios a single DMI-PCI Bridge
+(having 32 slots) and several PCI-PCI Bridges attached to it
+(each supporting also 32 slots) will support hundreds of legacy devices.
+The recommendation is to populate one PCI-PCI Bridge under the DMI-PCI Bridge
+until it is full and then plug a new PCI-PCI Bridge...
+
+   pcie.0 bus
+   ----------------------------------------------
+        |                            |
+   -----------               ------------------
+   | PCI Dev |               | DMI-PCI BRIDGE |
+   -----------               ------------------
+                               |            |
+                  ------------------    ------------------
+                  | PCI-PCI Bridge |    | PCI-PCI Bridge |   ...
+                  ------------------    ------------------
+                                         |           |
+                                  -----------     -----------
+                                  | PCI Dev |     | PCI Dev |
+                                  -----------     -----------
+
+2.3.1 To plug a PCI device into pcie.0 as an Integrated Endpoint use:
+      -device <dev>[,bus=pcie.0]
+2.3.2 Plugging a PCI device into a PCI-PCI Bridge:
+      -device i82801b11-bridge,id=dmi_pci_bridge1[,bus=pcie.0]                        \
+      -device pci-bridge,id=pci_bridge1,bus=dmi_pci_bridge1[,chassis_nr=x][,addr=y]   \
+      -device <dev>,bus=pci_bridge1[,addr=x]
+      Note that 'addr' cannot be 0 unless shpc=off parameter is passed to
+      the PCI Bridge.
+
+3. IO space issues
+===================
+The PCI Express Root Ports and PCI Express Downstream ports are seen by
+Firmware/Guest OS as PCI-PCI Bridges. As required by the PCI spec, a 4K IO
+range should be reserved for each such Port, even though only one
+(multifunction) device can be plugged into each Port. This results in
+poor IO space utilization.
+
+The firmware used by QEMU (SeaBIOS/OVMF) may try further optimizations
+by not allocating IO space for each PCI Express Root / PCI Express
+Downstream port if:
+    (1) the port is empty, or
+    (2) the device behind the port has no IO BARs.
+
+The IO space is very limited, to 65536 byte-wide IO ports, and may even be
+fragmented by fixed IO ports owned by platform devices resulting in at most
+10 PCI Express Root Ports or PCI Express Downstream Ports per system
+if devices with IO BARs are used in the PCI Express hierarchy. Using the
+proposed device placing strategy solves this issue by using only
+PCI Express devices within PCI Express hierarchy.
+
+The PCI Express spec requires that PCI Express devices work properly
+without using IO ports. The PCI hierarchy has no such limitations.
+
+
+4. Bus numbers issues
+======================
+Each PCI domain can have up to only 256 buses and the QEMU PCI Express
+machines do not support multiple PCI domains even if extra Root
+Complexes (pxb-pcie) are used.
+
+Each element of the PCI Express hierarchy (Root Complexes,
+PCI Express Root Ports, PCI Express Downstream/Upstream ports)
+uses one bus number. Since only one (multifunction) device
+can be attached to a PCI Express Root Port or PCI Express Downstream
+Port it is advised to plan in advance for the expected number of
+devices to prevent bus number starvation.
+
+Avoiding PCI Express Switches (and thereby striving for a 'flatter' PCI
+Express hierarchy) enables the hierarchy to not spend bus numbers on
+Upstream Ports.
+
+The bus_nr properties of the pxb-pcie devices partition the 0..255 bus
+number space. All bus numbers assigned to the buses recursively behind a
+given pxb-pcie device's root bus must fit between the bus_nr property of
+that pxb-pcie device, and the lowest of the higher bus_nr properties
+that the command line sets for other pxb-pcie devices.
+
+
+5. Hot-plug
+============
+The PCI Express root buses (pcie.0 and the buses exposed by pxb-pcie devices)
+do not support hot-plug, so any devices plugged into Root Complexes
+cannot be hot-plugged/hot-unplugged:
+    (1) PCI Express Integrated Endpoints
+    (2) PCI Express Root Ports
+    (3) DMI-PCI Bridges
+    (4) pxb-pcie
+
+Be aware that PCI Express Downstream Ports can't be hot-plugged into
+an existing PCI Express Upstream Port.
+
+PCI devices can be hot-plugged into PCI-PCI Bridges. The PCI hot-plug is ACPI
+based and can work side by side with the PCI Express native hot-plug.
+
+PCI Express devices can be natively hot-plugged/hot-unplugged into/from
+PCI Express Root Ports (and PCI Express Downstream Ports).
+
+5.1 Planning for hot-plug:
+    (1) PCI hierarchy
+        Leave enough PCI-PCI Bridge slots empty or add one
+        or more empty PCI-PCI Bridges to the DMI-PCI Bridge.
+
+        For each such PCI-PCI Bridge the Guest Firmware is expected to reserve
+        4K IO space and 2M MMIO range to be used for all devices behind it.
+
+        Because of the hard IO limit of around 10 PCI Bridges (~ 40K space)
+        per system don't use more than 9 PCI-PCI Bridges, leaving 4K for the
+        Integrated Endpoints. (The PCI Express Hierarchy needs no IO space).
+
+    (2) PCI Express hierarchy:
+        Leave enough PCI Express Root Ports empty. Use multifunction
+        PCI Express Root Ports (up to 8 ports per pcie.0 slot)
+        on the Root Complex(es), for keeping the
+        hierarchy as flat as possible, thereby saving PCI bus numbers.
+        Don't use PCI Express Switches if you don't have
+        to, each one of those uses an extra PCI bus (for its Upstream Port)
+        that could be put to better use with another Root Port or Downstream
+        Port, which may come in handy for hot-plugging another device.
+
+
+5.2 Hot-plug example:
+Using HMP: (add -monitor stdio to QEMU command line)
+  device_add <dev>,id=<id>,bus=<PCI Express Root Port Id/PCI Express Downstream Port Id/PCI-PCI Bridge Id>
+
+
+6. Device assignment
+====================
+Host devices are mostly PCI Express and should be plugged only into
+PCI Express Root Ports or PCI Express Downstream Ports.
+PCI-PCI Bridge slots can be used for legacy PCI host devices.
+
+6.1 How to detect if a device is PCI Express:
+  > lspci -s 03:00.0 -v (as root)
+
+    03:00.0 Network controller: Intel Corporation Wireless 7260 (rev 83)
+    Subsystem: Intel Corporation Dual Band Wireless-AC 7260
+    Flags: bus master, fast devsel, latency 0, IRQ 50
+    Memory at f0400000 (64-bit, non-prefetchable) [size=8K]
+    Capabilities: [c8] Power Management version 3
+    Capabilities: [d0] MSI: Enable+ Count=1/1 Maskable- 64bit+
+    Capabilities: [40] Express Endpoint, MSI 00
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+    Capabilities: [100] Advanced Error Reporting
+    Capabilities: [140] Device Serial Number 7c-7a-91-ff-ff-90-db-20
+    Capabilities: [14c] Latency Tolerance Reporting
+    Capabilities: [154] Vendor Specific Information: ID=cafe Rev=1 Len=014 
+
+If you can see the "Express Endpoint" capability in the
+output, then the device is indeed PCI Express.
+
+
+7. Virtio devices
+=================
+Virtio devices plugged into the PCI hierarchy or as Integrated Endpoints
+will remain PCI and have transitional behaviour as default.
+Transitional virtio devices work in both IO and MMIO modes depending on
+the guest support. The Guest firmware will assign both IO and MMIO resources
+to transitional virtio devices.
+
+Virtio devices plugged into PCI Express ports are PCI Express devices and
+have "1.0" behavior by default without IO support.
+In both cases disable-legacy and disable-modern properties can be used
+to override the behaviour.
+
+Note that setting disable-legacy=off will enable legacy mode (enabling
+legacy behavior) for PCI Express virtio devices causing them to
+require IO space, which, given the limited available IO space, may quickly
+lead to resource exhaustion, and is therefore strongly discouraged.
+
+
+8. Conclusion
+==============
+The proposal offers a usage model that is easy to understand and follow
+and at the same time overcomes the PCI Express architecture limitations.
--- a/docs/qapi-code-gen.txt
+++ b/docs/qapi-code-gen.txt
@@ -1005,7 +1005,7 @@ Example:
        Error *err = NULL;
        Visitor *v;

-        v = qmp_output_visitor_new(ret_out);
+        v = qobject_output_visitor_new(ret_out);
        visit_type_UserDefOne(v, "unused", &ret_in, &err);
        if (!err) {
            visit_complete(v, ret_out);
@@ -1024,7 +1024,7 @@ Example:
        Visitor *v;
        UserDefOneList *arg1 = NULL;

-        v = qmp_input_visitor_new(QOBJECT(args), true);
+        v = qobject_input_visitor_new(QOBJECT(args), true);
        visit_start_struct(v, NULL, NULL, 0, &err);
        if (err) {
            goto out;
--- a/docs/qmp-commands.txt
+++ b/docs/qmp-commands.txt
@@ -20,7 +20,7 @@ Also, the following notation is used to denote data flow:
 -> data issued by the Client
 <- Server data response

-Please, refer to the QMP specification (QMP/qmp-spec.txt) for detailed
+Please, refer to the QMP specification (docs/qmp-spec.txt) for detailed
 information on the Server command and response formats.

 NOTE: This document is temporary and will be replaced soon.
@@ -554,6 +554,16 @@ Example:
 -> { "execute": "migrate_set_downtime", "arguments": { "value": 0.1 } }
 <- { "return": {} }

+x-colo-lost-heartbeat
+---------------------
+
+Tell COLO that heartbeat is lost, a failover or takeover is needed.
+
+Example:
+
+-> { "execute": "x-colo-lost-heartbeat" }
+<- { "return": {} }
+
 client_migrate_info
 -------------------

@@ -740,8 +750,11 @@ Arguments:
 - "job-id": Identifier for the newly-created block job. If omitted,
            the device name will be used. (json-string, optional)
 - "device": The device name or node-name of a root node (json-string)
- "base": The file name of the backing image above which copying starts
-          (json-string, optional)
+- "base": The file name of the backing image above which copying starts.
+          It cannot be set if 'base-node' is also set (json-string, optional)
+- "base-node": the node name of the backing image above which copying starts.
+               It cannot be set if 'base' is also set.
+               (json-string, optional) (Since 2.8)
 - "backing-file": The backing file string to write into the active layer. This
                  filename is not validated.

@@ -1090,11 +1103,11 @@ Arguments:
 Example:

 -> { "execute": "blockdev-add",
-                "arguments": { "options": { "driver": "qcow2",
-                                            "node-name": "node1534",
-                                            "file": { "driver": "file",
-                                                      "filename": "hd1.qcow2" },
-                                            "backing": "" } } }
+                "arguments": { "driver": "qcow2",
+                               "node-name": "node1534",
+                               "file": { "driver": "file",
+                                         "filename": "hd1.qcow2" },
+                               "backing": "" } }

 <- { "return": {} }

@@ -1790,7 +1803,7 @@ Each json-object contain the following:
                                "file", "file", "ftp", "ftps", "host_cdrom",
                                "host_device", "http", "https",
                                "nbd", "parallels", "qcow", "qcow2", "raw",
-                                "tftp", "vdi", "vmdk", "vpc", "vvfat"
+                                "vdi", "vmdk", "vpc", "vvfat"
         - "backing_file": backing file name (json-string, optional)
         - "backing_file_depth": number of files in the backing file chain (json-int)
         - "encrypted": true if encrypted, false otherwise (json-bool)
@@ -2861,6 +2874,7 @@ Enable/Disable migration capabilities
 - "compress": use multiple compression threads to accelerate live migration
 - "events": generate events for each migration state change
 - "postcopy-ram": postcopy mode for live migration
+- "x-colo": COarse-Grain LOck Stepping (COLO) for Non-stop Service

 Arguments:

@@ -2882,6 +2896,7 @@ Query current migration capabilities
         - "compress": Multiple compression threads state (json-bool)
         - "events": Migration state change event state (json-bool)
         - "postcopy-ram": postcopy ram state (json-bool)
+         - "x-colo": COarse-Grain LOck Stepping for Non-stop Service (json-bool)

 Arguments:

@@ -2895,7 +2910,8 @@ Example:
     {"state": false, "capability": "zero-blocks"},
     {"state": false, "capability": "compress"},
     {"state": true, "capability": "events"},
-     {"state": false, "capability": "postcopy-ram"}
+     {"state": false, "capability": "postcopy-ram"},
+     {"state": false, "capability": "x-colo"}
   ]}

 migrate-set-parameters
@@ -2910,6 +2926,10 @@ Set migration parameters
                          throttled for auto-converge (json-int)
 - "cpu-throttle-increment": set throttle increasing percentage for
                            auto-converge (json-int)
+- "max-bandwidth": set maximum speed for migrations (in bytes/sec) (json-int)
+- "downtime-limit": set maximum tolerated downtime (in milliseconds) for
+                    migrations (json-int)
+- "x-checkpoint-delay": set the delay time for periodic checkpoint (json-int)

 Arguments:

@@ -2931,7 +2951,10 @@ Query current migration parameters
                                    throttled (json-int)
         - "cpu-throttle-increment" : throttle increasing percentage for
                                      auto-converge (json-int)
-
+         - "max-bandwidth" : maximum migration speed in bytes per second
+                             (json-int)
+         - "downtime-limit" : maximum tolerated downtime of migration in
+                              milliseconds (json-int)
 Arguments:

 Example:
@@ -2943,7 +2966,9 @@ Example:
         "cpu-throttle-increment": 10,
         "compress-threads": 8,
         "compress-level": 1,
-         "cpu-throttle-initial": 20
+         "cpu-throttle-initial": 20,
+         "max-bandwidth": 33554432,
+         "downtime-limit": 300
      }
   }

@@ -3123,41 +3148,37 @@ This command is still a work in progress.  It doesn't support all
 block drivers among other things.  Stay away from it unless you want
 to help with its development.

-Arguments:
-
- "options": block driver options
+For the arguments, see the QAPI schema documentation of BlockdevOptions.

 Example (1):

 -> { "execute": "blockdev-add",
-    "arguments": { "options" : { "driver": "qcow2",
-                                 "file": { "driver": "file",
-                                           "filename": "test.qcow2" } } } }
+    "arguments": { "driver": "qcow2",
+                   "file": { "driver": "file",
+                             "filename": "test.qcow2" } } }
 <- { "return": {} }

 Example (2):

 -> { "execute": "blockdev-add",
     "arguments": {
-         "options": {
-           "driver": "qcow2",
-           "node-name": "my_disk",
-           "discard": "unmap",
-           "cache": {
-               "direct": true,
-               "writeback": true
-           },
-           "file": {
-               "driver": "file",
-               "filename": "/tmp/test.qcow2"
-           },
-           "backing": {
-               "driver": "raw",
-               "file": {
-                   "driver": "file",
-                   "filename": "/dev/fdset/4"
-               }
-           }
+         "driver": "qcow2",
+         "node-name": "my_disk",
+         "discard": "unmap",
+         "cache": {
+             "direct": true,
+             "writeback": true
+         },
+         "file": {
+             "driver": "file",
+             "filename": "/tmp/test.qcow2"
+         },
+         "backing": {
+             "driver": "raw",
+             "file": {
+                 "driver": "file",
+                 "filename": "/dev/fdset/4"
+             }
         }
       }
     }
@@ -3184,13 +3205,11 @@ Example:

 -> { "execute": "blockdev-add",
     "arguments": {
-         "options": {
-             "driver": "qcow2",
-             "node-name": "node0",
-             "file": {
-                 "driver": "file",
-                 "filename": "test.qcow2"
-             }
+         "driver": "qcow2",
+         "node-name": "node0",
+         "file": {
+             "driver": "file",
+             "filename": "test.qcow2"
         }
     }
   }
@@ -3239,6 +3258,7 @@ Example:
                    "microseconds": 716996 },
     "event": "DEVICE_TRAY_MOVED",
     "data": { "device": "ide1-cd0",
+               "id": "ide0-1-0",
               "tray-open": true } }

 <- { "return": {} }
@@ -3267,6 +3287,7 @@ Example:
                    "microseconds": 272147 },
     "event": "DEVICE_TRAY_MOVED",
     "data": { "device": "ide1-cd0",
+               "id": "ide0-1-0",
               "tray-open": false } }

 <- { "return": {} }
@@ -3303,6 +3324,7 @@ Example:
                    "microseconds": 549958 },
     "event": "DEVICE_TRAY_MOVED",
     "data": { "device": "ide1-cd0",
+               "id": "ide0-1-0",
               "tray-open": true } }

 <- { "return": {} }
@@ -3332,10 +3354,10 @@ Arguments:
 Example:

 -> { "execute": "blockdev-add",
-     "arguments": { "options": { "node-name": "node0",
-                                 "driver": "raw",
-                                 "file": { "driver": "file",
-                                           "filename": "fedora.iso" } } } }
+     "arguments": { "node-name": "node0",
+                    "driver": "raw",
+                    "file": { "driver": "file",
+                              "filename": "fedora.iso" } } }

 <- { "return": {} }

@@ -3373,10 +3395,10 @@ Example:

 Add a new node to a quorum
 -> { "execute": "blockdev-add",
-     "arguments": { "options": { "driver": "raw",
-                                 "node-name": "new_node",
-                                 "file": { "driver": "file",
-                                           "filename": "test.raw" } } } }
+     "arguments": { "driver": "raw",
+                    "node-name": "new_node",
+                    "file": { "driver": "file",
+                              "filename": "test.raw" } } }
 <- { "return": {} }
 -> { "execute": "x-blockdev-change",
     "arguments": { "parent": "disk1",
--- a/docs/qmp-events.txt
+++ b/docs/qmp-events.txt
@@ -65,7 +65,12 @@ Emitted when a disk I/O error occurs.

 Data:

- "device": device name (json-string)
+- "device": device name. This is always present for compatibility
+            reasons, but it can be empty ("") if the image does not
+            have a device name associated. (json-string)
+- "node-name": node name. Note that errors may be reported for the root node
+               that is directly attached to a guest device rather than for the
+               node where the error occurred. (json-string)
 - "operation": I/O operation (json-string, "read" or "write")
 - "action": action that has been taken, it's one of the following (json-string):
    "ignore": error has been ignored
@@ -76,6 +81,7 @@ Example:

 { "event": "BLOCK_IO_ERROR",
    "data": { "device": "ide0-hd1",
+              "node-name": "#block212",
              "operation": "write",
              "action": "stop" },
    "timestamp": { "seconds": 1265044230, "microseconds": 450486 } }
@@ -214,12 +220,16 @@ or by HMP/QMP commands.

 Data:

- "device": device name (json-string)
+- "device": Block device name. This is always present for compatibility
+            reasons, but it can be empty ("") if the image does not have a
+            device name associated. (json-string)
+- "id": The name or QOM path of the guest device (json-string)
 - "tray-open": true if the tray has been opened or false if it has been closed
               (json-bool)

 { "event": "DEVICE_TRAY_MOVED",
  "data": { "device": "ide1-cd0",
+            "id": "/machine/unattached/device[22]",
            "tray-open": true
  },
  "timestamp": { "seconds": 1265044230, "microseconds": 450486 } }
--- a/docs/rcu.txt
+++ b/docs/rcu.txt
@@ -145,7 +145,7 @@ The core RCU API is small:
        and then read from there.

        RCU read-side critical sections must use atomic_rcu_read() to
-        read data, unless concurrent writes are presented by another
+        read data, unless concurrent writes are prevented by another
        synchronization mechanism.

        Furthermore, RCU read-side critical sections should traverse the
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .7.50
 .7.91
				`@@ -1 +0,0 @@`
				`# Default configuration for unicore32-linux-user`