9pfs: Fully restart unreclaim loop (CVE-2021-20181)

Git-commit: 89fbea8737 References: bsc#1182137 Depending on the client activity, the server can be asked to open a huge number of file descriptors and eventually hit RLIMIT_NOFILE. This is currently mitigated using reclaim logic: the server closes the file descriptors of idle fids, based on the assumption that it will be able to re-open them later. This assumption doesn't hold, of course, if the client requests the file to be unlinked. In this case, we loop on the entire fid list and mark all related fids as unreclaimable (the reclaim logic will just ignore them) and, of course, we open or re-open their file descriptors if needed since we're about to unlink the file. This is the purpose of v9fs_mark_fids_unreclaim(). Since the actual opening of a file can cause the coroutine to yield, another client request could possibly add a new fid that we may want to mark as non-reclaimable as well. The loop is thus restarted if the re-open request was actually transmitted to the backend. This is achieved by keeping a reference on the first fid (head) before traversing the list. This is wrong in several ways: - a potential clunk request from the client could tear the first fid down and cause the reference to be stale. This leads to a use-after-free error that can be detected with ASAN, using a custom 9p client. - fids are added at the head of the list: restarting from the previous head will always miss fids added by some other potential request. All these problems could be avoided if fids were being added at the end of the list. This can be achieved with a QSIMPLEQ, but this is probably too much change for a bug fix. For now let's keep it simple and just restart the loop from the current head.
Fixes: CVE-2021-20181 Buglink: https://bugs.launchpad.net/qemu/+bug/1911666 Reported-by: Zero Day Initiative <zdi-disclosures@trendmicro.com> Reviewed-by: Christian Schoenebeck <qemu_oss@crudebyte.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Message-Id: <161064025265.1838153.15185571283519390907.stgit@bahia.lan> Signed-off-by: Greg Kurz <groug@kaod.org> Signed-off-by: Bruce Rogers <brogers@suse.com>
usb: fix setup_len init (CVE-2020-14364)
2021-04-06 16:51:18 -06:00 · 2021-04-06 16:51:18 -06:00 · 2021-04-06 16:51:18 -06:00 · 2021-04-06 16:50:56 -06:00 · 2021-03-29 14:27:09 -06:00 · 2021-03-29 14:27:09 -06:00
377 changed files with 15042 additions and 5845 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -92,6 +92,8 @@ pc-bios/optionrom/multiboot.img
 pc-bios/optionrom/kvmvapic.bin
 pc-bios/optionrom/kvmvapic.raw
 pc-bios/optionrom/kvmvapic.img
+pc-bios/s390-ccw/s390-ccw.elf
+pc-bios/s390-ccw/s390-ccw.img
 .stgit-*
 cscope.*
 tags
--- a/2
+++ b/2
@@ -170,7 +170,7 @@ qemu-io$(EXESUF): qemu-io.o cmd.o $(block-obj-y) libqemuutil.a libqemustub.a

 qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o

-fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/virtio-9p-marshal.o libqemuutil.a libqemustub.a
+fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal.o fsdev/9p-iov-marshal.o libqemuutil.a libqemustub.a
 fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap

 qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -16,16 +16,7 @@ block-obj-y += qapi-types.o qapi-visit.o

 block-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 block-obj-y += qemu-coroutine-sleep.o
-ifeq ($(CONFIG_UCONTEXT_COROUTINE),y)
-block-obj-$(CONFIG_POSIX) += coroutine-ucontext.o
-else
-ifeq ($(CONFIG_SIGALTSTACK_COROUTINE),y)
-block-obj-$(CONFIG_POSIX) += coroutine-sigaltstack.o
-else
-block-obj-$(CONFIG_POSIX) += coroutine-gthread.o
-endif
-endif
-block-obj-$(CONFIG_WIN32) += coroutine-win32.o
+block-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o

 ifeq ($(CONFIG_VIRTIO)$(CONFIG_VIRTFS)$(CONFIG_PCI),yyy)
 # Lots of the fsdev/9pcode is pulled in by vl.c via qemu_fsdev_add.
--- a/2
+++ b/2
@@ -1 +1 @@
-1.4.0
+1.4.2
--- a/arch_init.c
+++ b/arch_init.c
@@ -114,26 +114,6 @@ const uint32_t arch_type = QEMU_ARCH;
 #define RAM_SAVE_FLAG_CONTINUE 0x20
 #define RAM_SAVE_FLAG_XBZRLE   0x40

-#ifdef __ALTIVEC__
-#include <altivec.h>
-#define VECTYPE        vector unsigned char
-#define SPLAT(p)       vec_splat(vec_ld(0, p), 0)
-#define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
-/* altivec.h may redefine the bool macro as vector type.
- * Reset it to POSIX semantics. */
-#undef bool
-#define bool _Bool
-#elif defined __SSE2__
-#include <emmintrin.h>
-#define VECTYPE        __m128i
-#define SPLAT(p)       _mm_set1_epi8(*(p))
-#define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
-#else
-#define VECTYPE        unsigned long
-#define SPLAT(p)       (*(p) * (~0UL / 255))
-#define ALL_EQ(v1, v2) ((v1) == (v2))
-#endif
-

 static struct defconfig_file {
    const char *filename;
@@ -164,19 +144,10 @@ int qemu_read_default_config_files(bool userconfig)
    return 0;
 }

-static int is_dup_page(uint8_t *page)
+static inline bool is_zero_page(uint8_t *p)
 {
-    VECTYPE *p = (VECTYPE *)page;
-    VECTYPE val = SPLAT(page);
-    int i;
-
-    for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
-        if (!ALL_EQ(val, p[i])) {
-            return 0;
-        }
-    }
-
-    return 1;
+    return buffer_find_nonzero_offset(p, TARGET_PAGE_SIZE) ==
+        TARGET_PAGE_SIZE;
 }

 /* struct contains XBZRLE cache and a static page
@@ -210,6 +181,7 @@ int64_t xbzrle_cache_resize(int64_t new_size)
 /* accounting for migration statistics */
 typedef struct AccountingInfo {
    uint64_t dup_pages;
+    uint64_t skipped_pages;
    uint64_t norm_pages;
    uint64_t iterations;
    uint64_t xbzrle_bytes;
@@ -235,6 +207,16 @@ uint64_t dup_mig_pages_transferred(void)
    return acct_info.dup_pages;
 }

+uint64_t skipped_mig_bytes_transferred(void)
+{
+    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
+}
+
+uint64_t skipped_mig_pages_transferred(void)
+{
+    return acct_info.skipped_pages;
+}
+
 uint64_t norm_mig_bytes_transferred(void)
 {
    return acct_info.norm_pages * TARGET_PAGE_SIZE;
@@ -347,6 +329,7 @@ static ram_addr_t last_offset;
 static unsigned long *migration_bitmap;
 static uint64_t migration_dirty_pages;
 static uint32_t last_version;
+static bool ram_bulk_stage;

 static inline
 ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
@@ -356,7 +339,13 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    unsigned long size = base + (int128_get64(mr->size) >> TARGET_PAGE_BITS);

-    unsigned long next = find_next_bit(migration_bitmap, size, nr);
+    unsigned long next;
+
+    if (ram_bulk_stage && nr > base) {
+        next = nr + 1;
+    } else {
+        next = find_next_bit(migration_bitmap, size, nr);
+    }

    if (next < size) {
        clear_bit(next, migration_bitmap);
@@ -451,6 +440,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
            if (!block) {
                block = QTAILQ_FIRST(&ram_list.blocks);
                complete_round = true;
+                ram_bulk_stage = false;
            }
        } else {
            uint8_t *p;
@@ -461,13 +451,13 @@ static int ram_save_block(QEMUFile *f, bool last_stage)

            /* In doubt sent page as normal */
            bytes_sent = -1;
-            if (is_dup_page(p)) {
+            if (is_zero_page(p)) {
                acct_info.dup_pages++;
                bytes_sent = save_block_hdr(f, block, offset, cont,
                                            RAM_SAVE_FLAG_COMPRESS);
-                qemu_put_byte(f, *p);
-                bytes_sent += 1;
-            } else if (migrate_use_xbzrle()) {
+                qemu_put_byte(f, 0);
+                bytes_sent++;
+            } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
                current_addr = block->offset + offset;
                bytes_sent = save_xbzrle_page(f, p, current_addr, block,
                                              offset, cont, last_stage);
@@ -554,6 +544,7 @@ static void reset_ram_globals(void)
    last_sent_block = NULL;
    last_offset = 0;
    last_version = ram_list.version;
+    ram_bulk_stage = true;
 }

 #define MAX_WAIT 50 /* ms, half buffered_file limit */
@@ -745,7 +736,7 @@ static inline void *host_from_stream_offset(QEMUFile *f,
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
-        if (!block) {
+        if (!block || block->length <= offset) {
            fprintf(stderr, "Ack, bad migration stream!\n");
            return NULL;
        }
@@ -758,8 +749,9 @@ static inline void *host_from_stream_offset(QEMUFile *f,
    id[len] = 0;

    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (!strncmp(id, block->idstr, sizeof(id)))
+        if (!strncmp(id, block->idstr, sizeof(id)) && block->length > offset) {
            return memory_region_get_ram_ptr(block->mr) + offset;
+        }
    }

    fprintf(stderr, "Can't find block %s!\n", id);
@@ -833,14 +825,16 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
            }

            ch = qemu_get_byte(f);
-            memset(host, ch, TARGET_PAGE_SIZE);
+            if (ch != 0 || !is_zero_page(host)) {
+                memset(host, ch, TARGET_PAGE_SIZE);
 #ifndef _WIN32
-            if (ch == 0 &&
-                (!kvm_enabled() || kvm_has_sync_mmu()) &&
-                getpagesize() <= TARGET_PAGE_SIZE) {
-                qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
-            }
+                if (ch == 0 &&
+                    (!kvm_enabled() || kvm_has_sync_mmu()) &&
+                    getpagesize() <= TARGET_PAGE_SIZE) {
+                    qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
+                }
 #endif
+            }
        } else if (flags & RAM_SAVE_FLAG_PAGE) {
            void *host;

--- a/audio/audio.c
+++ b/audio/audio.c
@@ -2054,6 +2054,8 @@ void AUD_del_capture (CaptureVoiceOut *cap, void *cb_opaque)
                    sw = sw1;
                }
                QLIST_REMOVE (cap, entries);
+                g_free (cap->hw.mix_buf);
+                g_free (cap->buf);
                g_free (cap);
            }
            return;
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@@ -24,33 +24,12 @@ typedef struct RngEgd

    CharDriverState *chr;
    char *chr_name;
-
-    GSList *requests;
 } RngEgd;

-typedef struct RngRequest
-{
-    EntropyReceiveFunc *receive_entropy;
-    uint8_t *data;
-    void *opaque;
-    size_t offset;
-    size_t size;
-} RngRequest;
-
-static void rng_egd_request_entropy(RngBackend *b, size_t size,
-                                    EntropyReceiveFunc *receive_entropy,
-                                    void *opaque)
+static void rng_egd_request_entropy(RngBackend *b, RngRequest *req)
 {
    RngEgd *s = RNG_EGD(b);
-    RngRequest *req;
-
-    req = g_malloc(sizeof(*req));
-
-    req->offset = 0;
-    req->size = size;
-    req->receive_entropy = receive_entropy;
-    req->opaque = opaque;
-    req->data = g_malloc(req->size);
+    size_t size = req->size;

    while (size > 0) {
        uint8_t header[2];
@@ -64,23 +43,15 @@ static void rng_egd_request_entropy(RngBackend *b, size_t size,

        size -= len;
    }
-
-    s->requests = g_slist_append(s->requests, req);
 }

-static void rng_egd_free_request(RngRequest *req)
-{
-    g_free(req->data);
-    g_free(req);
-}
-
-static int rng_egd_chr_can_read(void *opaque)
+static size_t rng_egd_chr_can_read(void *opaque)
 {
    RngEgd *s = RNG_EGD(opaque);
    GSList *i;
-    int size = 0;
+    size_t size = 0;

-    for (i = s->requests; i; i = i->next) {
+    for (i = s->parent.requests; i; i = i->next) {
        RngRequest *req = i->data;
        size += req->size - req->offset;
    }
@@ -88,12 +59,12 @@ static int rng_egd_chr_can_read(void *opaque)
    return size;
 }

-static void rng_egd_chr_read(void *opaque, const uint8_t *buf, int size)
+static void rng_egd_chr_read(void *opaque, const uint8_t *buf, size_t size)
 {
    RngEgd *s = RNG_EGD(opaque);

-    while (size > 0 && s->requests) {
-        RngRequest *req = s->requests->data;
+    while (size > 0 && s->parent.requests) {
+        RngRequest *req = s->parent.requests->data;
        int len = MIN(size, req->size - req->offset);

        memcpy(req->data + req->offset, buf, len);
@@ -101,38 +72,13 @@ static void rng_egd_chr_read(void *opaque, const uint8_t *buf, int size)
        size -= len;

        if (req->offset == req->size) {
-            s->requests = g_slist_remove_link(s->requests, s->requests);
-
            req->receive_entropy(req->opaque, req->data, req->size);

-            rng_egd_free_request(req);
+            rng_backend_finalize_request(&s->parent, req);
        }
    }
 }

-static void rng_egd_free_requests(RngEgd *s)
-{
-    GSList *i;
-
-    for (i = s->requests; i; i = i->next) {
-        rng_egd_free_request(i->data);
-    }
-
-    g_slist_free(s->requests);
-    s->requests = NULL;
-}
-
-static void rng_egd_cancel_requests(RngBackend *b)
-{
-    RngEgd *s = RNG_EGD(b);
-
-    /* We simply delete the list of pending requests.  If there is data in the 
-     * queue waiting to be read, this is okay, because there will always be
-     * more data than we requested originally
-     */
-    rng_egd_free_requests(s);
-}
-
 static void rng_egd_opened(RngBackend *b, Error **errp)
 {
    RngEgd *s = RNG_EGD(b);
@@ -194,8 +140,6 @@ static void rng_egd_finalize(Object *obj)
    }

    g_free(s->chr_name);
-
-    rng_egd_free_requests(s);
 }

 static void rng_egd_class_init(ObjectClass *klass, void *data)
@@ -203,7 +147,6 @@ static void rng_egd_class_init(ObjectClass *klass, void *data)
    RngBackendClass *rbc = RNG_BACKEND_CLASS(klass);

    rbc->request_entropy = rng_egd_request_entropy;
-    rbc->cancel_requests = rng_egd_cancel_requests;
    rbc->opened = rng_egd_opened;
 }

--- a/backends/rng-random.c
+++ b/backends/rng-random.c
@@ -21,10 +21,6 @@ struct RndRandom

    int fd;
    char *filename;
-
-    EntropyReceiveFunc *receive_func;
-    void *opaque;
-    size_t size;
 };

 /**
@@ -37,33 +33,35 @@ struct RndRandom
 static void entropy_available(void *opaque)
 {
    RndRandom *s = RNG_RANDOM(opaque);
-    uint8_t buffer[s->size];
-    ssize_t len;

-    len = read(s->fd, buffer, s->size);
-    g_assert(len != -1);
+    while (s->parent.requests != NULL) {
+        RngRequest *req = s->parent.requests->data;
+        ssize_t len;

-    s->receive_func(s->opaque, buffer, len);
-    s->receive_func = NULL;
+        len = read(s->fd, req->data, req->size);
+        if (len < 0 && errno == EAGAIN) {
+            return;
+        }
+        g_assert(len != -1);

+        req->receive_entropy(req->opaque, req->data, len);
+
+        rng_backend_finalize_request(&s->parent, req);
+    }
+
+    /* We've drained all requests, the fd handler can be reset. */
    qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
 }

-static void rng_random_request_entropy(RngBackend *b, size_t size,
-                                        EntropyReceiveFunc *receive_entropy,
-                                        void *opaque)
+static void rng_random_request_entropy(RngBackend *b, RngRequest *req)
 {
    RndRandom *s = RNG_RANDOM(b);

-    if (s->receive_func) {
-        s->receive_func(s->opaque, NULL, 0);
+    if (s->parent.requests == NULL) {
+        /* If there are no pending requests yet, we need to
+         * install our fd handler. */
+        qemu_set_fd_handler(s->fd, entropy_available, NULL, s);
    }
-
-    s->receive_func = receive_entropy;
-    s->opaque = opaque;
-    s->size = size;
-
-    qemu_set_fd_handler(s->fd, entropy_available, NULL, s);
 }

 static void rng_random_opened(RngBackend *b, Error **errp)
@@ -74,7 +72,7 @@ static void rng_random_opened(RngBackend *b, Error **errp)
        error_set(errp, QERR_INVALID_PARAMETER_VALUE,
                  "filename", "a valid filename");
    } else {
-        s->fd = open(s->filename, O_RDONLY | O_NONBLOCK);
+        s->fd = qemu_open(s->filename, O_RDONLY | O_NONBLOCK);

        if (s->fd == -1) {
            error_set(errp, QERR_OPEN_FILE_FAILED, s->filename);
@@ -130,7 +128,7 @@ static void rng_random_finalize(Object *obj)
    qemu_set_fd_handler(s->fd, NULL, NULL, NULL);

    if (s->fd != -1) {
-        close(s->fd);
+        qemu_close(s->fd);
    }

    g_free(s->filename);
--- a/backends/rng.c
+++ b/backends/rng.c
@@ -18,18 +18,20 @@ void rng_backend_request_entropy(RngBackend *s, size_t size,
                                 void *opaque)
 {
    RngBackendClass *k = RNG_BACKEND_GET_CLASS(s);
+    RngRequest *req;

    if (k->request_entropy) {
-        k->request_entropy(s, size, receive_entropy, opaque);
-    }
-}
+        req = g_malloc(sizeof(*req));

-void rng_backend_cancel_requests(RngBackend *s)
-{
-    RngBackendClass *k = RNG_BACKEND_GET_CLASS(s);
+        req->offset = 0;
+        req->size = size;
+        req->receive_entropy = receive_entropy;
+        req->opaque = opaque;
+        req->data = g_malloc(req->size);

-    if (k->cancel_requests) {
-        k->cancel_requests(s);
+        k->request_entropy(s, req);
+
+        s->requests = g_slist_append(s->requests, req);
    }
 }

@@ -68,6 +70,30 @@ static void rng_backend_prop_set_opened(Object *obj, bool value, Error **errp)
    }
 }

+static void rng_backend_free_request(RngRequest *req)
+{
+    g_free(req->data);
+    g_free(req);
+}
+
+static void rng_backend_free_requests(RngBackend *s)
+{
+    GSList *i;
+
+    for (i = s->requests; i; i = i->next) {
+        rng_backend_free_request(i->data);
+    }
+
+    g_slist_free(s->requests);
+    s->requests = NULL;
+}
+
+void rng_backend_finalize_request(RngBackend *s, RngRequest *req)
+{
+    s->requests = g_slist_remove(s->requests, req);
+    rng_backend_free_request(req);
+}
+
 static void rng_backend_init(Object *obj)
 {
    object_property_add_bool(obj, "opened",
@@ -76,11 +102,19 @@ static void rng_backend_init(Object *obj)
                             NULL);
 }

+static void rng_backend_finalize(Object *obj)
+{
+    RngBackend *s = RNG_BACKEND(obj);
+
+    rng_backend_free_requests(s);
+}
+
 static const TypeInfo rng_backend_info = {
    .name = TYPE_RNG_BACKEND,
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(RngBackend),
    .instance_init = rng_backend_init,
+    .instance_finalize = rng_backend_finalize,
    .class_size = sizeof(RngBackendClass),
    .abstract = true,
 };
--- a/block.c
+++ b/block.c
@@ -1940,6 +1940,10 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
 {
+    if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
+        return -EIO;
+    }
+
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
 }
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -18,5 +18,7 @@ endif
 common-obj-y += stream.o
 common-obj-y += commit.o
 common-obj-y += mirror.o
+block-obj-y += dictzip.o
+block-obj-y += tar.o

 $(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS)
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -38,57 +38,42 @@

 // not allocated: 0xffffffff

-// always little-endian
-struct bochs_header_v1 {
-    char magic[32]; // "Bochs Virtual HD Image"
-    char type[16]; // "Redolog"
-    char subtype[16]; // "Undoable" / "Volatile" / "Growing"
-    uint32_t version;
-    uint32_t header; // size of header
-
-    union {
-	struct {
-	    uint32_t catalog; // num of entries
-	    uint32_t bitmap; // bitmap size
-	    uint32_t extent; // extent size
-	    uint64_t disk; // disk size
-	    char padding[HEADER_SIZE - 64 - 8 - 20];
-	} redolog;
-	char padding[HEADER_SIZE - 64 - 8];
-    } extra;
-};
-
 // always little-endian
 struct bochs_header {
-    char magic[32]; // "Bochs Virtual HD Image"
-    char type[16]; // "Redolog"
-    char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+    char magic[32];     /* "Bochs Virtual HD Image" */
+    char type[16];      /* "Redolog" */
+    char subtype[16];   /* "Undoable" / "Volatile" / "Growing" */
    uint32_t version;
-    uint32_t header; // size of header
+    uint32_t header;    /* size of header */
+
+    uint32_t catalog;   /* num of entries */
+    uint32_t bitmap;    /* bitmap size */
+    uint32_t extent;    /* extent size */

    union {
-	struct {
-	    uint32_t catalog; // num of entries
-	    uint32_t bitmap; // bitmap size
-	    uint32_t extent; // extent size
-	    uint32_t reserved; // for ???
-	    uint64_t disk; // disk size
-	    char padding[HEADER_SIZE - 64 - 8 - 24];
-	} redolog;
-	char padding[HEADER_SIZE - 64 - 8];
+        struct {
+            uint32_t reserved;  /* for ??? */
+            uint64_t disk;      /* disk size */
+            char padding[HEADER_SIZE - 64 - 20 - 12];
+        } QEMU_PACKED redolog;
+        struct {
+            uint64_t disk;      /* disk size */
+            char padding[HEADER_SIZE - 64 - 20 - 8];
+        } QEMU_PACKED redolog_v1;
+        char padding[HEADER_SIZE - 64 - 20];
    } extra;
-};
+} QEMU_PACKED;

 typedef struct BDRVBochsState {
    CoMutex lock;
    uint32_t *catalog_bitmap;
-    int catalog_size;
+    uint32_t catalog_size;

-    int data_offset;
+    uint32_t data_offset;

-    int bitmap_blocks;
-    int extent_blocks;
-    int extent_size;
+    uint32_t bitmap_blocks;
+    uint32_t extent_blocks;
+    uint32_t extent_size;
 } BDRVBochsState;

 static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename)
@@ -111,9 +96,8 @@ static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename)
 static int bochs_open(BlockDriverState *bs, int flags)
 {
    BDRVBochsState *s = bs->opaque;
-    int i;
+    uint32_t i;
    struct bochs_header bochs;
-    struct bochs_header_v1 header_v1;
    int ret;

    bs->read_only = 1; // no write support yet
@@ -132,13 +116,20 @@ static int bochs_open(BlockDriverState *bs, int flags)
    }

    if (le32_to_cpu(bochs.version) == HEADER_V1) {
-      memcpy(&header_v1, &bochs, sizeof(bochs));
-      bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512;
+        bs->total_sectors = le64_to_cpu(bochs.extra.redolog_v1.disk) / 512;
    } else {
-      bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512;
+        bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512;
+    }
+
+    /* Limit to 1M entries to avoid unbounded allocation. This is what is
+     * needed for the largest image that bximage can create (~8 TB). */
+    s->catalog_size = le32_to_cpu(bochs.catalog);
+    if (s->catalog_size > 0x100000) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "Catalog size is too large");
+        return -EFBIG;
    }

-    s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog);
    s->catalog_bitmap = g_malloc(s->catalog_size * 4);

    ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
@@ -152,10 +143,27 @@ static int bochs_open(BlockDriverState *bs, int flags)

    s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4);

-    s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512;
-    s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512;
+    s->bitmap_blocks = 1 + (le32_to_cpu(bochs.bitmap) - 1) / 512;
+    s->extent_blocks = 1 + (le32_to_cpu(bochs.extent) - 1) / 512;

-    s->extent_size = le32_to_cpu(bochs.extra.redolog.extent);
+    s->extent_size = le32_to_cpu(bochs.extent);
+    if (s->extent_size == 0) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "Extent size may not be zero");
+        return -EINVAL;
+    } else if (s->extent_size > 0x800000) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "Extent size %" PRIu32 " is too large",
+                      s->extent_size);
+        return -EINVAL;
+    }
+
+    if (s->catalog_size < bs->total_sectors / s->extent_size) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "Catalog size is too small for this disk size");
+        ret = -EINVAL;
+        goto fail;
+    }

    qemu_co_mutex_init(&s->lock);
    return 0;
@@ -168,8 +176,8 @@ fail:
 static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
 {
    BDRVBochsState *s = bs->opaque;
-    int64_t offset = sector_num * 512;
-    int64_t extent_index, extent_offset, bitmap_offset;
+    uint64_t offset = sector_num * 512;
+    uint64_t extent_index, extent_offset, bitmap_offset;
    char bitmap_entry;

    // seek to sector
@@ -180,8 +188,9 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
 	return -1; /* not allocated */
    }

-    bitmap_offset = s->data_offset + (512 * s->catalog_bitmap[extent_index] *
-	(s->extent_blocks + s->bitmap_blocks));
+    bitmap_offset = s->data_offset +
+        (512 * (uint64_t) s->catalog_bitmap[extent_index] *
+        (s->extent_blocks + s->bitmap_blocks));

    /* read in bitmap for current extent */
    if (bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8),
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -26,6 +26,9 @@
 #include "qemu/module.h"
 #include <zlib.h>

+/* Maximum compressed block size */
+#define MAX_BLOCK_SIZE (64 * 1024 * 1024)
+
 typedef struct BDRVCloopState {
    CoMutex lock;
    uint32_t block_size;
@@ -67,6 +70,29 @@ static int cloop_open(BlockDriverState *bs, int flags)
        return ret;
    }
    s->block_size = be32_to_cpu(s->block_size);
+    if (s->block_size % 512) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "block_size %u must be a multiple of 512",
+                      s->block_size);
+        return -EINVAL;
+    }
+    if (s->block_size == 0) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "block_size cannot be zero");
+        return -EINVAL;
+    }
+
+    /* cloop's create_compressed_fs.c warns about block sizes beyond 256 KB but
+     * we can accept more.  Prevent ridiculous values like 4 GB - 1 since we
+     * need a buffer this big.
+     */
+    if (s->block_size > MAX_BLOCK_SIZE) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "block_size %u must be %u MB or less",
+                      s->block_size,
+                      MAX_BLOCK_SIZE / (1024 * 1024));
+        return -EINVAL;
+    }

    ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4);
    if (ret < 0) {
@@ -75,7 +101,25 @@ static int cloop_open(BlockDriverState *bs, int flags)
    s->n_blocks = be32_to_cpu(s->n_blocks);

    /* read offsets */
-    offsets_size = s->n_blocks * sizeof(uint64_t);
+    if (s->n_blocks > (UINT32_MAX - 1) / sizeof(uint64_t)) {
+        /* Prevent integer overflow */
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "n_blocks %u must be %zu or less",
+                      s->n_blocks,
+                      (UINT32_MAX - 1) / sizeof(uint64_t));
+        return -EINVAL;
+    }
+    offsets_size = (s->n_blocks + 1) * sizeof(uint64_t);
+    if (offsets_size > 512 * 1024 * 1024) {
+        /* Prevent ridiculous offsets_size which causes memory allocation to
+         * fail or overflows bdrv_pread() size.  In practice the 512 MB
+         * offsets[] limit supports 16 TB images at 256 KB block size.
+         */
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "image requires too many offsets, "
+                      "try increasing block size");
+        return -EINVAL;
+    }
    s->offsets = g_malloc(offsets_size);

    ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
@@ -83,13 +127,39 @@ static int cloop_open(BlockDriverState *bs, int flags)
        goto fail;
    }

-    for(i=0;i<s->n_blocks;i++) {
+    for (i = 0; i < s->n_blocks + 1; i++) {
+        uint64_t size;
+
        s->offsets[i] = be64_to_cpu(s->offsets[i]);
-        if (i > 0) {
-            uint32_t size = s->offsets[i] - s->offsets[i - 1];
-            if (size > max_compressed_block_size) {
-                max_compressed_block_size = size;
-            }
+        if (i == 0) {
+            continue;
+        }
+
+        if (s->offsets[i] < s->offsets[i - 1]) {
+            qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                          "offsets not monotonically increasing at "
+                          "index %u, image file is corrupt", i);
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        size = s->offsets[i] - s->offsets[i - 1];
+
+        /* Compressed blocks should be smaller than the uncompressed block size
+         * but maybe compression performed poorly so the compressed block is
+         * actually bigger.  Clamp down on unrealistic values to prevent
+         * ridiculous s->compressed_block allocation.
+         */
+        if (size > 2 * MAX_BLOCK_SIZE) {
+            qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                          "invalid compressed block size at index %u, "
+                          "image file is corrupt", i);
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        if (size > max_compressed_block_size) {
+            max_compressed_block_size = size;
        }
    }

@@ -179,9 +249,7 @@ static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num,
 static void cloop_close(BlockDriverState *bs)
 {
    BDRVCloopState *s = bs->opaque;
-    if (s->n_blocks > 0) {
-        g_free(s->offsets);
-    }
+    g_free(s->offsets);
    g_free(s->compressed_block);
    g_free(s->uncompressed_block);
    inflateEnd(&s->zstream);
--- a/block/curl.c
+++ b/block/curl.c
@@ -134,6 +134,11 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
    if (!s || !s->orig_buf)
        goto read_end;

+    if (s->buf_off >= s->buf_len) {
+        /* buffer full, read nothing */
+        return 0;
+    }
+    realsize = MIN(realsize, s->buf_len - s->buf_off);
    memcpy(s->orig_buf + s->buf_off, ptr, realsize);
    s->buf_off += realsize;

--- a/block/dictzip.c
+++ b/block/dictzip.c
@@ -0,0 +1,572 @@
+/*
+ * DictZip Block driver for dictzip enabled gzip files
+ *
+ * Use the "dictzip" tool from the "dictd" package to create gzip files that
+ * contain the extra DictZip headers.
+ *
+ * dictzip(1) is a compression program which creates compressed files in the
+ * gzip format (see RFC 1952). However, unlike gzip(1), dictzip(1) compresses
+ * the file in pieces and stores an index to the pieces in the gzip header.
+ * This allows random access to the file at the granularity of the compressed
+ * pieces (currently about 64kB) while maintaining good compression ratios
+ * (within 5% of the expected ratio for dictionary data).
+ * dictd(8) uses files stored in this format.
+ *
+ * For details on DictZip see http://dict.org/.
+ *
+ * Copyright (c) 2009 Alexander Graf <agraf@suse.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include <zlib.h>
+
+// #define DEBUG
+
+#ifdef DEBUG
+#define dprintf(fmt, ...) do { printf("dzip: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define dprintf(fmt, ...) do { } while (0)
+#endif
+
+#define SECTOR_SIZE 512
+#define Z_STREAM_COUNT 4
+#define CACHE_COUNT 20
+
+/* magic values */
+
+#define GZ_MAGIC1     0x1f
+#define GZ_MAGIC2     0x8b
+#define DZ_MAGIC1      'R'
+#define DZ_MAGIC2      'A'
+
+#define GZ_FEXTRA     0x04      /* Optional field (random access index)    */
+#define GZ_FNAME      0x08      /* Original name                           */
+#define GZ_COMMENT    0x10      /* Zero-terminated, human-readable comment */
+#define GZ_FHCRC      0x02      /* Header CRC16                            */
+
+/* offsets */
+
+#define GZ_ID            0      /* GZ_MAGIC (16bit)                        */
+#define GZ_FLG           3      /* FLaGs (see above)                       */
+#define GZ_XLEN         10      /* eXtra LENgth (16bit)                    */
+#define GZ_SI           12      /* Subfield ID (16bit)                     */
+#define GZ_VERSION      16      /* Version for subfield format             */
+#define GZ_CHUNKSIZE    18      /* Chunk size (16bit)                      */
+#define GZ_CHUNKCNT     20      /* Number of chunks (16bit)                */
+#define GZ_RNDDATA      22      /* Random access data (16bit)              */
+
+#define GZ_99_CHUNKSIZE 18      /* Chunk size (32bit)                      */
+#define GZ_99_CHUNKCNT  22      /* Number of chunks (32bit)                */
+#define GZ_99_FILESIZE  26      /* Size of unpacked file (64bit)           */
+#define GZ_99_RNDDATA   34      /* Random access data (32bit)              */
+
+struct BDRVDictZipState;
+
+typedef struct DictZipAIOCB {
+    BlockDriverAIOCB common;     /* generic part; holds the completion cb + opaque */
+    struct BDRVDictZipState *s;  /* driver state the request belongs to */
+    QEMUIOVector *qiov;          /* QIOV of the original request */
+    QEMUIOVector *qiov_gz;       /* QIOV of the gz subrequest */
+    QEMUBH *bh;                  /* BH for cache */
+    z_stream *zStream;           /* stream to use for decoding */
+    int zStream_id;              /* stream id of the above pointer */
+    size_t start;                /* offset into the uncompressed file */
+    size_t len;                  /* uncompressed bytes to read */
+    uint8_t *gzipped;            /* the gzipped data */
+    uint8_t *buf;                /* cached result */
+    size_t gz_len;               /* amount of gzip data */
+    size_t gz_start;             /* uncompressed starting point of gzip data */
+    uint64_t offset;             /* offset for "start" into the uncompressed chunk */
+    int chunks_len;              /* amount of uncompressed data in all gzip data */
+} DictZipAIOCB;
+
+typedef struct dict_cache {
+    size_t start;                /* uncompressed offset of the first cached byte */
+    size_t len;                  /* number of cached bytes; 0 marks an unused slot */
+    uint8_t *buf;                /* decompressed data, freed by the cache code */
+} DictCache;
+
+typedef struct BDRVDictZipState {
+    BlockDriverState *hd;               /* underlying (compressed) file */
+    z_stream zStream[Z_STREAM_COUNT];   /* pool of zlib inflate streams */
+    DictCache cache[CACHE_COUNT];       /* recently decompressed chunk runs */
+    int cache_index;                    /* cache slot that is overwritten next */
+    uint8_t  stream_in_use;             /* bitmask: bit i set = zStream[i] busy */
+    uint64_t chunk_len;                 /* uncompressed size of one chunk */
+    uint32_t chunk_cnt;                 /* number of chunks in the file */
+    uint16_t *chunks;                   /* compressed chunk sizes as read (LE) */
+    uint32_t *chunks32;                 /* 32-bit view of chunks, version 99 only */
+    uint64_t *offsets;                  /* file offset of each compressed chunk */
+    int64_t file_len;                   /* uncompressed length of the image */
+} BDRVDictZipState;
+
+static int dictzip_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    /* Score 100 for anything starting with the gzip magic, 0 otherwise; the
+     * DictZip extra field is not looked at here and 'filename' is unused. */
+    int is_gzip;
+
+    /* the size test comes first so neither byte is read from a short buffer */
+    is_gzip = (buf_size >= 2) && (buf[0] == GZ_MAGIC1) && (buf[1] == GZ_MAGIC2);
+
+    return is_gzip ? 100 : 0;
+}
+
+static int start_zStream(z_stream *zStream)
+{
+    zStream->next_in   = NULL;
+    zStream->avail_in  = 0;
+    zStream->next_out  = NULL;
+    zStream->avail_out = 0;
+    /* no custom allocator hooks; buffers get attached per request later on */
+    zStream->zalloc    = NULL;
+    zStream->zfree     = NULL;
+    zStream->opaque    = NULL;
+    return inflateInit2(zStream, -15);
+}
+
+static int dictzip_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVDictZipState *s = bs->opaque;
+    const char *err = "Unknown (read error?)";
+    uint8_t magic[2];
+    char buf[100];
+    uint8_t header_flags;
+    uint16_t chunk_len16;
+    uint16_t chunk_cnt16;
+    uint32_t chunk_len32;
+    uint16_t header_ver;
+    uint16_t tmp_short;
+    uint64_t offset;
+    int chunks_len;
+    int headerLength = GZ_XLEN - 1; /* offset of the last header byte seen so far */
+    int rnd_offs;
+    int ret;
+    int i;
+    const char *fname = filename;
+    /* Parse the gzip/DictZip header and index all chunks; 0 or negative errno */
+    if (!strncmp(filename, "dzip://", 7))
+        fname += 7;
+    else if (!strncmp(filename, "dzip:", 5))
+        fname += 5;
+
+    ret = bdrv_file_open(&s->hd, fname, flags);
+    if (ret < 0)
+        return ret;
+
+    /* initialize zlib streams */
+    for (i = 0; i < Z_STREAM_COUNT; i++) {
+        if (start_zStream( &s->zStream[i] ) != Z_OK) {
+            err = s->zStream[i].msg;
+            goto fail;
+        }
+    }
+
+    /* gzip header */
+    if (bdrv_pread(s->hd, GZ_ID, &magic, sizeof(magic)) != sizeof(magic))
+        goto fail;
+
+    if (!((magic[0] == GZ_MAGIC1) && (magic[1] == GZ_MAGIC2))) {
+        err = "No gzip file";
+        goto fail;
+    }
+
+    /* dzip header */
+    if (bdrv_pread(s->hd, GZ_FLG, &header_flags, 1) != 1)
+        goto fail;
+
+    if (!(header_flags & GZ_FEXTRA)) {
+        err = "Not a dictzip file (wrong flags)";
+        goto fail;
+    }
+
+    /* extra length */
+    if (bdrv_pread(s->hd, GZ_XLEN, &tmp_short, 2) != 2)
+        goto fail;
+
+    headerLength += le16_to_cpu(tmp_short) + 2;
+
+    /* DictZip magic */
+    if (bdrv_pread(s->hd, GZ_SI, &magic, 2) != 2)
+        goto fail;
+
+    if (magic[0] != DZ_MAGIC1 || magic[1] != DZ_MAGIC2) {
+        err = "Not a dictzip file (missing extra magic)";
+        goto fail;
+    }
+
+    /* DictZip version */
+    if (bdrv_pread(s->hd, GZ_VERSION, &header_ver, 2) != 2)
+        goto fail;
+
+    header_ver = le16_to_cpu(header_ver);
+
+    switch (header_ver) {
+        case 1: /* Normal DictZip */
+            /* chunk size */
+            if (bdrv_pread(s->hd, GZ_CHUNKSIZE, &chunk_len16, 2) != 2)
+                goto fail;
+
+            s->chunk_len = le16_to_cpu(chunk_len16);
+
+            /* chunk count */
+            if (bdrv_pread(s->hd, GZ_CHUNKCNT, &chunk_cnt16, 2) != 2)
+                goto fail;
+
+            s->chunk_cnt = le16_to_cpu(chunk_cnt16);
+            chunks_len = sizeof(short) * s->chunk_cnt;
+            rnd_offs = GZ_RNDDATA;
+            break;
+        case 99: /* Special Alex pigz version */
+            /* chunk size */
+            if (bdrv_pread(s->hd, GZ_99_CHUNKSIZE, &chunk_len32, 4) != 4)
+                goto fail;
+
+            dprintf("chunk len [%#x] = %d\n", GZ_99_CHUNKSIZE, chunk_len32);
+            s->chunk_len = le32_to_cpu(chunk_len32);
+
+            /* chunk count */
+            if (bdrv_pread(s->hd, GZ_99_CHUNKCNT, &s->chunk_cnt, 4) != 4)
+                goto fail;
+
+            s->chunk_cnt = le32_to_cpu(s->chunk_cnt);
+
+            dprintf("chunk len | count = %d | %d\n", s->chunk_len, s->chunk_cnt);
+
+            /* file size */
+            if (bdrv_pread(s->hd, GZ_99_FILESIZE, &s->file_len, 8) != 8)
+                goto fail;
+
+            s->file_len = le64_to_cpu(s->file_len);
+            chunks_len = sizeof(int) * s->chunk_cnt;
+            rnd_offs = GZ_99_RNDDATA;
+            break;
+        default:
+            err = "Invalid DictZip version";
+            goto fail;
+    }
+
+    /* random access data */
+    s->chunks = g_malloc(chunks_len);
+    if (header_ver == 99)
+        s->chunks32 = (uint32_t *)s->chunks;
+
+    if (bdrv_pread(s->hd, rnd_offs, s->chunks, chunks_len) != chunks_len)
+        goto fail;
+
+    /* orig filename: zero-terminated, starts right after the extra field */
+    if (header_flags & GZ_FNAME) {
+        if (bdrv_pread(s->hd, headerLength + 1, buf, sizeof(buf)) != sizeof(buf))
+            goto fail;
+
+        buf[sizeof(buf) - 1] = '\0';
+        headerLength += strlen(buf) + 1;
+        /* a completely filled buffer means the name did not fit: give up */
+        if (strlen(buf) == sizeof(buf) - 1)
+            goto fail;
+
+        dprintf("filename: %s\n", buf);
+    }
+
+    /* comment field: zero-terminated, starts one past the last header byte */
+    if (header_flags & GZ_COMMENT) {
+        if (bdrv_pread(s->hd, headerLength + 1, buf, sizeof(buf)) != sizeof(buf))
+            goto fail;
+
+        buf[sizeof(buf) - 1] = '\0';
+        headerLength += strlen(buf) + 1;
+        /* a completely filled buffer means the comment did not fit: give up */
+        if (strlen(buf) == sizeof(buf) - 1)
+            goto fail;
+
+        dprintf("comment: %s\n", buf);
+    }
+
+    if (header_flags & GZ_FHCRC)
+        headerLength += 2;
+
+    /* uncompressed file length*/
+    if (!s->file_len) {
+        uint32_t file_len;
+
+        if (bdrv_pread(s->hd, bdrv_getlength(s->hd) - 4, &file_len, 4) != 4)
+            goto fail;
+
+        s->file_len = le32_to_cpu(file_len);
+    }
+
+    /* compute offsets */
+    s->offsets = g_malloc(sizeof( *s->offsets ) * s->chunk_cnt);
+
+    for (offset = headerLength + 1, i = 0; i < s->chunk_cnt; i++) {
+        s->offsets[i] = offset;
+        switch (header_ver) {
+        case 1:
+            offset += le16_to_cpu(s->chunks[i]);
+            break;
+        case 99:
+            offset += le32_to_cpu(s->chunks32[i]);
+            break;
+        }
+
+        dprintf("chunk %#x - %#x = offset %#x -> %#x\n", i * s->chunk_len, (i+1) * s->chunk_len, s->offsets[i], offset);
+    }
+
+    return 0;
+
+fail:
+    fprintf(stderr, "DictZip: Error opening file: %s\n", err);
+    bdrv_delete(s->hd);
+    if (s->chunks)
+        g_free(s->chunks);
+    return -EINVAL;
+}
+
+/* Bottom half scheduled on a cache hit: copy the cached bytes, then complete */
+static void dictzip_cache_cb(void *opaque)
+{
+    DictZipAIOCB *req = opaque;
+
+    qemu_iovec_from_buf(req->qiov, 0, req->buf, req->len);
+    req->common.cb(req->common.opaque, 0);
+    qemu_bh_delete(req->bh);
+    qemu_aio_release(req);
+}
+
+/* Completion callback for the backing-file read: inflates the compressed data,
+ * completes the request and caches the result. Note: 'ret' is ignored. */
+static void dictzip_read_cb(void *opaque, int ret)
+{
+    DictZipAIOCB *acb = (DictZipAIOCB *)opaque;
+    struct BDRVDictZipState *s = acb->s;
+    uint8_t *buf;
+    DictCache *cache;
+    int r, i;
+
+    buf = g_malloc(acb->chunks_len);
+
+    /* try to find zlib stream for decoding */
+    do {
+        for (i = 0; i < Z_STREAM_COUNT; i++) {
+            if (!(s->stream_in_use & (1 << i))) {
+                s->stream_in_use |= (1 << i);
+                acb->zStream_id = i;
+                acb->zStream = &s->zStream[i];
+                break;
+            }
+        }
+    } while(!acb->zStream);
+
+    /* sure, we could handle more streams, but this callback should be single
+       threaded and when it's not, we really want to know! */
+    assert(i == 0);
+
+    /* uncompress the chunk */
+    acb->zStream->next_in   = acb->gzipped;
+    acb->zStream->avail_in  = acb->gz_len;
+    acb->zStream->next_out  = buf;
+    acb->zStream->avail_out = acb->chunks_len;
+
+    r = inflate( acb->zStream,  Z_PARTIAL_FLUSH );
+    if ( (r != Z_OK) && (r != Z_STREAM_END) )
+        fprintf(stderr, "Error inflating: [%d] %s\n", r, acb->zStream->msg);
+
+    if ( r == Z_STREAM_END )
+        inflateReset(acb->zStream);
+
+    dprintf("inflating [%d] left: %d | %d bytes\n", r, acb->zStream->avail_in, acb->zStream->avail_out);
+    s->stream_in_use &= ~(1 << acb->zStream_id);
+
+    /* notify the caller (status 0 is reported even if inflate failed above) */
+    qemu_iovec_from_buf(acb->qiov, 0, buf + acb->offset, acb->len);
+    acb->common.cb(acb->common.opaque, 0);
+
+    /* fill the cache, overwriting the slots in round-robin order */
+    cache = &s->cache[s->cache_index];
+    s->cache_index++;
+    if (s->cache_index == CACHE_COUNT)
+        s->cache_index = 0;
+
+    cache->len = 0;
+    if (cache->buf)
+        g_free(cache->buf);
+    cache->start = acb->gz_start;
+    cache->buf = buf;
+    cache->len = acb->chunks_len;
+
+    /* free occupied resources (qiov_gz also carries the iov and read buffer) */
+    g_free(acb->qiov_gz);
+    qemu_aio_release(acb);
+}
+
+static void dictzip_aio_cancel(BlockDriverAIOCB *blockacb)
+{   /* deliberately empty: nothing is done when a request is cancelled */
+}
+
+static const AIOCBInfo dictzip_aiocb_info = {
+    .aiocb_size         = sizeof(DictZipAIOCB), /* size of the per-request state */
+    .cancel             = dictzip_aio_cancel,   /* empty cancel hook */
+};
+
+/* Read entry point: serve from the cache if possible, else fetch and inflate */
+static BlockDriverAIOCB *dictzip_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVDictZipState *s = bs->opaque;
+    DictZipAIOCB *acb;
+    QEMUIOVector *qiov_gz;
+    struct iovec *iov;
+    uint8_t *buf;
+    size_t  start = sector_num * SECTOR_SIZE;   /* uncompressed byte offsets */
+    size_t  len = nb_sectors * SECTOR_SIZE;
+    size_t  end = start + len;                  /* exclusive */
+    size_t  gz_start;                           /* compressed byte offset */
+    size_t  gz_len;                             /* compressed byte count */
+    int64_t gz_sector_num;
+    int     gz_nb_sectors;
+    int     first_chunk, last_chunk;
+    int     first_offset;
+    int     i;
+
+    acb = qemu_aio_get(&dictzip_aiocb_info, bs, cb, opaque);
+    if (!acb)
+        return NULL;
+
+    /* Search Cache: a hit must cover the whole request; it completes via a BH */
+    for (i = 0; i < CACHE_COUNT; i++) {
+        if (!s->cache[i].len)
+            continue;
+
+        if ((start >= s->cache[i].start) &&
+            (end <= (s->cache[i].start + s->cache[i].len))) {
+            acb->buf = s->cache[i].buf + (start - s->cache[i].start);
+            acb->len = len;
+            acb->qiov = qiov;
+            acb->bh = qemu_bh_new(dictzip_cache_cb, acb);
+            qemu_bh_schedule(acb->bh);
+
+            return &acb->common;
+        }
+    }
+
+    /* No cache: decode the chunks covering [start, end).  NOTE(review): end is
+     * exclusive, so last_chunk overshoots by one on a chunk boundary - confirm */
+    first_chunk  = start / s->chunk_len;
+    first_offset = start - first_chunk * s->chunk_len;
+    last_chunk   = end / s->chunk_len;
+
+    gz_start = s->offsets[first_chunk];
+    gz_len = 0;
+    for (i = first_chunk; i <= last_chunk; i++) {
+        if (s->chunks32)
+            gz_len += le32_to_cpu(s->chunks32[i]);
+        else
+            gz_len += le16_to_cpu(s->chunks[i]);
+    }
+
+    gz_sector_num = gz_start / SECTOR_SIZE;
+    gz_nb_sectors = (gz_len / SECTOR_SIZE);
+
+    /* account for tail and heads */
+    while ((gz_start + gz_len) > ((gz_sector_num + gz_nb_sectors) * SECTOR_SIZE))
+        gz_nb_sectors++;
+    /* Allocate qiov, iov and buf in one chunk so we only need to free qiov.
+     * Layout: [QEMUIOVector][struct iovec][data]; buf starts past the iovec. */
+    qiov_gz = g_malloc0(sizeof(QEMUIOVector) + sizeof(struct iovec) +
+                           (gz_nb_sectors * SECTOR_SIZE));
+    iov = (struct iovec *)(((char *)qiov_gz) + sizeof(QEMUIOVector));
+    buf = ((uint8_t *)iov) + sizeof(struct iovec);
+
+    /* Kick off the read by the backing file, so we can start decompressing */
+    iov->iov_base = (void *)buf;
+    iov->iov_len = gz_nb_sectors * SECTOR_SIZE;
+    qemu_iovec_init_external(qiov_gz, iov, 1);
+
+    dprintf("read %d - %d => %d - %d\n", start, end, gz_start, gz_start + gz_len);
+
+    acb->s = s;
+    acb->qiov = qiov;
+    acb->qiov_gz = qiov_gz;
+    acb->start = start;
+    acb->len = len;
+    acb->gzipped = buf + (gz_start % SECTOR_SIZE);
+    acb->gz_len = gz_len;
+    acb->gz_start = first_chunk * s->chunk_len;
+    acb->offset = first_offset;
+    acb->chunks_len = (last_chunk - first_chunk + 1) * s->chunk_len;
+
+    return bdrv_aio_readv(s->hd, gz_sector_num, qiov_gz, gz_nb_sectors,
+                          dictzip_read_cb, acb);
+}
+
+static void dictzip_close(BlockDriverState *bs)
+{
+    BDRVDictZipState *state = bs->opaque;
+    DictCache *slot;
+    int n;
+
+    /* Release the decompressed buffer of every cache slot that holds data */
+    for (slot = state->cache; slot < state->cache + CACHE_COUNT; slot++) {
+        if (slot->len) {
+            g_free(slot->buf);
+        }
+    }
+
+    /* Shut down all zlib decoders */
+    for (n = 0; n < Z_STREAM_COUNT; n++) {
+        inflateEnd(&state->zStream[n]);
+    }
+
+    /* g_free() accepts NULL, so tables that were never set need no guard */
+    g_free(state->chunks);
+    g_free(state->offsets);
+
+    dprintf("Close\n");
+}
+
+static int64_t dictzip_getlength(BlockDriverState *bs)
+{   /* Reports the uncompressed length stored in the driver state */
+    BDRVDictZipState *state = bs->opaque;
+    dprintf("getlength -> %ld\n", state->file_len);
+    return state->file_len;
+}
+
+static BlockDriver bdrv_dictzip = {
+    .format_name     = "dzip",
+    .protocol_name   = "dzip",            /* open strips "dzip:" / "dzip://" */
+
+    .instance_size   = sizeof(BDRVDictZipState),
+    .bdrv_file_open  = dictzip_open,
+    .bdrv_close      = dictzip_close,
+    .bdrv_getlength  = dictzip_getlength,
+    .bdrv_probe      = dictzip_probe,
+
+    .bdrv_aio_readv  = dictzip_aio_readv, /* no write callback is provided */
+};
+
+/* Registers the dzip driver; hooked up through block_init() right below */
+static void dictzip_block_init(void)
+{
+    bdrv_register(&bdrv_dictzip);
+}
+block_init(dictzip_block_init);
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -27,6 +27,14 @@
 #include "qemu/module.h"
 #include <zlib.h>

+enum {
+    /* Limit chunk sizes to prevent unreasonable amounts of memory being used
+     * or truncating when converting to 32-bit types
+     */
+    DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */
+    DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512,
+};
+
 typedef struct BDRVDMGState {
    CoMutex lock;
    /* each chunk contains a certain number of sectors,
@@ -85,12 +93,43 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result)
    return 0;
 }

+/* Grow the running maxima used to size the compressed and uncompressed chunk
+ * buffers so that they also accommodate chunk number 'chunk'.
+ */
+static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk,
+                                  uint32_t *max_compressed_size,
+                                  uint32_t *max_sectors_per_chunk)
+{
+    uint32_t in_bytes = 0;     /* compressed bytes this chunk occupies */
+    uint32_t out_sectors = 0;  /* sectors this chunk expands to */
+
+    if (s->types[chunk] == 0x80000005) {
+        /* zlib compressed */
+        in_bytes = s->lengths[chunk];
+        out_sectors = s->sectorcounts[chunk];
+    } else if (s->types[chunk] == 1) {
+        /* copy: stored length rounded up to whole sectors */
+        out_sectors = (s->lengths[chunk] + 511) / 512;
+    } else if (s->types[chunk] == 2) {
+        /* zero */
+        out_sectors = s->sectorcounts[chunk];
+    }
+
+    /* any other chunk type leaves both maxima untouched */
+    if (in_bytes > *max_compressed_size) {
+        *max_compressed_size = in_bytes;
+    }
+    if (out_sectors > *max_sectors_per_chunk) {
+        *max_sectors_per_chunk = out_sectors;
+    }
+}
+
 static int dmg_open(BlockDriverState *bs, int flags)
 {
    BDRVDMGState *s = bs->opaque;
-    uint64_t info_begin,info_end,last_in_offset,last_out_offset;
+    uint64_t info_begin, info_end, last_in_offset, last_out_offset;
    uint32_t count, tmp;
-    uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i;
+    uint32_t max_compressed_size = 1, max_sectors_per_chunk = 1, i;
    int64_t offset;
    int ret;

@@ -152,37 +191,40 @@ static int dmg_open(BlockDriverState *bs, int flags)
            goto fail;
        }

-	if (type == 0x6d697368 && count >= 244) {
-	    int new_size, chunk_count;
+        if (type == 0x6d697368 && count >= 244) {
+            size_t new_size;
+            uint32_t chunk_count;

            offset += 4;
            offset += 200;

-	    chunk_count = (count-204)/40;
-	    new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
-	    s->types = g_realloc(s->types, new_size/2);
-	    s->offsets = g_realloc(s->offsets, new_size);
-	    s->lengths = g_realloc(s->lengths, new_size);
-	    s->sectors = g_realloc(s->sectors, new_size);
-	    s->sectorcounts = g_realloc(s->sectorcounts, new_size);
+            chunk_count = (count - 204) / 40;
+            new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
+            s->types = g_realloc(s->types, new_size / 2);
+            s->offsets = g_realloc(s->offsets, new_size);
+            s->lengths = g_realloc(s->lengths, new_size);
+            s->sectors = g_realloc(s->sectors, new_size);
+            s->sectorcounts = g_realloc(s->sectorcounts, new_size);

            for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
                ret = read_uint32(bs, offset, &s->types[i]);
                if (ret < 0) {
                    goto fail;
                }
-		offset += 4;
-		if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) {
-		    if(s->types[i]==0xffffffff) {
-			last_in_offset = s->offsets[i-1]+s->lengths[i-1];
-			last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1];
-		    }
-		    chunk_count--;
-		    i--;
-		    offset += 36;
-		    continue;
-		}
-		offset += 4;
+                offset += 4;
+                if (s->types[i] != 0x80000005 && s->types[i] != 1 &&
+                    s->types[i] != 2) {
+                    if (s->types[i] == 0xffffffff && i > 0) {
+                        last_in_offset = s->offsets[i - 1] + s->lengths[i - 1];
+                        last_out_offset = s->sectors[i - 1] +
+                                          s->sectorcounts[i - 1];
+                    }
+                    chunk_count--;
+                    i--;
+                    offset += 36;
+                    continue;
+                }
+                offset += 4;

                ret = read_uint64(bs, offset, &s->sectors[i]);
                if (ret < 0) {
@@ -197,6 +239,14 @@ static int dmg_open(BlockDriverState *bs, int flags)
                }
                offset += 8;

+                if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
+                    error_report("sector count %" PRIu64 " for chunk %u is "
+                                 "larger than max (%u)",
+                                 s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX);
+                    ret = -EINVAL;
+                    goto fail;
+                }
+
                ret = read_uint64(bs, offset, &s->offsets[i]);
                if (ret < 0) {
                    goto fail;
@@ -210,19 +260,25 @@ static int dmg_open(BlockDriverState *bs, int flags)
                }
                offset += 8;

-		if(s->lengths[i]>max_compressed_size)
-		    max_compressed_size = s->lengths[i];
-		if(s->sectorcounts[i]>max_sectors_per_chunk)
-		    max_sectors_per_chunk = s->sectorcounts[i];
-	    }
-	    s->n_chunks+=chunk_count;
-	}
+                if (s->lengths[i] > DMG_LENGTHS_MAX) {
+                    error_report("length %" PRIu64 " for chunk %u is larger "
+                                 "than max (%u)",
+                                 s->lengths[i], i, DMG_LENGTHS_MAX);
+                    ret = -EINVAL;
+                    goto fail;
+                }
+
+                update_max_chunk_size(s, i, &max_compressed_size,
+                                      &max_sectors_per_chunk);
+            }
+            s->n_chunks += chunk_count;
+        }
    }

    /* initialize zlib engine */
-    s->compressed_chunk = g_malloc(max_compressed_size+1);
-    s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk);
-    if(inflateInit(&s->zstream) != Z_OK) {
+    s->compressed_chunk = g_malloc(max_compressed_size + 1);
+    s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk);
+    if (inflateInit(&s->zstream) != Z_OK) {
        ret = -EINVAL;
        goto fail;
    }
@@ -244,83 +300,82 @@ fail:
 }

 static inline int is_sector_in_chunk(BDRVDMGState* s,
-		uint32_t chunk_num,int sector_num)
+                uint32_t chunk_num, uint64_t sector_num)
 {
-    if(chunk_num>=s->n_chunks || s->sectors[chunk_num]>sector_num ||
-	    s->sectors[chunk_num]+s->sectorcounts[chunk_num]<=sector_num)
-	return 0;
-    else
-	return -1;
+    if (chunk_num >= s->n_chunks || s->sectors[chunk_num] > sector_num ||
+            s->sectors[chunk_num] + s->sectorcounts[chunk_num] <= sector_num) {
+        return 0;
+    } else {
+        return -1;
+    }
 }

-static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num)
+static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num)
 {
    /* binary search */
-    uint32_t chunk1=0,chunk2=s->n_chunks,chunk3;
-    while(chunk1!=chunk2) {
-	chunk3 = (chunk1+chunk2)/2;
-	if(s->sectors[chunk3]>sector_num)
-	    chunk2 = chunk3;
-	else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num)
-	    return chunk3;
-	else
-	    chunk1 = chunk3;
+    uint32_t chunk1 = 0, chunk2 = s->n_chunks, chunk3;
+    while (chunk1 != chunk2) {
+        chunk3 = (chunk1 + chunk2) / 2;
+        if (s->sectors[chunk3] > sector_num) {
+            chunk2 = chunk3;
+        } else if (s->sectors[chunk3] + s->sectorcounts[chunk3] > sector_num) {
+            return chunk3;
+        } else {
+            chunk1 = chunk3;
+        }
    }
    return s->n_chunks; /* error */
 }

-static inline int dmg_read_chunk(BlockDriverState *bs, int sector_num)
+static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
 {
    BDRVDMGState *s = bs->opaque;

-    if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) {
-	int ret;
-	uint32_t chunk = search_chunk(s,sector_num);
+    if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) {
+        int ret;
+        uint32_t chunk = search_chunk(s, sector_num);

-	if(chunk>=s->n_chunks)
-	    return -1;
+        if (chunk >= s->n_chunks) {
+            return -1;
+        }

-	s->current_chunk = s->n_chunks;
-	switch(s->types[chunk]) {
-	case 0x80000005: { /* zlib compressed */
-	    int i;
+        s->current_chunk = s->n_chunks;
+        switch (s->types[chunk]) {
+        case 0x80000005: { /* zlib compressed */
+            /* we need to buffer, because only the chunk as whole can be
+             * inflated. */
+            ret = bdrv_pread(bs->file, s->offsets[chunk],
+                             s->compressed_chunk, s->lengths[chunk]);
+            if (ret != s->lengths[chunk]) {
+                return -1;
+            }

-	    /* we need to buffer, because only the chunk as whole can be
-	     * inflated. */
-	    i=0;
-	    do {
-                ret = bdrv_pread(bs->file, s->offsets[chunk] + i,
-                                 s->compressed_chunk+i, s->lengths[chunk]-i);
-		if(ret<0 && errno==EINTR)
-		    ret=0;
-		i+=ret;
-	    } while(ret>=0 && ret+i<s->lengths[chunk]);
-
-	    if (ret != s->lengths[chunk])
-		return -1;
-
-	    s->zstream.next_in = s->compressed_chunk;
-	    s->zstream.avail_in = s->lengths[chunk];
-	    s->zstream.next_out = s->uncompressed_chunk;
-	    s->zstream.avail_out = 512*s->sectorcounts[chunk];
-	    ret = inflateReset(&s->zstream);
-	    if(ret != Z_OK)
-		return -1;
-	    ret = inflate(&s->zstream, Z_FINISH);
-	    if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk])
-		return -1;
-	    break; }
-	case 1: /* copy */
-	    ret = bdrv_pread(bs->file, s->offsets[chunk],
+            s->zstream.next_in = s->compressed_chunk;
+            s->zstream.avail_in = s->lengths[chunk];
+            s->zstream.next_out = s->uncompressed_chunk;
+            s->zstream.avail_out = 512 * s->sectorcounts[chunk];
+            ret = inflateReset(&s->zstream);
+            if (ret != Z_OK) {
+                return -1;
+            }
+            ret = inflate(&s->zstream, Z_FINISH);
+            if (ret != Z_STREAM_END ||
+                s->zstream.total_out != 512 * s->sectorcounts[chunk]) {
+                return -1;
+            }
+            break; }
+        case 1: /* copy */
+            ret = bdrv_pread(bs->file, s->offsets[chunk],
                             s->uncompressed_chunk, s->lengths[chunk]);
-	    if (ret != s->lengths[chunk])
-		return -1;
-	    break;
-	case 2: /* zero */
-	    memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]);
-	    break;
-	}
-	s->current_chunk = chunk;
+            if (ret != s->lengths[chunk]) {
+                return -1;
+            }
+            break;
+        case 2: /* zero */
+            memset(s->uncompressed_chunk, 0, 512 * s->sectorcounts[chunk]);
+            break;
+        }
+        s->current_chunk = chunk;
    }
    return 0;
 }
@@ -331,12 +386,14 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num,
    BDRVDMGState *s = bs->opaque;
    int i;

-    for(i=0;i<nb_sectors;i++) {
-	uint32_t sector_offset_in_chunk;
-	if(dmg_read_chunk(bs, sector_num+i) != 0)
-	    return -1;
-	sector_offset_in_chunk = sector_num+i-s->sectors[s->current_chunk];
-	memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512);
+    for (i = 0; i < nb_sectors; i++) {
+        uint32_t sector_offset_in_chunk;
+        if (dmg_read_chunk(bs, sector_num + i) != 0) {
+            return -1;
+        }
+        sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk];
+        memcpy(buf + i * 512,
+               s->uncompressed_chunk + sector_offset_in_chunk * 512, 512);
    }
    return 0;
 }
@@ -368,12 +425,12 @@ static void dmg_close(BlockDriverState *bs)
 }

 static BlockDriver bdrv_dmg = {
-    .format_name	= "dmg",
-    .instance_size	= sizeof(BDRVDMGState),
-    .bdrv_probe		= dmg_probe,
-    .bdrv_open		= dmg_open,
-    .bdrv_read          = dmg_co_read,
-    .bdrv_close		= dmg_close,
+    .format_name    = "dmg",
+    .instance_size  = sizeof(BDRVDMGState),
+    .bdrv_probe     = dmg_probe,
+    .bdrv_open      = dmg_open,
+    .bdrv_read      = dmg_co_read,
+    .bdrv_close     = dmg_close,
 };

 static void bdrv_dmg_init(void)
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -602,6 +602,13 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    acb->buf         = NULL;
    acb->ioh         = buf;

+    if (acb->ioh->cmd_len > SCSI_CDB_MAX_SIZE) {
+        error_report("iSCSI: ioctl error CDB exceeds max size (%d > %d)",
+                     acb->ioh->cmd_len, SCSI_CDB_MAX_SIZE);
+        qemu_aio_unref(acb);
+        return NULL;
+    }
+
    acb->task = malloc(sizeof(struct scsi_task));
    if (acb->task == NULL) {
        error_report("iSCSI: Failed to allocate task for scsi command. %s",
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -274,7 +274,7 @@ static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
        ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
                            offset, request->len);
        if (ret != request->len) {
-            return -EIO;
+            rc = -EIO;
        }
    }
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
@@ -350,7 +350,7 @@ static int nbd_establish_connection(BlockDriverState *bs)

    /* Now that we're connected, set the socket to be non-blocking and
     * kick the reply mechanism.  */
-    socket_set_nonblock(sock);
+    qemu_set_nonblock(sock);
    qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL,
                            nbd_have_request, s);

--- a/block/parallels.c
+++ b/block/parallels.c
@@ -49,9 +49,9 @@ typedef struct BDRVParallelsState {
    CoMutex lock;

    uint32_t *catalog_bitmap;
-    int catalog_size;
+    unsigned int catalog_size;

-    int tracks;
+    unsigned int tracks;
 } BDRVParallelsState;

 static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename)
@@ -91,8 +91,19 @@ static int parallels_open(BlockDriverState *bs, int flags)
    bs->total_sectors = le32_to_cpu(ph.nb_sectors);

    s->tracks = le32_to_cpu(ph.tracks);
+    if (s->tracks == 0) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "Invalid image: Zero sectors per track");
+        ret = -EINVAL;
+        goto fail;
+    }

    s->catalog_size = le32_to_cpu(ph.catalog_entries);
+    if (s->catalog_size > INT_MAX / 4) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Catalog too large");
+        ret = -EFBIG;
+        goto fail;
+    }
    s->catalog_bitmap = g_malloc(s->catalog_size * 4);

    ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4);
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -60,7 +60,7 @@ typedef struct BDRVQcowState {
    int cluster_sectors;
    int l2_bits;
    int l2_size;
-    int l1_size;
+    unsigned int l1_size;
    uint64_t cluster_offset_mask;
    uint64_t l1_table_offset;
    uint64_t *l1_table;
@@ -124,10 +124,28 @@ static int qcow_open(BlockDriverState *bs, int flags)
        goto fail;
    }

-    if (header.size <= 1 || header.cluster_bits < 9) {
+    if (header.size <= 1) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "Image size is too small (must be at least 2 bytes)");
        ret = -EINVAL;
        goto fail;
    }
+    if (header.cluster_bits < 9 || header.cluster_bits > 16) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "Cluster size must be between 512 and 64k");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* l2_bits specifies number of entries; storing a uint64_t in each entry,
+     * so bytes = num_entries << 3. */
+    if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "L2 table size must be between 512 and 64k");
+        ret = -EINVAL;
+        goto fail;
+    }
+
    if (header.crypt_method > QCOW_CRYPT_AES) {
        ret = -EINVAL;
        goto fail;
@@ -146,7 +164,19 @@ static int qcow_open(BlockDriverState *bs, int flags)

    /* read the level 1 table */
    shift = s->cluster_bits + s->l2_bits;
-    s->l1_size = (header.size + (1LL << shift) - 1) >> shift;
+    if (header.size > UINT64_MAX - (1LL << shift)) {
+        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Image too large");
+        ret = -EINVAL;
+        goto fail;
+    } else {
+        uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift;
+        if (l1_size > INT_MAX / sizeof(uint64_t)) {
+            qerror_report(ERROR_CLASS_GENERIC_ERROR, "Image too large");
+            ret = -EINVAL;
+            goto fail;
+        }
+        s->l1_size = l1_size;
+    }

    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -29,12 +29,13 @@
 #include "block/qcow2.h"
 #include "trace.h"

-int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+                        bool exact_size)
 {
    BDRVQcowState *s = bs->opaque;
-    int new_l1_size, new_l1_size2, ret, i;
+    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
-    int64_t new_l1_table_offset;
+    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size)
@@ -53,8 +54,13 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
        }
    }

+    if (new_l1_size > INT_MAX / sizeof(uint64_t)) {
+        return -EFBIG;
+    }
+
 #ifdef DEBUG_ALLOC2
-    fprintf(stderr, "grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
+    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
+            s->l1_size, new_l1_size);
 #endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
@@ -324,15 +330,6 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
    struct iovec iov;
    int n, ret;

-    /*
-     * If this is the last cluster and it is only partially used, we must only
-     * copy until the end of the image, or bdrv_check_request will fail for the
-     * bdrv_read/write calls below.
-     */
-    if (start_sect + n_end > bs->total_sectors) {
-        n_end = bs->total_sectors - start_sect;
-    }
-
    n = n_end - n_start;
    if (n <= 0) {
        return 0;
@@ -391,8 +388,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
    int *num, uint64_t *cluster_offset)
 {
    BDRVQcowState *s = bs->opaque;
-    unsigned int l1_index, l2_index;
-    uint64_t l2_offset, *l2_table;
+    unsigned int l2_index;
+    uint64_t l1_index, l2_offset, *l2_table;
    int l1_bits, c;
    unsigned int index_in_cluster, nb_clusters;
    uint64_t nb_available, nb_needed;
@@ -454,6 +451,9 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
        break;
    case QCOW2_CLUSTER_ZERO:
+        if (s->qcow_version < 3) {
+            return -EIO;
+        }
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0,
                QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
@@ -504,8 +504,8 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             int *new_l2_index)
 {
    BDRVQcowState *s = bs->opaque;
-    unsigned int l1_index, l2_index;
-    uint64_t l2_offset;
+    unsigned int l2_index;
+    uint64_t l1_index, l2_offset;
    uint64_t *l2_table = NULL;
    int ret;

@@ -519,6 +519,7 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
        }
    }

+    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;

    /* seek the l2 table of the given l2 offset */
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -26,7 +26,7 @@
 #include "block/block_int.h"
 #include "block/qcow2.h"

-static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
+static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend);
@@ -38,8 +38,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 int qcow2_refcount_init(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
-    int ret, refcount_table_size2, i;
+    unsigned int refcount_table_size2, i;
+    int ret;

+    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_malloc(refcount_table_size2);
    if (s->refcount_table_size > 0) {
@@ -85,7 +87,7 @@ static int load_refcount_block(BlockDriverState *bs,
 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
 {
    BDRVQcowState *s = bs->opaque;
-    int refcount_table_index, block_index;
+    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    uint16_t *refcount_block;
@@ -189,10 +191,11 @@ static int alloc_refcount_block(BlockDriverState *bs,
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
-     *   and doing the initial refcount increase. This means that some clusters
-     *   have already been allocated by the caller, but their refcount isn't
-     *   accurate yet. free_cluster_index tells us where this allocation ends
-     *   as long as we don't overwrite it by freeing clusters.
+     *   and potentially doing an initial refcount increase. This means that
+     *   some clusters have already been allocated by the caller, but their
+     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
+     *   need to return -EAGAIN to signal the caller that it needs to restart
+     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
@@ -201,7 +204,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
-    qcow2_cache_flush(bs, s->l2_table_cache);
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    if (ret < 0) {
+        return ret;
+    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
@@ -237,7 +243,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
            goto fail_block;
        }

-        bdrv_flush(bs->file);
+        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+        if (ret < 0) {
+            goto fail_block;
+        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
@@ -270,7 +279,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
        }

        s->refcount_table[refcount_table_index] = new_block;
-        return 0;
+
+        /* The new refcount block may be where the caller intended to put its
+         * data, so let it restart the search. */
+        return -EAGAIN;
    }

    ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
@@ -293,8 +305,11 @@ static int alloc_refcount_block(BlockDriverState *bs,

    /* Calculate the number of refcount blocks needed so far */
    uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
-    uint64_t blocks_used = (s->free_cluster_index +
-        refcount_block_clusters - 1) / refcount_block_clusters;
+    uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters);
+
+    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
+        return -EFBIG;
+    }

    /* And now we need at least one block more for the new metadata */
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
@@ -327,8 +342,6 @@ static int alloc_refcount_block(BlockDriverState *bs,
    uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
    uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));

-    assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
-
    /* Fill the new refcount table */
    memcpy(new_table, s->refcount_table,
        s->refcount_table_size * sizeof(uint64_t));
@@ -391,17 +404,18 @@ static int alloc_refcount_block(BlockDriverState *bs,
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;

-    /* Free old table. Remember, we must not change free_cluster_index */
-    uint64_t old_free_cluster_index = s->free_cluster_index;
+    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
-    s->free_cluster_index = old_free_cluster_index;

    ret = load_refcount_block(bs, new_block, (void**) refcount_block);
    if (ret < 0) {
        return ret;
    }

-    return 0;
+    /* If we were trying to do the initial refcount update for some cluster
+     * allocation, we might have used the same clusters to store newly
+     * allocated metadata. Make the caller search some new space. */
+    return -EAGAIN;

 fail_table:
    g_free(new_table);
@@ -539,15 +553,16 @@ static int update_cluster_refcount(BlockDriverState *bs,


 /* return < 0 if error */
-static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
+static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
 {
    BDRVQcowState *s = bs->opaque;
-    int i, nb_clusters, refcount;
+    uint64_t i, nb_clusters;
+    int refcount;

    nb_clusters = size_to_clusters(s, size);
 retry:
    for(i = 0; i < nb_clusters; i++) {
-        int64_t next_cluster_index = s->free_cluster_index++;
+        uint64_t next_cluster_index = s->free_cluster_index++;
        refcount = get_refcount(bs, next_cluster_index);

        if (refcount < 0) {
@@ -564,18 +579,21 @@ retry:
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
 }

-int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
 {
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
-    offset = alloc_clusters_noref(bs, size);
-    if (offset < 0) {
-        return offset;
-    }
+    do {
+        offset = alloc_clusters_noref(bs, size);
+        if (offset < 0) {
+            return offset;
+        }
+
+        ret = update_refcount(bs, offset, size, 1);
+    } while (ret == -EAGAIN);

-    ret = update_refcount(bs, offset, size, 1);
    if (ret < 0) {
        return ret;
    }
@@ -588,32 +606,29 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 {
    BDRVQcowState *s = bs->opaque;
    uint64_t cluster_index;
-    uint64_t old_free_cluster_index;
    int i, refcount, ret;

-    /* Check how many clusters there are free */
-    cluster_index = offset >> s->cluster_bits;
-    for(i = 0; i < nb_clusters; i++) {
-        refcount = get_refcount(bs, cluster_index++);
+    do {
+        /* Check how many clusters there are free */
+        cluster_index = offset >> s->cluster_bits;
+        for(i = 0; i < nb_clusters; i++) {
+            refcount = get_refcount(bs, cluster_index++);

-        if (refcount < 0) {
-            return refcount;
-        } else if (refcount != 0) {
-            break;
+            if (refcount < 0) {
+                return refcount;
+            } else if (refcount != 0) {
+                break;
+            }
        }
-    }

-    /* And then allocate them */
-    old_free_cluster_index = s->free_cluster_index;
-    s->free_cluster_index = cluster_index + i;
+        /* And then allocate them */
+        ret = update_refcount(bs, offset, i << s->cluster_bits, 1);
+    } while (ret == -EAGAIN);

-    ret = update_refcount(bs, offset, i << s->cluster_bits, 1);
    if (ret < 0) {
        return ret;
    }

-    s->free_cluster_index = old_free_cluster_index;
-
    return i;
 }

@@ -884,8 +899,7 @@ static void inc_refcounts(BlockDriverState *bs,
                          int64_t offset, int64_t size)
 {
    BDRVQcowState *s = bs->opaque;
-    int64_t start, last, cluster_offset;
-    int k;
+    uint64_t start, last, cluster_offset, k;

    if (size <= 0)
        return;
@@ -895,11 +909,7 @@ static void inc_refcounts(BlockDriverState *bs,
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
-        if (k < 0) {
-            fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
-                cluster_offset);
-            res->corruptions++;
-        } else if (k >= refcount_table_size) {
+        if (k >= refcount_table_size) {
            fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
                "the end of the image file, can't properly check refcounts.\n",
                cluster_offset);
@@ -1112,14 +1122,19 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix)
 {
    BDRVQcowState *s = bs->opaque;
-    int64_t size, i;
-    int nb_clusters, refcount1, refcount2;
+    int64_t size, i, nb_clusters;
+    int refcount1, refcount2;
    QCowSnapshot *sn;
    uint16_t *refcount_table;
    int ret;

    size = bdrv_getlength(bs->file);
    nb_clusters = size_to_clusters(s, size);
+    if (nb_clusters > INT_MAX) {
+        res->check_errors++;
+        return -EFBIG;
+    }
+
    refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));

    /* header */
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -26,31 +26,6 @@
 #include "block/block_int.h"
 #include "block/qcow2.h"

-typedef struct QEMU_PACKED QCowSnapshotHeader {
-    /* header is 8 byte aligned */
-    uint64_t l1_table_offset;
-
-    uint32_t l1_size;
-    uint16_t id_str_size;
-    uint16_t name_size;
-
-    uint32_t date_sec;
-    uint32_t date_nsec;
-
-    uint64_t vm_clock_nsec;
-
-    uint32_t vm_state_size;
-    uint32_t extra_data_size; /* for extension */
-    /* extra data follows */
-    /* id_str follows */
-    /* name follows  */
-} QCowSnapshotHeader;
-
-typedef struct QEMU_PACKED QCowSnapshotExtraData {
-    uint64_t vm_state_size_large;
-    uint64_t disk_size;
-} QCowSnapshotExtraData;
-
 void qcow2_free_snapshots(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
@@ -141,8 +116,14 @@ int qcow2_read_snapshots(BlockDriverState *bs)
        }
        offset += name_size;
        sn->name[name_size] = '\0';
+
+        if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) {
+            ret = -EFBIG;
+            goto fail;
+        }
    }

+    assert(offset - s->snapshots_offset <= INT_MAX);
    s->snapshots_size = offset - s->snapshots_offset;
    return 0;

@@ -163,7 +144,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
        uint32_t nb_snapshots;
        uint64_t snapshots_offset;
    } QEMU_PACKED header_data;
-    int64_t offset, snapshots_offset;
+    int64_t offset, snapshots_offset = 0;
    int ret;

    /* compute the size of the snapshots */
@@ -175,16 +156,26 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
        offset += sizeof(extra);
        offset += strlen(sn->id_str);
        offset += strlen(sn->name);
+
+        if (offset > QCOW_MAX_SNAPSHOTS_SIZE) {
+            ret = -EFBIG;
+            goto fail;
+        }
    }
+
+    assert(offset <= INT_MAX);
    snapshots_size = offset;

    /* Allocate space for the new snapshot list */
    snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
-    bdrv_flush(bs->file);
    offset = snapshots_offset;
    if (offset < 0) {
        return offset;
    }
+    ret = bdrv_flush(bs);
+    if (ret < 0) {
+        return ret;
+    }

    /* Write all snapshots to the new list */
    for(i = 0; i < s->nb_snapshots; i++) {
@@ -322,6 +313,10 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    uint64_t *l1_table = NULL;
    int64_t l1_table_offset;

+    if (s->nb_snapshots >= QCOW_MAX_SNAPSHOTS) {
+        return -EFBIG;
+    }
+
    memset(sn, 0, sizeof(*sn));

    /* Generate an ID if it wasn't passed */
@@ -636,7 +631,11 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name)
    sn = &s->snapshots[snapshot_index];

    /* Allocate and read in the snapshot's L1 table */
-    new_l1_bytes = s->l1_size * sizeof(uint64_t);
+    if (sn->l1_size > QCOW_MAX_L1_SIZE) {
+        error_report("Snapshot L1 table too large");
+        return -EFBIG;
+    }
+    new_l1_bytes = sn->l1_size * sizeof(uint64_t);
    new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));

    ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -285,12 +285,40 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
    return ret;
 }

+static int validate_table_offset(BlockDriverState *bs, uint64_t offset,
+                                 uint64_t entries, size_t entry_len)
+{
+    /* Sanity-check a header-supplied table of 'entries' elements at 'offset':
+     * returns 0 if acceptable, -EINVAL otherwise. Limits are INT64_MAX, not
+     * UINT64_MAX, since the values are later handed to int64_t interfaces. */
+    const BDRVQcowState *state = bs->opaque;
+    const uint64_t limit = INT64_MAX;
+    const uint64_t align_mask = state->cluster_size - 1;
+    uint64_t table_bytes;
+
+    /* entries * entry_len must not exceed the limit */
+    if (entries > limit / entry_len) {
+        return -EINVAL;
+    }
+    table_bytes = entries * entry_len;
+
+    /* the table's end (offset + size) must not exceed the limit either */
+    if (offset > limit - table_bytes) {
+        return -EINVAL;
+    }
+
+    /* a table has to start on a cluster boundary */
+    return (offset & align_mask) ? -EINVAL : 0;
+}
+
 static int qcow2_open(BlockDriverState *bs, int flags)
 {
    BDRVQcowState *s = bs->opaque;
-    int len, i, ret = 0;
+    unsigned int len, i;
+    int ret = 0;
    QCowHeader header;
    uint64_t ext_end;
+    uint64_t l1_vm_state_index;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
@@ -322,6 +350,19 @@ static int qcow2_open(BlockDriverState *bs, int flags)

    s->qcow_version = header.version;

+    /* Initialise cluster size */
+    if (header.cluster_bits < MIN_CLUSTER_BITS ||
+        header.cluster_bits > MAX_CLUSTER_BITS) {
+        report_unsupported(bs, "Unsupported cluster size: 2^%i",
+                           header.cluster_bits);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    s->cluster_bits = header.cluster_bits;
+    s->cluster_size = 1 << s->cluster_bits;
+    s->cluster_sectors = 1 << (s->cluster_bits - 9);
+
    /* Initialise version 3 header fields */
    if (header.version == 2) {
        header.incompatible_features    = 0;
@@ -335,6 +376,18 @@ static int qcow2_open(BlockDriverState *bs, int flags)
        be64_to_cpus(&header.autoclear_features);
        be32_to_cpus(&header.refcount_order);
        be32_to_cpus(&header.header_length);
+
+        if (header.header_length < 104) {
+            report_unsupported(bs, "qcow2 header too short");
+            ret = -EINVAL;
+            goto fail;
+        }
+    }
+
+    if (header.header_length > s->cluster_size) {
+        report_unsupported(bs, "qcow2 header exceeds cluster size");
+        ret = -EINVAL;
+        goto fail;
    }

    if (header.header_length > sizeof(header)) {
@@ -347,6 +400,12 @@ static int qcow2_open(BlockDriverState *bs, int flags)
        }
    }

+    if (header.backing_file_offset > s->cluster_size) {
+        report_unsupported(bs, "Invalid backing file offset");
+        ret = -EINVAL;
+        goto fail;
+    }
+
    if (header.backing_file_offset) {
        ext_end = header.backing_file_offset;
    } else {
@@ -377,11 +436,6 @@ static int qcow2_open(BlockDriverState *bs, int flags)
        goto fail;
    }

-    if (header.cluster_bits < MIN_CLUSTER_BITS ||
-        header.cluster_bits > MAX_CLUSTER_BITS) {
-        ret = -EINVAL;
-        goto fail;
-    }
    if (header.crypt_method > QCOW_CRYPT_AES) {
        ret = -EINVAL;
        goto fail;
@@ -390,32 +444,77 @@ static int qcow2_open(BlockDriverState *bs, int flags)
    if (s->crypt_method_header) {
        bs->encrypted = 1;
    }
-    s->cluster_bits = header.cluster_bits;
-    s->cluster_size = 1 << s->cluster_bits;
-    s->cluster_sectors = 1 << (s->cluster_bits - 9);
+
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
+
    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

-    s->snapshots_offset = header.snapshots_offset;
-    s->nb_snapshots = header.nb_snapshots;
+    if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) {
+        report_unsupported(bs, "Reference count table too large");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    ret = validate_table_offset(bs, s->refcount_table_offset,
+                                s->refcount_table_size, sizeof(uint64_t));
+    if (ret < 0) {
+        report_unsupported(bs, "Invalid reference count table offset");
+        goto fail;
+    }
+
+    /* Snapshot table offset/length */
+    if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) {
+        report_unsupported(bs, "Too many snapshots");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    ret = validate_table_offset(bs, header.snapshots_offset,
+                                header.nb_snapshots,
+                                sizeof(QCowSnapshotHeader));
+    if (ret < 0) {
+        report_unsupported(bs, "Invalid snapshot table offset");
+        goto fail;
+    }

    /* read the level 1 table */
+    if (header.l1_size > QCOW_MAX_L1_SIZE) {
+        report_unsupported(bs, "Active L1 table too large");
+        ret = -EFBIG;
+        goto fail;
+    }
    s->l1_size = header.l1_size;
-    s->l1_vm_state_index = size_to_l1(s, header.size);
+
+    l1_vm_state_index = size_to_l1(s, header.size);
+    if (l1_vm_state_index > INT_MAX) {
+        ret = -EFBIG;
+        goto fail;
+    }
+    s->l1_vm_state_index = l1_vm_state_index;
+
    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index) {
        ret = -EINVAL;
        goto fail;
    }
+
+    ret = validate_table_offset(bs, header.l1_table_offset,
+                                header.l1_size, sizeof(uint64_t));
+    if (ret < 0) {
+        report_unsupported(bs, "Invalid L1 table offset");
+        goto fail;
+    }
    s->l1_table_offset = header.l1_table_offset;
+
+
    if (s->l1_size > 0) {
        s->l1_table = g_malloc0(
            align_offset(s->l1_size * sizeof(uint64_t), 512));
@@ -456,8 +555,10 @@ static int qcow2_open(BlockDriverState *bs, int flags)
    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
-        if (len > 1023) {
-            len = 1023;
+        if (len > MIN(1023, s->cluster_size - header.backing_file_offset)) {
+            report_unsupported(bs, "Backing file name too long");
+            ret = -EINVAL;
+            goto fail;
        }
        ret = bdrv_pread(bs->file, header.backing_file_offset,
                         bs->backing_file, len);
@@ -467,6 +568,10 @@ static int qcow2_open(BlockDriverState *bs, int flags)
        bs->backing_file[len] = '\0';
    }

+    /* Internal snapshots */
+    s->snapshots_offset = header.snapshots_offset;
+    s->nb_snapshots = header.nb_snapshots;
+
    ret = qcow2_read_snapshots(bs);
    if (ret < 0) {
        goto fail;
@@ -584,7 +689,7 @@ static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
        *pnum = 0;
    }

-    return (cluster_offset != 0);
+    return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO);
 }

 /* handle reading after the end of the backing file */
@@ -665,10 +770,6 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
            break;

        case QCOW2_CLUSTER_ZERO:
-            if (s->qcow_version < 3) {
-                ret = -EIO;
-                goto fail;
-            }
            qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
            break;

@@ -1205,7 +1306,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
     */
    BlockDriverState* bs;
    QCowHeader header;
-    uint8_t* refcount_table;
+    uint64_t* refcount_table;
    int ret;

    ret = bdrv_create_file(filename, options);
@@ -1247,9 +1348,10 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        goto out;
    }

-    /* Write an empty refcount table */
-    refcount_table = g_malloc0(cluster_size);
-    ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
+    /* Write a refcount table with one refcount block */
+    refcount_table = g_malloc0(2 * cluster_size);
+    refcount_table[0] = cpu_to_be64(2 * cluster_size);
+    ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size);
    g_free(refcount_table);

    if (ret < 0) {
@@ -1271,7 +1373,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
        goto out;
    }

-    ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
+    ret = qcow2_alloc_clusters(bs, 3 * cluster_size);
    if (ret < 0) {
        goto out;

@@ -1433,7 +1535,8 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
 static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
 {
    BDRVQcowState *s = bs->opaque;
-    int ret, new_l1_size;
+    int64_t new_l1_size;
+    int ret;

    if (offset & 511) {
        error_report("The new size must be a multiple of 512");
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -38,6 +38,19 @@
 #define QCOW_CRYPT_AES  1

 #define QCOW_MAX_CRYPT_CLUSTERS 32
+#define QCOW_MAX_SNAPSHOTS 65536
+
+/* 8 MB refcount table is enough for 2 PB images at 64k cluster size
+ * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
+#define QCOW_MAX_REFTABLE_SIZE 0x800000
+
+/* 32 MB L1 table is enough for 2 PB images at 64k cluster size
+ * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
+#define QCOW_MAX_L1_SIZE 0x2000000
+
+/* Allow for an average of 1k per snapshot table entry, should be plenty of
+ * space for snapshot names and IDs */
+#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS)

 /* indicate that the refcount of the referenced cluster is exactly one. */
 #define QCOW_OFLAG_COPIED     (1LL << 63)
@@ -82,6 +95,32 @@ typedef struct QCowHeader {
    uint32_t header_length;
 } QCowHeader;

+typedef struct QEMU_PACKED QCowSnapshotHeader {
+    /* header is 8 byte aligned */
+    uint64_t l1_table_offset;
+
+    uint32_t l1_size;
+    uint16_t id_str_size;     /* presumably length of the id_str below */
+    uint16_t name_size;       /* presumably length of the name below */
+
+    uint32_t date_sec;
+    uint32_t date_nsec;
+
+    uint64_t vm_clock_nsec;
+
+    uint32_t vm_state_size;
+    uint32_t extra_data_size; /* for extension */
+    /* extra data follows */
+    /* id_str follows */
+    /* name follows */
+} QCowSnapshotHeader;
+
+typedef struct QEMU_PACKED QCowSnapshotExtraData {
+    uint64_t vm_state_size_large; /* NOTE(review): 64-bit vm_state_size? */
+    uint64_t disk_size;
+} QCowSnapshotExtraData;
+
+
 typedef struct QCowSnapshot {
    uint64_t l1_table_offset;
    uint32_t l1_size;
@@ -157,8 +196,8 @@ typedef struct BDRVQcowState {
    uint64_t *refcount_table;
    uint64_t refcount_table_offset;
    uint32_t refcount_table_size;
-    int64_t free_cluster_index;
-    int64_t free_byte_offset;
+    uint64_t free_cluster_index;
+    uint64_t free_byte_offset;

    CoMutex lock;

@@ -168,7 +207,7 @@ typedef struct BDRVQcowState {
    AES_KEY aes_decrypt_key;
    uint64_t snapshots_offset;
    int snapshots_size;
-    int nb_snapshots;
+    unsigned int nb_snapshots;
    QCowSnapshot *snapshots;

    int flags;
@@ -267,7 +306,7 @@ static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
    return (size + (s->cluster_size - 1)) >> s->cluster_bits;
 }

-static inline int size_to_l1(BDRVQcowState *s, int64_t size)
+static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size)
 {
    int shift = s->cluster_bits + s->l2_bits;
    return (size + (1ULL << shift) - 1) >> shift;
@@ -279,6 +318,11 @@ static inline int64_t align_offset(int64_t offset, int n)
    return offset;
 }

+static inline uint64_t qcow2_max_refcount_clusters(BDRVQcowState *s)
+{
+    return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits; /* bytes -> clusters */
+}
+
 static inline int qcow2_get_cluster_type(uint64_t l2_entry)
 {
    if (l2_entry & QCOW_OFLAG_COMPRESSED) {
@@ -311,7 +355,7 @@ int qcow2_update_header(BlockDriverState *bs);
 int qcow2_refcount_init(BlockDriverState *bs);
 void qcow2_refcount_close(BlockDriverState *bs);

-int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size);
 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
    int nb_clusters);
 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
@@ -327,7 +371,8 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix);

 /* qcow2-cluster.c functions */
-int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size);
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+                        bool exact_size);
 void qcow2_l2_cache_reset(BlockDriverState *bs);
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
 void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -142,6 +142,9 @@ typedef struct BDRVRawState {
    bool is_xfs : 1;
 #endif
    bool has_discard : 1;
+#ifdef CONFIG_FIEMAP
+    bool skip_fiemap;
+#endif
 } BDRVRawState;

 typedef struct BDRVRawReopenState {
@@ -1035,6 +1038,79 @@ static int raw_create(const char *filename, QEMUOptionParameter *options)
    return result;
 }

+static int try_fiemap(BlockDriverState *bs, off_t start, off_t *data,
+                      off_t *hole, int nb_sectors, int *pnum)
+{
+#ifdef CONFIG_FIEMAP
+    BDRVRawState *s = bs->opaque;
+    struct {
+        struct fiemap fm;
+        struct fiemap_extent fe;
+    } f;
+
+    if (s->skip_fiemap) {
+        return 1;
+    }
+
+    f.fm.fm_start = start;
+    f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
+    f.fm.fm_flags = FIEMAP_FLAG_SYNC;
+    f.fm.fm_extent_count = 1;
+    f.fm.fm_reserved = 0;
+    if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
+        /* Assume everything is allocated.  */
+        s->skip_fiemap = true;
+        return 1;
+    }
+
+    if (f.fm.fm_mapped_extents == 0) {
+        /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
+         * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
+         */
+        off_t length = lseek(s->fd, 0, SEEK_END);
+        *hole = f.fm.fm_start;
+        *data = MIN(f.fm.fm_start + f.fm.fm_length, length);
+    } else {
+        *data = f.fe.fe_logical;
+        *hole = f.fe.fe_logical + f.fe.fe_length;
+    }
+    return 0;
+#else
+    return 1;
+#endif
+}
+
+/* Locate data/hole around start via lseek(); returns 0 if found, else 1 */
+static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
+                             off_t *hole, int *pnum)
+{
+#if defined SEEK_HOLE && defined SEEK_DATA
+    BDRVRawState *s = bs->opaque;
+
+    *hole = lseek(s->fd, start, SEEK_HOLE);
+    if (*hole == -1) {
+        /* ENXIO would mean start is past EOF (a virtual hole there).
+         * NOTE(review): errno is positive, so this != -ENXIO never fires. */
+        assert(errno != -ENXIO);
+
+        return 1;
+    }
+    if (*hole > start) {
+        *data = start;
+    } else {
+        /* On a hole.  We need another syscall to find its end.  */
+        *data = lseek(s->fd, start, SEEK_DATA);
+        if (*data == -1) {
+            *data = lseek(s->fd, 0, SEEK_END); /* fall back to EOF */
+        }
+    }
+    return 0;
+#else
+    return 1;
+#endif
+}
+
+
 /*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
@@ -1054,7 +1130,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
                                            int64_t sector_num,
                                            int nb_sectors, int *pnum)
 {
-    off_t start, data, hole;
+    off_t start, data = 0, hole = 0;
    int ret;

    ret = fd_open(bs);
@@ -1064,65 +1140,15 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,

    start = sector_num * BDRV_SECTOR_SIZE;

-#ifdef CONFIG_FIEMAP
-
-    BDRVRawState *s = bs->opaque;
-    struct {
-        struct fiemap fm;
-        struct fiemap_extent fe;
-    } f;
-
-    f.fm.fm_start = start;
-    f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
-    f.fm.fm_flags = 0;
-    f.fm.fm_extent_count = 1;
-    f.fm.fm_reserved = 0;
-    if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
-        /* Assume everything is allocated.  */
-        *pnum = nb_sectors;
-        return 1;
-    }
-
-    if (f.fm.fm_mapped_extents == 0) {
-        /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
-         * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
-         */
-        off_t length = lseek(s->fd, 0, SEEK_END);
-        hole = f.fm.fm_start;
-        data = MIN(f.fm.fm_start + f.fm.fm_length, length);
-    } else {
-        data = f.fe.fe_logical;
-        hole = f.fe.fe_logical + f.fe.fe_length;
-    }
-
-#elif defined SEEK_HOLE && defined SEEK_DATA
-
-    BDRVRawState *s = bs->opaque;
-
-    hole = lseek(s->fd, start, SEEK_HOLE);
-    if (hole == -1) {
-        /* -ENXIO indicates that sector_num was past the end of the file.
-         * There is a virtual hole there.  */
-        assert(errno != -ENXIO);
-
-        /* Most likely EINVAL.  Assume everything is allocated.  */
-        *pnum = nb_sectors;
-        return 1;
-    }
-
-    if (hole > start) {
-        data = start;
-    } else {
-        /* On a hole.  We need another syscall to find its end.  */
-        data = lseek(s->fd, start, SEEK_DATA);
-        if (data == -1) {
-            data = lseek(s->fd, 0, SEEK_END);
+    ret = try_seek_hole(bs, start, &data, &hole, pnum);
+    if (ret) {
+        ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum);
+        if (ret) {
+            /* Assume everything is allocated. */
+            data = 0;
+            hole = start + nb_sectors * BDRV_SECTOR_SIZE;
        }
    }
-#else
-    *pnum = nb_sectors;
-    return 1;
-#endif

    if (data <= start) {
        /* On a data extent, compute sectors to the end of the extent.  */
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -63,7 +63,8 @@
 typedef enum {
    RBD_AIO_READ,
    RBD_AIO_WRITE,
-    RBD_AIO_DISCARD
+    RBD_AIO_DISCARD,
+    RBD_AIO_FLUSH
 } RBDAIOCmd;

 typedef struct RBDAIOCB {
@@ -379,8 +380,7 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)

    r = rcb->ret;

-    if (acb->cmd == RBD_AIO_WRITE ||
-        acb->cmd == RBD_AIO_DISCARD) {
+    if (acb->cmd != RBD_AIO_READ) {
        if (r < 0) {
            acb->ret = r;
            acb->error = 1;
@@ -658,6 +658,16 @@ static int rbd_aio_discard_wrapper(rbd_image_t image,
 #endif
 }

+static int rbd_aio_flush_wrapper(rbd_image_t image,
+                                 rbd_completion_t comp)
+{
+#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
+    return rbd_aio_flush(image, comp);
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
                                       int64_t sector_num,
                                       QEMUIOVector *qiov,
@@ -678,7 +688,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
    acb->cmd = cmd;
    acb->qiov = qiov;
-    if (cmd == RBD_AIO_DISCARD) {
+    if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
        acb->bounce = NULL;
    } else {
        acb->bounce = qemu_blockalign(bs, qiov->size);
@@ -722,6 +732,9 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
    case RBD_AIO_DISCARD:
        r = rbd_aio_discard_wrapper(s->image, off, size, c);
        break;
+    case RBD_AIO_FLUSH:
+        r = rbd_aio_flush_wrapper(s->image, c);
+        break;
    default:
        r = -EINVAL;
    }
@@ -761,6 +774,16 @@ static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
                         RBD_AIO_WRITE);
 }

+#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
+static BlockDriverAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
+                                            BlockDriverCompletionFunc *cb,
+                                            void *opaque)
+{
+    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
+}
+
+#else
+
 static int qemu_rbd_co_flush(BlockDriverState *bs)
 {
 #if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
@@ -771,6 +794,7 @@ static int qemu_rbd_co_flush(BlockDriverState *bs)
    return 0;
 #endif
 }
+#endif

 static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
@@ -948,7 +972,12 @@ static BlockDriver bdrv_rbd = {

    .bdrv_aio_readv         = qemu_rbd_aio_readv,
    .bdrv_aio_writev        = qemu_rbd_aio_writev,
+
+#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
+    .bdrv_aio_flush         = qemu_rbd_aio_flush,
+#else
    .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
+#endif

 #ifdef LIBRBD_SUPPORTS_DISCARD
    .bdrv_aio_discard       = qemu_rbd_aio_discard,
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -549,7 +549,7 @@ static coroutine_fn void do_co_req(void *opaque)
    co = qemu_coroutine_self();
    qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co);

-    socket_set_block(sockfd);
+    qemu_set_block(sockfd);
    ret = send_co_req(sockfd, hdr, data, wlen);
    if (ret < 0) {
        goto out;
@@ -579,7 +579,7 @@ static coroutine_fn void do_co_req(void *opaque)
    ret = 0;
 out:
    qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL);
-    socket_set_nonblock(sockfd);
+    qemu_set_nonblock(sockfd);

    srco->ret = ret;
    srco->finished = true;
@@ -812,7 +812,7 @@ static int get_sheep_fd(BDRVSheepdogState *s)
        return fd;
    }

-    socket_set_nonblock(fd);
+    qemu_set_nonblock(fd);

    ret = set_nodelay(fd);
    if (ret) {
--- a/block/tar.c
+++ b/block/tar.c
@@ -0,0 +1,365 @@
+/*
+ * Tar block driver
+ *
+ * Copyright (c) 2009 Alexander Graf <agraf@suse.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+
+// #define DEBUG
+
+#ifdef DEBUG
+#define dprintf(fmt, ...) do { printf("tar: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define dprintf(fmt, ...) do { } while (0)
+#endif
+
+#define SECTOR_SIZE      512
+
+#define POSIX_TAR_MAGIC  "ustar"
+#define OFFS_LENGTH      0x7c
+#define OFFS_TYPE        0x9c
+#define OFFS_MAGIC       0x101
+
+#define OFFS_S_SP        0x182
+#define OFFS_S_EXT       0x1e2
+#define OFFS_S_LENGTH    0x1e3
+#define OFFS_SX_EXT      0x1f8
+
+typedef struct SparseCache {
+    uint64_t start;
+    uint64_t end;
+} SparseCache;
+
+typedef struct BDRVTarState {
+    BlockDriverState *hd;
+    size_t file_sec;
+    uint64_t file_len;
+    SparseCache *sparse;
+    int sparse_num;
+    uint64_t last_end;
+    char longfile[2048];
+} BDRVTarState;
+
+static int tar_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    if (buf_size < OFFS_MAGIC + 5)
+        return 0;
+
+    /* we only support newer tar */
+    if (!strncmp((char*)buf + OFFS_MAGIC, POSIX_TAR_MAGIC, 5))
+        return 100;
+
+    return 0;
+}
+
+static int str_ends(char *str, const char *end)
+{
+    int end_len = strlen(end);
+    int str_len = strlen(str);
+
+    if (str_len < end_len)
+        return 0;
+
+    return !strncmp(str + str_len - end_len, end, end_len);
+}
+
+static int is_target_file(BlockDriverState *bs, char *filename,
+                          char *header)
+{
+    int retval = 0;
+
+    if (str_ends(filename, ".raw"))
+        retval = 1;
+
+    if (str_ends(filename, ".qcow"))
+        retval = 1;
+
+    if (str_ends(filename, ".qcow2"))
+        retval = 1;
+
+    if (str_ends(filename, ".vmdk"))
+        retval = 1;
+
+    if (retval &&
+        (header[OFFS_TYPE] != '0') &&
+        (header[OFFS_TYPE] != 'S')) {
+        retval = 0;
+    }
+
+    dprintf("does filename %s match? %s\n", filename, retval ? "yes" : "no");
+
+    /* make sure we're not using this name again */
+    filename[0] = '\0';
+
+    return retval;
+}
+
+static uint64_t tar2u64(char *ptr)
+{
+    /* Decode the 12-byte tar numeric header field starting at ptr. */
+    uint64_t retval;
+    char oldend = ptr[12];
+    /* Temporarily terminate the field so it parses as a C string */
+    ptr[12] = '\0';
+    if (*ptr & 0x80) {
+        /* XXX we only support files up to 64 bit length */
+        retval = be64_to_cpu(*(uint64_t *)(ptr+4));
+        dprintf("Convert %lx -> %#lx\n", *(uint64_t*)(ptr+4), retval);
+    } else {
+        /* strtoull, not strtol: a 32-bit long cannot hold sizes >= 2 GiB */
+        retval = strtoull(ptr, NULL, 8);
+        dprintf("Convert %s -> %#lx\n", ptr, retval);
+    }
+    ptr[12] = oldend; /* restore the byte that follows the field */
+    return retval;
+}
+
+static void tar_sparse(BDRVTarState *s, uint64_t offs, uint64_t len)
+{
+    SparseCache *sparse;
+
+    if (!len)
+        return;
+    if (!(offs - s->last_end)) {
+        s->last_end += len;
+        return;
+    }
+    if (s->last_end > offs)
+        return;
+
+    dprintf("Last chunk until %lx new chunk at %lx\n", s->last_end, offs);
+
+    s->sparse = g_realloc(s->sparse, (s->sparse_num + 1) * sizeof(SparseCache));
+    sparse = &s->sparse[s->sparse_num];
+    sparse->start = s->last_end;
+    sparse->end = offs;
+    s->last_end = offs + len;
+    s->sparse_num++;
+    dprintf("Sparse at %lx end=%lx\n", sparse->start,
+                                       sparse->end);
+}
+
+static int tar_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVTarState *s = bs->opaque;
+    char header[SECTOR_SIZE];
+    char *real_file = header;
+    char *magic;
+    const char *fname = filename;
+    size_t header_offs = 0;
+    int ret;
+
+    if (!strncmp(filename, "tar://", 6))
+        fname += 6;
+    else if (!strncmp(filename, "tar:", 4))
+        fname += 4;
+
+    ret = bdrv_file_open(&s->hd, fname, flags);
+    if (ret < 0)
+        return ret;
+
+    /* Search the file for an image */
+
+    do {
+        /* tar header */
+        if (bdrv_pread(s->hd, header_offs, header, SECTOR_SIZE) != SECTOR_SIZE)
+            goto fail;
+
+        if ((header_offs > 1) && !header[0]) {
+            fprintf(stderr, "Tar: No image file found in archive\n");
+            goto fail;
+        }
+
+        magic = &header[OFFS_MAGIC];
+        if (strncmp(magic, POSIX_TAR_MAGIC, 5)) {
+            fprintf(stderr, "Tar: Invalid magic: %s\n", magic);
+            goto fail;
+        }
+
+        dprintf("file type: %c\n", header[OFFS_TYPE]);
+
+        /* file length*/
+        s->file_len = (tar2u64(&header[OFFS_LENGTH]) + (SECTOR_SIZE - 1)) &
+                      ~(SECTOR_SIZE - 1);
+        s->file_sec = (header_offs / SECTOR_SIZE) + 1;
+
+        header_offs += s->file_len + SECTOR_SIZE;
+
+        if (header[OFFS_TYPE] == 'L') {
+            bdrv_pread(s->hd, header_offs - s->file_len, s->longfile,
+                       sizeof(s->longfile));
+            s->longfile[sizeof(s->longfile)-1] = '\0';
+            real_file = header;
+        } else if (s->longfile[0]) {
+            real_file = s->longfile;
+        } else {
+            real_file = header;
+        }
+    } while(!is_target_file(bs, real_file, header));
+
+    /* We found an image! */
+
+    if (header[OFFS_TYPE] == 'S') {
+        uint8_t isextended;
+        int i;
+
+        for (i = OFFS_S_SP; i < (OFFS_S_SP + (4 * 24)); i += 24)
+            tar_sparse(s, tar2u64(&header[i]), tar2u64(&header[i+12]));
+
+        s->file_len = tar2u64(&header[OFFS_S_LENGTH]);
+        isextended = header[OFFS_S_EXT];
+
+        while (isextended) {
+            if (bdrv_pread(s->hd, s->file_sec * SECTOR_SIZE, header,
+                           SECTOR_SIZE) != SECTOR_SIZE)
+                goto fail;
+
+            for (i = 0; i < (21 * 24); i += 24)
+                tar_sparse(s, tar2u64(&header[i]), tar2u64(&header[i+12]));
+            isextended = header[OFFS_SX_EXT];
+            s->file_sec++;
+        }
+        tar_sparse(s, s->file_len, 1);
+    }
+
+    return 0;
+
+fail:
+    fprintf(stderr, "Tar: Error opening file\n");
+    bdrv_delete(s->hd);
+    return -EINVAL;
+}
+
+typedef struct TarAIOCB {
+    BlockDriverAIOCB common;
+    QEMUBH *bh;
+} TarAIOCB;
+
+/* This callback gets invoked when we have pure sparseness */
+static void tar_sparse_cb(void *opaque)
+{
+    TarAIOCB *acb = (TarAIOCB *)opaque;
+
+    acb->common.cb(acb->common.opaque, 0);
+    qemu_bh_delete(acb->bh);
+    qemu_aio_release(acb);
+}
+
+static void tar_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+}
+
+static AIOCBInfo tar_aiocb_info = {
+    .aiocb_size         = sizeof(TarAIOCB),
+    .cancel             = tar_aio_cancel,
+};
+
+/*
+ * Serve a read: a request lying entirely inside a recorded sparse gap is
+ * answered with zeros, one partially overlapping a gap is read sector by
+ * sector, and anything else is forwarded to s->hd at the adjusted sector.
+ */
+static BlockDriverAIOCB *tar_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVTarState *s = bs->opaque;
+    SparseCache *sparse;
+    int64_t sec_file = sector_num + s->file_sec;
+    int64_t start = sector_num * SECTOR_SIZE;
+    int64_t end = start + (nb_sectors * SECTOR_SIZE);
+    int i;
+    TarAIOCB *acb;
+
+    for (i = 0; i < s->sparse_num; i++) {
+        sparse = &s->sparse[i];
+        if (sparse->start > end) {
+            /* The cache is expected to be sorted by increasing start */
+            break;
+        } else if ((sparse->start < start) && (sparse->end <= start)) {
+            /* sparse before our offset: it takes no space in the archive */
+            sec_file -= (sparse->end - sparse->start) / SECTOR_SIZE;
+        } else if ((sparse->start <= start) && (sparse->end >= end)) {
+            /* all our sectors are sparse */
+            char *buf = g_malloc0(nb_sectors * SECTOR_SIZE);
+            acb = qemu_aio_get(&tar_aiocb_info, bs, cb, opaque);
+            qemu_iovec_from_buf(qiov, 0, buf, nb_sectors * SECTOR_SIZE);
+            g_free(buf);
+            acb->bh = qemu_bh_new(tar_sparse_cb, acb);
+            qemu_bh_schedule(acb->bh);
+
+            return &acb->common;
+        } else if (((sparse->start >= start) && (sparse->start < end)) ||
+                   ((sparse->end >= start) && (sparse->end < end))) {
+            /* semi-sparse (worst case): go synchronous, sector by sector */
+            char *buf = g_malloc(nb_sectors * SECTOR_SIZE);
+            uint64_t offs;
+            for (offs = 0; offs < (nb_sectors * SECTOR_SIZE);
+                 offs += SECTOR_SIZE) {
+                bdrv_pread(bs, (sector_num * SECTOR_SIZE) + offs,
+                           buf + offs, SECTOR_SIZE);
+            }
+            qemu_iovec_from_buf(qiov, 0, buf, nb_sectors * SECTOR_SIZE);
+            g_free(buf); /* contents were copied into qiov; don't leak it */
+            acb = qemu_aio_get(&tar_aiocb_info, bs, cb, opaque);
+            acb->bh = qemu_bh_new(tar_sparse_cb, acb);
+            qemu_bh_schedule(acb->bh);
+            return &acb->common;
+        }
+    }
+
+    return bdrv_aio_readv(s->hd, sec_file, qiov, nb_sectors,
+                          cb, opaque);
+}
+
+static void tar_close(BlockDriverState *bs)
+{
+    dprintf("Close\n");
+}
+
+static int64_t tar_getlength(BlockDriverState *bs)
+{
+    BDRVTarState *s = bs->opaque;
+    dprintf("getlength -> %ld\n", s->file_len);
+    return s->file_len;
+}
+
+static BlockDriver bdrv_tar = {
+    .format_name     = "tar",
+    .protocol_name   = "tar",
+
+    .instance_size   = sizeof(BDRVTarState),
+    .bdrv_file_open  = tar_open,
+    .bdrv_close      = tar_close,
+    .bdrv_getlength  = tar_getlength,
+    .bdrv_probe      = tar_probe,
+
+    .bdrv_aio_readv  = tar_aio_readv,
+};
+
+static void tar_block_init(void)
+{
+    bdrv_register(&bdrv_tar);
+}
+
+block_init(tar_block_init);
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -120,6 +120,11 @@ typedef unsigned char uuid_t[16];

 #define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)

+/* max blocks in image is (0xffffffff / 4) */
+#define VDI_BLOCKS_IN_IMAGE_MAX  0x3fffffff
+#define VDI_DISK_SIZE_MAX        ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \
+                                  (uint64_t)DEFAULT_CLUSTER_SIZE)
+
 #if !defined(CONFIG_UUID)
 static inline void uuid_generate(uuid_t out)
 {
@@ -383,6 +388,14 @@ static int vdi_open(BlockDriverState *bs, int flags)
    vdi_header_print(&header);
 #endif

+    if (header.disk_size > VDI_DISK_SIZE_MAX) {
+        logout("Unsupported VDI image size (size is 0x%" PRIx64
+               ", max supported is 0x%" PRIx64 ")\n",
+               header.disk_size, VDI_DISK_SIZE_MAX);
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
    if (header.disk_size % SECTOR_SIZE != 0) {
        /* 'VBoxManage convertfromraw' can create images with odd disk sizes.
           We accept them but round the disk size to the next multiple of
@@ -415,8 +428,9 @@ static int vdi_open(BlockDriverState *bs, int flags)
        logout("unsupported sector size %u B\n", header.sector_size);
        ret = -ENOTSUP;
        goto fail;
-    } else if (header.block_size != 1 * MiB) {
-        logout("unsupported block size %u B\n", header.block_size);
+    } else if (header.block_size != DEFAULT_CLUSTER_SIZE) {
+        logout("unsupported VDI image (block size %u is not %u)\n",
+               header.block_size, DEFAULT_CLUSTER_SIZE);
        ret = -ENOTSUP;
        goto fail;
    } else if (header.disk_size >
@@ -432,6 +446,11 @@ static int vdi_open(BlockDriverState *bs, int flags)
        logout("parent uuid != 0, unsupported\n");
        ret = -ENOTSUP;
        goto fail;
+    } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) {
+        logout("unsupported VDI image (too many blocks %u, max is %u)\n",
+               header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX);
+        ret = -ENOTSUP;
+        goto fail;
    }

    bs->total_sectors = header.disk_size / SECTOR_SIZE;
@@ -668,11 +687,20 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
        options++;
    }

+    if (bytes > VDI_DISK_SIZE_MAX) {
+        result = -ENOTSUP;
+        logout("Unsupported VDI image size (size is 0x%" PRIx64
+               ", max supported is 0x%" PRIx64 ")\n",
+               bytes, VDI_DISK_SIZE_MAX);
+        goto exit;
+    }
+
    fd = qemu_open(filename,
                   O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
                   0644);
    if (fd < 0) {
-        return -errno;
+        result = -errno;
+        goto exit;
    }

    /* We need enough blocks to store the given disk size,
@@ -733,6 +761,7 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
        result = -errno;
    }

+exit:
    return result;
 }

--- a/block/vpc.c
+++ b/block/vpc.c
@@ -45,6 +45,8 @@ enum vhd_type {
 // Seconds since Jan 1, 2000 0:00:00 (UTC)
 #define VHD_TIMESTAMP_BASE 946684800

+#define VHD_MAX_SECTORS       (65535LL * 255 * 255)
+
 // always big-endian
 struct vhd_footer {
    char        creator[8]; // "conectix"
@@ -163,6 +165,7 @@ static int vpc_open(BlockDriverState *bs, int flags)
    struct vhd_dyndisk_header* dyndisk_header;
    uint8_t buf[HEADER_SIZE];
    uint32_t checksum;
+    uint64_t computed_size;
    int disk_type = VHD_DYNAMIC;
    int ret;

@@ -211,7 +214,7 @@ static int vpc_open(BlockDriverState *bs, int flags)
        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;

    /* Allow a maximum disk size of approximately 2 TB */
-    if (bs->total_sectors >= 65535LL * 255 * 255) {
+    if (bs->total_sectors >= VHD_MAX_SECTORS) {
        ret = -EFBIG;
        goto fail;
    }
@@ -231,10 +234,32 @@ static int vpc_open(BlockDriverState *bs, int flags)
        }

        s->block_size = be32_to_cpu(dyndisk_header->block_size);
+        if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
+            qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                          "Invalid block size %" PRIu32, s->block_size);
+            ret = -EINVAL;
+            goto fail;
+        }
        s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;

        s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
-        s->pagetable = g_malloc(s->max_table_entries * 4);
+
+        if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
+            ret = -EINVAL;
+            goto fail;
+        }
+        if (s->max_table_entries > (VHD_MAX_SECTORS * 512) / s->block_size) {
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        computed_size = (uint64_t) s->max_table_entries * s->block_size;
+        if (computed_size < bs->total_sectors * 512) {
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4);

        s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);

@@ -280,7 +305,7 @@ static int vpc_open(BlockDriverState *bs, int flags)
    return 0;

 fail:
-    g_free(s->pagetable);
+    qemu_vfree(s->pagetable);
 #ifdef CACHE
    g_free(s->pageentry_u8);
 #endif
@@ -789,7 +814,7 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options)
 static void vpc_close(BlockDriverState *bs)
 {
    BDRVVPCState *s = bs->opaque;
-    g_free(s->pagetable);
+    qemu_vfree(s->pagetable);
 #ifdef CACHE
    g_free(s->pageentry_u8);
 #endif
--- a/blockdev.c
+++ b/blockdev.c
@@ -570,7 +570,7 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type)
        /* add virtio block device */
        opts = qemu_opts_create_nofail(qemu_find_opts("device"));
        if (arch_type == QEMU_ARCH_S390X) {
-            qemu_opt_set(opts, "driver", "virtio-blk-s390");
+            qemu_opt_set(opts, "driver", "virtio-blk-ccw");
        } else {
            qemu_opt_set(opts, "driver", "virtio-blk-pci");
        }
@@ -1043,6 +1043,9 @@ void qmp_block_resize(const char *device, int64_t size, Error **errp)
        return;
    }

+    /* complete all in-flight operations before resizing the device */
+    bdrv_drain_all();
+
    switch (bdrv_truncate(bs, size)) {
    case 0:
        break;
--- a/94
+++ b/94
@@ -283,7 +283,7 @@ sdl_config="${SDL_CONFIG-${cross_prefix}sdl-config}"
 # default flags for all hosts
 QEMU_CFLAGS="-fno-strict-aliasing $QEMU_CFLAGS"
 QEMU_CFLAGS="-Wall -Wundef -Wwrite-strings -Wmissing-prototypes $QEMU_CFLAGS"
-QEMU_CFLAGS="-Wstrict-prototypes -Wredundant-decls $QEMU_CFLAGS"
+QEMU_CFLAGS="-Wstrict-prototypes $QEMU_CFLAGS"
 QEMU_CFLAGS="-D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE $QEMU_CFLAGS"
 QEMU_INCLUDES="-I. -I\$(SRC_PATH) -I\$(SRC_PATH)/include"
 if test "$debug_info" = "yes"; then
@@ -1435,6 +1435,7 @@ fi
 if test "$seccomp" != "no" ; then
    if $pkg_config --atleast-version=1.0.0 libseccomp --modversion >/dev/null 2>&1; then
        libs_softmmu="$libs_softmmu `$pkg_config --libs libseccomp`"
+        QEMU_CFLAGS="$QEMU_CFLAGS `$pkg_config --cflags libseccomp`"
 	seccomp="yes"
    else
 	if test "$seccomp" = "yes"; then
@@ -2759,7 +2760,13 @@ if test "$libiscsi" != "no" ; then
 #include <iscsi/iscsi.h>
 int main(void) { iscsi_unmap_sync(NULL,0,0,0,NULL,0); return 0; }
 EOF
-  if compile_prog "" "-liscsi" ; then
+  if $pkg_config --atleast-version=1.7.0 libiscsi --modversion >/dev/null 2>&1; then
+    libiscsi="yes"
+    libiscsi_cflags=$($pkg_config --cflags libiscsi 2>/dev/null)
+    libiscsi_libs=$($pkg_config --libs libiscsi 2>/dev/null)
+    CFLAGS="$CFLAGS $libiscsi_cflags"
+    LIBS="$LIBS $libiscsi_libs"
+  elif compile_prog "" "-liscsi" ; then
    libiscsi="yes"
    LIBS="$LIBS -liscsi"
  else
@@ -2827,7 +2834,7 @@ EOF
  spice_cflags=$($pkg_config --cflags spice-protocol spice-server 2>/dev/null)
  spice_libs=$($pkg_config --libs spice-protocol spice-server 2>/dev/null)
  if $pkg_config --atleast-version=0.12.0 spice-server >/dev/null 2>&1 && \
-     $pkg_config --atleast-version=0.12.2 spice-protocol > /dev/null 2>&1 && \
+     $pkg_config --atleast-version=0.12.3 spice-protocol > /dev/null 2>&1 && \
     compile_prog "$spice_cflags" "$spice_libs" ; then
    spice="yes"
    libs_softmmu="$libs_softmmu $spice_libs"
@@ -3029,34 +3036,67 @@ fi
 ##########################################
 # check and set a backend for coroutine

-# default is ucontext, but always fallback to gthread
-# windows autodetected by make
-if test "$coroutine" = "" -o "$coroutine" = "ucontext"; then
-  if test "$darwin" != "yes"; then
-    cat > $TMPC << EOF
+# We prefer ucontext, but it's not always possible. The fallback
+# is sigcontext. gthread is not selectable except explicitly, because
+# it is not functional enough to run QEMU proper. (It is occasionally
+# useful for debugging purposes.)  On Windows the only valid backend
+# is the Windows-specific one.
+
+ucontext_works=no
+if test "$darwin" != "yes"; then
+  cat > $TMPC << EOF
 #include <ucontext.h>
 #ifdef __stub_makecontext
 #error Ignoring glibc stub makecontext which will always fail
 #endif
 int main(void) { makecontext(0, 0, 0); return 0; }
 EOF
-    if compile_prog "" "" ; then
-        coroutine_backend=ucontext
-    else
-	coroutine_backend=gthread
-    fi
+  if compile_prog "" "" ; then
+    ucontext_works=yes
+  fi
+fi
+
+if test "$coroutine" = ""; then
+  if test "$mingw32" = "yes"; then
+    coroutine=win32
+  elif test "$ucontext_works" = "yes"; then
+    coroutine=ucontext
+  else
+    coroutine=sigaltstack
  fi
-elif test "$coroutine" = "gthread" ; then
-  coroutine_backend=gthread
-elif test "$coroutine" = "windows" ; then
-  coroutine_backend=windows
-elif test "$coroutine" = "sigaltstack" ; then
-  coroutine_backend=sigaltstack
 else
-  echo
-  echo "Error: unknown coroutine backend $coroutine"
-  echo
-  exit 1
+  case $coroutine in
+  windows)
+    if test "$mingw32" != "yes"; then
+      echo
+      echo "Error: 'windows' coroutine backend only valid for Windows"
+      echo
+      exit 1
+    fi
+    # Unfortunately the user visible backend name doesn't match the
+    # coroutine-*.c filename for this case, so we have to adjust it here.
+    coroutine=win32
+    ;;
+  ucontext)
+    if test "$ucontext_works" != "yes"; then
+      feature_not_found "ucontext"
+    fi
+    ;;
+  gthread|sigaltstack)
+    if test "$mingw32" = "yes"; then
+      echo
+      echo "Error: only the 'windows' coroutine backend is valid for Windows"
+      echo
+      exit 1
+    fi
+    ;;
+  *)
+    echo
+    echo "Error: unknown coroutine backend $coroutine"
+    echo
+    exit 1
+    ;;
+  esac
 fi

 ##########################################
@@ -3339,7 +3379,7 @@ echo "OpenGL support    $opengl"
 echo "libiscsi support  $libiscsi"
 echo "build guest agent $guest_agent"
 echo "seccomp support   $seccomp"
-echo "coroutine backend $coroutine_backend"
+echo "coroutine backend $coroutine"
 echo "GlusterFS support $glusterfs"
 echo "virtio-blk-data-plane $virtio_blk_data_plane"
 echo "gcov              $gcov_tool"
@@ -3662,11 +3702,7 @@ if test "$rbd" = "yes" ; then
  echo "CONFIG_RBD=y" >> $config_host_mak
 fi

-if test "$coroutine_backend" = "ucontext" ; then
-  echo "CONFIG_UCONTEXT_COROUTINE=y" >> $config_host_mak
-elif test "$coroutine_backend" = "sigaltstack" ; then
-  echo "CONFIG_SIGALTSTACK_COROUTINE=y" >> $config_host_mak
-fi
+echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak

 if test "$open_by_handle_at" = "yes" ; then
  echo "CONFIG_OPEN_BY_HANDLE=y" >> $config_host_mak
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -51,12 +51,32 @@ void cpu_resume_from_signal(CPUArchState *env, void *puc)
 }
 #endif

+/* Execute the TB at tb_ptr, fix up CPU state if needed, return its exit value */
+static inline tcg_target_ulong cpu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
+{
+    tcg_target_ulong next_tb = tcg_qemu_tb_exec(env, tb_ptr);
+    if ((next_tb & TB_EXIT_MASK) > TB_EXIT_IDX1) {
+        /* We didn't start executing this TB (eg because the instruction
+         * counter hit zero); we must restore the guest PC to the address
+         * of the start of the TB.
+         */
+        TranslationBlock *tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
+        cpu_pc_from_tb(env, tb);
+    }
+    if ((next_tb & TB_EXIT_MASK) == TB_EXIT_REQUESTED) {
+        /* We were asked to stop executing TBs (probably a pending
+         * interrupt). We've now stopped, so clear the flag.
+         */
+        env->tcg_exit_req = 0;
+    }
+    return next_tb;
+}
+
 /* Execute the code without caching the generated code. An interpreter
   could be used if available. */
 static void cpu_exec_nocache(CPUArchState *env, int max_cycles,
                             TranslationBlock *orig_tb)
 {
-    tcg_target_ulong next_tb;
    TranslationBlock *tb;

    /* Should never happen.
@@ -68,14 +88,8 @@ static void cpu_exec_nocache(CPUArchState *env, int max_cycles,
                     max_cycles);
    env->current_tb = tb;
    /* execute the generated code */
-    next_tb = tcg_qemu_tb_exec(env, tb->tc_ptr);
+    cpu_tb_exec(env, tb->tc_ptr);
    env->current_tb = NULL;
-
-    if ((next_tb & 3) == 2) {
-        /* Restore PC.  This may happen if async event occurs before
-           the TB starts executing.  */
-        cpu_pc_from_tb(env, tb);
-    }
    tb_phys_invalidate(tb, -1);
    tb_free(tb);
 }
@@ -196,6 +210,7 @@ int cpu_exec(CPUArchState *env)
    }

    cpu_single_env = env;
+    current_cpu = cpu;

    if (unlikely(exit_request)) {
        env->exit_request = 1;
@@ -583,7 +598,8 @@ int cpu_exec(CPUArchState *env)
                   spans two pages, we cannot safely do a direct
                   jump. */
                if (next_tb != 0 && tb->page_addr[1] == -1) {
-                    tb_add_jump((TranslationBlock *)(next_tb & ~3), next_tb & 3, tb);
+                    tb_add_jump((TranslationBlock *)(next_tb & ~TB_EXIT_MASK),
+                                next_tb & TB_EXIT_MASK, tb);
                }
                spin_unlock(&tb_lock);

@@ -596,13 +612,24 @@ int cpu_exec(CPUArchState *env)
                if (likely(!env->exit_request)) {
                    tc_ptr = tb->tc_ptr;
                    /* execute the generated code */
-                    next_tb = tcg_qemu_tb_exec(env, tc_ptr);
-                    if ((next_tb & 3) == 2) {
+                    next_tb = cpu_tb_exec(env, tc_ptr);
+                    switch (next_tb & TB_EXIT_MASK) {
+                    case TB_EXIT_REQUESTED:
+                        /* Something asked us to stop executing
+                         * chained TBs; just continue round the main
+                         * loop. Whatever requested the exit will also
+                         * have set something else (eg exit_request or
+                         * interrupt_request) which we will handle
+                         * next time around the loop.
+                         */
+                        tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
+                        next_tb = 0;
+                        break;
+                    case TB_EXIT_ICOUNT_EXPIRED:
+                    {
                        /* Instruction counter expired.  */
                        int insns_left;
-                        tb = (TranslationBlock *)(next_tb & ~3);
-                        /* Restore PC.  */
-                        cpu_pc_from_tb(env, tb);
+                        tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
                        insns_left = env->icount_decr.u32;
                        if (env->icount_extra && insns_left >= 0) {
                            /* Refill decrementer and continue execution.  */
@@ -623,6 +650,10 @@ int cpu_exec(CPUArchState *env)
                            next_tb = 0;
                            cpu_loop_exit(env);
                        }
+                        break;
+                    }
+                    default:
+                        break;
                    }
                }
                env->current_tb = NULL;
@@ -633,6 +664,7 @@ int cpu_exec(CPUArchState *env)
            /* Reload env after longjmp - the compiler may have smashed all
             * local variables as longjmp is marked 'noreturn'. */
            env = cpu_single_env;
+            cpu = current_cpu;
        }
    } /* for(;;) */

@@ -667,5 +699,6 @@ int cpu_exec(CPUArchState *env)

    /* fail safe : never use cpu_single_env outside cpu_exec() */
    cpu_single_env = NULL;
+    current_cpu = NULL;
    return ret;
 }
--- a/cpus.c
+++ b/cpus.c
@@ -38,6 +38,10 @@
 #include "qemu/main-loop.h"
 #include "qemu/bitmap.h"

+#ifdef CONFIG_SECCOMP
+#include "sysemu/seccomp.h"
+#endif
+
 #ifndef _WIN32
 #include "qemu/compatfd.h"
 #endif
@@ -432,6 +436,15 @@ void cpu_synchronize_all_post_init(void)
    }
 }

+void cpu_clean_all_dirty(void)
+{
+    CPUArchState *cpu;
+
+    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
+        cpu_clean_state(cpu);
+    }
+}
+
 bool cpu_is_stopped(CPUState *cpu)
 {
    return !runstate_is_running() || cpu->stopped;
@@ -665,9 +678,11 @@ void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
    qemu_cpu_kick(cpu);
    while (!wi.done) {
        CPUArchState *self_env = cpu_single_env;
+        CPUState *self_cpu = current_cpu;

        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        cpu_single_env = self_env;
+        current_cpu = self_cpu;
    }
 }

@@ -737,10 +752,15 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
    CPUState *cpu = ENV_GET_CPU(env);
    int r;

+#ifdef CONFIG_SECCOMP
+    seccomp_start(!!0);
+#endif
+
    qemu_mutex_lock(&qemu_global_mutex);
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu_single_env = env;
+    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
@@ -778,6 +798,10 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
    sigset_t waitset;
    int r;

+#ifdef CONFIG_SECCOMP
+    seccomp_start(!!0);
+#endif
+
    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
@@ -790,8 +814,10 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
    qemu_cond_signal(&qemu_cpu_cond);

    cpu_single_env = env;
+    current_cpu = cpu;
    while (1) {
        cpu_single_env = NULL;
+        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
@@ -803,6 +829,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
        }
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
+        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

@@ -817,6 +844,10 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
    CPUState *cpu = arg;
    CPUArchState *env;

+#ifdef CONFIG_SECCOMP
+    seccomp_start(!!0);
+#endif
+
    qemu_tcg_init_cpu_signals();
    qemu_thread_get_self(cpu->thread);

@@ -1328,6 +1359,18 @@ exit:
    fclose(f);
 }

+bool spec_ctrl_is_inconsistent(void)
+{
+#if defined(TARGET_I386)
+    X86CPU *x86_cpu = X86_CPU(current_cpu);
+    CPUX86State *env = x86_cpu != NULL ? &x86_cpu->env : NULL;
+    if (env && !(env->cpuid_7_0_edx_features & CPUID_7_0_EDX_SPEC_CTRL) &&
+	    env->spec_ctrl)
+        return true;
+#endif
+    return false;
+}
+
 void qmp_inject_nmi(Error **errp)
 {
 #if defined(TARGET_I386)
--- a/device_tree.c
+++ b/device_tree.c
@@ -85,7 +85,7 @@ void *load_device_tree(const char *filename_path, int *sizep)
    /* First allocate space in qemu for device tree */
    fdt = g_malloc0(dt_size);

-    dt_file_load_size = load_image(filename_path, fdt);
+    dt_file_load_size = load_image_size(filename_path, fdt, dt_size);
    if (dt_file_load_size < 0) {
        printf("Unable to open device tree file '%s'\n",
               filename_path);
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -157,7 +157,6 @@ static const VMStateDescription vmstate_kbd = {
    .name = "pckbd",
    .version_id = 3,
    .minimum_version_id = 3,
-    .minimum_version_id_old = 3,
    .fields      = (VMStateField []) {
        VMSTATE_UINT8(write_cmd, KBDState),
        VMSTATE_UINT8(status, KBDState),
@@ -186,12 +185,13 @@ You can see that there are several version fields:
 - minimum_version_id: the minimum version_id that VMState is able to understand
  for that device.
 - minimum_version_id_old: For devices that were not able to port to vmstate, we can
-  assign a function that knows how to read this old state.
+  assign a function that knows how to read this old state. This field is
+  ignored if there is no load_state_old handler.

 So, VMState is able to read versions from minimum_version_id to
-version_id.  And the function load_state_old() is able to load state
-from minimum_version_id_old to minimum_version_id.  This function is
-deprecated and will be removed when no more users are left.
+version_id.  And the function load_state_old() (if present) is able to
+load state from minimum_version_id_old to minimum_version_id.  This
+function is deprecated and will be removed when no more users are left.

 ===  Massaging functions ===

@@ -272,7 +272,6 @@ const VMStateDescription vmstate_ide_drive_pio_state = {
    .name = "ide_drive/pio_state",
    .version_id = 1,
    .minimum_version_id = 1,
-    .minimum_version_id_old = 1,
    .pre_save = ide_drive_pio_pre_save,
    .post_load = ide_drive_pio_post_load,
    .fields      = (VMStateField []) {
@@ -292,7 +291,6 @@ const VMStateDescription vmstate_ide_drive = {
    .name = "ide_drive",
    .version_id = 3,
    .minimum_version_id = 0,
-    .minimum_version_id_old = 0,
    .post_load = ide_drive_post_load,
    .fields      = (VMStateField []) {
        .... several fields ....
--- a/dump-stub.c
+++ b/dump-stub.c
@@ -52,7 +52,8 @@ int cpu_write_elf32_qemunote(write_core_dump_function f,
    return -1;
 }

-int cpu_get_dump_info(ArchDumpInfo *info)
+int cpu_get_dump_info(ArchDumpInfo *info,
+                      const struct GuestPhysBlockList *guest_phys_blocks)
 {
    return -1;
 }
--- a/dump.c
+++ b/dump.c
@@ -21,6 +21,7 @@
 #include "sysemu/dump.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/memory_mapping.h"
+#include "sysemu/cpus.h"
 #include "qapi/error.h"
 #include "qmp-commands.h"
 #include "exec/gdbstub.h"
@@ -59,6 +60,7 @@ static uint64_t cpu_convert_to_target64(uint64_t val, int endian)
 }

 typedef struct DumpState {
+    GuestPhysBlockList guest_phys_blocks;
    ArchDumpInfo dump_info;
    MemoryMappingList list;
    uint16_t phdr_num;
@@ -69,7 +71,7 @@ typedef struct DumpState {
    hwaddr memory_offset;
    int fd;

-    RAMBlock *block;
+    GuestPhysBlock *next_block;
    ram_addr_t start;
    bool has_filter;
    int64_t begin;
@@ -81,6 +83,7 @@ static int dump_cleanup(DumpState *s)
 {
    int ret = 0;

+    guest_phys_blocks_free(&s->guest_phys_blocks);
    memory_mapping_list_free(&s->list);
    if (s->fd != -1) {
        close(s->fd);
@@ -187,7 +190,8 @@ static int write_elf32_header(DumpState *s)
 }

 static int write_elf64_load(DumpState *s, MemoryMapping *memory_mapping,
-                            int phdr_index, hwaddr offset)
+                            int phdr_index, hwaddr offset,
+                            hwaddr filesz)
 {
    Elf64_Phdr phdr;
    int ret;
@@ -197,15 +201,12 @@ static int write_elf64_load(DumpState *s, MemoryMapping *memory_mapping,
    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
    phdr.p_offset = cpu_convert_to_target64(offset, endian);
    phdr.p_paddr = cpu_convert_to_target64(memory_mapping->phys_addr, endian);
-    if (offset == -1) {
-        /* When the memory is not stored into vmcore, offset will be -1 */
-        phdr.p_filesz = 0;
-    } else {
-        phdr.p_filesz = cpu_convert_to_target64(memory_mapping->length, endian);
-    }
+    phdr.p_filesz = cpu_convert_to_target64(filesz, endian);
    phdr.p_memsz = cpu_convert_to_target64(memory_mapping->length, endian);
    phdr.p_vaddr = cpu_convert_to_target64(memory_mapping->virt_addr, endian);

+    assert(memory_mapping->length >= filesz);
+
    ret = fd_write_vmcore(&phdr, sizeof(Elf64_Phdr), s);
    if (ret < 0) {
        dump_error(s, "dump: failed to write program header table.\n");
@@ -216,7 +217,8 @@ static int write_elf64_load(DumpState *s, MemoryMapping *memory_mapping,
 }

 static int write_elf32_load(DumpState *s, MemoryMapping *memory_mapping,
-                            int phdr_index, hwaddr offset)
+                            int phdr_index, hwaddr offset,
+                            hwaddr filesz)
 {
    Elf32_Phdr phdr;
    int ret;
@@ -226,15 +228,12 @@ static int write_elf32_load(DumpState *s, MemoryMapping *memory_mapping,
    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
    phdr.p_offset = cpu_convert_to_target32(offset, endian);
    phdr.p_paddr = cpu_convert_to_target32(memory_mapping->phys_addr, endian);
-    if (offset == -1) {
-        /* When the memory is not stored into vmcore, offset will be -1 */
-        phdr.p_filesz = 0;
-    } else {
-        phdr.p_filesz = cpu_convert_to_target32(memory_mapping->length, endian);
-    }
+    phdr.p_filesz = cpu_convert_to_target32(filesz, endian);
    phdr.p_memsz = cpu_convert_to_target32(memory_mapping->length, endian);
    phdr.p_vaddr = cpu_convert_to_target32(memory_mapping->virt_addr, endian);

+    assert(memory_mapping->length >= filesz);
+
    ret = fd_write_vmcore(&phdr, sizeof(Elf32_Phdr), s);
    if (ret < 0) {
        dump_error(s, "dump: failed to write program header table.\n");
@@ -388,14 +387,14 @@ static int write_data(DumpState *s, void *buf, int length)
 }

 /* write the memroy to vmcore. 1 page per I/O. */
-static int write_memory(DumpState *s, RAMBlock *block, ram_addr_t start,
+static int write_memory(DumpState *s, GuestPhysBlock *block, ram_addr_t start,
                        int64_t size)
 {
    int64_t i;
    int ret;

    for (i = 0; i < size / TARGET_PAGE_SIZE; i++) {
-        ret = write_data(s, block->host + start + i * TARGET_PAGE_SIZE,
+        ret = write_data(s, block->host_addr + start + i * TARGET_PAGE_SIZE,
                         TARGET_PAGE_SIZE);
        if (ret < 0) {
            return ret;
@@ -403,7 +402,7 @@ static int write_memory(DumpState *s, RAMBlock *block, ram_addr_t start,
    }

    if ((size % TARGET_PAGE_SIZE) != 0) {
-        ret = write_data(s, block->host + start + i * TARGET_PAGE_SIZE,
+        ret = write_data(s, block->host_addr + start + i * TARGET_PAGE_SIZE,
                         size % TARGET_PAGE_SIZE);
        if (ret < 0) {
            return ret;
@@ -413,57 +412,71 @@ static int write_memory(DumpState *s, RAMBlock *block, ram_addr_t start,
    return 0;
 }

-/* get the memory's offset in the vmcore */
-static hwaddr get_offset(hwaddr phys_addr,
-                                     DumpState *s)
+/* get the memory's offset and size in the vmcore */
+static void get_offset_range(hwaddr phys_addr,
+                             ram_addr_t mapping_length,
+                             DumpState *s,
+                             hwaddr *p_offset,
+                             hwaddr *p_filesz)
 {
-    RAMBlock *block;
+    GuestPhysBlock *block;
    hwaddr offset = s->memory_offset;
    int64_t size_in_block, start;

+    /* When the memory is not stored into vmcore, offset will be -1 */
+    *p_offset = -1;
+    *p_filesz = 0;
+
    if (s->has_filter) {
        if (phys_addr < s->begin || phys_addr >= s->begin + s->length) {
-            return -1;
+            return;
        }
    }

-    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+    QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) {
        if (s->has_filter) {
-            if (block->offset >= s->begin + s->length ||
-                block->offset + block->length <= s->begin) {
+            if (block->target_start >= s->begin + s->length ||
+                block->target_end <= s->begin) {
                /* This block is out of the range */
                continue;
            }

-            if (s->begin <= block->offset) {
-                start = block->offset;
+            if (s->begin <= block->target_start) {
+                start = block->target_start;
            } else {
                start = s->begin;
            }

-            size_in_block = block->length - (start - block->offset);
-            if (s->begin + s->length < block->offset + block->length) {
-                size_in_block -= block->offset + block->length -
-                                 (s->begin + s->length);
+            size_in_block = block->target_end - start;
+            if (s->begin + s->length < block->target_end) {
+                size_in_block -= block->target_end - (s->begin + s->length);
            }
        } else {
-            start = block->offset;
-            size_in_block = block->length;
+            start = block->target_start;
+            size_in_block = block->target_end - block->target_start;
        }

        if (phys_addr >= start && phys_addr < start + size_in_block) {
-            return phys_addr - start + offset;
+            *p_offset = phys_addr - start + offset;
+
+            /* The offset range mapped from the vmcore file must not spill over
+             * the GuestPhysBlock, clamp it. The rest of the mapping will be
+             * zero-filled in memory at load time; see
+             * <http://refspecs.linuxbase.org/elf/gabi4+/ch5.pheader.html>.
+             */
+            *p_filesz = phys_addr + mapping_length <= start + size_in_block ?
+                        mapping_length :
+                        size_in_block - (phys_addr - start);
+            return;
        }

        offset += size_in_block;
    }
-
-    return -1;
 }

 static int write_elf_loads(DumpState *s)
 {
-    hwaddr offset;
+    hwaddr offset, filesz;
    MemoryMapping *memory_mapping;
    uint32_t phdr_index = 1;
    int ret;
@@ -476,11 +489,15 @@ static int write_elf_loads(DumpState *s)
    }

    QTAILQ_FOREACH(memory_mapping, &s->list.head, next) {
-        offset = get_offset(memory_mapping->phys_addr, s);
+        get_offset_range(memory_mapping->phys_addr,
+                         memory_mapping->length,
+                         s, &offset, &filesz);
        if (s->dump_info.d_class == ELFCLASS64) {
-            ret = write_elf64_load(s, memory_mapping, phdr_index++, offset);
+            ret = write_elf64_load(s, memory_mapping, phdr_index++, offset,
+                                   filesz);
        } else {
-            ret = write_elf32_load(s, memory_mapping, phdr_index++, offset);
+            ret = write_elf32_load(s, memory_mapping, phdr_index++, offset,
+                                   filesz);
        }

        if (ret < 0) {
@@ -591,7 +608,7 @@ static int dump_completed(DumpState *s)
    return 0;
 }

-static int get_next_block(DumpState *s, RAMBlock *block)
+static int get_next_block(DumpState *s, GuestPhysBlock *block)
 {
    while (1) {
        block = QTAILQ_NEXT(block, next);
@@ -601,16 +618,16 @@ static int get_next_block(DumpState *s, RAMBlock *block)
        }

        s->start = 0;
-        s->block = block;
+        s->next_block = block;
        if (s->has_filter) {
-            if (block->offset >= s->begin + s->length ||
-                block->offset + block->length <= s->begin) {
+            if (block->target_start >= s->begin + s->length ||
+                block->target_end <= s->begin) {
                /* This block is out of the range */
                continue;
            }

-            if (s->begin > block->offset) {
-                s->start = s->begin - block->offset;
+            if (s->begin > block->target_start) {
+                s->start = s->begin - block->target_start;
            }
        }

@@ -621,18 +638,18 @@ static int get_next_block(DumpState *s, RAMBlock *block)
 /* write all memory to vmcore */
 static int dump_iterate(DumpState *s)
 {
-    RAMBlock *block;
+    GuestPhysBlock *block;
    int64_t size;
    int ret;

    while (1) {
-        block = s->block;
+        block = s->next_block;

-        size = block->length;
+        size = block->target_end - block->target_start;
        if (s->has_filter) {
            size -= s->start;
-            if (s->begin + s->length < block->offset + block->length) {
-                size -= block->offset + block->length - (s->begin + s->length);
+            if (s->begin + s->length < block->target_end) {
+                size -= block->target_end - (s->begin + s->length);
            }
        }
        ret = write_memory(s, block, s->start, size);
@@ -667,23 +684,23 @@ static int create_vmcore(DumpState *s)

 static ram_addr_t get_start_block(DumpState *s)
 {
-    RAMBlock *block;
+    GuestPhysBlock *block;

    if (!s->has_filter) {
-        s->block = QTAILQ_FIRST(&ram_list.blocks);
+        s->next_block = QTAILQ_FIRST(&s->guest_phys_blocks.head);
        return 0;
    }

-    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (block->offset >= s->begin + s->length ||
-            block->offset + block->length <= s->begin) {
+    QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) {
+        if (block->target_start >= s->begin + s->length ||
+            block->target_end <= s->begin) {
            /* This block is out of the range */
            continue;
        }

-        s->block = block;
-        if (s->begin > block->offset) {
-            s->start = s->begin - block->offset;
+        s->next_block = block;
+        if (s->begin > block->target_start) {
+            s->start = s->begin - block->target_start;
        } else {
            s->start = 0;
        }
@@ -707,32 +724,35 @@ static int dump_init(DumpState *s, int fd, bool paging, bool has_filter,
        s->resume = false;
    }

+    /* If we use KVM, we should synchronize the registers before we get dump
+     * info or physmap info.
+     */
+    cpu_synchronize_all_states();
+    nr_cpus = 0;
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        nr_cpus++;
+    }
+
    s->errp = errp;
    s->fd = fd;
    s->has_filter = has_filter;
    s->begin = begin;
    s->length = length;
+
+    guest_phys_blocks_init(&s->guest_phys_blocks);
+    guest_phys_blocks_append(&s->guest_phys_blocks);
+
    s->start = get_start_block(s);
    if (s->start == -1) {
        error_set(errp, QERR_INVALID_PARAMETER, "begin");
        goto cleanup;
    }

-    /*
-     * get dump info: endian, class and architecture.
+    /* get dump info: endian, class and architecture.
     * If the target architecture is not supported, cpu_get_dump_info() will
     * return -1.
-     *
-     * if we use kvm, we should synchronize the register before we get dump
-     * info.
     */
-    nr_cpus = 0;
-    for (env = first_cpu; env != NULL; env = env->next_cpu) {
-        cpu_synchronize_state(env);
-        nr_cpus++;
-    }
-
-    ret = cpu_get_dump_info(&s->dump_info);
+    ret = cpu_get_dump_info(&s->dump_info, &s->guest_phys_blocks);
    if (ret < 0) {
        error_set(errp, QERR_UNSUPPORTED);
        goto cleanup;
@@ -748,9 +768,9 @@ static int dump_init(DumpState *s, int fd, bool paging, bool has_filter,
    /* get memory mapping */
    memory_mapping_list_init(&s->list);
    if (paging) {
-        qemu_get_guest_memory_mapping(&s->list);
+        qemu_get_guest_memory_mapping(&s->list, &s->guest_phys_blocks);
    } else {
-        qemu_get_guest_simple_memory_mapping(&s->list);
+        qemu_get_guest_simple_memory_mapping(&s->list, &s->guest_phys_blocks);
    }

    if (s->has_filter) {
@@ -802,6 +822,8 @@ static int dump_init(DumpState *s, int fd, bool paging, bool has_filter,
    return 0;

 cleanup:
+    guest_phys_blocks_free(&s->guest_phys_blocks);
+
    if (s->resume) {
        vm_start();
    }
@@ -849,7 +871,7 @@ void qmp_dump_guest_memory(bool paging, const char *file, bool has_begin,
        return;
    }

-    s = g_malloc(sizeof(DumpState));
+    s = g_malloc0(sizeof(DumpState));

    ret = dump_init(s, fd, paging, has_begin, begin, length, errp);
    if (ret < 0) {
--- a/exec.c
+++ b/exec.c
@@ -75,6 +75,7 @@ CPUArchState *first_cpu;
 /* current CPU in the current thread. It is only valid inside
   cpu_exec() */
 DEFINE_TLS(CPUArchState *,cpu_single_env);
+DEFINE_TLS(CPUState *, current_cpu);
 /* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
@@ -493,7 +494,7 @@ void cpu_reset_interrupt(CPUArchState *env, int mask)
 void cpu_exit(CPUArchState *env)
 {
    env->exit_request = 1;
-    cpu_unlink_tb(env);
+    env->tcg_exit_req = 1;
 }

 void cpu_abort(CPUArchState *env, const char *fmt, ...)
@@ -1080,6 +1081,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,

    qemu_ram_setup_dump(new_block->host, size);
    qemu_madvise(new_block->host, size, QEMU_MADV_HUGEPAGE);
+    qemu_madvise(new_block->host, size, QEMU_MADV_DONTFORK);

    if (kvm_enabled())
        kvm_setup_guest_memory(new_block->host, size);
@@ -1164,7 +1166,7 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
        offset = addr - block->offset;
        if (offset < block->length) {
-            vaddr = block->host + offset;
+            vaddr = ramblock_ptr(block, offset);
            if (block->flags & RAM_PREALLOC_MASK) {
                ;
            } else {
@@ -1255,7 +1257,7 @@ found:
                xen_map_cache(block->offset, block->length, 1);
        }
    }
-    return block->host + (addr - block->offset);
+    return ramblock_ptr(block, addr - block->offset);
 }

 /* Return a host pointer to ram allocated with qemu_ram_alloc.  Same as
@@ -1282,7 +1284,7 @@ static void *qemu_safe_ram_ptr(ram_addr_t addr)
                        xen_map_cache(block->offset, block->length, 1);
                }
            }
-            return block->host + (addr - block->offset);
+            return ramblock_ptr(block, addr - block->offset);
        }
    }

@@ -1308,7 +1310,7 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, ram_addr_t *size)
            if (addr - block->offset < block->length) {
                if (addr - block->offset + *size > block->length)
                    *size = block->length - addr + block->offset;
-                return block->host + (addr - block->offset);
+                return ramblock_ptr(block, addr - block->offset);
            }
        }

@@ -1868,7 +1870,7 @@ void address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
                      int len, bool is_write)
 {
    AddressSpaceDispatch *d = as->dispatch;
-    int l;
+    ram_addr_t l;
    uint8_t *ptr;
    uint32_t val;
    hwaddr page;
@@ -1908,7 +1910,7 @@ void address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
                addr1 = memory_region_get_ram_addr(section->mr)
                    + memory_region_section_addr(section, addr);
                /* RAM case */
-                ptr = qemu_get_ram_ptr(addr1);
+                ptr = qemu_ram_ptr_length(addr1, &l);
                memcpy(ptr, buf, l);
                invalidate_and_set_dirty(addr1, l);
                qemu_put_ram_ptr(ptr);
@@ -1937,9 +1939,10 @@ void address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
                }
            } else {
                /* RAM case */
-                ptr = qemu_get_ram_ptr(section->mr->ram_addr
-                                       + memory_region_section_addr(section,
-                                                                    addr));
+                ptr = qemu_ram_ptr_length(section->mr->ram_addr +
+                                          memory_region_section_addr(section,
+                                                                     addr),
+                                          &l);
                memcpy(buf, ptr, l);
                qemu_put_ram_ptr(ptr);
            }
@@ -2584,14 +2587,12 @@ int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
 }
 #endif

-#if !defined(CONFIG_USER_ONLY)
-
 /*
 * A helper function for the _utterly broken_ virtio device model to find out if
 * it's running on a big endian machine. Don't do this at home kids!
 */
-bool virtio_is_big_endian(void);
-bool virtio_is_big_endian(void)
+bool target_words_bigendian(void);
+bool target_words_bigendian(void)
 {
 #if defined(TARGET_WORDS_BIGENDIAN)
    return true;
@@ -2600,8 +2601,6 @@ bool virtio_is_big_endian(void)
 #endif
 }

-#endif
-
 #ifndef CONFIG_USER_ONLY
 bool cpu_physical_memory_is_io(hwaddr phys_addr)
 {
--- a/fsdev/virtio-9p-marshal.c
+++ b/fsdev/virtio-9p-marshal.c
@@ -1,5 +1,5 @@
 /*
- * Virtio 9p backend
+ * 9p backend
 *
 * Copyright IBM, Corp. 2010
 *
@@ -23,40 +23,9 @@
 #include <errno.h>

 #include "qemu/compiler.h"
-#include "virtio-9p-marshal.h"
+#include "9p-iov-marshal.h"
 #include "qemu/bswap.h"

-void v9fs_string_free(V9fsString *str)
-{
-    g_free(str->data);
-    str->data = NULL;
-    str->size = 0;
-}
-
-void v9fs_string_null(V9fsString *str)
-{
-    v9fs_string_free(str);
-}
-
-void GCC_FMT_ATTR(2, 3)
-v9fs_string_sprintf(V9fsString *str, const char *fmt, ...)
-{
-    va_list ap;
-
-    v9fs_string_free(str);
-
-    va_start(ap, fmt);
-    str->size = g_vasprintf(&str->data, fmt, ap);
-    va_end(ap);
-}
-
-void v9fs_string_copy(V9fsString *lhs, V9fsString *rhs)
-{
-    v9fs_string_free(lhs);
-    v9fs_string_sprintf(lhs, "%s", rhs->data);
-}
-
-
 static ssize_t v9fs_packunpack(void *addr, struct iovec *sg, int sg_count,
                               size_t offset, size_t size, int pack)
 {
@@ -108,15 +77,13 @@ ssize_t v9fs_pack(struct iovec *in_sg, int in_num, size_t offset,
    return v9fs_packunpack((void *)src, in_sg, in_num, offset, size, 1);
 }

-ssize_t v9fs_unmarshal(struct iovec *out_sg, int out_num, size_t offset,
-                       int bswap, const char *fmt, ...)
+ssize_t v9fs_iov_vunmarshal(struct iovec *out_sg, int out_num, size_t offset,
+                            int bswap, const char *fmt, va_list ap)
 {
    int i;
-    va_list ap;
    ssize_t copied = 0;
    size_t old_offset = offset;

-    va_start(ap, fmt);
    for (i = 0; fmt[i]; i++) {
        switch (fmt[i]) {
        case 'b': {
@@ -159,14 +126,14 @@ ssize_t v9fs_unmarshal(struct iovec *out_sg, int out_num, size_t offset,
        }
        case 's': {
            V9fsString *str = va_arg(ap, V9fsString *);
-            copied = v9fs_unmarshal(out_sg, out_num, offset, bswap,
-                                    "w", &str->size);
+            copied = v9fs_iov_unmarshal(out_sg, out_num, offset, bswap,
+                                        "w", &str->size);
            if (copied > 0) {
                offset += copied;
                str->data = g_malloc(str->size + 1);
                copied = v9fs_unpack(str->data, out_sg, out_num, offset,
                                     str->size);
-                if (copied > 0) {
+                if (copied >= 0) {
                    str->data[str->size] = 0;
                } else {
                    v9fs_string_free(str);
@@ -176,56 +143,70 @@ ssize_t v9fs_unmarshal(struct iovec *out_sg, int out_num, size_t offset,
        }
        case 'Q': {
            V9fsQID *qidp = va_arg(ap, V9fsQID *);
-            copied = v9fs_unmarshal(out_sg, out_num, offset, bswap, "bdq",
-                                    &qidp->type, &qidp->version, &qidp->path);
+            copied = v9fs_iov_unmarshal(out_sg, out_num, offset, bswap,
+                                        "bdq", &qidp->type, &qidp->version,
+                                        &qidp->path);
            break;
        }
        case 'S': {
            V9fsStat *statp = va_arg(ap, V9fsStat *);
-            copied = v9fs_unmarshal(out_sg, out_num, offset, bswap,
-                                    "wwdQdddqsssssddd",
-                                    &statp->size, &statp->type, &statp->dev,
-                                    &statp->qid, &statp->mode, &statp->atime,
-                                    &statp->mtime, &statp->length,
-                                    &statp->name, &statp->uid, &statp->gid,
-                                    &statp->muid, &statp->extension,
-                                    &statp->n_uid, &statp->n_gid,
-                                    &statp->n_muid);
+            copied = v9fs_iov_unmarshal(out_sg, out_num, offset, bswap,
+                                        "wwdQdddqsssssddd",
+                                        &statp->size, &statp->type,
+                                        &statp->dev, &statp->qid,
+                                        &statp->mode, &statp->atime,
+                                        &statp->mtime, &statp->length,
+                                        &statp->name, &statp->uid,
+                                        &statp->gid, &statp->muid,
+                                        &statp->extension,
+                                        &statp->n_uid, &statp->n_gid,
+                                        &statp->n_muid);
            break;
        }
        case 'I': {
            V9fsIattr *iattr = va_arg(ap, V9fsIattr *);
-            copied = v9fs_unmarshal(out_sg, out_num, offset, bswap,
-                                    "ddddqqqqq",
-                                    &iattr->valid, &iattr->mode,
-                                    &iattr->uid, &iattr->gid, &iattr->size,
-                                    &iattr->atime_sec, &iattr->atime_nsec,
-                                    &iattr->mtime_sec, &iattr->mtime_nsec);
+            copied = v9fs_iov_unmarshal(out_sg, out_num, offset, bswap,
+                                        "ddddqqqqq",
+                                        &iattr->valid, &iattr->mode,
+                                        &iattr->uid, &iattr->gid,
+                                        &iattr->size, &iattr->atime_sec,
+                                        &iattr->atime_nsec,
+                                        &iattr->mtime_sec,
+                                        &iattr->mtime_nsec);
            break;
        }
        default:
            break;
        }
        if (copied < 0) {
-            va_end(ap);
            return copied;
        }
        offset += copied;
    }
-    va_end(ap);

    return offset - old_offset;
 }

-ssize_t v9fs_marshal(struct iovec *in_sg, int in_num, size_t offset,
-                     int bswap, const char *fmt, ...)
+/* Variadic wrapper: collect the arguments and run v9fs_iov_vunmarshal(). */
+ssize_t v9fs_iov_unmarshal(struct iovec *out_sg, int out_num, size_t offset,
+                           int bswap, const char *fmt, ...)
+{
+    va_list args;
+    ssize_t result;
+
+    va_start(args, fmt);
+    result = v9fs_iov_vunmarshal(out_sg, out_num, offset, bswap, fmt, args);
+    va_end(args);
+    return result;
+}
+
+ssize_t v9fs_iov_vmarshal(struct iovec *in_sg, int in_num, size_t offset,
+                          int bswap, const char *fmt, va_list ap)
 {
    int i;
-    va_list ap;
    ssize_t copied = 0;
    size_t old_offset = offset;

-    va_start(ap, fmt);
    for (i = 0; fmt[i]; i++) {
        switch (fmt[i]) {
        case 'b': {
@@ -265,8 +246,8 @@ ssize_t v9fs_marshal(struct iovec *in_sg, int in_num, size_t offset,
        }
        case 's': {
            V9fsString *str = va_arg(ap, V9fsString *);
-            copied = v9fs_marshal(in_sg, in_num, offset, bswap,
-                                  "w", str->size);
+            copied = v9fs_iov_marshal(in_sg, in_num, offset, bswap,
+                                      "w", str->size);
            if (copied > 0) {
                offset += copied;
                copied = v9fs_pack(in_sg, in_num, offset, str->data, str->size);
@@ -275,49 +256,65 @@ ssize_t v9fs_marshal(struct iovec *in_sg, int in_num, size_t offset,
        }
        case 'Q': {
            V9fsQID *qidp = va_arg(ap, V9fsQID *);
-            copied = v9fs_marshal(in_sg, in_num, offset, bswap, "bdq",
-                                  qidp->type, qidp->version, qidp->path);
+            copied = v9fs_iov_marshal(in_sg, in_num, offset, bswap, "bdq",
+                                      qidp->type, qidp->version,
+                                      qidp->path);
            break;
        }
        case 'S': {
            V9fsStat *statp = va_arg(ap, V9fsStat *);
-            copied = v9fs_marshal(in_sg, in_num, offset, bswap,
-                                  "wwdQdddqsssssddd",
-                                  statp->size, statp->type, statp->dev,
-                                  &statp->qid, statp->mode, statp->atime,
-                                  statp->mtime, statp->length, &statp->name,
-                                  &statp->uid, &statp->gid, &statp->muid,
-                                  &statp->extension, statp->n_uid,
-                                  statp->n_gid, statp->n_muid);
+            copied = v9fs_iov_marshal(in_sg, in_num, offset, bswap,
+                                      "wwdQdddqsssssddd",
+                                      statp->size, statp->type, statp->dev,
+                                      &statp->qid, statp->mode, statp->atime,
+                                      statp->mtime, statp->length,
+                                      &statp->name,
+                                      &statp->uid, &statp->gid, &statp->muid,
+                                      &statp->extension, statp->n_uid,
+                                      statp->n_gid, statp->n_muid);
            break;
        }
        case 'A': {
            V9fsStatDotl *statp = va_arg(ap, V9fsStatDotl *);
-            copied = v9fs_marshal(in_sg, in_num, offset, bswap,
-                                   "qQdddqqqqqqqqqqqqqqq",
-                                   statp->st_result_mask,
-                                   &statp->qid, statp->st_mode,
-                                   statp->st_uid, statp->st_gid,
-                                   statp->st_nlink, statp->st_rdev,
-                                   statp->st_size, statp->st_blksize,
-                                   statp->st_blocks, statp->st_atime_sec,
-                                   statp->st_atime_nsec, statp->st_mtime_sec,
-                                   statp->st_mtime_nsec, statp->st_ctime_sec,
-                                   statp->st_ctime_nsec, statp->st_btime_sec,
-                                   statp->st_btime_nsec, statp->st_gen,
-                                   statp->st_data_version);
+            copied = v9fs_iov_marshal(in_sg, in_num, offset, bswap,
+                                      "qQdddqqqqqqqqqqqqqqq",
+                                      statp->st_result_mask,
+                                      &statp->qid, statp->st_mode,
+                                      statp->st_uid, statp->st_gid,
+                                      statp->st_nlink, statp->st_rdev,
+                                      statp->st_size, statp->st_blksize,
+                                      statp->st_blocks, statp->st_atime_sec,
+                                      statp->st_atime_nsec,
+                                      statp->st_mtime_sec,
+                                      statp->st_mtime_nsec,
+                                      statp->st_ctime_sec,
+                                      statp->st_ctime_nsec,
+                                      statp->st_btime_sec,
+                                      statp->st_btime_nsec, statp->st_gen,
+                                      statp->st_data_version);
            break;
        }
        default:
            break;
        }
        if (copied < 0) {
-            va_end(ap);
            return copied;
        }
        offset += copied;
    }
-    va_end(ap);

    return offset - old_offset;
 }
+
+/* Variadic wrapper: collect the arguments and run v9fs_iov_vmarshal(). */
+ssize_t v9fs_iov_marshal(struct iovec *in_sg, int in_num, size_t offset,
+                         int bswap, const char *fmt, ...)
+{
+    va_list args;
+    ssize_t result;
+
+    va_start(args, fmt);
+    result = v9fs_iov_vmarshal(in_sg, in_num, offset, bswap, fmt, args);
+    va_end(args);
+    return result;
+}
--- a/fsdev/9p-iov-marshal.h
+++ b/fsdev/9p-iov-marshal.h
@@ -0,0 +1,18 @@
+#ifndef _QEMU_9P_IOV_MARSHAL_H
+#define _QEMU_9P_IOV_MARSHAL_H
+
+#include "9p-marshal.h"
+
+
+ssize_t v9fs_pack(struct iovec *in_sg, int in_num, size_t offset,
+                  const void *src, size_t size);
+ssize_t v9fs_iov_unmarshal(struct iovec *out_sg, int out_num, size_t offset,
+                           int bswap, const char *fmt, ...);
+ssize_t v9fs_iov_marshal(struct iovec *in_sg, int in_num, size_t offset,
+                         int bswap, const char *fmt, ...);
+
+ssize_t v9fs_iov_vunmarshal(struct iovec *out_sg, int out_num, size_t offset,
+                            int bswap, const char *fmt, va_list ap);
+ssize_t v9fs_iov_vmarshal(struct iovec *in_sg, int in_num, size_t offset,
+                          int bswap, const char *fmt, va_list ap);
+#endif
--- a/fsdev/9p-marshal.c
+++ b/fsdev/9p-marshal.c
@@ -0,0 +1,56 @@
+/*
+ * 9p backend
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <glib.h>
+#include <glib/gprintf.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/time.h>
+#include <utime.h>
+#include <sys/uio.h>
+#include <string.h>
+#include <stdint.h>
+#include <errno.h>
+
+#include "qemu/compiler.h"
+#include "9p-marshal.h"
+/* Release the buffer owned by @str and reset it to NULL data, size 0. */
+void v9fs_string_free(V9fsString *str)
+{
+    g_free(str->data);
+    str->data = NULL;
+    str->size = 0;
+}
+/* Same effect as v9fs_string_free(): @str ends up with NULL data, size 0. */
+void v9fs_string_null(V9fsString *str)
+{
+    v9fs_string_free(str);
+}
+/* Replace the content of @str with the printf-style expansion of @fmt. */
+void GCC_FMT_ATTR(2, 3)
+v9fs_string_sprintf(V9fsString *str, const char *fmt, ...)
+{
+    va_list ap;
+
+    v9fs_string_free(str);
+    /* g_vasprintf() allocates the buffer; its return value becomes size. */
+    va_start(ap, fmt);
+    str->size = g_vasprintf(&str->data, fmt, ap);
+    va_end(ap);
+}
+/* Copy @rhs into @lhs. @lhs is freed first, so the two must not alias. */
+void v9fs_string_copy(V9fsString *lhs, V9fsString *rhs)
+{
+    v9fs_string_free(lhs);
+    v9fs_string_sprintf(lhs, "%s", rhs->data);
+}
--- a/fsdev/virtio-9p-marshal.h
+++ b/fsdev/virtio-9p-marshal.h
@@ -1,5 +1,5 @@
-#ifndef _QEMU_VIRTIO_9P_MARSHAL_H
-#define _QEMU_VIRTIO_9P_MARSHAL_H
+#ifndef _QEMU_9P_MARSHAL_H
+#define _QEMU_9P_MARSHAL_H

 typedef struct V9fsString
 {
@@ -30,7 +30,7 @@ typedef struct V9fsStat
    V9fsString muid;
    /* 9p2000.u */
    V9fsString extension;
-   int32_t n_uid;
+    int32_t n_uid;
    int32_t n_gid;
    int32_t n_muid;
 } V9fsStat;
@@ -81,10 +81,4 @@ extern void v9fs_string_null(V9fsString *str);
 extern void v9fs_string_sprintf(V9fsString *str, const char *fmt, ...);
 extern void v9fs_string_copy(V9fsString *lhs, V9fsString *rhs);

-ssize_t v9fs_pack(struct iovec *in_sg, int in_num, size_t offset,
-                  const void *src, size_t size);
-ssize_t v9fs_unmarshal(struct iovec *out_sg, int out_num, size_t offset,
-                       int bswap, const char *fmt, ...);
-ssize_t v9fs_marshal(struct iovec *in_sg, int in_num, size_t offset,
-                     int bswap, const char *fmt, ...);
 #endif
--- a/fsdev/Makefile.objs
+++ b/fsdev/Makefile.objs
@@ -1,5 +1,5 @@
 ifeq ($(CONFIG_REALLY_VIRTFS),y)
-common-obj-y = qemu-fsdev.o virtio-9p-marshal.o
+common-obj-y = qemu-fsdev.o 9p-marshal.o 9p-iov-marshal.o
 else
 common-obj-y = qemu-fsdev-dummy.o
 endif
--- a/fsdev/file-op-9p.h
+++ b/fsdev/file-op-9p.h
@@ -102,6 +102,7 @@ struct FileOperations
 {
    int (*parse_opts)(QemuOpts *, struct FsDriverEntry *);
    int (*init)(struct FsContext *);
+    void (*cleanup)(struct FsContext *);
    int (*lstat)(FsContext *, V9fsPath *, struct stat *);
    ssize_t (*readlink)(FsContext *, V9fsPath *, char *, size_t);
    int (*chmod)(FsContext *, V9fsPath *, FsCred *);
--- a/fsdev/virtfs-proxy-helper.c
+++ b/fsdev/virtfs-proxy-helper.c
@@ -9,6 +9,10 @@
 * the COPYING file in the top-level directory.
 */

+/* work around a broken sys/capability.h */
+#if defined(__i386__)
+typedef unsigned long long __u64;
+#endif
 #include <sys/resource.h>
 #include <getopt.h>
 #include <syslog.h>
@@ -23,9 +27,9 @@
 #include "qemu-common.h"
 #include "qemu/sockets.h"
 #include "qemu/xattr.h"
-#include "virtio-9p-marshal.h"
-#include "hw/9pfs/virtio-9p-proxy.h"
-#include "fsdev/virtio-9p-marshal.h"
+#include "9p-iov-marshal.h"
+#include "hw/9pfs/9p-proxy.h"
+#include "fsdev/9p-iov-marshal.h"

 #define PROGNAME "virtfs-proxy-helper"

--- a/gdbstub.c
+++ b/gdbstub.c
@@ -2926,14 +2926,14 @@ void gdbserver_fork(CPUArchState *env)
    cpu_watchpoint_remove_all(env, BP_GDB);
 }
 #else
-static int gdb_chr_can_receive(void *opaque)
+static size_t gdb_chr_can_receive(void *opaque)
 {
  /* We can handle an arbitrarily large amount of data.
   Pick the maximum packet size, which is as good as anything.  */
  return MAX_PACKET_LENGTH;
 }

-static void gdb_chr_receive(void *opaque, const uint8_t *buf, int size)
+static void gdb_chr_receive(void *opaque, const uint8_t *buf, size_t size)
 {
    int i;

@@ -2965,7 +2965,7 @@ static void gdb_monitor_output(GDBState *s, const char *msg, int len)
    put_packet(s, buf);
 }

-static int gdb_monitor_write(CharDriverState *chr, const uint8_t *buf, int len)
+static size_t gdb_monitor_write(CharDriverState *chr, const uint8_t *buf, size_t len)
 {
    const char *p = (const char *)buf;
    int max_sz;
--- a/hmp.c
+++ b/hmp.c
@@ -173,6 +173,8 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
                       info->ram->total >> 10);
        monitor_printf(mon, "duplicate: %" PRIu64 " pages\n",
                       info->ram->duplicate);
+        monitor_printf(mon, "skipped: %" PRIu64 " pages\n",
+                       info->ram->skipped);
        monitor_printf(mon, "normal: %" PRIu64 " pages\n",
                       info->ram->normal);
        monitor_printf(mon, "normal bytes: %" PRIu64 " kbytes\n",
@@ -1211,21 +1213,18 @@ void hmp_send_key(Monitor *mon, const QDict *qdict)
    int has_hold_time = qdict_haskey(qdict, "hold-time");
    int hold_time = qdict_get_try_int(qdict, "hold-time", -1);
    Error *err = NULL;
-    char keyname_buf[16];
    char *separator;
    int keyname_len;

    while (1) {
        separator = strchr(keys, '-');
        keyname_len = separator ? separator - keys : strlen(keys);
-        pstrcpy(keyname_buf, sizeof(keyname_buf), keys);

        /* Be compatible with old interface, convert user inputted "<" */
-        if (!strncmp(keyname_buf, "<", 1) && keyname_len == 1) {
-            pstrcpy(keyname_buf, sizeof(keyname_buf), "less");
+        if (keys[0] == '<' && keyname_len == 1) {
+            keys = "less";
            keyname_len = 4;
        }
-        keyname_buf[keyname_len] = 0;

        keylist = g_malloc0(sizeof(*keylist));
        keylist->value = g_malloc0(sizeof(*keylist->value));
@@ -1238,16 +1237,17 @@ void hmp_send_key(Monitor *mon, const QDict *qdict)
        }
        tmp = keylist;

-        if (strstart(keyname_buf, "0x", NULL)) {
+        if (strstart(keys, "0x", NULL)) {
            char *endp;
-            int value = strtoul(keyname_buf, &endp, 0);
-            if (*endp != '\0') {
+            int value = strtoul(keys, &endp, 0);
+            assert(endp <= keys + keyname_len);
+            if (endp != keys + keyname_len) {
                goto err_out;
            }
            keylist->value->kind = KEY_VALUE_KIND_NUMBER;
            keylist->value->number = value;
        } else {
-            int idx = index_from_key(keyname_buf);
+            int idx = index_from_key(keys, keyname_len);
            if (idx == Q_KEY_CODE_MAX) {
                goto err_out;
            }
@@ -1269,7 +1269,7 @@ out:
    return;

 err_out:
-    monitor_printf(mon, "invalid parameter: %s\n", keyname_buf);
+    monitor_printf(mon, "invalid parameter: %.*s\n", keyname_len, keys);
    goto out;
 }

--- a/hw/9p.h
+++ b/hw/9p.h
@@ -1,24 +0,0 @@
-/*
- * Virtio 9p
- *
- * Copyright IBM, Corp. 2010
- *
- * Authors:
- *  Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#ifndef QEMU_9P_H
-#define QEMU_9P_H
-
-typedef struct V9fsConf
-{
-    /* tag name for the device */
-    char *tag;
-    char *fsdev_id;
-} V9fsConf;
-
-#endif
--- a/hw/9pfs/virtio-9p-handle.c
+++ b/hw/9pfs/virtio-9p-handle.c
@@ -1,5 +1,5 @@
 /*
- * Virtio 9p handle callback
+ * 9p handle callback
 *
 * Copyright IBM, Corp. 2011
 *
@@ -11,9 +11,8 @@
 *
 */

-#include "hw/virtio.h"
-#include "virtio-9p.h"
-#include "virtio-9p-xattr.h"
+#include "9p.h"
+#include "9p-xattr.h"
 #include <arpa/inet.h>
 #include <pwd.h>
 #include <grp.h>
@@ -111,7 +110,7 @@ static int handle_close(FsContext *ctx, V9fsFidOpenState *fs)

 static int handle_closedir(FsContext *ctx, V9fsFidOpenState *fs)
 {
-    return closedir(fs->dir);
+    return closedir(fs->dir.stream);
 }

 static int handle_open(FsContext *ctx, V9fsPath *fs_path,
@@ -131,8 +130,8 @@ static int handle_opendir(FsContext *ctx,
    if (ret < 0) {
        return -1;
    }
-    fs->dir = fdopendir(ret);
-    if (!fs->dir) {
+    fs->dir.stream = fdopendir(ret);
+    if (!fs->dir.stream) {
        return -1;
    }
    return 0;
@@ -140,24 +139,24 @@ static int handle_opendir(FsContext *ctx,

 static void handle_rewinddir(FsContext *ctx, V9fsFidOpenState *fs)
 {
-    return rewinddir(fs->dir);
+    rewinddir(fs->dir.stream);
 }

 static off_t handle_telldir(FsContext *ctx, V9fsFidOpenState *fs)
 {
-    return telldir(fs->dir);
+    return telldir(fs->dir.stream);
 }

 static int handle_readdir_r(FsContext *ctx, V9fsFidOpenState *fs,
                            struct dirent *entry,
                            struct dirent **result)
 {
-    return readdir_r(fs->dir, entry, result);
+    return readdir_r(fs->dir.stream, entry, result);
 }

 static void handle_seekdir(FsContext *ctx, V9fsFidOpenState *fs, off_t off)
 {
-    return seekdir(fs->dir, off);
+    seekdir(fs->dir.stream, off);
 }

 static ssize_t handle_preadv(FsContext *ctx, V9fsFidOpenState *fs,
@@ -261,7 +260,7 @@ static int handle_fstat(FsContext *fs_ctx, int fid_type,
    int fd;

    if (fid_type == P9_FID_DIR) {
-        fd = dirfd(fs->dir);
+        fd = dirfd(fs->dir.stream);
    } else {
        fd = fs->fd;
    }
@@ -408,7 +407,7 @@ static int handle_fsync(FsContext *ctx, int fid_type,
    int fd;

    if (fid_type == P9_FID_DIR) {
-        fd = dirfd(fs->dir);
+        fd = dirfd(fs->dir.stream);
    } else {
        fd = fs->fd;
    }
--- a/hw/9pfs/9p-local.c
+++ b/hw/9pfs/9p-local.c
--- a/hw/9pfs/9p-local.h
+++ b/hw/9pfs/9p-local.h
@@ -0,0 +1,20 @@
+/*
+ * 9p local backend utilities
+ *
+ * Copyright IBM, Corp. 2017
+ *
+ * Authors:
+ *  Greg Kurz <groug@kaod.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_9P_LOCAL_H
+#define QEMU_9P_LOCAL_H
+
+int local_open_nofollow(FsContext *fs_ctx, const char *path, int flags,
+                        mode_t mode);
+int local_opendir_nofollow(FsContext *fs_ctx, const char *path);
+
+#endif
--- a/hw/9pfs/virtio-9p-posix-acl.c
+++ b/hw/9pfs/virtio-9p-posix-acl.c
@@ -1,5 +1,5 @@
 /*
- * Virtio 9p system.posix* xattr callback
+ * 9p system.posix* xattr callback
 *
 * Copyright IBM, Corp. 2010
 *
@@ -13,10 +13,9 @@

 #include <sys/types.h>
 #include "qemu/xattr.h"
-#include "hw/virtio.h"
-#include "virtio-9p.h"
+#include "9p.h"
 #include "fsdev/file-op-9p.h"
-#include "virtio-9p-xattr.h"
+#include "9p-xattr.h"

 #define MAP_ACL_ACCESS "user.virtfs.system.posix_acl_access"
 #define MAP_ACL_DEFAULT "user.virtfs.system.posix_acl_default"
@@ -26,8 +25,7 @@
 static ssize_t mp_pacl_getxattr(FsContext *ctx, const char *path,
                                const char *name, void *value, size_t size)
 {
-    char buffer[PATH_MAX];
-    return lgetxattr(rpath(ctx, path, buffer), MAP_ACL_ACCESS, value, size);
+    return local_getxattr_nofollow(ctx, path, MAP_ACL_ACCESS, value, size);
 }

 static ssize_t mp_pacl_listxattr(FsContext *ctx, const char *path,
@@ -52,17 +50,16 @@ static ssize_t mp_pacl_listxattr(FsContext *ctx, const char *path,
 static int mp_pacl_setxattr(FsContext *ctx, const char *path, const char *name,
                            void *value, size_t size, int flags)
 {
-    char buffer[PATH_MAX];
-    return lsetxattr(rpath(ctx, path, buffer), MAP_ACL_ACCESS, value,
-            size, flags);
+    return local_setxattr_nofollow(ctx, path, MAP_ACL_ACCESS, value, size,
+                                   flags);
 }

 static int mp_pacl_removexattr(FsContext *ctx,
                               const char *path, const char *name)
 {
    int ret;
-    char buffer[PATH_MAX];
-    ret  = lremovexattr(rpath(ctx, path, buffer), MAP_ACL_ACCESS);
+
+    ret = local_removexattr_nofollow(ctx, path, MAP_ACL_ACCESS);
    if (ret == -1 && errno == ENODATA) {
        /*
         * We don't get ENODATA error when trying to remove a
@@ -78,8 +75,7 @@ static int mp_pacl_removexattr(FsContext *ctx,
 static ssize_t mp_dacl_getxattr(FsContext *ctx, const char *path,
                                const char *name, void *value, size_t size)
 {
-    char buffer[PATH_MAX];
-    return lgetxattr(rpath(ctx, path, buffer), MAP_ACL_DEFAULT, value, size);
+    return local_getxattr_nofollow(ctx, path, MAP_ACL_DEFAULT, value, size);
 }

 static ssize_t mp_dacl_listxattr(FsContext *ctx, const char *path,
@@ -104,17 +100,16 @@ static ssize_t mp_dacl_listxattr(FsContext *ctx, const char *path,
 static int mp_dacl_setxattr(FsContext *ctx, const char *path, const char *name,
                            void *value, size_t size, int flags)
 {
-    char buffer[PATH_MAX];
-    return lsetxattr(rpath(ctx, path, buffer), MAP_ACL_DEFAULT, value,
-            size, flags);
+    return local_setxattr_nofollow(ctx, path, MAP_ACL_DEFAULT, value, size,
+                                   flags);
 }

 static int mp_dacl_removexattr(FsContext *ctx,
                               const char *path, const char *name)
 {
    int ret;
-    char buffer[PATH_MAX];
-    ret  = lremovexattr(rpath(ctx, path, buffer), MAP_ACL_DEFAULT);
+
+    ret = local_removexattr_nofollow(ctx, path, MAP_ACL_DEFAULT);
    if (ret == -1 && errno == ENODATA) {
        /*
         * We don't get ENODATA error when trying to remove a
--- a/hw/9pfs/virtio-9p-proxy.c
+++ b/hw/9pfs/virtio-9p-proxy.c
@@ -1,5 +1,5 @@
 /*
- * Virtio 9p Proxy callback
+ * 9p Proxy callback
 *
 * Copyright IBM, Corp. 2011
 *
@@ -11,10 +11,9 @@
 */
 #include <sys/socket.h>
 #include <sys/un.h>
-#include "hw/virtio.h"
-#include "virtio-9p.h"
+#include "9p.h"
 #include "fsdev/qemu-fsdev.h"
-#include "virtio-9p-proxy.h"
+#include "9p-proxy.h"

 typedef struct V9fsProxy {
    int sockfd;
@@ -631,7 +630,7 @@ static int proxy_close(FsContext *ctx, V9fsFidOpenState *fs)

 static int proxy_closedir(FsContext *ctx, V9fsFidOpenState *fs)
 {
-    return closedir(fs->dir);
+    return closedir(fs->dir.stream);
 }

 static int proxy_open(FsContext *ctx, V9fsPath *fs_path,
@@ -650,14 +649,14 @@ static int proxy_opendir(FsContext *ctx,
 {
    int serrno, fd;

-    fs->dir = NULL;
+    fs->dir.stream = NULL;
    fd = v9fs_request(ctx->private, T_OPEN, NULL, "sd", fs_path, O_DIRECTORY);
    if (fd < 0) {
        errno = -fd;
        return -1;
    }
-    fs->dir = fdopendir(fd);
-    if (!fs->dir) {
+    fs->dir.stream = fdopendir(fd);
+    if (!fs->dir.stream) {
        serrno = errno;
        close(fd);
        errno = serrno;
@@ -668,24 +667,24 @@ static int proxy_opendir(FsContext *ctx,

 static void proxy_rewinddir(FsContext *ctx, V9fsFidOpenState *fs)
 {
-    return rewinddir(fs->dir);
+    rewinddir(fs->dir.stream);
 }

 static off_t proxy_telldir(FsContext *ctx, V9fsFidOpenState *fs)
 {
-    return telldir(fs->dir);
+    return telldir(fs->dir.stream);
 }

 static int proxy_readdir_r(FsContext *ctx, V9fsFidOpenState *fs,
                           struct dirent *entry,
                           struct dirent **result)
 {
-    return readdir_r(fs->dir, entry, result);
+    return readdir_r(fs->dir.stream, entry, result);
 }

 static void proxy_seekdir(FsContext *ctx, V9fsFidOpenState *fs, off_t off)
 {
-    return seekdir(fs->dir, off);
+    seekdir(fs->dir.stream, off);
 }

 static ssize_t proxy_preadv(FsContext *ctx, V9fsFidOpenState *fs,
@@ -791,7 +790,7 @@ static int proxy_fstat(FsContext *fs_ctx, int fid_type,
    int fd;

    if (fid_type == P9_FID_DIR) {
-        fd = dirfd(fs->dir);
+        fd = dirfd(fs->dir.stream);
    } else {
        fd = fs->fd;
    }
@@ -936,7 +935,7 @@ static int proxy_fsync(FsContext *ctx, int fid_type,
    int fd;

    if (fid_type == P9_FID_DIR) {
-        fd = dirfd(fs->dir);
+        fd = dirfd(fs->dir.stream);
    } else {
        fd = fs->fd;
    }
@@ -1033,13 +1032,10 @@ static int proxy_name_to_path(FsContext *ctx, V9fsPath *dir_path,
                              const char *name, V9fsPath *target)
 {
    if (dir_path) {
-        v9fs_string_sprintf((V9fsString *)target, "%s/%s",
-                            dir_path->data, name);
+        v9fs_path_sprintf(target, "%s/%s", dir_path->data, name);
    } else {
-        v9fs_string_sprintf((V9fsString *)target, "%s", name);
+        v9fs_path_sprintf(target, "%s", name);
    }
-    /* Bump the size for including terminating NULL */
-    target->size++;
    return 0;
 }

--- a/hw/9pfs/virtio-9p-proxy.h
+++ b/hw/9pfs/virtio-9p-proxy.h
@@ -1,5 +1,5 @@
 /*
- * Virtio 9p Proxy callback
+ * 9p Proxy callback
 *
 * Copyright IBM, Corp. 2011
 *
@@ -9,8 +9,8 @@
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */
-#ifndef _QEMU_VIRTIO_9P_PROXY_H
-#define _QEMU_VIRTIO_9P_PROXY_H
+#ifndef _QEMU_9P_PROXY_H
+#define _QEMU_9P_PROXY_H

 #define PROXY_MAX_IO_SZ (64 * 1024)
 #define V9FS_FD_VALID INT_MAX
@@ -20,9 +20,9 @@
 * marsha/unmarshal doesn't do little endian conversion.
 */
 #define proxy_unmarshal(in_sg, offset, fmt, args...) \
-    v9fs_unmarshal(in_sg, 1, offset, 0, fmt, ##args)
+    v9fs_iov_unmarshal(in_sg, 1, offset, 0, fmt, ##args)
 #define proxy_marshal(out_sg, offset, fmt, args...) \
-    v9fs_marshal(out_sg, 1, offset, 0, fmt, ##args)
+    v9fs_iov_marshal(out_sg, 1, offset, 0, fmt, ##args)

 union MsgControl {
    struct cmsghdr cmsg;
--- a/hw/9pfs/9p-util.c
+++ b/hw/9pfs/9p-util.c
@@ -0,0 +1,77 @@
+/*
+ * 9p utilities
+ *
+ * Copyright IBM, Corp. 2017
+ *
+ * Authors:
+ *  Greg Kurz <groug@kaod.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <sys/types.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <assert.h>
+#include "glib.h"
+#include "qemu/osdep.h"
+#include "qemu/xattr.h"
+#include "9p-util.h"
+/* Open @path below @dirfd one component at a time; returns -1 on failure. */
+int relative_openat_nofollow(int dirfd, const char *path, int flags,
+                             mode_t mode)
+{
+    int fd;
+    /* Walk from a duplicate so that the caller's @dirfd is never closed. */
+    fd = dup(dirfd);
+    if (fd == -1) {
+        return -1;
+    }
+
+    while (*path) {
+        const char *c;
+        int next_fd;
+        char *head;
+
+        /* Only relative paths without consecutive slashes */
+        assert(path[0] != '/');
+        /* Split off the next component; all but the last use openat_dir(). */
+        head = g_strdup(path);
+        c = strchr(path, '/');
+        if (c) {
+            head[c - path] = 0;
+            next_fd = openat_dir(fd, head);
+        } else {
+            next_fd = openat_file(fd, head, flags, mode);
+        }
+        g_free(head);
+        if (next_fd == -1) {
+            close_preserve_errno(fd);
+            return -1;
+        }
+        close(fd);
+        fd = next_fd;
+        /* No more '/' in @path: next_fd was the final component. */
+        if (!c) {
+            break;
+        }
+        path = c + 1;
+    }
+
+    return fd;
+}
+/* lgetxattr() for @filename under @dirfd, addressed via /proc/self/fd. */
+ssize_t fgetxattrat_nofollow(int dirfd, const char *filename, const char *name,
+                             void *value, size_t size)
+{
+    char *proc_path = g_strdup_printf("/proc/self/fd/%d/%s", dirfd, filename);
+    ssize_t ret;
+    /* ssize_t, not int: keep the full width of lgetxattr()'s result. */
+    ret = lgetxattr(proc_path, name, value, size);
+    g_free(proc_path);
+    return ret;
+}
--- a/hw/9pfs/9p-util.h
+++ b/hw/9pfs/9p-util.h
@@ -0,0 +1,60 @@
+/*
+ * 9p utilities
+ *
+ * Copyright IBM, Corp. 2017
+ *
+ * Authors:
+ *  Greg Kurz <groug@kaod.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_9P_UTIL_H
+#define QEMU_9P_UTIL_H
+/* close() @fd while leaving the errno seen by the caller untouched. */
+static inline void close_preserve_errno(int fd)
+{
+    int saved_errno = errno;
+    close(fd);
+    errno = saved_errno;
+}
+/* Open directory @name under @dirfd; O_PATH is added where it is defined. */
+static inline int openat_dir(int dirfd, const char *name)
+{
+#ifdef O_PATH
+#define OPENAT_DIR_O_PATH O_PATH
+#else
+#define OPENAT_DIR_O_PATH 0
+#endif
+    return openat(dirfd, name,
+                  O_DIRECTORY | O_RDONLY | O_NOFOLLOW | OPENAT_DIR_O_PATH);
+}
+/* Open @name under @dirfd with O_NOFOLLOW; fd on success, -1 on failure. */
+static inline int openat_file(int dirfd, const char *name, int flags,
+                              mode_t mode)
+{
+    int fd, serrno, ret;
+
+    fd = openat(dirfd, name, flags | O_NOFOLLOW | O_NOCTTY | O_NONBLOCK,
+                mode);
+    if (fd == -1) {
+        return -1;
+    }
+    /* NOTE(review): F_SETFL may fail on O_PATH fds; confirm none get here. */
+    serrno = errno;
+    /* O_NONBLOCK was only needed to open the file. Let's drop it. */
+    ret = fcntl(fd, F_SETFL, flags);
+    assert(!ret);
+    errno = serrno;
+    return fd;
+}
+
+int relative_openat_nofollow(int dirfd, const char *path, int flags,
+                             mode_t mode);
+ssize_t fgetxattrat_nofollow(int dirfd, const char *path, const char *name,
+                             void *value, size_t size);
+int fsetxattrat_nofollow(int dirfd, const char *path, const char *name,
+                         void *value, size_t size, int flags);
+
+#endif
--- a/hw/9pfs/virtio-9p-xattr-user.c
+++ b/hw/9pfs/virtio-9p-xattr-user.c
@@ -1,5 +1,5 @@
 /*
- * Virtio 9p user. xattr callback
+ * 9p user. xattr callback
 *
 * Copyright IBM, Corp. 2010
 *
@@ -12,16 +12,14 @@
 */

 #include <sys/types.h>
-#include "hw/virtio.h"
-#include "virtio-9p.h"
+#include "9p.h"
 #include "fsdev/file-op-9p.h"
-#include "virtio-9p-xattr.h"
+#include "9p-xattr.h"


 static ssize_t mp_user_getxattr(FsContext *ctx, const char *path,
                                const char *name, void *value, size_t size)
 {
-    char buffer[PATH_MAX];
    if (strncmp(name, "user.virtfs.", 12) == 0) {
        /*
         * Don't allow fetch of user.virtfs namesapce
@@ -30,7 +28,7 @@ static ssize_t mp_user_getxattr(FsContext *ctx, const char *path,
        errno = ENOATTR;
        return -1;
    }
-    return lgetxattr(rpath(ctx, path, buffer), name, value, size);
+    return local_getxattr_nofollow(ctx, path, name, value, size);
 }

 static ssize_t mp_user_listxattr(FsContext *ctx, const char *path,
@@ -69,7 +67,6 @@ static ssize_t mp_user_listxattr(FsContext *ctx, const char *path,
 static int mp_user_setxattr(FsContext *ctx, const char *path, const char *name,
                            void *value, size_t size, int flags)
 {
-    char buffer[PATH_MAX];
    if (strncmp(name, "user.virtfs.", 12) == 0) {
        /*
         * Don't allow fetch of user.virtfs namesapce
@@ -78,13 +75,12 @@ static int mp_user_setxattr(FsContext *ctx, const char *path, const char *name,
        errno = EACCES;
        return -1;
    }
-    return lsetxattr(rpath(ctx, path, buffer), name, value, size, flags);
+    return local_setxattr_nofollow(ctx, path, name, value, size, flags);
 }

 static int mp_user_removexattr(FsContext *ctx,
                               const char *path, const char *name)
 {
-    char buffer[PATH_MAX];
    if (strncmp(name, "user.virtfs.", 12) == 0) {
        /*
         * Don't allow fetch of user.virtfs namesapce
@@ -93,7 +89,7 @@ static int mp_user_removexattr(FsContext *ctx,
        errno = EACCES;
        return -1;
    }
-    return lremovexattr(rpath(ctx, path, buffer), name);
+    return local_removexattr_nofollow(ctx, path, name);
 }

 XattrOperations mapped_user_xattr = {
--- a/hw/9pfs/9p-xattr.c
+++ b/hw/9pfs/9p-xattr.c
@@ -0,0 +1,318 @@
+/*
+ * 9p  xattr callback
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "9p.h"
+#include "fsdev/file-op-9p.h"
+#include "9p-xattr.h"
+#include "9p-util.h"
+#include "9p-local.h"
+
+
+static XattrOperations *get_xattr_operations(XattrOperations **h,
+                                             const char *name)
+{
+    /* NULL-terminated table scan: first entry whose name prefixes @name. */
+    for (; *h != NULL; h++) {
+        if (strncmp(name, (*h)->name, strlen((*h)->name)) == 0) {
+            return *h;
+        }
+    }
+    return NULL;
+}
+
+ssize_t v9fs_get_xattr(FsContext *ctx, const char *path,
+                       const char *name, void *value, size_t size)
+{
+    XattrOperations *xops = get_xattr_operations(ctx->xops, name);
+    if (xops) { /* a handler is registered for this attribute name prefix */
+        return xops->getxattr(ctx, path, name, value, size);
+    }
+    errno = EOPNOTSUPP; /* errno values are positive; was -EOPNOTSUPP */
+    return -1;
+}
+
+ssize_t pt_listxattr(FsContext *ctx, const char *path,
+                     char *name, void *value, size_t size)
+{
+    /* Bytes needed to report this attribute: its name plus the NUL. */
+    int needed = strlen(name) + 1;
+
+    /* With no output buffer only the required size is returned. */
+    if (value != NULL) {
+        if (size < needed) {
+            errno = ERANGE;
+            return -1;
+        }
+        /* Exact-length copy, terminator included. */
+        memcpy(value, name, needed);
+    }
+    return needed;
+}
+
+/* llistxattr() on "/proc/self/fd/<dirfd>/<filename>"; returns its result. */
+static ssize_t flistxattrat_nofollow(int dirfd, const char *filename,
+                                     char *list, size_t size)
+{
+    char *proc_path = g_strdup_printf("/proc/self/fd/%d/%s", dirfd, filename);
+    ssize_t ret = llistxattr(proc_path, list, size); /* keep ssize_t width */
+
+    g_free(proc_path);
+    return ret;
+}
+
+/*
+ * List the xattr names of @path, filtered through each name's handler.
+ * @value NULL: returns the summed handler sizes, else the bytes stored.
+ */
+ssize_t v9fs_list_xattr(FsContext *ctx, const char *path,
+                        void *value, size_t vsize)
+{
+    ssize_t size = 0;
+    void *ovalue = value;
+    XattrOperations *xops;
+    char *orig_value, *orig_value_start;
+    ssize_t xattr_len, parsed_len = 0, attr_len;
+    char *dirpath, *name;
+    int dirfd;
+
+    /* Probe with a zero size to learn how long the raw name list is */
+    dirpath = g_path_get_dirname(path);
+    dirfd = local_opendir_nofollow(ctx, dirpath);
+    g_free(dirpath);
+    if (dirfd == -1) {
+        return -1;
+    }
+
+    name = g_path_get_basename(path);
+    xattr_len = flistxattrat_nofollow(dirfd, name, value, 0);
+    if (xattr_len <= 0) {
+        g_free(name);
+        close_preserve_errno(dirfd);
+        return xattr_len;
+    }
+
+    /* Fetch the raw name list into a temporary buffer of that length */
+    orig_value = g_malloc(xattr_len);
+    xattr_len = flistxattrat_nofollow(dirfd, name, orig_value, xattr_len);
+    g_free(name);
+    close_preserve_errno(dirfd);
+    if (xattr_len < 0) {
+        g_free(orig_value);
+        return -1;
+    }
+
+    /* orig_value is advanced below; keep the start for g_free() */
+    orig_value_start = orig_value;
+    while (xattr_len > parsed_len) {
+        xops = get_xattr_operations(ctx->xops, orig_value);
+        if (!xops) {
+            goto next_entry;
+        }
+
+        if (!value) {
+            size += xops->listxattr(ctx, path, orig_value, value, vsize);
+        } else {
+            size = xops->listxattr(ctx, path, orig_value, value, vsize);
+            if (size < 0) {
+                goto err_out;
+            }
+            value += size;
+            vsize -= size;
+        }
+next_entry:
+        /* Step over this name and its NUL terminator */
+        attr_len = strlen(orig_value) + 1;
+        parsed_len += attr_len;
+        orig_value += attr_len;
+    }
+    if (value) {
+        size = value - ovalue;
+    }
+
+err_out:
+    g_free(orig_value_start);
+    return size;
+}
+
+/* Dispatch setxattr to the handler matching @name; EOPNOTSUPP if none. */
+int v9fs_set_xattr(FsContext *ctx, const char *path, const char *name,
+                   void *value, size_t size, int flags)
+{
+    XattrOperations *xops = get_xattr_operations(ctx->xops, name);
+    if (xops) {
+        return xops->setxattr(ctx, path, name, value, size, flags);
+    }
+    errno = EOPNOTSUPP; /* errno values are positive; was -EOPNOTSUPP */
+    return -1;
+}
+
+/* Dispatch removexattr to the handler matching @name; EOPNOTSUPP if none. */
+int v9fs_remove_xattr(FsContext *ctx,
+                      const char *path, const char *name)
+{
+    XattrOperations *xops = get_xattr_operations(ctx->xops, name);
+    if (xops) {
+        return xops->removexattr(ctx, path, name);
+    }
+    errno = EOPNOTSUPP; /* errno values are positive; was -EOPNOTSUPP */
+    return -1;
+}
+
+/* getxattr on @path: open its parent directory, then query the basename. */
+ssize_t local_getxattr_nofollow(FsContext *ctx, const char *path,
+                                const char *name, void *value, size_t size)
+{
+    char *dirpath = g_path_get_dirname(path);
+    char *filename = g_path_get_basename(path);
+    ssize_t ret = -1;
+    int dirfd = local_opendir_nofollow(ctx, dirpath);
+
+    /* A failed directory open leaves ret at -1. */
+    if (dirfd != -1) {
+        ret = fgetxattrat_nofollow(dirfd, filename, name, value, size);
+        close_preserve_errno(dirfd);
+    }
+
+    g_free(dirpath);
+    g_free(filename);
+    return ret;
+}
+
+/* Thin wrapper: forwards every argument to local_getxattr_nofollow(). */
+ssize_t pt_getxattr(FsContext *ctx, const char *path, const char *name,
+                    void *value, size_t size)
+{
+    return local_getxattr_nofollow(ctx, path, name, value, size);
+}
+
+/* lsetxattr() on "/proc/self/fd/<dirfd>/<filename>"; returns its result. */
+int fsetxattrat_nofollow(int dirfd, const char *filename, const char *name,
+                         void *value, size_t size, int flags)
+{
+    char *proc_path = g_strdup_printf("/proc/self/fd/%d/%s", dirfd, filename);
+    int ret = lsetxattr(proc_path, name, value, size, flags);
+
+    g_free(proc_path);
+    return ret;
+}
+
+/* setxattr on @path: open its parent directory, then set on the basename. */
+ssize_t local_setxattr_nofollow(FsContext *ctx, const char *path,
+                                const char *name, void *value, size_t size,
+                                int flags)
+{
+    char *dirpath = g_path_get_dirname(path);
+    char *filename = g_path_get_basename(path);
+    ssize_t ret = -1;
+    int dirfd = local_opendir_nofollow(ctx, dirpath);
+
+    /* A failed directory open leaves ret at -1. */
+    if (dirfd != -1) {
+        ret = fsetxattrat_nofollow(dirfd, filename, name, value, size, flags);
+        close_preserve_errno(dirfd);
+    }
+
+    g_free(dirpath);
+    g_free(filename);
+    return ret;
+}
+
+/* Thin wrapper: forwards every argument to local_setxattr_nofollow(). */
+int pt_setxattr(FsContext *ctx, const char *path, const char *name, void *value,
+                size_t size, int flags)
+{
+    return local_setxattr_nofollow(ctx, path, name, value, size, flags);
+}
+
+/* lremovexattr() on "/proc/self/fd/<dirfd>/<filename>"; returns its result. */
+static ssize_t fremovexattrat_nofollow(int dirfd, const char *filename,
+                                       const char *name)
+{
+    char *proc_path = g_strdup_printf("/proc/self/fd/%d/%s", dirfd, filename);
+    int ret = lremovexattr(proc_path, name);
+
+    g_free(proc_path);
+    return ret;
+}
+
+/* removexattr on @path: open its parent directory, act on the basename. */
+ssize_t local_removexattr_nofollow(FsContext *ctx, const char *path,
+                                   const char *name)
+{
+    char *dirpath = g_path_get_dirname(path);
+    char *filename = g_path_get_basename(path);
+    ssize_t ret = -1;
+    int dirfd = local_opendir_nofollow(ctx, dirpath);
+
+    /* A failed directory open leaves ret at -1. */
+    if (dirfd != -1) {
+        ret = fremovexattrat_nofollow(dirfd, filename, name);
+        close_preserve_errno(dirfd);
+    }
+
+    g_free(dirpath);
+    g_free(filename);
+    return ret;
+}
+
+/* Thin wrapper: forwards every argument to local_removexattr_nofollow(). */
+int pt_removexattr(FsContext *ctx, const char *path, const char *name)
+{
+    return local_removexattr_nofollow(ctx, path, name);
+}
+
+ssize_t notsup_getxattr(FsContext *ctx, const char *path, const char *name,
+                        void *value, size_t size)
+{
+    errno = ENOTSUP; /* unconditional: no argument is examined */
+    return -1;
+}
+
+int notsup_setxattr(FsContext *ctx, const char *path, const char *name,
+                    void *value, size_t size, int flags)
+{
+    errno = ENOTSUP; /* unconditional: no argument is examined */
+    return -1;
+}
+
+ssize_t notsup_listxattr(FsContext *ctx, const char *path, char *name,
+                         void *value, size_t size)
+{
+    return 0; /* always zero, and errno is left untouched */
+}
+
+int notsup_removexattr(FsContext *ctx, const char *path, const char *name)
+{
+    errno = ENOTSUP; /* unconditional: no argument is examined */
+    return -1;
+}
+
+XattrOperations *mapped_xattr_ops[] = {
+    &mapped_user_xattr,
+    &mapped_pacl_xattr,
+    &mapped_dacl_xattr,
+    NULL, /* terminator (presumably for get_xattr_operations()) */
+};
+
+XattrOperations *passthrough_xattr_ops[] = {
+    &passthrough_user_xattr,
+    &passthrough_acl_xattr,
+    NULL, /* terminator (presumably for get_xattr_operations()) */
+};
+
+/* for .user none model should be same as passthrough */
+XattrOperations *none_xattr_ops[] = {
+    &passthrough_user_xattr,
+    &none_acl_xattr,
+    NULL, /* terminator (presumably for get_xattr_operations()) */
+};
--- a/hw/9pfs/virtio-9p-xattr.h
+++ b/hw/9pfs/virtio-9p-xattr.h
@@ -1,5 +1,5 @@
 /*
- * Virtio 9p
+ * 9p
 *
 * Copyright IBM, Corp. 2010
 *
@@ -10,8 +10,8 @@
 * the COPYING file in the top-level directory.
 *
 */
-#ifndef _QEMU_VIRTIO_9P_XATTR_H
-#define _QEMU_VIRTIO_9P_XATTR_H
+#ifndef _QEMU_9P_XATTR_H
+#define _QEMU_9P_XATTR_H

 #include "qemu/xattr.h"

@@ -28,6 +28,13 @@ typedef struct xattr_operations
                       const char *path, const char *name);
 } XattrOperations;

+ssize_t local_getxattr_nofollow(FsContext *ctx, const char *path,
+                                const char *name, void *value, size_t size);
+ssize_t local_setxattr_nofollow(FsContext *ctx, const char *path,
+                                const char *name, void *value, size_t size,
+                                int flags);
+ssize_t local_removexattr_nofollow(FsContext *ctx, const char *path,
+                                   const char *name);

 extern XattrOperations mapped_user_xattr;
 extern XattrOperations passthrough_user_xattr;
@@ -48,58 +55,21 @@ ssize_t v9fs_list_xattr(FsContext *ctx, const char *path, void *value,
 int v9fs_set_xattr(FsContext *ctx, const char *path, const char *name,
                          void *value, size_t size, int flags);
 int v9fs_remove_xattr(FsContext *ctx, const char *path, const char *name);
+
 ssize_t pt_listxattr(FsContext *ctx, const char *path, char *name, void *value,
                     size_t size);
+ssize_t pt_getxattr(FsContext *ctx, const char *path, const char *name,
+                    void *value, size_t size);
+int pt_setxattr(FsContext *ctx, const char *path, const char *name, void *value,
+                size_t size, int flags);
+int pt_removexattr(FsContext *ctx, const char *path, const char *name);

-static inline ssize_t pt_getxattr(FsContext *ctx, const char *path,
-                                  const char *name, void *value, size_t size)
-{
-    char buffer[PATH_MAX];
-    return lgetxattr(rpath(ctx, path, buffer), name, value, size);
-}
-
-static inline int pt_setxattr(FsContext *ctx, const char *path,
-                              const char *name, void *value,
-                              size_t size, int flags)
-{
-    char buffer[PATH_MAX];
-    return lsetxattr(rpath(ctx, path, buffer), name, value, size, flags);
-}
-
-static inline int pt_removexattr(FsContext *ctx,
-                                 const char *path, const char *name)
-{
-    char buffer[PATH_MAX];
-    return lremovexattr(rpath(ctx, path, buffer), name);
-}
-
-static inline ssize_t notsup_getxattr(FsContext *ctx, const char *path,
-                                      const char *name, void *value,
-                                      size_t size)
-{
-    errno = ENOTSUP;
-    return -1;
-}
-
-static inline int notsup_setxattr(FsContext *ctx, const char *path,
-                                  const char *name, void *value,
-                                  size_t size, int flags)
-{
-    errno = ENOTSUP;
-    return -1;
-}
-
-static inline ssize_t notsup_listxattr(FsContext *ctx, const char *path,
-                                       char *name, void *value, size_t size)
-{
-    return 0;
-}
-
-static inline int notsup_removexattr(FsContext *ctx,
-                                     const char *path, const char *name)
-{
-    errno = ENOTSUP;
-    return -1;
-}
+ssize_t notsup_getxattr(FsContext *ctx, const char *path, const char *name,
+                        void *value, size_t size);
+int notsup_setxattr(FsContext *ctx, const char *path, const char *name,
+                    void *value, size_t size, int flags);
+ssize_t notsup_listxattr(FsContext *ctx, const char *path, char *name,
+                         void *value, size_t size);
+int notsup_removexattr(FsContext *ctx, const char *path, const char *name);

 #endif
--- a/hw/9pfs/virtio-9p.c
+++ b/hw/9pfs/virtio-9p.c
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -0,0 +1,334 @@
+#ifndef _QEMU_9P_H
+#define _QEMU_9P_H
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/time.h>
+#include <utime.h>
+#include <sys/resource.h>
+#include <glib.h>
+#include "standard-headers/linux/virtio_9p.h"
+#include "hw/virtio.h"
+#include "fsdev/file-op-9p.h"
+#include "fsdev/9p-iov-marshal.h"
+#include "qemu/thread.h"
+#include "qemu/coroutine.h"
+
+enum {
+    P9_TLERROR = 6,
+    P9_RLERROR,
+    P9_TSTATFS = 8,
+    P9_RSTATFS,
+    P9_TLOPEN = 12,
+    P9_RLOPEN,
+    P9_TLCREATE = 14,
+    P9_RLCREATE,
+    P9_TSYMLINK = 16,
+    P9_RSYMLINK,
+    P9_TMKNOD = 18,
+    P9_RMKNOD,
+    P9_TRENAME = 20,
+    P9_RRENAME,
+    P9_TREADLINK = 22,
+    P9_RREADLINK,
+    P9_TGETATTR = 24,
+    P9_RGETATTR,
+    P9_TSETATTR = 26,
+    P9_RSETATTR,
+    P9_TXATTRWALK = 30,
+    P9_RXATTRWALK,
+    P9_TXATTRCREATE = 32,
+    P9_RXATTRCREATE,
+    P9_TREADDIR = 40,
+    P9_RREADDIR,
+    P9_TFSYNC = 50,
+    P9_RFSYNC,
+    P9_TLOCK = 52,
+    P9_RLOCK,
+    P9_TGETLOCK = 54,
+    P9_RGETLOCK,
+    P9_TLINK = 70,
+    P9_RLINK,
+    P9_TMKDIR = 72,
+    P9_RMKDIR,
+    P9_TRENAMEAT = 74,
+    P9_RRENAMEAT,
+    P9_TUNLINKAT = 76,
+    P9_RUNLINKAT,
+    P9_TVERSION = 100,
+    P9_RVERSION,
+    P9_TAUTH = 102,
+    P9_RAUTH,
+    P9_TATTACH = 104,
+    P9_RATTACH,
+    P9_TERROR = 106,
+    P9_RERROR,
+    P9_TFLUSH = 108,
+    P9_RFLUSH,
+    P9_TWALK = 110,
+    P9_RWALK,
+    P9_TOPEN = 112,
+    P9_ROPEN,
+    P9_TCREATE = 114,
+    P9_RCREATE,
+    P9_TREAD = 116,
+    P9_RREAD,
+    P9_TWRITE = 118,
+    P9_RWRITE,
+    P9_TCLUNK = 120,
+    P9_RCLUNK,
+    P9_TREMOVE = 122,
+    P9_RREMOVE,
+    P9_TSTAT = 124,
+    P9_RSTAT,
+    P9_TWSTAT = 126,
+    P9_RWSTAT,
+};
+
+
+/* qid.types */
+enum {
+    P9_QTDIR = 0x80,
+    P9_QTAPPEND = 0x40,
+    P9_QTEXCL = 0x20,
+    P9_QTMOUNT = 0x10,
+    P9_QTAUTH = 0x08,
+    P9_QTTMP = 0x04,
+    P9_QTSYMLINK = 0x02,
+    P9_QTLINK = 0x01,
+    P9_QTFILE = 0x00,
+};
+
+enum p9_proto_version {
+    V9FS_PROTO_2000U = 0x01,
+    V9FS_PROTO_2000L = 0x02,
+};
+
+#define P9_NOTAG    (u16)(~0)
+#define P9_NOFID    (u32)(~0)
+#define P9_MAXWELEM 16
+
+#define FID_REFERENCED          0x1
+#define FID_NON_RECLAIMABLE     0x2
+static inline char *rpath(FsContext *ctx, const char *path)
+{
+    return g_strdup_printf("%s/%s", ctx->fs_root, path); /* caller frees */
+}
+
+/*
+ * ample room for Twrite/Rread header
+ * size[4] Tread/Twrite tag[2] fid[4] offset[8] count[4]
+ */
+#define P9_IOHDRSZ 24
+
+typedef struct V9fsPDU V9fsPDU;
+struct V9fsState;
+
+struct V9fsPDU
+{
+    uint32_t size;
+    uint16_t tag;
+    uint8_t id;
+    uint8_t cancelled;
+    CoQueue complete;
+    struct V9fsState *s;
+    QLIST_ENTRY(V9fsPDU) next;
+    uint32_t idx;
+};
+
+
+/* FIXME
+ * 1) change user needs to set groups and stuff
+ */
+
+#define MAX_REQ         128
+#define MAX_TAG_LEN     32
+
+#define BUG_ON(cond) assert(!(cond))
+
+typedef struct V9fsFidState V9fsFidState;
+
+enum {
+    P9_FID_NONE = 0,
+    P9_FID_FILE,
+    P9_FID_DIR,
+    P9_FID_XATTR,
+};
+
+typedef struct V9fsConf
+{
+    /* tag name for the device */
+    char *tag;
+    char *fsdev_id;
+} V9fsConf;
+
+typedef struct V9fsXattr
+{
+    int64_t copied_len;
+    int64_t len;
+    void *value;
+    V9fsString name;
+    int flags;
+} V9fsXattr;
+
+typedef struct V9fsDir {
+    DIR *stream;
+} V9fsDir;
+
+/*
+ * Filled by fs driver on open and other
+ * calls.
+ */
+union V9fsFidOpenState {
+    int fd;
+    V9fsDir dir;
+    V9fsXattr xattr;
+    /*
+     * private pointer for fs drivers, that
+     * have its own internal representation of
+     * open files.
+     */
+    void *private;
+};
+
+struct V9fsFidState
+{
+    int fid_type;
+    int32_t fid;
+    V9fsPath path;
+    V9fsFidOpenState fs;
+    V9fsFidOpenState fs_reclaim;
+    int flags;
+    int open_flags;
+    uid_t uid;
+    int ref;
+    int clunked;
+    V9fsFidState *next;
+    V9fsFidState *rclm_lst;
+};
+
+typedef struct V9fsState
+{
+    QLIST_HEAD(, V9fsPDU) free_list;
+    QLIST_HEAD(, V9fsPDU) active_list;
+    V9fsFidState *fid_list;
+    FileOperations *ops;
+    FsContext ctx;
+    char *tag;
+    enum p9_proto_version proto_version;
+    int32_t msize;
+    V9fsPDU pdus[MAX_REQ];
+    /*
+     * lock ensuring atomic path update
+     * on rename.
+     */
+    CoRwlock rename_lock;
+    int32_t root_fid;
+    Error *migration_blocker;
+    V9fsConf fsconf;
+    V9fsQID root_qid;
+} V9fsState;
+
+/* 9p2000.L open flags */
+#define P9_DOTL_RDONLY        00000000
+#define P9_DOTL_WRONLY        00000001
+#define P9_DOTL_RDWR          00000002
+#define P9_DOTL_NOACCESS      00000003
+#define P9_DOTL_CREATE        00000100
+#define P9_DOTL_EXCL          00000200
+#define P9_DOTL_NOCTTY        00000400
+#define P9_DOTL_TRUNC         00001000
+#define P9_DOTL_APPEND        00002000
+#define P9_DOTL_NONBLOCK      00004000
+#define P9_DOTL_DSYNC         00010000
+#define P9_DOTL_FASYNC        00020000
+#define P9_DOTL_DIRECT        00040000
+#define P9_DOTL_LARGEFILE     00100000
+#define P9_DOTL_DIRECTORY     00200000
+#define P9_DOTL_NOFOLLOW      00400000
+#define P9_DOTL_NOATIME       01000000
+#define P9_DOTL_CLOEXEC       02000000
+#define P9_DOTL_SYNC          04000000
+
+/* 9p2000.L at flags */
+#define P9_DOTL_AT_REMOVEDIR         0x200
+
+/* 9P2000.L lock type */
+#define P9_LOCK_TYPE_RDLCK 0
+#define P9_LOCK_TYPE_WRLCK 1
+#define P9_LOCK_TYPE_UNLCK 2
+
+#define P9_LOCK_SUCCESS 0
+#define P9_LOCK_BLOCKED 1
+#define P9_LOCK_ERROR 2
+#define P9_LOCK_GRACE 3
+
+#define P9_LOCK_FLAGS_BLOCK 1
+#define P9_LOCK_FLAGS_RECLAIM 2
+
+typedef struct V9fsFlock
+{
+    uint8_t type;
+    uint32_t flags;
+    uint64_t start; /* absolute offset */
+    uint64_t length;
+    uint32_t proc_id;
+    V9fsString client_id;
+} V9fsFlock;
+
+typedef struct V9fsGetlock
+{
+    uint8_t type;
+    uint64_t start; /* absolute offset */
+    uint64_t length;
+    uint32_t proc_id;
+    V9fsString client_id;
+} V9fsGetlock;
+
+extern int open_fd_hw;
+extern int total_open_fd;
+
+static inline void v9fs_path_write_lock(V9fsState *s)
+{
+    if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
+        qemu_co_rwlock_wrlock(&s->rename_lock);
+    }
+}
+
+static inline void v9fs_path_read_lock(V9fsState *s)
+{
+    if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
+        qemu_co_rwlock_rdlock(&s->rename_lock);
+    }
+}
+
+static inline void v9fs_path_unlock(V9fsState *s)
+{
+    if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
+        qemu_co_rwlock_unlock(&s->rename_lock);
+    }
+}
+
+static inline uint8_t v9fs_request_cancelled(V9fsPDU *pdu)
+{
+    return pdu->cancelled;
+}
+
+extern void v9fs_reclaim_fd(V9fsPDU *pdu);
+extern void v9fs_path_init(V9fsPath *path);
+extern void v9fs_path_free(V9fsPath *path);
+extern void v9fs_path_sprintf(V9fsPath *path, const char *fmt, ...);
+extern void v9fs_path_copy(V9fsPath *lhs, V9fsPath *rhs);
+extern int v9fs_name_to_path(V9fsState *s, V9fsPath *dirpath,
+                             const char *name, V9fsPath *path);
+extern int v9fs_device_realize_common(V9fsState *s, Error **errp);
+extern void v9fs_device_unrealize_common(V9fsState *s, Error **errp);
+
+ssize_t pdu_marshal(V9fsPDU *pdu, size_t offset, const char *fmt, ...);
+ssize_t pdu_unmarshal(V9fsPDU *pdu, size_t offset, const char *fmt, ...);
+V9fsPDU *pdu_alloc(V9fsState *s);
+void pdu_free(V9fsPDU *pdu);
+void pdu_submit(V9fsPDU *pdu);
+void v9fs_reset(V9fsState *s);
+
+#endif
--- a/hw/9pfs/Makefile.objs
+++ b/hw/9pfs/Makefile.objs
@@ -1,9 +1,9 @@
-common-obj-y  = virtio-9p.o
-common-obj-y += virtio-9p-local.o virtio-9p-xattr.o
-common-obj-y += virtio-9p-xattr-user.o virtio-9p-posix-acl.o
+common-obj-y  = 9p.o 9p-util.o
+common-obj-y += 9p-local.o 9p-xattr.o
+common-obj-y += 9p-xattr-user.o 9p-posix-acl.o
 common-obj-y += virtio-9p-coth.o cofs.o codir.o cofile.o
 common-obj-y += coxattr.o virtio-9p-synth.o
-common-obj-$(CONFIG_OPEN_BY_HANDLE) +=  virtio-9p-handle.o
-common-obj-y += virtio-9p-proxy.o
+common-obj-$(CONFIG_OPEN_BY_HANDLE) +=  9p-handle.o
+common-obj-y += 9p-proxy.o

 obj-y += virtio-9p-device.o
--- a/hw/9pfs/cofile.c
+++ b/hw/9pfs/cofile.c
@@ -138,10 +138,10 @@ int v9fs_co_open2(V9fsPDU *pdu, V9fsFidState *fidp, V9fsString *name, gid_t gid,
    cred.fc_gid = gid;
    /*
     * Hold the directory fid lock so that directory path name
-     * don't change. Read lock is fine because this fid cannot
-     * be used by any other operation.
+     * don't change. Take the write lock to be sure this fid
+     * cannot be used by another operation.
     */
-    v9fs_path_read_lock(s);
+    v9fs_path_write_lock(s);
    v9fs_co_run_in_worker(
        {
            err = s->ops->open2(&s->ctx, &fidp->path,
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -14,11 +14,62 @@
 #include "hw/virtio.h"
 #include "hw/pc.h"
 #include "qemu/sockets.h"
-#include "hw/virtio-pci.h"
 #include "virtio-9p.h"
 #include "fsdev/qemu-fsdev.h"
-#include "virtio-9p-xattr.h"
+#include "9p-xattr.h"
 #include "virtio-9p-coth.h"
+#include "qemu/iov.h"
+
+void virtio_9p_push_and_notify(V9fsPDU *pdu)
+{
+    V9fsState *s = pdu->s;
+    V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
+    VirtQueueElement *elem = &v->elems[pdu->idx];
+
+    /* push onto queue and notify */
+    virtqueue_push(v->vq, elem, pdu->size);
+
+    /* FIXME: we should batch these completions */
+    virtio_notify(VIRTIO_DEVICE(v), v->vq);
+}
+
+static void handle_9p_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+    V9fsVirtioState *v = (V9fsVirtioState *)vdev;
+    V9fsState *s = &v->state;
+    V9fsPDU *pdu;
+    ssize_t len;
+
+    while ((pdu = pdu_alloc(s))) {
+        struct {
+            uint32_t size_le;
+            uint8_t id;
+            uint16_t tag_le;
+        } QEMU_PACKED out;
+        VirtQueueElement *elem = &v->elems[pdu->idx];
+
+        len = virtqueue_pop(vq, elem);
+        if (!len) {
+            pdu_free(pdu);
+            break;
+        }
+
+        BUG_ON(elem->out_num == 0 || elem->in_num == 0);
+        QEMU_BUILD_BUG_ON(sizeof out != 7);
+
+        len = iov_to_buf(elem->out_sg, elem->out_num, 0,
+                         &out, sizeof out);
+        BUG_ON(len != sizeof out);
+
+        pdu->size = le32_to_cpu(out.size_le);
+
+        pdu->id = out.id;
+        pdu->tag = le16_to_cpu(out.tag_le);
+
+        qemu_co_queue_init(&pdu->complete);
+        pdu_submit(pdu);
+    }
+}

 static uint32_t virtio_9p_get_features(VirtIODevice *vdev, uint32_t features)
 {
@@ -26,164 +77,125 @@ static uint32_t virtio_9p_get_features(VirtIODevice *vdev, uint32_t features)
    return features;
 }

-static V9fsState *to_virtio_9p(VirtIODevice *vdev)
-{
-    return (V9fsState *)vdev;
-}
-
 static void virtio_9p_get_config(VirtIODevice *vdev, uint8_t *config)
 {
    int len;
    struct virtio_9p_config *cfg;
-    V9fsState *s = to_virtio_9p(vdev);
+    V9fsVirtioState *v = VIRTIO_9P(vdev);
+    V9fsState *s = &v->state;

    len = strlen(s->tag);
    cfg = g_malloc0(sizeof(struct virtio_9p_config) + len);
    stw_raw(&cfg->tag_len, len);
    /* We don't copy the terminating null to config space */
    memcpy(cfg->tag, s->tag, len);
-    memcpy(config, cfg, s->config_size);
+    memcpy(config, cfg, v->config_size);
    g_free(cfg);
 }

-VirtIODevice *virtio_9p_init(DeviceState *dev, V9fsConf *conf)
+static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
 {
-    V9fsState *s;
-    int i, len;
-    struct stat stat;
-    FsDriverEntry *fse;
-    V9fsPath path;
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    V9fsVirtioState *v = VIRTIO_9P(dev);
+    V9fsState *s = &v->state;

-    s = (V9fsState *)virtio_common_init("virtio-9p",
-                                    VIRTIO_ID_9P,
-                                    sizeof(struct virtio_9p_config)+
-                                    MAX_TAG_LEN,
-                                    sizeof(V9fsState));
-    /* initialize pdu allocator */
-    QLIST_INIT(&s->free_list);
-    QLIST_INIT(&s->active_list);
-    for (i = 0; i < (MAX_REQ - 1); i++) {
-        QLIST_INSERT_HEAD(&s->free_list, &s->pdus[i], next);
+    if (v9fs_device_realize_common(s, errp)) {
+        goto out;
    }
+    vdev->get_config = virtio_9p_get_config;

-    s->vq = virtio_add_queue(&s->vdev, MAX_REQ, handle_9p_output);
+    v->config_size = sizeof(struct virtio_9p_config) + strlen(s->fsconf.tag);
+    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P, v->config_size);
+    v->vq = virtio_add_queue(vdev, MAX_REQ, handle_9p_output);

-    fse = get_fsdev_fsentry(conf->fsdev_id);
-
-    if (!fse) {
-        /* We don't have a fsdev identified by fsdev_id */
-        fprintf(stderr, "Virtio-9p device couldn't find fsdev with the "
-                "id = %s\n", conf->fsdev_id ? conf->fsdev_id : "NULL");
-        exit(1);
-    }
-
-    if (!conf->tag) {
-        /* we haven't specified a mount_tag */
-        fprintf(stderr, "fsdev with id %s needs mount_tag arguments\n",
-                conf->fsdev_id);
-        exit(1);
-    }
-
-    s->ctx.export_flags = fse->export_flags;
-    s->ctx.fs_root = g_strdup(fse->path);
-    s->ctx.exops.get_st_gen = NULL;
-    len = strlen(conf->tag);
-    if (len > MAX_TAG_LEN - 1) {
-        fprintf(stderr, "mount tag '%s' (%d bytes) is longer than "
-                "maximum (%d bytes)", conf->tag, len, MAX_TAG_LEN - 1);
-        exit(1);
-    }
-
-    s->tag = g_strdup(conf->tag);
-    s->ctx.uid = -1;
-
-    s->ops = fse->ops;
-    s->vdev.get_features = virtio_9p_get_features;
-    s->config_size = sizeof(struct virtio_9p_config) + len;
-    s->vdev.get_config = virtio_9p_get_config;
-    s->fid_list = NULL;
-    qemu_co_rwlock_init(&s->rename_lock);
-
-    if (s->ops->init(&s->ctx) < 0) {
-        fprintf(stderr, "Virtio-9p Failed to initialize fs-driver with id:%s"
-                " and export path:%s\n", conf->fsdev_id, s->ctx.fs_root);
-        exit(1);
-    }
-    if (v9fs_init_worker_threads() < 0) {
-        fprintf(stderr, "worker thread initialization failed\n");
-        exit(1);
-    }
-
-    /*
-     * Check details of export path, We need to use fs driver
-     * call back to do that. Since we are in the init path, we don't
-     * use co-routines here.
-     */
-    v9fs_path_init(&path);
-    if (s->ops->name_to_path(&s->ctx, NULL, "/", &path) < 0) {
-        fprintf(stderr,
-                "error in converting name to path %s", strerror(errno));
-        exit(1);
-    }
-    if (s->ops->lstat(&s->ctx, &path, &stat)) {
-        fprintf(stderr, "share path %s does not exist\n", fse->path);
-        exit(1);
-    } else if (!S_ISDIR(stat.st_mode)) {
-        fprintf(stderr, "share path %s is not a directory\n", fse->path);
-        exit(1);
-    }
-    v9fs_path_free(&path);
-
-    return &s->vdev;
+out:
+    return;
 }

-static int virtio_9p_init_pci(PCIDevice *pci_dev)
+static void virtio_9p_device_unrealize(DeviceState *dev, Error **errp)
 {
-    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
-    VirtIODevice *vdev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    V9fsVirtioState *v = VIRTIO_9P(dev);
+    V9fsState *s = &v->state;

-    vdev = virtio_9p_init(&pci_dev->qdev, &proxy->fsconf);
-    vdev->nvectors = proxy->nvectors;
-    virtio_init_pci(proxy, vdev);
-    /* make the actual value visible */
-    proxy->nvectors = vdev->nvectors;
-    return 0;
+    virtio_cleanup(vdev);
+    v9fs_device_unrealize_common(s, errp);
 }

+static void virtio_9p_reset(VirtIODevice *vdev)
+{
+    V9fsVirtioState *v = (V9fsVirtioState *)vdev;
+
+    v9fs_reset(&v->state);
+}
+
+ssize_t virtio_pdu_vmarshal(V9fsPDU *pdu, size_t offset,
+                            const char *fmt, va_list ap)
+{
+    V9fsState *s = pdu->s;
+    V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
+    VirtQueueElement *elem = &v->elems[pdu->idx];
+
+    return v9fs_iov_vmarshal(elem->in_sg, elem->in_num, offset, 1, fmt, ap);
+}
+
+ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t offset,
+                              const char *fmt, va_list ap)
+{
+    V9fsState *s = pdu->s;
+    V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
+    VirtQueueElement *elem = &v->elems[pdu->idx];
+
+    return v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, ap);
+}
+
+void virtio_init_iov_from_pdu(V9fsPDU *pdu, struct iovec **piov,
+                              unsigned int *pniov, bool is_write)
+{
+    V9fsState *s = pdu->s;
+    V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
+    VirtQueueElement *elem = &v->elems[pdu->idx];
+
+    if (is_write) {
+        *piov = elem->out_sg;
+        *pniov = elem->out_num;
+    } else {
+        *piov = elem->in_sg;
+        *pniov = elem->in_num;
+    }
+}
+
+/* virtio-9p device */
+
 static Property virtio_9p_properties[] = {
-    DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
-    DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
-    DEFINE_VIRTIO_COMMON_FEATURES(VirtIOPCIProxy, host_features),
-    DEFINE_PROP_STRING("mount_tag", VirtIOPCIProxy, fsconf.tag),
-    DEFINE_PROP_STRING("fsdev", VirtIOPCIProxy, fsconf.fsdev_id),
+    DEFINE_PROP_STRING("mount_tag", V9fsVirtioState, state.fsconf.tag),
+    DEFINE_PROP_STRING("fsdev", V9fsVirtioState, state.fsconf.fsdev_id),
    DEFINE_PROP_END_OF_LIST(),
 };

 static void virtio_9p_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
-    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);

-    k->init = virtio_9p_init_pci;
-    k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
-    k->device_id = PCI_DEVICE_ID_VIRTIO_9P;
-    k->revision = VIRTIO_PCI_ABI_VERSION;
-    k->class_id = 0x2;
    dc->props = virtio_9p_properties;
-    dc->reset = virtio_pci_reset;
+    vdc->realize = virtio_9p_device_realize;
+    vdc->unrealize = virtio_9p_device_unrealize;
+    vdc->get_features = virtio_9p_get_features;
+    vdc->get_config = virtio_9p_get_config;
+    vdc->reset = virtio_9p_reset;
 }

-static const TypeInfo virtio_9p_info = {
-    .name          = "virtio-9p-pci",
-    .parent        = TYPE_PCI_DEVICE,
-    .instance_size = sizeof(VirtIOPCIProxy),
-    .class_init    = virtio_9p_class_init,
+static const TypeInfo virtio_device_info = {
+    .name = TYPE_VIRTIO_9P,
+    .parent = TYPE_VIRTIO_DEVICE,
+    .instance_size = sizeof(V9fsVirtioState),
+    .class_init = virtio_9p_class_init,
 };

 static void virtio_9p_register_types(void)
 {
-    type_register_static(&virtio_9p_info);
-    virtio_9p_set_fd_limit();
+    type_register_static(&virtio_device_info);
 }

 type_init(virtio_9p_register_types)
--- a/hw/9pfs/virtio-9p-local.c
+++ b/hw/9pfs/virtio-9p-local.c
--- a/hw/9pfs/virtio-9p-synth.c
+++ b/hw/9pfs/virtio-9p-synth.c
@@ -13,8 +13,8 @@
 */

 #include "hw/virtio.h"
-#include "virtio-9p.h"
-#include "virtio-9p-xattr.h"
+#include "9p.h"
+#include "9p-xattr.h"
 #include "fsdev/qemu-fsdev.h"
 #include "virtio-9p-synth.h"

--- a/hw/9pfs/virtio-9p-xattr.c
+++ b/hw/9pfs/virtio-9p-xattr.c
@@ -1,161 +0,0 @@
-/*
- * Virtio 9p  xattr callback
- *
- * Copyright IBM, Corp. 2010
- *
- * Authors:
- * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "hw/virtio.h"
-#include "virtio-9p.h"
-#include "fsdev/file-op-9p.h"
-#include "virtio-9p-xattr.h"
-
-
-static XattrOperations *get_xattr_operations(XattrOperations **h,
-                                             const char *name)
-{
-    XattrOperations *xops;
-    for (xops = *(h)++; xops != NULL; xops = *(h)++) {
-        if (!strncmp(name, xops->name, strlen(xops->name))) {
-            return xops;
-        }
-    }
-    return NULL;
-}
-
-ssize_t v9fs_get_xattr(FsContext *ctx, const char *path,
-                       const char *name, void *value, size_t size)
-{
-    XattrOperations *xops = get_xattr_operations(ctx->xops, name);
-    if (xops) {
-        return xops->getxattr(ctx, path, name, value, size);
-    }
-    errno = -EOPNOTSUPP;
-    return -1;
-}
-
-ssize_t pt_listxattr(FsContext *ctx, const char *path,
-                     char *name, void *value, size_t size)
-{
-    int name_size = strlen(name) + 1;
-    if (!value) {
-        return name_size;
-    }
-
-    if (size < name_size) {
-        errno = ERANGE;
-        return -1;
-    }
-
-    /* no need for strncpy: name_size is strlen(name)+1 */
-    memcpy(value, name, name_size);
-    return name_size;
-}
-
-
-/*
- * Get the list and pass to each layer to find out whether
- * to send the data or not
- */
-ssize_t v9fs_list_xattr(FsContext *ctx, const char *path,
-                        void *value, size_t vsize)
-{
-    ssize_t size = 0;
-    char buffer[PATH_MAX];
-    void *ovalue = value;
-    XattrOperations *xops;
-    char *orig_value, *orig_value_start;
-    ssize_t xattr_len, parsed_len = 0, attr_len;
-
-    /* Get the actual len */
-    xattr_len = llistxattr(rpath(ctx, path, buffer), value, 0);
-    if (xattr_len <= 0) {
-        return xattr_len;
-    }
-
-    /* Now fetch the xattr and find the actual size */
-    orig_value = g_malloc(xattr_len);
-    xattr_len = llistxattr(rpath(ctx, path, buffer), orig_value, xattr_len);
-
-    /* store the orig pointer */
-    orig_value_start = orig_value;
-    while (xattr_len > parsed_len) {
-        xops = get_xattr_operations(ctx->xops, orig_value);
-        if (!xops) {
-            goto next_entry;
-        }
-
-        if (!value) {
-            size += xops->listxattr(ctx, path, orig_value, value, vsize);
-        } else {
-            size = xops->listxattr(ctx, path, orig_value, value, vsize);
-            if (size < 0) {
-                goto err_out;
-            }
-            value += size;
-            vsize -= size;
-        }
-next_entry:
-        /* Got the next entry */
-        attr_len = strlen(orig_value) + 1;
-        parsed_len += attr_len;
-        orig_value += attr_len;
-    }
-    if (value) {
-        size = value - ovalue;
-    }
-
-err_out:
-    g_free(orig_value_start);
-    return size;
-}
-
-int v9fs_set_xattr(FsContext *ctx, const char *path, const char *name,
-                   void *value, size_t size, int flags)
-{
-    XattrOperations *xops = get_xattr_operations(ctx->xops, name);
-    if (xops) {
-        return xops->setxattr(ctx, path, name, value, size, flags);
-    }
-    errno = -EOPNOTSUPP;
-    return -1;
-
-}
-
-int v9fs_remove_xattr(FsContext *ctx,
-                      const char *path, const char *name)
-{
-    XattrOperations *xops = get_xattr_operations(ctx->xops, name);
-    if (xops) {
-        return xops->removexattr(ctx, path, name);
-    }
-    errno = -EOPNOTSUPP;
-    return -1;
-
-}
-
-XattrOperations *mapped_xattr_ops[] = {
-    &mapped_user_xattr,
-    &mapped_pacl_xattr,
-    &mapped_dacl_xattr,
-    NULL,
-};
-
-XattrOperations *passthrough_xattr_ops[] = {
-    &passthrough_user_xattr,
-    &passthrough_acl_xattr,
-    NULL,
-};
-
-/* for .user none model should be same as passthrough */
-XattrOperations *none_xattr_ops[] = {
-    &passthrough_user_xattr,
-    &none_acl_xattr,
-    NULL,
-};
--- a/hw/9pfs/virtio-9p.h
+++ b/hw/9pfs/virtio-9p.h
@@ -1,405 +1,30 @@
 #ifndef _QEMU_VIRTIO_9P_H
 #define _QEMU_VIRTIO_9P_H

-#include <sys/types.h>
-#include <dirent.h>
-#include <sys/time.h>
-#include <utime.h>
-#include <sys/resource.h>
+#include "standard-headers/linux/virtio_9p.h"
 #include "hw/virtio.h"
-#include "fsdev/file-op-9p.h"
-#include "fsdev/virtio-9p-marshal.h"
-#include "qemu/thread.h"
-#include "block/coroutine.h"
+#include "9p.h"

-
-/* The feature bitmap for virtio 9P */
-/* The mount point is specified in a config variable */
-#define VIRTIO_9P_MOUNT_TAG 0
-
-enum {
-    P9_TLERROR = 6,
-    P9_RLERROR,
-    P9_TSTATFS = 8,
-    P9_RSTATFS,
-    P9_TLOPEN = 12,
-    P9_RLOPEN,
-    P9_TLCREATE = 14,
-    P9_RLCREATE,
-    P9_TSYMLINK = 16,
-    P9_RSYMLINK,
-    P9_TMKNOD = 18,
-    P9_RMKNOD,
-    P9_TRENAME = 20,
-    P9_RRENAME,
-    P9_TREADLINK = 22,
-    P9_RREADLINK,
-    P9_TGETATTR = 24,
-    P9_RGETATTR,
-    P9_TSETATTR = 26,
-    P9_RSETATTR,
-    P9_TXATTRWALK = 30,
-    P9_RXATTRWALK,
-    P9_TXATTRCREATE = 32,
-    P9_RXATTRCREATE,
-    P9_TREADDIR = 40,
-    P9_RREADDIR,
-    P9_TFSYNC = 50,
-    P9_RFSYNC,
-    P9_TLOCK = 52,
-    P9_RLOCK,
-    P9_TGETLOCK = 54,
-    P9_RGETLOCK,
-    P9_TLINK = 70,
-    P9_RLINK,
-    P9_TMKDIR = 72,
-    P9_RMKDIR,
-    P9_TRENAMEAT = 74,
-    P9_RRENAMEAT,
-    P9_TUNLINKAT = 76,
-    P9_RUNLINKAT,
-    P9_TVERSION = 100,
-    P9_RVERSION,
-    P9_TAUTH = 102,
-    P9_RAUTH,
-    P9_TATTACH = 104,
-    P9_RATTACH,
-    P9_TERROR = 106,
-    P9_RERROR,
-    P9_TFLUSH = 108,
-    P9_RFLUSH,
-    P9_TWALK = 110,
-    P9_RWALK,
-    P9_TOPEN = 112,
-    P9_ROPEN,
-    P9_TCREATE = 114,
-    P9_RCREATE,
-    P9_TREAD = 116,
-    P9_RREAD,
-    P9_TWRITE = 118,
-    P9_RWRITE,
-    P9_TCLUNK = 120,
-    P9_RCLUNK,
-    P9_TREMOVE = 122,
-    P9_RREMOVE,
-    P9_TSTAT = 124,
-    P9_RSTAT,
-    P9_TWSTAT = 126,
-    P9_RWSTAT,
-};
-
-
-/* qid.types */
-enum {
-    P9_QTDIR = 0x80,
-    P9_QTAPPEND = 0x40,
-    P9_QTEXCL = 0x20,
-    P9_QTMOUNT = 0x10,
-    P9_QTAUTH = 0x08,
-    P9_QTTMP = 0x04,
-    P9_QTSYMLINK = 0x02,
-    P9_QTLINK = 0x01,
-    P9_QTFILE = 0x00,
-};
-
-enum p9_proto_version {
-    V9FS_PROTO_2000U = 0x01,
-    V9FS_PROTO_2000L = 0x02,
-};
-
-#define P9_NOTAG    (u16)(~0)
-#define P9_NOFID    (u32)(~0)
-#define P9_MAXWELEM 16
-
-#define FID_REFERENCED          0x1
-#define FID_NON_RECLAIMABLE     0x2
-static inline const char *rpath(FsContext *ctx, const char *path, char *buffer)
+typedef struct V9fsVirtioState
 {
-    snprintf(buffer, PATH_MAX, "%s/%s", ctx->fs_root, path);
-    return buffer;
-}
-
-/*
- * ample room for Twrite/Rread header
- * size[4] Tread/Twrite tag[2] fid[4] offset[8] count[4]
- */
-#define P9_IOHDRSZ 24
-
-typedef struct V9fsPDU V9fsPDU;
-struct V9fsState;
-
-struct V9fsPDU
-{
-    uint32_t size;
-    uint16_t tag;
-    uint8_t id;
-    uint8_t cancelled;
-    CoQueue complete;
-    VirtQueueElement elem;
-    struct V9fsState *s;
-    QLIST_ENTRY(V9fsPDU) next;
-};
-
-
-/* FIXME
- * 1) change user needs to set groups and stuff
- */
-
-/* from Linux's linux/virtio_9p.h */
-
-/* The ID for virtio console */
-#define VIRTIO_ID_9P    9
-#define MAX_REQ         128
-#define MAX_TAG_LEN     32
-
-#define BUG_ON(cond) assert(!(cond))
-
-typedef struct V9fsFidState V9fsFidState;
-
-enum {
-    P9_FID_NONE = 0,
-    P9_FID_FILE,
-    P9_FID_DIR,
-    P9_FID_XATTR,
-};
-
-typedef struct V9fsXattr
-{
-    int64_t copied_len;
-    int64_t len;
-    void *value;
-    V9fsString name;
-    int flags;
-} V9fsXattr;
-
-/*
- * Filled by fs driver on open and other
- * calls.
- */
-union V9fsFidOpenState {
-    int fd;
-    DIR *dir;
-    V9fsXattr xattr;
-    /*
-     * private pointer for fs drivers, that
-     * have its own internal representation of
-     * open files.
-     */
-    void *private;
-};
-
-struct V9fsFidState
-{
-    int fid_type;
-    int32_t fid;
-    V9fsPath path;
-    V9fsFidOpenState fs;
-    V9fsFidOpenState fs_reclaim;
-    int flags;
-    int open_flags;
-    uid_t uid;
-    int ref;
-    int clunked;
-    V9fsFidState *next;
-    V9fsFidState *rclm_lst;
-};
-
-typedef struct V9fsState
-{
-    VirtIODevice vdev;
+    VirtIODevice parent_obj;
    VirtQueue *vq;
-    V9fsPDU pdus[MAX_REQ];
-    QLIST_HEAD(, V9fsPDU) free_list;
-    QLIST_HEAD(, V9fsPDU) active_list;
-    V9fsFidState *fid_list;
-    FileOperations *ops;
-    FsContext ctx;
-    char *tag;
    size_t config_size;
-    enum p9_proto_version proto_version;
-    int32_t msize;
-    /*
-     * lock ensuring atomic path update
-     * on rename.
-     */
-    CoRwlock rename_lock;
-    int32_t root_fid;
-    Error *migration_blocker;
-} V9fsState;
+    VirtQueueElement elems[MAX_REQ];
+    V9fsState state;
+} V9fsVirtioState;

-typedef struct V9fsStatState {
-    V9fsPDU *pdu;
-    size_t offset;
-    V9fsStat v9stat;
-    V9fsFidState *fidp;
-    struct stat stbuf;
-} V9fsStatState;
+extern void virtio_9p_push_and_notify(V9fsPDU *pdu);

-typedef struct V9fsOpenState {
-    V9fsPDU *pdu;
-    size_t offset;
-    int32_t mode;
-    V9fsFidState *fidp;
-    V9fsQID qid;
-    struct stat stbuf;
-    int iounit;
-} V9fsOpenState;
+ssize_t virtio_pdu_vmarshal(V9fsPDU *pdu, size_t offset,
+                            const char *fmt, va_list ap);
+ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t offset,
+                              const char *fmt, va_list ap);
+void virtio_init_iov_from_pdu(V9fsPDU *pdu, struct iovec **piov,
+                              unsigned int *pniov, bool is_write);

-typedef struct V9fsReadState {
-    V9fsPDU *pdu;
-    size_t offset;
-    int32_t count;
-    int32_t total;
-    int64_t off;
-    V9fsFidState *fidp;
-    struct iovec iov[128]; /* FIXME: bad, bad, bad */
-    struct iovec *sg;
-    off_t dir_pos;
-    struct dirent *dent;
-    struct stat stbuf;
-    V9fsString name;
-    V9fsStat v9stat;
-    int32_t len;
-    int32_t cnt;
-    int32_t max_count;
-} V9fsReadState;
-
-typedef struct V9fsWriteState {
-    V9fsPDU *pdu;
-    size_t offset;
-    int32_t len;
-    int32_t count;
-    int32_t total;
-    int64_t off;
-    V9fsFidState *fidp;
-    struct iovec iov[128]; /* FIXME: bad, bad, bad */
-    struct iovec *sg;
-    int cnt;
-} V9fsWriteState;
-
-struct virtio_9p_config
-{
-    /* number of characters in tag */
-    uint16_t tag_len;
-    /* Variable size tag name */
-    uint8_t tag[0];
-} QEMU_PACKED;
-
-typedef struct V9fsMkState {
-    V9fsPDU *pdu;
-    size_t offset;
-    V9fsQID qid;
-    struct stat stbuf;
-    V9fsString name;
-    V9fsString fullname;
-} V9fsMkState;
-
-/* 9p2000.L open flags */
-#define P9_DOTL_RDONLY        00000000
-#define P9_DOTL_WRONLY        00000001
-#define P9_DOTL_RDWR          00000002
-#define P9_DOTL_NOACCESS      00000003
-#define P9_DOTL_CREATE        00000100
-#define P9_DOTL_EXCL          00000200
-#define P9_DOTL_NOCTTY        00000400
-#define P9_DOTL_TRUNC         00001000
-#define P9_DOTL_APPEND        00002000
-#define P9_DOTL_NONBLOCK      00004000
-#define P9_DOTL_DSYNC         00010000
-#define P9_DOTL_FASYNC        00020000
-#define P9_DOTL_DIRECT        00040000
-#define P9_DOTL_LARGEFILE     00100000
-#define P9_DOTL_DIRECTORY     00200000
-#define P9_DOTL_NOFOLLOW      00400000
-#define P9_DOTL_NOATIME       01000000
-#define P9_DOTL_CLOEXEC       02000000
-#define P9_DOTL_SYNC          04000000
-
-/* 9p2000.L at flags */
-#define P9_DOTL_AT_REMOVEDIR         0x200
-
-/* 9P2000.L lock type */
-#define P9_LOCK_TYPE_RDLCK 0
-#define P9_LOCK_TYPE_WRLCK 1
-#define P9_LOCK_TYPE_UNLCK 2
-
-#define P9_LOCK_SUCCESS 0
-#define P9_LOCK_BLOCKED 1
-#define P9_LOCK_ERROR 2
-#define P9_LOCK_GRACE 3
-
-#define P9_LOCK_FLAGS_BLOCK 1
-#define P9_LOCK_FLAGS_RECLAIM 2
-
-typedef struct V9fsFlock
-{
-    uint8_t type;
-    uint32_t flags;
-    uint64_t start; /* absolute offset */
-    uint64_t length;
-    uint32_t proc_id;
-    V9fsString client_id;
-} V9fsFlock;
-
-typedef struct V9fsGetlock
-{
-    uint8_t type;
-    uint64_t start; /* absolute offset */
-    uint64_t length;
-    uint32_t proc_id;
-    V9fsString client_id;
-} V9fsGetlock;
-
-extern int open_fd_hw;
-extern int total_open_fd;
-
-size_t pdu_packunpack(void *addr, struct iovec *sg, int sg_count,
-                      size_t offset, size_t size, int pack);
-
-static inline size_t do_pdu_unpack(void *dst, struct iovec *sg, int sg_count,
-                        size_t offset, size_t size)
-{
-    return pdu_packunpack(dst, sg, sg_count, offset, size, 0);
-}
-
-static inline void v9fs_path_write_lock(V9fsState *s)
-{
-    if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
-        qemu_co_rwlock_wrlock(&s->rename_lock);
-    }
-}
-
-static inline void v9fs_path_read_lock(V9fsState *s)
-{
-    if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
-        qemu_co_rwlock_rdlock(&s->rename_lock);
-    }
-}
-
-static inline void v9fs_path_unlock(V9fsState *s)
-{
-    if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
-        qemu_co_rwlock_unlock(&s->rename_lock);
-    }
-}
-
-static inline uint8_t v9fs_request_cancelled(V9fsPDU *pdu)
-{
-    return pdu->cancelled;
-}
-
-extern void handle_9p_output(VirtIODevice *vdev, VirtQueue *vq);
-extern void virtio_9p_set_fd_limit(void);
-extern void v9fs_reclaim_fd(V9fsPDU *pdu);
-extern void v9fs_path_init(V9fsPath *path);
-extern void v9fs_path_free(V9fsPath *path);
-extern void v9fs_path_copy(V9fsPath *lhs, V9fsPath *rhs);
-extern int v9fs_name_to_path(V9fsState *s, V9fsPath *dirpath,
-                             const char *name, V9fsPath *path);
-
-#define pdu_marshal(pdu, offset, fmt, args...)  \
-    v9fs_marshal(pdu->elem.in_sg, pdu->elem.in_num, offset, 1, fmt, ##args)
-#define pdu_unmarshal(pdu, offset, fmt, args...)  \
-    v9fs_unmarshal(pdu->elem.out_sg, pdu->elem.out_num, offset, 1, fmt, ##args)
+#define TYPE_VIRTIO_9P "virtio-9p-device"
+#define VIRTIO_9P(obj) \
+        OBJECT_CHECK(V9fsVirtioState, (obj), TYPE_VIRTIO_9P)

 #endif
--- a/hw/acpi.c
+++ b/hw/acpi.c
@@ -472,8 +472,9 @@ static const MemoryRegionOps acpi_pm_cnt_ops = {
    .endianness = DEVICE_LITTLE_ENDIAN,
 };

-void acpi_pm1_cnt_init(ACPIREGS *ar, MemoryRegion *parent)
+void acpi_pm1_cnt_init(ACPIREGS *ar, MemoryRegion *parent, uint8_t s4_val)
 {
+    ar->pm1.cnt.s4_val = s4_val;
    ar->wakeup.notify = acpi_notify_wakeup;
    qemu_register_wakeup_notifier(&ar->wakeup);
    memory_region_init_io(&ar->pm1.cnt.io, &acpi_pm_cnt_ops, ar, "acpi-cnt", 2);
--- a/hw/acpi.h
+++ b/hw/acpi.h
@@ -142,7 +142,7 @@ void acpi_pm1_evt_init(ACPIREGS *ar, acpi_update_sci_fn update_sci,
                       MemoryRegion *parent);

 /* PM1a_CNT: piix and ich9 don't implement PM1b CNT. */
-void acpi_pm1_cnt_init(ACPIREGS *ar, MemoryRegion *parent);
+void acpi_pm1_cnt_init(ACPIREGS *ar, MemoryRegion *parent, uint8_t s4_val);
 void acpi_pm1_cnt_update(ACPIREGS *ar,
                         bool sci_enable, bool sci_disable);
 void acpi_pm1_cnt_reset(ACPIREGS *ar);
--- a/hw/acpi_ich9.c
+++ b/hw/acpi_ich9.c
@@ -212,7 +212,7 @@ void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,

    acpi_pm_tmr_init(&pm->acpi_regs, ich9_pm_update_sci_fn, &pm->io);
    acpi_pm1_evt_init(&pm->acpi_regs, ich9_pm_update_sci_fn, &pm->io);
-    acpi_pm1_cnt_init(&pm->acpi_regs, &pm->io);
+    acpi_pm1_cnt_init(&pm->acpi_regs, &pm->io, 2);

    acpi_gpe_init(&pm->acpi_regs, ICH9_PMIO_GPE0_LEN);
    memory_region_init_io(&pm->io_gpe, &ich9_gpe_ops, pm, "apci-gpe0",
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -266,7 +266,7 @@ static int acpi_load_old(QEMUFile *f, void *opaque, int version_id)
 static const VMStateDescription vmstate_acpi = {
    .name = "piix4_pm",
    .version_id = 3,
-    .minimum_version_id = 3,
+    .minimum_version_id = 2, /* qemu-kvm */
    .minimum_version_id_old = 1,
    .load_state_old = acpi_load_old,
    .post_load = vmstate_acpi_post_load,
@@ -418,7 +418,7 @@ static int piix4_pm_initfn(PCIDevice *dev)

    acpi_pm_tmr_init(&s->ar, pm_tmr_timer, &s->io);
    acpi_pm1_evt_init(&s->ar, pm_tmr_timer, &s->io);
-    acpi_pm1_cnt_init(&s->ar, &s->io);
+    acpi_pm1_cnt_init(&s->ar, &s->io, s->s4_val);
    acpi_gpe_init(&s->ar, GPE_LEN);

    s->powerdown_notifier.notify = piix4_pm_powerdown_req;
--- a/hw/baum.c
+++ b/hw/baum.c
@@ -430,7 +430,7 @@ static int baum_eat_packet(BaumDriverState *baum, const uint8_t *buf, int len)
 }

 /* The other end is writing some data.  Store it and try to interpret */
-static int baum_write(CharDriverState *chr, const uint8_t *buf, int len)
+static size_t baum_write(CharDriverState *chr, const uint8_t *buf, size_t len)
 {
    BaumDriverState *baum = chr->opaque;
    int tocopy, cur, eaten, orig_len = len;
--- a/hw/bt-hci-csr.c
+++ b/hw/bt-hci-csr.c
@@ -294,8 +294,8 @@ static int csrhci_data_len(const uint8_t *pkt)
    exit(-1);
 }

-static int csrhci_write(struct CharDriverState *chr,
-                const uint8_t *buf, int len)
+static size_t csrhci_write(struct CharDriverState *chr,
+                const uint8_t *buf, size_t len)
 {
    struct csrhci_s *s = (struct csrhci_s *) chr->opaque;
    int plen = s->in_len;
--- a/hw/cadence_uart.c
+++ b/hw/cadence_uart.c
@@ -227,7 +227,7 @@ static void uart_parameters_setup(UartState *s)
    qemu_chr_fe_ioctl(s->chr, CHR_IOCTL_SERIAL_SET_PARAMS, &ssp);
 }

-static int uart_can_receive(void *opaque)
+static size_t uart_can_receive(void *opaque)
 {
    UartState *s = (UartState *)opaque;

--- a/hw/ccid-card-emulated.c
+++ b/hw/ccid-card-emulated.c
@@ -36,6 +36,10 @@
 #include "monitor/monitor.h"
 #include "hw/ccid.h"

+#ifdef CONFIG_SECCOMP
+#include "sysemu/seccomp.h"
+#endif
+
 #define DPRINTF(card, lvl, fmt, ...) \
 do {\
    if (lvl <= card->debug) {\
@@ -234,6 +238,10 @@ static void *handle_apdu_thread(void* arg)
    VReaderStatus reader_status;
    EmulEvent *event;

+#ifdef CONFIG_SECCOMP
+    seccomp_start(!!0);
+#endif
+
    while (1) {
        qemu_mutex_lock(&card->handle_apdu_mutex);
        qemu_cond_wait(&card->handle_apdu_cond, &card->handle_apdu_mutex);
@@ -281,6 +289,10 @@ static void *event_thread(void *arg)
    VEvent *event = NULL;
    EmulatedState *card = arg;

+#ifdef CONFIG_SECCOMP
+    seccomp_start(!!0);
+#endif
+
    while (1) {
        const char *reader_name;

--- a/hw/ccid-card-passthru.c
+++ b/hw/ccid-card-passthru.c
@@ -104,7 +104,7 @@ static void ccid_card_vscard_send_init(PassthruState *s)
                         (uint8_t *)&msg, sizeof(msg));
 }

-static int ccid_card_vscard_can_read(void *opaque)
+static size_t ccid_card_vscard_can_read(void * opaque)
 {
    PassthruState *card = opaque;

@@ -203,14 +203,14 @@ static void ccid_card_vscard_drop_connection(PassthruState *card)
    card->vscard_in_pos = card->vscard_in_hdr = 0;
 }

-static void ccid_card_vscard_read(void *opaque, const uint8_t *buf, int size)
+static void ccid_card_vscard_read(void *opaque, const uint8_t *buf, size_t size)
 {
    PassthruState *card = opaque;
    VSCMsgHeader *hdr;

    if (card->vscard_in_pos + size > VSCARD_IN_SIZE) {
        error_report(
-            "no room for data: pos %d +  size %d > %d. dropping connection.",
+            "no room for data: pos %d +  size %zu > %d. dropping connection.",
            card->vscard_in_pos, size, VSCARD_IN_SIZE);
        ccid_card_vscard_drop_connection(card);
        return;
--- a/hw/cirrus_vga.c
+++ b/hw/cirrus_vga.c
@@ -172,27 +172,14 @@

 #define CIRRUS_PNPMMIO_SIZE         0x1000

-#define BLTUNSAFE(s) \
-    ( \
-        ( /* check dst is within bounds */ \
-            (s)->cirrus_blt_height * ABS((s)->cirrus_blt_dstpitch) \
-                + ((s)->cirrus_blt_dstaddr & (s)->cirrus_addr_mask) > \
-                    (s)->vga.vram_size \
-        ) || \
-        ( /* check src is within bounds */ \
-            (s)->cirrus_blt_height * ABS((s)->cirrus_blt_srcpitch) \
-                + ((s)->cirrus_blt_srcaddr & (s)->cirrus_addr_mask) > \
-                    (s)->vga.vram_size \
-        ) \
-    )
-
 struct CirrusVGAState;
 typedef void (*cirrus_bitblt_rop_t) (struct CirrusVGAState *s,
-                                     uint8_t * dst, const uint8_t * src,
+                                     uint32_t dstaddr, uint32_t srcaddr,
 				     int dstpitch, int srcpitch,
 				     int bltwidth, int bltheight);
 typedef void (*cirrus_fill_t)(struct CirrusVGAState *s,
-                              uint8_t *dst, int dst_pitch, int width, int height);
+                              uint32_t dstaddr, int dst_pitch,
+                              int width, int height);

 typedef struct CirrusVGAState {
    VGACommonState vga;
@@ -273,19 +260,108 @@ static void cirrus_update_memory_access(CirrusVGAState *s);
 *
 ***************************************/

+static bool blit_region_is_unsafe(struct CirrusVGAState *s,
+                                  int32_t pitch, int32_t addr)
+{
+    if (!pitch) {
+        return true;
+    }
+    if (pitch < 0) {
+        int64_t min = addr
+            + ((int64_t)s->cirrus_blt_height - 1) * pitch
+            - s->cirrus_blt_width;
+        if (min < -1 || addr >= s->vga.vram_size) {
+            return true;
+        }
+    } else {
+        int64_t max = addr
+            + ((int64_t)s->cirrus_blt_height-1) * pitch
+            + s->cirrus_blt_width;
+        if (max > s->vga.vram_size) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static bool blit_is_unsafe(struct CirrusVGAState *s, bool dst_only)
+{
+    /* should be the case, see cirrus_bitblt_start */
+    assert(s->cirrus_blt_width > 0);
+    assert(s->cirrus_blt_height > 0);
+
+    if (s->cirrus_blt_width > CIRRUS_BLTBUFSIZE) {
+        return true;
+    }
+
+    if (blit_region_is_unsafe(s, s->cirrus_blt_dstpitch,
+                              s->cirrus_blt_dstaddr)) {
+        return true;
+    }
+    if (dst_only) {
+        return false;
+    }
+
+    if (blit_region_is_unsafe(s, s->cirrus_blt_srcpitch,
+                              s->cirrus_blt_srcaddr)) {
+        return true;
+    }
+
+    return false;
+}
+
 static void cirrus_bitblt_rop_nop(CirrusVGAState *s,
-                                  uint8_t *dst,const uint8_t *src,
+                                  uint32_t dstaddr, uint32_t srcaddr,
                                  int dstpitch,int srcpitch,
                                  int bltwidth,int bltheight)
 {
 }

 static void cirrus_bitblt_fill_nop(CirrusVGAState *s,
-                                   uint8_t *dst,
+                                   uint32_t dstaddr,
                                   int dstpitch, int bltwidth,int bltheight)
 {
 }

+static inline uint8_t cirrus_src(CirrusVGAState *s, uint32_t srcaddr)
+{
+    if (s->cirrus_srccounter) {
+        /* cputovideo */
+        return s->cirrus_bltbuf[srcaddr & (CIRRUS_BLTBUFSIZE - 1)];
+    } else {
+        /* videotovideo */
+        return s->vga.vram_ptr[srcaddr & s->cirrus_addr_mask];
+    }
+}
+
+static inline uint16_t cirrus_src16(CirrusVGAState *s, uint32_t srcaddr)
+{
+    uint16_t *src;
+
+    if (s->cirrus_srccounter) {
+        /* cputovideo */
+        src = (void *)&s->cirrus_bltbuf[srcaddr & (CIRRUS_BLTBUFSIZE - 1) & ~1];
+    } else {
+        /* videotovideo */
+        src = (void *)&s->vga.vram_ptr[srcaddr & s->cirrus_addr_mask & ~1];
+    }
+    return *src;
+}
+
+static inline uint32_t cirrus_src32(CirrusVGAState *s, uint32_t srcaddr)
+{
+    uint32_t *src;
+
+    if (s->cirrus_srccounter) {
+        /* cputovideo */
+        src = (void *)&s->cirrus_bltbuf[srcaddr & (CIRRUS_BLTBUFSIZE - 1) & ~3];
+    } else {
+        /* videotovideo */
+        src = (void *)&s->vga.vram_ptr[srcaddr & s->cirrus_addr_mask & ~3];
+    }
+    return *src;
+}
+
 #define ROP_NAME 0
 #define ROP_FN(d, s) 0
 #include "cirrus_vga_rop.h"
@@ -615,25 +691,54 @@ static void cirrus_invalidate_region(CirrusVGAState * s, int off_begin,
    int off_cur;
    int off_cur_end;

+    if (off_pitch < 0) {
+        off_begin -= bytesperline - 1;
+    }
+
    for (y = 0; y < lines; y++) {
 	off_cur = off_begin;
 	off_cur_end = (off_cur + bytesperline) & s->cirrus_addr_mask;
+        assert(off_cur_end >= off_cur);
        memory_region_set_dirty(&s->vga.vram, off_cur, off_cur_end - off_cur);
 	off_begin += off_pitch;
    }
 }

-static int cirrus_bitblt_common_patterncopy(CirrusVGAState * s,
-					    const uint8_t * src)
+static int cirrus_bitblt_common_patterncopy(CirrusVGAState *s, bool videosrc)
 {
-    uint8_t *dst;
+    uint32_t patternsize;
+    uint8_t *src;

-    dst = s->vga.vram_ptr + (s->cirrus_blt_dstaddr & s->cirrus_addr_mask);
+    if (videosrc) {
+        switch (s->vga.get_bpp(&s->vga)) {
+        case 8:
+            patternsize = 64;
+            break;
+        case 15:
+        case 16:
+            patternsize = 128;
+            break;
+        case 24:
+        case 32:
+        default:
+            patternsize = 256;
+            break;
+        }
+        s->cirrus_blt_srcaddr &= ~(patternsize - 1);
+        if (s->cirrus_blt_srcaddr + patternsize > s->vga.vram_size) {
+            return 0;
+        }
+        src = s->vga.vram_ptr + s->cirrus_blt_srcaddr;
+    } else {
+        src = s->cirrus_bltbuf;
+    }

-    if (BLTUNSAFE(s))
+    if (blit_is_unsafe(s, true)) {
        return 0;
+    }

-    (*s->cirrus_rop) (s, dst, src,
+    (*s->cirrus_rop) (s, s->cirrus_blt_dstaddr,
+                      videosrc ? s->cirrus_blt_srcaddr : 0,
                      s->cirrus_blt_dstpitch, 0,
                      s->cirrus_blt_width, s->cirrus_blt_height);
    cirrus_invalidate_region(s, s->cirrus_blt_dstaddr,
@@ -648,10 +753,11 @@ static int cirrus_bitblt_solidfill(CirrusVGAState *s, int blt_rop)
 {
    cirrus_fill_t rop_func;

-    if (BLTUNSAFE(s))
+    if (blit_is_unsafe(s, true)) {
        return 0;
+    }
    rop_func = cirrus_fill[rop_to_index[blt_rop]][s->cirrus_blt_pixelwidth - 1];
-    rop_func(s, s->vga.vram_ptr + (s->cirrus_blt_dstaddr & s->cirrus_addr_mask),
+    rop_func(s, s->cirrus_blt_dstaddr,
             s->cirrus_blt_dstpitch,
             s->cirrus_blt_width, s->cirrus_blt_height);
    cirrus_invalidate_region(s, s->cirrus_blt_dstaddr,
@@ -669,12 +775,10 @@ static int cirrus_bitblt_solidfill(CirrusVGAState *s, int blt_rop)

 static int cirrus_bitblt_videotovideo_patterncopy(CirrusVGAState * s)
 {
-    return cirrus_bitblt_common_patterncopy(s,
-					    s->vga.vram_ptr + ((s->cirrus_blt_srcaddr & ~7) &
-                                            s->cirrus_addr_mask));
+    return cirrus_bitblt_common_patterncopy(s, true);
 }

-static void cirrus_do_copy(CirrusVGAState *s, int dst, int src, int w, int h)
+static int cirrus_do_copy(CirrusVGAState *s, int dst, int src, int w, int h)
 {
    int sx = 0, sy = 0;
    int dx = 0, dy = 0;
@@ -688,6 +792,9 @@ static void cirrus_do_copy(CirrusVGAState *s, int dst, int src, int w, int h)
        int width, height;

        depth = s->vga.get_bpp(&s->vga) / 8;
+        if (!depth) {
+            return 0;
+        }
        s->vga.get_resolution(&s->vga, &width, &height);

        /* extra x, y */
@@ -717,42 +824,35 @@ static void cirrus_do_copy(CirrusVGAState *s, int dst, int src, int w, int h)
        }
    }

-    /* we have to flush all pending changes so that the copy
-       is generated at the appropriate moment in time */
-    if (notify)
-	vga_hw_update();
-
-    (*s->cirrus_rop) (s, s->vga.vram_ptr +
-		      (s->cirrus_blt_dstaddr & s->cirrus_addr_mask),
-		      s->vga.vram_ptr +
-		      (s->cirrus_blt_srcaddr & s->cirrus_addr_mask),
+    (*s->cirrus_rop) (s, s->cirrus_blt_dstaddr,
+                      s->cirrus_blt_srcaddr,
 		      s->cirrus_blt_dstpitch, s->cirrus_blt_srcpitch,
 		      s->cirrus_blt_width, s->cirrus_blt_height);

    if (notify)
-	qemu_console_copy(s->vga.ds,
-			  sx, sy, dx, dy,
+	dpy_gfx_update(s->vga.ds, dx, dy,
 			  s->cirrus_blt_width / depth,
 			  s->cirrus_blt_height);

    /* we don't have to notify the display that this portion has
-       changed since qemu_console_copy implies this */
+       changed since dpy_gfx_update implies this */

    cirrus_invalidate_region(s, s->cirrus_blt_dstaddr,
 				s->cirrus_blt_dstpitch, s->cirrus_blt_width,
 				s->cirrus_blt_height);
+
+    return 1;
 }

 static int cirrus_bitblt_videotovideo_copy(CirrusVGAState * s)
 {
-    if (BLTUNSAFE(s))
+    if (blit_is_unsafe(s, false)) {
        return 0;
+    }

-    cirrus_do_copy(s, s->cirrus_blt_dstaddr - s->vga.start_addr,
+    return cirrus_do_copy(s, s->cirrus_blt_dstaddr - s->vga.start_addr,
            s->cirrus_blt_srcaddr - s->vga.start_addr,
            s->cirrus_blt_width, s->cirrus_blt_height);
-
-    return 1;
 }

 /***************************************
@@ -768,16 +868,15 @@ static void cirrus_bitblt_cputovideo_next(CirrusVGAState * s)

    if (s->cirrus_srccounter > 0) {
        if (s->cirrus_blt_mode & CIRRUS_BLTMODE_PATTERNCOPY) {
-            cirrus_bitblt_common_patterncopy(s, s->cirrus_bltbuf);
+            cirrus_bitblt_common_patterncopy(s, false);
        the_end:
            s->cirrus_srccounter = 0;
            cirrus_bitblt_reset(s);
        } else {
            /* at least one scan line */
            do {
-                (*s->cirrus_rop)(s, s->vga.vram_ptr +
-                                 (s->cirrus_blt_dstaddr & s->cirrus_addr_mask),
-                                  s->cirrus_bltbuf, 0, 0, s->cirrus_blt_width, 1);
+                (*s->cirrus_rop)(s, s->cirrus_blt_dstaddr,
+                                 0, 0, 0, s->cirrus_blt_width, 1);
                cirrus_invalidate_region(s, s->cirrus_blt_dstaddr, 0,
                                         s->cirrus_blt_width, 1);
                s->cirrus_blt_dstaddr += s->cirrus_blt_dstpitch;
@@ -823,6 +922,10 @@ static int cirrus_bitblt_cputovideo(CirrusVGAState * s)
 {
    int w;

+    if (blit_is_unsafe(s, true)) {
+        return 0;
+    }
+
    s->cirrus_blt_mode &= ~CIRRUS_BLTMODE_MEMSYSSRC;
    s->cirrus_srcptr = &s->cirrus_bltbuf[0];
    s->cirrus_srcptr_end = &s->cirrus_bltbuf[0];
@@ -848,6 +951,10 @@ static int cirrus_bitblt_cputovideo(CirrusVGAState * s)
 	}
        s->cirrus_srccounter = s->cirrus_blt_srcpitch * s->cirrus_blt_height;
    }
+
+    /* the blit_is_unsafe call above should catch this */
+    assert(s->cirrus_blt_srcpitch <= CIRRUS_BLTBUFSIZE);
+
    s->cirrus_srcptr = s->cirrus_bltbuf;
    s->cirrus_srcptr_end = s->cirrus_bltbuf + s->cirrus_blt_srcpitch;
    cirrus_update_memory_access(s);
@@ -895,6 +1002,9 @@ static void cirrus_bitblt_start(CirrusVGAState * s)
    s->cirrus_blt_modeext = s->vga.gr[0x33];
    blt_rop = s->vga.gr[0x32];

+    s->cirrus_blt_dstaddr &= s->cirrus_addr_mask;
+    s->cirrus_blt_srcaddr &= s->cirrus_addr_mask;
+
 #ifdef DEBUG_BITBLT
    printf("rop=0x%02x mode=0x%02x modeext=0x%02x w=%d h=%d dpitch=%d spitch=%d daddr=0x%08x saddr=0x%08x writemask=0x%02x\n",
           blt_rop,
@@ -1907,15 +2017,14 @@ static void cirrus_mem_writeb_mode4and5_8bpp(CirrusVGAState * s,
    unsigned val = mem_value;
    uint8_t *dst;

-    dst = s->vga.vram_ptr + (offset &= s->cirrus_addr_mask);
    for (x = 0; x < 8; x++) {
+        dst = s->vga.vram_ptr + ((offset + x) & s->cirrus_addr_mask);
 	if (val & 0x80) {
 	    *dst = s->cirrus_shadow_gr1;
 	} else if (mode == 5) {
 	    *dst = s->cirrus_shadow_gr0;
 	}
 	val <<= 1;
-	dst++;
    }
    memory_region_set_dirty(&s->vga.vram, offset, 8);
 }
@@ -1929,8 +2038,8 @@ static void cirrus_mem_writeb_mode4and5_16bpp(CirrusVGAState * s,
    unsigned val = mem_value;
    uint8_t *dst;

-    dst = s->vga.vram_ptr + (offset &= s->cirrus_addr_mask);
    for (x = 0; x < 8; x++) {
+        dst = s->vga.vram_ptr + ((offset + 2 * x) & s->cirrus_addr_mask & ~1);
 	if (val & 0x80) {
 	    *dst = s->cirrus_shadow_gr1;
 	    *(dst + 1) = s->vga.gr[0x11];
@@ -1939,7 +2048,6 @@ static void cirrus_mem_writeb_mode4and5_16bpp(CirrusVGAState * s,
 	    *(dst + 1) = s->vga.gr[0x10];
 	}
 	val <<= 1;
-	dst += 2;
    }
    memory_region_set_dirty(&s->vga.vram, offset, 16);
 }
--- a/hw/cirrus_vga_rop.h
+++ b/hw/cirrus_vga_rop.h
@@ -22,31 +22,65 @@
 * THE SOFTWARE.
 */

-static inline void glue(rop_8_,ROP_NAME)(uint8_t *dst, uint8_t src)
+static inline void glue(rop_8_, ROP_NAME)(CirrusVGAState *s,
+                                          uint32_t dstaddr, uint8_t src)
 {
+    uint8_t *dst = &s->vga.vram_ptr[dstaddr & s->cirrus_addr_mask];
    *dst = ROP_FN(*dst, src);
 }

-static inline void glue(rop_16_,ROP_NAME)(uint16_t *dst, uint16_t src)
+static inline void glue(rop_tr_8_, ROP_NAME)(CirrusVGAState *s,
+                                             uint32_t dstaddr, uint8_t src,
+                                             uint8_t transp)
 {
+    uint8_t *dst = &s->vga.vram_ptr[dstaddr & s->cirrus_addr_mask];
+    uint8_t pixel = ROP_FN(*dst, src);
+    if (pixel != transp) {
+        *dst = pixel;
+    }
+}
+
+static inline void glue(rop_16_, ROP_NAME)(CirrusVGAState *s,
+                                           uint32_t dstaddr, uint16_t src)
+{
+    uint16_t *dst = (uint16_t *)
+        (&s->vga.vram_ptr[dstaddr & s->cirrus_addr_mask & ~1]);
    *dst = ROP_FN(*dst, src);
 }

-static inline void glue(rop_32_,ROP_NAME)(uint32_t *dst, uint32_t src)
+static inline void glue(rop_tr_16_, ROP_NAME)(CirrusVGAState *s,
+                                              uint32_t dstaddr, uint16_t src,
+                                              uint16_t transp)
 {
+    uint16_t *dst = (uint16_t *)
+        (&s->vga.vram_ptr[dstaddr & s->cirrus_addr_mask & ~1]);
+    uint16_t pixel = ROP_FN(*dst, src);
+    if (pixel != transp) {
+        *dst = pixel;
+    }
+}
+
+static inline void glue(rop_32_, ROP_NAME)(CirrusVGAState *s,
+                                           uint32_t dstaddr, uint32_t src)
+{
+    uint32_t *dst = (uint32_t *)
+        (&s->vga.vram_ptr[dstaddr & s->cirrus_addr_mask & ~3]);
    *dst = ROP_FN(*dst, src);
 }

-#define ROP_OP(d, s) glue(rop_8_,ROP_NAME)(d, s)
-#define ROP_OP_16(d, s) glue(rop_16_,ROP_NAME)(d, s)
-#define ROP_OP_32(d, s) glue(rop_32_,ROP_NAME)(d, s)
+#define ROP_OP(st, d, s)           glue(rop_8_, ROP_NAME)(st, d, s)
+#define ROP_OP_TR(st, d, s, t)     glue(rop_tr_8_, ROP_NAME)(st, d, s, t)
+#define ROP_OP_16(st, d, s)        glue(rop_16_, ROP_NAME)(st, d, s)
+#define ROP_OP_TR_16(st, d, s, t)  glue(rop_tr_16_, ROP_NAME)(st, d, s, t)
+#define ROP_OP_32(st, d, s)        glue(rop_32_, ROP_NAME)(st, d, s)
 #undef ROP_FN

 static void
 glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusVGAState *s,
-                             uint8_t *dst,const uint8_t *src,
-                             int dstpitch,int srcpitch,
-                             int bltwidth,int bltheight)
+                                       uint32_t dstaddr,
+                                       uint32_t srcaddr,
+                                       int dstpitch, int srcpitch,
+                                       int bltwidth, int bltheight)
 {
    int x,y;
    dstpitch -= bltwidth;
@@ -59,134 +93,139 @@ glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusVGAState *s,

    for (y = 0; y < bltheight; y++) {
        for (x = 0; x < bltwidth; x++) {
-            ROP_OP(dst, *src);
-            dst++;
-            src++;
+            ROP_OP(s, dstaddr, cirrus_src(s, srcaddr));
+            dstaddr++;
+            srcaddr++;
        }
-        dst += dstpitch;
-        src += srcpitch;
+        dstaddr += dstpitch;
+        srcaddr += srcpitch;
    }
 }

 static void
 glue(cirrus_bitblt_rop_bkwd_, ROP_NAME)(CirrusVGAState *s,
-                                        uint8_t *dst,const uint8_t *src,
-                                        int dstpitch,int srcpitch,
-                                        int bltwidth,int bltheight)
+                                        uint32_t dstaddr,
+                                        uint32_t srcaddr,
+                                        int dstpitch, int srcpitch,
+                                        int bltwidth, int bltheight)
 {
    int x,y;
    dstpitch += bltwidth;
    srcpitch += bltwidth;
    for (y = 0; y < bltheight; y++) {
        for (x = 0; x < bltwidth; x++) {
-            ROP_OP(dst, *src);
-            dst--;
-            src--;
+            ROP_OP(s, dstaddr, cirrus_src(s, srcaddr));
+            dstaddr--;
+            srcaddr--;
        }
-        dst += dstpitch;
-        src += srcpitch;
+        dstaddr += dstpitch;
+        srcaddr += srcpitch;
    }
 }

 static void
 glue(glue(cirrus_bitblt_rop_fwd_transp_, ROP_NAME),_8)(CirrusVGAState *s,
-						       uint8_t *dst,const uint8_t *src,
-						       int dstpitch,int srcpitch,
-						       int bltwidth,int bltheight)
+                                                       uint32_t dstaddr,
+                                                       uint32_t srcaddr,
+                                                       int dstpitch,
+                                                       int srcpitch,
+                                                       int bltwidth,
+                                                       int bltheight)
 {
    int x,y;
-    uint8_t p;
+    uint8_t transp = s->vga.gr[0x34];
    dstpitch -= bltwidth;
    srcpitch -= bltwidth;
+
+    if (bltheight > 1 && (dstpitch < 0 || srcpitch < 0)) {
+        return;
+    }
+
    for (y = 0; y < bltheight; y++) {
        for (x = 0; x < bltwidth; x++) {
-	    p = *dst;
-            ROP_OP(&p, *src);
-	    if (p != s->vga.gr[0x34]) *dst = p;
-            dst++;
-            src++;
+            ROP_OP_TR(s, dstaddr, cirrus_src(s, srcaddr), transp);
+            dstaddr++;
+            srcaddr++;
        }
-        dst += dstpitch;
-        src += srcpitch;
+        dstaddr += dstpitch;
+        srcaddr += srcpitch;
    }
 }

 static void
 glue(glue(cirrus_bitblt_rop_bkwd_transp_, ROP_NAME),_8)(CirrusVGAState *s,
-							uint8_t *dst,const uint8_t *src,
-							int dstpitch,int srcpitch,
-							int bltwidth,int bltheight)
+                                                        uint32_t dstaddr,
+                                                        uint32_t srcaddr,
+                                                        int dstpitch,
+                                                        int srcpitch,
+                                                        int bltwidth,
+                                                        int bltheight)
 {
    int x,y;
-    uint8_t p;
+    uint8_t transp = s->vga.gr[0x34];
    dstpitch += bltwidth;
    srcpitch += bltwidth;
    for (y = 0; y < bltheight; y++) {
        for (x = 0; x < bltwidth; x++) {
-	    p = *dst;
-            ROP_OP(&p, *src);
-	    if (p != s->vga.gr[0x34]) *dst = p;
-            dst--;
-            src--;
+            ROP_OP_TR(s, dstaddr, cirrus_src(s, srcaddr), transp);
+            dstaddr--;
+            srcaddr--;
        }
-        dst += dstpitch;
-        src += srcpitch;
+        dstaddr += dstpitch;
+        srcaddr += srcpitch;
    }
 }

 static void
 glue(glue(cirrus_bitblt_rop_fwd_transp_, ROP_NAME),_16)(CirrusVGAState *s,
-							uint8_t *dst,const uint8_t *src,
-							int dstpitch,int srcpitch,
-							int bltwidth,int bltheight)
+                                                        uint32_t dstaddr,
+                                                        uint32_t srcaddr,
+                                                        int dstpitch,
+                                                        int srcpitch,
+                                                        int bltwidth,
+                                                        int bltheight)
 {
    int x,y;
-    uint8_t p1, p2;
+    uint16_t transp = s->vga.gr[0x34] | (uint16_t)s->vga.gr[0x35] << 8;
    dstpitch -= bltwidth;
    srcpitch -= bltwidth;
+
+    if (bltheight > 1 && (dstpitch < 0 || srcpitch < 0)) {
+        return;
+    }
+
    for (y = 0; y < bltheight; y++) {
        for (x = 0; x < bltwidth; x+=2) {
-	    p1 = *dst;
-	    p2 = *(dst+1);
-            ROP_OP(&p1, *src);
-            ROP_OP(&p2, *(src + 1));
-	    if ((p1 != s->vga.gr[0x34]) || (p2 != s->vga.gr[0x35])) {
-		*dst = p1;
-		*(dst+1) = p2;
-	    }
-            dst+=2;
-            src+=2;
+            ROP_OP_TR_16(s, dstaddr, cirrus_src16(s, srcaddr), transp);
+            dstaddr += 2;
+            srcaddr += 2;
        }
-        dst += dstpitch;
-        src += srcpitch;
+        dstaddr += dstpitch;
+        srcaddr += srcpitch;
    }
 }

 static void
 glue(glue(cirrus_bitblt_rop_bkwd_transp_, ROP_NAME),_16)(CirrusVGAState *s,
-							 uint8_t *dst,const uint8_t *src,
-							 int dstpitch,int srcpitch,
-							 int bltwidth,int bltheight)
+                                                         uint32_t dstaddr,
+                                                         uint32_t srcaddr,
+                                                         int dstpitch,
+                                                         int srcpitch,
+                                                         int bltwidth,
+                                                         int bltheight)
 {
    int x,y;
-    uint8_t p1, p2;
+    uint16_t transp = s->vga.gr[0x34] | (uint16_t)s->vga.gr[0x35] << 8;
    dstpitch += bltwidth;
    srcpitch += bltwidth;
    for (y = 0; y < bltheight; y++) {
        for (x = 0; x < bltwidth; x+=2) {
-	    p1 = *(dst-1);
-	    p2 = *dst;
-            ROP_OP(&p1, *(src - 1));
-            ROP_OP(&p2, *src);
-	    if ((p1 != s->vga.gr[0x34]) || (p2 != s->vga.gr[0x35])) {
-		*(dst-1) = p1;
-		*dst = p2;
-	    }
-            dst-=2;
-            src-=2;
+            ROP_OP_TR_16(s, dstaddr - 1, cirrus_src16(s, srcaddr - 1), transp);
+            dstaddr -= 2;
+            srcaddr -= 2;
        }
-        dst += dstpitch;
-        src += srcpitch;
+        dstaddr += dstpitch;
+        srcaddr += srcpitch;
    }
 }

--- a/hw/cirrus_vga_rop2.h
+++ b/hw/cirrus_vga_rop2.h
@@ -23,30 +23,32 @@
 */

 #if DEPTH == 8
-#define PUTPIXEL()    ROP_OP(&d[0], col)
+#define PUTPIXEL(s, a, c)    ROP_OP(s, a, c)
 #elif DEPTH == 16
-#define PUTPIXEL()    ROP_OP_16((uint16_t *)&d[0], col)
+#define PUTPIXEL(s, a, c)    ROP_OP_16(s, a, c)
 #elif DEPTH == 24
-#define PUTPIXEL()    ROP_OP(&d[0], col);        \
-                      ROP_OP(&d[1], (col >> 8)); \
-                      ROP_OP(&d[2], (col >> 16))
+#define PUTPIXEL(s, a, c)    do { /* 24bpp: one 8-bit ROP per byte */ \
+        ROP_OP(s, (a),     (c));           /* bits  7..0  */ \
+        ROP_OP(s, (a) + 1, ((c) >> 8));    /* bits 15..8  */ \
+        ROP_OP(s, (a) + 2, ((c) >> 16));   /* bits 23..16 */ \
+    } while (0)
 #elif DEPTH == 32
-#define PUTPIXEL()    ROP_OP_32(((uint32_t *)&d[0]), col)
+#define PUTPIXEL(s, a, c)    ROP_OP_32(s, a, c)
 #else
 #error unsupported DEPTH
 #endif

 static void
 glue(glue(glue(cirrus_patternfill_, ROP_NAME), _),DEPTH)
-     (CirrusVGAState * s, uint8_t * dst,
-      const uint8_t * src,
+     (CirrusVGAState *s, uint32_t dstaddr,
+      uint32_t srcaddr,
      int dstpitch, int srcpitch,
      int bltwidth, int bltheight)
 {
-    uint8_t *d;
+    uint32_t addr;
    int x, y, pattern_y, pattern_pitch, pattern_x;
    unsigned int col;
-    const uint8_t *src1;
+    uint32_t src1addr;
 #if DEPTH == 24
    int skipleft = s->vga.gr[0x2f] & 0x1f;
 #else
@@ -63,42 +65,44 @@ glue(glue(glue(cirrus_patternfill_, ROP_NAME), _),DEPTH)
    pattern_y = s->cirrus_blt_srcaddr & 7;
    for(y = 0; y < bltheight; y++) {
        pattern_x = skipleft;
-        d = dst + skipleft;
-        src1 = src + pattern_y * pattern_pitch;
+        addr = dstaddr + skipleft;
+        src1addr = srcaddr + pattern_y * pattern_pitch;
        for (x = skipleft; x < bltwidth; x += (DEPTH / 8)) {
 #if DEPTH == 8
-            col = src1[pattern_x];
+            col = cirrus_src(s, src1addr + pattern_x);
            pattern_x = (pattern_x + 1) & 7;
 #elif DEPTH == 16
-            col = ((uint16_t *)(src1 + pattern_x))[0];
+            col = cirrus_src16(s, src1addr + pattern_x);
            pattern_x = (pattern_x + 2) & 15;
 #elif DEPTH == 24
            {
-                const uint8_t *src2 = src1 + pattern_x * 3;
-                col = src2[0] | (src2[1] << 8) | (src2[2] << 16);
+                uint32_t src2addr = src1addr + pattern_x * 3;
+                col = cirrus_src(s, src2addr) |
+                    (cirrus_src(s, src2addr + 1) << 8) |
+                    (cirrus_src(s, src2addr + 2) << 16);
                pattern_x = (pattern_x + 1) & 7;
            }
 #else
-            col = ((uint32_t *)(src1 + pattern_x))[0];
+            col = cirrus_src32(s, src1addr + pattern_x);
            pattern_x = (pattern_x + 4) & 31;
 #endif
-            PUTPIXEL();
-            d += (DEPTH / 8);
+            PUTPIXEL(s, addr, col);
+            addr += (DEPTH / 8);
        }
        pattern_y = (pattern_y + 1) & 7;
-        dst += dstpitch;
+        dstaddr += dstpitch;
    }
 }

 /* NOTE: srcpitch is ignored */
 static void
 glue(glue(glue(cirrus_colorexpand_transp_, ROP_NAME), _),DEPTH)
-     (CirrusVGAState * s, uint8_t * dst,
-      const uint8_t * src,
+     (CirrusVGAState *s, uint32_t dstaddr,
+      uint32_t srcaddr,
      int dstpitch, int srcpitch,
      int bltwidth, int bltheight)
 {
-    uint8_t *d;
+    uint32_t addr;
    int x, y;
    unsigned bits, bits_xor;
    unsigned int col;
@@ -122,33 +126,33 @@ glue(glue(glue(cirrus_colorexpand_transp_, ROP_NAME), _),DEPTH)

    for(y = 0; y < bltheight; y++) {
        bitmask = 0x80 >> srcskipleft;
-        bits = *src++ ^ bits_xor;
-        d = dst + dstskipleft;
+        bits = cirrus_src(s, srcaddr++) ^ bits_xor;
+        addr = dstaddr + dstskipleft;
        for (x = dstskipleft; x < bltwidth; x += (DEPTH / 8)) {
            if ((bitmask & 0xff) == 0) {
                bitmask = 0x80;
-                bits = *src++ ^ bits_xor;
+                bits = cirrus_src(s, srcaddr++) ^ bits_xor;
            }
            index = (bits & bitmask);
            if (index) {
-                PUTPIXEL();
+                PUTPIXEL(s, addr, col);
            }
-            d += (DEPTH / 8);
+            addr += (DEPTH / 8);
            bitmask >>= 1;
        }
-        dst += dstpitch;
+        dstaddr += dstpitch;
    }
 }

 static void
 glue(glue(glue(cirrus_colorexpand_, ROP_NAME), _),DEPTH)
-     (CirrusVGAState * s, uint8_t * dst,
-      const uint8_t * src,
+     (CirrusVGAState *s, uint32_t dstaddr,
+      uint32_t srcaddr,
      int dstpitch, int srcpitch,
      int bltwidth, int bltheight)
 {
    uint32_t colors[2];
-    uint8_t *d;
+    uint32_t addr;
    int x, y;
    unsigned bits;
    unsigned int col;
@@ -160,30 +164,30 @@ glue(glue(glue(cirrus_colorexpand_, ROP_NAME), _),DEPTH)
    colors[1] = s->cirrus_blt_fgcol;
    for(y = 0; y < bltheight; y++) {
        bitmask = 0x80 >> srcskipleft;
-        bits = *src++;
-        d = dst + dstskipleft;
+        bits = cirrus_src(s, srcaddr++);
+        addr = dstaddr + dstskipleft;
        for (x = dstskipleft; x < bltwidth; x += (DEPTH / 8)) {
            if ((bitmask & 0xff) == 0) {
                bitmask = 0x80;
-                bits = *src++;
+                bits = cirrus_src(s, srcaddr++);
            }
            col = colors[!!(bits & bitmask)];
-            PUTPIXEL();
-            d += (DEPTH / 8);
+            PUTPIXEL(s, addr, col);
+            addr += (DEPTH / 8);
            bitmask >>= 1;
        }
-        dst += dstpitch;
+        dstaddr += dstpitch;
    }
 }

 static void
 glue(glue(glue(cirrus_colorexpand_pattern_transp_, ROP_NAME), _),DEPTH)
-     (CirrusVGAState * s, uint8_t * dst,
-      const uint8_t * src,
+     (CirrusVGAState *s, uint32_t dstaddr,
+      uint32_t srcaddr,
      int dstpitch, int srcpitch,
      int bltwidth, int bltheight)
 {
-    uint8_t *d;
+    uint32_t addr;
    int x, y, bitpos, pattern_y;
    unsigned int bits, bits_xor;
    unsigned int col;
@@ -205,30 +209,30 @@ glue(glue(glue(cirrus_colorexpand_pattern_transp_, ROP_NAME), _),DEPTH)
    pattern_y = s->cirrus_blt_srcaddr & 7;

    for(y = 0; y < bltheight; y++) {
-        bits = src[pattern_y] ^ bits_xor;
+        bits = cirrus_src(s, srcaddr + pattern_y) ^ bits_xor;
        bitpos = 7 - srcskipleft;
-        d = dst + dstskipleft;
+        addr = dstaddr + dstskipleft;
        for (x = dstskipleft; x < bltwidth; x += (DEPTH / 8)) {
            if ((bits >> bitpos) & 1) {
-                PUTPIXEL();
+                PUTPIXEL(s, addr, col);
            }
-            d += (DEPTH / 8);
+            addr += (DEPTH / 8);
            bitpos = (bitpos - 1) & 7;
        }
        pattern_y = (pattern_y + 1) & 7;
-        dst += dstpitch;
+        dstaddr += dstpitch;
    }
 }

 static void
 glue(glue(glue(cirrus_colorexpand_pattern_, ROP_NAME), _),DEPTH)
-     (CirrusVGAState * s, uint8_t * dst,
-      const uint8_t * src,
+     (CirrusVGAState *s, uint32_t dstaddr,
+      uint32_t srcaddr,
      int dstpitch, int srcpitch,
      int bltwidth, int bltheight)
 {
    uint32_t colors[2];
-    uint8_t *d;
+    uint32_t addr;
    int x, y, bitpos, pattern_y;
    unsigned int bits;
    unsigned int col;
@@ -240,40 +244,39 @@ glue(glue(glue(cirrus_colorexpand_pattern_, ROP_NAME), _),DEPTH)
    pattern_y = s->cirrus_blt_srcaddr & 7;

    for(y = 0; y < bltheight; y++) {
-        bits = src[pattern_y];
+        bits = cirrus_src(s, srcaddr + pattern_y);
        bitpos = 7 - srcskipleft;
-        d = dst + dstskipleft;
+        addr = dstaddr + dstskipleft;
        for (x = dstskipleft; x < bltwidth; x += (DEPTH / 8)) {
            col = colors[(bits >> bitpos) & 1];
-            PUTPIXEL();
-            d += (DEPTH / 8);
+            PUTPIXEL(s, addr, col);
+            addr += (DEPTH / 8);
            bitpos = (bitpos - 1) & 7;
        }
        pattern_y = (pattern_y + 1) & 7;
-        dst += dstpitch;
+        dstaddr += dstpitch;
    }
 }

 static void
 glue(glue(glue(cirrus_fill_, ROP_NAME), _),DEPTH)
     (CirrusVGAState *s,
-      uint8_t *dst, int dst_pitch,
+      uint32_t dstaddr, int dst_pitch,
      int width, int height)
 {
-    uint8_t *d, *d1;
+    uint32_t addr;
    uint32_t col;
    int x, y;

    col = s->cirrus_blt_fgcol;

-    d1 = dst;
    for(y = 0; y < height; y++) {
-        d = d1;
+        addr = dstaddr;
        for(x = 0; x < width; x += (DEPTH / 8)) {
-            PUTPIXEL();
-            d += (DEPTH / 8);
+            PUTPIXEL(s, addr, col);
+            addr += (DEPTH / 8);
        }
-        d1 += dst_pitch;
+        dstaddr += dst_pitch;
    }
 }

--- a/hw/dataplane/virtio-blk.c
+++ b/hw/dataplane/virtio-blk.c
@@ -22,6 +22,10 @@
 #include "hw/virtio-blk.h"
 #include "hw/dataplane/virtio-blk.h"

+#ifdef CONFIG_SECCOMP
+#include "sysemu/seccomp.h"
+#endif
+
 enum {
    SEG_MAX = 126,                  /* maximum number of I/O segments */
    VRING_MAX = SEG_MAX + 2,        /* maximum number of vring descriptors */
@@ -356,6 +360,10 @@ static void *data_plane_thread(void *opaque)
 {
    VirtIOBlockDataPlane *s = opaque;

+#ifdef CONFIG_SECCOMP
+    seccomp_start(!!0);
+#endif
+
    do {
        event_poll(&s->event_poll);
    } while (!s->stopping || s->num_reqs > 0);
--- a/hw/dataplane/vring.h
+++ b/hw/dataplane/vring.h
@@ -17,7 +17,7 @@
 #ifndef VRING_H
 #define VRING_H

-#include <linux/virtio_ring.h>
+#include "standard-headers/linux/virtio_ring.h"
 #include "qemu-common.h"
 #include "hw/dataplane/hostmem.h"
 #include "hw/virtio.h"
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -594,6 +594,9 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
        msh = hdr + tp->mss;
        do {
            bytes = split_size;
+            if (tp->size >= msh) {
+                goto eop;
+            }
            if (tp->size + bytes > msh)
                bytes = msh - tp->size;

@@ -608,7 +611,8 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
                memmove(tp->data, tp->header, hdr);
                tp->size = hdr;
            }
-        } while (split_size -= bytes);
+            split_size -= bytes;
+        } while (bytes && split_size);
    } else if (!tp->tse && tp->cptse) {
        // context descriptor TSE is not set, while data descriptor TSE is set
        DBGOUT(TXERR, "TCP segmentation error\n");
@@ -618,6 +622,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
        tp->size += split_size;
    }

+eop:
    if (!(txd_lower & E1000_TXD_CMD_EOP))
        return;
    if (!(tp->tse && tp->cptse && tp->size < hdr))
@@ -683,7 +688,8 @@ start_xmit(E1000State *s)
         * bogus values to TDT/TDLEN.
         * there's nothing too intelligent we could do about this.
         */
-        if (s->mac_reg[TDH] == tdh_start) {
+        if (s->mac_reg[TDH] == tdh_start ||
+            tdh_start >= s->mac_reg[TDLEN] / sizeof(desc)) {
            DBGOUT(TXERR, "TDH wraparound @%x, TDT %x, TDLEN %x\n",
                   tdh_start, s->mac_reg[TDT], s->mac_reg[TDLEN]);
            break;
@@ -888,7 +894,8 @@ e1000_receive(NetClientState *nc, const uint8_t *buf, size_t size)
        if (++s->mac_reg[RDH] * sizeof(desc) >= s->mac_reg[RDLEN])
            s->mac_reg[RDH] = 0;
        /* see comment in start_xmit; same here */
-        if (s->mac_reg[RDH] == rdh_start) {
+        if (s->mac_reg[RDH] == rdh_start ||
+            rdh_start >= s->mac_reg[RDLEN] / sizeof(desc)) {
            DBGOUT(RXERR, "RDH wraparound @%x, RDT %x, RDLEN %x\n",
                   rdh_start, s->mac_reg[RDT], s->mac_reg[RDLEN]);
            set_ics(s, 0, E1000_ICS_RXO);
--- a/hw/eepro100.c
+++ b/hw/eepro100.c
@@ -774,6 +774,11 @@ static void tx_command(EEPRO100State *s)
 #if 0
        uint16_t tx_buffer_el = lduw_le_pci_dma(&s->dev, tbd_address + 6);
 #endif
+        if (tx_buffer_size == 0) {
+            /* Prevent an endless loop. */
+            logout("loop in %s:%u\n", __FILE__, __LINE__);
+            break;
+        }
        tbd_address += 8;
        TRACE(RXTX, logout
            ("TBD (simplified mode): buffer address 0x%08x, size 0x%04x\n",
@@ -855,6 +860,10 @@ static void set_multicast_list(EEPRO100State *s)

 static void action_command(EEPRO100State *s)
 {
+    /* The loop below won't stop if it gets special handcrafted data.
+       Therefore we limit the number of iterations. */
+    unsigned max_loop_count = 16;
+
    for (;;) {
        bool bit_el;
        bool bit_s;
@@ -870,6 +879,13 @@ static void action_command(EEPRO100State *s)
 #if 0
        bool bit_sf = ((s->tx.command & COMMAND_SF) != 0);
 #endif
+
+        if (max_loop_count-- == 0) {
+            /* Prevent an endless loop. */
+            logout("loop in %s:%u\n", __FILE__, __LINE__);
+            break;
+        }
+
        s->cu_offset = s->tx.link;
        TRACE(OTHER,
              logout("val=(cu start), status=0x%04x, command=0x%04x, link=0x%08x\n",
@@ -1848,6 +1864,7 @@ static void pci_nic_uninit(PCIDevice *pci_dev)
    memory_region_destroy(&s->io_bar);
    memory_region_destroy(&s->flash_bar);
    vmstate_unregister(&pci_dev->qdev, s->vmstate, s);
+    g_free(s->vmstate);
    eeprom93xx_free(&pci_dev->qdev, s->eeprom);
    qemu_del_nic(s->nic);
 }
--- a/hw/es1370.c
+++ b/hw/es1370.c
@@ -788,6 +788,9 @@ static void es1370_transfer_audio (ES1370State *s, struct chan *d, int loop_sel,
    int csc_bytes = (csc + 1) << d->shift;
    int cnt = d->frame_cnt >> 16;
    int size = d->frame_cnt & 0xffff;
+    if (size < cnt) {
+        return;
+    }
    int left = ((size - cnt + 1) << 2) + d->leftover;
    int transferred = 0;
    int temp = audio_MIN (max, audio_MIN (left, csc_bytes));
@@ -796,7 +799,7 @@ static void es1370_transfer_audio (ES1370State *s, struct chan *d, int loop_sel,
    addr += (cnt << 2) + d->leftover;

    if (index == ADC_CHANNEL) {
-        while (temp) {
+        while (temp > 0) {
            int acquired, to_copy;

            to_copy = audio_MIN ((size_t) temp, sizeof (tmpbuf));
@@ -814,7 +817,7 @@ static void es1370_transfer_audio (ES1370State *s, struct chan *d, int loop_sel,
    else {
        SWVoiceOut *voice = s->dac_voice[index];

-        while (temp) {
+        while (temp > 0) {
            int copied, to_copy;

            to_copy = audio_MIN ((size_t) temp, sizeof (tmpbuf));
--- a/hw/escc.c
+++ b/hw/escc.c
@@ -208,7 +208,7 @@ struct SerialState {
 #define R_EXTINT 15

 static void handle_kbd_command(ChannelState *s, int val);
-static int serial_can_receive(void *opaque);
+static size_t serial_can_receive(void *opaque);
 static void serial_receive_byte(ChannelState *s, int ch);

 static void clear_queue(void *opaque)
@@ -610,10 +610,10 @@ static const MemoryRegionOps escc_mem_ops = {
    },
 };

-static int serial_can_receive(void *opaque)
+static size_t serial_can_receive(void *opaque)
 {
    ChannelState *s = opaque;
-    int ret;
+    size_t ret;

    if (((s->wregs[W_RXCTRL] & RXCTRL_RXEN) == 0) // Rx not enabled
        || ((s->rregs[R_STATUS] & STATUS_RXAV) == STATUS_RXAV))
--- a/hw/esp.c
+++ b/hw/esp.c
@@ -80,7 +80,7 @@ void esp_request_cancelled(SCSIRequest *req)
    }
 }

-static uint32_t get_cmd(ESPState *s, uint8_t *buf)
+static uint32_t get_cmd(ESPState *s, uint8_t *buf, uint8_t buflen)
 {
    uint32_t dmalen;
    int target;
@@ -90,6 +90,9 @@ static uint32_t get_cmd(ESPState *s, uint8_t *buf)
        dmalen = s->rregs[ESP_TCLO];
        dmalen |= s->rregs[ESP_TCMID] << 8;
        dmalen |= s->rregs[ESP_TCHI] << 16;
+        if (dmalen > buflen) {
+            return 0;
+        }
        s->dma_memory_read(s->dma_opaque, buf, dmalen);
    } else {
        dmalen = s->ti_size;
@@ -164,7 +167,7 @@ static void handle_satn(ESPState *s)
        s->dma_cb = handle_satn;
        return;
    }
-    len = get_cmd(s, buf);
+    len = get_cmd(s, buf, sizeof(buf));
    if (len)
        do_cmd(s, buf);
 }
@@ -178,7 +181,7 @@ static void handle_s_without_atn(ESPState *s)
        s->dma_cb = handle_s_without_atn;
        return;
    }
-    len = get_cmd(s, buf);
+    len = get_cmd(s, buf, sizeof(buf));
    if (len) {
        do_busid_cmd(s, buf, 0);
    }
@@ -190,7 +193,7 @@ static void handle_satn_stop(ESPState *s)
        s->dma_cb = handle_satn_stop;
        return;
    }
-    s->cmdlen = get_cmd(s, s->cmdbuf);
+    s->cmdlen = get_cmd(s, s->cmdbuf, sizeof(s->cmdbuf));
    if (s->cmdlen) {
        trace_esp_handle_satn_stop(s->cmdlen);
        s->do_cmd = 1;
@@ -214,7 +217,7 @@ static void write_response(ESPState *s)
    } else {
        s->ti_size = 2;
        s->ti_rptr = 0;
-        s->ti_wptr = 0;
+        s->ti_wptr = 2;
        s->rregs[ESP_RFLAGS] = 2;
    }
    esp_raise_irq(s);
@@ -395,19 +398,17 @@ uint64_t esp_reg_read(ESPState *s, uint32_t saddr)
    trace_esp_mem_readb(saddr, s->rregs[saddr]);
    switch (saddr) {
    case ESP_FIFO:
-        if (s->ti_size > 0) {
+        if ((s->rregs[ESP_RSTAT] & STAT_PIO_MASK) == 0) {
+            /* Data out.  */
+            qemu_log_mask(LOG_UNIMP, "esp: PIO data read not implemented\n");
+            s->rregs[ESP_FIFO] = 0;
+            esp_raise_irq(s);
+        } else if (s->ti_rptr < s->ti_wptr) {
            s->ti_size--;
-            if ((s->rregs[ESP_RSTAT] & STAT_PIO_MASK) == 0) {
-                /* Data out.  */
-                qemu_log_mask(LOG_UNIMP,
-                              "esp: PIO data read not implemented\n");
-                s->rregs[ESP_FIFO] = 0;
-            } else {
-                s->rregs[ESP_FIFO] = s->ti_buf[s->ti_rptr++];
-            }
+            s->rregs[ESP_FIFO] = s->ti_buf[s->ti_rptr++];
            esp_raise_irq(s);
        }
-        if (s->ti_size == 0) {
+        if (s->ti_rptr == s->ti_wptr) {
            s->ti_rptr = 0;
            s->ti_wptr = 0;
        }
@@ -439,8 +440,12 @@ void esp_reg_write(ESPState *s, uint32_t saddr, uint64_t val)
        break;
    case ESP_FIFO:
        if (s->do_cmd) {
-            s->cmdbuf[s->cmdlen++] = val & 0xff;
-        } else if (s->ti_size == TI_BUFSZ - 1) {
+            if (s->cmdlen < TI_BUFSZ) {
+                s->cmdbuf[s->cmdlen++] = val & 0xff;
+            } else {
+                trace_esp_error_fifo_overrun();
+            }
+        } else if (s->ti_wptr == TI_BUFSZ - 1) {
            trace_esp_error_fifo_overrun();
        } else {
            s->ti_size++;
--- a/hw/etraxfs_ser.c
+++ b/hw/etraxfs_ser.c
@@ -156,7 +156,7 @@ static const MemoryRegionOps ser_ops = {
    }
 };

-static void serial_receive(void *opaque, const uint8_t *buf, int size)
+static void serial_receive(void *opaque, const uint8_t *buf, size_t size)
 {
    struct etrax_serial *s = opaque;
    int i;
@@ -177,10 +177,10 @@ static void serial_receive(void *opaque, const uint8_t *buf, int size)
    ser_update_irq(s);
 }

-static int serial_can_receive(void *opaque)
+static size_t serial_can_receive(void *opaque)
 {
    struct etrax_serial *s = opaque;
-    int r;
+    size_t r;

    /* Is the receiver enabled?  */
    if (!(s->regs[RW_REC_CTRL] & (1 << 3))) {
--- a/hw/exynos4210_uart.c
+++ b/hw/exynos4210_uart.c
@@ -484,7 +484,7 @@ static const MemoryRegionOps exynos4210_uart_ops = {
    },
 };

-static int exynos4210_uart_can_receive(void *opaque)
+static size_t exynos4210_uart_can_receive(void *opaque)
 {
    Exynos4210UartState *s = (Exynos4210UartState *)opaque;

--- a/hw/fdc.c
+++ b/hw/fdc.c
@@ -1432,7 +1432,7 @@ static uint32_t fdctrl_read_data(FDCtrl *fdctrl)
 {
    FDrive *cur_drv;
    uint32_t retval = 0;
-    int pos;
+    uint32_t pos;

    cur_drv = get_cur_drv(fdctrl);
    fdctrl->dsr &= ~FD_DSR_PWRDOWN;
@@ -1441,8 +1441,8 @@ static uint32_t fdctrl_read_data(FDCtrl *fdctrl)
        return 0;
    }
    pos = fdctrl->data_pos;
+    pos %= FD_SECTOR_LEN;
    if (fdctrl->msr & FD_MSR_NONDMA) {
-        pos %= FD_SECTOR_LEN;
        if (pos == 0) {
            if (fdctrl->data_pos != 0)
                if (!fdctrl_seek_to_next_sect(fdctrl, cur_drv)) {
@@ -1786,10 +1786,13 @@ static void fdctrl_handle_option(FDCtrl *fdctrl, int direction)
 static void fdctrl_handle_drive_specification_command(FDCtrl *fdctrl, int direction)
 {
    FDrive *cur_drv = get_cur_drv(fdctrl);
+    uint32_t pos;

-    if (fdctrl->fifo[fdctrl->data_pos - 1] & 0x80) {
+    pos = fdctrl->data_pos - 1;
+    pos %= FD_SECTOR_LEN;
+    if (fdctrl->fifo[pos] & 0x80) {
        /* Command parameters done */
-        if (fdctrl->fifo[fdctrl->data_pos - 1] & 0x40) {
+        if (fdctrl->fifo[pos] & 0x40) {
            fdctrl->fifo[0] = fdctrl->fifo[1];
            fdctrl->fifo[2] = 0;
            fdctrl->fifo[3] = 0;
@@ -1889,7 +1892,7 @@ static uint8_t command_to_handler[256];
 static void fdctrl_write_data(FDCtrl *fdctrl, uint32_t value)
 {
    FDrive *cur_drv;
-    int pos;
+    uint32_t pos;

    /* Reset mode */
    if (!(fdctrl->dor & FD_DOR_nRESET)) {
@@ -1937,7 +1940,9 @@ static void fdctrl_write_data(FDCtrl *fdctrl, uint32_t value)
    }

    FLOPPY_DPRINTF("%s: %02x\n", __func__, value);
-    fdctrl->fifo[fdctrl->data_pos++] = value;
+    pos = fdctrl->data_pos++;
+    pos %= FD_SECTOR_LEN;
+    fdctrl->fifo[pos] = value;
    if (fdctrl->data_pos == fdctrl->data_len) {
        /* We now have all parameters
         * and will be able to treat the command
--- a/hw/fw_cfg.c
+++ b/hw/fw_cfg.c
@@ -203,12 +203,15 @@ static void fw_cfg_reboot(FWCfgState *s)
 static void fw_cfg_write(FWCfgState *s, uint8_t value)
 {
    int arch = !!(s->cur_entry & FW_CFG_ARCH_LOCAL);
-    FWCfgEntry *e = &s->entries[arch][s->cur_entry & FW_CFG_ENTRY_MASK];
+    FWCfgEntry *e = (s->cur_entry == FW_CFG_INVALID) ? NULL :
+                     &s->entries[arch][s->cur_entry & FW_CFG_ENTRY_MASK];

    trace_fw_cfg_write(s, value);

-    if (s->cur_entry & FW_CFG_WRITE_CHANNEL && e->callback &&
-        s->cur_offset < e->len) {
+    if (s->cur_entry & FW_CFG_WRITE_CHANNEL
+        && e != NULL
+        && e->callback
+        && s->cur_offset < e->len) {
        e->data[s->cur_offset++] = value;
        if (s->cur_offset == e->len) {
            e->callback(e->callback_opaque, e->data);
@@ -237,7 +240,8 @@ static int fw_cfg_select(FWCfgState *s, uint16_t key)
 static uint8_t fw_cfg_read(FWCfgState *s)
 {
    int arch = !!(s->cur_entry & FW_CFG_ARCH_LOCAL);
-    FWCfgEntry *e = &s->entries[arch][s->cur_entry & FW_CFG_ENTRY_MASK];
+    FWCfgEntry *e = (s->cur_entry == FW_CFG_INVALID) ? NULL :
+                     &s->entries[arch][s->cur_entry & FW_CFG_ENTRY_MASK];
    uint8_t ret;

    if (s->cur_entry == FW_CFG_INVALID || !e->data || s->cur_offset >= e->len)
--- a/hw/grlib_apbuart.c
+++ b/hw/grlib_apbuart.c
@@ -125,14 +125,14 @@ static void uart_add_to_fifo(UART          *uart,
    uart->len += length;
 }

-static int grlib_apbuart_can_receive(void *opaque)
+static size_t grlib_apbuart_can_receive(void *opaque)
 {
    UART *uart = opaque;

    return FIFO_LENGTH - uart->len;
 }

-static void grlib_apbuart_receive(void *opaque, const uint8_t *buf, int size)
+static void grlib_apbuart_receive(void *opaque, const uint8_t *buf, size_t size)
 {
    UART *uart = opaque;

--- a/hw/hpet.c
+++ b/hw/hpet.c
@@ -222,6 +222,18 @@ static int hpet_pre_load(void *opaque)
    return 0;
 }

+static bool hpet_validate_num_timers(void *opaque, int version_id)
+{
+    HPETState *s = opaque;
+    /* Valid only if HPET_MIN_TIMERS <= num_timers <= HPET_MAX_TIMERS. */
+    if (s->num_timers < HPET_MIN_TIMERS) {
+        return false;
+    } else if (s->num_timers > HPET_MAX_TIMERS) {
+        return false;
+    }
+    return true; /* version_id is not consulted */
+}
+
 static int hpet_post_load(void *opaque, int version_id)
 {
    HPETState *s = opaque;
@@ -290,6 +302,7 @@ static const VMStateDescription vmstate_hpet = {
        VMSTATE_UINT64(isr, HPETState),
        VMSTATE_UINT64(hpet_counter, HPETState),
        VMSTATE_UINT8_V(num_timers, HPETState, 2),
+        VMSTATE_VALIDATE("num_timers in range", hpet_validate_num_timers),
        VMSTATE_STRUCT_VARRAY_UINT8(timer, HPETState, num_timers, 0,
                                    vmstate_hpet_timer, HPETTimer),
        VMSTATE_END_OF_LIST()
--- a/hw/i8254.c
+++ b/hw/i8254.c
@@ -187,6 +187,12 @@ static uint64_t pit_ioport_read(void *opaque, hwaddr addr,
    PITChannelState *s;

    addr &= 3;
+
+    if (addr == 3) {
+        /* Mode/Command register is write only, read is ignored */
+        return 0;
+    }
+
    s = &pit->channels[addr];
    if (s->status_latched) {
        s->status_latched = 0;
--- a/hw/i8254_common.c
+++ b/hw/i8254_common.c
@@ -266,6 +266,12 @@ static int pit_dispatch_post_load(void *opaque, int version_id)
    return 0;
 }

+static bool is_qemu_kvm(void *opaque, int version_id)
+{
+    /* HACK: We ignore incoming migration from upstream qemu */
+    return version_id < 3; /* opaque unused; true only for versions below 3 */
+}
+
 static const VMStateDescription vmstate_pit_common = {
    .name = "i8254",
    .version_id = 3,
@@ -275,6 +281,7 @@ static const VMStateDescription vmstate_pit_common = {
    .pre_save = pit_dispatch_pre_save,
    .post_load = pit_dispatch_post_load,
    .fields = (VMStateField[]) {
+        VMSTATE_UNUSED_TEST(is_qemu_kvm, 4),
        VMSTATE_UINT32_V(channels[0].irq_disabled, PITCommonState, 3),
        VMSTATE_STRUCT_ARRAY(channels, PITCommonState, 3, 2,
                             vmstate_pit_channel, PITChannelState),
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -636,7 +636,8 @@ static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis)
    }
 }

-static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, int offset)
+static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist,
+                                int32_t offset)
 {
    AHCICmdHdr *cmd = ad->cur_cmd;
    uint32_t opts = le32_to_cpu(cmd->opts);
@@ -647,11 +648,19 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, int offset)
    uint8_t *prdt;
    int i;
    int r = 0;
-    int sum = 0;
+    uint64_t sum = 0;
    int off_idx = -1;
-    int off_pos = -1;
+    int64_t off_pos = -1;
    int tbl_entry_size;

+    /*
+     * Note: AHCI PRDT can describe up to 256GiB. SATA/ATA only support
+     * transactions of up to 32MiB as of ATA8-ACS3 rev 1b, assuming a
+     * 512 byte sector size. We limit the PRDT in this implementation to
+     * a reasonably large 2GiB, which can accommodate the maximum transfer
+     * request for sector sizes up to 32K.
+     */
+
    if (!sglist_alloc_hint) {
        DPRINTF(ad->port_no, "no sg list given by guest: 0x%08x\n", opts);
        return -1;
@@ -686,7 +695,7 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, int offset)
        }
        if ((off_idx == -1) || (off_pos < 0) || (off_pos > tbl_entry_size)) {
            DPRINTF(ad->port_no, "%s: Incorrect offset! "
-                            "off_idx: %d, off_pos: %d\n",
+                            "off_idx: %d, off_pos: %"PRId64"\n",
                            __func__, off_idx, off_pos);
            r = -1;
            goto out;
@@ -700,6 +709,13 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, int offset)
            /* flags_size is zero-based */
            qemu_sglist_add(sglist, le64_to_cpu(tbl[i].addr),
                            le32_to_cpu(tbl[i].flags_size) + 1);
+            if (sglist->size > INT32_MAX) {
+                error_report("AHCI Physical Region Descriptor Table describes "
+                             "more than 2 GiB.\n");
+                qemu_sglist_destroy(sglist);
+                r = -1;
+                goto out;
+            }
        }
    }

@@ -709,6 +725,16 @@ out:
    return r;
 }

+static void ncq_err(NCQTransferState *ncq_tfs)
+{
+    IDEState *ide_state = &ncq_tfs->drive->port.ifs[0];
+    /* Flag ABRT_ERR, set this tag's bit in scr_err, mark the slot unused. */
+    ide_state->error = ABRT_ERR;
+    ide_state->status = READY_STAT | ERR_STAT;
+    ncq_tfs->drive->port_regs.scr_err |= (1 << ncq_tfs->tag);
+    ncq_tfs->used = 0;
+}
+
 static void ncq_cb(void *opaque, int ret)
 {
    NCQTransferState *ncq_tfs = (NCQTransferState *)opaque;
@@ -718,10 +744,7 @@ static void ncq_cb(void *opaque, int ret)
    ncq_tfs->drive->port_regs.scr_act &= ~(1 << ncq_tfs->tag);

    if (ret < 0) {
-        /* error */
-        ide_state->error = ABRT_ERR;
-        ide_state->status = READY_STAT | ERR_STAT;
-        ncq_tfs->drive->port_regs.scr_err |= (1 << ncq_tfs->tag);
+        ncq_err(ncq_tfs);
    } else {
        ide_state->status = READY_STAT | SEEK_STAT;
    }
@@ -1043,16 +1066,19 @@ static void ahci_start_dma(IDEDMA *dma, IDEState *s,
    dma_cb(s, 0);
 }

-static int ahci_dma_prepare_buf(IDEDMA *dma, int is_write)
+static int32_t ahci_dma_prepare_buf(IDEDMA *dma, int is_write)
 {
    AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
    IDEState *s = &ad->port.ifs[0];

-    ahci_populate_sglist(ad, &s->sg, 0);
+    if (ahci_populate_sglist(ad, &s->sg, s->io_buffer_offset) == -1) {
+        DPRINTF(ad->port_no, "ahci_dma_prepare_buf failed.\n");
+        return -1; /* the sglist could not be populated */
+    }
    s->io_buffer_size = s->sg.size;

    DPRINTF(ad->port_no, "len=%#x\n", s->io_buffer_size);
-    return s->io_buffer_size != 0;
+    return s->io_buffer_size; /* = s->sg.size, no longer collapsed to 0/1 */
 }

 static int ahci_dma_rw_buf(IDEDMA *dma, int is_write)
@@ -1104,10 +1130,15 @@ static int ahci_dma_add_status(IDEDMA *dma, int status)
 }

 static int ahci_dma_set_inactive(IDEDMA *dma)
+{
+    return 0;
+}
+
+static int ahci_async_cmd_done(IDEDMA *dma)
 {
    AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);

-    DPRINTF(ad->port_no, "dma done\n");
+    DPRINTF(ad->port_no, "async cmd done\n");

    /* update d2h status */
    ahci_write_fis_d2h(ad, NULL);
@@ -1142,6 +1173,7 @@ static const IDEDMAOps ahci_dma_ops = {
    .set_unit = ahci_dma_set_unit,
    .add_status = ahci_dma_add_status,
    .set_inactive = ahci_dma_set_inactive,
+    .async_cmd_done = ahci_async_cmd_done,
    .restart_cb = ahci_dma_restart_cb,
    .reset = ahci_dma_reset,
 };
@@ -1176,6 +1208,18 @@ void ahci_init(AHCIState *s, DeviceState *qdev, DMAContext *dma, int ports)

 void ahci_uninit(AHCIState *s)
 {
+    int i, j;
+
+    for (i = 0; i < s->ports; i++) {
+        AHCIDevice *ad = &s->dev[i];
+
+        for (j = 0; j < 2; j++) {
+            IDEState *s = &ad->port.ifs[j];
+
+            ide_exit(s);
+        }
+    }
+
    memory_region_destroy(&s->mem);
    memory_region_destroy(&s->idp);
    g_free(s->dev);
@@ -1270,7 +1314,7 @@ const VMStateDescription vmstate_ahci = {
        VMSTATE_UINT32(control_regs.impl, AHCIState),
        VMSTATE_UINT32(control_regs.version, AHCIState),
        VMSTATE_UINT32(idp_index, AHCIState),
-        VMSTATE_INT32(ports, AHCIState),
+        VMSTATE_INT32_EQUAL(ports, AHCIState),
        VMSTATE_END_OF_LIST()
    },
 };
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -231,6 +231,8 @@ void ide_atapi_cmd_reply_end(IDEState *s)
            s->packet_transfer_size -= size;
            s->elementary_transfer_size -= size;
            s->io_buffer_index += size;
+            assert(size <= s->io_buffer_total_len);
+            assert(s->io_buffer_index <= s->io_buffer_total_len);
            ide_transfer_start(s, s->io_buffer + s->io_buffer_index - size,
                               size, ide_atapi_cmd_reply_end);
            ide_set_irq(s->bus);
@@ -879,6 +881,7 @@ static void cmd_start_stop_unit(IDEState *s, uint8_t* buf)

    if (pwrcnd) {
        /* eject/load only happens for power condition == 0 */
+        ide_atapi_cmd_ok(s);
        return;
    }

--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -568,10 +568,18 @@ static void dma_buf_commit(IDEState *s)
    qemu_sglist_destroy(&s->sg);
 }

+static void ide_async_cmd_done(IDEState *s)
+{
+    if (s->bus->dma->ops->async_cmd_done) { /* hook may be left NULL */
+        s->bus->dma->ops->async_cmd_done(s->bus->dma);
+    }
+}
+
 void ide_set_inactive(IDEState *s)
 {
    s->bus->dma->aiocb = NULL;
    s->bus->dma->ops->set_inactive(s->bus->dma);
+    ide_async_cmd_done(s);
 }

 void ide_dma_error(IDEState *s)
@@ -651,10 +659,11 @@ void ide_dma_cb(void *opaque, int ret)
    n = s->nsector;
    s->io_buffer_index = 0;
    s->io_buffer_size = n * 512;
-    if (s->bus->dma->ops->prepare_buf(s->bus->dma, ide_cmd_is_read(s)) == 0) {
+    if (s->bus->dma->ops->prepare_buf(s->bus->dma, ide_cmd_is_read(s)) < 512) {
        /* The PRDs were too short. Reset the Active bit, but don't raise an
         * interrupt. */
        s->status = READY_STAT | SEEK_STAT;
+        dma_buf_commit(s);
        goto eot;
    }

@@ -804,6 +813,7 @@ static void ide_flush_cb(void *opaque, int ret)

    bdrv_acct_done(s->bs, &s->acct);
    s->status = READY_STAT | SEEK_STAT;
+    ide_async_cmd_done(s);
    ide_set_irq(s->bus);
 }

@@ -814,6 +824,7 @@ void ide_flush_cache(IDEState *s)
        return;
    }

+    s->status |= BUSY_STAT;
    bdrv_acct_start(s->bs, &s->acct, 0, BDRV_ACCT_FLUSH);
    bdrv_aio_flush(s->bs, ide_flush_cb, s);
 }
@@ -1013,11 +1024,11 @@ void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
 static const uint8_t ide_cmd_table[0x100] = {
    /* NOP not implemented, mandatory for CD */
    [CFA_REQ_EXT_ERROR_CODE]            = CFA_OK,
-    [WIN_DSM]                           = ALL_OK,
+    [WIN_DSM]                           = HD_CFA_OK,
    [WIN_DEVICE_RESET]                  = CD_OK,
    [WIN_RECAL]                         = HD_CFA_OK,
    [WIN_READ]                          = ALL_OK,
-    [WIN_READ_ONCE]                     = ALL_OK,
+    [WIN_READ_ONCE]                     = HD_CFA_OK,
    [WIN_READ_EXT]                      = HD_CFA_OK,
    [WIN_READDMA_EXT]                   = HD_CFA_OK,
    [WIN_READ_NATIVE_MAX_EXT]           = HD_CFA_OK,
@@ -1036,12 +1047,12 @@ static const uint8_t ide_cmd_table[0x100] = {
    [CFA_TRANSLATE_SECTOR]              = CFA_OK,
    [WIN_DIAGNOSE]                      = ALL_OK,
    [WIN_SPECIFY]                       = HD_CFA_OK,
-    [WIN_STANDBYNOW2]                   = ALL_OK,
-    [WIN_IDLEIMMEDIATE2]                = ALL_OK,
-    [WIN_STANDBY2]                      = ALL_OK,
-    [WIN_SETIDLE2]                      = ALL_OK,
-    [WIN_CHECKPOWERMODE2]               = ALL_OK,
-    [WIN_SLEEPNOW2]                     = ALL_OK,
+    [WIN_STANDBYNOW2]                   = HD_CFA_OK,
+    [WIN_IDLEIMMEDIATE2]                = HD_CFA_OK,
+    [WIN_STANDBY2]                      = HD_CFA_OK,
+    [WIN_SETIDLE2]                      = HD_CFA_OK,
+    [WIN_CHECKPOWERMODE2]               = HD_CFA_OK,
+    [WIN_SLEEPNOW2]                     = HD_CFA_OK,
    [WIN_PACKETCMD]                     = CD_OK,
    [WIN_PIDENTIFY]                     = CD_OK,
    [WIN_SMART]                         = HD_CFA_OK,
@@ -1055,19 +1066,19 @@ static const uint8_t ide_cmd_table[0x100] = {
    [WIN_WRITEDMA]                      = HD_CFA_OK,
    [WIN_WRITEDMA_ONCE]                 = HD_CFA_OK,
    [CFA_WRITE_MULTI_WO_ERASE]          = CFA_OK,
-    [WIN_STANDBYNOW1]                   = ALL_OK,
-    [WIN_IDLEIMMEDIATE]                 = ALL_OK,
-    [WIN_STANDBY]                       = ALL_OK,
-    [WIN_SETIDLE1]                      = ALL_OK,
-    [WIN_CHECKPOWERMODE1]               = ALL_OK,
-    [WIN_SLEEPNOW1]                     = ALL_OK,
+    [WIN_STANDBYNOW1]                   = HD_CFA_OK,
+    [WIN_IDLEIMMEDIATE]                 = HD_CFA_OK,
+    [WIN_STANDBY]                       = HD_CFA_OK,
+    [WIN_SETIDLE1]                      = HD_CFA_OK,
+    [WIN_CHECKPOWERMODE1]               = HD_CFA_OK,
+    [WIN_SLEEPNOW1]                     = HD_CFA_OK,
    [WIN_FLUSH_CACHE]                   = ALL_OK,
    [WIN_FLUSH_CACHE_EXT]               = HD_CFA_OK,
    [WIN_IDENTIFY]                      = ALL_OK,
    [WIN_SETFEATURES]                   = ALL_OK,
    [IBM_SENSE_CONDITION]               = CFA_OK,
    [CFA_WEAR_LEVEL]                    = HD_CFA_OK,
-    [WIN_READ_NATIVE_MAX]               = ALL_OK,
+    [WIN_READ_NATIVE_MAX]               = HD_CFA_OK,
 };

 static bool ide_cmd_permitted(IDEState *s, uint32_t cmd)
@@ -1603,7 +1614,7 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
 		case 2: /* extended self test */
 		s->smart_selftest_count++;
 		if(s->smart_selftest_count > 21)
-			s->smart_selftest_count = 0;
+			s->smart_selftest_count = 1;
 		n = 2 + (s->smart_selftest_count - 1) * 24;
 		s->smart_selftest_data[n] = s->sector;
 		s->smart_selftest_data[n+1] = 0x00; /* OK and finished */
@@ -1790,11 +1801,17 @@ void ide_data_writew(void *opaque, uint32_t addr, uint32_t val)
    }

    p = s->data_ptr;
+    if (p + 2 > s->data_end) {
+        return;
+    }
+
    *(uint16_t *)p = le16_to_cpu(val);
    p += 2;
    s->data_ptr = p;
-    if (p >= s->data_end)
+    if (p >= s->data_end) {
+        s->status &= ~DRQ_STAT;
        s->end_transfer_func(s);
+    }
 }

 uint32_t ide_data_readw(void *opaque, uint32_t addr)
@@ -1811,11 +1828,17 @@ uint32_t ide_data_readw(void *opaque, uint32_t addr)
    }

    p = s->data_ptr;
+    if (p + 2 > s->data_end) {
+        return 0;
+    }
+
    ret = cpu_to_le16(*(uint16_t *)p);
    p += 2;
    s->data_ptr = p;
-    if (p >= s->data_end)
+    if (p >= s->data_end) {
+        s->status &= ~DRQ_STAT;
        s->end_transfer_func(s);
+    }
    return ret;
 }

@@ -1832,11 +1855,17 @@ void ide_data_writel(void *opaque, uint32_t addr, uint32_t val)
    }

    p = s->data_ptr;
+    if (p + 4 > s->data_end) {
+        return;
+    }
+
    *(uint32_t *)p = le32_to_cpu(val);
    p += 4;
    s->data_ptr = p;
-    if (p >= s->data_end)
+    if (p >= s->data_end) {
+        s->status &= ~DRQ_STAT;
        s->end_transfer_func(s);
+    }
 }

 uint32_t ide_data_readl(void *opaque, uint32_t addr)
@@ -1853,11 +1882,17 @@ uint32_t ide_data_readl(void *opaque, uint32_t addr)
    }

    p = s->data_ptr;
+    if (p + 4 > s->data_end) {
+        return 0;
+    }
+
    ret = cpu_to_le32(*(uint32_t *)p);
    p += 4;
    s->data_ptr = p;
-    if (p >= s->data_end)
+    if (p >= s->data_end) {
+        s->status &= ~DRQ_STAT;
        s->end_transfer_func(s);
+    }
    return ret;
 }

@@ -2072,6 +2107,11 @@ static int ide_nop_int(IDEDMA *dma, int x)
    return 0;
 }

+static int32_t ide_nop_int32(IDEDMA *dma, int x)
+{
+    return 0; /* no-op; installed as ide_dma_nop_ops.prepare_buf */
+}
+
 static void ide_nop_restart(void *opaque, int x, RunState y)
 {
 }
@@ -2079,7 +2119,7 @@ static void ide_nop_restart(void *opaque, int x, RunState y)
 static const IDEDMAOps ide_dma_nop_ops = {
    .start_dma      = ide_nop_start,
    .start_transfer = ide_nop,
-    .prepare_buf    = ide_nop_int,
+    .prepare_buf    = ide_nop_int32,
    .rw_buf         = ide_nop_int,
    .set_unit       = ide_nop_int,
    .add_status     = ide_nop_int,
@@ -2154,6 +2194,14 @@ void ide_init2_with_non_qdev_drives(IDEBus *bus, DriveInfo *hd0,
    bus->dma = &ide_dma_nop;
 }

+void ide_exit(IDEState *s)
+{
+    qemu_del_timer(s->sector_write_timer);
+    qemu_free_timer(s->sector_write_timer);
+    qemu_vfree(s->smart_selftest_data);
+    qemu_vfree(s->io_buffer); /* NOTE(review): s itself is not freed here */
+}
+
 static const MemoryRegionPortio ide_portio_list[] = {
    { 0, 8, 1, .read = ide_ioport_read, .write = ide_ioport_write },
    { 0, 2, 2, .read = ide_data_readw, .write = ide_data_writew },
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .4.0
 .4.2