vnc: add configurable keyboard delay

Limits the rate kbd events from the vnc server are forwarded to the guest, so input devices which are typically low-bandwidth can keep up even on bulky input. v2: update documentation too. v3: spell fixes. Signed-off-by: Gerd Hoffmann <kraxel@redhat.com> Tested-by: Yang Hongyang <hongyang.yang@easystack.cn> Message-id: 1464762150-25817-1-git-send-email-kraxel@redhat.com
sdl2: skip init without outputs
2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00 · 2016-06-03 08:23:26 +02:00
229 changed files with 15997 additions and 6697 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -108,4 +108,5 @@
 cscope.*
 tags
 TAGS
+docker-src.*
 *~
--- a/26
+++ b/26
@@ -165,6 +165,7 @@ F: hw/openrisc/
 F: tests/tcg/openrisc/

 PowerPC
+M: David Gibson <david@gibson.dropbear.id.au>
 M: Alexander Graf <agraf@suse.de>
 L: qemu-ppc@nongnu.org
 S: Maintained
@@ -953,6 +954,14 @@ S: Maintained
 F: hw/*/xilinx_*
 F: include/hw/xilinx.h

+Network packet abstractions
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: include/net/eth.h
+F: net/eth.c
+F: hw/net/net_rx_pkt*
+F: hw/net/net_tx_pkt*
+
 Vmware
 M: Dmitry Fleytman <dmitry@daynix.com>
 S: Maintained
@@ -972,6 +981,16 @@ F: hw/acpi/nvdimm.c
 F: hw/mem/nvdimm.c
 F: include/hw/mem/nvdimm.h

+e1000x
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: hw/net/e1000x*
+
+e1000e
+M: Dmitry Fleytman <dmitry@daynix.com>
+S: Maintained
+F: hw/net/e1000e*
+
 Subsystems
 ----------
 Audio
@@ -1614,3 +1633,10 @@ Build system architecture
 M: Daniel P. Berrange <berrange@redhat.com>
 S: Odd Fixes
 F: docs/build-system.txt
+
+Docker testing
+--------------
+Docker based testing framework and cases
+M: Fam Zheng <famz@redhat.com>
+S: Maintained
+F: tests/docker/
--- a/16
+++ b/16
@@ -6,7 +6,7 @@ BUILD_DIR=$(CURDIR)
 # Before including a proper config-host.mak, assume we are in the source tree
 SRC_PATH=.

-UNCHECKED_GOALS := %clean TAGS cscope ctags
+UNCHECKED_GOALS := %clean TAGS cscope ctags docker docker-%

 # All following code might depend on configuration variables
 ifneq ($(wildcard config-host.mak),)
@@ -30,7 +30,6 @@ CONFIG_ALL=y
 -include config-all-devices.mak
 -include config-all-disas.mak

-include $(SRC_PATH)/rules.mak
 config-host.mak: $(SRC_PATH)/configure
 	@echo $@ is out-of-date, running configure
 	@# TODO: The next lines include code which supports a smooth
@@ -49,6 +48,8 @@ ifneq ($(filter-out $(UNCHECKED_GOALS),$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fa
 endif
 endif

+include $(SRC_PATH)/rules.mak
+
 GENERATED_HEADERS = config-host.h qemu-options.def
 GENERATED_HEADERS += qmp-commands.h qapi-types.h qapi-visit.h qapi-event.h
 GENERATED_SOURCES += qmp-marshal.c qapi-types.c qapi-visit.c qapi-event.c
@@ -92,9 +93,6 @@ HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF)
 ifdef BUILD_DOCS
 DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8
 DOCS+=qmp-commands.txt
-ifdef CONFIG_LINUX
-DOCS+=kvm_stat.1
-endif
 ifdef CONFIG_VIRTFS
 DOCS+=fsdev/virtfs-proxy-helper.1
 endif
@@ -571,12 +569,6 @@ qemu-ga.8: qemu-ga.texi
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-ga.pod > $@, \
 	  "  GEN   $@")

-kvm_stat.1: scripts/kvm/kvm_stat.texi
-	$(call quiet-command, \
-	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< kvm_stat.pod && \
-	  $(POD2MAN) --section=1 --center=" " --release=" " kvm_stat.pod > $@, \
-	  "  GEN   $@")
-
 dvi: qemu-doc.dvi qemu-tech.dvi
 html: qemu-doc.html qemu-tech.html
 info: qemu-doc.info qemu-tech.info
@@ -652,3 +644,5 @@ endif
 # Include automatically generated dependency files
 # Dependencies in Makefile.objs files come from our recursive subdir rules
 -include $(wildcard *.d tests/*.d)
+
+include $(SRC_PATH)/tests/docker/Makefile.include
--- a/Makefile.target
+++ b/Makefile.target
@@ -108,7 +108,12 @@ obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/dpd/decimal128.o

 ifdef CONFIG_LINUX_USER

-QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) -I$(SRC_PATH)/linux-user
+# Note that we only add linux-user/host/$ARCH if it exists, and
+# that it must come before linux-user/host/generic in the search path.
+QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) \
+             $(patsubst %,-I%,$(wildcard $(SRC_PATH)/linux-user/host/$(ARCH))) \
+             -I$(SRC_PATH)/linux-user/host/generic \
+             -I$(SRC_PATH)/linux-user

 obj-y += linux-user/
 obj-y += gdbstub.o thunk.o user-exec.o
--- a/block.c
+++ b/block.c
@@ -64,16 +64,16 @@ static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

-static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
-                             const char *reference, QDict *options, int flags,
-                             BlockDriverState *parent,
-                             const BdrvChildRole *child_role, Error **errp);
+static BlockDriverState *bdrv_open_inherit(const char *filename,
+                                           const char *reference,
+                                           QDict *options, int flags,
+                                           BlockDriverState *parent,
+                                           const BdrvChildRole *child_role,
+                                           Error **errp);

 /* If non-zero, use only whitelisted block drivers */
 static int use_bdrv_whitelist;

-static void bdrv_close(BlockDriverState *bs);
-
 #ifdef _WIN32
 static int is_windows_drive_prefix(const char *filename)
 {
@@ -220,11 +220,6 @@ void bdrv_register(BlockDriver *bdrv)
    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
 }

-BlockDriverState *bdrv_new_root(void)
-{
-    return bdrv_new();
-}
-
 BlockDriverState *bdrv_new(void)
 {
    BlockDriverState *bs;
@@ -664,6 +659,18 @@ int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
    return 0;
 }

+static void bdrv_child_cb_drained_begin(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_drained_begin(bs);
+}
+
+static void bdrv_child_cb_drained_end(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_drained_end(bs);
+}
+
 /*
 * Returns the options and flags that a temporary snapshot should get, based on
 * the originally requested flags (the originally requested image will have
@@ -710,6 +717,8 @@ static void bdrv_inherited_options(int *child_flags, QDict *child_options,

 const BdrvChildRole child_file = {
    .inherit_options = bdrv_inherited_options,
+    .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_end     = bdrv_child_cb_drained_end,
 };

 /*
@@ -728,6 +737,8 @@ static void bdrv_inherited_fmt_options(int *child_flags, QDict *child_options,

 const BdrvChildRole child_format = {
    .inherit_options = bdrv_inherited_fmt_options,
+    .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_end     = bdrv_child_cb_drained_end,
 };

 /*
@@ -755,6 +766,8 @@ static void bdrv_backing_options(int *child_flags, QDict *child_options,

 static const BdrvChildRole child_backing = {
    .inherit_options = bdrv_backing_options,
+    .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_end     = bdrv_child_cb_drained_end,
 };

 static int bdrv_open_flags(BlockDriverState *bs, int flags)
@@ -1155,18 +1168,41 @@ static int bdrv_fill_options(QDict **options, const char *filename,
    return 0;
 }

+static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
+{
+    BlockDriverState *old_bs = child->bs;
+
+    if (old_bs) {
+        if (old_bs->quiesce_counter && child->role->drained_end) {
+            child->role->drained_end(child);
+        }
+        QLIST_REMOVE(child, next_parent);
+    }
+
+    child->bs = new_bs;
+
+    if (new_bs) {
+        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
+        if (new_bs->quiesce_counter && child->role->drained_begin) {
+            child->role->drained_begin(child);
+        }
+    }
+}
+
 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
                                  const char *child_name,
-                                  const BdrvChildRole *child_role)
+                                  const BdrvChildRole *child_role,
+                                  void *opaque)
 {
    BdrvChild *child = g_new(BdrvChild, 1);
    *child = (BdrvChild) {
-        .bs     = child_bs,
+        .bs     = NULL,
        .name   = g_strdup(child_name),
        .role   = child_role,
+        .opaque = opaque,
    };

-    QLIST_INSERT_HEAD(&child_bs->parents, child, next_parent);
+    bdrv_replace_child(child, child_bs);

    return child;
 }
@@ -1176,7 +1212,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
                             const char *child_name,
                             const BdrvChildRole *child_role)
 {
-    BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role);
+    BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role,
+                                              parent_bs);
    QLIST_INSERT_HEAD(&parent_bs->children, child, next);
    return child;
 }
@@ -1187,7 +1224,9 @@ static void bdrv_detach_child(BdrvChild *child)
        QLIST_REMOVE(child, next);
        child->next.le_prev = NULL;
    }
-    QLIST_REMOVE(child, next_parent);
+
+    bdrv_replace_child(child, NULL);
+
    g_free(child->name);
    g_free(child);
 }
@@ -1341,14 +1380,13 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

-    backing_hd = NULL;
-    ret = bdrv_open_inherit(&backing_hd,
-                            *backing_filename ? backing_filename : NULL,
-                            reference, options, 0, bs, &child_backing,
-                            errp);
-    if (ret < 0) {
+    backing_hd = bdrv_open_inherit(*backing_filename ? backing_filename : NULL,
+                                   reference, options, 0, bs, &child_backing,
+                                   errp);
+    if (!backing_hd) {
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_prepend(errp, "Could not open backing file: ");
+        ret = -EINVAL;
        goto free_exit;
    }

@@ -1388,7 +1426,6 @@ BdrvChild *bdrv_open_child(const char *filename,
    BdrvChild *c = NULL;
    BlockDriverState *bs;
    QDict *image_options;
-    int ret;
    char *bdref_key_dot;
    const char *reference;

@@ -1408,10 +1445,9 @@ BdrvChild *bdrv_open_child(const char *filename,
        goto done;
    }

-    bs = NULL;
-    ret = bdrv_open_inherit(&bs, filename, reference, image_options, 0,
-                            parent, child_role, errp);
-    if (ret < 0) {
+    bs = bdrv_open_inherit(filename, reference, image_options, 0,
+                           parent, child_role, errp);
+    if (!bs) {
        goto done;
    }

@@ -1422,15 +1458,16 @@ done:
    return c;
 }

-static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
-                                     QDict *snapshot_options, Error **errp)
+static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
+                                                   int flags,
+                                                   QDict *snapshot_options,
+                                                   Error **errp)
 {
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    BlockDriverState *bs_snapshot;
-    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
@@ -1439,7 +1476,6 @@ static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
-        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }
@@ -1470,22 +1506,26 @@ static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
    qdict_put(snapshot_options, "driver",
              qstring_from_str("qcow2"));

-    bs_snapshot = bdrv_new();
-
-    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
-                    flags, &local_err);
+    bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
    snapshot_options = NULL;
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    if (!bs_snapshot) {
+        ret = -EINVAL;
        goto out;
    }

+    /* bdrv_append() consumes a strong reference to bs_snapshot (i.e. it will
+     * call bdrv_unref() on it), so in order to be able to return one, we have
+     * to increase bs_snapshot's refcount here */
+    bdrv_ref(bs_snapshot);
    bdrv_append(bs_snapshot, bs);

+    g_free(tmp_filename);
+    return bs_snapshot;
+
 out:
    QDECREF(snapshot_options);
    g_free(tmp_filename);
-    return ret;
+    return NULL;
 }

 /*
@@ -1503,10 +1543,12 @@ out:
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
-static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
-                             const char *reference, QDict *options, int flags,
-                             BlockDriverState *parent,
-                             const BdrvChildRole *child_role, Error **errp)
+static BlockDriverState *bdrv_open_inherit(const char *filename,
+                                           const char *reference,
+                                           QDict *options, int flags,
+                                           BlockDriverState *parent,
+                                           const BdrvChildRole *child_role,
+                                           Error **errp)
 {
    int ret;
    BdrvChild *file = NULL;
@@ -1518,7 +1560,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
    QDict *snapshot_options = NULL;
    int snapshot_flags = 0;

-    assert(pbs);
    assert(!child_role || !flags);
    assert(!child_role == !parent);

@@ -1526,33 +1567,22 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

-        if (*pbs) {
-            error_setg(errp, "Cannot reuse an existing BDS when referencing "
-                       "another block device");
-            return -EINVAL;
-        }
-
        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
-            return -EINVAL;
+            return NULL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
-            return -ENODEV;
+            return NULL;
        }

        bdrv_ref(bs);
-        *pbs = bs;
-        return 0;
+        return bs;
    }

-    if (*pbs) {
-        bs = *pbs;
-    } else {
-        bs = bdrv_new();
-    }
+    bs = bdrv_new();

    /* NULL means an empty set of options */
    if (options == NULL) {
@@ -1562,7 +1592,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
    /* json: syntax counts as explicit options, as if in the QDict */
    parse_json_protocol(options, &filename, &local_err);
    if (local_err) {
-        ret = -EINVAL;
        goto fail;
    }

@@ -1589,7 +1618,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
-            ret = -EINVAL;
            goto fail;
        }
    }
@@ -1619,7 +1647,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
        file = bdrv_open_child(filename, options, "file", bs,
                               &child_file, true, &local_err);
        if (local_err) {
-            ret = -EINVAL;
            goto fail;
        }
    }
@@ -1646,7 +1673,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
        qdict_put(options, "driver", qstring_from_str(drv->format_name));
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
-        ret = -EINVAL;
        goto fail;
    }

@@ -1689,7 +1715,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
                       drv->format_name, entry->key);
        }

-        ret = -EINVAL;
        goto close_and_fail;
    }

@@ -1700,25 +1725,30 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
-        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
-    *pbs = bs;

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
-        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options,
-                                        &local_err);
+        BlockDriverState *snapshot_bs;
+        snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
+                                                snapshot_options, &local_err);
        snapshot_options = NULL;
        if (local_err) {
            goto close_and_fail;
        }
+        /* We are not going to return bs but the overlay on top of it
+         * (snapshot_bs); thus, we have to drop the strong reference to bs
+         * (which we obtained by calling bdrv_new()). bs will not be deleted,
+         * though, because the overlay still has a reference to it. */
+        bdrv_unref(bs);
+        bs = snapshot_bs;
    }

-    return 0;
+    return bs;

 fail:
    if (file != NULL) {
@@ -1729,36 +1759,26 @@ fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
-    if (!*pbs) {
-        /* If *pbs is NULL, a new BDS has been created in this function and
-           needs to be freed now. Otherwise, it does not need to be closed,
-           since it has not really been opened yet. */
-        bdrv_unref(bs);
-    }
+    bdrv_unref(bs);
    if (local_err) {
        error_propagate(errp, local_err);
    }
-    return ret;
+    return NULL;

 close_and_fail:
-    /* See fail path, but now the BDS has to be always closed */
-    if (*pbs) {
-        bdrv_close(bs);
-    } else {
-        bdrv_unref(bs);
-    }
+    bdrv_unref(bs);
    QDECREF(snapshot_options);
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
-    return ret;
+    return NULL;
 }

-int bdrv_open(BlockDriverState **pbs, const char *filename,
-              const char *reference, QDict *options, int flags, Error **errp)
+BlockDriverState *bdrv_open(const char *filename, const char *reference,
+                            QDict *options, int flags, Error **errp)
 {
-    return bdrv_open_inherit(pbs, filename, reference, options, flags, NULL,
+    return bdrv_open_inherit(filename, reference, options, flags, NULL,
                             NULL, errp);
 }

@@ -2132,6 +2152,7 @@ static void bdrv_close(BlockDriverState *bs)
    BdrvAioNotifier *ban, *ban_next;

    assert(!bs->job);
+    assert(!bs->refcnt);

    bdrv_drained_begin(bs); /* complete I/O */
    bdrv_flush(bs);
@@ -2140,8 +2161,6 @@ static void bdrv_close(BlockDriverState *bs)
    bdrv_release_named_dirty_bitmaps(bs);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

-    bdrv_parent_cb_change_media(bs, false);
-
    if (bs->drv) {
        BdrvChild *child, *next;

@@ -2190,8 +2209,7 @@ static void bdrv_close(BlockDriverState *bs)

 void bdrv_close_all(void)
 {
-    BlockDriverState *bs;
-    AioContext *aio_context;
+    block_job_cancel_sync_all();

    /* Drop references from requests still in flight, such as canceled block
     * jobs whose AIO context has not been polled yet */
@@ -2200,25 +2218,7 @@ void bdrv_close_all(void)
    blk_remove_all_bs();
    blockdev_close_all_bdrv_states();

-    /* Cancel all block jobs */
-    while (!QTAILQ_EMPTY(&all_bdrv_states)) {
-        QTAILQ_FOREACH(bs, &all_bdrv_states, bs_list) {
-            aio_context = bdrv_get_aio_context(bs);
-
-            aio_context_acquire(aio_context);
-            if (bs->job) {
-                block_job_cancel_sync(bs->job);
-                aio_context_release(aio_context);
-                break;
-            }
-            aio_context_release(aio_context);
-        }
-
-        /* All the remaining BlockDriverStates are referenced directly or
-         * indirectly from block jobs, so there needs to be at least one BDS
-         * directly used by a block job */
-        assert(bs);
-    }
+    assert(QTAILQ_EMPTY(&all_bdrv_states));
 }

 static void change_parent_backing_link(BlockDriverState *from,
@@ -2228,10 +2228,8 @@ static void change_parent_backing_link(BlockDriverState *from,

    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
        assert(c->role != &child_backing);
-        c->bs = to;
-        QLIST_REMOVE(c, next_parent);
-        QLIST_INSERT_HEAD(&to->parents, c, next_parent);
        bdrv_ref(to);
+        bdrv_replace_child(c, to);
        bdrv_unref(from);
    }
 }
@@ -3195,9 +3193,9 @@ void bdrv_invalidate_cache_all(Error **errp)
 {
    BlockDriverState *bs;
    Error *local_err = NULL;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;

-    while ((it = bdrv_next(it, &bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
@@ -3239,11 +3237,11 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs,
 int bdrv_inactivate_all(void)
 {
    BlockDriverState *bs = NULL;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
    int ret = 0;
    int pass;

-    while ((it = bdrv_next(it, &bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        aio_context_acquire(bdrv_get_aio_context(bs));
    }

@@ -3252,8 +3250,7 @@ int bdrv_inactivate_all(void)
     * the second pass sets the BDRV_O_INACTIVE flag so that no further write
     * is allowed. */
    for (pass = 0; pass < 2; pass++) {
-        it = NULL;
-        while ((it = bdrv_next(it, &bs)) != NULL) {
+        for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
            ret = bdrv_inactivate_recurse(bs, pass);
            if (ret < 0) {
                goto out;
@@ -3262,8 +3259,7 @@ int bdrv_inactivate_all(void)
    }

 out:
-    it = NULL;
-    while ((it = bdrv_next(it, &bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        aio_context_release(bdrv_get_aio_context(bs));
    }

@@ -3547,11 +3543,10 @@ void bdrv_img_create(const char *filename, const char *fmt,
                          qstring_from_str(backing_fmt));
            }

-            bs = NULL;
-            ret = bdrv_open(&bs, full_backing, NULL, backing_options,
-                            back_flags, &local_err);
+            bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
+                           &local_err);
            g_free(full_backing);
-            if (ret < 0) {
+            if (!bs) {
                goto out;
            }
            size = bdrv_getlength(bs);
@@ -3753,10 +3748,10 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
 {
    BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;

    /* walk down the bs forest recursively */
-    while ((it = bdrv_next(it, &bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        bool perm;

        /* try to recurse in this top level bs */
--- a/block/backup.c
+++ b/block/backup.c
@@ -36,7 +36,7 @@ typedef struct CowRequest {

 typedef struct BackupBlockJob {
    BlockJob common;
-    BlockDriverState *target;
+    BlockBackend *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
@@ -47,6 +47,7 @@ typedef struct BackupBlockJob {
    uint64_t sectors_read;
    unsigned long *done_bitmap;
    int64_t cluster_size;
+    NotifierWithReturn before_write;
    QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;

@@ -93,12 +94,12 @@ static void cow_request_end(CowRequest *req)
    qemu_co_queue_restart_all(&req->wait_queue);
 }

-static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read,
                                      bool is_write_notifier)
 {
-    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    BlockBackend *blk = job->common.blk;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
@@ -131,20 +132,15 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                start * sectors_per_cluster);

        if (!bounce_buffer) {
-            bounce_buffer = qemu_blockalign(bs, job->cluster_size);
+            bounce_buffer = blk_blockalign(blk, job->cluster_size);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

-        if (is_write_notifier) {
-            ret = bdrv_co_readv_no_serialising(bs,
-                                           start * sectors_per_cluster,
-                                           n, &bounce_qiov);
-        } else {
-            ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
-                                &bounce_qiov);
-        }
+        ret = blk_co_preadv(blk, start * job->cluster_size,
+                            bounce_qiov.size, &bounce_qiov,
+                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
@@ -154,13 +150,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
-            ret = bdrv_co_write_zeroes(job->target,
-                                       start * sectors_per_cluster,
-                                       n, BDRV_REQ_MAY_UNMAP);
+            ret = blk_co_pwrite_zeroes(job->target, start * job->cluster_size,
+                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
        } else {
-            ret = bdrv_co_writev(job->target,
-                                 start * sectors_per_cluster, n,
-                                 &bounce_qiov);
+            ret = blk_co_pwritev(job->target, start * job->cluster_size,
+                                 bounce_qiov.size, &bounce_qiov, 0);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
@@ -197,14 +191,16 @@ static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
 {
+    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

+    assert(req->bs == blk_bs(job->common.blk));
    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

-    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
+    return backup_do_cow(job, sector_num, nb_sectors, NULL, true);
 }

 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -221,7 +217,7 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
 {
    BdrvDirtyBitmap *bm;
-    BlockDriverState *bs = job->common.bs;
+    BlockDriverState *bs = blk_bs(job->common.blk);

    if (ret < 0 || block_job_is_cancelled(&job->common)) {
        /* Merge the successor back into the parent, delete nothing. */
@@ -279,7 +275,7 @@ static void backup_complete(BlockJob *job, void *opaque)
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    BackupCompleteData *data = opaque;

-    bdrv_unref(s->target);
+    blk_unref(s->target);

    block_job_completed(job, data->ret);
    g_free(data);
@@ -321,7 +317,6 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
    int64_t end;
    int64_t last_cluster = -1;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
-    BlockDriverState *bs = job->common.bs;
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
@@ -343,7 +338,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                if (yield_and_check(job)) {
                    return ret;
                }
-                ret = backup_do_cow(bs, cluster * sectors_per_cluster,
+                ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                    sectors_per_cluster, &error_is_read,
                                    false);
                if ((ret < 0) &&
@@ -376,11 +371,8 @@ static void coroutine_fn backup_run(void *opaque)
 {
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
-    BlockDriverState *bs = job->common.bs;
-    BlockDriverState *target = job->target;
-    NotifierWithReturn before_write = {
-        .notify = backup_before_write_notify,
-    };
+    BlockDriverState *bs = blk_bs(job->common.blk);
+    BlockBackend *target = job->target;
    int64_t start, end;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int ret = 0;
@@ -393,7 +385,8 @@ static void coroutine_fn backup_run(void *opaque)

    job->done_bitmap = bitmap_new(end);

-    bdrv_add_before_write_notifier(bs, &before_write);
+    job->before_write.notify = backup_before_write_notify;
+    bdrv_add_before_write_notifier(bs, &job->before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
@@ -445,7 +438,7 @@ static void coroutine_fn backup_run(void *opaque)
                }
            }
            /* FULL sync mode we copy the whole drive. */
-            ret = backup_do_cow(bs, start * sectors_per_cluster,
+            ret = backup_do_cow(job, start * sectors_per_cluster,
                                sectors_per_cluster, &error_is_read, false);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
@@ -461,14 +454,14 @@ static void coroutine_fn backup_run(void *opaque)
        }
    }

-    notifier_with_return_remove(&before_write);
+    notifier_with_return_remove(&job->before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
    g_free(job->done_bitmap);

-    bdrv_op_unblock_all(target, job->common.blocker);
+    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
@@ -485,6 +478,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
 {
    int64_t len;
    BlockDriverInfo bdi;
+    BackupBlockJob *job = NULL;
    int ret;

    assert(bs);
@@ -542,15 +536,16 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
        goto error;
    }

-    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
-                                           cb, opaque, errp);
+    job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp);
    if (!job) {
        goto error;
    }

+    job->target = blk_new();
+    blk_insert_bs(job->target, target);
+
    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
-    job->target = target;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
@@ -558,7 +553,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
    /* If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible. */
-    ret = bdrv_get_info(job->target, &bdi);
+    ret = bdrv_get_info(target, &bdi);
    if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
@@ -584,4 +579,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
+    if (job) {
+        blk_unref(job->target);
+        block_job_unref(&job->common);
+    }
 }
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -19,6 +19,7 @@
 #include "sysemu/sysemu.h"
 #include "qapi-event.h"
 #include "qemu/id.h"
+#include "trace.h"

 /* Number of coroutines to reserve per attached device model */
 #define COROUTINE_POOL_RESERVATION 64
@@ -119,12 +120,14 @@ static const BdrvChildRole child_root = {
 * Store an error through @errp on failure, unless it's null.
 * Return the new BlockBackend on success, null on failure.
 */
-BlockBackend *blk_new(Error **errp)
+BlockBackend *blk_new(void)
 {
    BlockBackend *blk;

    blk = g_new0(BlockBackend, 1);
    blk->refcnt = 1;
+    blk_set_enable_write_cache(blk, true);
+
    qemu_co_queue_init(&blk->public.throttled_reqs[0]);
    qemu_co_queue_init(&blk->public.throttled_reqs[1]);

@@ -136,27 +139,7 @@ BlockBackend *blk_new(Error **errp)
 }

 /*
- * Create a new BlockBackend with a new BlockDriverState attached.
- * Otherwise just like blk_new(), which see.
- */
-BlockBackend *blk_new_with_bs(Error **errp)
-{
-    BlockBackend *blk;
-    BlockDriverState *bs;
-
-    blk = blk_new(errp);
-    if (!blk) {
-        return NULL;
-    }
-
-    bs = bdrv_new_root();
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    blk->root->opaque = blk;
-    return blk;
-}
-
-/*
- * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState.
+ * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 *
 * Just as with bdrv_open(), after having called this function the reference to
 * @options belongs to the block layer (even on failure).
@@ -171,21 +154,16 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
                           QDict *options, int flags, Error **errp)
 {
    BlockBackend *blk;
-    int ret;
+    BlockDriverState *bs;

-    blk = blk_new_with_bs(errp);
-    if (!blk) {
-        QDECREF(options);
-        return NULL;
-    }
-
-    ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp);
-    if (ret < 0) {
+    blk = blk_new();
+    bs = bdrv_open(filename, reference, options, flags, errp);
+    if (!bs) {
        blk_unref(blk);
        return NULL;
    }

-    blk_set_enable_write_cache(blk, true);
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);

    return blk;
 }
@@ -286,25 +264,11 @@ BlockBackend *blk_next(BlockBackend *blk)
               : QTAILQ_FIRST(&monitor_block_backends);
 }

-struct BdrvNextIterator {
-    enum {
-        BDRV_NEXT_BACKEND_ROOTS,
-        BDRV_NEXT_MONITOR_OWNED,
-    } phase;
-    BlockBackend *blk;
-    BlockDriverState *bs;
-};
-
 /* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
 * the monitor or attached to a BlockBackend */
-BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
+BlockDriverState *bdrv_next(BdrvNextIterator *it)
 {
-    if (!it) {
-        it = g_new(BdrvNextIterator, 1);
-        *it = (BdrvNextIterator) {
-            .phase = BDRV_NEXT_BACKEND_ROOTS,
-        };
-    }
+    BlockDriverState *bs;

    /* First, return all root nodes of BlockBackends. In order to avoid
     * returning a BDS twice when multiple BBs refer to it, we only return it
@@ -312,11 +276,11 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
        do {
            it->blk = blk_all_next(it->blk);
-            *bs = it->blk ? blk_bs(it->blk) : NULL;
-        } while (it->blk && (*bs == NULL || bdrv_first_blk(*bs) != it->blk));
+            bs = it->blk ? blk_bs(it->blk) : NULL;
+        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));

-        if (*bs) {
-            return it;
+        if (bs) {
+            return bs;
        }
        it->phase = BDRV_NEXT_MONITOR_OWNED;
    }
@@ -326,10 +290,19 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
     * by the above block already */
    do {
        it->bs = bdrv_next_monitor_owned(it->bs);
-        *bs = it->bs;
-    } while (*bs && bdrv_has_blk(*bs));
+        bs = it->bs;
+    } while (bs && bdrv_has_blk(bs));

-    return *bs ? it : NULL;
+    return bs;
+}
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it)
+{
+    *it = (BdrvNextIterator) {
+        .phase = BDRV_NEXT_BACKEND_ROOTS,
+    };
+
+    return bdrv_next(it);
 }

 /*
@@ -509,8 +482,7 @@ void blk_remove_bs(BlockBackend *blk)
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
 {
    bdrv_ref(bs);
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    blk->root->opaque = blk;
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);

    notifier_list_notify(&blk->insert_bs_notifiers, blk);
    if (blk->public.throttle_state) {
@@ -770,11 +742,15 @@ static int blk_check_request(BlockBackend *blk, int64_t sector_num,
                                  nb_sectors * BDRV_SECTOR_SIZE);
 }

-static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
+int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
+                               unsigned int bytes, QEMUIOVector *qiov,
+                               BdrvRequestFlags flags)
 {
-    int ret = blk_check_byte_request(blk, offset, bytes);
+    int ret;
+
+    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+
+    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }
@@ -787,12 +763,14 @@ static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
    return bdrv_co_preadv(blk_bs(blk), offset, bytes, qiov, flags);
 }

-static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                                unsigned int bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags)
 {
    int ret;

+    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+
    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
@@ -885,8 +863,8 @@ int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
    return ret;
 }

-int blk_write_zeroes(BlockBackend *blk, int64_t offset,
-                     int count, BdrvRequestFlags flags)
+int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                      int count, BdrvRequestFlags flags)
 {
    return blk_prw(blk, offset, NULL, count, blk_write_entry,
                   flags | BDRV_REQ_ZERO_WRITE);
@@ -1001,9 +979,9 @@ static void blk_aio_write_entry(void *opaque)
    blk_aio_complete(acb);
 }

-BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset,
-                                 int count, BdrvRequestFlags flags,
-                                 BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                  int count, BdrvRequestFlags flags,
+                                  BlockCompletionFunc *cb, void *opaque)
 {
    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
@@ -1492,8 +1470,8 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
 }

-int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset,
-                                     int count, BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                      int count, BdrvRequestFlags flags)
 {
    return blk_co_pwritev(blk, offset, count, NULL,
                          flags | BDRV_REQ_ZERO_WRITE);
@@ -1704,6 +1682,9 @@ static void blk_root_drained_begin(BdrvChild *child)
 {
    BlockBackend *blk = child->opaque;

+    /* Note that blk->root may not be accessible here yet if we are just
+     * attaching to a BlockDriverState that is drained. Use child instead. */
+
    if (blk->public.io_limits_disabled++ == 0) {
        throttle_group_restart_blk(blk);
    }
--- a/block/commit.c
+++ b/block/commit.c
@@ -36,28 +36,36 @@ typedef struct CommitBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *active;
-    BlockDriverState *top;
-    BlockDriverState *base;
+    BlockBackend *top;
+    BlockBackend *base;
    BlockdevOnError on_error;
    int base_flags;
    int orig_overlay_flags;
    char *backing_file_str;
 } CommitBlockJob;

-static int coroutine_fn commit_populate(BlockDriverState *bs,
-                                        BlockDriverState *base,
+static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
                                        int64_t sector_num, int nb_sectors,
                                        void *buf)
 {
    int ret = 0;
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = buf,
+        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+    };

-    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
-    if (ret) {
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    ret = blk_co_preadv(bs, sector_num * BDRV_SECTOR_SIZE,
+                        qiov.size, &qiov, 0);
+    if (ret < 0) {
        return ret;
    }

-    ret = bdrv_write(base, sector_num, buf, nb_sectors);
-    if (ret) {
+    ret = blk_co_pwritev(base, sector_num * BDRV_SECTOR_SIZE,
+                         qiov.size, &qiov, 0);
+    if (ret < 0) {
        return ret;
    }

@@ -73,8 +81,8 @@ static void commit_complete(BlockJob *job, void *opaque)
    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
    CommitCompleteData *data = opaque;
    BlockDriverState *active = s->active;
-    BlockDriverState *top = s->top;
-    BlockDriverState *base = s->base;
+    BlockDriverState *top = blk_bs(s->top);
+    BlockDriverState *base = blk_bs(s->base);
    BlockDriverState *overlay_bs;
    int ret = data->ret;

@@ -94,6 +102,8 @@ static void commit_complete(BlockJob *job, void *opaque)
        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
    }
    g_free(s->backing_file_str);
+    blk_unref(s->top);
+    blk_unref(s->base);
    block_job_completed(&s->common, ret);
    g_free(data);
 }
@@ -102,8 +112,6 @@ static void coroutine_fn commit_run(void *opaque)
 {
    CommitBlockJob *s = opaque;
    CommitCompleteData *data;
-    BlockDriverState *top = s->top;
-    BlockDriverState *base = s->base;
    int64_t sector_num, end;
    int ret = 0;
    int n = 0;
@@ -111,27 +119,27 @@ static void coroutine_fn commit_run(void *opaque)
    int bytes_written = 0;
    int64_t base_len;

-    ret = s->common.len = bdrv_getlength(top);
+    ret = s->common.len = blk_getlength(s->top);


    if (s->common.len < 0) {
        goto out;
    }

-    ret = base_len = bdrv_getlength(base);
+    ret = base_len = blk_getlength(s->base);
    if (base_len < 0) {
        goto out;
    }

    if (base_len < s->common.len) {
-        ret = bdrv_truncate(base, s->common.len);
+        ret = blk_truncate(s->base, s->common.len);
        if (ret) {
            goto out;
        }
    }

    end = s->common.len >> BDRV_SECTOR_BITS;
-    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
+    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

    for (sector_num = 0; sector_num < end; sector_num += n) {
        uint64_t delay_ns = 0;
@@ -146,7 +154,8 @@ wait:
            break;
        }
        /* Copy if allocated above the base */
-        ret = bdrv_is_allocated_above(top, base, sector_num,
+        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
+                                      sector_num,
                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
                                      &n);
        copy = (ret == 1);
@@ -158,7 +167,7 @@ wait:
                    goto wait;
                }
            }
-            ret = commit_populate(top, base, sector_num, n, buf);
+            ret = commit_populate(s->top, s->base, sector_num, n, buf);
            bytes_written += n * BDRV_SECTOR_SIZE;
        }
        if (ret < 0) {
@@ -253,8 +262,12 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
        return;
    }

-    s->base   = base;
-    s->top    = top;
+    s->base = blk_new();
+    blk_insert_bs(s->base, base);
+
+    s->top = blk_new();
+    blk_insert_bs(s->top, top);
+
    s->active = bs;

    s->base_flags          = orig_base_flags;
--- a/block/io.c
+++ b/block/io.c
@@ -225,6 +225,34 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
    assert(data.done);
 }

+void bdrv_drained_begin(BlockDriverState *bs)
+{
+    if (!bs->quiesce_counter++) {
+        aio_disable_external(bdrv_get_aio_context(bs));
+        bdrv_parent_drained_begin(bs);
+    }
+
+    bdrv_io_unplugged_begin(bs);
+    bdrv_drain_recurse(bs);
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(bs);
+    } else {
+        bdrv_drain_poll(bs);
+    }
+    bdrv_io_unplugged_end(bs);
+}
+
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    assert(bs->quiesce_counter > 0);
+    if (--bs->quiesce_counter > 0) {
+        return;
+    }
+
+    bdrv_parent_drained_end(bs);
+    aio_enable_external(bdrv_get_aio_context(bs));
+}
+
 /*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
@@ -238,26 +266,15 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
 */
 void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 {
-    bdrv_parent_drained_begin(bs);
-    bdrv_io_unplugged_begin(bs);
-    bdrv_drain_recurse(bs);
-    bdrv_co_yield_to_drain(bs);
-    bdrv_io_unplugged_end(bs);
-    bdrv_parent_drained_end(bs);
+    assert(qemu_in_coroutine());
+    bdrv_drained_begin(bs);
+    bdrv_drained_end(bs);
 }

 void bdrv_drain(BlockDriverState *bs)
 {
-    bdrv_parent_drained_begin(bs);
-    bdrv_io_unplugged_begin(bs);
-    bdrv_drain_recurse(bs);
-    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs);
-    } else {
-        bdrv_drain_poll(bs);
-    }
-    bdrv_io_unplugged_end(bs);
-    bdrv_parent_drained_end(bs);
+    bdrv_drained_begin(bs);
+    bdrv_drained_end(bs);
 }

 /*
@@ -271,10 +288,10 @@ void bdrv_drain_all(void)
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
    GSList *aio_ctxs = NULL, *ctx;

-    while ((it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
@@ -302,10 +319,9 @@ void bdrv_drain_all(void)

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;
-            it = NULL;

            aio_context_acquire(aio_context);
-            while ((it = bdrv_next(it, &bs))) {
+            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    if (bdrv_requests_pending(bs)) {
                        busy = true;
@@ -318,8 +334,7 @@ void bdrv_drain_all(void)
        }
    }

-    it = NULL;
-    while ((it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
@@ -1093,24 +1108,6 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
 }

-int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
-{
-    trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
-
-    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
-                            BDRV_REQ_NO_SERIALISING);
-}
-
-int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
-{
-    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
-
-    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
-                            BDRV_REQ_COPY_ON_READ);
-}
-
 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768

 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
@@ -2543,23 +2540,3 @@ void bdrv_io_unplugged_end(BlockDriverState *bs)
        }
    }
 }
-
-void bdrv_drained_begin(BlockDriverState *bs)
-{
-    if (!bs->quiesce_counter++) {
-        aio_disable_external(bdrv_get_aio_context(bs));
-    }
-    bdrv_parent_drained_begin(bs);
-    bdrv_drain(bs);
-}
-
-void bdrv_drained_end(BlockDriverState *bs)
-{
-    bdrv_parent_drained_end(bs);
-
-    assert(bs->quiesce_counter > 0);
-    if (--bs->quiesce_counter > 0) {
-        return;
-    }
-    aio_enable_external(bdrv_get_aio_context(bs));
-}
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -833,6 +833,13 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
        return &acb->common;
    }

+    if (acb->ioh->cmd_len > SCSI_CDB_MAX_SIZE) {
+        error_report("iSCSI: ioctl error CDB exceeds max size (%d > %d)",
+                     acb->ioh->cmd_len, SCSI_CDB_MAX_SIZE);
+        qemu_aio_unref(acb);
+        return NULL;
+    }
+
    acb->task = malloc(sizeof(struct scsi_task));
    if (acb->task == NULL) {
        error_report("iSCSI: Failed to allocate task for scsi command. %s",
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -20,7 +20,6 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
-#include "qemu/error-report.h"

 #define SLICE_TIME    100000000ULL /* ns */
 #define MAX_IN_FLIGHT 16
@@ -36,7 +35,7 @@ typedef struct MirrorBuffer {
 typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
-    BlockDriverState *target;
+    BlockBackend *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
@@ -157,7 +156,8 @@ static void mirror_read_complete(void *opaque, int ret)
        mirror_iteration_done(op, ret);
        return;
    }
-    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
+    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+                    op->nb_sectors * BDRV_SECTOR_SIZE,
                    mirror_write_complete, op);
 }

@@ -186,7 +186,7 @@ static int mirror_cow_align(MirrorBlockJob *s,
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
-        bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors,
+        bdrv_round_to_clusters(blk_bs(s->target), *sector_num, *nb_sectors,
                               &align_sector_num, &align_nb_sectors);
    }

@@ -224,7 +224,7 @@ static inline void mirror_wait_for_io(MirrorBlockJob *s)
 static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
 {
-    BlockDriverState *source = s->common.bs;
+    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret = nb_sectors;
    MirrorOp *op;
@@ -274,7 +274,8 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

-    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+                   nb_sectors * BDRV_SECTOR_SIZE,
                   mirror_read_complete, op);
    return ret;
 }
@@ -296,10 +297,11 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
-        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
-                         mirror_write_complete, op);
+        blk_aio_discard(s->target, sector_num, op->nb_sectors,
+                        mirror_write_complete, op);
    } else {
-        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
+        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
+                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
@@ -307,7 +309,7 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,

 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
-    BlockDriverState *source = s->common.bs;
+    BlockDriverState *source = blk_bs(s->common.blk);
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
@@ -384,7 +386,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
-            bdrv_round_to_clusters(s->target, sector_num, io_sectors,
+            bdrv_round_to_clusters(blk_bs(s->target), sector_num, io_sectors,
                                   &target_sector_num, &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
@@ -449,7 +451,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = s->common.bs;
+    BlockDriverState *src = blk_bs(s->common.blk);
+    BlockDriverState *target_bs = blk_bs(s->target);

    /* Make sure that the source BDS doesn't go away before we called
     * block_job_completed(). */
@@ -461,26 +464,25 @@ static void mirror_exit(BlockJob *job, void *opaque)
    }

    if (s->should_complete && data->ret == 0) {
-        BlockDriverState *to_replace = s->common.bs;
+        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

-        /* This was checked in mirror_start_job(), but meanwhile one of the
-         * nodes could have been newly attached to a BlockBackend. */
-        if (bdrv_has_blk(to_replace) && bdrv_has_blk(s->target)) {
-            error_report("block job: Can't create node with two BlockBackends");
-            data->ret = -EINVAL;
-            goto out;
+        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
+            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

-        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
-            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
-        }
-        bdrv_replace_in_backing_chain(to_replace, s->target);
+        /* The mirror job has no requests in flight any more, but we need to
+         * drain potential other users of the BDS before changing the graph. */
+        bdrv_drained_begin(target_bs);
+        bdrv_replace_in_backing_chain(to_replace, target_bs);
+        bdrv_drained_end(target_bs);
+
+        /* We just changed the BDS the job BB refers to */
+        blk_remove_bs(job->blk);
+        blk_insert_bs(job->blk, src);
    }
-
-out:
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
@@ -490,8 +492,8 @@ out:
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
-    bdrv_op_unblock_all(s->target, s->common.blocker);
-    bdrv_unref(s->target);
+    bdrv_op_unblock_all(target_bs, s->common.blocker);
+    blk_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
@@ -505,7 +507,8 @@ static void coroutine_fn mirror_run(void *opaque)
 {
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
-    BlockDriverState *bs = s->common.bs;
+    BlockDriverState *bs = blk_bs(s->common.blk);
+    BlockDriverState *target_bs = blk_bs(s->target);
    int64_t sector_num, end, length;
    uint64_t last_pause_ns;
    BlockDriverInfo bdi;
@@ -541,18 +544,18 @@ static void coroutine_fn mirror_run(void *opaque)
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
-    bdrv_get_backing_filename(s->target, backing_filename,
+    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
-    if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) {
+    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
-    if (backing_filename[0] && !s->target->backing
+    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
-    s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov);
+    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    end = s->bdev_length / BDRV_SECTOR_SIZE;
    s->buf = qemu_try_blockalign(bs, s->buf_size);
@@ -567,7 +570,7 @@ static void coroutine_fn mirror_run(void *opaque)
    if (!s->is_none_mode) {
        /* First part, loop on the sectors and initialize the dirty bitmap.  */
        BlockDriverState *base = s->base;
-        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target);
+        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(target_bs);

        for (sector_num = 0; sector_num < end; ) {
            /* Just to make sure we are not exceeding int limit. */
@@ -637,7 +640,7 @@ static void coroutine_fn mirror_run(void *opaque)
        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
-            ret = bdrv_flush(s->target);
+            ret = blk_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
@@ -715,7 +718,7 @@ immediate_exit:
    data->ret = ret;
    /* Before we switch to target in mirror_exit, make sure data doesn't
     * change. */
-    bdrv_drained_begin(s->common.bs);
+    bdrv_drained_begin(bs);
    if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
        /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
         * above bdrv_drained_end isn't enough to quiesce it. This is ugly, we
@@ -742,7 +745,8 @@ static void mirror_complete(BlockJob *job, Error **errp)
    Error *local_err = NULL;
    int ret;

-    ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err);
+    ret = bdrv_open_backing_file(blk_bs(s->target), NULL, "backing",
+                                 &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return;
@@ -804,7 +808,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                             bool is_none_mode, BlockDriverState *base)
 {
    MirrorBlockJob *s;
-    BlockDriverState *replaced_bs;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
@@ -821,30 +824,17 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

-    /* We can't support this case as long as the block layer can't handle
-     * multiple BlockBackends per BlockDriverState. */
-    if (replaces) {
-        replaced_bs = bdrv_lookup_bs(replaces, replaces, errp);
-        if (replaced_bs == NULL) {
-            return;
-        }
-    } else {
-        replaced_bs = bs;
-    }
-    if (bdrv_has_blk(replaced_bs) && bdrv_has_blk(target)) {
-        error_setg(errp, "Can't create node with two BlockBackends");
-        return;
-    }
-
    s = block_job_create(driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

+    s->target = blk_new();
+    blk_insert_bs(s->target, target);
+
    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
-    s->target = target;
    s->is_none_mode = is_none_mode;
    s->base = base;
    s->granularity = granularity;
@@ -854,11 +844,12 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
+        blk_unref(s->target);
        block_job_unref(&s->common);
        return;
    }

-    bdrv_op_block_all(s->target, s->common.blocker);
+    bdrv_op_block_all(target, s->common.blocker);

    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
@@ -931,7 +922,6 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
        }
    }

-    bdrv_ref(base);
    mirror_start_job(bs, base, NULL, speed, 0, 0,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -517,8 +517,8 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
    if (ret < 0) {
        goto exit;
    }
-    ret = blk_write_zeroes(file, BDRV_SECTOR_SIZE,
-                           (bat_sectors - 1) << BDRV_SECTOR_BITS, 0);
+    ret = blk_pwrite_zeroes(file, BDRV_SECTOR_SIZE,
+                            (bat_sectors - 1) << BDRV_SECTOR_BITS, 0);
    if (ret < 0) {
        goto exit;
    }
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -374,9 +374,9 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
 {
    bool ok = true;
    BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;

-    while (ok && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -384,8 +384,12 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
            ok = bdrv_can_snapshot(bs);
        }
        aio_context_release(ctx);
+        if (!ok) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return ok;
 }
@@ -395,20 +399,27 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
 {
    int ret = 0;
    BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
    QEMUSnapshotInfo sn1, *snapshot = &sn1;

-    while (ret == 0 && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
        if (bdrv_can_snapshot(bs) &&
                bdrv_snapshot_find(bs, snapshot, name) >= 0) {
            ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err);
+            if (ret < 0) {
+                goto fail;
+            }
        }
        aio_context_release(ctx);
+        if (ret < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return ret;
 }
@@ -418,9 +429,9 @@ int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
 {
    int err = 0;
    BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;

-    while (err == 0 && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -428,8 +439,12 @@ int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
            err = bdrv_snapshot_goto(bs, name);
        }
        aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return err;
 }
@@ -439,9 +454,9 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
    QEMUSnapshotInfo sn;
    int err = 0;
    BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;

-    while (err == 0 && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -449,8 +464,12 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
            err = bdrv_snapshot_find(bs, &sn, name);
        }
        aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return err;
 }
@@ -462,9 +481,9 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
 {
    int err = 0;
    BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;

-    while (err == 0 && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
@@ -476,24 +495,32 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
            err = bdrv_snapshot_create(bs, sn);
        }
        aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
    }

+fail:
    *first_bad_bs = bs;
    return err;
 }

 BlockDriverState *bdrv_all_find_vmstate_bs(void)
 {
-    bool not_found = true;
    BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;

-    while (not_found && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *ctx = bdrv_get_aio_context(bs);
+        bool found;

        aio_context_acquire(ctx);
-        not_found = !bdrv_can_snapshot(bs);
+        found = bdrv_can_snapshot(bs);
        aio_context_release(ctx);
+
+        if (found) {
+            break;
+        }
    }
    return bs;
 }
--- a/block/stream.c
+++ b/block/stream.c
@@ -39,7 +39,7 @@ typedef struct StreamBlockJob {
    char *backing_file_str;
 } StreamBlockJob;

-static int coroutine_fn stream_populate(BlockDriverState *bs,
+static int coroutine_fn stream_populate(BlockBackend *blk,
                                        int64_t sector_num, int nb_sectors,
                                        void *buf)
 {
@@ -52,7 +52,8 @@ static int coroutine_fn stream_populate(BlockDriverState *bs,
    qemu_iovec_init_external(&qiov, &iov, 1);

    /* Copy-on-read the unallocated clusters */
-    return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov);
+    return blk_co_preadv(blk, sector_num * BDRV_SECTOR_SIZE, qiov.size, &qiov,
+                         BDRV_REQ_COPY_ON_READ);
 }

 typedef struct {
@@ -64,6 +65,7 @@ static void stream_complete(BlockJob *job, void *opaque)
 {
    StreamBlockJob *s = container_of(job, StreamBlockJob, common);
    StreamCompleteData *data = opaque;
+    BlockDriverState *bs = blk_bs(job->blk);
    BlockDriverState *base = s->base;

    if (!block_job_is_cancelled(&s->common) && data->reached_end &&
@@ -75,8 +77,8 @@ static void stream_complete(BlockJob *job, void *opaque)
                base_fmt = base->drv->format_name;
            }
        }
-        data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt);
-        bdrv_set_backing_hd(job->bs, base);
+        data->ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+        bdrv_set_backing_hd(bs, base);
    }

    g_free(s->backing_file_str);
@@ -88,7 +90,8 @@ static void coroutine_fn stream_run(void *opaque)
 {
    StreamBlockJob *s = opaque;
    StreamCompleteData *data;
-    BlockDriverState *bs = s->common.bs;
+    BlockBackend *blk = s->common.blk;
+    BlockDriverState *bs = blk_bs(blk);
    BlockDriverState *base = s->base;
    int64_t sector_num = 0;
    int64_t end = -1;
@@ -159,7 +162,7 @@ wait:
                    goto wait;
                }
            }
-            ret = stream_populate(bs, sector_num, n, buf);
+            ret = stream_populate(blk, sector_num, n, buf);
        }
        if (ret < 0) {
            BlockErrorAction action =
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2998,12 +2998,12 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp)
        goto err;
    }

-    s->qcow = NULL;
    options = qdict_new();
    qdict_put(options, "driver", qstring_from_str("qcow"));
-    ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options,
-                    BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp);
-    if (ret < 0) {
+    s->qcow = bdrv_open(s->qcow_filename, NULL, options,
+                        BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp);
+    if (!s->qcow) {
+        ret = -EINVAL;
        goto err;
    }

--- a/blockdev.c
+++ b/blockdev.c
@@ -567,11 +567,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
    if ((!file || !*file) && !qdict_size(bs_opts)) {
        BlockBackendRootState *blk_rs;

-        blk = blk_new(errp);
-        if (!blk) {
-            goto early_err;
-        }
-
+        blk = blk_new();
        blk_rs = blk_get_root_state(blk);
        blk_rs->open_flags    = bdrv_flags;
        blk_rs->read_only     = !(bdrv_flags & BDRV_O_RDWR);
@@ -657,7 +653,6 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
    QemuOpts *opts;
    Error *local_error = NULL;
    BlockdevDetectZeroesOptions detect_zeroes;
-    int ret;
    int bdrv_flags = 0;

    opts = qemu_opts_create(&qemu_root_bds_opts, NULL, 1, errp);
@@ -688,9 +683,8 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
        bdrv_flags |= BDRV_O_INACTIVE;
    }

-    bs = NULL;
-    ret = bdrv_open(&bs, NULL, NULL, bs_opts, bdrv_flags, errp);
-    if (ret < 0) {
+    bs = bdrv_open(NULL, NULL, bs_opts, bdrv_flags, errp);
+    if (!bs) {
        goto fail_no_bs_opts;
    }

@@ -1643,7 +1637,7 @@ typedef struct ExternalSnapshotState {
 static void external_snapshot_prepare(BlkActionState *common,
                                      Error **errp)
 {
-    int flags = 0, ret;
+    int flags = 0;
    QDict *options = NULL;
    Error *local_err = NULL;
    /* Device and node name of the image to generate the snapshot from */
@@ -1768,11 +1762,10 @@ static void external_snapshot_prepare(BlkActionState *common,
        flags |= BDRV_O_NO_BACKING;
    }

-    assert(state->new_bs == NULL);
-    ret = bdrv_open(&state->new_bs, new_image_file, snapshot_ref, options,
-                    flags, errp);
+    state->new_bs = bdrv_open(new_image_file, snapshot_ref, options, flags,
+                              errp);
    /* We will manually add the backing_hd field to the bs later */
-    if (ret != 0) {
+    if (!state->new_bs) {
        return;
    }

@@ -2540,7 +2533,7 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
 {
    BlockBackend *blk;
    BlockDriverState *medium_bs = NULL;
-    int bdrv_flags, ret;
+    int bdrv_flags;
    QDict *options = NULL;
    Error *err = NULL;

@@ -2584,9 +2577,8 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
        qdict_put(options, "driver", qstring_from_str(format));
    }

-    assert(!medium_bs);
-    ret = bdrv_open(&medium_bs, filename, NULL, options, bdrv_flags, errp);
-    if (ret < 0) {
+    medium_bs = bdrv_open(filename, NULL, options, bdrv_flags, errp);
+    if (!medium_bs) {
        goto fail;
    }

@@ -3199,7 +3191,6 @@ static void do_drive_backup(const char *device, const char *target,
    Error *local_err = NULL;
    int flags;
    int64_t size;
-    int ret;

    if (!has_speed) {
        speed = 0;
@@ -3283,10 +3274,8 @@ static void do_drive_backup(const char *device, const char *target,
        qdict_put(options, "driver", qstring_from_str(format));
    }

-    target_bs = NULL;
-    ret = bdrv_open(&target_bs, target, NULL, options, flags, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    target_bs = bdrv_open(target, NULL, options, flags, errp);
+    if (!target_bs) {
        goto out;
    }

@@ -3304,8 +3293,8 @@ static void do_drive_backup(const char *device, const char *target,
    backup_start(bs, target_bs, speed, sync, bmap,
                 on_source_error, on_target_error,
                 block_job_cb, bs, txn, &local_err);
+    bdrv_unref(target_bs);
    if (local_err != NULL) {
-        bdrv_unref(target_bs);
        error_propagate(errp, local_err);
        goto out;
    }
@@ -3389,12 +3378,10 @@ void do_blockdev_backup(const char *device, const char *target,
    }
    target_bs = blk_bs(target_blk);

-    bdrv_ref(target_bs);
    bdrv_set_aio_context(target_bs, aio_context);
    backup_start(bs, target_bs, speed, sync, NULL, on_source_error,
                 on_target_error, block_job_cb, bs, txn, &local_err);
    if (local_err != NULL) {
-        bdrv_unref(target_bs);
        error_propagate(errp, local_err);
    }
 out:
@@ -3470,10 +3457,6 @@ static void blockdev_mirror_common(BlockDriverState *bs,
    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_MIRROR_TARGET, errp)) {
        return;
    }
-    if (bdrv_has_blk(target)) {
-        error_setg(errp, "Cannot mirror to an attached block device");
-        return;
-    }

    if (!bs->backing && sync == MIRROR_SYNC_MODE_TOP) {
        sync = MIRROR_SYNC_MODE_FULL;
@@ -3511,7 +3494,6 @@ void qmp_drive_mirror(const char *device, const char *target,
    QDict *options = NULL;
    int flags;
    int64_t size;
-    int ret;

    blk = blk_by_name(device);
    if (!blk) {
@@ -3620,11 +3602,9 @@ void qmp_drive_mirror(const char *device, const char *target,
    /* Mirroring takes care of copy-on-write using the source's backing
     * file.
     */
-    target_bs = NULL;
-    ret = bdrv_open(&target_bs, target, NULL, options,
-                    flags | BDRV_O_NO_BACKING, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    target_bs = bdrv_open(target, NULL, options, flags | BDRV_O_NO_BACKING,
+                          errp);
+    if (!target_bs) {
        goto out;
    }

@@ -3639,9 +3619,9 @@ void qmp_drive_mirror(const char *device, const char *target,
                           has_on_target_error, on_target_error,
                           has_unmap, unmap,
                           &local_err);
+    bdrv_unref(target_bs);
    if (local_err) {
        error_propagate(errp, local_err);
-        bdrv_unref(target_bs);
    }
 out:
    aio_context_release(aio_context);
@@ -3685,7 +3665,6 @@ void qmp_blockdev_mirror(const char *device, const char *target,
    aio_context = bdrv_get_aio_context(bs);
    aio_context_acquire(aio_context);

-    bdrv_ref(target_bs);
    bdrv_set_aio_context(target_bs, aio_context);

    blockdev_mirror_common(bs, target_bs,
@@ -3699,7 +3678,6 @@ void qmp_blockdev_mirror(const char *device, const char *target,
                           &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
-        bdrv_unref(target_bs);
    }

    aio_context_release(aio_context);
@@ -4164,9 +4142,9 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
 {
    BlockJobInfoList *head = NULL, **p_next = &head;
    BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;

-    while ((it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
--- a/blockjob.c
+++ b/blockjob.c
@@ -50,17 +50,31 @@ struct BlockJobTxn {
    int refcnt;
 };

+static QLIST_HEAD(, BlockJob) block_jobs = QLIST_HEAD_INITIALIZER(block_jobs);
+
+BlockJob *block_job_next(BlockJob *job)
+{
+    if (!job) {
+        return QLIST_FIRST(&block_jobs);
+    }
+    return QLIST_NEXT(job, job_list);
+}
+
 void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
                       int64_t speed, BlockCompletionFunc *cb,
                       void *opaque, Error **errp)
 {
+    BlockBackend *blk;
    BlockJob *job;

    if (bs->job) {
        error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
-    bdrv_ref(bs);
+
+    blk = blk_new();
+    blk_insert_bs(blk, bs);
+
    job = g_malloc0(driver->instance_size);
    error_setg(&job->blocker, "block device is in use by block job: %s",
               BlockJobType_lookup[driver->job_type]);
@@ -69,13 +83,15 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,

    job->driver        = driver;
    job->id            = g_strdup(bdrv_get_device_name(bs));
-    job->bs            = bs;
+    job->blk           = blk;
    job->cb            = cb;
    job->opaque        = opaque;
    job->busy          = true;
    job->refcnt        = 1;
    bs->job = job;

+    QLIST_INSERT_HEAD(&block_jobs, job, job_list);
+
    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;
@@ -98,11 +114,13 @@ void block_job_ref(BlockJob *job)
 void block_job_unref(BlockJob *job)
 {
    if (--job->refcnt == 0) {
-        job->bs->job = NULL;
-        bdrv_op_unblock_all(job->bs, job->blocker);
-        bdrv_unref(job->bs);
+        BlockDriverState *bs = blk_bs(job->blk);
+        bs->job = NULL;
+        bdrv_op_unblock_all(bs, job->blocker);
+        blk_unref(job->blk);
        error_free(job->blocker);
        g_free(job->id);
+        QLIST_REMOVE(job, job_list);
        g_free(job);
    }
 }
@@ -140,7 +158,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
    txn->aborting = true;
    /* We are the first failed job. Cancel other jobs. */
    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
        aio_context_acquire(ctx);
    }
    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
@@ -157,7 +175,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
        assert(other_job->completed);
    }
    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
        block_job_completed_single(other_job);
        aio_context_release(ctx);
    }
@@ -179,7 +197,7 @@ static void block_job_completed_txn_success(BlockJob *job)
    }
    /* We are the last completed job, commit the transaction. */
    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
        aio_context_acquire(ctx);
        assert(other_job->ret == 0);
        block_job_completed_single(other_job);
@@ -189,9 +207,7 @@ static void block_job_completed_txn_success(BlockJob *job)

 void block_job_completed(BlockJob *job, int ret)
 {
-    BlockDriverState *bs = job->bs;
-
-    assert(bs->job == job);
+    assert(blk_bs(job->blk)->job == job);
    assert(!job->completed);
    job->completed = true;
    job->ret = ret;
@@ -282,11 +298,10 @@ static int block_job_finish_sync(BlockJob *job,
                                 void (*finish)(BlockJob *, Error **errp),
                                 Error **errp)
 {
-    BlockDriverState *bs = job->bs;
    Error *local_err = NULL;
    int ret;

-    assert(bs->job == job);
+    assert(blk_bs(job->blk)->job == job);

    block_job_ref(job);
    finish(job, &local_err);
@@ -297,7 +312,7 @@ static int block_job_finish_sync(BlockJob *job,
    }
    while (!job->completed) {
        aio_poll(job->deferred_to_main_loop ? qemu_get_aio_context() :
-                                              bdrv_get_aio_context(bs),
+                                              blk_get_aio_context(job->blk),
                 true);
    }
    ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
@@ -318,6 +333,19 @@ int block_job_cancel_sync(BlockJob *job)
    return block_job_finish_sync(job, &block_job_cancel_err, NULL);
 }

+void block_job_cancel_sync_all(void)
+{
+    BlockJob *job;
+    AioContext *aio_context;
+
+    while ((job = QLIST_FIRST(&block_jobs))) {
+        aio_context = blk_get_aio_context(job->blk);
+        aio_context_acquire(aio_context);
+        block_job_cancel_sync(job);
+        aio_context_release(aio_context);
+    }
+}
+
 int block_job_complete_sync(BlockJob *job, Error **errp)
 {
    return block_job_finish_sync(job, &block_job_complete, errp);
@@ -336,7 +364,7 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
    if (block_job_is_paused(job)) {
        qemu_coroutine_yield();
    } else {
-        co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
+        co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns);
    }
    job->busy = true;
 }
@@ -465,7 +493,7 @@ static void block_job_defer_to_main_loop_bh(void *opaque)
    aio_context_acquire(data->aio_context);

    /* Fetch BDS AioContext again, in case it has changed */
-    aio_context = bdrv_get_aio_context(data->job->bs);
+    aio_context = blk_get_aio_context(data->job->blk);
    aio_context_acquire(aio_context);

    data->job->deferred_to_main_loop = false;
@@ -485,7 +513,7 @@ void block_job_defer_to_main_loop(BlockJob *job,
    BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data));
    data->job = job;
    data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data);
-    data->aio_context = bdrv_get_aio_context(job->bs);
+    data->aio_context = blk_get_aio_context(job->blk);
    data->fn = fn;
    data->opaque = opaque;
    job->deferred_to_main_loop = true;
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -345,6 +345,15 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
        *last_tb = NULL;
        cpu->tb_flushed = false;
    }
+#ifndef CONFIG_USER_ONLY
+    /* We don't take care of direct jumps when address mapping changes in
+     * system emulation. So it's not safe to make a direct jump to a TB
+     * spanning two pages because the mapping for the second page can change.
+     */
+    if (tb->page_addr[1] != -1) {
+        *last_tb = NULL;
+    }
+#endif
    /* See if we can patch the calling TB. */
    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
        tb_add_jump(*last_tb, tb_exit, tb);
--- a/cpus.c
+++ b/cpus.c
@@ -972,6 +972,18 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
    qemu_cpu_kick(cpu);
 }

+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
+{
+    if (kvm_destroy_vcpu(cpu) < 0) {
+        error_report("kvm_destroy_vcpu failed");
+        exit(EXIT_FAILURE);
+    }
+}
+
+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
+{
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
    struct qemu_work_item *wi;
@@ -1061,7 +1073,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

-    while (1) {
+    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
@@ -1069,8 +1081,12 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
            }
        }
        qemu_kvm_wait_io_event(cpu);
-    }
+    } while (!cpu->unplug || cpu_can_run(cpu));

+    qemu_kvm_destroy_vcpu(cpu);
+    cpu->created = false;
+    qemu_cond_signal(&qemu_cpu_cond);
+    qemu_mutex_unlock_iothread();
    return NULL;
 }

@@ -1124,6 +1140,7 @@ static void tcg_exec_all(void);
 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
    CPUState *cpu = arg;
+    CPUState *remove_cpu = NULL;

    rcu_register_thread();

@@ -1161,6 +1178,18 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
            }
        }
        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+        CPU_FOREACH(cpu) {
+            if (cpu->unplug && !cpu_can_run(cpu)) {
+                remove_cpu = cpu;
+                break;
+            }
+        }
+        if (remove_cpu) {
+            qemu_tcg_destroy_vcpu(remove_cpu);
+            cpu->created = false;
+            qemu_cond_signal(&qemu_cpu_cond);
+            remove_cpu = NULL;
+        }
    }

    return NULL;
@@ -1317,6 +1346,21 @@ void resume_all_vcpus(void)
    }
 }

+void cpu_remove(CPUState *cpu)
+{
+    cpu->stop = true;
+    cpu->unplug = true;
+    qemu_cpu_kick(cpu);
+}
+
+void cpu_remove_sync(CPUState *cpu)
+{
+    cpu_remove(cpu);
+    while (cpu->created) {
+        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
+    }
+}
+
 /* For temporary buffers for forming a name */
 #define VCPU_THREAD_NAME_SIZE 16

@@ -1533,6 +1577,9 @@ static void tcg_exec_all(void)
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
+            if (cpu->unplug) {
+                next_cpu = CPU_NEXT(cpu);
+            }
            break;
        }
    }
--- a/cputlb.c
+++ b/cputlb.c
@@ -246,7 +246,8 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
 {
    ram_addr_t ram_addr;

-    if (qemu_ram_addr_from_host(ptr, &ram_addr) == NULL) {
+    ram_addr = qemu_ram_addr_from_host(ptr);
+    if (ram_addr == RAM_ADDR_INVALID) {
        fprintf(stderr, "Bad ram pointer %p\n", ptr);
        abort();
    }
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -18,6 +18,7 @@ CONFIG_MEGASAS_SCSI_PCI=y
 CONFIG_MPTSAS_SCSI_PCI=y
 CONFIG_RTL8139_PCI=y
 CONFIG_E1000_PCI=y
+CONFIG_E1000E_PCI=y
 CONFIG_VMXNET3_PCI=y
 CONFIG_IDE_CORE=y
 CONFIG_IDE_QDEV=y
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -70,7 +70,7 @@ void qemu_sglist_destroy(QEMUSGList *qsg)

 typedef struct {
    BlockAIOCB common;
-    BlockBackend *blk;
+    AioContext *ctx;
    BlockAIOCB *acb;
    QEMUSGList *sg;
    uint64_t offset;
@@ -80,6 +80,7 @@ typedef struct {
    QEMUIOVector iov;
    QEMUBH *bh;
    DMAIOFunc *io_func;
+    void *io_func_opaque;
 } DMAAIOCB;

 static void dma_blk_cb(void *opaque, int ret);
@@ -154,8 +155,7 @@ static void dma_blk_cb(void *opaque, int ret)

    if (dbs->iov.size == 0) {
        trace_dma_map_wait(dbs);
-        dbs->bh = aio_bh_new(blk_get_aio_context(dbs->blk),
-                             reschedule_dma, dbs);
+        dbs->bh = aio_bh_new(dbs->ctx, reschedule_dma, dbs);
        cpu_register_map_client(dbs->bh);
        return;
    }
@@ -164,8 +164,8 @@ static void dma_blk_cb(void *opaque, int ret)
        qemu_iovec_discard_back(&dbs->iov, dbs->iov.size & ~BDRV_SECTOR_MASK);
    }

-    dbs->acb = dbs->io_func(dbs->blk, dbs->offset, &dbs->iov, 0,
-                            dma_blk_cb, dbs);
+    dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
+                            dma_blk_cb, dbs, dbs->io_func_opaque);
    assert(dbs->acb);
 }

@@ -191,23 +191,25 @@ static const AIOCBInfo dma_aiocb_info = {
    .cancel_async       = dma_aio_cancel,
 };

-BlockAIOCB *dma_blk_io(
-    BlockBackend *blk, QEMUSGList *sg, uint64_t sector_num,
-    DMAIOFunc *io_func, BlockCompletionFunc *cb,
+BlockAIOCB *dma_blk_io(AioContext *ctx,
+    QEMUSGList *sg, uint64_t offset,
+    DMAIOFunc *io_func, void *io_func_opaque,
+    BlockCompletionFunc *cb,
    void *opaque, DMADirection dir)
 {
-    DMAAIOCB *dbs = blk_aio_get(&dma_aiocb_info, blk, cb, opaque);
+    DMAAIOCB *dbs = qemu_aio_get(&dma_aiocb_info, NULL, cb, opaque);

-    trace_dma_blk_io(dbs, blk, sector_num, (dir == DMA_DIRECTION_TO_DEVICE));
+    trace_dma_blk_io(dbs, io_func_opaque, offset, (dir == DMA_DIRECTION_TO_DEVICE));

    dbs->acb = NULL;
-    dbs->blk = blk;
    dbs->sg = sg;
-    dbs->offset = sector_num << BDRV_SECTOR_BITS;
+    dbs->ctx = ctx;
+    dbs->offset = offset;
    dbs->sg_cur_index = 0;
    dbs->sg_cur_byte = 0;
    dbs->dir = dir;
    dbs->io_func = io_func;
+    dbs->io_func_opaque = io_func_opaque;
    dbs->bh = NULL;
    qemu_iovec_init(&dbs->iov, sg->nsg);
    dma_blk_cb(dbs, 0);
@@ -215,19 +217,39 @@ BlockAIOCB *dma_blk_io(
 }


+static
+BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov,
+                                 BlockCompletionFunc *cb, void *cb_opaque,
+                                 void *opaque)
+{
+    BlockBackend *blk = opaque;
+    return blk_aio_preadv(blk, offset, iov, 0, cb, cb_opaque);
+}
+
 BlockAIOCB *dma_blk_read(BlockBackend *blk,
-                         QEMUSGList *sg, uint64_t sector,
+                         QEMUSGList *sg, uint64_t offset,
                         void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk, sg, sector, blk_aio_preadv, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk),
+                      sg, offset, dma_blk_read_io_func, blk, cb, opaque,
                      DMA_DIRECTION_FROM_DEVICE);
 }

+static
+BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov,
+                                  BlockCompletionFunc *cb, void *cb_opaque,
+                                  void *opaque)
+{
+    BlockBackend *blk = opaque;
+    return blk_aio_pwritev(blk, offset, iov, 0, cb, cb_opaque);
+}
+
 BlockAIOCB *dma_blk_write(BlockBackend *blk,
-                          QEMUSGList *sg, uint64_t sector,
+                          QEMUSGList *sg, uint64_t offset,
                          void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk, sg, sector, blk_aio_pwritev, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk),
+                      sg, offset, dma_blk_write_io_func, blk, cb, opaque,
                      DMA_DIRECTION_TO_DEVICE);
 }

--- a/docs/atomics.txt
+++ b/docs/atomics.txt
@@ -326,21 +326,41 @@ and memory barriers, and the equivalents in QEMU:
  use a boxed atomic_t type; atomic operations in QEMU are polymorphic
  and use normal C types.

- atomic_read and atomic_set in Linux give no guarantee at all;
-  atomic_read and atomic_set in QEMU include a compiler barrier
-  (similar to the READ_ONCE/WRITE_ONCE macros in Linux).
+- Originally, atomic_read and atomic_set in Linux gave no guarantee
+  at all. Linux 4.1 updated them to implement volatile
+  semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE).

- most atomic read-modify-write operations in Linux return void;
-  in QEMU, all of them return the old value of the variable.
+  QEMU's atomic_read/set implement, if the compiler supports it, C11
+  atomic relaxed semantics, and volatile semantics otherwise.
+  Both semantics prevent the compiler from doing certain transformations;
+  the difference is that atomic accesses are guaranteed to be atomic,
+  while volatile accesses aren't. Thus, in the volatile case we just cross
+  our fingers hoping that the compiler will generate atomic accesses,
+  since we assume the variables passed are machine-word sized and
+  properly aligned.
+  No barriers are implied by atomic_read/set in either Linux or QEMU.
+
+- atomic read-modify-write operations in Linux are of three kinds:
+
+         atomic_OP          returns void
+         atomic_OP_return   returns new value of the variable
+         atomic_fetch_OP    returns the old value of the variable
+         atomic_cmpxchg     returns the old value of the variable
+
+  In QEMU, the second kind does not exist.  Currently Linux has
+  atomic_fetch_or only.  QEMU provides and, or, inc, dec, add, sub.

 - different atomic read-modify-write operations in Linux imply
  a different set of memory barriers; in QEMU, all of them enforce
  sequential consistency, which means they imply full memory barriers
  before and after the operation.

- Linux does not have an equivalent of atomic_mb_read() and
-  atomic_mb_set().  In particular, note that set_mb() is a little
-  weaker than atomic_mb_set().
+- Linux does not have an equivalent of atomic_mb_set().  In particular,
+  note that smp_store_mb() is a little weaker than atomic_mb_set().
+  atomic_mb_read() compiles to the same instructions as Linux's
+  smp_load_acquire(), but this should be treated as an implementation
+  detail.  If required, QEMU might later add atomic_load_acquire() and
+  atomic_store_release() macros.


 SOURCES
--- a/docs/build-system.txt
+++ b/docs/build-system.txt
@@ -438,6 +438,11 @@ top level Makefile, so anything defined in this file will influence the
 entire build system. Care needs to be taken when writing rules for tests
 to ensure they only apply to the unit test execution / build.

+- tests/docker/Makefile.include
+
+Rules for Docker tests. Like tests/Makefile, this file is included
+directly by the top level Makefile, anything defined in this file will
+influence the entire build system.

 - po/Makefile

--- a/docs/igd-assign.txt
+++ b/docs/igd-assign.txt
@@ -0,0 +1,133 @@
+Intel Graphics Device (IGD) assignment with vfio-pci
+====================================================
+
+IGD has two different modes for assignment using vfio-pci:
+
+1) Universal Pass-Through (UPT) mode:
+
+   In this mode the IGD device is added as a *secondary* (ie. non-primary)
+   graphics device in combination with an emulated primary graphics device.
+   This mode *requires* guest driver support to remove the external
+   dependencies generally associated with IGD (see below).  Those guest
+   drivers only support this mode for Broadwell and newer IGD, according to
+   Intel.  Additionally, this mode by default, and as officially supported
+   by Intel, does not support direct video output.  The intention is to use
+   this mode either to provide hardware acceleration to the emulated graphics
+   or to use this mode in combination with guest-based remote access software,
+   for example VNC (see below for optional output support).  This mode
+   theoretically has no device specific handling dependencies on vfio-pci or
+   the VM firmware.
+
+2) "Legacy" mode:
+
+   In this mode the IGD device is intended to be the primary and exclusive
+   graphics device in the VM[1], as such QEMU does not facilitate any sort
+   of remote graphics to the VM in this mode.  A connected physical monitor
+   is the intended output device for IGD.  This mode includes several
+   requirements and restrictions:
+
+    * IGD must be given address 02.0 on the PCI root bus in the VM
+    * The host kernel must support vfio extensions for IGD (v4.6)
+    * vfio VGA support very likely needs to be enabled in the host kernel
+    * The VM firmware must support specific fw_cfg enablers for IGD
+    * The VM machine type must support a PCI host bridge at 00.0 (standard)
+    * The VM machine type must provide or allow to be created a special
+      ISA/LPC bridge device (vfio-pci-igd-lpc-bridge) on the root bus at
+      PCI address 1f.0.
+    * The IGD device must have a VGA ROM, either provided via the romfile
+      option or loaded automatically through vfio (standard).  rombar=0
+      will disable legacy mode support.
+    * Hotplug of the IGD device is not supported.
+    * The IGD device must be a SandyBridge or newer model device.
+
+For either mode, depending on the host kernel, the i915 driver in the host
+may generate faults and errors upon re-binding to an IGD device after it
+has been assigned to a VM.  It's therefore generally recommended to prevent
+such driver binding unless the host driver is known to work well for this.
+There are numerous ways to do this, i915 can be blacklisted on the host,
+the driver_override option can be used to ensure that only vfio-pci can bind
+to the device on the host[2], virsh nodedev-detach can be used to bind the
+device to vfio drivers and then managed='no' set in the VM xml to prevent
+re-binding to i915, etc.  Also note that IGD is also typically the primary
+graphics in the host and special options may be required beyond simply
+blacklisting i915 or using pci-stub/vfio-pci to take ownership of IGD as a
+PCI class device.  Lower level drivers exist that may still claim the device.
+It may therefore be necessary to use kernel boot options video=vesafb:off or
+video=efifb:off (depending on host BIOS/UEFI) or these can be combined to
+a catch-all, video=vesafb:off,efifb:off.  Error messages such as:
+
+    Failed to mmap 0000:00:02.0 BAR <>. Performance may be slow
+
+are a good indicator that such a problem exists.  The host files /proc/iomem
+and /proc/ioports are often useful for identifying drivers consuming ranges
+of the device to cause such conflicts.
+
+Additionally, IGD device are known to generate small numbers of DMAR faults
+when initially assigned.  It is believed that this is simply the IGD attempting
+to access the reserved GTT space after reset, which it no longer has access to
+when accessed from userspace.  So long as the DMAR faults are small in number
+and most importantly, not ongoing, these are not an indication of an error.
+
+Additionally++, analog VGA output (as opposed to digital outputs like HDMI,
+DVI, or DisplayPort) may be unsupported in some use cases.  In the author's
+experience, even DP to VGA adapters can be troublesome while adapters between
+digital formats work well.
+
+Usage
+=====
+The intention is for IGD assignment to be transparent for users and thus for
+management tools like libvirt.  To make use of legacy mode, simply remove all
+other graphics options and use "-nographic" and either "-vga none" or
+"-nodefaults", along with adding the device using vfio-pci:
+
+    -device vfio-pci,host=00:02.0,id=hostdev0,bus=pci.0,addr=0x2
+
+For UPT mode, retain the default emulated graphics and simply add the vfio-pci
+device making use of any other bus address other than 02.0.  libvirt will
+default to assigning the device a UPT compatible address while legacy mode
+users will need to manually edit the XML if using a tool like virt-manager
+where the VM device address is not expressly specified.
+
+An experimental vfio-pci option also exists to enable OpRegion, and thus
+external monitor support, for UPT mode.  This can be enabled by adding
+"x-igd-opregion=on" to the vfio-pci device options for the IGD device.  As
+with legacy mode, this requires the host to support features introduced in
+the v4.6 kernel.  If Intel chooses to embrace this support, the option may
+be made non-experimental in the future, opening it to libvirt support.
+
+Developer ABI
+=============
+Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware:
+
+1) "etc/igd-opregion"
+
+   This fw_cfg file exposes the OpRegion for the IGD device.  A reserved
+   region should be created below 4GB (recommended 4KB alignment), sized
+   sufficient for the fw_cfg file size, and the content of this file copied
+   to it.  The dword based address of this reserved memory region must also
+   be written to the ASLS register at offset 0xFC on the IGD device.  It is
+   recommended that firmware should make use of this fw_cfg entry for any
+   PCI class VGA device with Intel vendor ID.  Multiple of such devices
+   within a VM is undefined.
+
+2) "etc/igd-bdsm-size"
+
+   This fw_cfg file contains an 8-byte, little endian integer indicating
+   the size of the reserved memory region required for IGD stolen memory.
+   Firmware must allocate a reserved memory below 4GB with required 1MB
+   alignment equal to this size.  Additionally the base address of this
+   reserved region must be written to the dword BDSM register in PCI config
+   space of the IGD device at offset 0x5C.  As this support is related to
+   running the IGD ROM, which has other dependencies on the device appearing
+   at guest address 00:02.0, it's expected that this fw_cfg file is only
+   relevant to a single PCI class VGA device with Intel vendor ID, appearing
+   at PCI bus address 00:02.0.
+
+Footnotes
+=========
+[1] Nothing precludes adding additional emulated or assigned graphics devices
+    as non-primary, other than the combination typically not working.  I only
+    intend to set user expectations, others are welcome to find working
+    combinations or fix whatever issues prevent this from working in the common
+    case.
+[2] # echo "vfio-pci" > /sys/bus/pci/devices/0000:00:02.0/driver_override
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -403,8 +403,8 @@ listen thread:                     --- page -- page -- page -- page -- page --

 On receipt of CMD_PACKAGED (1)
   All the data associated with the package - the ( ... ) section in the
-diagram - is read into memory (into a QEMUSizedBuffer), and the main thread
-recurses into qemu_loadvm_state_main to process the contents of the package (2)
+diagram - is read into memory, and the main thread recurses into
+qemu_loadvm_state_main to process the contents of the package (2)
 which contains commands (3,6) and devices (4...)

 On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package)
--- a/exec.c
+++ b/exec.c
@@ -57,6 +57,8 @@
 #include "exec/ram_addr.h"
 #include "exec/log.h"

+#include "migration/vmstate.h"
+
 #include "qemu/range.h"
 #ifndef _WIN32
 #include "qemu/mmap-alloc.h"
@@ -612,15 +614,9 @@ static int cpu_get_free_index(Error **errp)
    return cpu;
 }

-void cpu_exec_exit(CPUState *cpu)
+static void cpu_release_index(CPUState *cpu)
 {
-    if (cpu->cpu_index == -1) {
-        /* cpu_index was never allocated by this @cpu or was already freed. */
-        return;
-    }
-
    bitmap_clear(cpu_index_map, cpu->cpu_index, 1);
-    cpu->cpu_index = -1;
 }
 #else

@@ -635,11 +631,42 @@ static int cpu_get_free_index(Error **errp)
    return cpu_index;
 }

-void cpu_exec_exit(CPUState *cpu)
+static void cpu_release_index(CPUState *cpu)
 {
+    return;
 }
 #endif

+void cpu_exec_exit(CPUState *cpu)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+
+#if defined(CONFIG_USER_ONLY)
+    cpu_list_lock();
+#endif
+    if (cpu->cpu_index == -1) {
+        /* cpu_index was never allocated by this @cpu or was already freed. */
+#if defined(CONFIG_USER_ONLY)
+        cpu_list_unlock();
+#endif
+        return;
+    }
+
+    QTAILQ_REMOVE(&cpus, cpu, node);
+    cpu_release_index(cpu);
+    cpu->cpu_index = -1;
+#if defined(CONFIG_USER_ONLY)
+    cpu_list_unlock();
+#endif
+
+    if (cc->vmsd != NULL) {
+        vmstate_unregister(NULL, cc->vmsd, cpu);
+    }
+    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
+        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
+    }
+}
+
 void cpu_exec_init(CPUState *cpu, Error **errp)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
@@ -1815,40 +1842,6 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
 }
 #endif /* !_WIN32 */

-int qemu_get_ram_fd(ram_addr_t addr)
-{
-    RAMBlock *block;
-    int fd;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
-    fd = block->fd;
-    rcu_read_unlock();
-    return fd;
-}
-
-void qemu_set_ram_fd(ram_addr_t addr, int fd)
-{
-    RAMBlock *block;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
-    block->fd = fd;
-    rcu_read_unlock();
-}
-
-void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
-{
-    RAMBlock *block;
-    void *ptr;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
-    ptr = ramblock_ptr(block, 0);
-    rcu_read_unlock();
-    return ptr;
-}
-
 /* Return a host pointer to ram allocated with qemu_ram_alloc.
 * This should not be used for general purpose DMA.  Use address_space_map
 * or address_space_rw instead. For local memory (e.g. video ram) that the
@@ -1856,12 +1849,13 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
 *
 * Called within RCU critical section.
 */
-void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
+void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
 {
    RAMBlock *block = ram_block;

    if (block == NULL) {
        block = qemu_get_ram_block(addr);
+        addr -= block->offset;
    }

    if (xen_enabled() && block->host == NULL) {
@@ -1875,10 +1869,10 @@ void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)

        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }
-    return ramblock_ptr(block, addr - block->offset);
+    return ramblock_ptr(block, addr);
 }

-/* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
+/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
 * but takes a size argument.
 *
 * Called within RCU critical section.
@@ -1887,16 +1881,15 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
                                 hwaddr *size)
 {
    RAMBlock *block = ram_block;
-    ram_addr_t offset_inside_block;
    if (*size == 0) {
        return NULL;
    }

    if (block == NULL) {
        block = qemu_get_ram_block(addr);
+        addr -= block->offset;
    }
-    offset_inside_block = addr - block->offset;
-    *size = MIN(*size, block->max_length - offset_inside_block);
+    *size = MIN(*size, block->max_length - addr);

    if (xen_enabled() && block->host == NULL) {
        /* We need to check if the requested address is in the RAM
@@ -1910,7 +1903,7 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }

-    return ramblock_ptr(block, offset_inside_block);
+    return ramblock_ptr(block, addr);
 }

 /*
@@ -1931,16 +1924,16 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
 * ram_addr_t.
 */
 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
-                                   ram_addr_t *ram_addr,
                                   ram_addr_t *offset)
 {
    RAMBlock *block;
    uint8_t *host = ptr;

    if (xen_enabled()) {
+        ram_addr_t ram_addr;
        rcu_read_lock();
-        *ram_addr = xen_ram_addr_from_mapcache(ptr);
-        block = qemu_get_ram_block(*ram_addr);
+        ram_addr = xen_ram_addr_from_mapcache(ptr);
+        block = qemu_get_ram_block(ram_addr);
        if (block) {
            *offset = (host - block->host);
        }
@@ -1972,7 +1965,6 @@ found:
    if (round_offset) {
        *offset &= TARGET_PAGE_MASK;
    }
-    *ram_addr = block->offset + *offset;
    rcu_read_unlock();
    return block;
 }
@@ -1999,18 +1991,17 @@ RAMBlock *qemu_ram_block_by_name(const char *name)

 /* Some of the softmmu routines need to translate from a host pointer
   (typically a TLB entry) back to a ram offset.  */
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
 {
    RAMBlock *block;
-    ram_addr_t offset; /* Not used */
-
-    block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset);
+    ram_addr_t offset;

+    block = qemu_ram_block_from_host(ptr, false, &offset);
    if (!block) {
-        return NULL;
+        return RAM_ADDR_INVALID;
    }

-    return block->mr;
+    return block->offset + offset;
 }

 /* Called within RCU critical section.  */
@@ -2022,13 +2013,13 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
    }
    switch (size) {
    case 1:
-        stb_p(qemu_get_ram_ptr(NULL, ram_addr), val);
+        stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
        break;
    case 2:
-        stw_p(qemu_get_ram_ptr(NULL, ram_addr), val);
+        stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
        break;
    case 4:
-        stl_p(qemu_get_ram_ptr(NULL, ram_addr), val);
+        stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
        break;
    default:
        abort();
@@ -2490,6 +2481,8 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
                                     hwaddr length)
 {
    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+    addr += memory_region_get_ram_addr(mr);
+
    /* No early return if dirty_log_mask is or becomes 0, because
     * cpu_physical_memory_set_dirty_range will still call
     * xen_modified_memory.
@@ -2602,9 +2595,8 @@ static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
                abort();
            }
        } else {
-            addr1 += memory_region_get_ram_addr(mr);
            /* RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
            memcpy(ptr, buf, l);
            invalidate_and_set_dirty(mr, addr1, l);
        }
@@ -2695,8 +2687,7 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
            }
        } else {
            /* RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block,
-                                   memory_region_get_ram_addr(mr) + addr1);
+            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
            memcpy(buf, ptr, l);
        }

@@ -2779,9 +2770,8 @@ static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
              memory_region_is_romd(mr))) {
            l = memory_access_size(mr, l, addr1);
        } else {
-            addr1 += memory_region_get_ram_addr(mr);
            /* ROM/RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
            switch (type) {
            case WRITE_DATA:
                memcpy(ptr, buf, l);
@@ -2939,7 +2929,6 @@ void *address_space_map(AddressSpace *as,
    hwaddr done = 0;
    hwaddr l, xlat, base;
    MemoryRegion *mr, *this_mr;
-    ram_addr_t raddr;
    void *ptr;

    if (len == 0) {
@@ -2974,7 +2963,6 @@ void *address_space_map(AddressSpace *as,
    }

    base = xlat;
-    raddr = memory_region_get_ram_addr(mr);

    for (;;) {
        len -= l;
@@ -2993,7 +2981,7 @@ void *address_space_map(AddressSpace *as,

    memory_region_ref(mr);
    *plen = done;
-    ptr = qemu_ram_ptr_length(mr->ram_block, raddr + base, plen);
+    ptr = qemu_ram_ptr_length(mr->ram_block, base, plen);
    rcu_read_unlock();

    return ptr;
@@ -3010,7 +2998,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
        MemoryRegion *mr;
        ram_addr_t addr1;

-        mr = qemu_ram_addr_from_host(buffer, &addr1);
+        mr = memory_region_from_host(buffer, &addr1);
        assert(mr != NULL);
        if (is_write) {
            invalidate_and_set_dirty(mr, addr1, access_len);
@@ -3077,8 +3065,7 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
 #endif
    } else {
        /* RAM case */
-        ptr = qemu_get_ram_ptr(mr->ram_block,
-                               memory_region_get_ram_addr(mr) + addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            val = ldl_le_p(ptr);
@@ -3171,8 +3158,7 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
 #endif
    } else {
        /* RAM case */
-        ptr = qemu_get_ram_ptr(mr->ram_block,
-                               memory_region_get_ram_addr(mr) + addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            val = ldq_le_p(ptr);
@@ -3285,8 +3271,7 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as,
 #endif
    } else {
        /* RAM case */
-        ptr = qemu_get_ram_ptr(mr->ram_block,
-                               memory_region_get_ram_addr(mr) + addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            val = lduw_le_p(ptr);
@@ -3368,13 +3353,13 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,

        r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
    } else {
-        addr1 += memory_region_get_ram_addr(mr);
-        ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        stl_p(ptr, val);

        dirty_log_mask = memory_region_get_dirty_log_mask(mr);
        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
-        cpu_physical_memory_set_dirty_range(addr1, 4, dirty_log_mask);
+        cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
+                                            4, dirty_log_mask);
        r = MEMTX_OK;
    }
    if (result) {
@@ -3423,8 +3408,7 @@ static inline void address_space_stl_internal(AddressSpace *as,
        r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
    } else {
        /* RAM case */
-        addr1 += memory_region_get_ram_addr(mr);
-        ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            stl_le_p(ptr, val);
@@ -3533,8 +3517,7 @@ static inline void address_space_stw_internal(AddressSpace *as,
        r = memory_region_dispatch_write(mr, addr1, val, 2, attrs);
    } else {
        /* RAM case */
-        addr1 += memory_region_get_ram_addr(mr);
-        ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            stw_le_p(ptr, val);
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1008,7 +1008,7 @@ ETEXI

    {
        .name       = "migrate_set_parameter",
-        .args_type  = "parameter:s,value:i",
+        .args_type  = "parameter:s,value:s",
        .params     = "parameter value",
        .help       = "Set the parameter for migration",
        .mhandler.cmd = hmp_migrate_set_parameter,
--- a/hmp.c
+++ b/hmp.c
@@ -35,6 +35,7 @@
 #include "block/qapi.h"
 #include "qemu-io.h"
 #include "qemu/cutils.h"
+#include "qemu/error-report.h"

 #ifdef CONFIG_SPICE
 #include <spice/enums.h>
@@ -168,8 +169,15 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
    }

    if (info->has_status) {
-        monitor_printf(mon, "Migration status: %s\n",
+        monitor_printf(mon, "Migration status: %s",
                       MigrationStatus_lookup[info->status]);
+        if (info->status == MIGRATION_STATUS_FAILED &&
+            info->has_error_desc) {
+            monitor_printf(mon, " (%s)\n", info->error_desc);
+        } else {
+            monitor_printf(mon, "\n");
+        }
+
        monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n",
                       info->total_time);
        if (info->has_expected_downtime) {
@@ -286,6 +294,12 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
        monitor_printf(mon, " %s: %" PRId64,
            MigrationParameter_lookup[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT],
            params->cpu_throttle_increment);
+        monitor_printf(mon, " %s: '%s'",
+            MigrationParameter_lookup[MIGRATION_PARAMETER_TLS_CREDS],
+            params->tls_creds ? : "");
+        monitor_printf(mon, " %s: '%s'",
+            MigrationParameter_lookup[MIGRATION_PARAMETER_TLS_HOSTNAME],
+            params->tls_hostname ? : "");
        monitor_printf(mon, "\n");
    }

@@ -1235,13 +1249,17 @@ void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict)
 void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
 {
    const char *param = qdict_get_str(qdict, "parameter");
-    int value = qdict_get_int(qdict, "value");
+    const char *valuestr = qdict_get_str(qdict, "value");
+    long valueint = 0;
    Error *err = NULL;
    bool has_compress_level = false;
    bool has_compress_threads = false;
    bool has_decompress_threads = false;
    bool has_cpu_throttle_initial = false;
    bool has_cpu_throttle_increment = false;
+    bool has_tls_creds = false;
+    bool has_tls_hostname = false;
+    bool use_int_value = false;
    int i;

    for (i = 0; i < MIGRATION_PARAMETER__MAX; i++) {
@@ -1249,25 +1267,46 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
            switch (i) {
            case MIGRATION_PARAMETER_COMPRESS_LEVEL:
                has_compress_level = true;
+                use_int_value = true;
                break;
            case MIGRATION_PARAMETER_COMPRESS_THREADS:
                has_compress_threads = true;
+                use_int_value = true;
                break;
            case MIGRATION_PARAMETER_DECOMPRESS_THREADS:
                has_decompress_threads = true;
+                use_int_value = true;
                break;
            case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL:
                has_cpu_throttle_initial = true;
+                use_int_value = true;
                break;
            case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT:
                has_cpu_throttle_increment = true;
                break;
+            case MIGRATION_PARAMETER_TLS_CREDS:
+                has_tls_creds = true;
+                break;
+            case MIGRATION_PARAMETER_TLS_HOSTNAME:
+                has_tls_hostname = true;
+                break;
            }
-            qmp_migrate_set_parameters(has_compress_level, value,
-                                       has_compress_threads, value,
-                                       has_decompress_threads, value,
-                                       has_cpu_throttle_initial, value,
-                                       has_cpu_throttle_increment, value,
+
+            if (use_int_value) {
+                if (qemu_strtol(valuestr, NULL, 10, &valueint) < 0) {
+                    error_setg(&err, "Unable to parse '%s' as an int",
+                               valuestr);
+                    goto cleanup;
+                }
+            }
+
+            qmp_migrate_set_parameters(has_compress_level, valueint,
+                                       has_compress_threads, valueint,
+                                       has_decompress_threads, valueint,
+                                       has_cpu_throttle_initial, valueint,
+                                       has_cpu_throttle_increment, valueint,
+                                       has_tls_creds, valuestr,
+                                       has_tls_hostname, valuestr,
                                       &err);
            break;
        }
@@ -1277,6 +1316,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
        error_setg(&err, QERR_INVALID_PARAMETER, param);
    }

+ cleanup:
    if (err) {
        error_report_err(err);
    }
@@ -1533,6 +1573,9 @@ static void hmp_migrate_status_cb(void *opaque)
        if (status->is_block_migration) {
            monitor_printf(status->mon, "\n");
        }
+        if (info->has_error_desc) {
+            error_report("%s", info->error_desc);
+        }
        monitor_resume(status->mon);
        timer_del(status->timer);
        g_free(status);
--- a/hw/arm/fsl-imx25.c
+++ b/hw/arm/fsl-imx25.c
@@ -191,6 +191,7 @@ static void fsl_imx25_realize(DeviceState *dev, Error **errp)
    }

    qdev_set_nic_properties(DEVICE(&s->fec), &nd_table[0]);
+
    object_property_set_bool(OBJECT(&s->fec), true, "realized", &err);
    if (err) {
        error_propagate(errp, err);
--- a/hw/arm/fsl-imx6.c
+++ b/hw/arm/fsl-imx6.c
@@ -105,6 +105,10 @@ static void fsl_imx6_init(Object *obj)
        snprintf(name, NAME_SIZE, "spi%d", i + 1);
        object_property_add_child(obj, name, OBJECT(&s->spi[i]), NULL);
    }
+
+    object_initialize(&s->eth, sizeof(s->eth), TYPE_IMX_ENET);
+    qdev_set_parent_bus(DEVICE(&s->eth), sysbus_get_default());
+    object_property_add_child(obj, "eth", OBJECT(&s->eth), NULL);
 }

 static void fsl_imx6_realize(DeviceState *dev, Error **errp)
@@ -381,6 +385,19 @@ static void fsl_imx6_realize(DeviceState *dev, Error **errp)
                                            spi_table[i].irq));
    }

+    object_property_set_bool(OBJECT(&s->eth), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->eth), 0, FSL_IMX6_ENET_ADDR);
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->eth), 0,
+                       qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                        FSL_IMX6_ENET_MAC_IRQ));
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->eth), 1,
+                       qdev_get_gpio_in(DEVICE(&s->a9mpcore),
+                                        FSL_IMX6_ENET_MAC_1588_IRQ));
+
    /* ROM memory */
    memory_region_init_rom_device(&s->rom, NULL, NULL, NULL, "imx6.rom",
                                  FSL_IMX6_ROM_SIZE, &err);
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -239,7 +239,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
    uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
    uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
    uint64_t data_size = (uint64_t)nlb << data_shift;
-    uint64_t aio_slba  = slba << (data_shift - BDRV_SECTOR_BITS);
+    uint64_t data_offset = slba << data_shift;
    int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
    enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;

@@ -258,8 +258,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
    req->has_sg = true;
    dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
    req->aiocb = is_write ?
-        dma_blk_write(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req) :
-        dma_blk_read(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req);
+        dma_blk_write(n->conf.blk, &req->qsg, data_offset, nvme_rw_cb, req) :
+        dma_blk_read(n->conf.blk, &req->qsg, data_offset, nvme_rw_cb, req);

    return NVME_NO_COMPLETE;
 }
--- a/hw/bt/hci-csr.c
+++ b/hw/bt/hci-csr.c
@@ -39,9 +39,14 @@ struct csrhci_s {
    int out_size;
    uint8_t outfifo[FIFO_LEN * 2];
    uint8_t inpkt[FIFO_LEN];
+    enum {
+        CSR_HDR_LEN,
+        CSR_DATA_LEN,
+        CSR_DATA
+    } in_state;
    int in_len;
    int in_hdr;
-    int in_data;
+    int in_needed;
    QEMUTimer *out_tm;
    int64_t baud_delay;

@@ -296,38 +301,60 @@ static int csrhci_data_len(const uint8_t *pkt)
    exit(-1);
 }

+static void csrhci_ready_for_next_inpkt(struct csrhci_s *s)
+{
+    s->in_state = CSR_HDR_LEN;
+    s->in_len = 0;
+    s->in_needed = 2;
+    s->in_hdr = INT_MAX;
+}
+
 static int csrhci_write(struct CharDriverState *chr,
                const uint8_t *buf, int len)
 {
    struct csrhci_s *s = (struct csrhci_s *) chr->opaque;
-    int plen = s->in_len;
+    int total = 0;

    if (!s->enable)
        return 0;

-    s->in_len += len;
-    memcpy(s->inpkt + plen, buf, len);
+    for (;;) {
+        int cnt = MIN(len, s->in_needed - s->in_len);
+        if (cnt) {
+            memcpy(s->inpkt + s->in_len, buf, cnt);
+            s->in_len += cnt;
+            buf += cnt;
+            len -= cnt;
+            total += cnt;
+        }

-    while (1) {
-        if (s->in_len >= 2 && plen < 2)
-            s->in_hdr = csrhci_header_len(s->inpkt) + 1;
-
-        if (s->in_len >= s->in_hdr && plen < s->in_hdr)
-            s->in_data = csrhci_data_len(s->inpkt) + s->in_hdr;
-
-        if (s->in_len >= s->in_data) {
-            csrhci_in_packet(s, s->inpkt);
-
-            memmove(s->inpkt, s->inpkt + s->in_len, s->in_len - s->in_data);
-            s->in_len -= s->in_data;
-            s->in_hdr = INT_MAX;
-            s->in_data = INT_MAX;
-            plen = 0;
-        } else
+        if (s->in_len < s->in_needed) {
            break;
+        }
+
+        if (s->in_state == CSR_HDR_LEN) {
+            s->in_hdr = csrhci_header_len(s->inpkt) + 1;
+            assert(s->in_hdr >= s->in_needed);
+            s->in_needed = s->in_hdr;
+            s->in_state = CSR_DATA_LEN;
+            continue;
+        }
+
+        if (s->in_state == CSR_DATA_LEN) {
+            s->in_needed += csrhci_data_len(s->inpkt);
+            /* hci_acl_hdr could specify more than 4096 bytes, so assert.  */
+            assert(s->in_needed <= sizeof(s->inpkt));
+            s->in_state = CSR_DATA;
+            continue;
+        }
+
+        if (s->in_state == CSR_DATA) {
+            csrhci_in_packet(s, s->inpkt);
+            csrhci_ready_for_next_inpkt(s);
+        }
    }

-    return len;
+    return total;
 }

 static void csrhci_out_hci_packet_event(void *opaque,
@@ -389,11 +416,9 @@ static void csrhci_reset(struct csrhci_s *s)
 {
    s->out_len = 0;
    s->out_size = FIFO_LEN;
-    s->in_len = 0;
+    csrhci_ready_for_next_inpkt(s);
    s->baud_delay = NANOSECONDS_PER_SECOND;
    s->enable = 0;
-    s->in_hdr = INT_MAX;
-    s->in_data = INT_MAX;

    s->modem_state = 0;
    /* After a while... (but sooner than 10ms) */
--- a/hw/char/escc.c
+++ b/hw/char/escc.c
@@ -983,9 +983,10 @@ void slavio_serial_ms_kbd_init(hwaddr base, qemu_irq irq,
    sysbus_mmio_map(s, 0, base);
 }

-static int escc_init1(SysBusDevice *dev)
+static void escc_init1(Object *obj)
 {
-    ESCCState *s = ESCC(dev);
+    ESCCState *s = ESCC(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
    unsigned int i;

    s->chn[0].disabled = s->disabled;
@@ -994,17 +995,26 @@ static int escc_init1(SysBusDevice *dev)
        sysbus_init_irq(dev, &s->chn[i].irq);
        s->chn[i].chn = 1 - i;
        s->chn[i].clock = s->frequency / 2;
+    }
+    s->chn[0].otherchn = &s->chn[1];
+    s->chn[1].otherchn = &s->chn[0];
+
+    memory_region_init_io(&s->mmio, obj, &escc_mem_ops, s, "escc",
+                          ESCC_SIZE << s->it_shift);
+    sysbus_init_mmio(dev, &s->mmio);
+}
+
+static void escc_realize(DeviceState *dev, Error **errp)
+{
+    ESCCState *s = ESCC(dev);
+    unsigned int i;
+
+    for (i = 0; i < 2; i++) {
        if (s->chn[i].chr) {
            qemu_chr_add_handlers(s->chn[i].chr, serial_can_receive,
                                  serial_receive1, serial_event, &s->chn[i]);
        }
    }
-    s->chn[0].otherchn = &s->chn[1];
-    s->chn[1].otherchn = &s->chn[0];
-
-    memory_region_init_io(&s->mmio, OBJECT(s), &escc_mem_ops, s, "escc",
-                          ESCC_SIZE << s->it_shift);
-    sysbus_init_mmio(dev, &s->mmio);

    if (s->chn[0].type == mouse) {
        qemu_add_mouse_event_handler(sunmouse_event, &s->chn[0], 0,
@@ -1014,8 +1024,6 @@ static int escc_init1(SysBusDevice *dev)
        s->chn[1].hs = qemu_input_handler_register((DeviceState *)(&s->chn[1]),
                                                   &sunkbd_handler);
    }
-
-    return 0;
 }

 static Property escc_properties[] = {
@@ -1032,10 +1040,9 @@ static Property escc_properties[] = {
 static void escc_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = escc_init1;
    dc->reset = escc_reset;
+    dc->realize = escc_realize;
    dc->vmsd = &vmstate_escc;
    dc->props = escc_properties;
    set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
@@ -1045,6 +1052,7 @@ static const TypeInfo escc_info = {
    .name          = TYPE_ESCC,
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(ESCCState),
+    .instance_init = escc_init1,
    .class_init    = escc_class_init,
 };

--- a/hw/char/etraxfs_ser.c
+++ b/hw/char/etraxfs_ser.c
@@ -159,6 +159,11 @@ static const MemoryRegionOps ser_ops = {
    }
 };

+static Property etraxfs_ser_properties[] = {
+    DEFINE_PROP_CHR("chardev", ETRAXSerial, chr),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void serial_receive(void *opaque, const uint8_t *buf, int size)
 {
    ETRAXSerial *s = opaque;
@@ -209,40 +214,42 @@ static void etraxfs_ser_reset(DeviceState *d)

 }

-static int etraxfs_ser_init(SysBusDevice *dev)
+static void etraxfs_ser_init(Object *obj)
+{
+    ETRAXSerial *s = ETRAX_SERIAL(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
+
+    sysbus_init_irq(dev, &s->irq);
+    memory_region_init_io(&s->mmio, obj, &ser_ops, s,
+                          "etraxfs-serial", R_MAX * 4);
+    sysbus_init_mmio(dev, &s->mmio);
+}
+
+static void etraxfs_ser_realize(DeviceState *dev, Error **errp)
 {
    ETRAXSerial *s = ETRAX_SERIAL(dev);

-    sysbus_init_irq(dev, &s->irq);
-    memory_region_init_io(&s->mmio, OBJECT(s), &ser_ops, s,
-                          "etraxfs-serial", R_MAX * 4);
-    sysbus_init_mmio(dev, &s->mmio);
-
-    /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-    s->chr = qemu_char_get_next_serial();
    if (s->chr) {
        qemu_chr_add_handlers(s->chr,
                              serial_can_receive, serial_receive,
                              serial_event, s);
    }
-    return 0;
 }

 static void etraxfs_ser_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = etraxfs_ser_init;
    dc->reset = etraxfs_ser_reset;
-    /* Reason: init() method uses qemu_char_get_next_serial() */
-    dc->cannot_instantiate_with_device_add_yet = true;
+    dc->props = etraxfs_ser_properties;
+    dc->realize = etraxfs_ser_realize;
 }

 static const TypeInfo etraxfs_ser_info = {
    .name          = TYPE_ETRAX_FS_SERIAL,
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(ETRAXSerial),
+    .instance_init = etraxfs_ser_init,
    .class_init    = etraxfs_ser_class_init,
 };

--- a/hw/char/lm32_juart.c
+++ b/hw/char/lm32_juart.c
@@ -114,17 +114,13 @@ static void juart_reset(DeviceState *d)
    s->jrx = 0;
 }

-static int lm32_juart_init(SysBusDevice *dev)
+static void lm32_juart_realize(DeviceState *dev, Error **errp)
 {
    LM32JuartState *s = LM32_JUART(dev);

-    /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-    s->chr = qemu_char_get_next_serial();
    if (s->chr) {
        qemu_chr_add_handlers(s->chr, juart_can_rx, juart_rx, juart_event, s);
    }
-
-    return 0;
 }

 static const VMStateDescription vmstate_lm32_juart = {
@@ -138,16 +134,19 @@ static const VMStateDescription vmstate_lm32_juart = {
    }
 };

+static Property lm32_juart_properties[] = {
+    DEFINE_PROP_CHR("chardev", LM32JuartState, chr),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void lm32_juart_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = lm32_juart_init;
    dc->reset = juart_reset;
    dc->vmsd = &vmstate_lm32_juart;
-    /* Reason: init() method uses qemu_char_get_next_serial() */
-    dc->cannot_instantiate_with_device_add_yet = true;
+    dc->props = lm32_juart_properties;
+    dc->realize = lm32_juart_realize;
 }

 static const TypeInfo lm32_juart_info = {
--- a/hw/char/lm32_uart.c
+++ b/hw/char/lm32_uart.c
@@ -249,23 +249,25 @@ static void uart_reset(DeviceState *d)
    s->regs[R_LSR] = LSR_THRE | LSR_TEMT;
 }

-static int lm32_uart_init(SysBusDevice *dev)
+static void lm32_uart_init(Object *obj)
 {
-    LM32UartState *s = LM32_UART(dev);
+    LM32UartState *s = LM32_UART(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);

    sysbus_init_irq(dev, &s->irq);

-    memory_region_init_io(&s->iomem, OBJECT(s), &uart_ops, s,
+    memory_region_init_io(&s->iomem, obj, &uart_ops, s,
                          "uart", R_MAX * 4);
    sysbus_init_mmio(dev, &s->iomem);
+}
+
+static void lm32_uart_realize(DeviceState *dev, Error **errp)
+{
+    LM32UartState *s = LM32_UART(dev);

-    /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-    s->chr = qemu_char_get_next_serial();
    if (s->chr) {
        qemu_chr_add_handlers(s->chr, uart_can_rx, uart_rx, uart_event, s);
    }
-
-    return 0;
 }

 static const VMStateDescription vmstate_lm32_uart = {
@@ -278,22 +280,26 @@ static const VMStateDescription vmstate_lm32_uart = {
    }
 };

+static Property lm32_uart_properties[] = {
+    DEFINE_PROP_CHR("chardev", LM32UartState, chr),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void lm32_uart_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);

-    k->init = lm32_uart_init;
    dc->reset = uart_reset;
    dc->vmsd = &vmstate_lm32_uart;
-    /* Reason: init() method uses qemu_char_get_next_serial() */
-    dc->cannot_instantiate_with_device_add_yet = true;
+    dc->props = lm32_uart_properties;
+    dc->realize = lm32_uart_realize;
 }

 static const TypeInfo lm32_uart_info = {
    .name          = TYPE_LM32_UART,
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(LM32UartState),
+    .instance_init = lm32_uart_init,
    .class_init    = lm32_uart_class_init,
 };

--- a/hw/char/milkymist-uart.c
+++ b/hw/char/milkymist-uart.c
@@ -200,8 +200,6 @@ static void milkymist_uart_realize(DeviceState *dev, Error **errp)
 {
    MilkymistUartState *s = MILKYMIST_UART(dev);

-    /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-    s->chr = qemu_char_get_next_serial();
    if (s->chr) {
        qemu_chr_add_handlers(s->chr, uart_can_rx, uart_rx, uart_event, s);
    }
@@ -229,6 +227,11 @@ static const VMStateDescription vmstate_milkymist_uart = {
    }
 };

+static Property milkymist_uart_properties[] = {
+    DEFINE_PROP_CHR("chardev", MilkymistUartState, chr),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void milkymist_uart_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
@@ -236,8 +239,7 @@ static void milkymist_uart_class_init(ObjectClass *klass, void *data)
    dc->realize = milkymist_uart_realize;
    dc->reset = milkymist_uart_reset;
    dc->vmsd = &vmstate_milkymist_uart;
-    /* Reason: realize() method uses qemu_char_get_next_serial() */
-    dc->cannot_instantiate_with_device_add_yet = true;
+    dc->props = milkymist_uart_properties;
 }

 static const TypeInfo milkymist_uart_info = {
--- a/hw/core/Makefile.objs
+++ b/hw/core/Makefile.objs
@@ -1,5 +1,6 @@
 # core qdev-related obj files, also used by *-user:
 common-obj-y += qdev.o qdev-properties.o
+common-obj-y += bus.o
 common-obj-y += fw-path-provider.o
 # irq.o needed for qdev GPIO handling:
 common-obj-y += irq.o
--- a/hw/core/bus.c
+++ b/hw/core/bus.c
@@ -0,0 +1,251 @@
+/*
+ *  Dynamic device configuration and creation -- buses.
+ *
+ *  Copyright (c) 2009 CodeSourcery
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "hw/qdev.h"
+#include "qapi/error.h"
+
+static void qbus_set_hotplug_handler_internal(BusState *bus, Object *handler,
+                                              Error **errp)
+{
+
+    object_property_set_link(OBJECT(bus), OBJECT(handler),
+                             QDEV_HOTPLUG_HANDLER_PROPERTY, errp);
+}
+
+void qbus_set_hotplug_handler(BusState *bus, DeviceState *handler, Error **errp)
+{
+    qbus_set_hotplug_handler_internal(bus, OBJECT(handler), errp);
+}
+
+void qbus_set_bus_hotplug_handler(BusState *bus, Error **errp)
+{
+    qbus_set_hotplug_handler_internal(bus, OBJECT(bus), errp);
+}
+
+int qbus_walk_children(BusState *bus,
+                       qdev_walkerfn *pre_devfn, qbus_walkerfn *pre_busfn,
+                       qdev_walkerfn *post_devfn, qbus_walkerfn *post_busfn,
+                       void *opaque)
+{
+    BusChild *kid;
+    int err;
+
+    if (pre_busfn) {
+        err = pre_busfn(bus, opaque);
+        if (err) {
+            return err;
+        }
+    }
+
+    QTAILQ_FOREACH(kid, &bus->children, sibling) {
+        err = qdev_walk_children(kid->child,
+                                 pre_devfn, pre_busfn,
+                                 post_devfn, post_busfn, opaque);
+        if (err < 0) {
+            return err;
+        }
+    }
+
+    if (post_busfn) {
+        err = post_busfn(bus, opaque);
+        if (err) {
+            return err;
+        }
+    }
+
+    return 0;
+}
+
+static void qbus_realize(BusState *bus, DeviceState *parent, const char *name)
+{
+    const char *typename = object_get_typename(OBJECT(bus));
+    BusClass *bc;
+    char *buf;
+    int i, len, bus_id;
+
+    bus->parent = parent;
+
+    if (name) {
+        bus->name = g_strdup(name);
+    } else if (bus->parent && bus->parent->id) {
+        /* parent device has id -> use it plus parent-bus-id for bus name */
+        bus_id = bus->parent->num_child_bus;
+
+        len = strlen(bus->parent->id) + 16;
+        buf = g_malloc(len);
+        snprintf(buf, len, "%s.%d", bus->parent->id, bus_id);
+        bus->name = buf;
+    } else {
+        /* no id -> use lowercase bus type plus global bus-id for bus name */
+        bc = BUS_GET_CLASS(bus);
+        bus_id = bc->automatic_ids++;
+
+        len = strlen(typename) + 16;
+        buf = g_malloc(len);
+        len = snprintf(buf, len, "%s.%d", typename, bus_id);
+        for (i = 0; i < len; i++) {
+            buf[i] = qemu_tolower(buf[i]);
+        }
+        bus->name = buf;
+    }
+
+    if (bus->parent) {
+        QLIST_INSERT_HEAD(&bus->parent->child_bus, bus, sibling);
+        bus->parent->num_child_bus++;
+        object_property_add_child(OBJECT(bus->parent), bus->name, OBJECT(bus), NULL);
+        object_unref(OBJECT(bus));
+    } else if (bus != sysbus_get_default()) {
+        /* TODO: once all bus devices are qdevified,
+           only reset handler for main_system_bus should be registered here. */
+        qemu_register_reset(qbus_reset_all_fn, bus);
+    }
+}
+
+static void bus_unparent(Object *obj)
+{
+    BusState *bus = BUS(obj);
+    BusChild *kid;
+
+    while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) {
+        DeviceState *dev = kid->child;
+        object_unparent(OBJECT(dev));
+    }
+    if (bus->parent) {
+        QLIST_REMOVE(bus, sibling);
+        bus->parent->num_child_bus--;
+        bus->parent = NULL;
+    } else {
+        assert(bus != sysbus_get_default()); /* main_system_bus is never freed */
+        qemu_unregister_reset(qbus_reset_all_fn, bus);
+    }
+}
+
+void qbus_create_inplace(void *bus, size_t size, const char *typename,
+                         DeviceState *parent, const char *name)
+{
+    object_initialize(bus, size, typename);
+    qbus_realize(bus, parent, name);
+}
+
+BusState *qbus_create(const char *typename, DeviceState *parent, const char *name)
+{
+    BusState *bus;
+
+    bus = BUS(object_new(typename));
+    qbus_realize(bus, parent, name);
+
+    return bus;
+}
+
+static bool bus_get_realized(Object *obj, Error **errp)
+{
+    BusState *bus = BUS(obj);
+
+    return bus->realized;
+}
+
+static void bus_set_realized(Object *obj, bool value, Error **errp)
+{
+    BusState *bus = BUS(obj);
+    BusClass *bc = BUS_GET_CLASS(bus);
+    BusChild *kid;
+    Error *local_err = NULL;
+
+    if (value && !bus->realized) {
+        if (bc->realize) {
+            bc->realize(bus, &local_err);
+        }
+
+        /* TODO: recursive realization */
+    } else if (!value && bus->realized) {
+        QTAILQ_FOREACH(kid, &bus->children, sibling) {
+            DeviceState *dev = kid->child;
+            object_property_set_bool(OBJECT(dev), false, "realized",
+                                     &local_err);
+            if (local_err != NULL) {
+                break;
+            }
+        }
+        if (bc->unrealize && local_err == NULL) {
+            bc->unrealize(bus, &local_err);
+        }
+    }
+
+    if (local_err != NULL) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    bus->realized = value;
+}
+
+static void qbus_initfn(Object *obj)
+{
+    BusState *bus = BUS(obj);
+
+    QTAILQ_INIT(&bus->children);
+    object_property_add_link(obj, QDEV_HOTPLUG_HANDLER_PROPERTY,
+                             TYPE_HOTPLUG_HANDLER,
+                             (Object **)&bus->hotplug_handler,
+                             object_property_allow_set_link,
+                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
+                             NULL);
+    object_property_add_bool(obj, "realized",
+                             bus_get_realized, bus_set_realized, NULL);
+}
+
+static char *default_bus_get_fw_dev_path(DeviceState *dev)
+{
+    return g_strdup(object_get_typename(OBJECT(dev)));
+}
+
+static void bus_class_init(ObjectClass *class, void *data)
+{
+    BusClass *bc = BUS_CLASS(class);
+
+    class->unparent = bus_unparent;
+    bc->get_fw_dev_path = default_bus_get_fw_dev_path;
+}
+
+static void qbus_finalize(Object *obj)
+{
+    BusState *bus = BUS(obj);
+
+    g_free((char *)bus->name);
+}
+
+static const TypeInfo bus_info = {
+    .name = TYPE_BUS,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(BusState),
+    .abstract = true,
+    .class_size = sizeof(BusClass),
+    .instance_init = qbus_initfn,
+    .instance_finalize = qbus_finalize,
+    .class_init = bus_class_init,
+};
+
+static void bus_register_types(void)
+{
+    type_register_static(&bus_info);
+}
+
+type_init(bus_register_types)
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -109,24 +109,6 @@ void qdev_set_parent_bus(DeviceState *dev, BusState *bus)
    bus_add_child(bus, dev);
 }

-static void qbus_set_hotplug_handler_internal(BusState *bus, Object *handler,
-                                              Error **errp)
-{
-
-    object_property_set_link(OBJECT(bus), OBJECT(handler),
-                             QDEV_HOTPLUG_HANDLER_PROPERTY, errp);
-}
-
-void qbus_set_hotplug_handler(BusState *bus, DeviceState *handler, Error **errp)
-{
-    qbus_set_hotplug_handler_internal(bus, OBJECT(handler), errp);
-}
-
-void qbus_set_bus_hotplug_handler(BusState *bus, Error **errp)
-{
-    qbus_set_hotplug_handler_internal(bus, OBJECT(bus), errp);
-}
-
 /* Create a new device.  This only initializes the device state
   structure and allows properties to be set.  The device still needs
   to be realized.  See qdev-core.h.  */
@@ -595,40 +577,6 @@ BusState *qdev_get_child_bus(DeviceState *dev, const char *name)
    return NULL;
 }

-int qbus_walk_children(BusState *bus,
-                       qdev_walkerfn *pre_devfn, qbus_walkerfn *pre_busfn,
-                       qdev_walkerfn *post_devfn, qbus_walkerfn *post_busfn,
-                       void *opaque)
-{
-    BusChild *kid;
-    int err;
-
-    if (pre_busfn) {
-        err = pre_busfn(bus, opaque);
-        if (err) {
-            return err;
-        }
-    }
-
-    QTAILQ_FOREACH(kid, &bus->children, sibling) {
-        err = qdev_walk_children(kid->child,
-                                 pre_devfn, pre_busfn,
-                                 post_devfn, post_busfn, opaque);
-        if (err < 0) {
-            return err;
-        }
-    }
-
-    if (post_busfn) {
-        err = post_busfn(bus, opaque);
-        if (err) {
-            return err;
-        }
-    }
-
-    return 0;
-}
-
 int qdev_walk_children(DeviceState *dev,
                       qdev_walkerfn *pre_devfn, qbus_walkerfn *pre_busfn,
                       qdev_walkerfn *post_devfn, qbus_walkerfn *post_busfn,
@@ -685,129 +633,6 @@ DeviceState *qdev_find_recursive(BusState *bus, const char *id)
    return NULL;
 }

-static void qbus_realize(BusState *bus, DeviceState *parent, const char *name)
-{
-    const char *typename = object_get_typename(OBJECT(bus));
-    BusClass *bc;
-    char *buf;
-    int i, len, bus_id;
-
-    bus->parent = parent;
-
-    if (name) {
-        bus->name = g_strdup(name);
-    } else if (bus->parent && bus->parent->id) {
-        /* parent device has id -> use it plus parent-bus-id for bus name */
-        bus_id = bus->parent->num_child_bus;
-
-        len = strlen(bus->parent->id) + 16;
-        buf = g_malloc(len);
-        snprintf(buf, len, "%s.%d", bus->parent->id, bus_id);
-        bus->name = buf;
-    } else {
-        /* no id -> use lowercase bus type plus global bus-id for bus name */
-        bc = BUS_GET_CLASS(bus);
-        bus_id = bc->automatic_ids++;
-
-        len = strlen(typename) + 16;
-        buf = g_malloc(len);
-        len = snprintf(buf, len, "%s.%d", typename, bus_id);
-        for (i = 0; i < len; i++) {
-            buf[i] = qemu_tolower(buf[i]);
-        }
-        bus->name = buf;
-    }
-
-    if (bus->parent) {
-        QLIST_INSERT_HEAD(&bus->parent->child_bus, bus, sibling);
-        bus->parent->num_child_bus++;
-        object_property_add_child(OBJECT(bus->parent), bus->name, OBJECT(bus), NULL);
-        object_unref(OBJECT(bus));
-    } else if (bus != sysbus_get_default()) {
-        /* TODO: once all bus devices are qdevified,
-           only reset handler for main_system_bus should be registered here. */
-        qemu_register_reset(qbus_reset_all_fn, bus);
-    }
-}
-
-static void bus_unparent(Object *obj)
-{
-    BusState *bus = BUS(obj);
-    BusChild *kid;
-
-    while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) {
-        DeviceState *dev = kid->child;
-        object_unparent(OBJECT(dev));
-    }
-    if (bus->parent) {
-        QLIST_REMOVE(bus, sibling);
-        bus->parent->num_child_bus--;
-        bus->parent = NULL;
-    } else {
-        assert(bus != sysbus_get_default()); /* main_system_bus is never freed */
-        qemu_unregister_reset(qbus_reset_all_fn, bus);
-    }
-}
-
-static bool bus_get_realized(Object *obj, Error **errp)
-{
-    BusState *bus = BUS(obj);
-
-    return bus->realized;
-}
-
-static void bus_set_realized(Object *obj, bool value, Error **errp)
-{
-    BusState *bus = BUS(obj);
-    BusClass *bc = BUS_GET_CLASS(bus);
-    BusChild *kid;
-    Error *local_err = NULL;
-
-    if (value && !bus->realized) {
-        if (bc->realize) {
-            bc->realize(bus, &local_err);
-        }
-
-        /* TODO: recursive realization */
-    } else if (!value && bus->realized) {
-        QTAILQ_FOREACH(kid, &bus->children, sibling) {
-            DeviceState *dev = kid->child;
-            object_property_set_bool(OBJECT(dev), false, "realized",
-                                     &local_err);
-            if (local_err != NULL) {
-                break;
-            }
-        }
-        if (bc->unrealize && local_err == NULL) {
-            bc->unrealize(bus, &local_err);
-        }
-    }
-
-    if (local_err != NULL) {
-        error_propagate(errp, local_err);
-        return;
-    }
-
-    bus->realized = value;
-}
-
-void qbus_create_inplace(void *bus, size_t size, const char *typename,
-                         DeviceState *parent, const char *name)
-{
-    object_initialize(bus, size, typename);
-    qbus_realize(bus, parent, name);
-}
-
-BusState *qbus_create(const char *typename, DeviceState *parent, const char *name)
-{
-    BusState *bus;
-
-    bus = BUS(object_new(typename));
-    qbus_realize(bus, parent, name);
-
-    return bus;
-}
-
 static char *bus_get_fw_dev_path(BusState *bus, DeviceState *dev)
 {
    BusClass *bc = BUS_GET_CLASS(bus);
@@ -1315,55 +1140,8 @@ static const TypeInfo device_type_info = {
    .class_size = sizeof(DeviceClass),
 };

-static void qbus_initfn(Object *obj)
-{
-    BusState *bus = BUS(obj);
-
-    QTAILQ_INIT(&bus->children);
-    object_property_add_link(obj, QDEV_HOTPLUG_HANDLER_PROPERTY,
-                             TYPE_HOTPLUG_HANDLER,
-                             (Object **)&bus->hotplug_handler,
-                             object_property_allow_set_link,
-                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
-                             NULL);
-    object_property_add_bool(obj, "realized",
-                             bus_get_realized, bus_set_realized, NULL);
-}
-
-static char *default_bus_get_fw_dev_path(DeviceState *dev)
-{
-    return g_strdup(object_get_typename(OBJECT(dev)));
-}
-
-static void bus_class_init(ObjectClass *class, void *data)
-{
-    BusClass *bc = BUS_CLASS(class);
-
-    class->unparent = bus_unparent;
-    bc->get_fw_dev_path = default_bus_get_fw_dev_path;
-}
-
-static void qbus_finalize(Object *obj)
-{
-    BusState *bus = BUS(obj);
-
-    g_free((char *)bus->name);
-}
-
-static const TypeInfo bus_info = {
-    .name = TYPE_BUS,
-    .parent = TYPE_OBJECT,
-    .instance_size = sizeof(BusState),
-    .abstract = true,
-    .class_size = sizeof(BusClass),
-    .instance_init = qbus_initfn,
-    .instance_finalize = qbus_finalize,
-    .class_init = bus_class_init,
-};
-
 static void qdev_register_types(void)
 {
-    type_register_static(&bus_info);
    type_register_static(&device_type_info);
 }

--- a/hw/cris/axis_dev88.c
+++ b/hw/cris/axis_dev88.c
@@ -37,6 +37,7 @@
 #include "sysemu/block-backend.h"
 #include "exec/address-spaces.h"
 #include "sysemu/qtest.h"
+#include "sysemu/sysemu.h"

 #define D(x)
 #define DNAND(x)
@@ -341,8 +342,7 @@ void axisdev88_init(MachineState *machine)
    sysbus_create_varargs("etraxfs,timer", 0x3005e000, irq[0x1b], nmi[1], NULL);

    for (i = 0; i < 4; i++) {
-        sysbus_create_simple("etraxfs,serial", 0x30026000 + i * 0x2000,
-                             irq[0x14 + i]);
+        etraxfs_ser_create(0x30026000 + i * 0x2000, irq[0x14 + i], serial_hds[i]);
    }

    if (kernel_filename) {
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1006,7 +1006,8 @@ static void execute_ncq_command(NCQTransferState *ncq_tfs)
        dma_acct_start(ide_state->blk, &ncq_tfs->acct,
                       &ncq_tfs->sglist, BLOCK_ACCT_READ);
        ncq_tfs->aiocb = dma_blk_read(ide_state->blk, &ncq_tfs->sglist,
-                                      ncq_tfs->lba, ncq_cb, ncq_tfs);
+                                      ncq_tfs->lba << BDRV_SECTOR_BITS,
+                                      ncq_cb, ncq_tfs);
        break;
    case WRITE_FPDMA_QUEUED:
        DPRINTF(port, "NCQ writing %d sectors to LBA %"PRId64", tag %d\n",
@@ -1018,7 +1019,8 @@ static void execute_ncq_command(NCQTransferState *ncq_tfs)
        dma_acct_start(ide_state->blk, &ncq_tfs->acct,
                       &ncq_tfs->sglist, BLOCK_ACCT_WRITE);
        ncq_tfs->aiocb = dma_blk_write(ide_state->blk, &ncq_tfs->sglist,
-                                       ncq_tfs->lba, ncq_cb, ncq_tfs);
+                                       ncq_tfs->lba << BDRV_SECTOR_BITS,
+                                       ncq_cb, ncq_tfs);
        break;
    default:
        DPRINTF(port, "error: unsupported NCQ command (0x%02x) received\n",
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -441,13 +441,14 @@ static void ide_issue_trim_cb(void *opaque, int ret)
    }
 }

-BlockAIOCB *ide_issue_trim(BlockBackend *blk,
-        int64_t offset, QEMUIOVector *qiov, BdrvRequestFlags flags,
-        BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *ide_issue_trim(
+        int64_t offset, QEMUIOVector *qiov,
+        BlockCompletionFunc *cb, void *cb_opaque, void *opaque)
 {
+    BlockBackend *blk = opaque;
    TrimAIOCB *iocb;

-    iocb = blk_aio_get(&trim_aiocb_info, blk, cb, opaque);
+    iocb = blk_aio_get(&trim_aiocb_info, blk, cb, cb_opaque);
    iocb->blk = blk;
    iocb->bh = qemu_bh_new(ide_trim_bh_cb, iocb);
    iocb->ret = 0;
@@ -799,6 +800,7 @@ static void ide_dma_cb(void *opaque, int ret)
    IDEState *s = opaque;
    int n;
    int64_t sector_num;
+    uint64_t offset;
    bool stay_active = false;

    if (ret == -ECANCELED) {
@@ -859,18 +861,20 @@ static void ide_dma_cb(void *opaque, int ret)
        return;
    }

+    offset = sector_num << BDRV_SECTOR_BITS;
    switch (s->dma_cmd) {
    case IDE_DMA_READ:
-        s->bus->dma->aiocb = dma_blk_read(s->blk, &s->sg, sector_num,
+        s->bus->dma->aiocb = dma_blk_read(s->blk, &s->sg, offset,
                                          ide_dma_cb, s);
        break;
    case IDE_DMA_WRITE:
-        s->bus->dma->aiocb = dma_blk_write(s->blk, &s->sg, sector_num,
+        s->bus->dma->aiocb = dma_blk_write(s->blk, &s->sg, offset,
                                           ide_dma_cb, s);
        break;
    case IDE_DMA_TRIM:
-        s->bus->dma->aiocb = dma_blk_io(s->blk, &s->sg, sector_num,
-                                        ide_issue_trim, ide_dma_cb, s,
+        s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk),
+                                        &s->sg, offset,
+                                        ide_issue_trim, s->blk, ide_dma_cb, s,
                                        DMA_DIRECTION_TO_DEVICE);
        break;
    default:
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -613,9 +613,9 @@ void ide_transfer_start(IDEState *s, uint8_t *buf, int size,
                        EndTransferFunc *end_transfer_func);
 void ide_transfer_stop(IDEState *s);
 void ide_set_inactive(IDEState *s, bool more);
-BlockAIOCB *ide_issue_trim(BlockBackend *blk,
-        int64_t offset, QEMUIOVector *qiov, BdrvRequestFlags flags,
-        BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *ide_issue_trim(
+        int64_t offset, QEMUIOVector *qiov,
+        BlockCompletionFunc *cb, void *cb_opaque, void *opaque);
 BlockAIOCB *ide_buffered_readv(IDEState *s, int64_t sector_num,
                               QEMUIOVector *iov, int nb_sectors,
                               BlockCompletionFunc *cb, void *opaque);
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -230,7 +230,7 @@ static void pmac_dma_trim(BlockBackend *blk,
    s->io_buffer_index += io->len;
    io->len = 0;

-    s->bus->dma->aiocb = ide_issue_trim(blk, offset, &io->iov, 0, cb, io);
+    s->bus->dma->aiocb = ide_issue_trim(offset, &io->iov, cb, io, blk);
 }

 static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
--- a/hw/lm32/lm32.h
+++ b/hw/lm32/lm32.h
@@ -16,14 +16,31 @@ static inline DeviceState *lm32_pic_init(qemu_irq cpu_irq)
    return dev;
 }

-static inline DeviceState *lm32_juart_init(void)
+static inline DeviceState *lm32_juart_init(CharDriverState *chr)
 {
    DeviceState *dev;

    dev = qdev_create(NULL, TYPE_LM32_JUART);
+    qdev_prop_set_chr(dev, "chardev", chr);
    qdev_init_nofail(dev);

    return dev;
 }

+static inline DeviceState *lm32_uart_create(hwaddr addr,
+                                            qemu_irq irq,
+                                            CharDriverState *chr)
+{
+    DeviceState *dev;
+    SysBusDevice *s;
+
+    dev = qdev_create(NULL, "lm32-uart");
+    s = SYS_BUS_DEVICE(dev);
+    qdev_prop_set_chr(dev, "chardev", chr);
+    qdev_init_nofail(dev);
+    sysbus_mmio_map(s, 0, addr);
+    sysbus_connect_irq(s, 0, irq);
+    return dev;
+}
+
 #endif
--- a/hw/lm32/lm32_boards.c
+++ b/hw/lm32/lm32_boards.c
@@ -31,6 +31,7 @@
 #include "lm32_hwsetup.h"
 #include "lm32.h"
 #include "exec/address-spaces.h"
+#include "sysemu/sysemu.h"

 typedef struct {
    LM32CPU *cpu;
@@ -131,12 +132,12 @@ static void lm32_evr_init(MachineState *machine)
        irq[i] = qdev_get_gpio_in(env->pic_state, i);
    }

-    sysbus_create_simple("lm32-uart", uart0_base, irq[uart0_irq]);
+    lm32_uart_create(uart0_base, irq[uart0_irq], serial_hds[0]);
    sysbus_create_simple("lm32-timer", timer0_base, irq[timer0_irq]);
    sysbus_create_simple("lm32-timer", timer1_base, irq[timer1_irq]);

    /* make sure juart isn't the first chardev */
-    env->juart_state = lm32_juart_init();
+    env->juart_state = lm32_juart_init(serial_hds[1]);

    reset_info->bootstrap_pc = flash_base;

@@ -232,13 +233,13 @@ static void lm32_uclinux_init(MachineState *machine)
        irq[i] = qdev_get_gpio_in(env->pic_state, i);
    }

-    sysbus_create_simple("lm32-uart", uart0_base, irq[uart0_irq]);
+    lm32_uart_create(uart0_base, irq[uart0_irq], serial_hds[0]);
    sysbus_create_simple("lm32-timer", timer0_base, irq[timer0_irq]);
    sysbus_create_simple("lm32-timer", timer1_base, irq[timer1_irq]);
    sysbus_create_simple("lm32-timer", timer2_base, irq[timer2_irq]);

    /* make sure juart isn't the first chardev */
-    env->juart_state = lm32_juart_init();
+    env->juart_state = lm32_juart_init(serial_hds[1]);

    reset_info->bootstrap_pc = flash_base;

--- a/hw/lm32/milkymist-hw.h
+++ b/hw/lm32/milkymist-hw.h
@@ -5,11 +5,13 @@
 #include "net/net.h"

 static inline DeviceState *milkymist_uart_create(hwaddr base,
-        qemu_irq irq)
+                                                 qemu_irq irq,
+                                                 CharDriverState *chr)
 {
    DeviceState *dev;

    dev = qdev_create(NULL, "milkymist-uart");
+    qdev_prop_set_chr(dev, "chardev", chr);
    qdev_init_nofail(dev);
    sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base);
    sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, irq);
--- a/hw/lm32/milkymist.c
+++ b/hw/lm32/milkymist.c
@@ -159,7 +159,7 @@ milkymist_init(MachineState *machine)
    }
    g_free(bios_filename);

-    milkymist_uart_create(0x60000000, irq[0]);
+    milkymist_uart_create(0x60000000, irq[0], serial_hds[0]);
    milkymist_sysctl_create(0x60001000, irq[1], irq[2], irq[3],
            80000000, 0x10014d31, 0x0000041f, 0x00000001);
    milkymist_hpdmc_create(0x60002000);
@@ -175,7 +175,7 @@ milkymist_init(MachineState *machine)
            0x20000000, 0x1000, 0x20020000, 0x2000);

    /* make sure juart isn't the first chardev */
-    env->juart_state = lm32_juart_init();
+    env->juart_state = lm32_juart_init(serial_hds[1]);

    if (kernel_filename) {
        uint64_t entry;
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -33,7 +33,6 @@
 #include "sysemu/hostmem.h"
 #include "sysemu/qtest.h"
 #include "qapi/visitor.h"
-#include "exec/ram_addr.h"

 #include "hw/misc/ivshmem.h"

@@ -533,7 +532,7 @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
    }
    memory_region_init_ram_ptr(&s->server_bar2, OBJECT(s),
                               "ivshmem.bar2", size, ptr);
-    qemu_set_ram_fd(memory_region_get_ram_addr(&s->server_bar2), fd);
+    memory_region_set_fd(&s->server_bar2, fd);
    s->ivshmem_bar2 = &s->server_bar2;
 }

@@ -940,7 +939,7 @@ static void ivshmem_exit(PCIDevice *dev)
                             strerror(errno));
            }

-            fd = qemu_get_ram_fd(memory_region_get_ram_addr(s->ivshmem_bar2));
+            fd = memory_region_get_fd(s->ivshmem_bar2);
            close(fd);
        }

--- a/hw/net/Makefile.objs
+++ b/hw/net/Makefile.objs
@@ -6,9 +6,10 @@ common-obj-$(CONFIG_NE2000_PCI) += ne2000.o
 common-obj-$(CONFIG_EEPRO100_PCI) += eepro100.o
 common-obj-$(CONFIG_PCNET_PCI) += pcnet-pci.o
 common-obj-$(CONFIG_PCNET_COMMON) += pcnet.o
-common-obj-$(CONFIG_E1000_PCI) += e1000.o
+common-obj-$(CONFIG_E1000_PCI) += e1000.o e1000x_common.o
+common-obj-$(CONFIG_E1000E_PCI) += e1000e.o e1000e_core.o e1000x_common.o
 common-obj-$(CONFIG_RTL8139_PCI) += rtl8139.o
-common-obj-$(CONFIG_VMXNET3_PCI) += vmxnet_tx_pkt.o vmxnet_rx_pkt.o
+common-obj-$(CONFIG_VMXNET3_PCI) += net_tx_pkt.o net_rx_pkt.o
 common-obj-$(CONFIG_VMXNET3_PCI) += vmxnet3.o

 common-obj-$(CONFIG_SMC91C111) += smc91c111.o
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -36,7 +36,7 @@
 #include "qemu/iov.h"
 #include "qemu/range.h"

-#include "e1000_regs.h"
+#include "e1000x_common.h"

 static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};

@@ -64,11 +64,6 @@ static int debugflags = DBGBIT(TXERR) | DBGBIT(GENERAL);
 #define PNPMMIO_SIZE      0x20000
 #define MIN_BUF_SIZE      60 /* Min. octets in an ethernet frame sans FCS */

-/* this is the size past which hardware will drop packets when setting LPE=0 */
-#define MAXIMUM_ETHERNET_VLAN_SIZE 1522
-/* this is the size past which hardware will drop packets when setting LPE=1 */
-#define MAXIMUM_ETHERNET_LPE_SIZE 16384
-
 #define MAXIMUM_ETHERNET_HDR_LEN (14+4)

 /*
@@ -102,22 +97,9 @@ typedef struct E1000State_st {
        unsigned char vlan[4];
        unsigned char data[0x10000];
        uint16_t size;
-        unsigned char sum_needed;
        unsigned char vlan_needed;
-        uint8_t ipcss;
-        uint8_t ipcso;
-        uint16_t ipcse;
-        uint8_t tucss;
-        uint8_t tucso;
-        uint16_t tucse;
-        uint8_t hdr_len;
-        uint16_t mss;
-        uint32_t paylen;
+        e1000x_txd_props props;
        uint16_t tso_frames;
-        char tse;
-        int8_t ip;
-        int8_t tcp;
-        char cptse;     // current packet tse bit
    } tx;

    struct {
@@ -162,52 +144,19 @@ typedef struct E1000BaseClass {
 #define E1000_DEVICE_GET_CLASS(obj) \
    OBJECT_GET_CLASS(E1000BaseClass, (obj), TYPE_E1000_BASE)

-#define defreg(x)    x = (E1000_##x>>2)
-enum {
-    defreg(CTRL),    defreg(EECD),    defreg(EERD),    defreg(GPRC),
-    defreg(GPTC),    defreg(ICR),     defreg(ICS),     defreg(IMC),
-    defreg(IMS),     defreg(LEDCTL),  defreg(MANC),    defreg(MDIC),
-    defreg(MPC),     defreg(PBA),     defreg(RCTL),    defreg(RDBAH),
-    defreg(RDBAL),   defreg(RDH),     defreg(RDLEN),   defreg(RDT),
-    defreg(STATUS),  defreg(SWSM),    defreg(TCTL),    defreg(TDBAH),
-    defreg(TDBAL),   defreg(TDH),     defreg(TDLEN),   defreg(TDT),
-    defreg(TORH),    defreg(TORL),    defreg(TOTH),    defreg(TOTL),
-    defreg(TPR),     defreg(TPT),     defreg(TXDCTL),  defreg(WUFC),
-    defreg(RA),      defreg(MTA),     defreg(CRCERRS), defreg(VFTA),
-    defreg(VET),     defreg(RDTR),    defreg(RADV),    defreg(TADV),
-    defreg(ITR),     defreg(FCRUC),   defreg(TDFH),    defreg(TDFT),
-    defreg(TDFHS),   defreg(TDFTS),   defreg(TDFPC),   defreg(RDFH),
-    defreg(RDFT),    defreg(RDFHS),   defreg(RDFTS),   defreg(RDFPC),
-    defreg(IPAV),    defreg(WUC),     defreg(WUS),     defreg(AIT),
-    defreg(IP6AT),   defreg(IP4AT),   defreg(FFLT),    defreg(FFMT),
-    defreg(FFVT),    defreg(WUPM),    defreg(PBM),     defreg(SCC),
-    defreg(ECOL),    defreg(MCC),     defreg(LATECOL), defreg(COLC),
-    defreg(DC),      defreg(TNCRS),   defreg(SEC),     defreg(CEXTERR),
-    defreg(RLEC),    defreg(XONRXC),  defreg(XONTXC),  defreg(XOFFRXC),
-    defreg(XOFFTXC), defreg(RFC),     defreg(RJC),     defreg(RNBC),
-    defreg(TSCTFC),  defreg(MGTPRC),  defreg(MGTPDC),  defreg(MGTPTC),
-    defreg(RUC),     defreg(ROC),     defreg(GORCL),   defreg(GORCH),
-    defreg(GOTCL),   defreg(GOTCH),   defreg(BPRC),    defreg(MPRC),
-    defreg(TSCTC),   defreg(PRC64),   defreg(PRC127),  defreg(PRC255),
-    defreg(PRC511),  defreg(PRC1023), defreg(PRC1522), defreg(PTC64),
-    defreg(PTC127),  defreg(PTC255),  defreg(PTC511),  defreg(PTC1023),
-    defreg(PTC1522), defreg(MPTC),    defreg(BPTC)
-};
-
-static void
-e1000_link_down(E1000State *s)
-{
-    s->mac_reg[STATUS] &= ~E1000_STATUS_LU;
-    s->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS;
-    s->phy_reg[PHY_STATUS] &= ~MII_SR_AUTONEG_COMPLETE;
-    s->phy_reg[PHY_LP_ABILITY] &= ~MII_LPAR_LPACK;
-}
-
 static void
 e1000_link_up(E1000State *s)
 {
-    s->mac_reg[STATUS] |= E1000_STATUS_LU;
-    s->phy_reg[PHY_STATUS] |= MII_SR_LINK_STATUS;
+    e1000x_update_regs_on_link_up(s->mac_reg, s->phy_reg);
+
+    /* E1000_STATUS_LU is tested by e1000_can_receive() */
+    qemu_flush_queued_packets(qemu_get_queue(s->nic));
+}
+
+static void
+e1000_autoneg_done(E1000State *s)
+{
+    e1000x_update_regs_on_autoneg_done(s->mac_reg, s->phy_reg);

    /* E1000_STATUS_LU is tested by e1000_can_receive() */
    qemu_flush_queued_packets(qemu_get_queue(s->nic));
@@ -233,10 +182,7 @@ set_phy_ctrl(E1000State *s, int index, uint16_t val)
     * down.
     */
    if (have_autoneg(s) && (val & MII_CR_RESTART_AUTO_NEG)) {
-        e1000_link_down(s);
-        DBGOUT(PHY, "Start link auto negotiation\n");
-        timer_mod(s->autoneg_timer,
-                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500);
+        e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer);
    }
 }

@@ -401,43 +347,16 @@ e1000_autoneg_timer(void *opaque)
 {
    E1000State *s = opaque;
    if (!qemu_get_queue(s->nic)->link_down) {
-        e1000_link_up(s);
-        s->phy_reg[PHY_LP_ABILITY] |= MII_LPAR_LPACK;
-        s->phy_reg[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE;
-        DBGOUT(PHY, "Auto negotiation is completed\n");
+        e1000_autoneg_done(s);
        set_ics(s, 0, E1000_ICS_LSC); /* signal link status change to guest */
    }
 }

-static int
-rxbufsize(uint32_t v)
-{
-    v &= E1000_RCTL_BSEX | E1000_RCTL_SZ_16384 | E1000_RCTL_SZ_8192 |
-         E1000_RCTL_SZ_4096 | E1000_RCTL_SZ_2048 | E1000_RCTL_SZ_1024 |
-         E1000_RCTL_SZ_512 | E1000_RCTL_SZ_256;
-    switch (v) {
-    case E1000_RCTL_BSEX | E1000_RCTL_SZ_16384:
-        return 16384;
-    case E1000_RCTL_BSEX | E1000_RCTL_SZ_8192:
-        return 8192;
-    case E1000_RCTL_BSEX | E1000_RCTL_SZ_4096:
-        return 4096;
-    case E1000_RCTL_SZ_1024:
-        return 1024;
-    case E1000_RCTL_SZ_512:
-        return 512;
-    case E1000_RCTL_SZ_256:
-        return 256;
-    }
-    return 2048;
-}
-
 static void e1000_reset(void *opaque)
 {
    E1000State *d = opaque;
    E1000BaseClass *edc = E1000_DEVICE_GET_CLASS(d);
    uint8_t *macaddr = d->conf.macaddr.a;
-    int i;

    timer_del(d->autoneg_timer);
    timer_del(d->mit_timer);
@@ -453,17 +372,10 @@ static void e1000_reset(void *opaque)
    memset(&d->tx, 0, sizeof d->tx);

    if (qemu_get_queue(d->nic)->link_down) {
-        e1000_link_down(d);
+        e1000x_update_regs_on_link_down(d->mac_reg, d->phy_reg);
    }

-    /* Some guests expect pre-initialized RAH/RAL (AddrValid flag + MACaddr) */
-    d->mac_reg[RA] = 0;
-    d->mac_reg[RA + 1] = E1000_RAH_AV;
-    for (i = 0; i < 4; i++) {
-        d->mac_reg[RA] |= macaddr[i] << (8 * i);
-        d->mac_reg[RA + 1] |= (i < 2) ? macaddr[i + 4] << (8 * i) : 0;
-    }
-    qemu_format_nic_info_str(qemu_get_queue(d->nic), macaddr);
+    e1000x_reset_mac_addr(d->nic, d->mac_reg, macaddr);
 }

 static void
@@ -477,7 +389,7 @@ static void
 set_rx_control(E1000State *s, int index, uint32_t val)
 {
    s->mac_reg[RCTL] = val;
-    s->rxbuf_size = rxbufsize(val);
+    s->rxbuf_size = e1000x_rxbufsize(val);
    s->rxbuf_min_shift = ((val / E1000_RCTL_RDMTS_QUAT) & 3) + 1;
    DBGOUT(RX, "RCTL: %d, mac_reg[RCTL] = 0x%x\n", s->mac_reg[RDT],
           s->mac_reg[RCTL]);
@@ -597,90 +509,16 @@ putsum(uint8_t *data, uint32_t n, uint32_t sloc, uint32_t css, uint32_t cse)
    }
 }

-static inline void
-inc_reg_if_not_full(E1000State *s, int index)
-{
-    if (s->mac_reg[index] != 0xffffffff) {
-        s->mac_reg[index]++;
-    }
-}
-
 static inline void
 inc_tx_bcast_or_mcast_count(E1000State *s, const unsigned char *arr)
 {
    if (!memcmp(arr, bcast, sizeof bcast)) {
-        inc_reg_if_not_full(s, BPTC);
+        e1000x_inc_reg_if_not_full(s->mac_reg, BPTC);
    } else if (arr[0] & 1) {
-        inc_reg_if_not_full(s, MPTC);
+        e1000x_inc_reg_if_not_full(s->mac_reg, MPTC);
    }
 }

-static void
-grow_8reg_if_not_full(E1000State *s, int index, int size)
-{
-    uint64_t sum = s->mac_reg[index] | (uint64_t)s->mac_reg[index+1] << 32;
-
-    if (sum + size < sum) {
-        sum = ~0ULL;
-    } else {
-        sum += size;
-    }
-    s->mac_reg[index] = sum;
-    s->mac_reg[index+1] = sum >> 32;
-}
-
-static void
-increase_size_stats(E1000State *s, const int *size_regs, int size)
-{
-    if (size > 1023) {
-        inc_reg_if_not_full(s, size_regs[5]);
-    } else if (size > 511) {
-        inc_reg_if_not_full(s, size_regs[4]);
-    } else if (size > 255) {
-        inc_reg_if_not_full(s, size_regs[3]);
-    } else if (size > 127) {
-        inc_reg_if_not_full(s, size_regs[2]);
-    } else if (size > 64) {
-        inc_reg_if_not_full(s, size_regs[1]);
-    } else if (size == 64) {
-        inc_reg_if_not_full(s, size_regs[0]);
-    }
-}
-
-static inline int
-vlan_enabled(E1000State *s)
-{
-    return ((s->mac_reg[CTRL] & E1000_CTRL_VME) != 0);
-}
-
-static inline int
-vlan_rx_filter_enabled(E1000State *s)
-{
-    return ((s->mac_reg[RCTL] & E1000_RCTL_VFE) != 0);
-}
-
-static inline int
-is_vlan_packet(E1000State *s, const uint8_t *buf)
-{
-    return (be16_to_cpup((uint16_t *)(buf + 12)) ==
-                le16_to_cpu(s->mac_reg[VET]));
-}
-
-static inline int
-is_vlan_txd(uint32_t txd_lower)
-{
-    return ((txd_lower & E1000_TXD_CMD_VLE) != 0);
-}
-
-/* FCS aka Ethernet CRC-32. We don't get it from backends and can't
- * fill it in, just pad descriptor length by 4 bytes unless guest
- * told us to strip it off the packet. */
-static inline int
-fcs_len(E1000State *s)
-{
-    return (s->mac_reg[RCTL] & E1000_RCTL_SECRC) ? 0 : 4;
-}
-
 static void
 e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
 {
@@ -694,7 +532,7 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
        qemu_send_packet(nc, buf, size);
    }
    inc_tx_bcast_or_mcast_count(s, buf);
-    increase_size_stats(s, PTCregs, size);
+    e1000x_increase_size_stats(s->mac_reg, PTCregs, size);
 }

 static void
@@ -704,34 +542,34 @@ xmit_seg(E1000State *s)
    unsigned int frames = s->tx.tso_frames, css, sofar;
    struct e1000_tx *tp = &s->tx;

-    if (tp->tse && tp->cptse) {
-        css = tp->ipcss;
+    if (tp->props.tse && tp->props.cptse) {
+        css = tp->props.ipcss;
        DBGOUT(TXSUM, "frames %d size %d ipcss %d\n",
               frames, tp->size, css);
-        if (tp->ip) {    /* IPv4 */
+        if (tp->props.ip) {    /* IPv4 */
            stw_be_p(tp->data+css+2, tp->size - css);
            stw_be_p(tp->data+css+4,
                     be16_to_cpup((uint16_t *)(tp->data+css+4))+frames);
        } else {         /* IPv6 */
            stw_be_p(tp->data+css+4, tp->size - css);
        }
-        css = tp->tucss;
+        css = tp->props.tucss;
        len = tp->size - css;
-        DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", tp->tcp, css, len);
-        if (tp->tcp) {
-            sofar = frames * tp->mss;
+        DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", tp->props.tcp, css, len);
+        if (tp->props.tcp) {
+            sofar = frames * tp->props.mss;
            stl_be_p(tp->data+css+4, ldl_be_p(tp->data+css+4)+sofar); /* seq */
-            if (tp->paylen - sofar > tp->mss) {
+            if (tp->props.paylen - sofar > tp->props.mss) {
                tp->data[css + 13] &= ~9;    /* PSH, FIN */
            } else if (frames) {
-                inc_reg_if_not_full(s, TSCTC);
+                e1000x_inc_reg_if_not_full(s->mac_reg, TSCTC);
            }
        } else    /* UDP */
            stw_be_p(tp->data+css+4, len);
-        if (tp->sum_needed & E1000_TXD_POPTS_TXSM) {
+        if (tp->props.sum_needed & E1000_TXD_POPTS_TXSM) {
            unsigned int phsum;
            // add pseudo-header length before checksum calculation
-            sp = (uint16_t *)(tp->data + tp->tucso);
+            sp = (uint16_t *)(tp->data + tp->props.tucso);
            phsum = be16_to_cpup(sp) + len;
            phsum = (phsum >> 16) + (phsum & 0xffff);
            stw_be_p(sp, phsum);
@@ -739,10 +577,14 @@ xmit_seg(E1000State *s)
        tp->tso_frames++;
    }

-    if (tp->sum_needed & E1000_TXD_POPTS_TXSM)
-        putsum(tp->data, tp->size, tp->tucso, tp->tucss, tp->tucse);
-    if (tp->sum_needed & E1000_TXD_POPTS_IXSM)
-        putsum(tp->data, tp->size, tp->ipcso, tp->ipcss, tp->ipcse);
+    if (tp->props.sum_needed & E1000_TXD_POPTS_TXSM) {
+        putsum(tp->data, tp->size, tp->props.tucso,
+               tp->props.tucss, tp->props.tucse);
+    }
+    if (tp->props.sum_needed & E1000_TXD_POPTS_IXSM) {
+        putsum(tp->data, tp->size, tp->props.ipcso,
+               tp->props.ipcss, tp->props.ipcse);
+    }
    if (tp->vlan_needed) {
        memmove(tp->vlan, tp->data, 4);
        memmove(tp->data, tp->data + 4, 8);
@@ -752,8 +594,8 @@ xmit_seg(E1000State *s)
        e1000_send_packet(s, tp->data, tp->size);
    }

-    inc_reg_if_not_full(s, TPT);
-    grow_8reg_if_not_full(s, TOTL, s->tx.size);
+    e1000x_inc_reg_if_not_full(s->mac_reg, TPT);
+    e1000x_grow_8reg_if_not_full(s->mac_reg, TOTL, s->tx.size);
    s->mac_reg[GPTC] = s->mac_reg[TPT];
    s->mac_reg[GOTCL] = s->mac_reg[TOTL];
    s->mac_reg[GOTCH] = s->mac_reg[TOTH];
@@ -765,7 +607,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
    PCIDevice *d = PCI_DEVICE(s);
    uint32_t txd_lower = le32_to_cpu(dp->lower.data);
    uint32_t dtype = txd_lower & (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D);
-    unsigned int split_size = txd_lower & 0xffff, bytes, sz, op;
+    unsigned int split_size = txd_lower & 0xffff, bytes, sz;
    unsigned int msh = 0xfffff;
    uint64_t addr;
    struct e1000_context_desc *xp = (struct e1000_context_desc *)dp;
@@ -773,38 +615,27 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)

    s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE);
    if (dtype == E1000_TXD_CMD_DEXT) {    /* context descriptor */
-        op = le32_to_cpu(xp->cmd_and_length);
-        tp->ipcss = xp->lower_setup.ip_fields.ipcss;
-        tp->ipcso = xp->lower_setup.ip_fields.ipcso;
-        tp->ipcse = le16_to_cpu(xp->lower_setup.ip_fields.ipcse);
-        tp->tucss = xp->upper_setup.tcp_fields.tucss;
-        tp->tucso = xp->upper_setup.tcp_fields.tucso;
-        tp->tucse = le16_to_cpu(xp->upper_setup.tcp_fields.tucse);
-        tp->paylen = op & 0xfffff;
-        tp->hdr_len = xp->tcp_seg_setup.fields.hdr_len;
-        tp->mss = le16_to_cpu(xp->tcp_seg_setup.fields.mss);
-        tp->ip = (op & E1000_TXD_CMD_IP) ? 1 : 0;
-        tp->tcp = (op & E1000_TXD_CMD_TCP) ? 1 : 0;
-        tp->tse = (op & E1000_TXD_CMD_TSE) ? 1 : 0;
+        e1000x_read_tx_ctx_descr(xp, &tp->props);
        tp->tso_frames = 0;
-        if (tp->tucso == 0) {    /* this is probably wrong */
+        if (tp->props.tucso == 0) {    /* this is probably wrong */
            DBGOUT(TXSUM, "TCP/UDP: cso 0!\n");
-            tp->tucso = tp->tucss + (tp->tcp ? 16 : 6);
+            tp->props.tucso = tp->props.tucss + (tp->props.tcp ? 16 : 6);
        }
        return;
    } else if (dtype == (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)) {
        // data descriptor
        if (tp->size == 0) {
-            tp->sum_needed = le32_to_cpu(dp->upper.data) >> 8;
+            tp->props.sum_needed = le32_to_cpu(dp->upper.data) >> 8;
        }
-        tp->cptse = ( txd_lower & E1000_TXD_CMD_TSE ) ? 1 : 0;
+        tp->props.cptse = (txd_lower & E1000_TXD_CMD_TSE) ? 1 : 0;
    } else {
        // legacy descriptor
-        tp->cptse = 0;
+        tp->props.cptse = 0;
    }

-    if (vlan_enabled(s) && is_vlan_txd(txd_lower) &&
-        (tp->cptse || txd_lower & E1000_TXD_CMD_EOP)) {
+    if (e1000x_vlan_enabled(s->mac_reg) &&
+        e1000x_is_vlan_txd(txd_lower) &&
+        (tp->props.cptse || txd_lower & E1000_TXD_CMD_EOP)) {
        tp->vlan_needed = 1;
        stw_be_p(tp->vlan_header,
                      le16_to_cpu(s->mac_reg[VET]));
@@ -813,8 +644,8 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
    }

    addr = le64_to_cpu(dp->buffer_addr);
-    if (tp->tse && tp->cptse) {
-        msh = tp->hdr_len + tp->mss;
+    if (tp->props.tse && tp->props.cptse) {
+        msh = tp->props.hdr_len + tp->props.mss;
        do {
            bytes = split_size;
            if (tp->size + bytes > msh)
@@ -823,19 +654,19 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
            bytes = MIN(sizeof(tp->data) - tp->size, bytes);
            pci_dma_read(d, addr, tp->data + tp->size, bytes);
            sz = tp->size + bytes;
-            if (sz >= tp->hdr_len && tp->size < tp->hdr_len) {
-                memmove(tp->header, tp->data, tp->hdr_len);
+            if (sz >= tp->props.hdr_len && tp->size < tp->props.hdr_len) {
+                memmove(tp->header, tp->data, tp->props.hdr_len);
            }
            tp->size = sz;
            addr += bytes;
            if (sz == msh) {
                xmit_seg(s);
-                memmove(tp->data, tp->header, tp->hdr_len);
-                tp->size = tp->hdr_len;
+                memmove(tp->data, tp->header, tp->props.hdr_len);
+                tp->size = tp->props.hdr_len;
            }
            split_size -= bytes;
        } while (bytes && split_size);
-    } else if (!tp->tse && tp->cptse) {
+    } else if (!tp->props.tse && tp->props.cptse) {
        // context descriptor TSE is not set, while data descriptor TSE is set
        DBGOUT(TXERR, "TCP segmentation error\n");
    } else {
@@ -846,14 +677,14 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)

    if (!(txd_lower & E1000_TXD_CMD_EOP))
        return;
-    if (!(tp->tse && tp->cptse && tp->size < tp->hdr_len)) {
+    if (!(tp->props.tse && tp->props.cptse && tp->size < tp->props.hdr_len)) {
        xmit_seg(s);
    }
    tp->tso_frames = 0;
-    tp->sum_needed = 0;
+    tp->props.sum_needed = 0;
    tp->vlan_needed = 0;
    tp->size = 0;
-    tp->cptse = 0;
+    tp->props.cptse = 0;
 }

 static uint32_t
@@ -925,11 +756,11 @@ start_xmit(E1000State *s)
 static int
 receive_filter(E1000State *s, const uint8_t *buf, int size)
 {
-    static const int mta_shift[] = {4, 3, 2, 0};
-    uint32_t f, rctl = s->mac_reg[RCTL], ra[2], *rp;
+    uint32_t rctl = s->mac_reg[RCTL];
    int isbcast = !memcmp(buf, bcast, sizeof bcast), ismcast = (buf[0] & 1);

-    if (is_vlan_packet(s, buf) && vlan_rx_filter_enabled(s)) {
+    if (e1000x_is_vlan_packet(buf, le16_to_cpu(s->mac_reg[VET])) &&
+        e1000x_vlan_rx_filter_enabled(s->mac_reg)) {
        uint16_t vid = be16_to_cpup((uint16_t *)(buf + 14));
        uint32_t vfta = le32_to_cpup((uint32_t *)(s->mac_reg + VFTA) +
                                     ((vid >> 5) & 0x7f));
@@ -942,44 +773,16 @@ receive_filter(E1000State *s, const uint8_t *buf, int size)
    }

    if (ismcast && (rctl & E1000_RCTL_MPE)) {          /* promiscuous mcast */
-        inc_reg_if_not_full(s, MPRC);
+        e1000x_inc_reg_if_not_full(s->mac_reg, MPRC);
        return 1;
    }

    if (isbcast && (rctl & E1000_RCTL_BAM)) {          /* broadcast enabled */
-        inc_reg_if_not_full(s, BPRC);
+        e1000x_inc_reg_if_not_full(s->mac_reg, BPRC);
        return 1;
    }

-    for (rp = s->mac_reg + RA; rp < s->mac_reg + RA + 32; rp += 2) {
-        if (!(rp[1] & E1000_RAH_AV))
-            continue;
-        ra[0] = cpu_to_le32(rp[0]);
-        ra[1] = cpu_to_le32(rp[1]);
-        if (!memcmp(buf, (uint8_t *)ra, 6)) {
-            DBGOUT(RXFILTER,
-                   "unicast match[%d]: %02x:%02x:%02x:%02x:%02x:%02x\n",
-                   (int)(rp - s->mac_reg - RA)/2,
-                   buf[0], buf[1], buf[2], buf[3], buf[4], buf[5]);
-            return 1;
-        }
-    }
-    DBGOUT(RXFILTER, "unicast mismatch: %02x:%02x:%02x:%02x:%02x:%02x\n",
-           buf[0], buf[1], buf[2], buf[3], buf[4], buf[5]);
-
-    f = mta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3];
-    f = (((buf[5] << 8) | buf[4]) >> f) & 0xfff;
-    if (s->mac_reg[MTA + (f >> 5)] & (1 << (f & 0x1f))) {
-        inc_reg_if_not_full(s, MPRC);
-        return 1;
-    }
-    DBGOUT(RXFILTER,
-           "dropping, inexact filter mismatch: %02x:%02x:%02x:%02x:%02x:%02x MO %d MTA[%d] %x\n",
-           buf[0], buf[1], buf[2], buf[3], buf[4], buf[5],
-           (rctl >> E1000_RCTL_MO_SHIFT) & 3, f >> 5,
-           s->mac_reg[MTA + (f >> 5)]);
-
-    return 0;
+    return e1000x_rx_group_filter(s->mac_reg, buf);
 }

 static void
@@ -989,13 +792,11 @@ e1000_set_link_status(NetClientState *nc)
    uint32_t old_status = s->mac_reg[STATUS];

    if (nc->link_down) {
-        e1000_link_down(s);
+        e1000x_update_regs_on_link_down(s->mac_reg, s->phy_reg);
    } else {
        if (have_autoneg(s) &&
            !(s->phy_reg[PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) {
-            /* emulate auto-negotiation if supported */
-            timer_mod(s->autoneg_timer,
-                      qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500);
+            e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer);
        } else {
            e1000_link_up(s);
        }
@@ -1028,9 +829,7 @@ e1000_can_receive(NetClientState *nc)
 {
    E1000State *s = qemu_get_nic_opaque(nc);

-    return (s->mac_reg[STATUS] & E1000_STATUS_LU) &&
-        (s->mac_reg[RCTL] & E1000_RCTL_EN) &&
-        (s->parent_obj.config[PCI_COMMAND] & PCI_COMMAND_MASTER) &&
+    return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
        e1000_has_rxbufs(s, 1);
 }

@@ -1061,14 +860,8 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
    size_t desc_offset;
    size_t desc_size;
    size_t total_size;
-    static const int PRCregs[6] = { PRC64, PRC127, PRC255, PRC511,
-                                    PRC1023, PRC1522 };

-    if (!(s->mac_reg[STATUS] & E1000_STATUS_LU)) {
-        return -1;
-    }
-
-    if (!(s->mac_reg[RCTL] & E1000_RCTL_EN)) {
+    if (!e1000x_hw_rx_enabled(s->mac_reg)) {
        return -1;
    }

@@ -1076,7 +869,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
    if (size < sizeof(min_buf)) {
        iov_to_buf(iov, iovcnt, 0, min_buf, size);
        memset(&min_buf[size], 0, sizeof(min_buf) - size);
-        inc_reg_if_not_full(s, RUC);
+        e1000x_inc_reg_if_not_full(s->mac_reg, RUC);
        min_iov.iov_base = filter_buf = min_buf;
        min_iov.iov_len = size = sizeof(min_buf);
        iovcnt = 1;
@@ -1088,11 +881,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
    }

    /* Discard oversized packets if !LPE and !SBP. */
-    if ((size > MAXIMUM_ETHERNET_LPE_SIZE ||
-        (size > MAXIMUM_ETHERNET_VLAN_SIZE
-        && !(s->mac_reg[RCTL] & E1000_RCTL_LPE)))
-        && !(s->mac_reg[RCTL] & E1000_RCTL_SBP)) {
-        inc_reg_if_not_full(s, ROC);
+    if (e1000x_is_oversized(s->mac_reg, size)) {
        return size;
    }

@@ -1100,7 +889,8 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
        return size;
    }

-    if (vlan_enabled(s) && is_vlan_packet(s, filter_buf)) {
+    if (e1000x_vlan_enabled(s->mac_reg) &&
+        e1000x_is_vlan_packet(filter_buf, le16_to_cpu(s->mac_reg[VET]))) {
        vlan_special = cpu_to_le16(be16_to_cpup((uint16_t *)(filter_buf
                                                                + 14)));
        iov_ofs = 4;
@@ -1119,7 +909,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)

    rdh_start = s->mac_reg[RDH];
    desc_offset = 0;
-    total_size = size + fcs_len(s);
+    total_size = size + e1000x_fcs_len(s->mac_reg);
    if (!e1000_has_rxbufs(s, total_size)) {
            set_ics(s, 0, E1000_ICS_RXO);
            return -1;
@@ -1179,17 +969,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
        }
    } while (desc_offset < total_size);

-    increase_size_stats(s, PRCregs, total_size);
-    inc_reg_if_not_full(s, TPR);
-    s->mac_reg[GPRC] = s->mac_reg[TPR];
-    /* TOR - Total Octets Received:
-     * This register includes bytes received in a packet from the <Destination
-     * Address> field through the <CRC> field, inclusively.
-     * Always include FCS length (4) in size.
-     */
-    grow_8reg_if_not_full(s, TORL, size+4);
-    s->mac_reg[GORCL] = s->mac_reg[TORL];
-    s->mac_reg[GORCH] = s->mac_reg[TORH];
+    e1000x_update_rx_total_stats(s->mac_reg, size, total_size);

    n = E1000_ICS_RXT0;
    if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH])
@@ -1670,20 +1450,20 @@ static const VMStateDescription vmstate_e1000 = {
        VMSTATE_UINT16(eecd_state.bitnum_out, E1000State),
        VMSTATE_UINT16(eecd_state.reading, E1000State),
        VMSTATE_UINT32(eecd_state.old_eecd, E1000State),
-        VMSTATE_UINT8(tx.ipcss, E1000State),
-        VMSTATE_UINT8(tx.ipcso, E1000State),
-        VMSTATE_UINT16(tx.ipcse, E1000State),
-        VMSTATE_UINT8(tx.tucss, E1000State),
-        VMSTATE_UINT8(tx.tucso, E1000State),
-        VMSTATE_UINT16(tx.tucse, E1000State),
-        VMSTATE_UINT32(tx.paylen, E1000State),
-        VMSTATE_UINT8(tx.hdr_len, E1000State),
-        VMSTATE_UINT16(tx.mss, E1000State),
+        VMSTATE_UINT8(tx.props.ipcss, E1000State),
+        VMSTATE_UINT8(tx.props.ipcso, E1000State),
+        VMSTATE_UINT16(tx.props.ipcse, E1000State),
+        VMSTATE_UINT8(tx.props.tucss, E1000State),
+        VMSTATE_UINT8(tx.props.tucso, E1000State),
+        VMSTATE_UINT16(tx.props.tucse, E1000State),
+        VMSTATE_UINT32(tx.props.paylen, E1000State),
+        VMSTATE_UINT8(tx.props.hdr_len, E1000State),
+        VMSTATE_UINT16(tx.props.mss, E1000State),
        VMSTATE_UINT16(tx.size, E1000State),
        VMSTATE_UINT16(tx.tso_frames, E1000State),
-        VMSTATE_UINT8(tx.sum_needed, E1000State),
-        VMSTATE_INT8(tx.ip, E1000State),
-        VMSTATE_INT8(tx.tcp, E1000State),
+        VMSTATE_UINT8(tx.props.sum_needed, E1000State),
+        VMSTATE_INT8(tx.props.ip, E1000State),
+        VMSTATE_INT8(tx.props.tcp, E1000State),
        VMSTATE_BUFFER(tx.header, E1000State),
        VMSTATE_BUFFER(tx.data, E1000State),
        VMSTATE_UINT16_ARRAY(eeprom_data, E1000State, 64),
@@ -1806,15 +1586,11 @@ static void e1000_write_config(PCIDevice *pci_dev, uint32_t address,
    }
 }

-
 static void pci_e1000_realize(PCIDevice *pci_dev, Error **errp)
 {
    DeviceState *dev = DEVICE(pci_dev);
    E1000State *d = E1000(pci_dev);
-    PCIDeviceClass *pdc = PCI_DEVICE_GET_CLASS(pci_dev);
    uint8_t *pci_conf;
-    uint16_t checksum = 0;
-    int i;
    uint8_t *macaddr;

    pci_dev->config_write = e1000_write_config;
@@ -1832,17 +1608,14 @@ static void pci_e1000_realize(PCIDevice *pci_dev, Error **errp)

    pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &d->io);

-    memmove(d->eeprom_data, e1000_eeprom_template,
-        sizeof e1000_eeprom_template);
    qemu_macaddr_default_if_unset(&d->conf.macaddr);
    macaddr = d->conf.macaddr.a;
-    for (i = 0; i < 3; i++)
-        d->eeprom_data[i] = (macaddr[2*i+1]<<8) | macaddr[2*i];
-    d->eeprom_data[11] = d->eeprom_data[13] = pdc->device_id;
-    for (i = 0; i < EEPROM_CHECKSUM_REG; i++)
-        checksum += d->eeprom_data[i];
-    checksum = (uint16_t) EEPROM_SUM - checksum;
-    d->eeprom_data[EEPROM_CHECKSUM_REG] = checksum;
+
+    e1000x_core_prepare_eeprom(d->eeprom_data,
+                               e1000_eeprom_template,
+                               sizeof(e1000_eeprom_template),
+                               PCI_DEVICE_GET_CLASS(pci_dev)->device_id,
+                               macaddr);

    d->nic = qemu_new_nic(&net_e1000_info, &d->conf,
                          object_get_typename(OBJECT(d)), dev->id, d);
--- a/hw/net/e1000_regs.h
+++ b/hw/net/e1000_regs.h
@@ -85,6 +85,7 @@
 #define E1000_DEV_ID_82573E              0x108B
 #define E1000_DEV_ID_82573E_IAMT         0x108C
 #define E1000_DEV_ID_82573L              0x109A
+#define E1000_DEV_ID_82574L              0x10D3
 #define E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3 0x10B5
 #define E1000_DEV_ID_80003ES2LAN_COPPER_DPT     0x1096
 #define E1000_DEV_ID_80003ES2LAN_SERDES_DPT     0x1098
@@ -104,6 +105,7 @@
 #define E1000_PHY_ID2_82544x 0xC30
 #define E1000_PHY_ID2_8254xx_DEFAULT 0xC20 /* 82540x, 82545x, and 82546x */
 #define E1000_PHY_ID2_82573x 0xCC0
+#define E1000_PHY_ID2_82574x 0xCB1

 /* Register Set. (82543, 82544)
 *
@@ -135,8 +137,11 @@
 #define E1000_ITR      0x000C4  /* Interrupt Throttling Rate - RW */
 #define E1000_ICS      0x000C8  /* Interrupt Cause Set - WO */
 #define E1000_IMS      0x000D0  /* Interrupt Mask Set - RW */
+#define E1000_EIAC     0x000DC  /* Ext. Interrupt Auto Clear - RW */
 #define E1000_IMC      0x000D8  /* Interrupt Mask Clear - WO */
 #define E1000_IAM      0x000E0  /* Interrupt Acknowledge Auto Mask */
+#define E1000_IVAR     0x000E4  /* Interrupt Vector Allocation Register - RW */
+#define E1000_EITR     0x000E8  /* Extended Interrupt Throttling Rate - RW */
 #define E1000_RCTL     0x00100  /* RX Control - RW */
 #define E1000_RDTR1    0x02820  /* RX Delay Timer (1) - RW */
 #define E1000_RDBAL1   0x02900  /* RX Descriptor Base Address Low (1) - RW */
@@ -145,6 +150,7 @@
 #define E1000_RDH1     0x02910  /* RX Descriptor Head (1) - RW */
 #define E1000_RDT1     0x02918  /* RX Descriptor Tail (1) - RW */
 #define E1000_FCTTV    0x00170  /* Flow Control Transmit Timer Value - RW */
+#define E1000_FCRTV    0x05F40  /* Flow Control Refresh Timer Value - RW */
 #define E1000_TXCW     0x00178  /* TX Configuration Word - RW */
 #define E1000_RXCW     0x00180  /* RX Configuration Word - RO */
 #define E1000_TCTL     0x00400  /* TX Control - RW */
@@ -161,6 +167,10 @@
 #define E1000_PBM      0x10000  /* Packet Buffer Memory - RW */
 #define E1000_PBS      0x01008  /* Packet Buffer Size - RW */
 #define E1000_EEMNGCTL 0x01010  /* MNG EEprom Control */
+#define E1000_EEMNGDATA    0x01014 /* MNG EEPROM Read/Write data */
+#define E1000_FLMNGCTL     0x01018 /* MNG Flash Control */
+#define E1000_FLMNGDATA    0x0101C /* MNG FLASH Read data */
+#define E1000_FLMNGCNT     0x01020 /* MNG FLASH Read Counter */
 #define E1000_FLASH_UPDATES 1000
 #define E1000_EEARBC   0x01024  /* EEPROM Auto Read Bus Control */
 #define E1000_FLASHT   0x01028  /* FLASH Timer Register */
@@ -169,9 +179,12 @@
 #define E1000_FLSWDATA 0x01034  /* FLASH data register */
 #define E1000_FLSWCNT  0x01038  /* FLASH Access Counter */
 #define E1000_FLOP     0x0103C  /* FLASH Opcode Register */
+#define E1000_FLOL     0x01050  /* FEEP Auto Load */
 #define E1000_ERT      0x02008  /* Early Rx Threshold - RW */
 #define E1000_FCRTL    0x02160  /* Flow Control Receive Threshold Low - RW */
+#define E1000_FCRTL_A  0x00168  /* Alias to FCRTL */
 #define E1000_FCRTH    0x02168  /* Flow Control Receive Threshold High - RW */
+#define E1000_FCRTH_A  0x00160  /* Alias to FCRTH */
 #define E1000_PSRCTL   0x02170  /* Packet Split Receive Control - RW */
 #define E1000_RDBAL    0x02800  /* RX Descriptor Base Address Low - RW */
 #define E1000_RDBAH    0x02804  /* RX Descriptor Base Address High - RW */
@@ -179,11 +192,17 @@
 #define E1000_RDH      0x02810  /* RX Descriptor Head - RW */
 #define E1000_RDT      0x02818  /* RX Descriptor Tail - RW */
 #define E1000_RDTR     0x02820  /* RX Delay Timer - RW */
+#define E1000_RDTR_A   0x00108  /* Alias to RDTR */
 #define E1000_RDBAL0   E1000_RDBAL /* RX Desc Base Address Low (0) - RW */
+#define E1000_RDBAL0_A 0x00110     /* Alias to RDBAL0 */
 #define E1000_RDBAH0   E1000_RDBAH /* RX Desc Base Address High (0) - RW */
+#define E1000_RDBAH0_A 0x00114     /* Alias to RDBAH0 */
 #define E1000_RDLEN0   E1000_RDLEN /* RX Desc Length (0) - RW */
+#define E1000_RDLEN0_A 0x00118     /* Alias to RDLEN0 */
 #define E1000_RDH0     E1000_RDH   /* RX Desc Head (0) - RW */
+#define E1000_RDH0_A   0x00120     /* Alias to RDH0 */
 #define E1000_RDT0     E1000_RDT   /* RX Desc Tail (0) - RW */
+#define E1000_RDT0_A   0x00128     /* Alias to RDT0 */
 #define E1000_RDTR0    E1000_RDTR  /* RX Delay Timer (0) - RW */
 #define E1000_RXDCTL   0x02828  /* RX Descriptor Control queue 0 - RW */
 #define E1000_RXDCTL1  0x02928  /* RX Descriptor Control queue 1 - RW */
@@ -192,22 +211,33 @@
 #define E1000_RAID     0x02C08  /* Receive Ack Interrupt Delay - RW */
 #define E1000_TXDMAC   0x03000  /* TX DMA Control - RW */
 #define E1000_KABGTXD  0x03004  /* AFE Band Gap Transmit Ref Data */
+#define E1000_POEMB    0x00F10  /* PHY OEM Bits Register - RW */
 #define E1000_RDFH     0x02410  /* Receive Data FIFO Head Register - RW */
+#define E1000_RDFH_A   0x08000  /* Alias to RDFH */
 #define E1000_RDFT     0x02418  /* Receive Data FIFO Tail Register - RW */
+#define E1000_RDFT_A   0x08008  /* Alias to RDFT */
 #define E1000_RDFHS    0x02420  /* Receive Data FIFO Head Saved Register - RW */
 #define E1000_RDFTS    0x02428  /* Receive Data FIFO Tail Saved Register - RW */
 #define E1000_RDFPC    0x02430  /* Receive Data FIFO Packet Count - RW */
 #define E1000_TDFH     0x03410  /* TX Data FIFO Head - RW */
+#define E1000_TDFH_A   0x08010  /* Alias to TDFH */
 #define E1000_TDFT     0x03418  /* TX Data FIFO Tail - RW */
+#define E1000_TDFT_A   0x08018  /* Alias to TDFT */
 #define E1000_TDFHS    0x03420  /* TX Data FIFO Head Saved - RW */
 #define E1000_TDFTS    0x03428  /* TX Data FIFO Tail Saved - RW */
 #define E1000_TDFPC    0x03430  /* TX Data FIFO Packet Count - RW */
 #define E1000_TDBAL    0x03800  /* TX Descriptor Base Address Low - RW */
+#define E1000_TDBAL_A  0x00420  /* Alias to TDBAL */
 #define E1000_TDBAH    0x03804  /* TX Descriptor Base Address High - RW */
+#define E1000_TDBAH_A  0x00424  /* Alias to TDBAH */
 #define E1000_TDLEN    0x03808  /* TX Descriptor Length - RW */
+#define E1000_TDLEN_A  0x00428  /* Alias to TDLEN */
 #define E1000_TDH      0x03810  /* TX Descriptor Head - RW */
+#define E1000_TDH_A    0x00430  /* Alias to TDH */
 #define E1000_TDT      0x03818  /* TX Descripotr Tail - RW */
+#define E1000_TDT_A    0x00438  /* Alias to TDT */
 #define E1000_TIDV     0x03820  /* TX Interrupt Delay Value - RW */
+#define E1000_TIDV_A   0x00440  /* Alias to TIDV */
 #define E1000_TXDCTL   0x03828  /* TX Descriptor Control - RW */
 #define E1000_TADV     0x0382C  /* TX Interrupt Absolute Delay Val - RW */
 #define E1000_TSPMT    0x03830  /* TCP Segmentation PAD & Min Threshold - RW */
@@ -288,9 +318,15 @@
 #define E1000_ICRXOC   0x04124  /* Interrupt Cause Receiver Overrun Count */
 #define E1000_RXCSUM   0x05000  /* RX Checksum Control - RW */
 #define E1000_RFCTL    0x05008  /* Receive Filter Control*/
+#define E1000_MAVTV0   0x05010  /* Management VLAN TAG Value 0 */
+#define E1000_MAVTV1   0x05014  /* Management VLAN TAG Value 1 */
+#define E1000_MAVTV2   0x05018  /* Management VLAN TAG Value 2 */
+#define E1000_MAVTV3   0x0501c  /* Management VLAN TAG Value 3 */
 #define E1000_MTA      0x05200  /* Multicast Table Array - RW Array */
 #define E1000_RA       0x05400  /* Receive Address - RW Array */
+#define E1000_RA_A     0x00040  /* Alias to RA */
 #define E1000_VFTA     0x05600  /* VLAN Filter Table Array - RW Array */
+#define E1000_VFTA_A   0x00600  /* Alias to VFTA */
 #define E1000_WUC      0x05800  /* Wakeup Control - RW */
 #define E1000_WUFC     0x05808  /* Wakeup Filter Control - RW */
 #define E1000_WUS      0x05810  /* Wakeup Status - RO */
@@ -300,27 +336,57 @@
 #define E1000_IP6AT    0x05880  /* IPv6 Address Table - RW Array */
 #define E1000_WUPL     0x05900  /* Wakeup Packet Length - RW */
 #define E1000_WUPM     0x05A00  /* Wakeup Packet Memory - RO A */
+#define E1000_MFUTP01  0x05828  /* Management Flex UDP/TCP Ports 0/1 - RW */
+#define E1000_MFUTP23  0x05830  /* Management Flex UDP/TCP Ports 2/3 - RW */
+#define E1000_MFVAL    0x05824  /* Manageability Filters Valid - RW */
+#define E1000_MDEF     0x05890  /* Manageability Decision Filters - RW Array */
 #define E1000_FFLT     0x05F00  /* Flexible Filter Length Table - RW Array */
 #define E1000_HOST_IF  0x08800  /* Host Interface */
 #define E1000_FFMT     0x09000  /* Flexible Filter Mask Table - RW Array */
+#define E1000_FTFT     0x09400  /* Flexible TCO Filter Table - RW Array */
 #define E1000_FFVT     0x09800  /* Flexible Filter Value Table - RW Array */

 #define E1000_KUMCTRLSTA 0x00034 /* MAC-PHY interface - RW */
-#define E1000_MDPHYA     0x0003C  /* PHY address - RW */
-#define E1000_MANC2H     0x05860  /* Management Control To Host - RW */
+#define E1000_MDPHYA     0x0003C /* PHY address - RW */
+#define E1000_MANC2H     0x05860 /* Management Control To Host - RW */
 #define E1000_SW_FW_SYNC 0x05B5C /* Software-Firmware Synchronization - RW */

 #define E1000_GCR       0x05B00 /* PCI-Ex Control */
+#define E1000_FUNCTAG   0x05B08 /* Function-Tag Register */
 #define E1000_GSCL_1    0x05B10 /* PCI-Ex Statistic Control #1 */
 #define E1000_GSCL_2    0x05B14 /* PCI-Ex Statistic Control #2 */
 #define E1000_GSCL_3    0x05B18 /* PCI-Ex Statistic Control #3 */
 #define E1000_GSCL_4    0x05B1C /* PCI-Ex Statistic Control #4 */
+#define E1000_GSCN_0    0x05B20 /* 3GIO Statistic Counter Register #0 */
+#define E1000_GSCN_1    0x05B24 /* 3GIO Statistic Counter Register #1 */
+#define E1000_GSCN_2    0x05B28 /* 3GIO Statistic Counter Register #2 */
+#define E1000_GSCN_3    0x05B2C /* 3GIO Statistic Counter Register #3 */
 #define E1000_FACTPS    0x05B30 /* Function Active and Power State to MNG */
 #define E1000_SWSM      0x05B50 /* SW Semaphore */
+#define E1000_GCR2      0x05B64 /* 3GIO Control Register 2 */
 #define E1000_FWSM      0x05B54 /* FW Semaphore */
+#define E1000_PBACLR    0x05B68 /* MSI-X PBA Clear */
 #define E1000_FFLT_DBG  0x05F04 /* Debug Register */
 #define E1000_HICR      0x08F00 /* Host Inteface Control */

+#define E1000_TSYNCRXCTL 0x0B620 /* Rx Time Sync Control register - RW */
+#define E1000_TSYNCTXCTL 0x0B614 /* Tx Time Sync Control register - RW */
+#define E1000_TIMINCA    0x0B608 /* Increment attributes register - RW */
+#define E1000_RXSTMPL    0x0B624 /* Rx timestamp Low - RO */
+#define E1000_RXSTMPH    0x0B628 /* Rx timestamp High - RO */
+#define E1000_TXSTMPL    0x0B618 /* Tx timestamp value Low - RO */
+#define E1000_TXSTMPH    0x0B61C /* Tx timestamp value High - RO */
+#define E1000_SYSTIML    0x0B600 /* System time register Low - RO */
+#define E1000_SYSTIMH    0x0B604 /* System time register High - RO */
+#define E1000_TIMINCA    0x0B608 /* Increment attributes register - RW */
+#define E1000_RXMTRL     0x0B634 /* Time sync Rx EtherType and Msg Type - RW */
+#define E1000_RXUDP      0x0B638 /* Time Sync Rx UDP Port - RW */
+#define E1000_RXSATRL    0x0B62C /* Rx timestamp attribute low - RO */
+#define E1000_RXSATRH    0x0B630 /* Rx timestamp attribute high - RO */
+#define E1000_TIMADJL    0x0B60C /* Time Adjustment Offset register Low - RW */
+#define E1000_TIMADJH    0x0B610 /* Time Adjustment Offset register High - RW */
+#define E1000_RXCFGL     0x0B634 /* RX Ethertype and Message Type - RW*/
+
 /* RSS registers */
 #define E1000_CPUVEC    0x02C10 /* CPU Vector Register - RW */
 #define E1000_MRQC      0x05818 /* Multiple Receive Control - RW */
@@ -329,6 +395,85 @@
 #define E1000_RSSIM     0x05864 /* RSS Interrupt Mask */
 #define E1000_RSSIR     0x05868 /* RSS Interrupt Request */

+#define E1000_MRQC_ENABLED(mrqc) (((mrqc) & (BIT(0) | BIT(1))) == BIT(0))
+
+#define E1000_RETA_IDX(hash)        ((hash) & (BIT(7) - 1))
+#define E1000_RETA_VAL(reta, hash)  (((uint8_t *)(reta))[E1000_RETA_IDX(hash)])
+#define E1000_RSS_QUEUE(reta, hash) ((E1000_RETA_VAL(reta, hash) & BIT(7)) >> 7)
+
+#define E1000_MRQC_EN_TCPIPV4(mrqc) ((mrqc) & BIT(16))
+#define E1000_MRQC_EN_IPV4(mrqc)    ((mrqc) & BIT(17))
+#define E1000_MRQC_EN_TCPIPV6(mrqc) ((mrqc) & BIT(18))
+#define E1000_MRQC_EN_IPV6EX(mrqc)  ((mrqc) & BIT(19))
+#define E1000_MRQC_EN_IPV6(mrqc)    ((mrqc) & BIT(20))
+
+#define E1000_MRQ_RSS_TYPE_NONE     (0)
+#define E1000_MRQ_RSS_TYPE_IPV4TCP  (1)
+#define E1000_MRQ_RSS_TYPE_IPV4     (2)
+#define E1000_MRQ_RSS_TYPE_IPV6TCP  (3)
+#define E1000_MRQ_RSS_TYPE_IPV6EX   (4)
+#define E1000_MRQ_RSS_TYPE_IPV6     (5)
+
+#define E1000_ICR_ASSERTED BIT(31)
+#define E1000_EIAC_MASK    0x01F00000
+
+/* [TR]DBAL and [TR]DLEN masks */
+#define E1000_XDBAL_MASK            (~(BIT(4) - 1))
+#define E1000_XDLEN_MASK            ((BIT(20) - 1) & (~(BIT(7) - 1)))
+
+/* IVAR register parsing helpers */
+#define E1000_IVAR_INT_ALLOC_VALID  (0x8)
+
+#define E1000_IVAR_RXQ0_SHIFT       (0)
+#define E1000_IVAR_RXQ1_SHIFT       (4)
+#define E1000_IVAR_TXQ0_SHIFT       (8)
+#define E1000_IVAR_TXQ1_SHIFT       (12)
+#define E1000_IVAR_OTHER_SHIFT      (16)
+
+#define E1000_IVAR_ENTRY_MASK       (0xF)
+#define E1000_IVAR_ENTRY_VALID_MASK E1000_IVAR_INT_ALLOC_VALID
+#define E1000_IVAR_ENTRY_VEC_MASK   (0x7)
+
+#define E1000_IVAR_RXQ0(x)          ((x) >> E1000_IVAR_RXQ0_SHIFT)
+#define E1000_IVAR_RXQ1(x)          ((x) >> E1000_IVAR_RXQ1_SHIFT)
+#define E1000_IVAR_TXQ0(x)          ((x) >> E1000_IVAR_TXQ0_SHIFT)
+#define E1000_IVAR_TXQ1(x)          ((x) >> E1000_IVAR_TXQ1_SHIFT)
+#define E1000_IVAR_OTHER(x)         ((x) >> E1000_IVAR_OTHER_SHIFT)
+
+#define E1000_IVAR_ENTRY_VALID(x)   ((x) & E1000_IVAR_ENTRY_VALID_MASK)
+#define E1000_IVAR_ENTRY_VEC(x)     ((x) & E1000_IVAR_ENTRY_VEC_MASK)
+
+#define E1000_IVAR_TX_INT_EVERY_WB  BIT(31)
+
+/* RFCTL register bits */
+#define E1000_RFCTL_ISCSI_DIS           0x00000001
+#define E1000_RFCTL_NFSW_DIS            0x00000040
+#define E1000_RFCTL_NFSR_DIS            0x00000080
+#define E1000_RFCTL_IPV6_DIS            0x00000400
+#define E1000_RFCTL_IPV6_XSUM_DIS       0x00000800
+#define E1000_RFCTL_ACK_DIS             0x00001000
+#define E1000_RFCTL_ACK_DATA_DIS        0x00002000
+#define E1000_RFCTL_IPFRSP_DIS          0x00004000
+#define E1000_RFCTL_EXTEN               0x00008000
+#define E1000_RFCTL_IPV6_EX_DIS         0x00010000
+#define E1000_RFCTL_NEW_IPV6_EXT_DIS    0x00020000
+
+/* PSRCTL parsing */
+#define E1000_PSRCTL_BSIZE0_MASK   0x0000007F
+#define E1000_PSRCTL_BSIZE1_MASK   0x00003F00
+#define E1000_PSRCTL_BSIZE2_MASK   0x003F0000
+#define E1000_PSRCTL_BSIZE3_MASK   0x3F000000
+
+#define E1000_PSRCTL_BSIZE0_SHIFT  0
+#define E1000_PSRCTL_BSIZE1_SHIFT  8
+#define E1000_PSRCTL_BSIZE2_SHIFT  16
+#define E1000_PSRCTL_BSIZE3_SHIFT  24
+
+#define E1000_PSRCTL_BUFFS_PER_DESC 4
+
+/* TARC* parsing */
+#define E1000_TARC_ENABLE BIT(10)
+
 /* PHY 1000 MII Register/Bit Definitions */
 /* PHY Registers defined by IEEE */
 #define PHY_CTRL         0x00 /* Control Register */
@@ -344,6 +489,40 @@
 #define PHY_1000T_STATUS 0x0A /* 1000Base-T Status Reg */
 #define PHY_EXT_STATUS   0x0F /* Extended Status Reg */

+/* 82574-specific registers */
+#define PHY_COPPER_CTRL1      0x10 /* Copper Specific Control Register 1 */
+#define PHY_COPPER_STAT1      0x11 /* Copper Specific Status Register 1 */
+#define PHY_COPPER_INT_ENABLE 0x12  /* Interrupt Enable Register */
+#define PHY_COPPER_STAT2      0x13 /* Copper Specific Status Register 2 */
+#define PHY_COPPER_CTRL3      0x14 /* Copper Specific Control Register 3 */
+#define PHY_COPPER_CTRL2      0x1A /* Copper Specific Control Register 2 */
+#define PHY_RX_ERR_CNTR       0x15  /* Receive Error Counter */
+#define PHY_PAGE              0x16 /* Page Address (Any page) */
+#define PHY_OEM_BITS          0x19 /* OEM Bits (Page 0) */
+#define PHY_BIAS_1            0x1d /* Bias Setting Register */
+#define PHY_BIAS_2            0x1e /* Bias Setting Register */
+
+/* 82574-specific registers - page 2 */
+#define PHY_MAC_CTRL1         0x10 /* MAC Specific Control Register 1 */
+#define PHY_MAC_INT_ENABLE    0x12 /* MAC Interrupt Enable Register */
+#define PHY_MAC_STAT          0x13 /* MAC Specific Status Register */
+#define PHY_MAC_CTRL2         0x15 /* MAC Specific Control Register 2 */
+
+/* 82574-specific registers - page 3 */
+#define PHY_LED_03_FUNC_CTRL1 0x10 /* LED[3:0] Function Control */
+#define PHY_LED_03_POL_CTRL   0x11 /* LED[3:0] Polarity Control */
+#define PHY_LED_TIMER_CTRL    0x12 /* LED Timer Control */
+#define PHY_LED_45_CTRL       0x13 /* LED[5:4] Function Control and Polarity */
+
+/* 82574-specific registers - page 5 */
+#define PHY_1000T_SKEW        0x14 /* 1000 BASE - T Pair Skew Register */
+#define PHY_1000T_SWAP        0x15 /* 1000 BASE - T Pair Swap and Polarity */
+
+/* 82574-specific registers - page 6 */
+#define PHY_CRC_COUNTERS      0x11 /* CRC Counters */
+
+#define PHY_PAGE_RW_MASK 0x7F /* R/W part of page address register */
+
 #define MAX_PHY_REG_ADDRESS        0x1F  /* 5 bit address bus (0-0x1F) */
 #define MAX_PHY_MULTI_PAGE_REG     0xF   /* Registers equal on all pages */

@@ -423,6 +602,18 @@
 #define E1000_ICR_DSW           0x00000020 /* FW changed the status of DISSW bit in the FWSM */
 #define E1000_ICR_PHYINT        0x00001000 /* LAN connected device generates an interrupt */
 #define E1000_ICR_EPRST         0x00100000 /* ME handware reset occurs */
+#define E1000_ICR_RXQ0          0x00100000 /* Rx Queue 0 Interrupt */
+#define E1000_ICR_RXQ1          0x00200000 /* Rx Queue 1 Interrupt */
+#define E1000_ICR_TXQ0          0x00400000 /* Tx Queue 0 Interrupt */
+#define E1000_ICR_TXQ1          0x00800000 /* Tx Queue 1 Interrupt */
+#define E1000_ICR_OTHER         0x01000000 /* Other Interrupts */
+
+#define E1000_ICR_OTHER_CAUSES (E1000_ICR_LSC  | \
+                                E1000_ICR_RXO  | \
+                                E1000_ICR_MDAC | \
+                                E1000_ICR_SRPD | \
+                                E1000_ICR_ACK  | \
+                                E1000_ICR_MNG)

 /* Interrupt Cause Set */
 #define E1000_ICS_TXDW      E1000_ICR_TXDW      /* Transmit desc written back */
@@ -471,6 +662,11 @@
 #define E1000_IMS_SRPD      E1000_ICR_SRPD
 #define E1000_IMS_ACK       E1000_ICR_ACK       /* Receive Ack frame */
 #define E1000_IMS_MNG       E1000_ICR_MNG       /* Manageability event */
+#define E1000_IMS_RXQ0      E1000_ICR_RXQ0
+#define E1000_IMS_RXQ1      E1000_ICR_RXQ1
+#define E1000_IMS_TXQ0      E1000_ICR_TXQ0
+#define E1000_IMS_TXQ1      E1000_ICR_TXQ1
+#define E1000_IMS_OTHER     E1000_ICR_OTHER
 #define E1000_IMS_DOCK      E1000_ICR_DOCK      /* Dock/Undock */
 #define E1000_IMS_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */
 #define E1000_IMS_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */
@@ -562,6 +758,15 @@
 #define E1000_EEPROM_RW_ADDR_SHIFT 8    /* Shift to the address bits */
 #define E1000_EEPROM_POLL_WRITE    1    /* Flag for polling for write complete */
 #define E1000_EEPROM_POLL_READ     0    /* Flag for polling for read complete */
+
+/* 82574 EERD/EEWR registers layout */
+#define E1000_EERW_START        BIT(0)
+#define E1000_EERW_DONE         BIT(1)
+#define E1000_EERW_ADDR_SHIFT   2
+#define E1000_EERW_ADDR_MASK    ((1L << 14) - 1)
+#define E1000_EERW_DATA_SHIFT   16
+#define E1000_EERW_DATA_MASK   ((1L << 16) - 1)
+
 /* Register Bit Masks */
 /* Device Control */
 #define E1000_CTRL_FD       0x00000001  /* Full duplex.0=half; 1=full */
@@ -584,7 +789,17 @@
 #define E1000_CTRL_D_UD_EN  0x00002000  /* Dock/Undock enable */
 #define E1000_CTRL_D_UD_POLARITY 0x00004000 /* Defined polarity of Dock/Undock indication in SDP[0] */
 #define E1000_CTRL_FORCE_PHY_RESET 0x00008000 /* Reset both PHY ports, through PHYRST_N pin */
+#define E1000_CTRL_SPD_SHIFT 8          /* Speed Select Shift */
+
+#define E1000_CTRL_EXT_ASDCHK  0x00001000 /* auto speed detection check */
+#define E1000_CTRL_EXT_EE_RST  0x00002000 /* EEPROM reset */
 #define E1000_CTRL_EXT_LINK_EN 0x00010000 /* enable link status from external LINK_0 and LINK_1 pins */
+#define E1000_CTRL_EXT_EIAME   0x01000000
+#define E1000_CTRL_EXT_IAME    0x08000000 /* Int ACK Auto-mask */
+#define E1000_CTRL_EXT_PBA_CLR 0x80000000 /* PBA Clear */
+#define E1000_CTRL_EXT_INT_TIMERS_CLEAR_ENA 0x20000000
+#define E1000_CTRL_EXT_SPD_BYPS  0x00008000 /* Speed Select Bypass */
+
 #define E1000_CTRL_SWDPIN0  0x00040000  /* SWDPIN 0 value */
 #define E1000_CTRL_SWDPIN1  0x00080000  /* SWDPIN 1 value */
 #define E1000_CTRL_SWDPIN2  0x00100000  /* SWDPIN 2 value */
@@ -593,6 +808,7 @@
 #define E1000_CTRL_SWDPIO1  0x00800000  /* SWDPIN 1 input or output */
 #define E1000_CTRL_SWDPIO2  0x01000000  /* SWDPIN 2 input or output */
 #define E1000_CTRL_SWDPIO3  0x02000000  /* SWDPIN 3 input or output */
+#define E1000_CTRL_ADVD3WUC 0x00100000  /* D3 WUC */
 #define E1000_CTRL_RST      0x04000000  /* Global reset */
 #define E1000_CTRL_RFCE     0x08000000  /* Receive Flow Control enable */
 #define E1000_CTRL_TFCE     0x10000000  /* Transmit flow control enable */
@@ -617,9 +833,13 @@
 #define E1000_STATUS_LAN_INIT_DONE 0x00000200   /* Lan Init Completion
                                                   by EEPROM/Flash */
 #define E1000_STATUS_ASDV       0x00000300      /* Auto speed detect value */
+#define E1000_STATUS_ASDV_10    0x00000000      /* ASDV 10Mb */
+#define E1000_STATUS_ASDV_100   0x00000100      /* ASDV 100Mb */
+#define E1000_STATUS_ASDV_1000  0x00000200      /* ASDV 1Gb */
 #define E1000_STATUS_DOCK_CI    0x00000800      /* Change in Dock/Undock state. Clear on write '0'. */
 #define E1000_STATUS_GIO_MASTER_ENABLE 0x00080000 /* Status of Master requests. */
 #define E1000_STATUS_MTXCKOK    0x00000400      /* MTX clock running OK */
+#define E1000_STATUS_PHYRA      0x00000400      /* PHY Reset Asserted */
 #define E1000_STATUS_PCI66      0x00000800      /* In 66Mhz slot */
 #define E1000_STATUS_BUS64      0x00001000      /* In 64 bit slot */
 #define E1000_STATUS_PCIX_MODE  0x00002000      /* PCI-X mode */
@@ -634,6 +854,8 @@
 #define E1000_STATUS_FUSE_9       0x08000000
 #define E1000_STATUS_SERDES0_DIS  0x10000000 /* SERDES disabled on port 0 */
 #define E1000_STATUS_SERDES1_DIS  0x20000000 /* SERDES disabled on port 1 */
+#define E1000_STATUS_SPEED_SHIFT  6
+#define E1000_STATUS_ASDV_SHIFT   8

 /* EEPROM/Flash Control */
 #define E1000_EECD_SK        0x00000001 /* EEPROM Clock */
@@ -664,6 +886,8 @@
 #define E1000_EECD_AUPDEN    0x00100000 /* Enable Autonomous FLASH update */
 #define E1000_EECD_SHADV     0x00200000 /* Shadow RAM Data Valid */
 #define E1000_EECD_SEC1VAL   0x00400000 /* Sector One Valid */
+
+
 #define E1000_EECD_SECVAL_SHIFT      22
 #define E1000_STM_OPCODE     0xDB00
 #define E1000_HICR_FW_RESET  0xC0
@@ -684,6 +908,18 @@
 #define E1000_MDIC_INT_EN    0x20000000
 #define E1000_MDIC_ERROR     0x40000000

+/* Rx Interrupt Delay Timer */
+#define E1000_RDTR_FPD       BIT(31)
+
+/* Tx Interrupt Delay Timer */
+#define E1000_TIDV_FPD       BIT(31)
+
+/* Delay increments in nanoseconds for delayed interrupts registers */
+#define E1000_INTR_DELAY_NS_RES (1024)
+
+/* Delay increments in nanoseconds for interrupt throttling registers */
+#define E1000_INTR_THROTTLING_NS_RES (256)
+
 /* EEPROM Commands - Microwire */
 #define EEPROM_READ_OPCODE_MICROWIRE  0x6  /* EEPROM read opcode */
 #define EEPROM_WRITE_OPCODE_MICROWIRE 0x5  /* EEPROM write opcode */
@@ -711,6 +947,21 @@
 #define E1000_EEPROM_CFG_DONE         0x00040000   /* MNG config cycle done */
 #define E1000_EEPROM_CFG_DONE_PORT_1  0x00080000   /* ...for second port */

+/* PCI Express Control */
+/* 3GIO Control Register - GCR (0x05B00; RW) */
+#define E1000_L0S_ADJUST              (1 << 9)
+#define E1000_L1_ENTRY_LATENCY_MSB    (1 << 23)
+#define E1000_L1_ENTRY_LATENCY_LSB    (1 << 25 | 1 << 26)
+
+#define E1000_L0S_ADJUST              (1 << 9)
+#define E1000_L1_ENTRY_LATENCY_MSB    (1 << 23)
+#define E1000_L1_ENTRY_LATENCY_LSB    (1 << 25 | 1 << 26)
+
+#define E1000_GCR_RO_BITS             (1 << 23 | 1 << 25 | 1 << 26)
+
+/* MSI-X PBA Clear register */
+#define E1000_PBACLR_VALID_MASK       (BIT(5) - 1)
+
 /* Transmit Descriptor */
 struct e1000_tx_desc {
    uint64_t buffer_addr;       /* Address of the descriptor's data buffer */
@@ -752,7 +1003,9 @@ struct e1000_tx_desc {
 #define E1000_TXD_CMD_TCP    0x01000000 /* TCP packet */
 #define E1000_TXD_CMD_IP     0x02000000 /* IP packet */
 #define E1000_TXD_CMD_TSE    0x04000000 /* TCP Seg enable */
+#define E1000_TXD_CMD_SNAP   0x40000000 /* Update SNAP header */
 #define E1000_TXD_STAT_TC    0x00000004 /* Tx Underrun */
+#define E1000_TXD_EXTCMD_TSTAMP 0x00000010 /* IEEE1588 Timestamp packet */

 /* Transmit Control */
 #define E1000_TCTL_RST    0x00000001    /* software reset */
@@ -767,7 +1020,7 @@ struct e1000_tx_desc {
 #define E1000_TCTL_NRTU   0x02000000    /* No Re-transmit on underrun */
 #define E1000_TCTL_MULR   0x10000000    /* Multiple request support */

-/* Receive Descriptor */
+/* Legacy Receive Descriptor */
 struct e1000_rx_desc {
    uint64_t buffer_addr; /* Address of the descriptor's data buffer */
    uint16_t length;     /* Length of data DMAed into data buffer */
@@ -777,6 +1030,78 @@ struct e1000_rx_desc {
    uint16_t special;
 };

+/* Extended Receive Descriptor */
+union e1000_rx_desc_extended {
+    struct {
+        uint64_t buffer_addr;
+        uint64_t reserved;
+    } read;
+    struct {
+        struct {
+            uint32_t mrq;           /* Multiple Rx Queues */
+            union {
+                uint32_t rss;       /* RSS Hash */
+                struct {
+                    uint16_t ip_id; /* IP id */
+                    uint16_t csum;  /* Packet Checksum */
+                } csum_ip;
+            } hi_dword;
+        } lower;
+        struct {
+            uint32_t status_error;  /* ext status/error */
+            uint16_t length;
+            uint16_t vlan;          /* VLAN tag */
+        } upper;
+    } wb;                           /* writeback */
+};
+
+#define MAX_PS_BUFFERS 4
+
+/* Number of packet split data buffers (not including the header buffer) */
+#define PS_PAGE_BUFFERS    (MAX_PS_BUFFERS - 1)
+
+/* Receive Descriptor - Packet Split */
+union e1000_rx_desc_packet_split {
+    struct {
+        /* one buffer for protocol header(s), three data buffers */
+        uint64_t buffer_addr[MAX_PS_BUFFERS];
+    } read;
+    struct {
+        struct {
+            uint32_t mrq;          /* Multiple Rx Queues */
+            union {
+                uint32_t rss;          /* RSS Hash */
+                struct {
+                    uint16_t ip_id;    /* IP id */
+                    uint16_t csum;     /* Packet Checksum */
+                } csum_ip;
+            } hi_dword;
+        } lower;
+        struct {
+            uint32_t status_error;     /* ext status/error */
+            uint16_t length0;      /* length of buffer 0 */
+            uint16_t vlan;         /* VLAN tag */
+        } middle;
+        struct {
+            uint16_t header_status;
+            /* length of buffers 1-3 */
+            uint16_t length[PS_PAGE_BUFFERS];
+        } upper;
+        uint64_t reserved;
+    } wb; /* writeback */
+};
+
+/* Receive Checksum Control bits */
+#define E1000_RXCSUM_IPOFLD     0x100   /* IP Checksum Offload Enable */
+#define E1000_RXCSUM_TUOFLD     0x200   /* TCP/UDP Checksum Offload Enable */
+#define E1000_RXCSUM_PCSD       0x2000  /* Packet Checksum Disable */
+
+#define E1000_RING_DESC_LEN       (16)
+#define E1000_RING_DESC_LEN_SHIFT (4)
+
+#define E1000_MIN_RX_DESC_LEN   E1000_RING_DESC_LEN
+#define E1000_MAX_RX_DESC_LEN   (sizeof(union e1000_rx_desc_packet_split))
+
 /* Receive Descriptor bit definitions */
 #define E1000_RXD_STAT_DD       0x01    /* Descriptor Done */
 #define E1000_RXD_STAT_EOP      0x02    /* End of Packet */
@@ -802,6 +1127,15 @@ struct e1000_rx_desc {
 #define E1000_RXD_SPC_CFI_MASK  0x1000  /* CFI is bit 12 */
 #define E1000_RXD_SPC_CFI_SHIFT 12

+/* RX packet types */
+#define E1000_RXD_PKT_MAC       (0)
+#define E1000_RXD_PKT_IP4       (1)
+#define E1000_RXD_PKT_IP4_XDP   (2)
+#define E1000_RXD_PKT_IP6       (5)
+#define E1000_RXD_PKT_IP6_XDP   (6)
+
+#define E1000_RXD_PKT_TYPE(t) ((t) << 16)
+
 #define E1000_RXDEXT_STATERR_CE    0x01000000
 #define E1000_RXDEXT_STATERR_SE    0x02000000
 #define E1000_RXDEXT_STATERR_SEQ   0x04000000
@@ -879,6 +1213,8 @@ struct e1000_data_desc {
 #define E1000_MANC_NEIGHBOR_EN   0x00004000 /* Enable Neighbor Discovery
                                             * Filtering */
 #define E1000_MANC_ARP_RES_EN    0x00008000 /* Enable ARP response Filtering */
+#define E1000_MANC_DIS_IP_CHK_ARP  0x10000000 /* Disable IP address chacking */
+                                              /*for ARP packets - in 82574 */
 #define E1000_MANC_TCO_RESET     0x00010000 /* TCO Reset Occurred */
 #define E1000_MANC_RCV_TCO_EN    0x00020000 /* Receive TCO Packets Enabled */
 #define E1000_MANC_REPORT_STATUS 0x00040000 /* Status Reporting Enabled */
@@ -902,7 +1238,14 @@ struct e1000_data_desc {
 #define E1000_MANC_SMB_DATA_OUT_SHIFT  28 /* SMBus Data Out Shift */
 #define E1000_MANC_SMB_CLK_OUT_SHIFT   29 /* SMBus Clock Out Shift */

+/* FACTPS Control */
+#define E1000_FACTPS_LAN0_ON     0x00000004 /* Lan 0 enable */
+
 /* For checksumming, the sum of all words in the EEPROM should equal 0xBABA. */
 #define EEPROM_SUM 0xBABA

+/* I/O-Mapped Access to Internal Registers, Memories, and Flash */
+#define E1000_IOADDR 0x00
+#define E1000_IODATA 0x04
+
 #endif /* _E1000_HW_H_ */
--- a/hw/net/e1000e.c
+++ b/hw/net/e1000e.c
@@ -0,0 +1,739 @@
+/*
+* QEMU INTEL 82574 GbE NIC emulation
+*
+* Software developer's manuals:
+* http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf
+*
+* Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
+* Developed by Daynix Computing LTD (http://www.daynix.com)
+*
+* Authors:
+* Dmitry Fleytman <dmitry@daynix.com>
+* Leonid Bloch <leonid@daynix.com>
+* Yan Vugenfirer <yan@daynix.com>
+*
+* Based on work done by:
+* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+* Copyright (c) 2008 Qumranet
+* Based on work done by:
+* Copyright (c) 2007 Dan Aloni
+* Copyright (c) 2004 Antony T Curtis
+*
+* This library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2 of the License, or (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "qemu/osdep.h"
+#include "net/net.h"
+#include "net/tap.h"
+#include "qemu/range.h"
+#include "sysemu/sysemu.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+
+#include "hw/net/e1000_regs.h"
+
+#include "e1000x_common.h"
+#include "e1000e_core.h"
+
+#include "trace.h"
+
+#define TYPE_E1000E "e1000e"
+#define E1000E(obj) OBJECT_CHECK(E1000EState, (obj), TYPE_E1000E)
+
+typedef struct E1000EState {
+    PCIDevice parent_obj;
+    NICState *nic;
+    NICConf conf;
+
+    MemoryRegion mmio;
+    MemoryRegion flash;
+    MemoryRegion io;
+    MemoryRegion msix;
+
+    uint32_t ioaddr;
+
+    uint16_t subsys_ven;
+    uint16_t subsys;
+
+    uint16_t subsys_ven_used;
+    uint16_t subsys_used;
+
+    uint32_t intr_state;
+    bool disable_vnet;
+
+    E1000ECore core;
+
+} E1000EState;
+
+#define E1000E_MMIO_IDX     0
+#define E1000E_FLASH_IDX    1
+#define E1000E_IO_IDX       2
+#define E1000E_MSIX_IDX     3
+
+#define E1000E_MMIO_SIZE    (128 * 1024)
+#define E1000E_FLASH_SIZE   (128 * 1024)
+#define E1000E_IO_SIZE      (32)
+#define E1000E_MSIX_SIZE    (16 * 1024)
+
+#define E1000E_MSIX_TABLE   (0x0000)
+#define E1000E_MSIX_PBA     (0x2000)
+
+#define E1000E_USE_MSI     BIT(0)
+#define E1000E_USE_MSIX    BIT(1)
+
+static uint64_t
+e1000e_mmio_read(void *opaque, hwaddr addr, unsigned size)
+{
+    E1000EState *s = opaque;
+    return e1000e_core_read(&s->core, addr, size);
+}
+
+static void
+e1000e_mmio_write(void *opaque, hwaddr addr,
+                   uint64_t val, unsigned size)
+{
+    E1000EState *s = opaque;
+    e1000e_core_write(&s->core, addr, val, size);
+}
+
+static bool
+e1000e_io_get_reg_index(E1000EState *s, uint32_t *idx)
+{
+    if (s->ioaddr < 0x1FFFF) {
+        *idx = s->ioaddr;
+        return true;
+    }
+
+    if (s->ioaddr < 0x7FFFF) {
+        trace_e1000e_wrn_io_addr_undefined(s->ioaddr);
+        return false;
+    }
+
+    if (s->ioaddr < 0xFFFFF) {
+        trace_e1000e_wrn_io_addr_flash(s->ioaddr);
+        return false;
+    }
+
+    trace_e1000e_wrn_io_addr_unknown(s->ioaddr);
+    return false;
+}
+
+static uint64_t
+e1000e_io_read(void *opaque, hwaddr addr, unsigned size)
+{
+    E1000EState *s = opaque;
+    uint32_t idx;
+    uint64_t val;
+
+    switch (addr) {
+    case E1000_IOADDR:
+        trace_e1000e_io_read_addr(s->ioaddr);
+        return s->ioaddr;
+    case E1000_IODATA:
+        if (e1000e_io_get_reg_index(s, &idx)) {
+            val = e1000e_core_read(&s->core, idx, sizeof(val));
+            trace_e1000e_io_read_data(idx, val);
+            return val;
+        }
+        return 0;
+    default:
+        trace_e1000e_wrn_io_read_unknown(addr);
+        return 0;
+    }
+}
+
+static void
+e1000e_io_write(void *opaque, hwaddr addr,
+                uint64_t val, unsigned size)
+{
+    E1000EState *s = opaque;
+    uint32_t idx;
+
+    switch (addr) {
+    case E1000_IOADDR:
+        trace_e1000e_io_write_addr(val);
+        s->ioaddr = (uint32_t) val;
+        return;
+    case E1000_IODATA:
+        if (e1000e_io_get_reg_index(s, &idx)) {
+            trace_e1000e_io_write_data(idx, val);
+            e1000e_core_write(&s->core, idx, val, sizeof(val));
+        }
+        return;
+    default:
+        trace_e1000e_wrn_io_write_unknown(addr);
+        return;
+    }
+}
+
+static const MemoryRegionOps mmio_ops = {
+    .read = e1000e_mmio_read,
+    .write = e1000e_mmio_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+static const MemoryRegionOps io_ops = {
+    .read = e1000e_io_read,
+    .write = e1000e_io_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+static int
+e1000e_nc_can_receive(NetClientState *nc)
+{
+    E1000EState *s = qemu_get_nic_opaque(nc);
+    return e1000e_can_receive(&s->core);
+}
+
+static ssize_t
+e1000e_nc_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
+{
+    E1000EState *s = qemu_get_nic_opaque(nc);
+    return e1000e_receive_iov(&s->core, iov, iovcnt);
+}
+
+static ssize_t
+e1000e_nc_receive(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+    E1000EState *s = qemu_get_nic_opaque(nc);
+    return e1000e_receive(&s->core, buf, size);
+}
+
+static void
+e1000e_set_link_status(NetClientState *nc)
+{
+    E1000EState *s = qemu_get_nic_opaque(nc);
+    e1000e_core_set_link_status(&s->core);
+}
+
+static NetClientInfo net_e1000e_info = {
+    .type = NET_CLIENT_OPTIONS_KIND_NIC,
+    .size = sizeof(NICState),
+    .can_receive = e1000e_nc_can_receive,
+    .receive = e1000e_nc_receive,
+    .receive_iov = e1000e_nc_receive_iov,
+    .link_status_changed = e1000e_set_link_status,
+};
+
+/*
+* EEPROM (NVM) contents documented in Table 36, section 6.1
+* and generally 6.1.2 Software accessed words.
+*/
+static const uint16_t e1000e_eeprom_template[64] = {
+  /*        Address        |    Compat.    | ImVer |   Compat.     */
+    0x0000, 0x0000, 0x0000, 0x0420, 0xf746, 0x2010, 0xffff, 0xffff,
+  /*      PBA      |ICtrl1 | SSID  | SVID  | DevID |-------|ICtrl2 */
+    0x0000, 0x0000, 0x026b, 0x0000, 0x8086, 0x0000, 0x0000, 0x8058,
+  /*    NVM words 1,2,3    |-------------------------------|PCI-EID*/
+    0x0000, 0x2001, 0x7e7c, 0xffff, 0x1000, 0x00c8, 0x0000, 0x2704,
+  /* PCIe Init. Conf 1,2,3 |PCICtrl|PHY|LD1|-------| RevID | LD0,2 */
+    0x6cc9, 0x3150, 0x070e, 0x460b, 0x2d84, 0x0100, 0xf000, 0x0706,
+  /* FLPAR |FLANADD|LAN-PWR|FlVndr |ICtrl3 |APTSMBA|APTRxEP|APTSMBC*/
+    0x6000, 0x0080, 0x0f04, 0x7fff, 0x4f01, 0xc600, 0x0000, 0x20ff,
+  /* APTIF | APTMC |APTuCP |LSWFWID|MSWFWID|NC-SIMC|NC-SIC | VPDP  */
+    0x0028, 0x0003, 0x0000, 0x0000, 0x0000, 0x0003, 0x0000, 0xffff,
+  /*                            SW Section                         */
+    0x0100, 0xc000, 0x121c, 0xc007, 0xffff, 0xffff, 0xffff, 0xffff,
+  /*                      SW Section                       |CHKSUM */
+    0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0120, 0xffff, 0x0000,
+};
+
+static void e1000e_core_realize(E1000EState *s)
+{
+    s->core.owner = &s->parent_obj;
+    s->core.owner_nic = s->nic;
+}
+
+static void
+e1000e_init_msi(E1000EState *s)
+{
+    int res;
+
+    res = msi_init(PCI_DEVICE(s),
+                   0xD0,   /* MSI capability offset              */
+                   1,      /* MAC MSI interrupts                 */
+                   true,   /* 64-bit message addresses supported */
+                   false); /* Per vector mask supported          */
+
+    if (res > 0) {
+        s->intr_state |= E1000E_USE_MSI;
+    } else {
+        trace_e1000e_msi_init_fail(res);
+    }
+}
+
+static void
+e1000e_cleanup_msi(E1000EState *s)
+{
+    if (s->intr_state & E1000E_USE_MSI) {
+        msi_uninit(PCI_DEVICE(s));
+    }
+}
+
+static void
+e1000e_unuse_msix_vectors(E1000EState *s, int num_vectors)
+{
+    int i;
+    for (i = 0; i < num_vectors; i++) {
+        msix_vector_unuse(PCI_DEVICE(s), i);
+    }
+}
+
+static bool
+e1000e_use_msix_vectors(E1000EState *s, int num_vectors)
+{
+    int i;
+    for (i = 0; i < num_vectors; i++) {
+        int res = msix_vector_use(PCI_DEVICE(s), i);
+        if (res < 0) {
+            trace_e1000e_msix_use_vector_fail(i, res);
+            e1000e_unuse_msix_vectors(s, i);
+            return false;
+        }
+    }
+    return true;
+}
+
+static void
+e1000e_init_msix(E1000EState *s)
+{
+    PCIDevice *d = PCI_DEVICE(s);
+    int res = msix_init(PCI_DEVICE(s), E1000E_MSIX_VEC_NUM,
+                        &s->msix,
+                        E1000E_MSIX_IDX, E1000E_MSIX_TABLE,
+                        &s->msix,
+                        E1000E_MSIX_IDX, E1000E_MSIX_PBA,
+                        0xA0);
+
+    if (res < 0) {
+        trace_e1000e_msix_init_fail(res);
+    } else {
+        if (!e1000e_use_msix_vectors(s, E1000E_MSIX_VEC_NUM)) {
+            msix_uninit(d, &s->msix, &s->msix);
+        } else {
+            s->intr_state |= E1000E_USE_MSIX;
+        }
+    }
+}
+
+static void
+e1000e_cleanup_msix(E1000EState *s)
+{
+    if (s->intr_state & E1000E_USE_MSIX) {
+        e1000e_unuse_msix_vectors(s, E1000E_MSIX_VEC_NUM);
+        msix_uninit(PCI_DEVICE(s), &s->msix, &s->msix);
+    }
+}
+
+static void
+e1000e_init_net_peer(E1000EState *s, PCIDevice *pci_dev, uint8_t *macaddr)
+{
+    DeviceState *dev = DEVICE(pci_dev);
+    NetClientState *nc;
+    int i;
+
+    s->nic = qemu_new_nic(&net_e1000e_info, &s->conf,
+        object_get_typename(OBJECT(s)), dev->id, s);
+
+    s->core.max_queue_num = s->conf.peers.queues - 1;
+
+    trace_e1000e_mac_set_permanent(MAC_ARG(macaddr));
+    memcpy(s->core.permanent_mac, macaddr, sizeof(s->core.permanent_mac));
+
+    qemu_format_nic_info_str(qemu_get_queue(s->nic), macaddr);
+
+    /* Setup virtio headers */
+    if (s->disable_vnet) {
+        s->core.has_vnet = false;
+        trace_e1000e_cfg_support_virtio(false);
+        return;
+    } else {
+        s->core.has_vnet = true;
+    }
+
+    for (i = 0; i < s->conf.peers.queues; i++) {
+        nc = qemu_get_subqueue(s->nic, i);
+        if (!nc->peer || !qemu_has_vnet_hdr(nc->peer)) {
+            s->core.has_vnet = false;
+            trace_e1000e_cfg_support_virtio(false);
+            return;
+        }
+    }
+
+    trace_e1000e_cfg_support_virtio(true);
+
+    for (i = 0; i < s->conf.peers.queues; i++) {
+        nc = qemu_get_subqueue(s->nic, i);
+        qemu_set_vnet_hdr_len(nc->peer, sizeof(struct virtio_net_hdr));
+        qemu_using_vnet_hdr(nc->peer, true);
+    }
+}
+
+static inline uint64_t
+e1000e_gen_dsn(uint8_t *mac)
+{
+    return (uint64_t)(mac[5])        |
+           (uint64_t)(mac[4])  << 8  |
+           (uint64_t)(mac[3])  << 16 |
+           (uint64_t)(0x00FF)  << 24 |
+           (uint64_t)(0x00FF)  << 32 |
+           (uint64_t)(mac[2])  << 40 |
+           (uint64_t)(mac[1])  << 48 |
+           (uint64_t)(mac[0])  << 56;
+}
+
+static int
+e1000e_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc)
+{
+    int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF);
+
+    if (ret >= 0) {
+        pci_set_word(pdev->config + offset + PCI_PM_PMC,
+                     PCI_PM_CAP_VER_1_1 |
+                     pmc);
+
+        pci_set_word(pdev->wmask + offset + PCI_PM_CTRL,
+                     PCI_PM_CTRL_STATE_MASK |
+                     PCI_PM_CTRL_PME_ENABLE |
+                     PCI_PM_CTRL_DATA_SEL_MASK);
+
+        pci_set_word(pdev->w1cmask + offset + PCI_PM_CTRL,
+                     PCI_PM_CTRL_PME_STATUS);
+    }
+
+    return ret;
+}
+
+static void e1000e_write_config(PCIDevice *pci_dev, uint32_t address,
+                                uint32_t val, int len)
+{
+    E1000EState *s = E1000E(pci_dev);
+
+    pci_default_write_config(pci_dev, address, val, len);
+
+    if (range_covers_byte(address, len, PCI_COMMAND) &&
+        (pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
+        qemu_flush_queued_packets(qemu_get_queue(s->nic));
+    }
+}
+
+static void e1000e_pci_realize(PCIDevice *pci_dev, Error **errp)
+{
+    static const uint16_t e1000e_pmrb_offset = 0x0C8;
+    static const uint16_t e1000e_pcie_offset = 0x0E0;
+    static const uint16_t e1000e_aer_offset =  0x100;
+    static const uint16_t e1000e_dsn_offset =  0x140;
+    E1000EState *s = E1000E(pci_dev);
+    uint8_t *macaddr;
+
+    trace_e1000e_cb_pci_realize();
+
+    pci_dev->config_write = e1000e_write_config;
+
+    pci_dev->config[PCI_CACHE_LINE_SIZE] = 0x10;
+    pci_dev->config[PCI_INTERRUPT_PIN] = 1;
+
+    pci_set_word(pci_dev->config + PCI_SUBSYSTEM_VENDOR_ID, s->subsys_ven);
+    pci_set_word(pci_dev->config + PCI_SUBSYSTEM_ID, s->subsys);
+
+    s->subsys_ven_used = s->subsys_ven;
+    s->subsys_used = s->subsys;
+
+    /* Define IO/MMIO regions */
+    memory_region_init_io(&s->mmio, OBJECT(s), &mmio_ops, s,
+                          "e1000e-mmio", E1000E_MMIO_SIZE);
+    pci_register_bar(pci_dev, E1000E_MMIO_IDX,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY, &s->mmio);
+
+    /*
+     * We provide a dummy implementation for the flash BAR
+     * for drivers that may theoretically probe for its presence.
+     */
+    memory_region_init(&s->flash, OBJECT(s),
+                       "e1000e-flash", E1000E_FLASH_SIZE);
+    pci_register_bar(pci_dev, E1000E_FLASH_IDX,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY, &s->flash);
+
+    memory_region_init_io(&s->io, OBJECT(s), &io_ops, s,
+                          "e1000e-io", E1000E_IO_SIZE);
+    pci_register_bar(pci_dev, E1000E_IO_IDX,
+                     PCI_BASE_ADDRESS_SPACE_IO, &s->io);
+
+    memory_region_init(&s->msix, OBJECT(s), "e1000e-msix",
+                       E1000E_MSIX_SIZE);
+    pci_register_bar(pci_dev, E1000E_MSIX_IDX,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY, &s->msix);
+
+    /* Create networking backend */
+    qemu_macaddr_default_if_unset(&s->conf.macaddr);
+    macaddr = s->conf.macaddr.a;
+
+    e1000e_init_msix(s);
+
+    if (pcie_endpoint_cap_v1_init(pci_dev, e1000e_pcie_offset) < 0) {
+        hw_error("Failed to initialize PCIe capability");
+    }
+
+    e1000e_init_msi(s);
+
+    if (e1000e_add_pm_capability(pci_dev, e1000e_pmrb_offset,
+                                  PCI_PM_CAP_DSI) < 0) {
+        hw_error("Failed to initialize PM capability");
+    }
+
+    if (pcie_aer_init(pci_dev, e1000e_aer_offset, PCI_ERR_SIZEOF) < 0) {
+        hw_error("Failed to initialize AER capability");
+    }
+
+    pcie_dev_ser_num_init(pci_dev, e1000e_dsn_offset,
+                          e1000e_gen_dsn(macaddr));
+
+    e1000e_init_net_peer(s, pci_dev, macaddr);
+
+    /* Initialize core */
+    e1000e_core_realize(s);
+
+    e1000e_core_pci_realize(&s->core,
+                            e1000e_eeprom_template,
+                            sizeof(e1000e_eeprom_template),
+                            macaddr);
+}
+
+static void e1000e_pci_uninit(PCIDevice *pci_dev)
+{
+    E1000EState *s = E1000E(pci_dev);
+
+    trace_e1000e_cb_pci_uninit();
+
+    e1000e_core_pci_uninit(&s->core);
+
+    pcie_aer_exit(pci_dev);
+    pcie_cap_exit(pci_dev);
+
+    qemu_del_nic(s->nic);
+
+    e1000e_cleanup_msix(s);
+    e1000e_cleanup_msi(s);
+}
+
+static void e1000e_qdev_reset(DeviceState *dev)
+{
+    E1000EState *s = E1000E(dev);
+
+    trace_e1000e_cb_qdev_reset();
+
+    e1000e_core_reset(&s->core);
+}
+
+static void e1000e_pre_save(void *opaque)
+{
+    E1000EState *s = opaque;
+
+    trace_e1000e_cb_pre_save();
+
+    e1000e_core_pre_save(&s->core);
+}
+
+static int e1000e_post_load(void *opaque, int version_id)
+{
+    E1000EState *s = opaque;
+
+    trace_e1000e_cb_post_load();
+
+    if ((s->subsys != s->subsys_used) ||
+        (s->subsys_ven != s->subsys_ven_used)) {
+        fprintf(stderr,
+            "ERROR: Cannot migrate while device properties "
+            "(subsys/subsys_ven) differ");
+        return -1;
+    }
+
+    return e1000e_core_post_load(&s->core);
+}
+
+static const VMStateDescription e1000e_vmstate_tx = {
+    .name = "e1000e-tx",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8(props.sum_needed, struct e1000e_tx),
+        VMSTATE_UINT8(props.ipcss, struct e1000e_tx),
+        VMSTATE_UINT8(props.ipcso, struct e1000e_tx),
+        VMSTATE_UINT16(props.ipcse, struct e1000e_tx),
+        VMSTATE_UINT8(props.tucss, struct e1000e_tx),
+        VMSTATE_UINT8(props.tucso, struct e1000e_tx),
+        VMSTATE_UINT16(props.tucse, struct e1000e_tx),
+        VMSTATE_UINT8(props.hdr_len, struct e1000e_tx),
+        VMSTATE_UINT16(props.mss, struct e1000e_tx),
+        VMSTATE_UINT32(props.paylen, struct e1000e_tx),
+        VMSTATE_INT8(props.ip, struct e1000e_tx),
+        VMSTATE_INT8(props.tcp, struct e1000e_tx),
+        VMSTATE_BOOL(props.tse, struct e1000e_tx),
+        VMSTATE_BOOL(props.cptse, struct e1000e_tx),
+        VMSTATE_BOOL(skip_cp, struct e1000e_tx),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static const VMStateDescription e1000e_vmstate_intr_timer = {
+    .name = "e1000e-intr-timer",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_TIMER_PTR(timer, E1000IntrDelayTimer),
+        VMSTATE_BOOL(running, E1000IntrDelayTimer),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+#define VMSTATE_E1000E_INTR_DELAY_TIMER(_f, _s)                     \
+    VMSTATE_STRUCT(_f, _s, 0,                                       \
+                   e1000e_vmstate_intr_timer, E1000IntrDelayTimer)
+
+#define VMSTATE_E1000E_INTR_DELAY_TIMER_ARRAY(_f, _s, _num)         \
+    VMSTATE_STRUCT_ARRAY(_f, _s, _num, 0,                           \
+                         e1000e_vmstate_intr_timer, E1000IntrDelayTimer)
+
+static const VMStateDescription e1000e_vmstate = {
+    .name = "e1000e",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .pre_save = e1000e_pre_save,
+    .post_load = e1000e_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_PCIE_DEVICE(parent_obj, E1000EState),
+        VMSTATE_MSIX(parent_obj, E1000EState),
+
+        VMSTATE_UINT32(ioaddr, E1000EState),
+        VMSTATE_UINT32(intr_state, E1000EState),
+        VMSTATE_UINT32(core.rxbuf_min_shift, E1000EState),
+        VMSTATE_UINT8(core.rx_desc_len, E1000EState),
+        VMSTATE_UINT32_ARRAY(core.rxbuf_sizes, E1000EState,
+                             E1000_PSRCTL_BUFFS_PER_DESC),
+        VMSTATE_UINT32(core.rx_desc_buf_size, E1000EState),
+        VMSTATE_UINT16_ARRAY(core.eeprom, E1000EState, E1000E_EEPROM_SIZE),
+        VMSTATE_UINT16_2DARRAY(core.phy, E1000EState,
+                               E1000E_PHY_PAGES, E1000E_PHY_PAGE_SIZE),
+        VMSTATE_UINT32_ARRAY(core.mac, E1000EState, E1000E_MAC_SIZE),
+        VMSTATE_UINT8_ARRAY(core.permanent_mac, E1000EState, ETH_ALEN),
+
+        VMSTATE_UINT32(core.delayed_causes, E1000EState),
+
+        VMSTATE_UINT16(subsys, E1000EState),
+        VMSTATE_UINT16(subsys_ven, E1000EState),
+
+        VMSTATE_E1000E_INTR_DELAY_TIMER(core.rdtr, E1000EState),
+        VMSTATE_E1000E_INTR_DELAY_TIMER(core.radv, E1000EState),
+        VMSTATE_E1000E_INTR_DELAY_TIMER(core.raid, E1000EState),
+        VMSTATE_E1000E_INTR_DELAY_TIMER(core.tadv, E1000EState),
+        VMSTATE_E1000E_INTR_DELAY_TIMER(core.tidv, E1000EState),
+
+        VMSTATE_E1000E_INTR_DELAY_TIMER(core.itr, E1000EState),
+        VMSTATE_BOOL(core.itr_intr_pending, E1000EState),
+
+        VMSTATE_E1000E_INTR_DELAY_TIMER_ARRAY(core.eitr, E1000EState,
+                                              E1000E_MSIX_VEC_NUM),
+        VMSTATE_BOOL_ARRAY(core.eitr_intr_pending, E1000EState,
+                           E1000E_MSIX_VEC_NUM),
+
+        VMSTATE_UINT32(core.itr_guest_value, E1000EState),
+        VMSTATE_UINT32_ARRAY(core.eitr_guest_value, E1000EState,
+                             E1000E_MSIX_VEC_NUM),
+
+        VMSTATE_UINT16(core.vet, E1000EState),
+
+        VMSTATE_STRUCT_ARRAY(core.tx, E1000EState, E1000E_NUM_QUEUES, 0,
+                             e1000e_vmstate_tx, struct e1000e_tx),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static PropertyInfo e1000e_prop_disable_vnet,
+                    e1000e_prop_subsys_ven,
+                    e1000e_prop_subsys;
+
+static Property e1000e_properties[] = {
+    DEFINE_NIC_PROPERTIES(E1000EState, conf),
+    DEFINE_PROP_DEFAULT("disable_vnet_hdr", E1000EState, disable_vnet, false,
+                        e1000e_prop_disable_vnet, bool),
+    DEFINE_PROP_DEFAULT("subsys_ven", E1000EState, subsys_ven,
+                        PCI_VENDOR_ID_INTEL,
+                        e1000e_prop_subsys_ven, uint16_t),
+    DEFINE_PROP_DEFAULT("subsys", E1000EState, subsys, 0,
+                        e1000e_prop_subsys, uint16_t),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void e1000e_class_init(ObjectClass *class, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(class);
+    PCIDeviceClass *c = PCI_DEVICE_CLASS(class);
+
+    c->realize = e1000e_pci_realize;
+    c->exit = e1000e_pci_uninit;
+    c->vendor_id = PCI_VENDOR_ID_INTEL;
+    c->device_id = E1000_DEV_ID_82574L;
+    c->revision = 0;
+    c->class_id = PCI_CLASS_NETWORK_ETHERNET;
+    c->is_express = 1;
+
+    dc->desc = "Intel 82574L GbE Controller";
+    dc->reset = e1000e_qdev_reset;
+    dc->vmsd = &e1000e_vmstate;
+    dc->props = e1000e_properties;
+
+    e1000e_prop_disable_vnet = qdev_prop_uint8;
+    e1000e_prop_disable_vnet.description = "Do not use virtio headers, "
+                                           "perform SW offloads emulation "
+                                           "instead";
+
+    e1000e_prop_subsys_ven = qdev_prop_uint16;
+    e1000e_prop_subsys_ven.description = "PCI device Subsystem Vendor ID";
+
+    e1000e_prop_subsys = qdev_prop_uint16;
+    e1000e_prop_subsys.description = "PCI device Subsystem ID";
+
+    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+}
+
+static void e1000e_instance_init(Object *obj)
+{
+    E1000EState *s = E1000E(obj);
+    device_add_bootindex_property(obj, &s->conf.bootindex,
+                                  "bootindex", "/ethernet-phy@0",
+                                  DEVICE(obj), NULL);
+}
+
+static const TypeInfo e1000e_info = {
+    .name = TYPE_E1000E,
+    .parent = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(E1000EState),
+    .class_init = e1000e_class_init,
+    .instance_init = e1000e_instance_init,
+};
+
+static void e1000e_register_types(void)
+{
+    type_register_static(&e1000e_info);
+}
+
+type_init(e1000e_register_types)
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
--- a/hw/net/e1000e_core.h
+++ b/hw/net/e1000e_core.h
@@ -0,0 +1,146 @@
+/*
+* Core code for QEMU e1000e emulation
+*
+* Software developer's manuals:
+* http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf
+*
+* Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
+* Developed by Daynix Computing LTD (http://www.daynix.com)
+*
+* Authors:
+* Dmitry Fleytman <dmitry@daynix.com>
+* Leonid Bloch <leonid@daynix.com>
+* Yan Vugenfirer <yan@daynix.com>
+*
+* Based on work done by:
+* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+* Copyright (c) 2008 Qumranet
+* Based on work done by:
+* Copyright (c) 2007 Dan Aloni
+* Copyright (c) 2004 Antony T Curtis
+*
+* This library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2 of the License, or (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define E1000E_PHY_PAGE_SIZE    (0x20)
+#define E1000E_PHY_PAGES        (0x07)
+#define E1000E_MAC_SIZE         (0x8000)
+#define E1000E_EEPROM_SIZE      (64)
+#define E1000E_MSIX_VEC_NUM     (5)
+#define E1000E_NUM_QUEUES       (2)
+
+typedef struct E1000Core E1000ECore;
+
+enum { PHY_R = BIT(0),
+       PHY_W = BIT(1),
+       PHY_RW = PHY_R | PHY_W,
+       PHY_ANYPAGE = BIT(2) };
+
+typedef struct E1000IntrDelayTimer_st {
+    QEMUTimer *timer;
+    bool running;
+    uint32_t delay_reg;
+    uint32_t delay_resolution_ns;
+    E1000ECore *core;
+} E1000IntrDelayTimer;
+
+struct E1000Core {
+    uint32_t mac[E1000E_MAC_SIZE];
+    uint16_t phy[E1000E_PHY_PAGES][E1000E_PHY_PAGE_SIZE];
+    uint16_t eeprom[E1000E_EEPROM_SIZE];
+
+    uint32_t rxbuf_sizes[E1000_PSRCTL_BUFFS_PER_DESC];
+    uint32_t rx_desc_buf_size;
+    uint32_t rxbuf_min_shift;
+    uint8_t rx_desc_len;
+
+    QEMUTimer *autoneg_timer;
+
+    struct e1000e_tx {
+        e1000x_txd_props props;
+
+        bool skip_cp;
+        struct NetTxPkt *tx_pkt;
+    } tx[E1000E_NUM_QUEUES];
+
+    struct NetRxPkt *rx_pkt;
+
+    bool has_vnet;
+    int max_queue_num;
+
+    /* Interrupt moderation management */
+    uint32_t delayed_causes;
+
+    E1000IntrDelayTimer radv;
+    E1000IntrDelayTimer rdtr;
+    E1000IntrDelayTimer raid;
+
+    E1000IntrDelayTimer tadv;
+    E1000IntrDelayTimer tidv;
+
+    E1000IntrDelayTimer itr;
+    bool itr_intr_pending;
+
+    E1000IntrDelayTimer eitr[E1000E_MSIX_VEC_NUM];
+    bool eitr_intr_pending[E1000E_MSIX_VEC_NUM];
+
+    VMChangeStateEntry *vmstate;
+
+    uint32_t itr_guest_value;
+    uint32_t eitr_guest_value[E1000E_MSIX_VEC_NUM];
+
+    uint16_t vet;
+
+    uint8_t permanent_mac[ETH_ALEN];
+
+    NICState *owner_nic;
+    PCIDevice *owner;
+    void (*owner_start_recv)(PCIDevice *d);
+};
+
+void
+e1000e_core_write(E1000ECore *core, hwaddr addr, uint64_t val, unsigned size);
+
+uint64_t
+e1000e_core_read(E1000ECore *core, hwaddr addr, unsigned size);
+
+void
+e1000e_core_pci_realize(E1000ECore      *regs,
+                       const uint16_t *eeprom_templ,
+                       uint32_t        eeprom_size,
+                       const uint8_t  *macaddr);
+
+void
+e1000e_core_reset(E1000ECore *core);
+
+void
+e1000e_core_pre_save(E1000ECore *core);
+
+int
+e1000e_core_post_load(E1000ECore *core);
+
+void
+e1000e_core_set_link_status(E1000ECore *core);
+
+void
+e1000e_core_pci_uninit(E1000ECore *core);
+
+int
+e1000e_can_receive(E1000ECore *core);
+
+ssize_t
+e1000e_receive(E1000ECore *core, const uint8_t *buf, size_t size);
+
+ssize_t
+e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt);
--- a/hw/net/e1000x_common.c
+++ b/hw/net/e1000x_common.c
@@ -0,0 +1,267 @@
+/*
+* QEMU e1000(e) emulation - shared code
+*
+* Copyright (c) 2008 Qumranet
+*
+* Based on work done by:
+* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+* Copyright (c) 2007 Dan Aloni
+* Copyright (c) 2004 Antony T Curtis
+*
+* This library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2 of the License, or (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "qemu/osdep.h"
+#include "hw/hw.h"
+#include "hw/pci/pci.h"
+#include "net/net.h"
+
+#include "e1000x_common.h"
+
+#include "trace.h"
+
+bool e1000x_rx_ready(PCIDevice *d, uint32_t *mac)
+{
+    bool link_up = mac[STATUS] & E1000_STATUS_LU;
+    bool rx_enabled = mac[RCTL] & E1000_RCTL_EN;
+    bool pci_master = d->config[PCI_COMMAND] & PCI_COMMAND_MASTER;
+
+    if (!link_up || !rx_enabled || !pci_master) {
+        trace_e1000x_rx_can_recv_disabled(link_up, rx_enabled, pci_master);
+        return false;
+    }
+
+    return true;
+}
+
+bool e1000x_is_vlan_packet(const uint8_t *buf, uint16_t vet)
+{
+    uint16_t eth_proto = be16_to_cpup((uint16_t *)(buf + 12));
+    bool res = (eth_proto == vet);
+
+    trace_e1000x_vlan_is_vlan_pkt(res, eth_proto, vet);
+
+    return res;
+}
+
+bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf)
+{
+    static const int mta_shift[] = { 4, 3, 2, 0 };
+    uint32_t f, ra[2], *rp, rctl = mac[RCTL];
+
+    for (rp = mac + RA; rp < mac + RA + 32; rp += 2) {
+        if (!(rp[1] & E1000_RAH_AV)) {
+            continue;
+        }
+        ra[0] = cpu_to_le32(rp[0]);
+        ra[1] = cpu_to_le32(rp[1]);
+        if (!memcmp(buf, (uint8_t *)ra, 6)) {
+            trace_e1000x_rx_flt_ucast_match((int)(rp - mac - RA) / 2,
+                                            MAC_ARG(buf));
+            return true;
+        }
+    }
+    trace_e1000x_rx_flt_ucast_mismatch(MAC_ARG(buf));
+
+    f = mta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3];
+    f = (((buf[5] << 8) | buf[4]) >> f) & 0xfff;
+    if (mac[MTA + (f >> 5)] & (1 << (f & 0x1f))) {
+        e1000x_inc_reg_if_not_full(mac, MPRC);
+        return true;
+    }
+
+    trace_e1000x_rx_flt_inexact_mismatch(MAC_ARG(buf),
+                                         (rctl >> E1000_RCTL_MO_SHIFT) & 3,
+                                         f >> 5,
+                                         mac[MTA + (f >> 5)]);
+
+    return false;
+}
+
+bool e1000x_hw_rx_enabled(uint32_t *mac)
+{
+    if (!(mac[STATUS] & E1000_STATUS_LU)) {
+        trace_e1000x_rx_link_down(mac[STATUS]);
+        return false;
+    }
+
+    if (!(mac[RCTL] & E1000_RCTL_EN)) {
+        trace_e1000x_rx_disabled(mac[RCTL]);
+        return false;
+    }
+
+    return true;
+}
+
+bool e1000x_is_oversized(uint32_t *mac, size_t size)
+{
+    /* this is the size past which hardware will
+       drop packets when setting LPE=0 */
+    static const int maximum_ethernet_vlan_size = 1522;
+    /* this is the size past which hardware will
+       drop packets when setting LPE=1 */
+    static const int maximum_ethernet_lpe_size = 16384;
+
+    if ((size > maximum_ethernet_lpe_size ||
+        (size > maximum_ethernet_vlan_size
+            && !(mac[RCTL] & E1000_RCTL_LPE)))
+        && !(mac[RCTL] & E1000_RCTL_SBP)) {
+        e1000x_inc_reg_if_not_full(mac, ROC);
+        trace_e1000x_rx_oversized(size);
+        return true;
+    }
+
+    return false;
+}
+
+void e1000x_restart_autoneg(uint32_t *mac, uint16_t *phy, QEMUTimer *timer)
+{
+    e1000x_update_regs_on_link_down(mac, phy);
+    trace_e1000x_link_negotiation_start();
+    timer_mod(timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500);
+}
+
+void e1000x_reset_mac_addr(NICState *nic, uint32_t *mac_regs,
+                           uint8_t *mac_addr)
+{
+    int i;
+
+    mac_regs[RA] = 0;
+    mac_regs[RA + 1] = E1000_RAH_AV;
+    for (i = 0; i < 4; i++) {
+        mac_regs[RA] |= mac_addr[i] << (8 * i);
+        mac_regs[RA + 1] |=
+            (i < 2) ? mac_addr[i + 4] << (8 * i) : 0;
+    }
+
+    qemu_format_nic_info_str(qemu_get_queue(nic), mac_addr);
+    trace_e1000x_mac_indicate(MAC_ARG(mac_addr));
+}
+
+void e1000x_update_regs_on_autoneg_done(uint32_t *mac, uint16_t *phy)
+{
+    e1000x_update_regs_on_link_up(mac, phy);
+    phy[PHY_LP_ABILITY] |= MII_LPAR_LPACK;
+    phy[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE;
+    trace_e1000x_link_negotiation_done();
+}
+
+void
+e1000x_core_prepare_eeprom(uint16_t       *eeprom,
+                           const uint16_t *templ,
+                           uint32_t        templ_size,
+                           uint16_t        dev_id,
+                           const uint8_t  *macaddr)
+{
+    uint16_t checksum = 0;
+    int i;
+
+    memmove(eeprom, templ, templ_size);
+
+    for (i = 0; i < 3; i++) {
+        eeprom[i] = (macaddr[2 * i + 1] << 8) | macaddr[2 * i];
+    }
+
+    eeprom[11] = eeprom[13] = dev_id;
+
+    for (i = 0; i < EEPROM_CHECKSUM_REG; i++) {
+        checksum += eeprom[i];
+    }
+
+    checksum = (uint16_t) EEPROM_SUM - checksum;
+
+    eeprom[EEPROM_CHECKSUM_REG] = checksum;
+}
+
+uint32_t
+e1000x_rxbufsize(uint32_t rctl)
+{
+    rctl &= E1000_RCTL_BSEX | E1000_RCTL_SZ_16384 | E1000_RCTL_SZ_8192 |
+        E1000_RCTL_SZ_4096 | E1000_RCTL_SZ_2048 | E1000_RCTL_SZ_1024 |
+        E1000_RCTL_SZ_512 | E1000_RCTL_SZ_256;
+    switch (rctl) {
+    case E1000_RCTL_BSEX | E1000_RCTL_SZ_16384:
+        return 16384;
+    case E1000_RCTL_BSEX | E1000_RCTL_SZ_8192:
+        return 8192;
+    case E1000_RCTL_BSEX | E1000_RCTL_SZ_4096:
+        return 4096;
+    case E1000_RCTL_SZ_1024:
+        return 1024;
+    case E1000_RCTL_SZ_512:
+        return 512;
+    case E1000_RCTL_SZ_256:
+        return 256;
+    }
+    return 2048;
+}
+
+void
+e1000x_update_rx_total_stats(uint32_t *mac,
+                             size_t data_size,
+                             size_t data_fcs_size)
+{
+    static const int PRCregs[6] = { PRC64, PRC127, PRC255, PRC511,
+                                    PRC1023, PRC1522 };
+
+    e1000x_increase_size_stats(mac, PRCregs, data_fcs_size);
+    e1000x_inc_reg_if_not_full(mac, TPR);
+    mac[GPRC] = mac[TPR];
+    /* TOR - Total Octets Received:
+    * This register includes bytes received in a packet from the <Destination
+    * Address> field through the <CRC> field, inclusively.
+    * Always include FCS length (4) in size.
+    */
+    e1000x_grow_8reg_if_not_full(mac, TORL, data_size + 4);
+    mac[GORCL] = mac[TORL];
+    mac[GORCH] = mac[TORH];
+}
+
+void
+e1000x_increase_size_stats(uint32_t *mac, const int *size_regs, int size)
+{
+    if (size > 1023) {
+        e1000x_inc_reg_if_not_full(mac, size_regs[5]);
+    } else if (size > 511) {
+        e1000x_inc_reg_if_not_full(mac, size_regs[4]);
+    } else if (size > 255) {
+        e1000x_inc_reg_if_not_full(mac, size_regs[3]);
+    } else if (size > 127) {
+        e1000x_inc_reg_if_not_full(mac, size_regs[2]);
+    } else if (size > 64) {
+        e1000x_inc_reg_if_not_full(mac, size_regs[1]);
+    } else if (size == 64) {
+        e1000x_inc_reg_if_not_full(mac, size_regs[0]);
+    }
+}
+
+void
+e1000x_read_tx_ctx_descr(struct e1000_context_desc *d,
+                         e1000x_txd_props *props)
+{
+    uint32_t op = le32_to_cpu(d->cmd_and_length);
+
+    props->ipcss = d->lower_setup.ip_fields.ipcss;
+    props->ipcso = d->lower_setup.ip_fields.ipcso;
+    props->ipcse = le16_to_cpu(d->lower_setup.ip_fields.ipcse);
+    props->tucss = d->upper_setup.tcp_fields.tucss;
+    props->tucso = d->upper_setup.tcp_fields.tucso;
+    props->tucse = le16_to_cpu(d->upper_setup.tcp_fields.tucse);
+    props->paylen = op & 0xfffff;
+    props->hdr_len = d->tcp_seg_setup.fields.hdr_len;
+    props->mss = le16_to_cpu(d->tcp_seg_setup.fields.mss);
+    props->ip = (op & E1000_TXD_CMD_IP) ? 1 : 0;
+    props->tcp = (op & E1000_TXD_CMD_TCP) ? 1 : 0;
+    props->tse = (op & E1000_TXD_CMD_TSE) ? 1 : 0;
+}
--- a/hw/net/e1000x_common.h
+++ b/hw/net/e1000x_common.h
@@ -0,0 +1,213 @@
+/*
+* QEMU e1000(e) emulation - shared code
+*
+* Copyright (c) 2008 Qumranet
+*
+* Based on work done by:
+* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+* Copyright (c) 2007 Dan Aloni
+* Copyright (c) 2004 Antony T Curtis
+*
+* This library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2 of the License, or (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "e1000_regs.h"
+
+#define defreg(x)   x = (E1000_##x >> 2)
+enum {
+    defreg(CTRL),    defreg(EECD),    defreg(EERD),    defreg(GPRC),
+    defreg(GPTC),    defreg(ICR),     defreg(ICS),     defreg(IMC),
+    defreg(IMS),     defreg(LEDCTL),  defreg(MANC),    defreg(MDIC),
+    defreg(MPC),     defreg(PBA),     defreg(RCTL),    defreg(RDBAH0),
+    defreg(RDBAL0),  defreg(RDH0),    defreg(RDLEN0),  defreg(RDT0),
+    defreg(STATUS),  defreg(SWSM),    defreg(TCTL),    defreg(TDBAH),
+    defreg(TDBAL),   defreg(TDH),     defreg(TDLEN),   defreg(TDT),
+    defreg(TDLEN1),  defreg(TDBAL1),  defreg(TDBAH1),  defreg(TDH1),
+    defreg(TDT1),    defreg(TORH),    defreg(TORL),    defreg(TOTH),
+    defreg(TOTL),    defreg(TPR),     defreg(TPT),     defreg(TXDCTL),
+    defreg(WUFC),    defreg(RA),      defreg(MTA),     defreg(CRCERRS),
+    defreg(VFTA),    defreg(VET),     defreg(RDTR),    defreg(RADV),
+    defreg(TADV),    defreg(ITR),     defreg(SCC),     defreg(ECOL),
+    defreg(MCC),     defreg(LATECOL), defreg(COLC),    defreg(DC),
+    defreg(TNCRS),   defreg(SEC),     defreg(CEXTERR), defreg(RLEC),
+    defreg(XONRXC),  defreg(XONTXC),  defreg(XOFFRXC), defreg(XOFFTXC),
+    defreg(FCRUC),   defreg(AIT),     defreg(TDFH),    defreg(TDFT),
+    defreg(TDFHS),   defreg(TDFTS),   defreg(TDFPC),   defreg(WUC),
+    defreg(WUS),     defreg(POEMB),   defreg(PBS),     defreg(RDFH),
+    defreg(RDFT),    defreg(RDFHS),   defreg(RDFTS),   defreg(RDFPC),
+    defreg(PBM),     defreg(IPAV),    defreg(IP4AT),   defreg(IP6AT),
+    defreg(WUPM),    defreg(FFLT),    defreg(FFMT),    defreg(FFVT),
+    defreg(TARC0),   defreg(TARC1),   defreg(IAM),     defreg(EXTCNF_CTRL),
+    defreg(GCR),     defreg(TIMINCA), defreg(EIAC),    defreg(CTRL_EXT),
+    defreg(IVAR),    defreg(MFUTP01), defreg(MFUTP23), defreg(MANC2H),
+    defreg(MFVAL),   defreg(MDEF),    defreg(FACTPS),  defreg(FTFT),
+    defreg(RUC),     defreg(ROC),     defreg(RFC),     defreg(RJC),
+    defreg(PRC64),   defreg(PRC127),  defreg(PRC255),  defreg(PRC511),
+    defreg(PRC1023), defreg(PRC1522), defreg(PTC64),   defreg(PTC127),
+    defreg(PTC255),  defreg(PTC511),  defreg(PTC1023), defreg(PTC1522),
+    defreg(GORCL),   defreg(GORCH),   defreg(GOTCL),   defreg(GOTCH),
+    defreg(RNBC),    defreg(BPRC),    defreg(MPRC),    defreg(RFCTL),
+    defreg(PSRCTL),  defreg(MPTC),    defreg(BPTC),    defreg(TSCTFC),
+    defreg(IAC),     defreg(MGTPRC),  defreg(MGTPDC),  defreg(MGTPTC),
+    defreg(TSCTC),   defreg(RXCSUM),  defreg(FUNCTAG), defreg(GSCL_1),
+    defreg(GSCL_2),  defreg(GSCL_3),  defreg(GSCL_4),  defreg(GSCN_0),
+    defreg(GSCN_1),  defreg(GSCN_2),  defreg(GSCN_3),  defreg(GCR2),
+    defreg(RAID),    defreg(RSRPD),   defreg(TIDV),    defreg(EITR),
+    defreg(MRQC),    defreg(RETA),    defreg(RSSRK),   defreg(RDBAH1),
+    defreg(RDBAL1),  defreg(RDLEN1),  defreg(RDH1),    defreg(RDT1),
+    defreg(PBACLR),  defreg(FCAL),    defreg(FCAH),    defreg(FCT),
+    defreg(FCRTH),   defreg(FCRTL),   defreg(FCTTV),   defreg(FCRTV),
+    defreg(FLA),     defreg(EEWR),    defreg(FLOP),    defreg(FLOL),
+    defreg(FLSWCTL), defreg(FLSWCNT), defreg(RXDCTL),  defreg(RXDCTL1),
+    defreg(MAVTV0),  defreg(MAVTV1),  defreg(MAVTV2),  defreg(MAVTV3),
+    defreg(TXSTMPL), defreg(TXSTMPH), defreg(SYSTIML), defreg(SYSTIMH),
+    defreg(RXCFGL),  defreg(RXUDP),   defreg(TIMADJL), defreg(TIMADJH),
+    defreg(RXSTMPH), defreg(RXSTMPL), defreg(RXSATRL), defreg(RXSATRH),
+    defreg(FLASHT),  defreg(TIPG),    defreg(RDH),     defreg(RDT),
+    defreg(RDLEN),   defreg(RDBAH),   defreg(RDBAL),
+    defreg(TXDCTL1),
+    defreg(FLSWDATA),
+    defreg(CTRL_DUP),
+    defreg(EXTCNF_SIZE),
+    defreg(EEMNGCTL),
+    defreg(EEMNGDATA),
+    defreg(FLMNGCTL),
+    defreg(FLMNGDATA),
+    defreg(FLMNGCNT),
+    defreg(TSYNCRXCTL),
+    defreg(TSYNCTXCTL),
+
+    /* Aliases */
+    defreg(RDH0_A),  defreg(RDT0_A),  defreg(RDTR_A),  defreg(RDFH_A),
+    defreg(RDFT_A),  defreg(TDH_A),   defreg(TDT_A),   defreg(TIDV_A),
+    defreg(TDFH_A),  defreg(TDFT_A),  defreg(RA_A),    defreg(RDBAL0_A),
+    defreg(TDBAL_A), defreg(TDLEN_A), defreg(VFTA_A),  defreg(RDLEN0_A),
+    defreg(FCRTL_A), defreg(FCRTH_A)
+};
+
+static inline void
+e1000x_inc_reg_if_not_full(uint32_t *mac, int index)
+{
+    if (mac[index] != 0xffffffff) {
+        mac[index]++;
+    }
+}
+
+static inline void
+e1000x_grow_8reg_if_not_full(uint32_t *mac, int index, int size)
+{
+    uint64_t sum = mac[index] | (uint64_t)mac[index + 1] << 32;
+
+    if (sum + size < sum) {
+        sum = ~0ULL;
+    } else {
+        sum += size;
+    }
+    mac[index] = sum;
+    mac[index + 1] = sum >> 32;
+}
+
+static inline int
+e1000x_vlan_enabled(uint32_t *mac)
+{
+    return ((mac[CTRL] & E1000_CTRL_VME) != 0);
+}
+
+static inline int
+e1000x_is_vlan_txd(uint32_t txd_lower)
+{
+    return ((txd_lower & E1000_TXD_CMD_VLE) != 0);
+}
+
+static inline int
+e1000x_vlan_rx_filter_enabled(uint32_t *mac)
+{
+    return ((mac[RCTL] & E1000_RCTL_VFE) != 0);
+}
+
+static inline int
+e1000x_fcs_len(uint32_t *mac)
+{
+    /* FCS aka Ethernet CRC-32. We don't get it from backends and can't
+    * fill it in, just pad descriptor length by 4 bytes unless guest
+    * told us to strip it off the packet. */
+    return (mac[RCTL] & E1000_RCTL_SECRC) ? 0 : 4;
+}
+
+static inline void
+e1000x_update_regs_on_link_down(uint32_t *mac, uint16_t *phy)
+{
+    mac[STATUS] &= ~E1000_STATUS_LU;
+    phy[PHY_STATUS] &= ~MII_SR_LINK_STATUS;
+    phy[PHY_STATUS] &= ~MII_SR_AUTONEG_COMPLETE;
+    phy[PHY_LP_ABILITY] &= ~MII_LPAR_LPACK;
+}
+
+static inline void
+e1000x_update_regs_on_link_up(uint32_t *mac, uint16_t *phy)
+{
+    mac[STATUS] |= E1000_STATUS_LU;
+    phy[PHY_STATUS] |= MII_SR_LINK_STATUS;
+}
+
+void e1000x_update_rx_total_stats(uint32_t *mac,
+                                  size_t data_size,
+                                  size_t data_fcs_size);
+
+void e1000x_core_prepare_eeprom(uint16_t       *eeprom,
+                                const uint16_t *templ,
+                                uint32_t        templ_size,
+                                uint16_t        dev_id,
+                                const uint8_t  *macaddr);
+
+uint32_t e1000x_rxbufsize(uint32_t rctl);
+
+bool e1000x_rx_ready(PCIDevice *d, uint32_t *mac);
+
+bool e1000x_is_vlan_packet(const uint8_t *buf, uint16_t vet);
+
+bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf);
+
+bool e1000x_hw_rx_enabled(uint32_t *mac);
+
+bool e1000x_is_oversized(uint32_t *mac, size_t size);
+
+void e1000x_restart_autoneg(uint32_t *mac, uint16_t *phy, QEMUTimer *timer);
+
+void e1000x_reset_mac_addr(NICState *nic, uint32_t *mac_regs,
+                           uint8_t *mac_addr);
+
+void e1000x_update_regs_on_autoneg_done(uint32_t *mac, uint16_t *phy);
+
+void e1000x_increase_size_stats(uint32_t *mac, const int *size_regs, int size);
+
+typedef struct e1000x_txd_props {
+    unsigned char sum_needed;
+    uint8_t ipcss;
+    uint8_t ipcso;
+    uint16_t ipcse;
+    uint8_t tucss;
+    uint8_t tucso;
+    uint16_t tucse;
+    uint32_t paylen;
+    uint8_t hdr_len;
+    uint16_t mss;
+    int8_t ip;
+    int8_t tcp;
+    bool tse;
+    bool cptse;
+} e1000x_txd_props;
+
+void e1000x_read_tx_ctx_descr(struct e1000_context_desc *d,
+                              e1000x_txd_props *props);
--- a/hw/net/imx_fec.c
+++ b/hw/net/imx_fec.c
--- a/hw/net/mipsnet.c
+++ b/hw/net/mipsnet.c
@@ -83,6 +83,9 @@ static ssize_t mipsnet_receive(NetClientState *nc, const uint8_t *buf, size_t si
    if (!mipsnet_can_receive(nc))
        return 0;

+    if (size >= sizeof(s->rx_buffer)) {
+        return 0;
+    }
    s->busy = 1;

    /* Just accept everything. */
--- a/hw/net/net_rx_pkt.c
+++ b/hw/net/net_rx_pkt.c
@@ -0,0 +1,600 @@
+/*
+ * QEMU RX packets abstractions
+ *
+ * Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
+ *
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Tamir Shomer <tamirs@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "trace.h"
+#include "net_rx_pkt.h"
+#include "net/checksum.h"
+#include "net/tap.h"
+
+struct NetRxPkt {
+    struct virtio_net_hdr virt_hdr;
+    uint8_t ehdr_buf[sizeof(struct eth_header)];
+    struct iovec *vec;
+    uint16_t vec_len_total;
+    uint16_t vec_len;
+    uint32_t tot_len;
+    uint16_t tci;
+    bool vlan_stripped;
+    bool has_virt_hdr;
+    eth_pkt_types_e packet_type;
+
+    /* Analysis results */
+    bool isip4;
+    bool isip6;
+    bool isudp;
+    bool istcp;
+
+    size_t l3hdr_off;
+    size_t l4hdr_off;
+    size_t l5hdr_off;
+
+    eth_ip6_hdr_info ip6hdr_info;
+    eth_ip4_hdr_info ip4hdr_info;
+    eth_l4_hdr_info  l4hdr_info;
+};
+
+void net_rx_pkt_init(struct NetRxPkt **pkt, bool has_virt_hdr)
+{
+    struct NetRxPkt *p = g_malloc0(sizeof *p);
+    p->has_virt_hdr = has_virt_hdr;
+    p->vec = NULL;
+    p->vec_len_total = 0;
+    *pkt = p;
+}
+
+void net_rx_pkt_uninit(struct NetRxPkt *pkt)
+{
+    if (pkt->vec_len_total != 0) {
+        g_free(pkt->vec);
+    }
+
+    g_free(pkt);
+}
+
+struct virtio_net_hdr *net_rx_pkt_get_vhdr(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+    return &pkt->virt_hdr;
+}
+
+static inline void
+net_rx_pkt_iovec_realloc(struct NetRxPkt *pkt,
+                            int new_iov_len)
+{
+    if (pkt->vec_len_total < new_iov_len) {
+        g_free(pkt->vec);
+        pkt->vec = g_malloc(sizeof(*pkt->vec) * new_iov_len);
+        pkt->vec_len_total = new_iov_len;
+    }
+}
+
+static void
+net_rx_pkt_pull_data(struct NetRxPkt *pkt,
+                        const struct iovec *iov, int iovcnt,
+                        size_t ploff)
+{
+    if (pkt->vlan_stripped) {
+        net_rx_pkt_iovec_realloc(pkt, iovcnt + 1);
+
+        pkt->vec[0].iov_base = pkt->ehdr_buf;
+        pkt->vec[0].iov_len = sizeof(pkt->ehdr_buf);
+
+        pkt->tot_len =
+            iov_size(iov, iovcnt) - ploff + sizeof(struct eth_header);
+
+        pkt->vec_len = iov_copy(pkt->vec + 1, pkt->vec_len_total - 1,
+                                iov, iovcnt, ploff, pkt->tot_len);
+    } else {
+        net_rx_pkt_iovec_realloc(pkt, iovcnt);
+
+        pkt->tot_len = iov_size(iov, iovcnt) - ploff;
+        pkt->vec_len = iov_copy(pkt->vec, pkt->vec_len_total,
+                                iov, iovcnt, ploff, pkt->tot_len);
+    }
+
+    eth_get_protocols(pkt->vec, pkt->vec_len, &pkt->isip4, &pkt->isip6,
+                      &pkt->isudp, &pkt->istcp,
+                      &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off,
+                      &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info);
+
+    trace_net_rx_pkt_parsed(pkt->isip4, pkt->isip6, pkt->isudp, pkt->istcp,
+                            pkt->l3hdr_off, pkt->l4hdr_off, pkt->l5hdr_off);
+}
+
+void net_rx_pkt_attach_iovec(struct NetRxPkt *pkt,
+                                const struct iovec *iov, int iovcnt,
+                                size_t iovoff, bool strip_vlan)
+{
+    uint16_t tci = 0;
+    uint16_t ploff = iovoff;
+    assert(pkt);
+    pkt->vlan_stripped = false;
+
+    if (strip_vlan) {
+        pkt->vlan_stripped = eth_strip_vlan(iov, iovcnt, iovoff, pkt->ehdr_buf,
+                                            &ploff, &tci);
+    }
+
+    pkt->tci = tci;
+
+    net_rx_pkt_pull_data(pkt, iov, iovcnt, ploff);
+}
+
+void net_rx_pkt_attach_iovec_ex(struct NetRxPkt *pkt,
+                                const struct iovec *iov, int iovcnt,
+                                size_t iovoff, bool strip_vlan,
+                                uint16_t vet)
+{
+    uint16_t tci = 0;
+    uint16_t ploff = iovoff;
+    assert(pkt);
+    pkt->vlan_stripped = false;
+
+    if (strip_vlan) {
+        pkt->vlan_stripped = eth_strip_vlan_ex(iov, iovcnt, iovoff, vet,
+                                               pkt->ehdr_buf,
+                                               &ploff, &tci);
+    }
+
+    pkt->tci = tci;
+
+    net_rx_pkt_pull_data(pkt, iov, iovcnt, ploff);
+}
+
+void net_rx_pkt_dump(struct NetRxPkt *pkt)
+{
+#ifdef NET_RX_PKT_DEBUG
+    NetRxPkt *pkt = (NetRxPkt *)pkt;
+    assert(pkt);
+
+    printf("RX PKT: tot_len: %d, vlan_stripped: %d, vlan_tag: %d\n",
+              pkt->tot_len, pkt->vlan_stripped, pkt->tci);
+#endif
+}
+
+void net_rx_pkt_set_packet_type(struct NetRxPkt *pkt,
+    eth_pkt_types_e packet_type)
+{
+    assert(pkt);
+
+    pkt->packet_type = packet_type;
+
+}
+
+eth_pkt_types_e net_rx_pkt_get_packet_type(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    return pkt->packet_type;
+}
+
+size_t net_rx_pkt_get_total_len(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    return pkt->tot_len;
+}
+
+void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data,
+                              size_t len)
+{
+    const struct iovec iov = {
+        .iov_base = (void *)data,
+        .iov_len = len
+    };
+
+    assert(pkt);
+
+    eth_get_protocols(&iov, 1, &pkt->isip4, &pkt->isip6,
+                      &pkt->isudp, &pkt->istcp,
+                      &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off,
+                      &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info);
+}
+
+void net_rx_pkt_get_protocols(struct NetRxPkt *pkt,
+                              bool *isip4, bool *isip6,
+                              bool *isudp, bool *istcp)
+{
+    assert(pkt);
+
+    *isip4 = pkt->isip4;
+    *isip6 = pkt->isip6;
+    *isudp = pkt->isudp;
+    *istcp = pkt->istcp;
+}
+
+size_t net_rx_pkt_get_l3_hdr_offset(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+    return pkt->l3hdr_off;
+}
+
+size_t net_rx_pkt_get_l4_hdr_offset(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+    return pkt->l4hdr_off;
+}
+
+size_t net_rx_pkt_get_l5_hdr_offset(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+    return pkt->l5hdr_off;
+}
+
+eth_ip6_hdr_info *net_rx_pkt_get_ip6_info(struct NetRxPkt *pkt)
+{
+    return &pkt->ip6hdr_info;
+}
+
+eth_ip4_hdr_info *net_rx_pkt_get_ip4_info(struct NetRxPkt *pkt)
+{
+    return &pkt->ip4hdr_info;
+}
+
+eth_l4_hdr_info *net_rx_pkt_get_l4_info(struct NetRxPkt *pkt)
+{
+    return &pkt->l4hdr_info;
+}
+
+static inline void
+_net_rx_rss_add_chunk(uint8_t *rss_input, size_t *bytes_written,
+                      void *ptr, size_t size)
+{
+    memcpy(&rss_input[*bytes_written], ptr, size);
+    trace_net_rx_pkt_rss_add_chunk(ptr, size, *bytes_written);
+    *bytes_written += size;
+}
+
+static inline void
+_net_rx_rss_prepare_ip4(uint8_t *rss_input,
+                        struct NetRxPkt *pkt,
+                        size_t *bytes_written)
+{
+    struct ip_header *ip4_hdr = &pkt->ip4hdr_info.ip4_hdr;
+
+    _net_rx_rss_add_chunk(rss_input, bytes_written,
+                          &ip4_hdr->ip_src, sizeof(uint32_t));
+
+    _net_rx_rss_add_chunk(rss_input, bytes_written,
+                          &ip4_hdr->ip_dst, sizeof(uint32_t));
+}
+
+static inline void
+_net_rx_rss_prepare_ip6(uint8_t *rss_input,
+                        struct NetRxPkt *pkt,
+                        bool ipv6ex, size_t *bytes_written)
+{
+    eth_ip6_hdr_info *ip6info = &pkt->ip6hdr_info;
+
+    _net_rx_rss_add_chunk(rss_input, bytes_written,
+           (ipv6ex && ip6info->rss_ex_src_valid) ? &ip6info->rss_ex_src
+                                                 : &ip6info->ip6_hdr.ip6_src,
+           sizeof(struct in6_address));
+
+    _net_rx_rss_add_chunk(rss_input, bytes_written,
+           (ipv6ex && ip6info->rss_ex_dst_valid) ? &ip6info->rss_ex_dst
+                                                 : &ip6info->ip6_hdr.ip6_dst,
+           sizeof(struct in6_address));
+}
+
+static inline void
+_net_rx_rss_prepare_tcp(uint8_t *rss_input,
+                        struct NetRxPkt *pkt,
+                        size_t *bytes_written)
+{
+    struct tcp_header *tcphdr = &pkt->l4hdr_info.hdr.tcp;
+
+    _net_rx_rss_add_chunk(rss_input, bytes_written,
+                          &tcphdr->th_sport, sizeof(uint16_t));
+
+    _net_rx_rss_add_chunk(rss_input, bytes_written,
+                          &tcphdr->th_dport, sizeof(uint16_t));
+}
+
+uint32_t
+net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt,
+                         NetRxPktRssType type,
+                         uint8_t *key)
+{
+    uint8_t rss_input[36];
+    size_t rss_length = 0;
+    uint32_t rss_hash = 0;
+    net_toeplitz_key key_data;
+
+    switch (type) {
+    case NetPktRssIpV4:
+        assert(pkt->isip4);
+        trace_net_rx_pkt_rss_ip4();
+        _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length);
+        break;
+    case NetPktRssIpV4Tcp:
+        assert(pkt->isip4);
+        assert(pkt->istcp);
+        trace_net_rx_pkt_rss_ip4_tcp();
+        _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length);
+        _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length);
+        break;
+    case NetPktRssIpV6Tcp:
+        assert(pkt->isip6);
+        assert(pkt->istcp);
+        trace_net_rx_pkt_rss_ip6_tcp();
+        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
+        _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length);
+        break;
+    case NetPktRssIpV6:
+        assert(pkt->isip6);
+        trace_net_rx_pkt_rss_ip6();
+        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length);
+        break;
+    case NetPktRssIpV6Ex:
+        assert(pkt->isip6);
+        trace_net_rx_pkt_rss_ip6_ex();
+        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
+        break;
+    default:
+        assert(false);
+        break;
+    }
+
+    net_toeplitz_key_init(&key_data, key);
+    net_toeplitz_add(&rss_hash, rss_input, rss_length, &key_data);
+
+    trace_net_rx_pkt_rss_hash(rss_length, rss_hash);
+
+    return rss_hash;
+}
+
+uint16_t net_rx_pkt_get_ip_id(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    if (pkt->isip4) {
+        return be16_to_cpu(pkt->ip4hdr_info.ip4_hdr.ip_id);
+    }
+
+    return 0;
+}
+
+bool net_rx_pkt_is_tcp_ack(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    if (pkt->istcp) {
+        return TCP_HEADER_FLAGS(&pkt->l4hdr_info.hdr.tcp) & TCP_FLAG_ACK;
+    }
+
+    return false;
+}
+
+bool net_rx_pkt_has_tcp_data(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    if (pkt->istcp) {
+        return pkt->l4hdr_info.has_tcp_data;
+    }
+
+    return false;
+}
+
+struct iovec *net_rx_pkt_get_iovec(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    return pkt->vec;
+}
+
+uint16_t net_rx_pkt_get_iovec_len(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    return pkt->vec_len;
+}
+
+void net_rx_pkt_set_vhdr(struct NetRxPkt *pkt,
+                            struct virtio_net_hdr *vhdr)
+{
+    assert(pkt);
+
+    memcpy(&pkt->virt_hdr, vhdr, sizeof pkt->virt_hdr);
+}
+
+void net_rx_pkt_set_vhdr_iovec(struct NetRxPkt *pkt,
+    const struct iovec *iov, int iovcnt)
+{
+    assert(pkt);
+
+    iov_to_buf(iov, iovcnt, 0, &pkt->virt_hdr, sizeof pkt->virt_hdr);
+}
+
+bool net_rx_pkt_is_vlan_stripped(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    return pkt->vlan_stripped;
+}
+
+bool net_rx_pkt_has_virt_hdr(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    return pkt->has_virt_hdr;
+}
+
+uint16_t net_rx_pkt_get_vlan_tag(struct NetRxPkt *pkt)
+{
+    assert(pkt);
+
+    return pkt->tci;
+}
+
+bool net_rx_pkt_validate_l3_csum(struct NetRxPkt *pkt, bool *csum_valid)
+{
+    uint32_t cntr;
+    uint16_t csum;
+    uint32_t csl;
+
+    trace_net_rx_pkt_l3_csum_validate_entry();
+
+    if (!pkt->isip4) {
+        trace_net_rx_pkt_l3_csum_validate_not_ip4();
+        return false;
+    }
+
+    csl = pkt->l4hdr_off - pkt->l3hdr_off;
+
+    cntr = net_checksum_add_iov(pkt->vec, pkt->vec_len,
+                                pkt->l3hdr_off,
+                                csl, 0);
+
+    csum = net_checksum_finish(cntr);
+
+    *csum_valid = (csum == 0);
+
+    trace_net_rx_pkt_l3_csum_validate_csum(pkt->l3hdr_off, csl,
+                                           cntr, csum, *csum_valid);
+
+    return true;
+}
+
+static uint16_t
+_net_rx_pkt_calc_l4_csum(struct NetRxPkt *pkt)
+{
+    uint32_t cntr;
+    uint16_t csum;
+    uint16_t csl;
+    uint32_t cso;
+
+    trace_net_rx_pkt_l4_csum_calc_entry();
+
+    if (pkt->isip4) {
+        if (pkt->isudp) {
+            csl = be16_to_cpu(pkt->l4hdr_info.hdr.udp.uh_ulen);
+            trace_net_rx_pkt_l4_csum_calc_ip4_udp();
+        } else {
+            csl = be16_to_cpu(pkt->ip4hdr_info.ip4_hdr.ip_len) -
+                  IP_HDR_GET_LEN(&pkt->ip4hdr_info.ip4_hdr);
+            trace_net_rx_pkt_l4_csum_calc_ip4_tcp();
+        }
+
+        cntr = eth_calc_ip4_pseudo_hdr_csum(&pkt->ip4hdr_info.ip4_hdr,
+                                            csl, &cso);
+        trace_net_rx_pkt_l4_csum_calc_ph_csum(cntr, csl);
+    } else {
+        if (pkt->isudp) {
+            csl = be16_to_cpu(pkt->l4hdr_info.hdr.udp.uh_ulen);
+            trace_net_rx_pkt_l4_csum_calc_ip6_udp();
+        } else {
+            struct ip6_header *ip6hdr = &pkt->ip6hdr_info.ip6_hdr;
+            size_t full_ip6hdr_len = pkt->l4hdr_off - pkt->l3hdr_off;
+            size_t ip6opts_len = full_ip6hdr_len - sizeof(struct ip6_header);
+
+            csl = be16_to_cpu(ip6hdr->ip6_ctlun.ip6_un1.ip6_un1_plen) -
+                  ip6opts_len;
+            trace_net_rx_pkt_l4_csum_calc_ip6_tcp();
+        }
+
+        cntr = eth_calc_ip6_pseudo_hdr_csum(&pkt->ip6hdr_info.ip6_hdr, csl,
+                                            pkt->ip6hdr_info.l4proto, &cso);
+        trace_net_rx_pkt_l4_csum_calc_ph_csum(cntr, csl);
+    }
+
+    cntr += net_checksum_add_iov(pkt->vec, pkt->vec_len,
+                                 pkt->l4hdr_off, csl, cso);
+
+    csum = net_checksum_finish(cntr);
+
+    trace_net_rx_pkt_l4_csum_calc_csum(pkt->l4hdr_off, csl, cntr, csum);
+
+    return csum;
+}
+
+bool net_rx_pkt_validate_l4_csum(struct NetRxPkt *pkt, bool *csum_valid)
+{
+    uint16_t csum;
+
+    trace_net_rx_pkt_l4_csum_validate_entry();
+
+    if (!pkt->istcp && !pkt->isudp) {
+        trace_net_rx_pkt_l4_csum_validate_not_xxp();
+        return false;
+    }
+
+    if (pkt->isudp && (pkt->l4hdr_info.hdr.udp.uh_sum == 0)) {
+        trace_net_rx_pkt_l4_csum_validate_udp_with_no_checksum();
+        return false;
+    }
+
+    if (pkt->isip4 && pkt->ip4hdr_info.fragment) {
+        trace_net_rx_pkt_l4_csum_validate_ip4_fragment();
+        return false;
+    }
+
+    csum = _net_rx_pkt_calc_l4_csum(pkt);
+
+    *csum_valid = ((csum == 0) || (csum == 0xFFFF));
+
+    trace_net_rx_pkt_l4_csum_validate_csum(*csum_valid);
+
+    return true;
+}
+
+bool net_rx_pkt_fix_l4_csum(struct NetRxPkt *pkt)
+{
+    uint16_t csum = 0;
+    uint32_t l4_cso;
+
+    trace_net_rx_pkt_l4_csum_fix_entry();
+
+    if (pkt->istcp) {
+        l4_cso = offsetof(struct tcp_header, th_sum);
+        trace_net_rx_pkt_l4_csum_fix_tcp(l4_cso);
+    } else if (pkt->isudp) {
+        if (pkt->l4hdr_info.hdr.udp.uh_sum == 0) {
+            trace_net_rx_pkt_l4_csum_fix_udp_with_no_checksum();
+            return false;
+        }
+        l4_cso = offsetof(struct udp_header, uh_sum);
+        trace_net_rx_pkt_l4_csum_fix_udp(l4_cso);
+    } else {
+        trace_net_rx_pkt_l4_csum_fix_not_xxp();
+        return false;
+    }
+
+    if (pkt->isip4 && pkt->ip4hdr_info.fragment) {
+            trace_net_rx_pkt_l4_csum_fix_ip4_fragment();
+            return false;
+    }
+
+    /* Set zero to checksum word */
+    iov_from_buf(pkt->vec, pkt->vec_len,
+                 pkt->l4hdr_off + l4_cso,
+                 &csum, sizeof(csum));
+
+    /* Calculate L4 checksum */
+    csum = cpu_to_be16(_net_rx_pkt_calc_l4_csum(pkt));
+
+    /* Set calculated checksum to checksum word */
+    iov_from_buf(pkt->vec, pkt->vec_len,
+                 pkt->l4hdr_off + l4_cso,
+                 &csum, sizeof(csum));
+
+    trace_net_rx_pkt_l4_csum_fix_csum(pkt->l4hdr_off + l4_cso, csum);
+
+    return true;
+}
--- a/hw/net/net_rx_pkt.h
+++ b/hw/net/net_rx_pkt.h
@@ -0,0 +1,363 @@
+/*
+ * QEMU RX packets abstraction
+ *
+ * Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
+ *
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Tamir Shomer <tamirs@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef NET_RX_PKT_H
+#define NET_RX_PKT_H
+
+#include "net/eth.h"
+
+/* defines to enable packet dump functions */
+/*#define NET_RX_PKT_DEBUG*/
+
+struct NetRxPkt;
+
+/**
+ * Clean all rx packet resources
+ *
+ * @pkt:            packet
+ *
+ */
+void net_rx_pkt_uninit(struct NetRxPkt *pkt);
+
+/**
+ * Init function for rx packet functionality
+ *
+ * @pkt:            packet pointer
+ * @has_virt_hdr:   device uses virtio header
+ *
+ */
+void net_rx_pkt_init(struct NetRxPkt **pkt, bool has_virt_hdr);
+
+/**
+ * returns total length of data attached to rx context
+ *
+ * @pkt:            packet
+ *
+ * Return:  nothing
+ *
+ */
+size_t net_rx_pkt_get_total_len(struct NetRxPkt *pkt);
+
+/**
+ * parse and set packet analysis results
+ *
+ * @pkt:            packet
+ * @data:           pointer to the data buffer to be parsed
+ * @len:            data length
+ *
+ */
+void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data,
+                              size_t len);
+
+/**
+ * fetches packet analysis results
+ *
+ * @pkt:            packet
+ * @isip4:          whether the packet given is IPv4
+ * @isip6:          whether the packet given is IPv6
+ * @isudp:          whether the packet given is UDP
+ * @istcp:          whether the packet given is TCP
+ *
+ */
+void net_rx_pkt_get_protocols(struct NetRxPkt *pkt,
+                                 bool *isip4, bool *isip6,
+                                 bool *isudp, bool *istcp);
+
+/**
+* fetches L3 header offset
+*
+* @pkt:            packet
+*
+*/
+size_t net_rx_pkt_get_l3_hdr_offset(struct NetRxPkt *pkt);
+
+/**
+* fetches L4 header offset
+*
+* @pkt:            packet
+*
+*/
+size_t net_rx_pkt_get_l4_hdr_offset(struct NetRxPkt *pkt);
+
+/**
+* fetches L5 header offset
+*
+* @pkt:            packet
+*
+*/
+size_t net_rx_pkt_get_l5_hdr_offset(struct NetRxPkt *pkt);
+
+/**
+ * fetches IP6 header analysis results
+ *
+ * Return:  pointer to analysis results structure which is stored in internal
+ *          packet area.
+ *
+ */
+eth_ip6_hdr_info *net_rx_pkt_get_ip6_info(struct NetRxPkt *pkt);
+
+/**
+ * fetches IP4 header analysis results
+ *
+ * Return:  pointer to analysis results structure which is stored in internal
+ *          packet area.
+ *
+ */
+eth_ip4_hdr_info *net_rx_pkt_get_ip4_info(struct NetRxPkt *pkt);
+
+/**
+ * fetches L4 header analysis results
+ *
+ * Return:  pointer to analysis results structure which is stored in internal
+ *          packet area.
+ *
+ */
+eth_l4_hdr_info *net_rx_pkt_get_l4_info(struct NetRxPkt *pkt);
+
+typedef enum {
+    NetPktRssIpV4,
+    NetPktRssIpV4Tcp,
+    NetPktRssIpV6Tcp,
+    NetPktRssIpV6,
+    NetPktRssIpV6Ex
+} NetRxPktRssType;
+
+/**
+* calculates RSS hash for packet
+*
+* @pkt:            packet
+* @type:           RSS hash type
+*
+* Return:  Toeplitz RSS hash.
+*
+*/
+uint32_t
+net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt,
+                         NetRxPktRssType type,
+                         uint8_t *key);
+
+/**
+* fetches IP identification for the packet
+*
+* @pkt:            packet
+*
+*/
+uint16_t net_rx_pkt_get_ip_id(struct NetRxPkt *pkt);
+
+/**
+* check if given packet is a TCP ACK packet
+*
+* @pkt:            packet
+*
+*/
+bool net_rx_pkt_is_tcp_ack(struct NetRxPkt *pkt);
+
+/**
+* check if given packet contains TCP data
+*
+* @pkt:            packet
+*
+*/
+bool net_rx_pkt_has_tcp_data(struct NetRxPkt *pkt);
+
+/**
+ * returns virtio header stored in rx context
+ *
+ * @pkt:            packet
+ * @ret:            virtio header
+ *
+ */
+struct virtio_net_hdr *net_rx_pkt_get_vhdr(struct NetRxPkt *pkt);
+
+/**
+ * returns packet type
+ *
+ * @pkt:            packet
+ * @ret:            packet type
+ *
+ */
+eth_pkt_types_e net_rx_pkt_get_packet_type(struct NetRxPkt *pkt);
+
+/**
+ * returns vlan tag
+ *
+ * @pkt:            packet
+ * @ret:            VLAN tag
+ *
+ */
+uint16_t net_rx_pkt_get_vlan_tag(struct NetRxPkt *pkt);
+
+/**
+ * tells whether vlan was stripped from the packet
+ *
+ * @pkt:            packet
+ * @ret:            VLAN stripped sign
+ *
+ */
+bool net_rx_pkt_is_vlan_stripped(struct NetRxPkt *pkt);
+
+/**
+ * notifies caller if the packet has virtio header
+ *
+ * @pkt:            packet
+ * @ret:            true if packet has virtio header, false otherwize
+ *
+ */
+bool net_rx_pkt_has_virt_hdr(struct NetRxPkt *pkt);
+
+/**
+* attach scatter-gather data to rx packet
+*
+* @pkt:            packet
+* @iov:            received data scatter-gather list
+* @iovcnt          number of elements in iov
+* @iovoff          data start offset in the iov
+* @strip_vlan:     should the module strip vlan from data
+*
+*/
+void net_rx_pkt_attach_iovec(struct NetRxPkt *pkt,
+                                const struct iovec *iov,
+                                int iovcnt, size_t iovoff,
+                                bool strip_vlan);
+
+/**
+* attach scatter-gather data to rx packet
+*
+* @pkt:            packet
+* @iov:            received data scatter-gather list
+* @iovcnt          number of elements in iov
+* @iovoff          data start offset in the iov
+* @strip_vlan:     should the module strip vlan from data
+* @vet:            VLAN tag Ethernet type
+*
+*/
+void net_rx_pkt_attach_iovec_ex(struct NetRxPkt *pkt,
+                                   const struct iovec *iov, int iovcnt,
+                                   size_t iovoff, bool strip_vlan,
+                                   uint16_t vet);
+
+/**
+ * attach data to rx packet
+ *
+ * @pkt:            packet
+ * @data:           pointer to the data buffer
+ * @len:            data length
+ * @strip_vlan:     should the module strip vlan from data
+ *
+ */
+static inline void
+net_rx_pkt_attach_data(struct NetRxPkt *pkt, const void *data,
+                          size_t len, bool strip_vlan)
+{
+    const struct iovec iov = {
+        .iov_base = (void *) data,
+        .iov_len = len
+    };
+
+    net_rx_pkt_attach_iovec(pkt, &iov, 1, 0, strip_vlan);
+}
+
+/**
+ * returns io vector that holds the attached data
+ *
+ * @pkt:            packet
+ * @ret:            pointer to IOVec
+ *
+ */
+struct iovec *net_rx_pkt_get_iovec(struct NetRxPkt *pkt);
+
+/**
+* returns io vector length that holds the attached data
+*
+* @pkt:            packet
+* @ret:            IOVec length
+*
+*/
+uint16_t net_rx_pkt_get_iovec_len(struct NetRxPkt *pkt);
+
+/**
+ * prints rx packet data if debug is enabled
+ *
+ * @pkt:            packet
+ *
+ */
+void net_rx_pkt_dump(struct NetRxPkt *pkt);
+
+/**
+ * copy passed vhdr data to packet context
+ *
+ * @pkt:            packet
+ * @vhdr:           VHDR buffer
+ *
+ */
+void net_rx_pkt_set_vhdr(struct NetRxPkt *pkt,
+    struct virtio_net_hdr *vhdr);
+
+/**
+* copy passed vhdr data to packet context
+*
+* @pkt:            packet
+* @iov:            VHDR iov
+* @iovcnt:         VHDR iov array size
+*
+*/
+void net_rx_pkt_set_vhdr_iovec(struct NetRxPkt *pkt,
+    const struct iovec *iov, int iovcnt);
+
+/**
+ * save packet type in packet context
+ *
+ * @pkt:            packet
+ * @packet_type:    the packet type
+ *
+ */
+void net_rx_pkt_set_packet_type(struct NetRxPkt *pkt,
+    eth_pkt_types_e packet_type);
+
+/**
+* validate TCP/UDP checksum of the packet
+*
+* @pkt:            packet
+* @csum_valid:     checksum validation result
+* @ret:            true if validation was performed, false in case packet is
+*                  not TCP/UDP or checksum validation is not possible
+*
+*/
+bool net_rx_pkt_validate_l4_csum(struct NetRxPkt *pkt, bool *csum_valid);
+
+/**
+* validate IPv4 checksum of the packet
+*
+* @pkt:            packet
+* @csum_valid:     checksum validation result
+* @ret:            true if validation was performed, false in case packet is
+*                  not TCP/UDP or checksum validation is not possible
+*
+*/
+bool net_rx_pkt_validate_l3_csum(struct NetRxPkt *pkt, bool *csum_valid);
+
+/**
+* fix IPv4 checksum of the packet
+*
+* @pkt:            packet
+* @ret:            true if checksum was fixed, false in case packet is
+*                  not TCP/UDP or checksum correction is not possible
+*
+*/
+bool net_rx_pkt_fix_l4_csum(struct NetRxPkt *pkt);
+
+#endif
--- a/hw/net/vmxnet_tx_pkt.c
+++ b/hw/net/vmxnet_tx_pkt.c
@@ -1,5 +1,5 @@
 /*
- * QEMU VMWARE VMXNET* paravirtual NICs - TX packets abstractions
+ * QEMU TX packets abstractions
 *
 * Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
 *
@@ -15,25 +15,24 @@
 *
 */

-#include "qemu/osdep.h"
-#include "hw/hw.h"
-#include "vmxnet_tx_pkt.h"
+#include "net_tx_pkt.h"
 #include "net/eth.h"
-#include "qemu-common.h"
-#include "qemu/iov.h"
 #include "net/checksum.h"
 #include "net/tap.h"
 #include "net/net.h"
+#include "hw/pci/pci.h"

 enum {
-    VMXNET_TX_PKT_VHDR_FRAG = 0,
-    VMXNET_TX_PKT_L2HDR_FRAG,
-    VMXNET_TX_PKT_L3HDR_FRAG,
-    VMXNET_TX_PKT_PL_START_FRAG
+    NET_TX_PKT_VHDR_FRAG = 0,
+    NET_TX_PKT_L2HDR_FRAG,
+    NET_TX_PKT_L3HDR_FRAG,
+    NET_TX_PKT_PL_START_FRAG
 };

 /* TX packet private context */
-struct VmxnetTxPkt {
+struct NetTxPkt {
+    PCIDevice *pci_dev;
+
    struct virtio_net_hdr virt_hdr;
    bool has_virt_hdr;

@@ -44,6 +43,7 @@ struct VmxnetTxPkt {
    struct iovec *vec;

    uint8_t l2_hdr[ETH_MAX_L2_HDR_LEN];
+    uint8_t l3_hdr[ETH_MAX_IP_DGRAM_LEN];

    uint32_t payload_len;

@@ -53,32 +53,35 @@ struct VmxnetTxPkt {
    uint16_t hdr_len;
    eth_pkt_types_e packet_type;
    uint8_t l4proto;
+
+    bool is_loopback;
 };

-void vmxnet_tx_pkt_init(struct VmxnetTxPkt **pkt, uint32_t max_frags,
-    bool has_virt_hdr)
+void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev,
+    uint32_t max_frags, bool has_virt_hdr)
 {
-    struct VmxnetTxPkt *p = g_malloc0(sizeof *p);
+    struct NetTxPkt *p = g_malloc0(sizeof *p);
+
+    p->pci_dev = pci_dev;

    p->vec = g_malloc((sizeof *p->vec) *
-        (max_frags + VMXNET_TX_PKT_PL_START_FRAG));
+        (max_frags + NET_TX_PKT_PL_START_FRAG));

    p->raw = g_malloc((sizeof *p->raw) * max_frags);

    p->max_payload_frags = max_frags;
    p->max_raw_frags = max_frags;
    p->has_virt_hdr = has_virt_hdr;
-    p->vec[VMXNET_TX_PKT_VHDR_FRAG].iov_base = &p->virt_hdr;
-    p->vec[VMXNET_TX_PKT_VHDR_FRAG].iov_len =
+    p->vec[NET_TX_PKT_VHDR_FRAG].iov_base = &p->virt_hdr;
+    p->vec[NET_TX_PKT_VHDR_FRAG].iov_len =
        p->has_virt_hdr ? sizeof p->virt_hdr : 0;
-    p->vec[VMXNET_TX_PKT_L2HDR_FRAG].iov_base = &p->l2_hdr;
-    p->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_base = NULL;
-    p->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_len = 0;
+    p->vec[NET_TX_PKT_L2HDR_FRAG].iov_base = &p->l2_hdr;
+    p->vec[NET_TX_PKT_L3HDR_FRAG].iov_base = &p->l3_hdr;

    *pkt = p;
 }

-void vmxnet_tx_pkt_uninit(struct VmxnetTxPkt *pkt)
+void net_tx_pkt_uninit(struct NetTxPkt *pkt)
 {
    if (pkt) {
        g_free(pkt->vec);
@@ -87,49 +90,63 @@ void vmxnet_tx_pkt_uninit(struct VmxnetTxPkt *pkt)
    }
 }

-void vmxnet_tx_pkt_update_ip_checksums(struct VmxnetTxPkt *pkt)
+void net_tx_pkt_update_ip_hdr_checksum(struct NetTxPkt *pkt)
 {
    uint16_t csum;
-    uint32_t ph_raw_csum;
+    assert(pkt);
+    struct ip_header *ip_hdr;
+    ip_hdr = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base;
+
+    ip_hdr->ip_len = cpu_to_be16(pkt->payload_len +
+        pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len);
+
+    ip_hdr->ip_sum = 0;
+    csum = net_raw_checksum((uint8_t *)ip_hdr,
+        pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len);
+    ip_hdr->ip_sum = cpu_to_be16(csum);
+}
+
+void net_tx_pkt_update_ip_checksums(struct NetTxPkt *pkt)
+{
+    uint16_t csum;
+    uint32_t cntr, cso;
    assert(pkt);
    uint8_t gso_type = pkt->virt_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN;
-    struct ip_header *ip_hdr;
+    void *ip_hdr = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base;

-    if (VIRTIO_NET_HDR_GSO_TCPV4 != gso_type &&
-        VIRTIO_NET_HDR_GSO_UDP != gso_type) {
-        return;
-    }
-
-    ip_hdr = pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_base;
-
-    if (pkt->payload_len + pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_len >
+    if (pkt->payload_len + pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len >
        ETH_MAX_IP_DGRAM_LEN) {
        return;
    }

-    ip_hdr->ip_len = cpu_to_be16(pkt->payload_len +
-        pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_len);
+    if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4 ||
+        gso_type == VIRTIO_NET_HDR_GSO_UDP) {
+        /* Calculate IP header checksum */
+        net_tx_pkt_update_ip_hdr_checksum(pkt);

-    /* Calculate IP header checksum                    */
-    ip_hdr->ip_sum = 0;
-    csum = net_raw_checksum((uint8_t *)ip_hdr,
-        pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_len);
-    ip_hdr->ip_sum = cpu_to_be16(csum);
+        /* Calculate IP pseudo header checksum */
+        cntr = eth_calc_ip4_pseudo_hdr_csum(ip_hdr, pkt->payload_len, &cso);
+        csum = cpu_to_be16(~net_checksum_finish(cntr));
+    } else if (gso_type == VIRTIO_NET_HDR_GSO_TCPV6) {
+        /* Calculate IP pseudo header checksum */
+        cntr = eth_calc_ip6_pseudo_hdr_csum(ip_hdr, pkt->payload_len,
+                                            IP_PROTO_TCP, &cso);
+        csum = cpu_to_be16(~net_checksum_finish(cntr));
+    } else {
+        return;
+    }

-    /* Calculate IP pseudo header checksum             */
-    ph_raw_csum = eth_calc_pseudo_hdr_csum(ip_hdr, pkt->payload_len);
-    csum = cpu_to_be16(~net_checksum_finish(ph_raw_csum));
-    iov_from_buf(&pkt->vec[VMXNET_TX_PKT_PL_START_FRAG], pkt->payload_frags,
+    iov_from_buf(&pkt->vec[NET_TX_PKT_PL_START_FRAG], pkt->payload_frags,
                 pkt->virt_hdr.csum_offset, &csum, sizeof(csum));
 }

-static void vmxnet_tx_pkt_calculate_hdr_len(struct VmxnetTxPkt *pkt)
+static void net_tx_pkt_calculate_hdr_len(struct NetTxPkt *pkt)
 {
-    pkt->hdr_len = pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG].iov_len +
-        pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_len;
+    pkt->hdr_len = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len +
+        pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len;
 }

-static bool vmxnet_tx_pkt_parse_headers(struct VmxnetTxPkt *pkt)
+static bool net_tx_pkt_parse_headers(struct NetTxPkt *pkt)
 {
    struct iovec *l2_hdr, *l3_hdr;
    size_t bytes_read;
@@ -138,8 +155,8 @@ static bool vmxnet_tx_pkt_parse_headers(struct VmxnetTxPkt *pkt)

    assert(pkt);

-    l2_hdr = &pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG];
-    l3_hdr = &pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG];
+    l2_hdr = &pkt->vec[NET_TX_PKT_L2HDR_FRAG];
+    l3_hdr = &pkt->vec[NET_TX_PKT_L3HDR_FRAG];

    bytes_read = iov_to_buf(pkt->raw, pkt->raw_frags, 0, l2_hdr->iov_base,
                            ETH_MAX_L2_HDR_LEN);
@@ -160,15 +177,19 @@ static bool vmxnet_tx_pkt_parse_headers(struct VmxnetTxPkt *pkt)

    if (bytes_read < l2_hdr->iov_len) {
        l2_hdr->iov_len = 0;
+        l3_hdr->iov_len = 0;
+        pkt->packet_type = ETH_PKT_UCAST;
        return false;
+    } else {
+        l2_hdr->iov_len = ETH_MAX_L2_HDR_LEN;
+        l2_hdr->iov_len = eth_get_l2_hdr_length(l2_hdr->iov_base);
+        pkt->packet_type = get_eth_packet_type(l2_hdr->iov_base);
    }

-    l3_proto = eth_get_l3_proto(l2_hdr->iov_base, l2_hdr->iov_len);
+    l3_proto = eth_get_l3_proto(l2_hdr, 1, l2_hdr->iov_len);

    switch (l3_proto) {
    case ETH_P_IP:
-        l3_hdr->iov_base = g_malloc(ETH_MAX_IP4_HDR_LEN);
-
        bytes_read = iov_to_buf(pkt->raw, pkt->raw_frags, l2_hdr->iov_len,
                                l3_hdr->iov_base, sizeof(struct ip_header));

@@ -178,27 +199,45 @@ static bool vmxnet_tx_pkt_parse_headers(struct VmxnetTxPkt *pkt)
        }

        l3_hdr->iov_len = IP_HDR_GET_LEN(l3_hdr->iov_base);
-        pkt->l4proto = ((struct ip_header *) l3_hdr->iov_base)->ip_p;

-        /* copy optional IPv4 header data */
-        bytes_read = iov_to_buf(pkt->raw, pkt->raw_frags,
-                                l2_hdr->iov_len + sizeof(struct ip_header),
-                                l3_hdr->iov_base + sizeof(struct ip_header),
-                                l3_hdr->iov_len - sizeof(struct ip_header));
-        if (bytes_read < l3_hdr->iov_len - sizeof(struct ip_header)) {
+        if (l3_hdr->iov_len < sizeof(struct ip_header)) {
            l3_hdr->iov_len = 0;
            return false;
        }
+
+        pkt->l4proto = ((struct ip_header *) l3_hdr->iov_base)->ip_p;
+
+        if (IP_HDR_GET_LEN(l3_hdr->iov_base) != sizeof(struct ip_header)) {
+            /* copy optional IPv4 header data if any*/
+            bytes_read = iov_to_buf(pkt->raw, pkt->raw_frags,
+                                    l2_hdr->iov_len + sizeof(struct ip_header),
+                                    l3_hdr->iov_base + sizeof(struct ip_header),
+                                    l3_hdr->iov_len - sizeof(struct ip_header));
+            if (bytes_read < l3_hdr->iov_len - sizeof(struct ip_header)) {
+                l3_hdr->iov_len = 0;
+                return false;
+            }
+        }
+
        break;

    case ETH_P_IPV6:
+    {
+        eth_ip6_hdr_info hdrinfo;
+
        if (!eth_parse_ipv6_hdr(pkt->raw, pkt->raw_frags, l2_hdr->iov_len,
-                               &pkt->l4proto, &full_ip6hdr_len)) {
+                                &hdrinfo)) {
            l3_hdr->iov_len = 0;
            return false;
        }

-        l3_hdr->iov_base = g_malloc(full_ip6hdr_len);
+        pkt->l4proto = hdrinfo.l4proto;
+        full_ip6hdr_len = hdrinfo.full_hdr_len;
+
+        if (full_ip6hdr_len > ETH_MAX_IP_DGRAM_LEN) {
+            l3_hdr->iov_len = 0;
+            return false;
+        }

        bytes_read = iov_to_buf(pkt->raw, pkt->raw_frags, l2_hdr->iov_len,
                                l3_hdr->iov_base, full_ip6hdr_len);
@@ -210,67 +249,62 @@ static bool vmxnet_tx_pkt_parse_headers(struct VmxnetTxPkt *pkt)
            l3_hdr->iov_len = full_ip6hdr_len;
        }
        break;
-
+    }
    default:
        l3_hdr->iov_len = 0;
        break;
    }

-    vmxnet_tx_pkt_calculate_hdr_len(pkt);
-    pkt->packet_type = get_eth_packet_type(l2_hdr->iov_base);
+    net_tx_pkt_calculate_hdr_len(pkt);
    return true;
 }

-static bool vmxnet_tx_pkt_rebuild_payload(struct VmxnetTxPkt *pkt)
+static void net_tx_pkt_rebuild_payload(struct NetTxPkt *pkt)
 {
-    size_t payload_len = iov_size(pkt->raw, pkt->raw_frags) - pkt->hdr_len;
-
-    pkt->payload_frags = iov_copy(&pkt->vec[VMXNET_TX_PKT_PL_START_FRAG],
+    pkt->payload_len = iov_size(pkt->raw, pkt->raw_frags) - pkt->hdr_len;
+    pkt->payload_frags = iov_copy(&pkt->vec[NET_TX_PKT_PL_START_FRAG],
                                pkt->max_payload_frags,
                                pkt->raw, pkt->raw_frags,
-                                pkt->hdr_len, payload_len);
+                                pkt->hdr_len, pkt->payload_len);
+}

-    if (pkt->payload_frags != (uint32_t) -1) {
-        pkt->payload_len = payload_len;
+bool net_tx_pkt_parse(struct NetTxPkt *pkt)
+{
+    if (net_tx_pkt_parse_headers(pkt)) {
+        net_tx_pkt_rebuild_payload(pkt);
        return true;
    } else {
        return false;
    }
 }

-bool vmxnet_tx_pkt_parse(struct VmxnetTxPkt *pkt)
-{
-    return vmxnet_tx_pkt_parse_headers(pkt) &&
-           vmxnet_tx_pkt_rebuild_payload(pkt);
-}
-
-struct virtio_net_hdr *vmxnet_tx_pkt_get_vhdr(struct VmxnetTxPkt *pkt)
+struct virtio_net_hdr *net_tx_pkt_get_vhdr(struct NetTxPkt *pkt)
 {
    assert(pkt);
    return &pkt->virt_hdr;
 }

-static uint8_t vmxnet_tx_pkt_get_gso_type(struct VmxnetTxPkt *pkt,
+static uint8_t net_tx_pkt_get_gso_type(struct NetTxPkt *pkt,
                                          bool tso_enable)
 {
    uint8_t rc = VIRTIO_NET_HDR_GSO_NONE;
    uint16_t l3_proto;

-    l3_proto = eth_get_l3_proto(pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG].iov_base,
-        pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG].iov_len);
+    l3_proto = eth_get_l3_proto(&pkt->vec[NET_TX_PKT_L2HDR_FRAG], 1,
+        pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len);

    if (!tso_enable) {
        goto func_exit;
    }

-    rc = eth_get_gso_type(l3_proto, pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_base,
+    rc = eth_get_gso_type(l3_proto, pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base,
                          pkt->l4proto);

 func_exit:
    return rc;
 }

-void vmxnet_tx_pkt_build_vheader(struct VmxnetTxPkt *pkt, bool tso_enable,
+void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
    bool csum_enable, uint32_t gso_size)
 {
    struct tcp_hdr l4hdr;
@@ -279,7 +313,7 @@ void vmxnet_tx_pkt_build_vheader(struct VmxnetTxPkt *pkt, bool tso_enable,
    /* csum has to be enabled if tso is. */
    assert(csum_enable || !tso_enable);

-    pkt->virt_hdr.gso_type = vmxnet_tx_pkt_get_gso_type(pkt, tso_enable);
+    pkt->virt_hdr.gso_type = net_tx_pkt_get_gso_type(pkt, tso_enable);

    switch (pkt->virt_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
    case VIRTIO_NET_HDR_GSO_NONE:
@@ -288,16 +322,16 @@ void vmxnet_tx_pkt_build_vheader(struct VmxnetTxPkt *pkt, bool tso_enable,
        break;

    case VIRTIO_NET_HDR_GSO_UDP:
-        pkt->virt_hdr.gso_size = IP_FRAG_ALIGN_SIZE(gso_size);
+        pkt->virt_hdr.gso_size = gso_size;
        pkt->virt_hdr.hdr_len = pkt->hdr_len + sizeof(struct udp_header);
        break;

    case VIRTIO_NET_HDR_GSO_TCPV4:
    case VIRTIO_NET_HDR_GSO_TCPV6:
-        iov_to_buf(&pkt->vec[VMXNET_TX_PKT_PL_START_FRAG], pkt->payload_frags,
+        iov_to_buf(&pkt->vec[NET_TX_PKT_PL_START_FRAG], pkt->payload_frags,
                   0, &l4hdr, sizeof(l4hdr));
        pkt->virt_hdr.hdr_len = pkt->hdr_len + l4hdr.th_off * sizeof(uint32_t);
-        pkt->virt_hdr.gso_size = IP_FRAG_ALIGN_SIZE(gso_size);
+        pkt->virt_hdr.gso_size = gso_size;
        break;

    default:
@@ -322,23 +356,24 @@ void vmxnet_tx_pkt_build_vheader(struct VmxnetTxPkt *pkt, bool tso_enable,
    }
 }

-void vmxnet_tx_pkt_setup_vlan_header(struct VmxnetTxPkt *pkt, uint16_t vlan)
+void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt *pkt,
+    uint16_t vlan, uint16_t vlan_ethtype)
 {
    bool is_new;
    assert(pkt);

-    eth_setup_vlan_headers(pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG].iov_base,
-        vlan, &is_new);
+    eth_setup_vlan_headers_ex(pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base,
+        vlan, vlan_ethtype, &is_new);

    /* update l2hdrlen */
    if (is_new) {
        pkt->hdr_len += sizeof(struct vlan_header);
-        pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG].iov_len +=
+        pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len +=
            sizeof(struct vlan_header);
    }
 }

-bool vmxnet_tx_pkt_add_raw_fragment(struct VmxnetTxPkt *pkt, hwaddr pa,
+bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, hwaddr pa,
    size_t len)
 {
    hwaddr mapped_len = 0;
@@ -353,44 +388,50 @@ bool vmxnet_tx_pkt_add_raw_fragment(struct VmxnetTxPkt *pkt, hwaddr pa,
    ventry = &pkt->raw[pkt->raw_frags];
    mapped_len = len;

-    ventry->iov_base = cpu_physical_memory_map(pa, &mapped_len, false);
-    ventry->iov_len = mapped_len;
-    pkt->raw_frags += !!ventry->iov_base;
+    ventry->iov_base = pci_dma_map(pkt->pci_dev, pa,
+                                   &mapped_len, DMA_DIRECTION_TO_DEVICE);

-    if ((ventry->iov_base == NULL) || (len != mapped_len)) {
+    if ((ventry->iov_base != NULL) && (len == mapped_len)) {
+        ventry->iov_len = mapped_len;
+        pkt->raw_frags++;
+        return true;
+    } else {
        return false;
    }
-
-    return true;
 }

-eth_pkt_types_e vmxnet_tx_pkt_get_packet_type(struct VmxnetTxPkt *pkt)
+bool net_tx_pkt_has_fragments(struct NetTxPkt *pkt)
+{
+    return pkt->raw_frags > 0;
+}
+
+eth_pkt_types_e net_tx_pkt_get_packet_type(struct NetTxPkt *pkt)
 {
    assert(pkt);

    return pkt->packet_type;
 }

-size_t vmxnet_tx_pkt_get_total_len(struct VmxnetTxPkt *pkt)
+size_t net_tx_pkt_get_total_len(struct NetTxPkt *pkt)
 {
    assert(pkt);

    return pkt->hdr_len + pkt->payload_len;
 }

-void vmxnet_tx_pkt_dump(struct VmxnetTxPkt *pkt)
+void net_tx_pkt_dump(struct NetTxPkt *pkt)
 {
-#ifdef VMXNET_TX_PKT_DEBUG
+#ifdef NET_TX_PKT_DEBUG
    assert(pkt);

    printf("TX PKT: hdr_len: %d, pkt_type: 0x%X, l2hdr_len: %lu, "
        "l3hdr_len: %lu, payload_len: %u\n", pkt->hdr_len, pkt->packet_type,
-        pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG].iov_len,
-        pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_len, pkt->payload_len);
+        pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len,
+        pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len, pkt->payload_len);
 #endif
 }

-void vmxnet_tx_pkt_reset(struct VmxnetTxPkt *pkt)
+void net_tx_pkt_reset(struct NetTxPkt *pkt)
 {
    int i;

@@ -401,38 +442,31 @@ void vmxnet_tx_pkt_reset(struct VmxnetTxPkt *pkt)

    memset(&pkt->virt_hdr, 0, sizeof(pkt->virt_hdr));

-    g_free(pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_base);
-    pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_base = NULL;
-
    assert(pkt->vec);
-    for (i = VMXNET_TX_PKT_L2HDR_FRAG;
-         i < pkt->payload_frags + VMXNET_TX_PKT_PL_START_FRAG; i++) {
-        pkt->vec[i].iov_len = 0;
-    }
+
    pkt->payload_len = 0;
    pkt->payload_frags = 0;

    assert(pkt->raw);
    for (i = 0; i < pkt->raw_frags; i++) {
        assert(pkt->raw[i].iov_base);
-        cpu_physical_memory_unmap(pkt->raw[i].iov_base, pkt->raw[i].iov_len,
-                                  false, pkt->raw[i].iov_len);
-        pkt->raw[i].iov_len = 0;
+        pci_dma_unmap(pkt->pci_dev, pkt->raw[i].iov_base, pkt->raw[i].iov_len,
+                      DMA_DIRECTION_TO_DEVICE, 0);
    }
    pkt->raw_frags = 0;

    pkt->hdr_len = 0;
-    pkt->packet_type = 0;
    pkt->l4proto = 0;
 }

-static void vmxnet_tx_pkt_do_sw_csum(struct VmxnetTxPkt *pkt)
+static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
 {
-    struct iovec *iov = &pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG];
+    struct iovec *iov = &pkt->vec[NET_TX_PKT_L2HDR_FRAG];
    uint32_t csum_cntr;
    uint16_t csum = 0;
+    uint32_t cso;
    /* num of iovec without vhdr */
-    uint32_t iov_len = pkt->payload_frags + VMXNET_TX_PKT_PL_START_FRAG - 1;
+    uint32_t iov_len = pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1;
    uint16_t csl;
    struct ip_header *iphdr;
    size_t csum_offset = pkt->virt_hdr.csum_start + pkt->virt_hdr.csum_offset;
@@ -443,12 +477,13 @@ static void vmxnet_tx_pkt_do_sw_csum(struct VmxnetTxPkt *pkt)
    /* Calculate L4 TCP/UDP checksum */
    csl = pkt->payload_len;

-    /* data checksum */
-    csum_cntr =
-        net_checksum_add_iov(iov, iov_len, pkt->virt_hdr.csum_start, csl);
    /* add pseudo header to csum */
-    iphdr = pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_base;
-    csum_cntr += eth_calc_pseudo_hdr_csum(iphdr, csl);
+    iphdr = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base;
+    csum_cntr = eth_calc_ip4_pseudo_hdr_csum(iphdr, csl, &cso);
+
+    /* data checksum */
+    csum_cntr +=
+        net_checksum_add_iov(iov, iov_len, pkt->virt_hdr.csum_start, csl, cso);

    /* Put the checksum obtained into the packet */
    csum = cpu_to_be16(net_checksum_finish(csum_cntr));
@@ -456,37 +491,37 @@ static void vmxnet_tx_pkt_do_sw_csum(struct VmxnetTxPkt *pkt)
 }

 enum {
-    VMXNET_TX_PKT_FRAGMENT_L2_HDR_POS = 0,
-    VMXNET_TX_PKT_FRAGMENT_L3_HDR_POS,
-    VMXNET_TX_PKT_FRAGMENT_HEADER_NUM
+    NET_TX_PKT_FRAGMENT_L2_HDR_POS = 0,
+    NET_TX_PKT_FRAGMENT_L3_HDR_POS,
+    NET_TX_PKT_FRAGMENT_HEADER_NUM
 };

-#define VMXNET_MAX_FRAG_SG_LIST (64)
+#define NET_MAX_FRAG_SG_LIST (64)

-static size_t vmxnet_tx_pkt_fetch_fragment(struct VmxnetTxPkt *pkt,
+static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt,
    int *src_idx, size_t *src_offset, struct iovec *dst, int *dst_idx)
 {
    size_t fetched = 0;
    struct iovec *src = pkt->vec;

-    *dst_idx = VMXNET_TX_PKT_FRAGMENT_HEADER_NUM;
+    *dst_idx = NET_TX_PKT_FRAGMENT_HEADER_NUM;

-    while (fetched < pkt->virt_hdr.gso_size) {
+    while (fetched < IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size)) {

        /* no more place in fragment iov */
-        if (*dst_idx == VMXNET_MAX_FRAG_SG_LIST) {
+        if (*dst_idx == NET_MAX_FRAG_SG_LIST) {
            break;
        }

        /* no more data in iovec */
-        if (*src_idx == (pkt->payload_frags + VMXNET_TX_PKT_PL_START_FRAG)) {
+        if (*src_idx == (pkt->payload_frags + NET_TX_PKT_PL_START_FRAG)) {
            break;
        }


        dst[*dst_idx].iov_base = src[*src_idx].iov_base + *src_offset;
        dst[*dst_idx].iov_len = MIN(src[*src_idx].iov_len - *src_offset,
-            pkt->virt_hdr.gso_size - fetched);
+            IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size) - fetched);

        *src_offset += dst[*dst_idx].iov_len;
        fetched += dst[*dst_idx].iov_len;
@@ -502,35 +537,45 @@ static size_t vmxnet_tx_pkt_fetch_fragment(struct VmxnetTxPkt *pkt,
    return fetched;
 }

-static bool vmxnet_tx_pkt_do_sw_fragmentation(struct VmxnetTxPkt *pkt,
+static inline void net_tx_pkt_sendv(struct NetTxPkt *pkt,
+    NetClientState *nc, const struct iovec *iov, int iov_cnt)
+{
+    if (pkt->is_loopback) {
+        nc->info->receive_iov(nc, iov, iov_cnt);
+    } else {
+        qemu_sendv_packet(nc, iov, iov_cnt);
+    }
+}
+
+static bool net_tx_pkt_do_sw_fragmentation(struct NetTxPkt *pkt,
    NetClientState *nc)
 {
-    struct iovec fragment[VMXNET_MAX_FRAG_SG_LIST];
+    struct iovec fragment[NET_MAX_FRAG_SG_LIST];
    size_t fragment_len = 0;
    bool more_frags = false;

    /* some pointers for shorter code */
    void *l2_iov_base, *l3_iov_base;
    size_t l2_iov_len, l3_iov_len;
-    int src_idx =  VMXNET_TX_PKT_PL_START_FRAG, dst_idx;
+    int src_idx =  NET_TX_PKT_PL_START_FRAG, dst_idx;
    size_t src_offset = 0;
    size_t fragment_offset = 0;

-    l2_iov_base = pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG].iov_base;
-    l2_iov_len = pkt->vec[VMXNET_TX_PKT_L2HDR_FRAG].iov_len;
-    l3_iov_base = pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_base;
-    l3_iov_len = pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_len;
+    l2_iov_base = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base;
+    l2_iov_len = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len;
+    l3_iov_base = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base;
+    l3_iov_len = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len;

    /* Copy headers */
-    fragment[VMXNET_TX_PKT_FRAGMENT_L2_HDR_POS].iov_base = l2_iov_base;
-    fragment[VMXNET_TX_PKT_FRAGMENT_L2_HDR_POS].iov_len = l2_iov_len;
-    fragment[VMXNET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_base = l3_iov_base;
-    fragment[VMXNET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_len = l3_iov_len;
+    fragment[NET_TX_PKT_FRAGMENT_L2_HDR_POS].iov_base = l2_iov_base;
+    fragment[NET_TX_PKT_FRAGMENT_L2_HDR_POS].iov_len = l2_iov_len;
+    fragment[NET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_base = l3_iov_base;
+    fragment[NET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_len = l3_iov_len;


    /* Put as much data as possible and send */
    do {
-        fragment_len = vmxnet_tx_pkt_fetch_fragment(pkt, &src_idx, &src_offset,
+        fragment_len = net_tx_pkt_fetch_fragment(pkt, &src_idx, &src_offset,
            fragment, &dst_idx);

        more_frags = (fragment_offset + fragment_len < pkt->payload_len);
@@ -540,7 +585,7 @@ static bool vmxnet_tx_pkt_do_sw_fragmentation(struct VmxnetTxPkt *pkt,

        eth_fix_ip4_checksum(l3_iov_base, l3_iov_len);

-        qemu_sendv_packet(nc, fragment, dst_idx);
+        net_tx_pkt_sendv(pkt, nc, fragment, dst_idx);

        fragment_offset += fragment_len;

@@ -549,13 +594,13 @@ static bool vmxnet_tx_pkt_do_sw_fragmentation(struct VmxnetTxPkt *pkt,
    return true;
 }

-bool vmxnet_tx_pkt_send(struct VmxnetTxPkt *pkt, NetClientState *nc)
+bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc)
 {
    assert(pkt);

    if (!pkt->has_virt_hdr &&
        pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-        vmxnet_tx_pkt_do_sw_csum(pkt);
+        net_tx_pkt_do_sw_csum(pkt);
    }

    /*
@@ -565,17 +610,28 @@ bool vmxnet_tx_pkt_send(struct VmxnetTxPkt *pkt, NetClientState *nc)
    if (VIRTIO_NET_HDR_GSO_NONE != pkt->virt_hdr.gso_type) {
        if (pkt->payload_len >
            ETH_MAX_IP_DGRAM_LEN -
-            pkt->vec[VMXNET_TX_PKT_L3HDR_FRAG].iov_len) {
+            pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len) {
            return false;
        }
    }

    if (pkt->has_virt_hdr ||
        pkt->virt_hdr.gso_type == VIRTIO_NET_HDR_GSO_NONE) {
-        qemu_sendv_packet(nc, pkt->vec,
-            pkt->payload_frags + VMXNET_TX_PKT_PL_START_FRAG);
+        net_tx_pkt_sendv(pkt, nc, pkt->vec,
+            pkt->payload_frags + NET_TX_PKT_PL_START_FRAG);
        return true;
    }

-    return vmxnet_tx_pkt_do_sw_fragmentation(pkt, nc);
+    return net_tx_pkt_do_sw_fragmentation(pkt, nc);
+}
+
+bool net_tx_pkt_send_loopback(struct NetTxPkt *pkt, NetClientState *nc)
+{
+    bool res;
+
+    pkt->is_loopback = true;
+    res = net_tx_pkt_send(pkt, nc);
+    pkt->is_loopback = false;
+
+    return res;
 }
--- a/hw/net/net_tx_pkt.h
+++ b/hw/net/net_tx_pkt.h
@@ -0,0 +1,191 @@
+/*
+ * QEMU TX packets abstraction
+ *
+ * Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
+ *
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Tamir Shomer <tamirs@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef NET_TX_PKT_H
+#define NET_TX_PKT_H
+
+#include "qemu/osdep.h"
+#include "net/eth.h"
+#include "exec/hwaddr.h"
+
+/* define to enable packet dump functions */
+/*#define NET_TX_PKT_DEBUG*/
+
+struct NetTxPkt;
+
+/**
+ * Init function for tx packet functionality
+ *
+ * @pkt:            packet pointer
+ * @pci_dev:        PCI device processing this packet
+ * @max_frags:      max tx ip fragments
+ * @has_virt_hdr:   device uses virtio header.
+ */
+void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev,
+    uint32_t max_frags, bool has_virt_hdr);
+
+/**
+ * Clean all tx packet resources.
+ *
+ * @pkt:            packet.
+ */
+void net_tx_pkt_uninit(struct NetTxPkt *pkt);
+
+/**
+ * get virtio header
+ *
+ * @pkt:            packet
+ * @ret:            virtio header
+ */
+struct virtio_net_hdr *net_tx_pkt_get_vhdr(struct NetTxPkt *pkt);
+
+/**
+ * build virtio header (will be stored in module context)
+ *
+ * @pkt:            packet
+ * @tso_enable:     TSO enabled
+ * @csum_enable:    CSO enabled
+ * @gso_size:       MSS size for TSO
+ *
+ */
+void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
+    bool csum_enable, uint32_t gso_size);
+
+/**
+* updates vlan tag, and adds vlan header with custom ethernet type
+* in case it is missing.
+*
+* @pkt:            packet
+* @vlan:           VLAN tag
+* @vlan_ethtype:   VLAN header Ethernet type
+*
+*/
+void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt *pkt,
+    uint16_t vlan, uint16_t vlan_ethtype);
+
+/**
+* updates vlan tag, and adds vlan header in case it is missing
+*
+* @pkt:            packet
+* @vlan:           VLAN tag
+*
+*/
+static inline void
+net_tx_pkt_setup_vlan_header(struct NetTxPkt *pkt, uint16_t vlan)
+{
+    net_tx_pkt_setup_vlan_header_ex(pkt, vlan, ETH_P_VLAN);
+}
+
+/**
+ * populate data fragment into pkt context.
+ *
+ * @pkt:            packet
+ * @pa:             physical address of fragment
+ * @len:            length of fragment
+ *
+ */
+bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, hwaddr pa,
+    size_t len);
+
+/**
+ * Fix ip header fields and calculate IP header and pseudo header checksums.
+ *
+ * @pkt:            packet
+ *
+ */
+void net_tx_pkt_update_ip_checksums(struct NetTxPkt *pkt);
+
+/**
+ * Calculate the IP header checksum.
+ *
+ * @pkt:            packet
+ *
+ */
+void net_tx_pkt_update_ip_hdr_checksum(struct NetTxPkt *pkt);
+
+/**
+ * get length of all populated data.
+ *
+ * @pkt:            packet
+ * @ret:            total data length
+ *
+ */
+size_t net_tx_pkt_get_total_len(struct NetTxPkt *pkt);
+
+/**
+ * get packet type
+ *
+ * @pkt:            packet
+ * @ret:            packet type
+ *
+ */
+eth_pkt_types_e net_tx_pkt_get_packet_type(struct NetTxPkt *pkt);
+
+/**
+ * prints packet data if debug is enabled
+ *
+ * @pkt:            packet
+ *
+ */
+void net_tx_pkt_dump(struct NetTxPkt *pkt);
+
+/**
+ * reset tx packet private context (needed to be called between packets)
+ *
+ * @pkt:            packet
+ *
+ */
+void net_tx_pkt_reset(struct NetTxPkt *pkt);
+
+/**
+ * Send packet to qemu. handles sw offloads if vhdr is not supported.
+ *
+ * @pkt:            packet
+ * @nc:             NetClientState
+ * @ret:            operation result
+ *
+ */
+bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc);
+
+/**
+* Redirect packet directly to receive path (emulate loopback phy).
+* Handles sw offloads if vhdr is not supported.
+*
+* @pkt:            packet
+* @nc:             NetClientState
+* @ret:            operation result
+*
+*/
+bool net_tx_pkt_send_loopback(struct NetTxPkt *pkt, NetClientState *nc);
+
+/**
+ * parse raw packet data and analyze offload requirements.
+ *
+ * @pkt:            packet
+ *
+ */
+bool net_tx_pkt_parse(struct NetTxPkt *pkt);
+
+/**
+* indicates if there are data fragments held by this packet object.
+*
+* @pkt:            packet
+*
+*/
+bool net_tx_pkt_has_fragments(struct NetTxPkt *pkt);
+
+#endif
--- a/hw/net/rtl8139.c
+++ b/hw/net/rtl8139.c
@@ -1867,11 +1867,6 @@ static int rtl8139_transmit_one(RTL8139State *s, int descriptor)
    return 1;
 }

-/* structures and macros for task offloading */
-#define TCP_HEADER_DATA_OFFSET(tcp) (((be16_to_cpu(tcp->th_offset_flags) >> 12)&0xf) << 2)
-#define TCP_FLAGS_ONLY(flags) ((flags)&0x3f)
-#define TCP_HEADER_FLAGS(tcp) TCP_FLAGS_ONLY(be16_to_cpu(tcp->th_offset_flags))
-
 #define TCP_HEADER_CLEAR_FLAGS(tcp, off) ((tcp)->th_offset_flags &= cpu_to_be16(~TCP_FLAGS_ONLY(off)))

 /* produces ones' complement sum of data */
--- a/hw/net/spapr_llan.c
+++ b/hw/net/spapr_llan.c
@@ -110,6 +110,7 @@ typedef struct VIOsPAPRVLANDevice {
    hwaddr buf_list;
    uint32_t add_buf_ptr, use_buf_ptr, rx_bufs;
    hwaddr rxq_ptr;
+    QEMUTimer *rxp_timer;
    uint32_t compat_flags;             /* Compatability flags for migration */
    RxBufPool *rx_pool[RX_MAX_POOLS];  /* Receive buffer descriptor pools */
 } VIOsPAPRVLANDevice;
@@ -121,6 +122,21 @@ static int spapr_vlan_can_receive(NetClientState *nc)
    return (dev->isopen && dev->rx_bufs > 0);
 }

+/**
+ * The last 8 bytes of the receive buffer list page (that has been
+ * supplied by the guest with the H_REGISTER_LOGICAL_LAN call) contain
+ * a counter for frames that have been dropped because there was no
+ * suitable receive buffer available. This function is used to increase
+ * this counter by one.
+ */
+static void spapr_vlan_record_dropped_rx_frame(VIOsPAPRVLANDevice *dev)
+{
+    uint64_t cnt;
+
+    cnt = vio_ldq(&dev->sdev, dev->buf_list + 4096 - 8);
+    vio_stq(&dev->sdev, dev->buf_list + 4096 - 8, cnt + 1);
+}
+
 /**
 * Get buffer descriptor from one of our receive buffer pools
 */
@@ -206,7 +222,8 @@ static ssize_t spapr_vlan_receive(NetClientState *nc, const uint8_t *buf,
    }

    if (!dev->rx_bufs) {
-        return -1;
+        spapr_vlan_record_dropped_rx_frame(dev);
+        return 0;
    }

    if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
@@ -215,7 +232,8 @@ static ssize_t spapr_vlan_receive(NetClientState *nc, const uint8_t *buf,
        bd = spapr_vlan_get_rx_bd_from_page(dev, size);
    }
    if (!bd) {
-        return -1;
+        spapr_vlan_record_dropped_rx_frame(dev);
+        return 0;
    }

    dev->rx_bufs--;
@@ -266,6 +284,13 @@ static NetClientInfo net_spapr_vlan_info = {
    .receive = spapr_vlan_receive,
 };

+static void spapr_vlan_flush_rx_queue(void *opaque)
+{
+    VIOsPAPRVLANDevice *dev = opaque;
+
+    qemu_flush_queued_packets(qemu_get_queue(dev->nic));
+}
+
 static void spapr_vlan_reset_rx_pool(RxBufPool *rxp)
 {
    /*
@@ -302,6 +327,9 @@ static void spapr_vlan_realize(VIOsPAPRDevice *sdev, Error **errp)
    dev->nic = qemu_new_nic(&net_spapr_vlan_info, &dev->nicconf,
                            object_get_typename(OBJECT(sdev)), sdev->qdev.id, dev);
    qemu_format_nic_info_str(qemu_get_queue(dev->nic), dev->nicconf.macaddr.a);
+
+    dev->rxp_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, spapr_vlan_flush_rx_queue,
+                                  dev);
 }

 static void spapr_vlan_instance_init(Object *obj)
@@ -332,6 +360,11 @@ static void spapr_vlan_instance_finalize(Object *obj)
            dev->rx_pool[i] = NULL;
        }
    }
+
+    if (dev->rxp_timer) {
+        timer_del(dev->rxp_timer);
+        timer_free(dev->rxp_timer);
+    }
 }

 void spapr_vlan_create(VIOsPAPRBus *bus, NICInfo *nd)
@@ -629,7 +662,13 @@ static target_ulong h_add_logical_lan_buffer(PowerPCCPU *cpu,

    dev->rx_bufs++;

-    qemu_flush_queued_packets(qemu_get_queue(dev->nic));
+    /*
+     * Give guest some more time to add additional RX buffers before we
+     * flush the receive queue, so that e.g. fragmented IP packets can
+     * be passed to the guest in one go later (instead of passing single
+     * fragments if there is only one receive buffer available).
+     */
+    timer_mod(dev->rxp_timer, qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + 500);

    return H_SUCCESS;
 }
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -30,8 +30,8 @@
 #include "vmxnet3.h"
 #include "vmxnet_debug.h"
 #include "vmware_utils.h"
-#include "vmxnet_tx_pkt.h"
-#include "vmxnet_rx_pkt.h"
+#include "net_tx_pkt.h"
+#include "net_rx_pkt.h"

 #define PCI_DEVICE_ID_VMWARE_VMXNET3_REVISION 0x1
 #define VMXNET3_MSIX_BAR_SIZE 0x2000
@@ -314,13 +314,13 @@ typedef struct {
        bool peer_has_vhdr;

        /* TX packets to QEMU interface */
-        struct VmxnetTxPkt *tx_pkt;
+        struct NetTxPkt *tx_pkt;
        uint32_t offload_mode;
        uint32_t cso_or_gso_size;
        uint16_t tci;
        bool needs_vlan;

-        struct VmxnetRxPkt *rx_pkt;
+        struct NetRxPkt *rx_pkt;

        bool tx_sop;
        bool skip_current_tx_pkt;
@@ -474,7 +474,7 @@ static void vmxnet3_set_variable_mac(VMXNET3State *s, uint32_t h, uint32_t l)
    s->conf.macaddr.a[4] = VMXNET3_GET_BYTE(h, 0);
    s->conf.macaddr.a[5] = VMXNET3_GET_BYTE(h, 1);

-    VMW_CFPRN("Variable MAC: " VMXNET_MF, VMXNET_MA(s->conf.macaddr.a));
+    VMW_CFPRN("Variable MAC: " MAC_FMT, MAC_ARG(s->conf.macaddr.a));

    qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
@@ -546,18 +546,18 @@ vmxnet3_setup_tx_offloads(VMXNET3State *s)
 {
    switch (s->offload_mode) {
    case VMXNET3_OM_NONE:
-        vmxnet_tx_pkt_build_vheader(s->tx_pkt, false, false, 0);
+        net_tx_pkt_build_vheader(s->tx_pkt, false, false, 0);
        break;

    case VMXNET3_OM_CSUM:
-        vmxnet_tx_pkt_build_vheader(s->tx_pkt, false, true, 0);
+        net_tx_pkt_build_vheader(s->tx_pkt, false, true, 0);
        VMW_PKPRN("L4 CSO requested\n");
        break;

    case VMXNET3_OM_TSO:
-        vmxnet_tx_pkt_build_vheader(s->tx_pkt, true, true,
+        net_tx_pkt_build_vheader(s->tx_pkt, true, true,
            s->cso_or_gso_size);
-        vmxnet_tx_pkt_update_ip_checksums(s->tx_pkt);
+        net_tx_pkt_update_ip_checksums(s->tx_pkt);
        VMW_PKPRN("GSO offload requested.");
        break;

@@ -590,12 +590,12 @@ static void
 vmxnet3_on_tx_done_update_stats(VMXNET3State *s, int qidx,
    Vmxnet3PktStatus status)
 {
-    size_t tot_len = vmxnet_tx_pkt_get_total_len(s->tx_pkt);
+    size_t tot_len = net_tx_pkt_get_total_len(s->tx_pkt);
    struct UPT1_TxStats *stats = &s->txq_descr[qidx].txq_stats;

    switch (status) {
    case VMXNET3_PKT_STATUS_OK:
-        switch (vmxnet_tx_pkt_get_packet_type(s->tx_pkt)) {
+        switch (net_tx_pkt_get_packet_type(s->tx_pkt)) {
        case ETH_PKT_BCAST:
            stats->bcastPktsTxOK++;
            stats->bcastBytesTxOK += tot_len;
@@ -643,7 +643,7 @@ vmxnet3_on_rx_done_update_stats(VMXNET3State *s,
                                Vmxnet3PktStatus status)
 {
    struct UPT1_RxStats *stats = &s->rxq_descr[qidx].rxq_stats;
-    size_t tot_len = vmxnet_rx_pkt_get_total_len(s->rx_pkt);
+    size_t tot_len = net_rx_pkt_get_total_len(s->rx_pkt);

    switch (status) {
    case VMXNET3_PKT_STATUS_OUT_OF_BUF:
@@ -654,7 +654,7 @@ vmxnet3_on_rx_done_update_stats(VMXNET3State *s,
        stats->pktsRxError++;
        break;
    case VMXNET3_PKT_STATUS_OK:
-        switch (vmxnet_rx_pkt_get_packet_type(s->rx_pkt)) {
+        switch (net_rx_pkt_get_packet_type(s->rx_pkt)) {
        case ETH_PKT_BCAST:
            stats->bcastPktsRxOK++;
            stats->bcastBytesRxOK += tot_len;
@@ -715,10 +715,10 @@ vmxnet3_send_packet(VMXNET3State *s, uint32_t qidx)
    }

    /* debug prints */
-    vmxnet3_dump_virt_hdr(vmxnet_tx_pkt_get_vhdr(s->tx_pkt));
-    vmxnet_tx_pkt_dump(s->tx_pkt);
+    vmxnet3_dump_virt_hdr(net_tx_pkt_get_vhdr(s->tx_pkt));
+    net_tx_pkt_dump(s->tx_pkt);

-    if (!vmxnet_tx_pkt_send(s->tx_pkt, qemu_get_queue(s->nic))) {
+    if (!net_tx_pkt_send(s->tx_pkt, qemu_get_queue(s->nic))) {
        status = VMXNET3_PKT_STATUS_DISCARD;
        goto func_exit;
    }
@@ -746,7 +746,7 @@ static void vmxnet3_process_tx_queue(VMXNET3State *s, int qidx)
            data_len = (txd.len > 0) ? txd.len : VMXNET3_MAX_TX_BUF_SIZE;
            data_pa = le64_to_cpu(txd.addr);

-            if (!vmxnet_tx_pkt_add_raw_fragment(s->tx_pkt,
+            if (!net_tx_pkt_add_raw_fragment(s->tx_pkt,
                                                data_pa,
                                                data_len)) {
                s->skip_current_tx_pkt = true;
@@ -759,9 +759,9 @@ static void vmxnet3_process_tx_queue(VMXNET3State *s, int qidx)
        }

        if (txd.eop) {
-            if (!s->skip_current_tx_pkt && vmxnet_tx_pkt_parse(s->tx_pkt)) {
+            if (!s->skip_current_tx_pkt && net_tx_pkt_parse(s->tx_pkt)) {
                if (s->needs_vlan) {
-                    vmxnet_tx_pkt_setup_vlan_header(s->tx_pkt, s->tci);
+                    net_tx_pkt_setup_vlan_header(s->tx_pkt, s->tci);
                }

                vmxnet3_send_packet(s, qidx);
@@ -773,7 +773,7 @@ static void vmxnet3_process_tx_queue(VMXNET3State *s, int qidx)
            vmxnet3_complete_packet(s, qidx, txd_idx);
            s->tx_sop = true;
            s->skip_current_tx_pkt = false;
-            vmxnet_tx_pkt_reset(s->tx_pkt);
+            net_tx_pkt_reset(s->tx_pkt);
        }
    }
 }
@@ -802,7 +802,9 @@ vmxnet3_pop_rxc_descr(VMXNET3State *s, int qidx, uint32_t *descr_gen)
    hwaddr daddr =
        vmxnet3_ring_curr_cell_pa(&s->rxq_descr[qidx].comp_ring);

-    cpu_physical_memory_read(daddr, &rxcd, sizeof(struct Vmxnet3_RxCompDesc));
+    pci_dma_read(PCI_DEVICE(s), daddr,
+                 &rxcd, sizeof(struct Vmxnet3_RxCompDesc));
+
    ring_gen = vmxnet3_ring_curr_gen(&s->rxq_descr[qidx].comp_ring);

    if (rxcd.gen != ring_gen) {
@@ -928,7 +930,7 @@ vmxnet3_get_next_rx_descr(VMXNET3State *s, bool is_head,
 * in the case the host OS performs forwarding, it will forward an
 * incorrectly checksummed packet.
 */
-static void vmxnet3_rx_need_csum_calculate(struct VmxnetRxPkt *pkt,
+static void vmxnet3_rx_need_csum_calculate(struct NetRxPkt *pkt,
                                           const void *pkt_data,
                                           size_t pkt_len)
 {
@@ -937,16 +939,16 @@ static void vmxnet3_rx_need_csum_calculate(struct VmxnetRxPkt *pkt,
    uint8_t *data;
    int len;

-    if (!vmxnet_rx_pkt_has_virt_hdr(pkt)) {
+    if (!net_rx_pkt_has_virt_hdr(pkt)) {
        return;
    }

-    vhdr = vmxnet_rx_pkt_get_vhdr(pkt);
+    vhdr = net_rx_pkt_get_vhdr(pkt);
    if (!VMXNET_FLAG_IS_SET(vhdr->flags, VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
        return;
    }

-    vmxnet_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
+    net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
    if (!(isip4 || isip6) || !(istcp || isudp)) {
        return;
    }
@@ -970,7 +972,7 @@ static void vmxnet3_rx_need_csum_calculate(struct VmxnetRxPkt *pkt,
    vhdr->flags |= VIRTIO_NET_HDR_F_DATA_VALID;
 }

-static void vmxnet3_rx_update_descr(struct VmxnetRxPkt *pkt,
+static void vmxnet3_rx_update_descr(struct NetRxPkt *pkt,
    struct Vmxnet3_RxCompDesc *rxcd)
 {
    int csum_ok, is_gso;
@@ -978,16 +980,16 @@ static void vmxnet3_rx_update_descr(struct VmxnetRxPkt *pkt,
    struct virtio_net_hdr *vhdr;
    uint8_t offload_type;

-    if (vmxnet_rx_pkt_is_vlan_stripped(pkt)) {
+    if (net_rx_pkt_is_vlan_stripped(pkt)) {
        rxcd->ts = 1;
-        rxcd->tci = vmxnet_rx_pkt_get_vlan_tag(pkt);
+        rxcd->tci = net_rx_pkt_get_vlan_tag(pkt);
    }

-    if (!vmxnet_rx_pkt_has_virt_hdr(pkt)) {
+    if (!net_rx_pkt_has_virt_hdr(pkt)) {
        goto nocsum;
    }

-    vhdr = vmxnet_rx_pkt_get_vhdr(pkt);
+    vhdr = net_rx_pkt_get_vhdr(pkt);
    /*
     * Checksum is valid when lower level tell so or when lower level
     * requires checksum offload telling that packet produced/bridged
@@ -1004,7 +1006,7 @@ static void vmxnet3_rx_update_descr(struct VmxnetRxPkt *pkt,
        goto nocsum;
    }

-    vmxnet_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
+    net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
    if ((!istcp && !isudp) || (!isip4 && !isip6)) {
        goto nocsum;
    }
@@ -1023,10 +1025,11 @@ nocsum:
 }

 static void
-vmxnet3_physical_memory_writev(const struct iovec *iov,
-                               size_t start_iov_off,
-                               hwaddr target_addr,
-                               size_t bytes_to_copy)
+vmxnet3_pci_dma_writev(PCIDevice *pci_dev,
+                       const struct iovec *iov,
+                       size_t start_iov_off,
+                       hwaddr target_addr,
+                       size_t bytes_to_copy)
 {
    size_t curr_off = 0;
    size_t copied = 0;
@@ -1036,9 +1039,9 @@ vmxnet3_physical_memory_writev(const struct iovec *iov,
            size_t chunk_len =
                MIN((curr_off + iov->iov_len) - start_iov_off, bytes_to_copy);

-            cpu_physical_memory_write(target_addr + copied,
-                                      iov->iov_base + start_iov_off - curr_off,
-                                      chunk_len);
+            pci_dma_write(pci_dev, target_addr + copied,
+                          iov->iov_base + start_iov_off - curr_off,
+                          chunk_len);

            copied += chunk_len;
            start_iov_off += chunk_len;
@@ -1063,13 +1066,13 @@ vmxnet3_indicate_packet(VMXNET3State *s)
    uint32_t new_rxcd_gen = VMXNET3_INIT_GEN;
    hwaddr new_rxcd_pa = 0;
    hwaddr ready_rxcd_pa = 0;
-    struct iovec *data = vmxnet_rx_pkt_get_iovec(s->rx_pkt);
+    struct iovec *data = net_rx_pkt_get_iovec(s->rx_pkt);
    size_t bytes_copied = 0;
-    size_t bytes_left = vmxnet_rx_pkt_get_total_len(s->rx_pkt);
+    size_t bytes_left = net_rx_pkt_get_total_len(s->rx_pkt);
    uint16_t num_frags = 0;
    size_t chunk_size;

-    vmxnet_rx_pkt_dump(s->rx_pkt);
+    net_rx_pkt_dump(s->rx_pkt);

    while (bytes_left > 0) {

@@ -1088,15 +1091,15 @@ vmxnet3_indicate_packet(VMXNET3State *s)
        }

        chunk_size = MIN(bytes_left, rxd.len);
-        vmxnet3_physical_memory_writev(data, bytes_copied,
-                                       le64_to_cpu(rxd.addr), chunk_size);
+        vmxnet3_pci_dma_writev(PCI_DEVICE(s), data, bytes_copied,
+                               le64_to_cpu(rxd.addr), chunk_size);
        bytes_copied += chunk_size;
        bytes_left -= chunk_size;

        vmxnet3_dump_rx_descr(&rxd);

        if (ready_rxcd_pa != 0) {
-            cpu_physical_memory_write(ready_rxcd_pa, &rxcd, sizeof(rxcd));
+            pci_dma_write(PCI_DEVICE(s), ready_rxcd_pa, &rxcd, sizeof(rxcd));
        }

        memset(&rxcd, 0, sizeof(struct Vmxnet3_RxCompDesc));
@@ -1127,7 +1130,8 @@ vmxnet3_indicate_packet(VMXNET3State *s)
    if (ready_rxcd_pa != 0) {
        rxcd.eop = 1;
        rxcd.err = (bytes_left != 0);
-        cpu_physical_memory_write(ready_rxcd_pa, &rxcd, sizeof(rxcd));
+
+        pci_dma_write(PCI_DEVICE(s), ready_rxcd_pa, &rxcd, sizeof(rxcd));

        /* Flush RX descriptor changes */
        smp_wmb();
@@ -1219,16 +1223,16 @@ static void vmxnet3_reset_interrupt_states(VMXNET3State *s)
 static void vmxnet3_reset_mac(VMXNET3State *s)
 {
    memcpy(&s->conf.macaddr.a, &s->perm_mac.a, sizeof(s->perm_mac.a));
-    VMW_CFPRN("MAC address set to: " VMXNET_MF, VMXNET_MA(s->conf.macaddr.a));
+    VMW_CFPRN("MAC address set to: " MAC_FMT, MAC_ARG(s->conf.macaddr.a));
 }

 static void vmxnet3_deactivate_device(VMXNET3State *s)
 {
    if (s->device_active) {
        VMW_CBPRN("Deactivating vmxnet3...");
-        vmxnet_tx_pkt_reset(s->tx_pkt);
-        vmxnet_tx_pkt_uninit(s->tx_pkt);
-        vmxnet_rx_pkt_uninit(s->rx_pkt);
+        net_tx_pkt_reset(s->tx_pkt);
+        net_tx_pkt_uninit(s->tx_pkt);
+        net_rx_pkt_uninit(s->rx_pkt);
        s->device_active = false;
    }
 }
@@ -1298,10 +1302,11 @@ static void vmxnet3_update_mcast_filters(VMXNET3State *s)
            VMXNET3_READ_DRV_SHARED64(s->drv_shmem,
                                      devRead.rxFilterConf.mfTablePA);

-        cpu_physical_memory_read(mcast_list_pa, s->mcast_list, list_bytes);
+        pci_dma_read(PCI_DEVICE(s), mcast_list_pa, s->mcast_list, list_bytes);
+
        VMW_CFPRN("Current multicast list len is %d:", s->mcast_list_len);
        for (i = 0; i < s->mcast_list_len; i++) {
-            VMW_CFPRN("\t" VMXNET_MF, VMXNET_MA(s->mcast_list[i].a));
+            VMW_CFPRN("\t" MAC_FMT, MAC_ARG(s->mcast_list[i].a));
        }
    }
 }
@@ -1328,15 +1333,17 @@ static void vmxnet3_fill_stats(VMXNET3State *s)
        return;

    for (i = 0; i < s->txq_num; i++) {
-        cpu_physical_memory_write(s->txq_descr[i].tx_stats_pa,
-                                  &s->txq_descr[i].txq_stats,
-                                  sizeof(s->txq_descr[i].txq_stats));
+        pci_dma_write(PCI_DEVICE(s),
+                      s->txq_descr[i].tx_stats_pa,
+                      &s->txq_descr[i].txq_stats,
+                      sizeof(s->txq_descr[i].txq_stats));
    }

    for (i = 0; i < s->rxq_num; i++) {
-        cpu_physical_memory_write(s->rxq_descr[i].rx_stats_pa,
-                                  &s->rxq_descr[i].rxq_stats,
-                                  sizeof(s->rxq_descr[i].rxq_stats));
+        pci_dma_write(PCI_DEVICE(s),
+                      s->rxq_descr[i].rx_stats_pa,
+                      &s->rxq_descr[i].rxq_stats,
+                      sizeof(s->rxq_descr[i].rxq_stats));
    }
 }

@@ -1558,8 +1565,9 @@ static void vmxnet3_activate_device(VMXNET3State *s)

    /* Preallocate TX packet wrapper */
    VMW_CFPRN("Max TX fragments is %u", s->max_tx_frags);
-    vmxnet_tx_pkt_init(&s->tx_pkt, s->max_tx_frags, s->peer_has_vhdr);
-    vmxnet_rx_pkt_init(&s->rx_pkt, s->peer_has_vhdr);
+    net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s),
+                    s->max_tx_frags, s->peer_has_vhdr);
+    net_rx_pkt_init(&s->rx_pkt, s->peer_has_vhdr);

    /* Read rings memory locations for RX queues */
    for (i = 0; i < s->rxq_num; i++) {
@@ -1965,7 +1973,7 @@ vmxnet3_rx_filter_may_indicate(VMXNET3State *s, const void *data,
        return false;
    }

-    switch (vmxnet_rx_pkt_get_packet_type(s->rx_pkt)) {
+    switch (net_rx_pkt_get_packet_type(s->rx_pkt)) {
    case ETH_PKT_UCAST:
        if (!VMXNET_FLAG_IS_SET(s->rx_mode, VMXNET3_RXM_UCAST)) {
            return false;
@@ -2013,7 +2021,7 @@ vmxnet3_receive(NetClientState *nc, const uint8_t *buf, size_t size)
    }

    if (s->peer_has_vhdr) {
-        vmxnet_rx_pkt_set_vhdr(s->rx_pkt, (struct virtio_net_hdr *)buf);
+        net_rx_pkt_set_vhdr(s->rx_pkt, (struct virtio_net_hdr *)buf);
        buf += sizeof(struct virtio_net_hdr);
        size -= sizeof(struct virtio_net_hdr);
    }
@@ -2026,13 +2034,13 @@ vmxnet3_receive(NetClientState *nc, const uint8_t *buf, size_t size)
        size = sizeof(min_buf);
    }

-    vmxnet_rx_pkt_set_packet_type(s->rx_pkt,
+    net_rx_pkt_set_packet_type(s->rx_pkt,
        get_eth_packet_type(PKT_GET_ETH_HDR(buf)));

    if (vmxnet3_rx_filter_may_indicate(s, buf, size)) {
-        vmxnet_rx_pkt_set_protocols(s->rx_pkt, buf, size);
+        net_rx_pkt_set_protocols(s->rx_pkt, buf, size);
        vmxnet3_rx_need_csum_calculate(s->rx_pkt, buf, size);
-        vmxnet_rx_pkt_attach_data(s->rx_pkt, buf, size, s->rx_vlan_stripping);
+        net_rx_pkt_attach_data(s->rx_pkt, buf, size, s->rx_vlan_stripping);
        bytes_indicated = vmxnet3_indicate_packet(s) ? size : -1;
        if (bytes_indicated < size) {
            VMW_PKPRN("RX: %zu of %zu bytes indicated", bytes_indicated, size);
@@ -2102,7 +2110,7 @@ static void vmxnet3_net_init(VMXNET3State *s)

    s->link_status_and_speed = VMXNET3_LINK_SPEED | VMXNET3_LINK_STATUS_UP;

-    VMW_CFPRN("Permanent MAC: " VMXNET_MF, VMXNET_MA(s->perm_mac.a));
+    VMW_CFPRN("Permanent MAC: " MAC_FMT, MAC_ARG(s->perm_mac.a));

    s->nic = qemu_new_nic(&net_vmxnet3_info, &s->conf,
                          object_get_typename(OBJECT(s)),
@@ -2255,9 +2263,9 @@ static const MemoryRegionOps b1_ops = {
    },
 };

-static uint8_t *vmxnet3_device_serial_num(VMXNET3State *s)
+static uint64_t vmxnet3_device_serial_num(VMXNET3State *s)
 {
-    static uint64_t dsn_payload;
+    uint64_t dsn_payload;
    uint8_t *dsnp = (uint8_t *)&dsn_payload;

    dsnp[0] = 0xfe;
@@ -2268,7 +2276,7 @@ static uint8_t *vmxnet3_device_serial_num(VMXNET3State *s)
    dsnp[5] = s->conf.macaddr.a[1];
    dsnp[6] = s->conf.macaddr.a[2];
    dsnp[7] = 0xff;
-    return dsnp;
+    return dsn_payload;
 }

 static void vmxnet3_pci_realize(PCIDevice *pci_dev, Error **errp)
@@ -2313,10 +2321,8 @@ static void vmxnet3_pci_realize(PCIDevice *pci_dev, Error **errp)
            pcie_endpoint_cap_init(pci_dev, VMXNET3_EXP_EP_OFFSET);
        }

-        pcie_add_capability(pci_dev, PCI_EXT_CAP_ID_DSN, 0x1,
-                            VMXNET3_DSN_OFFSET, PCI_EXT_CAP_DSN_SIZEOF);
-        memcpy(pci_dev->config + VMXNET3_DSN_OFFSET + 4,
-               vmxnet3_device_serial_num(s), sizeof(uint64_t));
+        pcie_dev_ser_num_init(pci_dev, VMXNET3_DSN_OFFSET,
+                              vmxnet3_device_serial_num(s));
    }

    register_savevm(dev, "vmxnet3-msix", -1, 1,
@@ -2538,8 +2544,9 @@ static int vmxnet3_post_load(void *opaque, int version_id)
    VMXNET3State *s = opaque;
    PCIDevice *d = PCI_DEVICE(s);

-    vmxnet_tx_pkt_init(&s->tx_pkt, s->max_tx_frags, s->peer_has_vhdr);
-    vmxnet_rx_pkt_init(&s->rx_pkt, s->peer_has_vhdr);
+    net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s),
+                    s->max_tx_frags, s->peer_has_vhdr);
+    net_rx_pkt_init(&s->rx_pkt, s->peer_has_vhdr);

    if (s->msix_used) {
        if  (!vmxnet3_use_msix_vectors(s, VMXNET3_MAX_INTRS)) {
--- a/hw/net/vmxnet_debug.h
+++ b/hw/net/vmxnet_debug.h
@@ -142,7 +142,4 @@
        }                                                                     \
    } while (0)

-#define VMXNET_MF       "%02X:%02X:%02X:%02X:%02X:%02X"
-#define VMXNET_MA(a)    (a)[0], (a)[1], (a)[2], (a)[3], (a)[4], (a)[5]
-
 #endif /* _QEMU_VMXNET3_DEBUG_H  */
--- a/hw/net/vmxnet_rx_pkt.c
+++ b/hw/net/vmxnet_rx_pkt.c
@@ -1,187 +0,0 @@
-/*
- * QEMU VMWARE VMXNET* paravirtual NICs - RX packets abstractions
- *
- * Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
- *
- * Developed by Daynix Computing LTD (http://www.daynix.com)
- *
- * Authors:
- * Dmitry Fleytman <dmitry@daynix.com>
- * Tamir Shomer <tamirs@daynix.com>
- * Yan Vugenfirer <yan@daynix.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#include "qemu/osdep.h"
-#include "vmxnet_rx_pkt.h"
-#include "net/eth.h"
-#include "qemu-common.h"
-#include "qemu/iov.h"
-#include "net/checksum.h"
-#include "net/tap.h"
-
-/*
- * RX packet may contain up to 2 fragments - rebuilt eth header
- * in case of VLAN tag stripping
- * and payload received from QEMU - in any case
- */
-#define VMXNET_MAX_RX_PACKET_FRAGMENTS (2)
-
-struct VmxnetRxPkt {
-    struct virtio_net_hdr virt_hdr;
-    uint8_t ehdr_buf[ETH_MAX_L2_HDR_LEN];
-    struct iovec vec[VMXNET_MAX_RX_PACKET_FRAGMENTS];
-    uint16_t vec_len;
-    uint32_t tot_len;
-    uint16_t tci;
-    bool vlan_stripped;
-    bool has_virt_hdr;
-    eth_pkt_types_e packet_type;
-
-    /* Analysis results */
-    bool isip4;
-    bool isip6;
-    bool isudp;
-    bool istcp;
-};
-
-void vmxnet_rx_pkt_init(struct VmxnetRxPkt **pkt, bool has_virt_hdr)
-{
-    struct VmxnetRxPkt *p = g_malloc0(sizeof *p);
-    p->has_virt_hdr = has_virt_hdr;
-    *pkt = p;
-}
-
-void vmxnet_rx_pkt_uninit(struct VmxnetRxPkt *pkt)
-{
-    g_free(pkt);
-}
-
-struct virtio_net_hdr *vmxnet_rx_pkt_get_vhdr(struct VmxnetRxPkt *pkt)
-{
-    assert(pkt);
-    return &pkt->virt_hdr;
-}
-
-void vmxnet_rx_pkt_attach_data(struct VmxnetRxPkt *pkt, const void *data,
-                               size_t len, bool strip_vlan)
-{
-    uint16_t tci = 0;
-    uint16_t ploff;
-    assert(pkt);
-    pkt->vlan_stripped = false;
-
-    if (strip_vlan) {
-        pkt->vlan_stripped = eth_strip_vlan(data, pkt->ehdr_buf, &ploff, &tci);
-    }
-
-    if (pkt->vlan_stripped) {
-        pkt->vec[0].iov_base = pkt->ehdr_buf;
-        pkt->vec[0].iov_len = ploff - sizeof(struct vlan_header);
-        pkt->vec[1].iov_base = (uint8_t *) data + ploff;
-        pkt->vec[1].iov_len = len - ploff;
-        pkt->vec_len = 2;
-        pkt->tot_len = len - ploff + sizeof(struct eth_header);
-    } else {
-        pkt->vec[0].iov_base = (void *)data;
-        pkt->vec[0].iov_len = len;
-        pkt->vec_len = 1;
-        pkt->tot_len = len;
-    }
-
-    pkt->tci = tci;
-}
-
-void vmxnet_rx_pkt_dump(struct VmxnetRxPkt *pkt)
-{
-#ifdef VMXNET_RX_PKT_DEBUG
-    VmxnetRxPkt *pkt = (VmxnetRxPkt *)pkt;
-    assert(pkt);
-
-    printf("RX PKT: tot_len: %d, vlan_stripped: %d, vlan_tag: %d\n",
-              pkt->tot_len, pkt->vlan_stripped, pkt->tci);
-#endif
-}
-
-void vmxnet_rx_pkt_set_packet_type(struct VmxnetRxPkt *pkt,
-    eth_pkt_types_e packet_type)
-{
-    assert(pkt);
-
-    pkt->packet_type = packet_type;
-
-}
-
-eth_pkt_types_e vmxnet_rx_pkt_get_packet_type(struct VmxnetRxPkt *pkt)
-{
-    assert(pkt);
-
-    return pkt->packet_type;
-}
-
-size_t vmxnet_rx_pkt_get_total_len(struct VmxnetRxPkt *pkt)
-{
-    assert(pkt);
-
-    return pkt->tot_len;
-}
-
-void vmxnet_rx_pkt_set_protocols(struct VmxnetRxPkt *pkt, const void *data,
-                                 size_t len)
-{
-    assert(pkt);
-
-    eth_get_protocols(data, len, &pkt->isip4, &pkt->isip6,
-        &pkt->isudp, &pkt->istcp);
-}
-
-void vmxnet_rx_pkt_get_protocols(struct VmxnetRxPkt *pkt,
-                                 bool *isip4, bool *isip6,
-                                 bool *isudp, bool *istcp)
-{
-    assert(pkt);
-
-    *isip4 = pkt->isip4;
-    *isip6 = pkt->isip6;
-    *isudp = pkt->isudp;
-    *istcp = pkt->istcp;
-}
-
-struct iovec *vmxnet_rx_pkt_get_iovec(struct VmxnetRxPkt *pkt)
-{
-    assert(pkt);
-
-    return pkt->vec;
-}
-
-void vmxnet_rx_pkt_set_vhdr(struct VmxnetRxPkt *pkt,
-                            struct virtio_net_hdr *vhdr)
-{
-    assert(pkt);
-
-    memcpy(&pkt->virt_hdr, vhdr, sizeof pkt->virt_hdr);
-}
-
-bool vmxnet_rx_pkt_is_vlan_stripped(struct VmxnetRxPkt *pkt)
-{
-    assert(pkt);
-
-    return pkt->vlan_stripped;
-}
-
-bool vmxnet_rx_pkt_has_virt_hdr(struct VmxnetRxPkt *pkt)
-{
-    assert(pkt);
-
-    return pkt->has_virt_hdr;
-}
-
-uint16_t vmxnet_rx_pkt_get_vlan_tag(struct VmxnetRxPkt *pkt)
-{
-    assert(pkt);
-
-    return pkt->tci;
-}
--- a/hw/net/vmxnet_rx_pkt.h
+++ b/hw/net/vmxnet_rx_pkt.h
@@ -1,174 +0,0 @@
-/*
- * QEMU VMWARE VMXNET* paravirtual NICs - RX packets abstraction
- *
- * Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
- *
- * Developed by Daynix Computing LTD (http://www.daynix.com)
- *
- * Authors:
- * Dmitry Fleytman <dmitry@daynix.com>
- * Tamir Shomer <tamirs@daynix.com>
- * Yan Vugenfirer <yan@daynix.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef VMXNET_RX_PKT_H
-#define VMXNET_RX_PKT_H
-
-#include "net/eth.h"
-
-/* defines to enable packet dump functions */
-/*#define VMXNET_RX_PKT_DEBUG*/
-
-struct VmxnetRxPkt;
-
-/**
- * Clean all rx packet resources
- *
- * @pkt:            packet
- *
- */
-void vmxnet_rx_pkt_uninit(struct VmxnetRxPkt *pkt);
-
-/**
- * Init function for rx packet functionality
- *
- * @pkt:            packet pointer
- * @has_virt_hdr:   device uses virtio header
- *
- */
-void vmxnet_rx_pkt_init(struct VmxnetRxPkt **pkt, bool has_virt_hdr);
-
-/**
- * returns total length of data attached to rx context
- *
- * @pkt:            packet
- *
- * Return:  nothing
- *
- */
-size_t vmxnet_rx_pkt_get_total_len(struct VmxnetRxPkt *pkt);
-
-/**
- * parse and set packet analysis results
- *
- * @pkt:            packet
- * @data:           pointer to the data buffer to be parsed
- * @len:            data length
- *
- */
-void vmxnet_rx_pkt_set_protocols(struct VmxnetRxPkt *pkt, const void *data,
-                                 size_t len);
-
-/**
- * fetches packet analysis results
- *
- * @pkt:            packet
- * @isip4:          whether the packet given is IPv4
- * @isip6:          whether the packet given is IPv6
- * @isudp:          whether the packet given is UDP
- * @istcp:          whether the packet given is TCP
- *
- */
-void vmxnet_rx_pkt_get_protocols(struct VmxnetRxPkt *pkt,
-                                 bool *isip4, bool *isip6,
-                                 bool *isudp, bool *istcp);
-
-/**
- * returns virtio header stored in rx context
- *
- * @pkt:            packet
- * @ret:            virtio header
- *
- */
-struct virtio_net_hdr *vmxnet_rx_pkt_get_vhdr(struct VmxnetRxPkt *pkt);
-
-/**
- * returns packet type
- *
- * @pkt:            packet
- * @ret:            packet type
- *
- */
-eth_pkt_types_e vmxnet_rx_pkt_get_packet_type(struct VmxnetRxPkt *pkt);
-
-/**
- * returns vlan tag
- *
- * @pkt:            packet
- * @ret:            VLAN tag
- *
- */
-uint16_t vmxnet_rx_pkt_get_vlan_tag(struct VmxnetRxPkt *pkt);
-
-/**
- * tells whether vlan was stripped from the packet
- *
- * @pkt:            packet
- * @ret:            VLAN stripped sign
- *
- */
-bool vmxnet_rx_pkt_is_vlan_stripped(struct VmxnetRxPkt *pkt);
-
-/**
- * notifies caller if the packet has virtio header
- *
- * @pkt:            packet
- * @ret:            true if packet has virtio header, false otherwize
- *
- */
-bool vmxnet_rx_pkt_has_virt_hdr(struct VmxnetRxPkt *pkt);
-
-/**
- * attach data to rx packet
- *
- * @pkt:            packet
- * @data:           pointer to the data buffer
- * @len:            data length
- * @strip_vlan:     should the module strip vlan from data
- *
- */
-void vmxnet_rx_pkt_attach_data(struct VmxnetRxPkt *pkt, const void *data,
-    size_t len, bool strip_vlan);
-
-/**
- * returns io vector that holds the attached data
- *
- * @pkt:            packet
- * @ret:            pointer to IOVec
- *
- */
-struct iovec *vmxnet_rx_pkt_get_iovec(struct VmxnetRxPkt *pkt);
-
-/**
- * prints rx packet data if debug is enabled
- *
- * @pkt:            packet
- *
- */
-void vmxnet_rx_pkt_dump(struct VmxnetRxPkt *pkt);
-
-/**
- * copy passed vhdr data to packet context
- *
- * @pkt:            packet
- * @vhdr:           VHDR buffer
- *
- */
-void vmxnet_rx_pkt_set_vhdr(struct VmxnetRxPkt *pkt,
-    struct virtio_net_hdr *vhdr);
-
-/**
- * save packet type in packet context
- *
- * @pkt:            packet
- * @packet_type:    the packet type
- *
- */
-void vmxnet_rx_pkt_set_packet_type(struct VmxnetRxPkt *pkt,
-    eth_pkt_types_e packet_type);
-
-#endif
--- a/hw/net/vmxnet_tx_pkt.h
+++ b/hw/net/vmxnet_tx_pkt.h
@@ -1,146 +0,0 @@
-/*
- * QEMU VMWARE VMXNET* paravirtual NICs - TX packets abstraction
- *
- * Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
- *
- * Developed by Daynix Computing LTD (http://www.daynix.com)
- *
- * Authors:
- * Dmitry Fleytman <dmitry@daynix.com>
- * Tamir Shomer <tamirs@daynix.com>
- * Yan Vugenfirer <yan@daynix.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef VMXNET_TX_PKT_H
-#define VMXNET_TX_PKT_H
-
-#include "net/eth.h"
-#include "exec/hwaddr.h"
-
-/* define to enable packet dump functions */
-/*#define VMXNET_TX_PKT_DEBUG*/
-
-struct VmxnetTxPkt;
-
-/**
- * Init function for tx packet functionality
- *
- * @pkt:            packet pointer
- * @max_frags:      max tx ip fragments
- * @has_virt_hdr:   device uses virtio header.
- */
-void vmxnet_tx_pkt_init(struct VmxnetTxPkt **pkt, uint32_t max_frags,
-    bool has_virt_hdr);
-
-/**
- * Clean all tx packet resources.
- *
- * @pkt:            packet.
- */
-void vmxnet_tx_pkt_uninit(struct VmxnetTxPkt *pkt);
-
-/**
- * get virtio header
- *
- * @pkt:            packet
- * @ret:            virtio header
- */
-struct virtio_net_hdr *vmxnet_tx_pkt_get_vhdr(struct VmxnetTxPkt *pkt);
-
-/**
- * build virtio header (will be stored in module context)
- *
- * @pkt:            packet
- * @tso_enable:     TSO enabled
- * @csum_enable:    CSO enabled
- * @gso_size:       MSS size for TSO
- *
- */
-void vmxnet_tx_pkt_build_vheader(struct VmxnetTxPkt *pkt, bool tso_enable,
-    bool csum_enable, uint32_t gso_size);
-
-/**
- * updates vlan tag, and adds vlan header in case it is missing
- *
- * @pkt:            packet
- * @vlan:           VLAN tag
- *
- */
-void vmxnet_tx_pkt_setup_vlan_header(struct VmxnetTxPkt *pkt, uint16_t vlan);
-
-/**
- * populate data fragment into pkt context.
- *
- * @pkt:            packet
- * @pa:             physical address of fragment
- * @len:            length of fragment
- *
- */
-bool vmxnet_tx_pkt_add_raw_fragment(struct VmxnetTxPkt *pkt, hwaddr pa,
-    size_t len);
-
-/**
- * fix ip header fields and calculate checksums needed.
- *
- * @pkt:            packet
- *
- */
-void vmxnet_tx_pkt_update_ip_checksums(struct VmxnetTxPkt *pkt);
-
-/**
- * get length of all populated data.
- *
- * @pkt:            packet
- * @ret:            total data length
- *
- */
-size_t vmxnet_tx_pkt_get_total_len(struct VmxnetTxPkt *pkt);
-
-/**
- * get packet type
- *
- * @pkt:            packet
- * @ret:            packet type
- *
- */
-eth_pkt_types_e vmxnet_tx_pkt_get_packet_type(struct VmxnetTxPkt *pkt);
-
-/**
- * prints packet data if debug is enabled
- *
- * @pkt:            packet
- *
- */
-void vmxnet_tx_pkt_dump(struct VmxnetTxPkt *pkt);
-
-/**
- * reset tx packet private context (needed to be called between packets)
- *
- * @pkt:            packet
- *
- */
-void vmxnet_tx_pkt_reset(struct VmxnetTxPkt *pkt);
-
-/**
- * Send packet to qemu. handles sw offloads if vhdr is not supported.
- *
- * @pkt:            packet
- * @nc:             NetClientState
- * @ret:            operation result
- *
- */
-bool vmxnet_tx_pkt_send(struct VmxnetTxPkt *pkt, NetClientState *nc);
-
-/**
- * parse raw packet data and analyze offload requirements.
- *
- * @pkt:            packet
- *
- */
-bool vmxnet_tx_pkt_parse(struct VmxnetTxPkt *pkt);
-
-#endif
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -72,7 +72,7 @@ void msix_set_pending(PCIDevice *dev, unsigned int vector)
    *msix_pending_byte(dev, vector) |= msix_pending_mask(vector);
 }

-static void msix_clr_pending(PCIDevice *dev, int vector)
+void msix_clr_pending(PCIDevice *dev, int vector)
 {
    *msix_pending_byte(dev, vector) &= ~msix_pending_mask(vector);
 }
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -43,26 +43,15 @@
 /***************************************************************************
 * pci express capability helper functions
 */
-int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
+
+static void
+pcie_cap_v1_fill(uint8_t *exp_cap, uint8_t port, uint8_t type, uint8_t version)
 {
-    int pos;
-    uint8_t *exp_cap;
-
-    assert(pci_is_express(dev));
-
-    pos = pci_add_capability(dev, PCI_CAP_ID_EXP, offset,
-                                 PCI_EXP_VER2_SIZEOF);
-    if (pos < 0) {
-        return pos;
-    }
-    dev->exp.exp_cap = pos;
-    exp_cap = dev->config + pos;
-
    /* capability register
-       interrupt message number defaults to 0 */
+    interrupt message number defaults to 0 */
    pci_set_word(exp_cap + PCI_EXP_FLAGS,
                 ((type << PCI_EXP_FLAGS_TYPE_SHIFT) & PCI_EXP_FLAGS_TYPE) |
-                 PCI_EXP_FLAGS_VER2);
+                 version);

    /* device capability register
     * table 7-12:
@@ -81,7 +70,27 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)

    pci_set_word(exp_cap + PCI_EXP_LNKSTA,
                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25 |PCI_EXP_LNKSTA_DLLLA);
+}

+int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
+{
+    /* PCIe cap v2 init */
+    int pos;
+    uint8_t *exp_cap;
+
+    assert(pci_is_express(dev));
+
+    pos = pci_add_capability(dev, PCI_CAP_ID_EXP, offset, PCI_EXP_VER2_SIZEOF);
+    if (pos < 0) {
+        return pos;
+    }
+    dev->exp.exp_cap = pos;
+    exp_cap = dev->config + pos;
+
+    /* Filling values common with v1 */
+    pcie_cap_v1_fill(exp_cap, port, type, PCI_EXP_FLAGS_VER2);
+
+    /* Filling v2 specific values */
    pci_set_long(exp_cap + PCI_EXP_DEVCAP2,
                 PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);

@@ -89,7 +98,29 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
    return pos;
 }

-int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
+int pcie_cap_v1_init(PCIDevice *dev, uint8_t offset, uint8_t type,
+                     uint8_t port)
+{
+    /* PCIe cap v1 init */
+    int pos;
+    uint8_t *exp_cap;
+
+    assert(pci_is_express(dev));
+
+    pos = pci_add_capability(dev, PCI_CAP_ID_EXP, offset, PCI_EXP_VER1_SIZEOF);
+    if (pos < 0) {
+        return pos;
+    }
+    dev->exp.exp_cap = pos;
+    exp_cap = dev->config + pos;
+
+    pcie_cap_v1_fill(exp_cap, port, type, PCI_EXP_FLAGS_VER1);
+
+    return pos;
+}
+
+static int
+pcie_endpoint_cap_common_init(PCIDevice *dev, uint8_t offset, uint8_t cap_size)
 {
    uint8_t type = PCI_EXP_TYPE_ENDPOINT;

@@ -102,7 +133,19 @@ int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
        type = PCI_EXP_TYPE_RC_END;
    }

-    return pcie_cap_init(dev, offset, type, 0);
+    return (cap_size == PCI_EXP_VER1_SIZEOF)
+        ? pcie_cap_v1_init(dev, offset, type, 0)
+        : pcie_cap_init(dev, offset, type, 0);
+}
+
+int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
+{
+    return pcie_endpoint_cap_common_init(dev, offset, PCI_EXP_VER2_SIZEOF);
+}
+
+int pcie_endpoint_cap_v1_init(PCIDevice *dev, uint8_t offset)
+{
+    return pcie_endpoint_cap_common_init(dev, offset, PCI_EXP_VER1_SIZEOF);
 }

 void pcie_cap_exit(PCIDevice *dev)
@@ -110,6 +153,11 @@ void pcie_cap_exit(PCIDevice *dev)
    pci_del_capability(dev, PCI_CAP_ID_EXP, PCI_EXP_VER2_SIZEOF);
 }

+void pcie_cap_v1_exit(PCIDevice *dev)
+{
+    pci_del_capability(dev, PCI_CAP_ID_EXP, PCI_EXP_VER1_SIZEOF);
+}
+
 uint8_t pcie_cap_get_type(const PCIDevice *dev)
 {
    uint32_t pos = dev->exp.exp_cap;
@@ -647,3 +695,13 @@ void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
                        offset, PCI_ARI_SIZEOF);
    pci_set_long(dev->config + offset + PCI_ARI_CAP, (nextfn & 0xff) << 8);
 }
+
+void pcie_dev_ser_num_init(PCIDevice *dev, uint16_t offset, uint64_t ser_num)
+{
+    static const int pci_dsn_ver = 1;
+    static const int pci_dsn_cap = 4;
+
+    pcie_add_capability(dev, PCI_EXT_CAP_ID_DSN, pci_dsn_ver, offset,
+                        PCI_EXT_CAP_DSN_SIZEOF);
+    pci_set_quad(dev->config + offset + pci_dsn_cap, ser_num);
+}
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1842,6 +1842,10 @@ static void ppc_spapr_init(MachineState *machine)
        exit(1);
    }
    spapr->rtas_size = get_image_size(filename);
+    if (spapr->rtas_size < 0) {
+        error_report("Could not get size of LPAR rtas '%s'", filename);
+        exit(1);
+    }
    spapr->rtas_blob = g_malloc(spapr->rtas_size);
    if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
        error_report("Could not load LPAR rtas '%s'", filename);
@@ -2132,15 +2136,6 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
    int i, fdt_offset, fdt_size;
    void *fdt;

-    /*
-     * Check for DRC connectors and send hotplug notification to the
-     * guest only in case of hotplugged memory. This allows cold plugged
-     * memory to be specified at boot time.
-     */
-    if (!dev->hotplugged) {
-        return;
-    }
-
    for (i = 0; i < nr_lmbs; i++) {
        drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
                addr/SPAPR_MEMORY_BLOCK_SIZE);
@@ -2154,7 +2149,12 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
        drck->attach(drc, dev, fdt, fdt_offset, !dev->hotplugged, errp);
        addr += SPAPR_MEMORY_BLOCK_SIZE;
    }
-    spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs);
+    /* send hotplug notification to the
+     * guest only in case of hotplugged memory
+     */
+    if (dev->hotplugged) {
+       spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs);
+    }
 }

 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -186,6 +186,7 @@ static RemoveResult remove_hpte(PowerPCCPU *cpu, target_ulong ptex,
 static target_ulong h_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
                             target_ulong opcode, target_ulong *args)
 {
+    CPUPPCState *env = &cpu->env;
    target_ulong flags = args[0];
    target_ulong pte_index = args[1];
    target_ulong avpn = args[2];
@@ -196,6 +197,7 @@ static target_ulong h_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,

    switch (ret) {
    case REMOVE_SUCCESS:
+        check_tlb_flush(env);
        return H_SUCCESS;

    case REMOVE_NOT_FOUND:
@@ -232,7 +234,9 @@ static target_ulong h_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 static target_ulong h_bulk_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
                                  target_ulong opcode, target_ulong *args)
 {
+    CPUPPCState *env = &cpu->env;
    int i;
+    target_ulong rc = H_SUCCESS;

    for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) {
        target_ulong *tsh = &args[i*2];
@@ -265,14 +269,18 @@ static target_ulong h_bulk_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
            break;

        case REMOVE_PARM:
-            return H_PARAMETER;
+            rc = H_PARAMETER;
+            goto exit;

        case REMOVE_HW:
-            return H_HARDWARE;
+            rc = H_HARDWARE;
+            goto exit;
        }
    }
+ exit:
+    check_tlb_flush(env);

-    return H_SUCCESS;
+    return rc;
 }

 static target_ulong h_protect(PowerPCCPU *cpu, sPAPRMachineState *spapr,
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -76,6 +76,37 @@ static IOMMUAccessFlags spapr_tce_iommu_access_flags(uint64_t tce)
    }
 }

+static uint64_t *spapr_tce_alloc_table(uint32_t liobn,
+                                       uint32_t page_shift,
+                                       uint32_t nb_table,
+                                       int *fd,
+                                       bool need_vfio)
+{
+    uint64_t *table = NULL;
+    uint64_t window_size = (uint64_t)nb_table << page_shift;
+
+    if (kvm_enabled() && !(window_size >> 32)) {
+        table = kvmppc_create_spapr_tce(liobn, window_size, fd, need_vfio);
+    }
+
+    if (!table) {
+        *fd = -1;
+        table = g_malloc0(nb_table * sizeof(uint64_t));
+    }
+
+    trace_spapr_iommu_new_table(liobn, table, *fd);
+
+    return table;
+}
+
+static void spapr_tce_free_table(uint64_t *table, int fd, uint32_t nb_table)
+{
+    if (!kvm_enabled() ||
+        (kvmppc_remove_spapr_tce(table, fd, nb_table) != 0)) {
+        g_free(table);
+    }
+}
+
 /* Called from RCU critical section */
 static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr,
                                               bool is_write)
@@ -142,21 +173,13 @@ static MemoryRegionIOMMUOps spapr_iommu_ops = {
 static int spapr_tce_table_realize(DeviceState *dev)
 {
    sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev);
-    uint64_t window_size = (uint64_t)tcet->nb_table << tcet->page_shift;

-    if (kvm_enabled() && !(window_size >> 32)) {
-        tcet->table = kvmppc_create_spapr_tce(tcet->liobn,
-                                              window_size,
-                                              &tcet->fd,
-                                              tcet->need_vfio);
-    }
-
-    if (!tcet->table) {
-        size_t table_size = tcet->nb_table * sizeof(uint64_t);
-        tcet->table = g_malloc0(table_size);
-    }
-
-    trace_spapr_iommu_new_table(tcet->liobn, tcet, tcet->table, tcet->fd);
+    tcet->fd = -1;
+    tcet->table = spapr_tce_alloc_table(tcet->liobn,
+                                        tcet->page_shift,
+                                        tcet->nb_table,
+                                        &tcet->fd,
+                                        tcet->need_vfio);

    memory_region_init_iommu(&tcet->iommu, OBJECT(dev), &spapr_iommu_ops,
                             "iommu-spapr",
@@ -242,11 +265,8 @@ static void spapr_tce_table_unrealize(DeviceState *dev, Error **errp)

    QLIST_REMOVE(tcet, list);

-    if (!kvm_enabled() ||
-        (kvmppc_remove_spapr_tce(tcet->table, tcet->fd,
-                                 tcet->nb_table) != 0)) {
-        g_free(tcet->table);
-    }
+    spapr_tce_free_table(tcet->table, tcet->fd, tcet->nb_table);
+    tcet->fd = -1;
 }

 MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet)
@@ -278,7 +298,7 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
    tcet->table[index] = tce;

    entry.target_as = &address_space_memory,
-    entry.iova = ioba & page_mask;
+    entry.iova = (ioba - tcet->bus_offset) & page_mask;
    entry.translated_addr = tce & page_mask;
    entry.addr_mask = ~page_mask;
    entry.perm = spapr_tce_iommu_access_flags(tce);
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1093,13 +1093,11 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc,
        spapr_tce_set_need_vfio(tcet, true);
    }

-    if (dev->hotplugged) {
-        fdt = create_device_tree(&fdt_size);
-        fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0);
-        if (!fdt_start_offset) {
-            error_setg(errp, "Failed to create pci child device tree node");
-            goto out;
-        }
+    fdt = create_device_tree(&fdt_size);
+    fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0);
+    if (!fdt_start_offset) {
+        error_setg(errp, "Failed to create pci child device tree node");
+        goto out;
    }

    drck->attach(drc, DEVICE(pdev),
@@ -1816,7 +1814,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
    _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
                     sizeof(interrupt_map)));

-    tcet = spapr_tce_find_by_liobn(SPAPR_PCI_LIOBN(phb->index, 0));
+    tcet = spapr_tce_find_by_liobn(phb->dma_liobn);
    if (!tcet) {
        return -1;
    }
--- a/hw/s390x/s390-skeys.c
+++ b/hw/s390x/s390-skeys.c
@@ -47,15 +47,11 @@ void s390_skeys_init(void)
    qdev_init_nofail(DEVICE(obj));
 }

-static void write_keys(QEMUFile *f, uint8_t *keys, uint64_t startgfn,
+static void write_keys(FILE *f, uint8_t *keys, uint64_t startgfn,
                       uint64_t count, Error **errp)
 {
    uint64_t curpage = startgfn;
    uint64_t maxpage = curpage + count - 1;
-    const char *fmt = "page=%03" PRIx64 ": key(%d) => ACC=%X, FP=%d, REF=%d,"
-                      " ch=%d, reserved=%d\n";
-    char buf[128];
-    int len;

    for (; curpage <= maxpage; curpage++) {
        uint8_t acc = (*keys & 0xF0) >> 4;
@@ -64,10 +60,9 @@ static void write_keys(QEMUFile *f, uint8_t *keys, uint64_t startgfn,
        int ch = (*keys & 0x02);
        int res = (*keys & 0x01);

-        len = snprintf(buf, sizeof(buf), fmt, curpage,
-                       *keys, acc, fp, ref, ch, res);
-        assert(len < sizeof(buf));
-        qemu_put_buffer(f, (uint8_t *)buf, len);
+        fprintf(f, "page=%03" PRIx64 ": key(%d) => ACC=%X, FP=%d, REF=%d,"
+                " ch=%d, reserved=%d\n",
+                curpage, *keys, acc, fp, ref, ch, res);
        keys++;
    }
 }
@@ -116,7 +111,8 @@ void qmp_dump_skeys(const char *filename, Error **errp)
    vaddr cur_gfn = 0;
    uint8_t *buf;
    int ret;
-    QEMUFile *f;
+    int fd;
+    FILE *f;

    /* Quick check to see if guest is using storage keys*/
    if (!skeyclass->skeys_enabled(ss)) {
@@ -125,8 +121,14 @@ void qmp_dump_skeys(const char *filename, Error **errp)
        return;
    }

-    f = qemu_fopen(filename, "wb");
+    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0600);
+    if (fd < 0) {
+        error_setg_file_open(errp, errno, filename);
+        return;
+    }
+    f = fdopen(fd, "wb");
    if (!f) {
+        close(fd);
        error_setg_file_open(errp, errno, filename);
        return;
    }
@@ -162,7 +164,7 @@ out_free:
    error_propagate(errp, lerr);
    g_free(buf);
 out:
-    qemu_fclose(f);
+    fclose(f);
 }

 static void qemu_s390_skeys_init(Object *obj)
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -650,7 +650,9 @@ static int megasas_init_firmware(MegasasState *s, MegasasCmd *cmd)
    pa_hi = le32_to_cpu(initq->pi_addr_hi);
    s->producer_pa = ((uint64_t) pa_hi << 32) | pa_lo;
    s->reply_queue_head = ldl_le_pci_dma(pcid, s->producer_pa);
+    s->reply_queue_head %= MEGASAS_MAX_FRAMES;
    s->reply_queue_tail = ldl_le_pci_dma(pcid, s->consumer_pa);
+    s->reply_queue_tail %= MEGASAS_MAX_FRAMES;
    flags = le32_to_cpu(initq->flags);
    if (flags & MFI_QUEUE_FLAG_CONTEXT64) {
        s->flags |= MEGASAS_MASK_USE_QUEUE64;
@@ -1293,7 +1295,7 @@ static int megasas_dcmd_ld_get_info(MegasasState *s, MegasasCmd *cmd)

 static int megasas_dcmd_cfg_read(MegasasState *s, MegasasCmd *cmd)
 {
-    uint8_t data[4096];
+    uint8_t data[4096] = { 0 };
    struct mfi_config_data *info;
    int num_pd_disks = 0, array_offset, ld_offset;
    BusChild *kid;
@@ -1446,7 +1448,7 @@ static int megasas_dcmd_set_properties(MegasasState *s, MegasasCmd *cmd)
                                            dcmd_size);
        return MFI_STAT_INVALID_PARAMETER;
    }
-    dma_buf_write((uint8_t *)&info, cmd->iov_size, &cmd->qsg);
+    dma_buf_write((uint8_t *)&info, dcmd_size, &cmd->qsg);
    trace_megasas_dcmd_unsupported(cmd->index, cmd->iov_size);
    return MFI_STAT_OK;
 }
--- a/hw/scsi/mptsas.c
+++ b/hw/scsi/mptsas.c
@@ -754,11 +754,6 @@ static void mptsas_fetch_request(MPTSASState *s)
    hwaddr addr;
    int size;

-    if (s->state != MPI_IOC_STATE_OPERATIONAL) {
-        mptsas_set_fault(s, MPI_IOCSTATUS_INVALID_STATE);
-        return;
-    }
-
    /* Read the message header from the guest first. */
    addr = s->host_mfa_high_addr | MPTSAS_FIFO_GET(s, request_post);
    pci_dma_read(pci, addr, req, sizeof(hdr));
@@ -789,6 +784,10 @@ static void mptsas_fetch_requests(void *opaque)
 {
    MPTSASState *s = opaque;

+    if (s->state != MPI_IOC_STATE_OPERATIONAL) {
+        mptsas_set_fault(s, MPI_IOCSTATUS_INVALID_STATE);
+        return;
+    }
    while (!MPTSAS_FIFO_EMPTY(s, request_post)) {
        mptsas_fetch_request(s);
    }
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -53,7 +53,21 @@ do { printf("scsi-disk: " fmt , ## __VA_ARGS__); } while (0)
 #define DEFAULT_MAX_UNMAP_SIZE      (1 << 30)   /* 1 GB */
 #define DEFAULT_MAX_IO_SIZE         INT_MAX     /* 2 GB - 1 block */

-typedef struct SCSIDiskState SCSIDiskState;
+#define TYPE_SCSI_DISK_BASE         "scsi-disk-base"
+
+#define SCSI_DISK_BASE(obj) \
+     OBJECT_CHECK(SCSIDiskState, (obj), TYPE_SCSI_DISK_BASE)
+#define SCSI_DISK_BASE_CLASS(klass) \
+     OBJECT_CLASS_CHECK(SCSIDiskClass, (klass), TYPE_SCSI_DISK_BASE)
+#define SCSI_DISK_BASE_GET_CLASS(obj) \
+     OBJECT_GET_CLASS(SCSIDiskClass, (obj), TYPE_SCSI_DISK_BASE)
+
+typedef struct SCSIDiskClass {
+    SCSIDeviceClass parent_class;
+    DMAIOFunc       *dma_readv;
+    DMAIOFunc       *dma_writev;
+    bool            (*need_fua_emulation)(SCSICommand *cmd);
+} SCSIDiskClass;

 typedef struct SCSIDiskReq {
    SCSIRequest req;
@@ -62,16 +76,18 @@ typedef struct SCSIDiskReq {
    uint32_t sector_count;
    uint32_t buflen;
    bool started;
+    bool need_fua_emulation;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockAcctCookie acct;
+    unsigned char *status;
 } SCSIDiskReq;

 #define SCSI_DISK_F_REMOVABLE             0
 #define SCSI_DISK_F_DPOFUA                1
 #define SCSI_DISK_F_NO_REMOVABLE_DEVOPS   2

-struct SCSIDiskState
+typedef struct SCSIDiskState
 {
    SCSIDevice qdev;
    uint32_t features;
@@ -88,7 +104,7 @@ struct SCSIDiskState
    char *product;
    bool tray_open;
    bool tray_locked;
-};
+} SCSIDiskState;

 static int scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed);

@@ -161,6 +177,29 @@ static void scsi_disk_load_request(QEMUFile *f, SCSIRequest *req)
    qemu_iovec_init_external(&r->qiov, &r->iov, 1);
 }

+static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed)
+{
+    if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
+        return true;
+    }
+
+    if (ret < 0) {
+        return scsi_handle_rw_error(r, -ret, acct_failed);
+    }
+
+    if (r->status && *r->status) {
+        if (acct_failed) {
+            SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+            block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+        }
+        scsi_req_complete(&r->req, *r->status);
+        return true;
+    }
+
+    return false;
+}
+
 static void scsi_aio_complete(void *opaque, int ret)
 {
    SCSIDiskReq *r = (SCSIDiskReq *)opaque;
@@ -168,17 +207,10 @@ static void scsi_aio_complete(void *opaque, int ret)

    assert(r->req.aiocb != NULL);
    r->req.aiocb = NULL;
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, true)) {
        goto done;
    }

-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, true)) {
-            goto done;
-        }
-    }
-
    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
    scsi_req_complete(&r->req, GOOD);

@@ -217,13 +249,9 @@ static void scsi_write_do_fua(SCSIDiskReq *r)
    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);

    assert(r->req.aiocb == NULL);
+    assert(!r->req.io_canceled);

-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
-        goto done;
-    }
-
-    if (scsi_is_cmd_fua(&r->req.cmd)) {
+    if (r->need_fua_emulation) {
        block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
                         BLOCK_ACCT_FLUSH);
        r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_aio_complete, r);
@@ -231,26 +259,16 @@ static void scsi_write_do_fua(SCSIDiskReq *r)
    }

    scsi_req_complete(&r->req, GOOD);
-
-done:
    scsi_req_unref(&r->req);
 }

 static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret)
 {
    assert(r->req.aiocb == NULL);
-
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, false)) {
        goto done;
    }

-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, false)) {
-            goto done;
-        }
-    }
-
    r->sector += r->sector_count;
    r->sector_count = 0;
    if (r->req.cmd.mode == SCSI_XFER_TO_DEV) {
@@ -288,17 +306,10 @@ static void scsi_read_complete(void * opaque, int ret)

    assert(r->req.aiocb != NULL);
    r->req.aiocb = NULL;
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, true)) {
        goto done;
    }

-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, true)) {
-            goto done;
-        }
-    }
-
    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
    DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->qiov.size);

@@ -315,35 +326,29 @@ done:
 static void scsi_do_read(SCSIDiskReq *r, int ret)
 {
    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s));

    assert (r->req.aiocb == NULL);
-
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, false)) {
        goto done;
    }

-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, false)) {
-            goto done;
-        }
-    }
-
    /* The request is used as the AIO opaque value, so add a ref.  */
    scsi_req_ref(&r->req);

    if (r->req.sg) {
        dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_READ);
        r->req.resid -= r->req.sg->size;
-        r->req.aiocb = dma_blk_read(s->qdev.conf.blk, r->req.sg, r->sector,
-                                    scsi_dma_complete, r);
+        r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk),
+                                  r->req.sg, r->sector << BDRV_SECTOR_BITS,
+                                  sdc->dma_readv, r, scsi_dma_complete, r,
+                                  DMA_DIRECTION_FROM_DEVICE);
    } else {
        scsi_init_iovec(r, SCSI_DMA_BUF_SIZE);
        block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
                         r->qiov.size, BLOCK_ACCT_READ);
-        r->req.aiocb = blk_aio_preadv(s->qdev.conf.blk,
-                                      r->sector << BDRV_SECTOR_BITS, &r->qiov,
-                                      0, scsi_read_complete, r);
+        r->req.aiocb = sdc->dma_readv(r->sector, &r->qiov,
+                                      scsi_read_complete, r, r);
    }

 done:
@@ -398,7 +403,7 @@ static void scsi_read_data(SCSIRequest *req)

    first = !r->started;
    r->started = true;
-    if (first && scsi_is_cmd_fua(&r->req.cmd)) {
+    if (first && r->need_fua_emulation) {
        block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
                         BLOCK_ACCT_FLUSH);
        r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r);
@@ -455,18 +460,10 @@ static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
    uint32_t n;

    assert (r->req.aiocb == NULL);
-
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, false)) {
        goto done;
    }

-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, false)) {
-            goto done;
-        }
-    }
-
    n = r->qiov.size / 512;
    r->sector += n;
    r->sector_count -= n;
@@ -503,6 +500,7 @@ static void scsi_write_data(SCSIRequest *req)
 {
    SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s));

    /* No data transfer may already be in progress */
    assert(r->req.aiocb == NULL);
@@ -539,14 +537,15 @@ static void scsi_write_data(SCSIRequest *req)
    if (r->req.sg) {
        dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_WRITE);
        r->req.resid -= r->req.sg->size;
-        r->req.aiocb = dma_blk_write(s->qdev.conf.blk, r->req.sg, r->sector,
-                                     scsi_dma_complete, r);
+        r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk),
+                                  r->req.sg, r->sector << BDRV_SECTOR_BITS,
+                                  sdc->dma_writev, r, scsi_dma_complete, r,
+                                  DMA_DIRECTION_TO_DEVICE);
    } else {
        block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
                         r->qiov.size, BLOCK_ACCT_WRITE);
-        r->req.aiocb = blk_aio_pwritev(s->qdev.conf.blk,
-                                       r->sector << BDRV_SECTOR_BITS, &r->qiov,
-                                       0, scsi_write_complete, r);
+        r->req.aiocb = sdc->dma_writev(r->sector << BDRV_SECTOR_BITS, &r->qiov,
+                                       scsi_write_complete, r, r);
    }
 }

@@ -1598,18 +1597,10 @@ static void scsi_unmap_complete_noio(UnmapCBData *data, int ret)
    uint32_t nb_sectors;

    assert(r->req.aiocb == NULL);
-
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, false)) {
        goto done;
    }

-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, false)) {
-            goto done;
-        }
-    }
-
    if (data->count > 0) {
        sector_num = ldq_be_p(&data->inbuf[0]);
        nb_sectors = ldl_be_p(&data->inbuf[8]) & 0xffffffffULL;
@@ -1709,17 +1700,10 @@ static void scsi_write_same_complete(void *opaque, int ret)

    assert(r->req.aiocb != NULL);
    r->req.aiocb = NULL;
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, true)) {
        goto done;
    }

-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, true)) {
-            goto done;
-        }
-    }
-
    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);

    data->nb_sectors -= data->iov.iov_len / 512;
@@ -1778,7 +1762,7 @@ static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
        block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
                         nb_sectors * s->qdev.blocksize,
                        BLOCK_ACCT_WRITE);
-        r->req.aiocb = blk_aio_write_zeroes(s->qdev.conf.blk,
+        r->req.aiocb = blk_aio_pwrite_zeroes(s->qdev.conf.blk,
                                r->req.cmd.lba * s->qdev.blocksize,
                                nb_sectors * s->qdev.blocksize,
                                flags, scsi_aio_complete, r);
@@ -2136,6 +2120,7 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
 {
    SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
+    SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s));
    uint32_t len;
    uint8_t command;

@@ -2194,6 +2179,7 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
        scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
        return 0;
    }
+    r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd);
    if (r->sector_count == 0) {
        scsi_req_complete(&r->req, GOOD);
    }
@@ -2576,16 +2562,145 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
    scsi_generic_read_device_identification(&s->qdev);
 }

+typedef struct SCSIBlockReq {
+    SCSIDiskReq req;
+    sg_io_hdr_t io_header;
+
+    /* Selected bytes of the original CDB, copied into our own CDB.  */
+    uint8_t cmd, cdb1, group_number;
+
+    /* CDB passed to SG_IO.  */
+    uint8_t cdb[16];
+} SCSIBlockReq;
+
+static BlockAIOCB *scsi_block_do_sgio(SCSIBlockReq *req,
+                                      int64_t offset, QEMUIOVector *iov,
+                                      int direction,
+                                      BlockCompletionFunc *cb, void *opaque)
+{
+    sg_io_hdr_t *io_header = &req->io_header;
+    SCSIDiskReq *r = &req->req;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    int nb_logical_blocks;
+    uint64_t lba;
+    BlockAIOCB *aiocb;
+
+    /* This is not supported yet.  It can only happen if the guest does
+     * reads and writes that are not aligned to one logical sectors
+     * _and_ cover multiple MemoryRegions.
+     */
+    assert(offset % s->qdev.blocksize == 0);
+    assert(iov->size % s->qdev.blocksize == 0);
+
+    io_header->interface_id = 'S';
+
+    /* The data transfer comes from the QEMUIOVector.  */
+    io_header->dxfer_direction = direction;
+    io_header->dxfer_len = iov->size;
+    io_header->dxferp = (void *)iov->iov;
+    io_header->iovec_count = iov->niov;
+    assert(io_header->iovec_count == iov->niov); /* no overflow! */
+
+    /* Build a new CDB with the LBA and length patched in, in case
+     * DMA helpers split the transfer in multiple segments.  Do not
+     * build a CDB smaller than what the guest wanted, and only build
+     * a larger one if strictly necessary.
+     */
+    io_header->cmdp = req->cdb;
+    lba = offset / s->qdev.blocksize;
+    nb_logical_blocks = io_header->dxfer_len / s->qdev.blocksize;
+
+    if ((req->cmd >> 5) == 0 && lba <= 0x1ffff) {
+        /* 6-byte CDB */
+        stl_be_p(&req->cdb[0], lba | (req->cmd << 24));
+        req->cdb[4] = nb_logical_blocks;
+        req->cdb[5] = 0;
+        io_header->cmd_len = 6;
+    } else if ((req->cmd >> 5) <= 1 && lba <= 0xffffffffULL) {
+        /* 10-byte CDB */
+        req->cdb[0] = (req->cmd & 0x1f) | 0x20;
+        req->cdb[1] = req->cdb1;
+        stl_be_p(&req->cdb[2], lba);
+        req->cdb[6] = req->group_number;
+        stw_be_p(&req->cdb[7], nb_logical_blocks);
+        req->cdb[9] = 0;
+        io_header->cmd_len = 10;
+    } else if ((req->cmd >> 5) != 4 && lba <= 0xffffffffULL) {
+        /* 12-byte CDB */
+        req->cdb[0] = (req->cmd & 0x1f) | 0xA0;
+        req->cdb[1] = req->cdb1;
+        stl_be_p(&req->cdb[2], lba);
+        stl_be_p(&req->cdb[6], nb_logical_blocks);
+        req->cdb[10] = req->group_number;
+        req->cdb[11] = 0;
+        io_header->cmd_len = 12;
+    } else {
+        /* 16-byte CDB */
+        req->cdb[0] = (req->cmd & 0x1f) | 0x80;
+        req->cdb[1] = req->cdb1;
+        stq_be_p(&req->cdb[2], lba);
+        stl_be_p(&req->cdb[10], nb_logical_blocks);
+        req->cdb[14] = req->group_number;
+        req->cdb[15] = 0;
+        io_header->cmd_len = 16;
+    }
+
+    /* The rest is as in scsi-generic.c.  */
+    io_header->mx_sb_len = sizeof(r->req.sense);
+    io_header->sbp = r->req.sense;
+    io_header->timeout = UINT_MAX;
+    io_header->usr_ptr = r;
+    io_header->flags |= SG_FLAG_DIRECT_IO;
+
+    aiocb = blk_aio_ioctl(s->qdev.conf.blk, SG_IO, io_header, cb, opaque);
+    assert(aiocb != NULL);
+    return aiocb;
+}
+
+static bool scsi_block_no_fua(SCSICommand *cmd)
+{
+    return false;
+}
+
+static BlockAIOCB *scsi_block_dma_readv(int64_t offset,
+                                        QEMUIOVector *iov,
+                                        BlockCompletionFunc *cb, void *cb_opaque,
+                                        void *opaque)
+{
+    SCSIBlockReq *r = opaque;
+    return scsi_block_do_sgio(r, offset, iov,
+                              SG_DXFER_FROM_DEV, cb, cb_opaque);
+}
+
+static BlockAIOCB *scsi_block_dma_writev(int64_t offset,
+                                         QEMUIOVector *iov,
+                                         BlockCompletionFunc *cb, void *cb_opaque,
+                                         void *opaque)
+{
+    SCSIBlockReq *r = opaque;
+    return scsi_block_do_sgio(r, offset, iov,
+                              SG_DXFER_TO_DEV, cb, cb_opaque);
+}
+
 static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
 {
    switch (buf[0]) {
+    case VERIFY_10:
+    case VERIFY_12:
+    case VERIFY_16:
+        /* Check if BYTCHK == 0x01 (data-out buffer contains data
+         * for the number of logical blocks specified in the length
+         * field).  For other modes, do not use scatter/gather operation.
+         */
+        if ((buf[1] & 6) != 2) {
+            return false;
+        }
+        break;
+
    case READ_6:
    case READ_10:
    case READ_12:
    case READ_16:
-    case VERIFY_10:
-    case VERIFY_12:
-    case VERIFY_16:
    case WRITE_6:
    case WRITE_10:
    case WRITE_12:
@@ -2593,21 +2708,8 @@ static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
    case WRITE_VERIFY_10:
    case WRITE_VERIFY_12:
    case WRITE_VERIFY_16:
-        /* If we are not using O_DIRECT, we might read stale data from the
-         * host cache if writes were made using other commands than these
-         * ones (such as WRITE SAME or EXTENDED COPY, etc.).  So, without
-         * O_DIRECT everything must go through SG_IO.
-         */
-        if (!(blk_get_flags(s->qdev.conf.blk) & BDRV_O_NOCACHE)) {
-            break;
-        }
-
-        /* MMC writing cannot be done via pread/pwrite, because it sometimes
+        /* MMC writing cannot be done via DMA helpers, because it sometimes
         * involves writing beyond the maximum LBA or to negative LBA (lead-in).
-         * And once you do these writes, reading from the block device is
-         * unreliable, too.  It is even possible that reads deliver random data
-         * from the host page cache (this is probably a Linux bug).
-         *
         * We might use scsi_disk_dma_reqops as long as no writing commands are
         * seen, but performance usually isn't paramount on optical media.  So,
         * just make scsi-block operate the same as scsi-generic for them.
@@ -2625,6 +2727,54 @@ static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
 }


+static int32_t scsi_block_dma_command(SCSIRequest *req, uint8_t *buf)
+{
+    SCSIBlockReq *r = (SCSIBlockReq *)req;
+    r->cmd = req->cmd.buf[0];
+    switch (r->cmd >> 5) {
+    case 0:
+        /* 6-byte CDB.  */
+        r->cdb1 = r->group_number = 0;
+        break;
+    case 1:
+        /* 10-byte CDB.  */
+        r->cdb1 = req->cmd.buf[1];
+        r->group_number = req->cmd.buf[6];
+    case 4:
+        /* 12-byte CDB.  */
+        r->cdb1 = req->cmd.buf[1];
+        r->group_number = req->cmd.buf[10];
+        break;
+    case 5:
+        /* 16-byte CDB.  */
+        r->cdb1 = req->cmd.buf[1];
+        r->group_number = req->cmd.buf[14];
+        break;
+    default:
+        abort();
+    }
+
+    if (r->cdb1 & 0xe0) {
+        /* Protection information is not supported.  */
+        scsi_check_condition(&r->req, SENSE_CODE(INVALID_FIELD));
+        return 0;
+    }
+
+    r->req.status = &r->io_header.status;
+    return scsi_disk_dma_command(req, buf);
+}
+
+static const SCSIReqOps scsi_block_dma_reqops = {
+    .size         = sizeof(SCSIBlockReq),
+    .free_req     = scsi_free_request,
+    .send_command = scsi_block_dma_command,
+    .read_data    = scsi_read_data,
+    .write_data   = scsi_write_data,
+    .get_buf      = scsi_get_buf,
+    .load_request = scsi_disk_load_request,
+    .save_request = scsi_disk_save_request,
+};
+
 static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag,
                                           uint32_t lun, uint8_t *buf,
                                           void *hba_private)
@@ -2635,7 +2785,7 @@ static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag,
        return scsi_req_alloc(&scsi_generic_req_ops, &s->qdev, tag, lun,
                              hba_private);
    } else {
-        return scsi_req_alloc(&scsi_disk_dma_reqops, &s->qdev, tag, lun,
+        return scsi_req_alloc(&scsi_block_dma_reqops, &s->qdev, tag, lun,
                              hba_private);
    }
 }
@@ -2654,6 +2804,46 @@ static int scsi_block_parse_cdb(SCSIDevice *d, SCSICommand *cmd,

 #endif

+static
+BlockAIOCB *scsi_dma_readv(int64_t offset, QEMUIOVector *iov,
+                           BlockCompletionFunc *cb, void *cb_opaque,
+                           void *opaque)
+{
+    SCSIDiskReq *r = opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    return blk_aio_preadv(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
+}
+
+static
+BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov,
+                            BlockCompletionFunc *cb, void *cb_opaque,
+                            void *opaque)
+{
+    SCSIDiskReq *r = opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
+}
+
+static void scsi_disk_base_class_initfn(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    SCSIDiskClass *sdc = SCSI_DISK_BASE_CLASS(klass);
+
+    dc->fw_name = "disk";
+    dc->reset = scsi_disk_reset;
+    sdc->dma_readv = scsi_dma_readv;
+    sdc->dma_writev = scsi_dma_writev;
+    sdc->need_fua_emulation = scsi_is_cmd_fua;
+}
+
+static const TypeInfo scsi_disk_base_info = {
+    .name          = TYPE_SCSI_DISK_BASE,
+    .parent        = TYPE_SCSI_DEVICE,
+    .class_init    = scsi_disk_base_class_initfn,
+    .instance_size = sizeof(SCSIDiskState),
+    .class_size    = sizeof(SCSIDiskClass),
+};
+
 #define DEFINE_SCSI_DISK_PROPERTIES()                                \
    DEFINE_BLOCK_PROPERTIES(SCSIDiskState, qdev.conf),               \
    DEFINE_PROP_STRING("ver", SCSIDiskState, version),               \
@@ -2701,17 +2891,14 @@ static void scsi_hd_class_initfn(ObjectClass *klass, void *data)
    sc->realize      = scsi_hd_realize;
    sc->alloc_req    = scsi_new_request;
    sc->unit_attention_reported = scsi_disk_unit_attention_reported;
-    dc->fw_name = "disk";
    dc->desc = "virtual SCSI disk";
-    dc->reset = scsi_disk_reset;
    dc->props = scsi_hd_properties;
    dc->vmsd  = &vmstate_scsi_disk_state;
 }

 static const TypeInfo scsi_hd_info = {
    .name          = "scsi-hd",
-    .parent        = TYPE_SCSI_DEVICE,
-    .instance_size = sizeof(SCSIDiskState),
+    .parent        = TYPE_SCSI_DISK_BASE,
    .class_init    = scsi_hd_class_initfn,
 };

@@ -2733,17 +2920,14 @@ static void scsi_cd_class_initfn(ObjectClass *klass, void *data)
    sc->realize      = scsi_cd_realize;
    sc->alloc_req    = scsi_new_request;
    sc->unit_attention_reported = scsi_disk_unit_attention_reported;
-    dc->fw_name = "disk";
    dc->desc = "virtual SCSI CD-ROM";
-    dc->reset = scsi_disk_reset;
    dc->props = scsi_cd_properties;
    dc->vmsd  = &vmstate_scsi_disk_state;
 }

 static const TypeInfo scsi_cd_info = {
    .name          = "scsi-cd",
-    .parent        = TYPE_SCSI_DEVICE,
-    .instance_size = sizeof(SCSIDiskState),
+    .parent        = TYPE_SCSI_DISK_BASE,
    .class_init    = scsi_cd_class_initfn,
 };

@@ -2757,21 +2941,22 @@ static void scsi_block_class_initfn(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
    SCSIDeviceClass *sc = SCSI_DEVICE_CLASS(klass);
+    SCSIDiskClass *sdc = SCSI_DISK_BASE_CLASS(klass);

    sc->realize      = scsi_block_realize;
    sc->alloc_req    = scsi_block_new_request;
    sc->parse_cdb    = scsi_block_parse_cdb;
-    dc->fw_name = "disk";
+    sdc->dma_readv   = scsi_block_dma_readv;
+    sdc->dma_writev  = scsi_block_dma_writev;
+    sdc->need_fua_emulation = scsi_block_no_fua;
    dc->desc = "SCSI block device passthrough";
-    dc->reset = scsi_disk_reset;
    dc->props = scsi_block_properties;
    dc->vmsd  = &vmstate_scsi_disk_state;
 }

 static const TypeInfo scsi_block_info = {
    .name          = "scsi-block",
-    .parent        = TYPE_SCSI_DEVICE,
-    .instance_size = sizeof(SCSIDiskState),
+    .parent        = TYPE_SCSI_DISK_BASE,
    .class_init    = scsi_block_class_initfn,
 };
 #endif
@@ -2809,13 +2994,13 @@ static void scsi_disk_class_initfn(ObjectClass *klass, void *data)

 static const TypeInfo scsi_disk_info = {
    .name          = "scsi-disk",
-    .parent        = TYPE_SCSI_DEVICE,
-    .instance_size = sizeof(SCSIDiskState),
+    .parent        = TYPE_SCSI_DISK_BASE,
    .class_init    = scsi_disk_class_initfn,
 };

 static void scsi_disk_register_types(void)
 {
+    type_register_static(&scsi_disk_base_info);
    type_register_static(&scsi_hd_info);
    type_register_static(&scsi_cd_info);
 #ifdef __linux__
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -222,6 +222,18 @@ static void scsi_read_complete(void * opaque, int ret)
            r->buf[3] |= 0x80;
        }
    }
+    if (s->type == TYPE_DISK &&
+        r->req.cmd.buf[0] == INQUIRY &&
+        r->req.cmd.buf[2] == 0xb0) {
+        uint32_t max_xfer_len = blk_get_max_transfer_length(s->conf.blk);
+        if (max_xfer_len) {
+            stl_be_p(&r->buf[8], max_xfer_len);
+            /* Also take care of the opt xfer len. */
+            if (ldl_be_p(&r->buf[12]) > max_xfer_len) {
+                stl_be_p(&r->buf[12], max_xfer_len);
+            }
+        }
+    }
    scsi_req_data(&r->req, len);
    scsi_req_unref(&r->req);
 }
--- a/hw/scsi/vmw_pvscsi.c
+++ b/hw/scsi/vmw_pvscsi.c
@@ -153,7 +153,7 @@ pvscsi_log2(uint32_t input)
    return log;
 }

-static void
+static int
 pvscsi_ring_init_data(PVSCSIRingInfo *m, PVSCSICmdDescSetupRings *ri)
 {
    int i;
@@ -161,6 +161,10 @@ pvscsi_ring_init_data(PVSCSIRingInfo *m, PVSCSICmdDescSetupRings *ri)
    uint32_t req_ring_size, cmp_ring_size;
    m->rs_pa = ri->ringsStatePPN << VMW_PAGE_SHIFT;

+    if ((ri->reqRingNumPages > PVSCSI_SETUP_RINGS_MAX_NUM_PAGES)
+        || (ri->cmpRingNumPages > PVSCSI_SETUP_RINGS_MAX_NUM_PAGES)) {
+        return -1;
+    }
    req_ring_size = ri->reqRingNumPages * PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE;
    cmp_ring_size = ri->cmpRingNumPages * PVSCSI_MAX_NUM_CMP_ENTRIES_PER_PAGE;
    txr_len_log2 = pvscsi_log2(req_ring_size - 1);
@@ -192,15 +196,20 @@ pvscsi_ring_init_data(PVSCSIRingInfo *m, PVSCSICmdDescSetupRings *ri)

    /* Flush ring state page changes */
    smp_wmb();
+
+    return 0;
 }

-static void
+static int
 pvscsi_ring_init_msg(PVSCSIRingInfo *m, PVSCSICmdDescSetupMsgRing *ri)
 {
    int i;
    uint32_t len_log2;
    uint32_t ring_size;

+    if (ri->numPages > PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES) {
+        return -1;
+    }
    ring_size = ri->numPages * PVSCSI_MAX_NUM_MSG_ENTRIES_PER_PAGE;
    len_log2 = pvscsi_log2(ring_size - 1);

@@ -220,6 +229,8 @@ pvscsi_ring_init_msg(PVSCSIRingInfo *m, PVSCSICmdDescSetupMsgRing *ri)

    /* Flush ring state page changes */
    smp_wmb();
+
+    return 0;
 }

 static void
@@ -770,7 +781,10 @@ pvscsi_on_cmd_setup_rings(PVSCSIState *s)
    trace_pvscsi_on_cmd_arrived("PVSCSI_CMD_SETUP_RINGS");

    pvscsi_dbg_dump_tx_rings_config(rc);
-    pvscsi_ring_init_data(&s->rings, rc);
+    if (pvscsi_ring_init_data(&s->rings, rc) < 0) {
+        return PVSCSI_COMMAND_PROCESSING_FAILED;
+    }
+
    s->rings_info_valid = TRUE;
    return PVSCSI_COMMAND_PROCESSING_SUCCEEDED;
 }
@@ -850,7 +864,9 @@ pvscsi_on_cmd_setup_msg_ring(PVSCSIState *s)
    }

    if (s->rings_info_valid) {
-        pvscsi_ring_init_msg(&s->rings, rc);
+        if (pvscsi_ring_init_msg(&s->rings, rc) < 0) {
+            return PVSCSI_COMMAND_PROCESSING_FAILED;
+        }
        s->msg_ring_info_valid = TRUE;
    }
    return sizeof(PVSCSICmdDescSetupMsgRing) / sizeof(uint32_t);
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -260,14 +260,20 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainer *container = giommu->container;
    IOMMUTLBEntry *iotlb = data;
+    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    MemoryRegion *mr;
    hwaddr xlat;
    hwaddr len = iotlb->addr_mask + 1;
    void *vaddr;
    int ret;

-    trace_vfio_iommu_map_notify(iotlb->iova,
-                                iotlb->iova + iotlb->addr_mask);
+    trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask);
+
+    if (iotlb->target_as != &address_space_memory) {
+        error_report("Wrong target AS \"%s\", only system memory is allowed",
+                     iotlb->target_as->name ? iotlb->target_as->name : "none");
+        return;
+    }

    /*
     * The IOMMU TLB entry we have just covers translation through
@@ -294,21 +300,21 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        vaddr = memory_region_get_ram_ptr(mr) + xlat;
-        ret = vfio_dma_map(container, iotlb->iova,
+        ret = vfio_dma_map(container, iova,
                           iotlb->addr_mask + 1, vaddr,
                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
        if (ret) {
            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
-                         container, iotlb->iova,
+                         container, iova,
                         iotlb->addr_mask + 1, vaddr, ret);
        }
    } else {
-        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
+        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
-                         container, iotlb->iova,
+                         container, iova,
                         iotlb->addr_mask + 1, ret);
        }
    }
@@ -380,6 +386,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu = section->mr;
+        giommu->iommu_offset = section->offset_within_address_space -
+                               section->offset_within_region;
        giommu->container = container;
        giommu->n.notify = vfio_iommu_map_notify;
        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
@@ -433,6 +441,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
 {
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
+    Int128 llend, llsize;
    int ret;

    if (vfio_listener_skipped_section(section)) {
@@ -471,21 +480,25 @@ static void vfio_listener_region_del(MemoryListener *listener,
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
-    end = (section->offset_within_address_space + int128_get64(section->size)) &
-          TARGET_PAGE_MASK;
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

-    if (iova >= end) {
+    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }
+    end = int128_get64(int128_sub(llend, int128_one()));

-    trace_vfio_listener_region_del(iova, end - 1);
+    llsize = int128_sub(llend, int128_make64(iova));

-    ret = vfio_dma_unmap(container, iova, end - iova);
+    trace_vfio_listener_region_del(iova, end);
+
+    ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
    memory_region_unref(section->mr);
    if (ret) {
        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx") = %d (%m)",
-                     container, iova, end - iova, ret);
+                     container, iova, int128_get64(llsize), ret);
    }
 }

@@ -499,6 +512,54 @@ static void vfio_listener_release(VFIOContainer *container)
    memory_listener_unregister(&container->listener);
 }

+static struct vfio_info_cap_header *
+vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
+{
+    struct vfio_info_cap_header *hdr;
+    void *ptr = info;
+
+    if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
+        return NULL;
+    }
+
+    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
+        if (hdr->id == id) {
+            return hdr;
+        }
+    }
+
+    return NULL;
+}
+
+static void vfio_setup_region_sparse_mmaps(VFIORegion *region,
+                                           struct vfio_region_info *info)
+{
+    struct vfio_info_cap_header *hdr;
+    struct vfio_region_info_cap_sparse_mmap *sparse;
+    int i;
+
+    hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
+    if (!hdr) {
+        return;
+    }
+
+    sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
+
+    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
+                                         region->nr, sparse->nr_areas);
+
+    region->nr_mmaps = sparse->nr_areas;
+    region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+
+    for (i = 0; i < region->nr_mmaps; i++) {
+        region->mmaps[i].offset = sparse->areas[i].offset;
+        region->mmaps[i].size = sparse->areas[i].size;
+        trace_vfio_region_sparse_mmap_entry(i, region->mmaps[i].offset,
+                                            region->mmaps[i].offset +
+                                            region->mmaps[i].size);
+    }
+}
+
 int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                      int index, const char *name)
 {
@@ -525,11 +586,14 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
            region->flags & VFIO_REGION_INFO_FLAG_MMAP &&
            !(region->size & ~qemu_real_host_page_mask)) {

-            region->nr_mmaps = 1;
-            region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+            vfio_setup_region_sparse_mmaps(region, info);

-            region->mmaps[0].offset = 0;
-            region->mmaps[0].size = region->size;
+            if (!region->nr_mmaps) {
+                region->nr_mmaps = 1;
+                region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+                region->mmaps[0].offset = 0;
+                region->mmaps[0].size = region->size;
+            }
        }
    }

@@ -1089,16 +1153,60 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index,
    *info = g_malloc0(argsz);

    (*info)->index = index;
+retry:
    (*info)->argsz = argsz;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
        g_free(*info);
+        *info = NULL;
        return -errno;
    }

+    if ((*info)->argsz > argsz) {
+        argsz = (*info)->argsz;
+        *info = g_realloc(*info, argsz);
+
+        goto retry;
+    }
+
    return 0;
 }

+int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
+                             uint32_t subtype, struct vfio_region_info **info)
+{
+    int i;
+
+    for (i = 0; i < vbasedev->num_regions; i++) {
+        struct vfio_info_cap_header *hdr;
+        struct vfio_region_info_cap_type *cap_type;
+
+        if (vfio_get_region_info(vbasedev, i, info)) {
+            continue;
+        }
+
+        hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
+        if (!hdr) {
+            g_free(*info);
+            continue;
+        }
+
+        cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
+
+        trace_vfio_get_dev_region(vbasedev->name, i,
+                                  cap_type->type, cap_type->subtype);
+
+        if (cap_type->type == type && cap_type->subtype == subtype) {
+            return 0;
+        }
+
+        g_free(*info);
+    }
+
+    *info = NULL;
+    return -ENODEV;
+}
+
 /*
 * Interfaces for IBM EEH (Enhanced Error Handling)
 */
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -11,9 +11,12 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/range.h"
+#include "qapi/error.h"
+#include "hw/nvram/fw_cfg.h"
 #include "pci.h"
 #include "trace.h"
-#include "qemu/range.h"

 /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
 static bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device)
@@ -961,6 +964,643 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
    trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
 }

+/*
+ * Intel IGD support
+ *
+ * Obviously IGD is not a discrete device, this is evidenced not only by it
+ * being integrated into the CPU, but by the various chipset and BIOS
+ * dependencies that it brings along with it.  Intel is trying to move away
+ * from this and Broadwell and newer devices can run in what Intel calls
+ * "Universal Pass-Through" mode, or UPT.  Theoretically in UPT mode, nothing
+ * more is required beyond assigning the IGD device to a VM.  There are
+ * however support limitations to this mode.  It only supports IGD as a
+ * secondary graphics device in the VM and it doesn't officially support any
+ * physical outputs.
+ *
+ * The code here attempts to enable what we'll call legacy mode assignment,
+ * IGD retains most of the capabilities we expect for it to have on bare
+ * metal.  To enable this mode, the IGD device must be assigned to the VM
+ * at PCI address 00:02.0, it must have a ROM, it very likely needs VGA
+ * support, we must have VM BIOS support for reserving and populating some
+ * of the required tables, and we need to tweak the chipset with revisions
+ * and IDs and an LPC/ISA bridge device.  The intention is to make all of
+ * this happen automatically by installing the device at the correct VM PCI
+ * bus address.  If any of the conditions are not met, we cross our fingers
+ * and hope the user knows better.
+ *
+ * NB - It is possible to enable physical outputs in UPT mode by supplying
+ * an OpRegion table.  We don't do this by default because the guest driver
+ * behaves differently if an OpRegion is provided and no monitor is attached
+ * vs no OpRegion and a monitor being attached or not.  Effectively, if a
+ * headless setup is desired, the OpRegion gets in the way of that.
+ */
+
+/*
+ * This presumes the device is already known to be an Intel VGA device, so we
+ * take liberties in which device ID bits match which generation.  This should
+ * not be taken as an indication that all the devices are supported, or even
+ * supportable, some of them don't even support VT-d.
+ * See linux:include/drm/i915_pciids.h for IDs.
+ */
+static int igd_gen(VFIOPCIDevice *vdev)
+{
+    if ((vdev->device_id & 0xfff) == 0xa84) {
+        return 8; /* Broxton */
+    }
+
+    switch (vdev->device_id & 0xff00) {
+    /* Old, untested, unavailable, unknown */
+    case 0x0000:
+    case 0x2500:
+    case 0x2700:
+    case 0x2900:
+    case 0x2a00:
+    case 0x2e00:
+    case 0x3500:
+    case 0xa000:
+        return -1;
+    /* SandyBridge, IvyBridge, ValleyView, Haswell */
+    case 0x0100:
+    case 0x0400:
+    case 0x0a00:
+    case 0x0c00:
+    case 0x0d00:
+    case 0x0f00:
+        return 6;
+    /* BroadWell, CherryView, SkyLake, KabyLake */
+    case 0x1600:
+    case 0x1900:
+    case 0x2200:
+    case 0x5900:
+        return 8;
+    }
+
+    return 8; /* Assume newer is compatible */
+}
+
+typedef struct VFIOIGDQuirk {
+    struct VFIOPCIDevice *vdev;
+    uint32_t index;
+} VFIOIGDQuirk;
+
+#define IGD_GMCH 0x50 /* Graphics Control Register */
+#define IGD_BDSM 0x5c /* Base Data of Stolen Memory */
+#define IGD_ASLS 0xfc /* ASL Storage Register */
+
+/*
+ * The OpRegion includes the Video BIOS Table, which seems important for
+ * telling the driver what sort of outputs it has.  Without this, the device
+ * may work in the guest, but we may not get output.  This also requires BIOS
+ * support to reserve and populate a section of guest memory sufficient for
+ * the table and to write the base address of that memory to the ASLS register
+ * of the IGD device.
+ */
+int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
+                               struct vfio_region_info *info)
+{
+    int ret;
+
+    vdev->igd_opregion = g_malloc0(info->size);
+    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
+                info->size, info->offset);
+    if (ret != info->size) {
+        error_report("vfio: Error reading IGD OpRegion");
+        g_free(vdev->igd_opregion);
+        vdev->igd_opregion = NULL;
+        return -EINVAL;
+    }
+
+    /*
+     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
+     * allocate 32bit reserved memory for, copy these contents into, and write
+     * the reserved memory base address to the device ASLS register at 0xFC.
+     * Alignment of this reserved region seems flexible, but using a 4k page
+     * alignment seems to work well.  This interface assumes a single IGD
+     * device, which may be at VM address 00:02.0 in legacy mode or another
+     * address in UPT mode.
+     *
+     * NB, there may be future use cases discovered where the VM should have
+     * direct interaction with the host OpRegion, in which case the write to
+     * the ASLS register would trigger MemoryRegion setup to enable that.
+     */
+    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
+                    vdev->igd_opregion, info->size);
+
+    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);
+
+    pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
+    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
+    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);
+
+    return 0;
+}
+
+/*
+ * The rather short list of registers that we copy from the host devices.
+ * The LPC/ISA bridge values are definitely needed to support the vBIOS, the
+ * host bridge values may or may not be needed depending on the guest OS.
+ * Since we're only munging revision and subsystem values on the host bridge,
+ * we don't require our own device.  The LPC/ISA bridge needs to be our very
+ * own though.
+ */
+typedef struct {
+    uint8_t offset;
+    uint8_t len;
+} IGDHostInfo;
+
+static const IGDHostInfo igd_host_bridge_infos[] = {
+    {PCI_REVISION_ID,         2},
+    {PCI_SUBSYSTEM_VENDOR_ID, 2},
+    {PCI_SUBSYSTEM_ID,        2},
+};
+
+static const IGDHostInfo igd_lpc_bridge_infos[] = {
+    {PCI_VENDOR_ID,           2},
+    {PCI_DEVICE_ID,           2},
+    {PCI_REVISION_ID,         2},
+    {PCI_SUBSYSTEM_VENDOR_ID, 2},
+    {PCI_SUBSYSTEM_ID,        2},
+};
+
+static int vfio_pci_igd_copy(VFIOPCIDevice *vdev, PCIDevice *pdev,
+                             struct vfio_region_info *info,
+                             const IGDHostInfo *list, int len)
+{
+    int i, ret;
+
+    for (i = 0; i < len; i++) {
+        ret = pread(vdev->vbasedev.fd, pdev->config + list[i].offset,
+                    list[i].len, info->offset + list[i].offset);
+        if (ret != list[i].len) {
+            error_report("IGD copy failed: %m");
+            return -errno;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Stuff a few values into the host bridge.
+ */
+static int vfio_pci_igd_host_init(VFIOPCIDevice *vdev,
+                                  struct vfio_region_info *info)
+{
+    PCIBus *bus;
+    PCIDevice *host_bridge;
+    int ret;
+
+    bus = pci_device_root_bus(&vdev->pdev);
+    host_bridge = pci_find_device(bus, 0, PCI_DEVFN(0, 0));
+
+    if (!host_bridge) {
+        error_report("Can't find host bridge");
+        return -ENODEV;
+    }
+
+    ret = vfio_pci_igd_copy(vdev, host_bridge, info, igd_host_bridge_infos,
+                            ARRAY_SIZE(igd_host_bridge_infos));
+    if (!ret) {
+        trace_vfio_pci_igd_host_bridge_enabled(vdev->vbasedev.name);
+    }
+
+    return ret;
+}
+
+/*
+ * IGD LPC/ISA bridge support code.  The vBIOS needs this, but we can't write
+ * arbitrary values into just any bridge, so we must create our own.  We try
+ * to handle if the user has created it for us, which they might want to do
+ * to enable multifuction so we don't occupy the whole PCI slot.
+ */
+static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp)
+{
+    if (pdev->devfn != PCI_DEVFN(0x1f, 0)) {
+        error_setg(errp, "VFIO dummy ISA/LPC bridge must have address 1f.0");
+    }
+}
+
+static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+    dc->desc = "VFIO dummy ISA/LPC bridge for IGD assignment";
+    dc->hotpluggable = false;
+    k->realize = vfio_pci_igd_lpc_bridge_realize;
+    k->class_id = PCI_CLASS_BRIDGE_ISA;
+}
+
+static TypeInfo vfio_pci_igd_lpc_bridge_info = {
+    .name = "vfio-pci-igd-lpc-bridge",
+    .parent = TYPE_PCI_DEVICE,
+    .class_init = vfio_pci_igd_lpc_bridge_class_init,
+};
+
+static void vfio_pci_igd_register_types(void)
+{
+    type_register_static(&vfio_pci_igd_lpc_bridge_info);
+}
+
+type_init(vfio_pci_igd_register_types)
+
+static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev,
+                                 struct vfio_region_info *info)
+{
+    PCIDevice *lpc_bridge;
+    int ret;
+
+    lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
+                                 0, PCI_DEVFN(0x1f, 0));
+    if (!lpc_bridge) {
+        lpc_bridge = pci_create_simple(pci_device_root_bus(&vdev->pdev),
+                                 PCI_DEVFN(0x1f, 0), "vfio-pci-igd-lpc-bridge");
+    }
+
+    ret = vfio_pci_igd_copy(vdev, lpc_bridge, info, igd_lpc_bridge_infos,
+                            ARRAY_SIZE(igd_lpc_bridge_infos));
+    if (!ret) {
+        trace_vfio_pci_igd_lpc_bridge_enabled(vdev->vbasedev.name);
+    }
+
+    return ret;
+}
+
+/*
+ * IGD Gen8 and newer support up to 8MB for the GTT and use a 64bit PTE
+ * entry, older IGDs use 2MB and 32bit.  Each PTE maps a 4k page.  Therefore
+ * we either have 2M/4k * 4 = 2k or 8M/4k * 8 = 16k as the maximum iobar index
+ * for programming the GTT.
+ *
+ * See linux:include/drm/i915_drm.h for shift and mask values.
+ */
+static int vfio_igd_gtt_max(VFIOPCIDevice *vdev)
+{
+    uint32_t gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch));
+    int ggms, gen = igd_gen(vdev);
+
+    gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch));
+    ggms = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
+    if (gen > 6) {
+        ggms = 1 << ggms;
+    }
+
+    ggms *= 1024 * 1024;
+
+    return (ggms / (4 * 1024)) * (gen < 8 ? 4 : 8);
+}
+
+/*
+ * The IGD ROM will make use of stolen memory (GGMS) for support of VESA modes.
+ * Somehow the host stolen memory range is used for this, but how the ROM gets
+ * it is a mystery, perhaps it's hardcoded into the ROM.  Thankfully though, it
+ * reprograms the GTT through the IOBAR where we can trap it and transpose the
+ * programming to the VM allocated buffer.  That buffer gets reserved by the VM
+ * firmware via the fw_cfg entry added below.  Here we're just monitoring the
+ * IOBAR address and data registers to detect a write sequence targeting the
+ * GTTADR.  This code is developed by observed behavior and doesn't have a
+ * direct spec reference, unfortunately.
+ */
+static uint64_t vfio_igd_quirk_data_read(void *opaque,
+                                         hwaddr addr, unsigned size)
+{
+    VFIOIGDQuirk *igd = opaque;
+    VFIOPCIDevice *vdev = igd->vdev;
+
+    igd->index = ~0;
+
+    return vfio_region_read(&vdev->bars[4].region, addr + 4, size);
+}
+
+static void vfio_igd_quirk_data_write(void *opaque, hwaddr addr,
+                                      uint64_t data, unsigned size)
+{
+    VFIOIGDQuirk *igd = opaque;
+    VFIOPCIDevice *vdev = igd->vdev;
+    uint64_t val = data;
+    int gen = igd_gen(vdev);
+
+    /*
+     * Programming the GGMS starts at index 0x1 and uses every 4th index (ie.
+     * 0x1, 0x5, 0x9, 0xd,...).  For pre-Gen8 each 4-byte write is a whole PTE
+     * entry, with 0th bit enable set.  For Gen8 and up, PTEs are 64bit, so
+     * entries 0x5 & 0xd are the high dword, in our case zero.  Each PTE points
+     * to a 4k page, which we translate to a page from the VM allocated region,
+     * pointed to by the BDSM register.  If this is not set, we fail.
+     *
+     * We trap writes to the full configured GTT size, but we typically only
+     * see the vBIOS writing up to (nearly) the 1MB barrier.  In fact it often
+     * seems to miss the last entry for an even 1MB GTT.  Doing a gratuitous
+     * write of that last entry does work, but is hopefully unnecessary since
+     * we clear the previous GTT on initialization.
+     */
+    if ((igd->index % 4 == 1) && igd->index < vfio_igd_gtt_max(vdev)) {
+        if (gen < 8 || (igd->index % 8 == 1)) {
+            uint32_t base;
+
+            base = pci_get_long(vdev->pdev.config + IGD_BDSM);
+            if (!base) {
+                hw_error("vfio-igd: Guest attempted to program IGD GTT before "
+                         "BIOS reserved stolen memory.  Unsupported BIOS?");
+            }
+
+            val = base | (data & ((1 << 20) - 1));
+        } else {
+            val = 0; /* upper 32bits of pte, we only enable below 4G PTEs */
+        }
+
+        trace_vfio_pci_igd_bar4_write(vdev->vbasedev.name,
+                                      igd->index, data, val);
+    }
+
+    vfio_region_write(&vdev->bars[4].region, addr + 4, val, size);
+
+    igd->index = ~0;
+}
+
+static const MemoryRegionOps vfio_igd_data_quirk = {
+    .read = vfio_igd_quirk_data_read,
+    .write = vfio_igd_quirk_data_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static uint64_t vfio_igd_quirk_index_read(void *opaque,
+                                          hwaddr addr, unsigned size)
+{
+    VFIOIGDQuirk *igd = opaque;
+    VFIOPCIDevice *vdev = igd->vdev;
+
+    igd->index = ~0;
+
+    return vfio_region_read(&vdev->bars[4].region, addr, size);
+}
+
+static void vfio_igd_quirk_index_write(void *opaque, hwaddr addr,
+                                       uint64_t data, unsigned size)
+{
+    VFIOIGDQuirk *igd = opaque;
+    VFIOPCIDevice *vdev = igd->vdev;
+
+    igd->index = data;
+
+    vfio_region_write(&vdev->bars[4].region, addr, data, size);
+}
+
+static const MemoryRegionOps vfio_igd_index_quirk = {
+    .read = vfio_igd_quirk_index_read,
+    .write = vfio_igd_quirk_index_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
+{
+    struct vfio_region_info *rom = NULL, *opregion = NULL,
+                            *host = NULL, *lpc = NULL;
+    VFIOQuirk *quirk;
+    VFIOIGDQuirk *igd;
+    PCIDevice *lpc_bridge;
+    int i, ret, ggms_mb, gms_mb = 0, gen;
+    uint64_t *bdsm_size;
+    uint32_t gmch;
+    uint16_t cmd_orig, cmd;
+
+    /*
+     * This must be an Intel VGA device at address 00:02.0 for us to even
+     * consider enabling legacy mode.  The vBIOS has dependencies on the
+     * PCI bus address.
+     */
+    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
+        !vfio_is_vga(vdev) || nr != 4 ||
+        &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev),
+                                       0, PCI_DEVFN(0x2, 0))) {
+        return;
+    }
+
+    /*
+     * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we
+     * can stuff host values into, so if there's already one there and it's not
+     * one we can hack on, legacy mode is no-go.  Sorry Q35.
+     */
+    lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
+                                 0, PCI_DEVFN(0x1f, 0));
+    if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge),
+                                           "vfio-pci-igd-lpc-bridge")) {
+        error_report("IGD device %s cannot support legacy mode due to existing "
+                     "devices at address 1f.0", vdev->vbasedev.name);
+        return;
+    }
+
+    /*
+     * IGD is not a standard, they like to change their specs often.  We
+     * only attempt to support back to SandBridge and we hope that newer
+     * devices maintain compatibility with generation 8.
+     */
+    gen = igd_gen(vdev);
+    if (gen != 6 && gen != 8) {
+        error_report("IGD device %s is unsupported in legacy mode, "
+                     "try SandyBridge or newer", vdev->vbasedev.name);
+        return;
+    }
+
+    /*
+     * Most of what we're doing here is to enable the ROM to run, so if
+     * there's no ROM, there's no point in setting up this quirk.
+     * NB. We only seem to get BIOS ROMs, so a UEFI VM would need CSM support.
+     */
+    ret = vfio_get_region_info(&vdev->vbasedev,
+                               VFIO_PCI_ROM_REGION_INDEX, &rom);
+    if ((ret || !rom->size) && !vdev->pdev.romfile) {
+        error_report("IGD device %s has no ROM, legacy mode disabled",
+                     vdev->vbasedev.name);
+        goto out;
+    }
+
+    /*
+     * Ignore the hotplug corner case, mark the ROM failed, we can't
+     * create the devices we need for legacy mode in the hotplug scenario.
+     */
+    if (vdev->pdev.qdev.hotplugged) {
+        error_report("IGD device %s hotplugged, ROM disabled, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        vdev->rom_read_failed = true;
+        goto out;
+    }
+
+    /*
+     * Check whether we have all the vfio device specific regions to
+     * support legacy mode (added in Linux v4.6).  If not, bail.
+     */
+    ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
+    if (ret) {
+        error_report("IGD device %s does not support OpRegion access,"
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+                        VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host);
+    if (ret) {
+        error_report("IGD device %s does not support host bridge access,"
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+                        VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc);
+    if (ret) {
+        error_report("IGD device %s does not support LPC bridge access,"
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4);
+
+    /*
+     * If IGD VGA Disable is clear (expected) and VGA is not already enabled,
+     * try to enable it.  Probably shouldn't be using legacy mode without VGA,
+     * but also no point in us enabling VGA if disabled in hardware.
+     */
+    if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev)) {
+        error_report("IGD device %s failed to enable VGA access, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    /* Create our LPC/ISA bridge */
+    ret = vfio_pci_igd_lpc_init(vdev, lpc);
+    if (ret) {
+        error_report("IGD device %s failed to create LPC bridge, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    /* Stuff some host values into the VM PCI host bridge */
+    ret = vfio_pci_igd_host_init(vdev, host);
+    if (ret) {
+        error_report("IGD device %s failed to modify host bridge, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    /* Setup OpRegion access */
+    ret = vfio_pci_igd_opregion_init(vdev, opregion);
+    if (ret) {
+        error_report("IGD device %s failed to setup OpRegion, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    /* Setup our quirk to munge GTT addresses to the VM allocated buffer */
+    quirk = g_malloc0(sizeof(*quirk));
+    quirk->mem = g_new0(MemoryRegion, 2);
+    quirk->nr_mem = 2;
+    igd = quirk->data = g_malloc0(sizeof(*igd));
+    igd->vdev = vdev;
+    igd->index = ~0;
+
+    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk,
+                          igd, "vfio-igd-index-quirk", 4);
+    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
+                                        0, &quirk->mem[0], 1);
+
+    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_igd_data_quirk,
+                          igd, "vfio-igd-data-quirk", 4);
+    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
+                                        4, &quirk->mem[1], 1);
+
+    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
+
+    /* Determine the size of stolen memory needed for GTT */
+    ggms_mb = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
+    if (gen > 6) {
+        ggms_mb = 1 << ggms_mb;
+    }
+
+    /*
+     * Assume we have no GMS memory, but allow it to be overrided by device
+     * option (experimental).  The spec doesn't actually allow zero GMS when
+     * when IVD (IGD VGA Disable) is clear, but the claim is that it's unused,
+     * so let's not waste VM memory for it.
+     */
+    gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8));
+
+    if (vdev->igd_gms) {
+        if (vdev->igd_gms <= 0x10) {
+            gms_mb = vdev->igd_gms * 32;
+            gmch |= vdev->igd_gms << (gen < 8 ? 3 : 8);
+        } else {
+            error_report("Unsupported IGD GMS value 0x%x", vdev->igd_gms);
+            vdev->igd_gms = 0;
+        }
+    }
+
+    /*
+     * Request reserved memory for stolen memory via fw_cfg.  VM firmware
+     * must allocate a 1MB aligned reserved memory region below 4GB with
+     * the requested size (in bytes) for use by the Intel PCI class VGA
+     * device at VM address 00:02.0.  The base address of this reserved
+     * memory region must be written to the device BDSM regsiter at PCI
+     * config offset 0x5C.
+     */
+    bdsm_size = g_malloc(sizeof(*bdsm_size));
+    *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * 1024 * 1024);
+    fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size",
+                    bdsm_size, sizeof(*bdsm_size));
+
+    /* GMCH is read-only, emulated */
+    pci_set_long(vdev->pdev.config + IGD_GMCH, gmch);
+    pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0);
+    pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0);
+
+    /* BDSM is read-write, emulated.  The BIOS needs to be able to write it */
+    pci_set_long(vdev->pdev.config + IGD_BDSM, 0);
+    pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0);
+    pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0);
+
+    /*
+     * This IOBAR gives us access to GTTADR, which allows us to write to
+     * the GTT itself.  So let's go ahead and write zero to all the GTT
+     * entries to avoid spurious DMA faults.  Be sure I/O access is enabled
+     * before talking to the device.
+     */
+    if (pread(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
+              vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
+        error_report("IGD device %s - failed to read PCI command register",
+                     vdev->vbasedev.name);
+    }
+
+    cmd = cmd_orig | PCI_COMMAND_IO;
+
+    if (pwrite(vdev->vbasedev.fd, &cmd, sizeof(cmd),
+               vdev->config_offset + PCI_COMMAND) != sizeof(cmd)) {
+        error_report("IGD device %s - failed to write PCI command register",
+                     vdev->vbasedev.name);
+    }
+
+    for (i = 1; i < vfio_igd_gtt_max(vdev); i += 4) {
+        vfio_region_write(&vdev->bars[4].region, 0, i, 4);
+        vfio_region_write(&vdev->bars[4].region, 4, 0, 4);
+    }
+
+    if (pwrite(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
+               vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
+        error_report("IGD device %s - failed to restore PCI command register",
+                     vdev->vbasedev.name);
+    }
+
+    trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, ggms_mb + gms_mb);
+
+out:
+    g_free(rom);
+    g_free(opregion);
+    g_free(host);
+    g_free(lpc);
+}
+
 /*
 * Common quirk probe entry points.
 */
@@ -1010,6 +1650,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
    vfio_probe_nvidia_bar5_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_quirk(vdev, nr);
    vfio_probe_rtl8168_bar2_quirk(vdev, nr);
+    vfio_probe_igd_bar4_quirk(vdev, nr);
 }

 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1440,8 +1440,6 @@ static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
                     vdev->vbasedev.name, nr);
    }

-    vfio_bar_quirk_setup(vdev, nr);
-
    pci_register_bar(&vdev->pdev, nr, type, bar->region.mem);
 }

@@ -1452,29 +1450,6 @@ static void vfio_bars_setup(VFIOPCIDevice *vdev)
    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_bar_setup(vdev, i);
    }
-
-    if (vdev->vga) {
-        memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
-                              OBJECT(vdev), &vfio_vga_ops,
-                              &vdev->vga->region[QEMU_PCI_VGA_MEM],
-                              "vfio-vga-mmio@0xa0000",
-                              QEMU_PCI_VGA_MEM_SIZE);
-        memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
-                              OBJECT(vdev), &vfio_vga_ops,
-                              &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
-                              "vfio-vga-io@0x3b0",
-                              QEMU_PCI_VGA_IO_LO_SIZE);
-        memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
-                              OBJECT(vdev), &vfio_vga_ops,
-                              &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
-                              "vfio-vga-io@0x3c0",
-                              QEMU_PCI_VGA_IO_HI_SIZE);
-
-        pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
-                         &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
-                         &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
-        vfio_vga_quirk_setup(vdev);
-    }
 }

 static void vfio_bars_exit(VFIOPCIDevice *vdev)
@@ -2061,43 +2036,62 @@ int vfio_populate_vga(VFIOPCIDevice *vdev)
    struct vfio_region_info *reg_info;
    int ret;

-    if (vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) {
-        ret = vfio_get_region_info(vbasedev,
-                                   VFIO_PCI_VGA_REGION_INDEX, &reg_info);
-        if (ret) {
-            return ret;
-        }
-
-        if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
-            !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
-            reg_info->size < 0xbffff + 1) {
-            error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
-                         (unsigned long)reg_info->flags,
-                         (unsigned long)reg_info->size);
-            g_free(reg_info);
-            return -EINVAL;
-        }
-
-        vdev->vga = g_new0(VFIOVGA, 1);
-
-        vdev->vga->fd_offset = reg_info->offset;
-        vdev->vga->fd = vdev->vbasedev.fd;
-
-        g_free(reg_info);
-
-        vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
-        vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
-        QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
-
-        vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
-        vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
-        QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
-
-        vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
-        vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
-        QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
+    ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
+    if (ret) {
+        return ret;
    }

+    if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
+        !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
+        reg_info->size < 0xbffff + 1) {
+        error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
+                     (unsigned long)reg_info->flags,
+                     (unsigned long)reg_info->size);
+        g_free(reg_info);
+        return -EINVAL;
+    }
+
+    vdev->vga = g_new0(VFIOVGA, 1);
+
+    vdev->vga->fd_offset = reg_info->offset;
+    vdev->vga->fd = vdev->vbasedev.fd;
+
+    g_free(reg_info);
+
+    vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
+    vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
+    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
+
+    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
+                          OBJECT(vdev), &vfio_vga_ops,
+                          &vdev->vga->region[QEMU_PCI_VGA_MEM],
+                          "vfio-vga-mmio@0xa0000",
+                          QEMU_PCI_VGA_MEM_SIZE);
+
+    vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
+    vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
+    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
+
+    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
+                          OBJECT(vdev), &vfio_vga_ops,
+                          &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
+                          "vfio-vga-io@0x3b0",
+                          QEMU_PCI_VGA_IO_LO_SIZE);
+
+    vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
+    vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
+    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
+
+    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
+                          OBJECT(vdev), &vfio_vga_ops,
+                          &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
+                          "vfio-vga-io@0x3c0",
+                          QEMU_PCI_VGA_IO_HI_SIZE);
+
+    pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
+                     &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
+                     &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
+
    return 0;
 }

@@ -2398,7 +2392,7 @@ static int vfio_initfn(PCIDevice *pdev)
    ssize_t len;
    struct stat st;
    int groupid;
-    int ret;
+    int i, ret;

    if (!vdev->vbasedev.sysfsdev) {
        vdev->vbasedev.sysfsdev =
@@ -2560,6 +2554,43 @@ static int vfio_initfn(PCIDevice *pdev)
        goto out_teardown;
    }

+    if (vdev->vga) {
+        vfio_vga_quirk_setup(vdev);
+    }
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        vfio_bar_quirk_setup(vdev, i);
+    }
+
+    if (!vdev->igd_opregion &&
+        vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
+        struct vfio_region_info *opregion;
+
+        if (vdev->pdev.qdev.hotplugged) {
+            error_report("Cannot support IGD OpRegion feature on hotplugged "
+                         "device %s", vdev->vbasedev.name);
+            ret = -EINVAL;
+            goto out_teardown;
+        }
+
+        ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
+        if (ret) {
+            error_report("Device %s does not support requested IGD OpRegion "
+                         "feature", vdev->vbasedev.name);
+            goto out_teardown;
+        }
+
+        ret = vfio_pci_igd_opregion_init(vdev, opregion);
+        g_free(opregion);
+        if (ret) {
+            error_report("Device %s IGD OpRegion initialization failed",
+                         vdev->vbasedev.name);
+            goto out_teardown;
+        }
+    }
+
    /* QEMU emulates all of MSI & MSIX */
    if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
        memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
@@ -2603,6 +2634,13 @@ static void vfio_instance_finalize(Object *obj)
    vfio_bars_finalize(vdev);
    g_free(vdev->emulated_config_bits);
    g_free(vdev->rom);
+    /*
+     * XXX Leaking igd_opregion is not an oversight, we can't remove the
+     * fw_cfg entry therefore leaking this allocation seems like the safest
+     * option.
+     *
+     * g_free(vdev->igd_opregion);
+     */
    vfio_put_device(vdev);
    vfio_put_group(group);
 }
@@ -2677,6 +2715,8 @@ static Property vfio_pci_dev_properties[] = {
                    VFIO_FEATURE_ENABLE_VGA_BIT, false),
    DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
                    VFIO_FEATURE_ENABLE_REQ_BIT, true),
+    DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
+                    VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
    DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
    DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
    DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
@@ -2687,6 +2727,7 @@ static Property vfio_pci_dev_properties[] = {
                       sub_vendor_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
                       sub_device_id, PCI_ANY_ID),
+    DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
    /*
     * TODO - support passed fds... is this necessary?
     * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -115,6 +115,7 @@ typedef struct VFIOPCIDevice {
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */
+    void *igd_opregion;
    PCIHostDeviceAddress host;
    EventNotifier err_notifier;
    EventNotifier req_notifier;
@@ -128,7 +129,11 @@ typedef struct VFIOPCIDevice {
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
 #define VFIO_FEATURE_ENABLE_REQ_BIT 1
 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT)
+#define VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT 2
+#define VFIO_FEATURE_ENABLE_IGD_OPREGION \
+                                (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT)
    int32_t bootindex;
+    uint32_t igd_gms;
    uint8_t pm_cap;
    bool has_vga;
    bool pci_aer;
@@ -159,4 +164,7 @@ void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev);

 int vfio_populate_vga(VFIOPCIDevice *vdev);

+int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
+                               struct vfio_region_info *info);
+
 #endif /* HW_VFIO_VFIO_PCI_H */
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -17,7 +17,6 @@
 #include "sysemu/kvm.h"
 #include "qemu/error-report.h"
 #include "qemu/sockets.h"
-#include "exec/ram_addr.h"
 #include "migration/migration.h"

 #include <sys/ioctl.h>
@@ -247,18 +246,18 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
-        ram_addr_t ram_addr;
+        ram_addr_t offset;
+        MemoryRegion *mr;

        assert((uintptr_t)reg->userspace_addr == reg->userspace_addr);
-        qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr,
-                                &ram_addr);
-        fd = qemu_get_ram_fd(ram_addr);
+        mr = memory_region_from_host((void *)(uintptr_t)reg->userspace_addr,
+                                     &offset);
+        fd = memory_region_get_fd(mr);
        if (fd > 0) {
            msg.payload.memory.regions[fd_num].userspace_addr = reg->userspace_addr;
            msg.payload.memory.regions[fd_num].memory_size  = reg->memory_size;
            msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr;
-            msg.payload.memory.regions[fd_num].mmap_offset = reg->userspace_addr -
-                (uintptr_t) qemu_get_ram_block_host_ptr(ram_addr);
+            msg.payload.memory.regions[fd_num].mmap_offset = offset;
            assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
            fds[fd_num++] = fd;
        }
@@ -616,17 +615,15 @@ static bool vhost_user_can_merge(struct vhost_dev *dev,
                                 uint64_t start1, uint64_t size1,
                                 uint64_t start2, uint64_t size2)
 {
-    ram_addr_t ram_addr;
+    ram_addr_t offset;
    int mfd, rfd;
    MemoryRegion *mr;

-    mr = qemu_ram_addr_from_host((void *)(uintptr_t)start1, &ram_addr);
-    assert(mr);
-    mfd = qemu_get_ram_fd(ram_addr);
+    mr = memory_region_from_host((void *)(uintptr_t)start1, &offset);
+    mfd = memory_region_get_fd(mr);

-    mr = qemu_ram_addr_from_host((void *)(uintptr_t)start2, &ram_addr);
-    assert(mr);
-    rfd = qemu_get_ram_fd(ram_addr);
+    mr = memory_region_from_host((void *)(uintptr_t)start2, &offset);
+    rfd = memory_region_get_fd(mr);

    return mfd == rfd;
 }
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -17,7 +17,6 @@ typedef struct BlockJob BlockJob;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildRole BdrvChildRole;
 typedef struct BlockJobTxn BlockJobTxn;
-typedef struct BdrvNextIterator BdrvNextIterator;

 typedef struct BlockDriverInfo {
    /* in bytes, 0 if irrelevant */
@@ -198,7 +197,6 @@ BlockDriver *bdrv_find_format(const char *format_name);
 int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp);
 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp);
-BlockDriverState *bdrv_new_root(void);
 BlockDriverState *bdrv_new(void);
 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top);
 void bdrv_replace_in_backing_chain(BlockDriverState *old,
@@ -214,8 +212,8 @@ BdrvChild *bdrv_open_child(const char *filename,
 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd);
 int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
                           const char *bdref_key, Error **errp);
-int bdrv_open(BlockDriverState **pbs, const char *filename,
-              const char *reference, QDict *options, int flags, Error **errp);
+BlockDriverState *bdrv_open(const char *filename, const char *reference,
+                            QDict *options, int flags, Error **errp);
 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs,
                                    QDict *options, int flags);
@@ -244,10 +242,6 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count);
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov);
-int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
-int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov);
 /*
@@ -402,7 +396,19 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
                                 Error **errp);
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
 BlockDriverState *bdrv_next_node(BlockDriverState *bs);
-BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs);
+
+typedef struct BdrvNextIterator {
+    enum {
+        BDRV_NEXT_BACKEND_ROOTS,
+        BDRV_NEXT_MONITOR_OWNED,
+    } phase;
+    BlockBackend *blk;
+    BlockDriverState *bs;
+} BdrvNextIterator;
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it);
+BlockDriverState *bdrv_next(BdrvNextIterator *it);
+
 BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs);
 int bdrv_is_encrypted(BlockDriverState *bs);
 int bdrv_key_required(BlockDriverState *bs);
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -719,7 +719,8 @@ void hmp_drive_add_node(Monitor *mon, const char *optstr);

 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
                                  const char *child_name,
-                                  const BdrvChildRole *child_role);
+                                  const BdrvChildRole *child_role,
+                                  void *opaque);
 void bdrv_root_unref_child(BdrvChild *child);

 const char *bdrv_get_parent_name(const BlockDriverState *bs);
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -82,7 +82,7 @@ struct BlockJob {
    const BlockJobDriver *driver;

    /** The block device on which the job is operating.  */
-    BlockDriverState *bs;
+    BlockBackend *blk;

    /**
     * The ID of the block job. Currently the BlockBackend name of the BDS
@@ -135,6 +135,9 @@ struct BlockJob {
     */
    bool deferred_to_main_loop;

+    /** Element of the list of block jobs */
+    QLIST_ENTRY(BlockJob) job_list;
+
    /** Status that is published by the query-block-jobs QMP API */
    BlockDeviceIoStatus iostatus;

@@ -172,6 +175,17 @@ struct BlockJob {
    QLIST_ENTRY(BlockJob) txn_list;
 };

+/**
+ * block_job_next:
+ * @job: A block job, or %NULL.
+ *
+ * Get the next element from the list of block jobs after @job, or the
+ * first one if @job is %NULL.
+ *
+ * Returns the requested job, or %NULL if there are no more jobs left.
+ */
+BlockJob *block_job_next(BlockJob *job);
+
 /**
 * block_job_create:
 * @job_type: The class object for the newly-created job.
@@ -356,6 +370,13 @@ bool block_job_is_paused(BlockJob *job);
 */
 int block_job_cancel_sync(BlockJob *job);

+/**
+ * block_job_cancel_sync_all:
+ *
+ * Synchronously cancels all jobs using block_job_cancel_sync().
+ */
+void block_job_cancel_sync_all(void);
+
 /**
 * block_job_complete_sync:
 * @job: The job to be completed.
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -57,10 +57,10 @@ typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr);

 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
 /* This should not be used by devices.  */
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+ram_addr_t qemu_ram_addr_from_host(void *ptr);
 RAMBlock *qemu_ram_block_by_name(const char *name);
 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
-                                   ram_addr_t *ram_addr, ram_addr_t *offset);
+                                   ram_addr_t *offset);
 void qemu_ram_set_idstr(RAMBlock *block, const char *name, DeviceState *dev);
 void qemu_ram_unset_idstr(RAMBlock *block);
 const char *qemu_ram_get_idstr(RAMBlock *rb);
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -32,6 +32,8 @@
 #include "qom/object.h"
 #include "qemu/rcu.h"

+#define RAM_ADDR_INVALID (~(ram_addr_t)0)
+
 #define MAX_PHYS_ADDR_SPACE_BITS 62
 #define MAX_PHYS_ADDR            (((hwaddr)1 << MAX_PHYS_ADDR_SPACE_BITS) - 1)

@@ -666,6 +668,35 @@ static inline bool memory_region_is_rom(MemoryRegion *mr)
 */
 int memory_region_get_fd(MemoryRegion *mr);

+/**
+ * memory_region_set_fd: Mark a RAM memory region as backed by a
+ * file descriptor.
+ *
+ * This function is typically used after memory_region_init_ram_ptr().
+ *
+ * @mr: the memory region being queried.
+ * @fd: the file descriptor that backs @mr.
+ */
+void memory_region_set_fd(MemoryRegion *mr, int fd);
+
+/**
+ * memory_region_from_host: Convert a pointer into a RAM memory region
+ * and an offset within it.
+ *
+ * Given a host pointer inside a RAM memory region (created with
+ * memory_region_init_ram() or memory_region_init_ram_ptr()), return
+ * the MemoryRegion and the offset within it.
+ *
+ * Use with care; by the time this function returns, the returned pointer is
+ * not protected by RCU anymore.  If the caller is not within an RCU critical
+ * section and does not hold the iothread lock, it must have other means of
+ * protecting the pointer, such as a reference to the region that includes
+ * the incoming ram_addr_t.
+ *
+ * @mr: the memory region being queried.
+ */
+MemoryRegion *memory_region_from_host(void *ptr, ram_addr_t *offset);
+
 /**
 * memory_region_get_ram_ptr: Get a pointer into a RAM memory region.
 *
@@ -1362,7 +1393,7 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
 					MemoryRegion *mr);
 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
                                    MemTxAttrs attrs, uint8_t *buf, int len);
-void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr);
+void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr);

 static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 {
@@ -1400,8 +1431,7 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
            l = len;
            mr = address_space_translate(as, addr, &addr1, &l, false);
            if (len == l && memory_access_is_direct(mr, false)) {
-                addr1 += memory_region_get_ram_addr(mr);
-                ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+                ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
                memcpy(buf, ptr, len);
            } else {
                result = address_space_read_continue(as, addr, attrs, buf, len,
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -105,9 +105,6 @@ RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t max_size,
                                                    uint64_t length,
                                                    void *host),
                                    MemoryRegion *mr, Error **errp);
-int qemu_get_ram_fd(ram_addr_t addr);
-void qemu_set_ram_fd(ram_addr_t addr, int fd);
-void *qemu_get_ram_block_host_ptr(ram_addr_t addr);
 void qemu_ram_free(RAMBlock *block);

 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp);
--- a/include/hw/arm/fsl-imx6.h
+++ b/include/hw/arm/fsl-imx6.h
@@ -28,6 +28,7 @@
 #include "hw/gpio/imx_gpio.h"
 #include "hw/sd/sdhci.h"
 #include "hw/ssi/imx_spi.h"
+#include "hw/net/imx_fec.h"
 #include "exec/memory.h"
 #include "cpu.h"

@@ -58,6 +59,7 @@ typedef struct FslIMX6State {
    IMXGPIOState   gpio[FSL_IMX6_NUM_GPIOS];
    SDHCIState     esdhc[FSL_IMX6_NUM_ESDHCS];
    IMXSPIState    spi[FSL_IMX6_NUM_ECSPIS];
+    IMXFECState    eth;
    MemoryRegion   rom;
    MemoryRegion   caam;
    MemoryRegion   ocram;
@@ -436,8 +438,8 @@ typedef struct FslIMX6State {
 #define FSL_IMX6_HDMI_MASTER_IRQ 115
 #define FSL_IMX6_HDMI_CEC_IRQ 116
 #define FSL_IMX6_MLB150_LOW_IRQ 117
-#define FSL_IMX6_ENET_MAC_IRQ 118
-#define FSL_IMX6_ENET_MAC_1588_IRQ 119
+#define FSL_IMX6_ENET_MAC_1588_IRQ 118
+#define FSL_IMX6_ENET_MAC_IRQ 119
 #define FSL_IMX6_PCIE1_IRQ 120
 #define FSL_IMX6_PCIE2_IRQ 121
 #define FSL_IMX6_PCIE3_IRQ 122
--- a/include/hw/cris/etraxfs.h
+++ b/include/hw/cris/etraxfs.h
@@ -46,4 +46,20 @@ etraxfs_eth_init(NICInfo *nd, hwaddr base, int phyaddr,
    return dev;
 }

+static inline DeviceState *etraxfs_ser_create(hwaddr addr,
+                                              qemu_irq irq,
+                                              CharDriverState *chr)
+{
+    DeviceState *dev;
+    SysBusDevice *s;
+
+    dev = qdev_create(NULL, "etraxfs,serial");
+    s = SYS_BUS_DEVICE(dev);
+    qdev_prop_set_chr(dev, "chardev", chr);
+    qdev_init_nofail(dev);
+    sysbus_mmio_map(s, 0, addr);
+    sysbus_connect_irq(s, 0, irq);
+    return dev;
+}
+
 #endif
--- a/include/hw/net/imx_fec.h
+++ b/include/hw/net/imx_fec.h
@@ -1,5 +1,5 @@
 /*
- * i.MX Fast Ethernet Controller emulation.
+ * i.MX FEC/ENET Ethernet Controller emulation.
 *
 * Copyright (c) 2013 Jean-Christophe Dubois. <jcd@tribudubois.net>
 *
@@ -27,27 +27,147 @@
 #define TYPE_IMX_FEC "imx.fec"
 #define IMX_FEC(obj) OBJECT_CHECK(IMXFECState, (obj), TYPE_IMX_FEC)

+#define TYPE_IMX_ENET "imx.enet"
+
 #include "hw/sysbus.h"
 #include "net/net.h"

-#define FEC_MAX_FRAME_SIZE 2032
+#define ENET_EIR               1
+#define ENET_EIMR              2
+#define ENET_RDAR              4
+#define ENET_TDAR              5
+#define ENET_ECR               9
+#define ENET_MMFR              16
+#define ENET_MSCR              17
+#define ENET_MIBC              25
+#define ENET_RCR               33
+#define ENET_TCR               49
+#define ENET_PALR              57
+#define ENET_PAUR              58
+#define ENET_OPD               59
+#define ENET_IAUR              70
+#define ENET_IALR              71
+#define ENET_GAUR              72
+#define ENET_GALR              73
+#define ENET_TFWR              81
+#define ENET_FRBR              83
+#define ENET_FRSR              84
+#define ENET_RDSR              96
+#define ENET_TDSR              97
+#define ENET_MRBR              98
+#define ENET_RSFL              100
+#define ENET_RSEM              101
+#define ENET_RAEM              102
+#define ENET_RAFL              103
+#define ENET_TSEM              104
+#define ENET_TAEM              105
+#define ENET_TAFL              106
+#define ENET_TIPG              107
+#define ENET_FTRL              108
+#define ENET_TACC              112
+#define ENET_RACC              113
+#define ENET_MIIGSK_CFGR       192
+#define ENET_MIIGSK_ENR        194
+#define ENET_ATCR              256
+#define ENET_ATVR              257
+#define ENET_ATOFF             258
+#define ENET_ATPER             259
+#define ENET_ATCOR             260
+#define ENET_ATINC             261
+#define ENET_ATSTMP            262
+#define ENET_TGSR              385
+#define ENET_TCSR0             386
+#define ENET_TCCR0             387
+#define ENET_TCSR1             388
+#define ENET_TCCR1             389
+#define ENET_TCSR2             390
+#define ENET_TCCR2             391
+#define ENET_TCSR3             392
+#define ENET_TCCR3             393
+#define ENET_MAX               400

-#define FEC_INT_HB      (1 << 31)
-#define FEC_INT_BABR    (1 << 30)
-#define FEC_INT_BABT    (1 << 29)
-#define FEC_INT_GRA     (1 << 28)
-#define FEC_INT_TXF     (1 << 27)
-#define FEC_INT_TXB     (1 << 26)
-#define FEC_INT_RXF     (1 << 25)
-#define FEC_INT_RXB     (1 << 24)
-#define FEC_INT_MII     (1 << 23)
-#define FEC_INT_EBERR   (1 << 22)
-#define FEC_INT_LC      (1 << 21)
-#define FEC_INT_RL      (1 << 20)
-#define FEC_INT_UN      (1 << 19)
+#define ENET_MAX_FRAME_SIZE    2032

-#define FEC_EN      2
-#define FEC_RESET   1
+/* EIR and EIMR */
+#define ENET_INT_HB            (1 << 31)
+#define ENET_INT_BABR          (1 << 30)
+#define ENET_INT_BABT          (1 << 29)
+#define ENET_INT_GRA           (1 << 28)
+#define ENET_INT_TXF           (1 << 27)
+#define ENET_INT_TXB           (1 << 26)
+#define ENET_INT_RXF           (1 << 25)
+#define ENET_INT_RXB           (1 << 24)
+#define ENET_INT_MII           (1 << 23)
+#define ENET_INT_EBERR         (1 << 22)
+#define ENET_INT_LC            (1 << 21)
+#define ENET_INT_RL            (1 << 20)
+#define ENET_INT_UN            (1 << 19)
+#define ENET_INT_PLR           (1 << 18)
+#define ENET_INT_WAKEUP        (1 << 17)
+#define ENET_INT_TS_AVAIL      (1 << 16)
+#define ENET_INT_TS_TIMER      (1 << 15)
+
+#define ENET_INT_MAC           (ENET_INT_HB | ENET_INT_BABR | ENET_INT_BABT | \
+                                ENET_INT_GRA | ENET_INT_TXF | ENET_INT_TXB | \
+                                ENET_INT_RXF | ENET_INT_RXB | ENET_INT_MII | \
+                                ENET_INT_EBERR | ENET_INT_LC | ENET_INT_RL | \
+                                ENET_INT_UN | ENET_INT_PLR | ENET_INT_WAKEUP | \
+                                ENET_INT_TS_AVAIL)
+
+/* RDAR */
+#define ENET_RDAR_RDAR         (1 << 24)
+
+/* TDAR */
+#define ENET_TDAR_TDAR         (1 << 24)
+
+/* ECR */
+#define ENET_ECR_RESET         (1 << 0)
+#define ENET_ECR_ETHEREN       (1 << 1)
+#define ENET_ECR_MAGICEN       (1 << 2)
+#define ENET_ECR_SLEEP         (1 << 3)
+#define ENET_ECR_EN1588        (1 << 4)
+#define ENET_ECR_SPEED         (1 << 5)
+#define ENET_ECR_DBGEN         (1 << 6)
+#define ENET_ECR_STOPEN        (1 << 7)
+#define ENET_ECR_DSBWP         (1 << 8)
+
+/* MIBC */
+#define ENET_MIBC_MIB_DIS      (1 << 31)
+#define ENET_MIBC_MIB_IDLE     (1 << 30)
+#define ENET_MIBC_MIB_CLEAR    (1 << 29)
+
+/* RCR */
+#define ENET_RCR_LOOP          (1 << 0)
+#define ENET_RCR_DRT           (1 << 1)
+#define ENET_RCR_MII_MODE      (1 << 2)
+#define ENET_RCR_PROM          (1 << 3)
+#define ENET_RCR_BC_REJ        (1 << 4)
+#define ENET_RCR_FCE           (1 << 5)
+#define ENET_RCR_RGMII_EN      (1 << 6)
+#define ENET_RCR_RMII_MODE     (1 << 8)
+#define ENET_RCR_RMII_10T      (1 << 9)
+#define ENET_RCR_PADEN         (1 << 12)
+#define ENET_RCR_PAUFWD        (1 << 13)
+#define ENET_RCR_CRCFWD        (1 << 14)
+#define ENET_RCR_CFEN          (1 << 15)
+#define ENET_RCR_MAX_FL_SHIFT  (16)
+#define ENET_RCR_MAX_FL_LENGTH (14)
+#define ENET_RCR_NLC           (1 << 30)
+#define ENET_RCR_GRS           (1 << 31)
+
+/* TCR */
+#define ENET_TCR_GTS           (1 << 0)
+#define ENET_TCR_FDEN          (1 << 2)
+#define ENET_TCR_TFC_PAUSE     (1 << 3)
+#define ENET_TCR_RFC_PAUSE     (1 << 4)
+#define ENET_TCR_ADDSEL_SHIFT  (5)
+#define ENET_TCR_ADDSEL_LENGTH (3)
+#define ENET_TCR_CRCFWD        (1 << 9)
+
+/* RDSR */
+#define ENET_TWFR_TFWR_SHIFT   (0)
+#define ENET_TWFR_TFWR_LENGTH  (6)
+#define ENET_TWFR_STRFWD       (1 << 8)

 /* Buffer Descriptor.  */
 typedef struct {
@@ -56,22 +176,60 @@ typedef struct {
    uint32_t data;
 } IMXFECBufDesc;

-#define FEC_BD_R    (1 << 15)
-#define FEC_BD_E    (1 << 15)
-#define FEC_BD_O1   (1 << 14)
-#define FEC_BD_W    (1 << 13)
-#define FEC_BD_O2   (1 << 12)
-#define FEC_BD_L    (1 << 11)
-#define FEC_BD_TC   (1 << 10)
-#define FEC_BD_ABC  (1 << 9)
-#define FEC_BD_M    (1 << 8)
-#define FEC_BD_BC   (1 << 7)
-#define FEC_BD_MC   (1 << 6)
-#define FEC_BD_LG   (1 << 5)
-#define FEC_BD_NO   (1 << 4)
-#define FEC_BD_CR   (1 << 2)
-#define FEC_BD_OV   (1 << 1)
-#define FEC_BD_TR   (1 << 0)
+#define ENET_BD_R              (1 << 15)
+#define ENET_BD_E              (1 << 15)
+#define ENET_BD_O1             (1 << 14)
+#define ENET_BD_W              (1 << 13)
+#define ENET_BD_O2             (1 << 12)
+#define ENET_BD_L              (1 << 11)
+#define ENET_BD_TC             (1 << 10)
+#define ENET_BD_ABC            (1 << 9)
+#define ENET_BD_M              (1 << 8)
+#define ENET_BD_BC             (1 << 7)
+#define ENET_BD_MC             (1 << 6)
+#define ENET_BD_LG             (1 << 5)
+#define ENET_BD_NO             (1 << 4)
+#define ENET_BD_CR             (1 << 2)
+#define ENET_BD_OV             (1 << 1)
+#define ENET_BD_TR             (1 << 0)
+
+typedef struct {
+    uint16_t length;
+    uint16_t flags;
+    uint32_t data;
+    uint16_t status;
+    uint16_t option;
+    uint16_t checksum;
+    uint16_t head_proto;
+    uint32_t last_buffer;
+    uint32_t timestamp;
+    uint32_t reserved[2];
+} IMXENETBufDesc;
+
+#define ENET_BD_ME             (1 << 15)
+#define ENET_BD_TX_INT         (1 << 14)
+#define ENET_BD_TS             (1 << 13)
+#define ENET_BD_PINS           (1 << 12)
+#define ENET_BD_IINS           (1 << 11)
+#define ENET_BD_PE             (1 << 10)
+#define ENET_BD_CE             (1 << 9)
+#define ENET_BD_UC             (1 << 8)
+#define ENET_BD_RX_INT         (1 << 7)
+
+#define ENET_BD_TXE            (1 << 15)
+#define ENET_BD_UE             (1 << 13)
+#define ENET_BD_EE             (1 << 12)
+#define ENET_BD_FE             (1 << 11)
+#define ENET_BD_LCE            (1 << 10)
+#define ENET_BD_OE             (1 << 9)
+#define ENET_BD_TSE            (1 << 8)
+#define ENET_BD_ICE            (1 << 5)
+#define ENET_BD_PCR            (1 << 4)
+#define ENET_BD_VLAN           (1 << 2)
+#define ENET_BD_IPV6           (1 << 1)
+#define ENET_BD_FRAG           (1 << 0)
+
+#define ENET_BD_BDU            (1 << 31)

 typedef struct IMXFECState {
    /*< private >*/
@@ -80,34 +238,20 @@ typedef struct IMXFECState {
    /*< public >*/
    NICState *nic;
    NICConf conf;
-    qemu_irq irq;
+    qemu_irq irq[2];
    MemoryRegion iomem;

-    uint32_t irq_state;
-    uint32_t eir;
-    uint32_t eimr;
-    uint32_t rx_enabled;
+    uint32_t regs[ENET_MAX];
    uint32_t rx_descriptor;
    uint32_t tx_descriptor;
-    uint32_t ecr;
-    uint32_t mmfr;
-    uint32_t mscr;
-    uint32_t mibc;
-    uint32_t rcr;
-    uint32_t tcr;
-    uint32_t tfwr;
-    uint32_t frsr;
-    uint32_t erdsr;
-    uint32_t etdsr;
-    uint32_t emrbr;
-    uint32_t miigsk_cfgr;
-    uint32_t miigsk_enr;

    uint32_t phy_status;
    uint32_t phy_control;
    uint32_t phy_advertise;
    uint32_t phy_int;
    uint32_t phy_int_mask;
+
+    bool is_fec;
 } IMXFECState;

 #endif
--- a/include/hw/pci/msix.h
+++ b/include/hw/pci/msix.h
@@ -29,6 +29,7 @@ int msix_present(PCIDevice *dev);

 bool msix_is_masked(PCIDevice *dev, unsigned vector);
 void msix_set_pending(PCIDevice *dev, unsigned vector);
+void msix_clr_pending(PCIDevice *dev, int vector);

 int msix_vector_use(PCIDevice *dev, unsigned vector);
 void msix_vector_unuse(PCIDevice *dev, unsigned vector);
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -465,16 +465,23 @@ pci_get_long(const uint8_t *config)
    return ldl_le_p(config);
 }

+/*
+ * PCI capabilities and/or their fields
+ * are generally DWORD aligned only so
+ * mechanism used by pci_set/get_quad()
+ * must be tolerant to unaligned pointers
+ *
+ */
 static inline void
 pci_set_quad(uint8_t *config, uint64_t val)
 {
-    cpu_to_le64w((uint64_t *)config, val);
+    stq_le_p(config, val);
 }

 static inline uint64_t
 pci_get_quad(const uint8_t *config)
 {
-    return le64_to_cpup((const uint64_t *)config);
+    return ldq_le_p(config);
 }

 static inline void
--- a/Show More
+++ b/Show More