migration/next for 20140225
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1

iQIcBAABCAAGBQJTDKLvAAoJEPSH7xhYctcjhA8QAKDslw9iovAHU4c0NgQxp3yE
08dAD6bznHPkc6ENZEbV4+Yx9AvtGwYeKE4IlVqxDaSCBQ1T/lGr6Di/X/Yuwjo9
80/av6cFpFsO9fw4fhFRNjU0n8xKeN2S/kjCQhz07Zky2mD2fEoLnTrhmjBRCsVN
tVCWOYzbkNbIFUCsJB0OBfC/qH0r5RuB2/SuNnwk4NwT5r7+UxMtfZ+BIE4Kez3n
l6G4L1XO3julErp/8BQmIChnHH7QtTfQzBahJIlBsiLiqHhX1f1v6Q0CRln+A9S1
jfAK/1zqpYVOAb59R2u0FCgB793sV0P+aa71ORRP1g57lFC5KsGJghQq0OoWr1YA
OHrOFPm2YHdTBsU7BG3ndMSbNgZspVAxns6mcSkcDWEH0JDv+FhK08+45tDqkAOu
9hWuYA5p6hodOEBLprNit7lK+7coAKDCkIM4hzPMVZxGCucDqRmtI0oHadjar1Wi
nTbxeDqsh67mr6+QXSR8PRQ3y0TDsuBS6Sm2+Bchv1Nt5GiAKaMySiPuXGQlMSS1
3ohy77Ltz42ci1+mFSp6aVaZO8hEkakaN8Hg53T57IVTSqy4B9t/R3bvi+SsysCt
BMaHONUnOuloKtA5dnOd6Q+hLE8tw3UNGFB71VZoj1tEbXj48WpIZ1IpQYbVAoyQ
DR2+Wccft0O3GVAgLAo0
=yrmU
-----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/juanquintela/tags/migration/20140225' into staging

migration/next for 20140225

# gpg: Signature made Tue 25 Feb 2014 14:04:31 GMT using RSA key ID 5872D723
# gpg: Can't check signature: public key not found

* remotes/juanquintela/tags/migration/20140225:
  rdma: rename 'x-rdma' => 'rdma'
  Fix two XBZRLE corruption issues
  Fix vmstate_info_int32_le comparison/assign
  qemu_file: use fwrite() correctly

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit 6f6831f61a

arch_init.c (62 changed lines):
@@ -122,7 +122,6 @@ static void check_guest_throttling(void);
 #define RAM_SAVE_FLAG_XBZRLE   0x40
 /* 0x80 is reserved in migration.h start with 0x100 next */
 
-
 static struct defconfig_file {
     const char *filename;
     /* Indicates it is an user config file (disabled by -no-user-config) */
@@ -133,6 +132,7 @@ static struct defconfig_file {
     { NULL }, /* end of list */
 };
 
+static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
 
 int qemu_read_default_config_files(bool userconfig)
 {
@@ -273,6 +273,34 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
     return size;
 }
 
+/* This is the last block that we have visited serching for dirty pages
+ */
+static RAMBlock *last_seen_block;
+/* This is the last block from where we have sent data */
+static RAMBlock *last_sent_block;
+static ram_addr_t last_offset;
+static unsigned long *migration_bitmap;
+static uint64_t migration_dirty_pages;
+static uint32_t last_version;
+static bool ram_bulk_stage;
+
+/* Update the xbzrle cache to reflect a page that's been sent as all 0.
+ * The important thing is that a stale (not-yet-0'd) page be replaced
+ * by the new data.
+ * As a bonus, if the page wasn't in the cache it gets added so that
+ * when a small write is made into the 0'd page it gets XBZRLE sent
+ */
+static void xbzrle_cache_zero_page(ram_addr_t current_addr)
+{
+    if (ram_bulk_stage || !migrate_use_xbzrle()) {
+        return;
+    }
+
+    /* We don't care if this fails to allocate a new cache page
+     * as long as it updated an old one */
+    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE);
+}
+
 #define ENCODING_FLAG_XBZRLE 0x1
 
 static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
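This hunk moves the RAM-save state (last_seen_block and friends) above the new helper so xbzrle_cache_zero_page() can test ram_bulk_stage, and the helper itself is the core of the first XBZRLE corruption fix: when a page is sent via the zero-page shortcut, the copy of that page in the sender's XBZRLE cache was left stale, so a later XBZRLE pass encoded against data the destination no longer held. The toy program below is a minimal sketch of that failure mode, not QEMU code: the 4-byte page, the names, and the byte-wise "delta" are all invented for illustration.

/* toy_xbzrle_stale.c: why a zero-page send must update the XBZRLE cache. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_PAGE 4

static uint8_t sender_cache[TOY_PAGE];  /* sender's reference copy */
static uint8_t receiver[TOY_PAGE];      /* what the destination holds */

/* XBZRLE-like delta: transmit only bytes that differ from the cache. */
static void send_delta(const uint8_t *guest)
{
    for (int i = 0; i < TOY_PAGE; i++) {
        if (guest[i] != sender_cache[i]) {
            receiver[i] = guest[i];      /* this byte goes on the wire */
        }
    }
    memcpy(sender_cache, guest, TOY_PAGE);
}

/* Zero-page shortcut: one flag on the wire, receiver zero-fills. */
static void send_zero(void)
{
    memset(receiver, 0, TOY_PAGE);
    /* BUG being fixed: sender_cache is not updated here; the patch adds
     * the equivalent of cache_insert(cache, addr, ZERO_TARGET_PAGE). */
}

int main(void)
{
    uint8_t guest[TOY_PAGE] = {1, 1, 1, 1};

    send_delta(guest);             /* initial sync: both sides hold 1111 */
    memset(guest, 0, TOY_PAGE);
    send_zero();                   /* page zeroed; sender cache now stale */
    guest[0] = 1;                  /* guest dirties one byte back to 1 */
    send_delta(guest);             /* byte 0 matches the stale cache, so it
                                    * is skipped -- but the receiver has 0 */

    printf("guest:    %d%d%d%d\n", guest[0], guest[1], guest[2], guest[3]);
    printf("receiver: %d%d%d%d\n", receiver[0], receiver[1],
           receiver[2], receiver[3]);   /* prints 1000 vs 0000 */
    return 0;
}

Updating the cache on the zero-page path, as xbzrle_cache_zero_page() does, makes the encoder see byte 0 as dirty and resend it.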
@@ -329,18 +357,6 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
     return bytes_sent;
 }
 
-
-/* This is the last block that we have visited serching for dirty pages
- */
-static RAMBlock *last_seen_block;
-/* This is the last block from where we have sent data */
-static RAMBlock *last_sent_block;
-static ram_addr_t last_offset;
-static unsigned long *migration_bitmap;
-static uint64_t migration_dirty_pages;
-static uint32_t last_version;
-static bool ram_bulk_stage;
-
 static inline
 ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
                                                  ram_addr_t start)
@@ -512,6 +528,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
         } else {
             int ret;
             uint8_t *p;
+            bool send_async = true;
             int cont = (block == last_sent_block) ?
                 RAM_SAVE_FLAG_CONTINUE : 0;
 
@@ -522,6 +539,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
             ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_sent);
 
+            current_addr = block->offset + offset;
             if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
                 if (ret != RAM_SAVE_CONTROL_DELAYED) {
                     if (bytes_sent > 0) {
@@ -536,19 +554,35 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
                                             RAM_SAVE_FLAG_COMPRESS);
                 qemu_put_byte(f, 0);
                 bytes_sent++;
+                /* Must let xbzrle know, otherwise a previous (now 0'd) cached
+                 * page would be stale
+                 */
+                xbzrle_cache_zero_page(current_addr);
             } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
-                current_addr = block->offset + offset;
                 bytes_sent = save_xbzrle_page(f, p, current_addr, block,
                                               offset, cont, last_stage);
+                if (!last_stage) {
+                    /* We must send exactly what's in the xbzrle cache
+                     * even if the page wasn't xbzrle compressed, so that
+                     * it's right next time.
+                     */
+                    p = get_cached_data(XBZRLE.cache, current_addr);
+
+                    /* Can't send this cached data async, since the cache page
+                     * might get updated before it gets to the wire
+                     */
+                    send_async = false;
+                }
             }
 
             /* XBZRLE overflow or normal page */
             if (bytes_sent == -1) {
                 bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
-                qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+                if (send_async) {
+                    qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+                } else {
+                    qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+                }
                 bytes_sent += TARGET_PAGE_SIZE;
                 acct_info.norm_pages++;
             }
 
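Two pieces of the same fix meet here. The previous hunk hoists current_addr so it is computed once, before the ram_control and zero-page paths that now need it. The send_async flag addresses the second corruption: qemu_put_buffer_async() queues a pointer to the data rather than copying it, so once p is redirected into the XBZRLE cache, a later cache update can overwrite the page before the queued buffer reaches the wire; such pages must go through the copying qemu_put_buffer(). A minimal stand-alone sketch of the pointer-queuing hazard (the two-function queue API is invented for illustration):

/* toy_async.c: queuing a pointer to a mutable buffer loses the old data. */
#include <stdio.h>
#include <string.h>

static const char *queued;               /* deferred write holds a pointer */

static void put_async(const char *buf) { queued = buf; }
static void flush(void) { printf("wire sees: %s\n", queued); }

int main(void)
{
    char cache_page[16];

    strcpy(cache_page, "old");
    put_async(cache_page);               /* queue the pointer, not a copy */
    strcpy(cache_page, "new");           /* cache entry updated meanwhile */
    flush();                             /* prints "new", not "old" */
    return 0;
}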
docs/rdma.txt:

@@ -66,7 +66,7 @@ bulk-phase round of the migration and can be enabled for extremely
 high-performance RDMA hardware using the following command:
 
 QEMU Monitor Command:
-$ migrate_set_capability x-rdma-pin-all on # disabled by default
+$ migrate_set_capability rdma-pin-all on # disabled by default
 
 Performing this action will cause all 8GB to be pinned, so if that's
 not what you want, then please ignore this step altogether.
@@ -93,12 +93,12 @@ $ migrate_set_speed 40g # or whatever is the MAX of your RDMA device
 
 Next, on the destination machine, add the following to the QEMU command line:
 
-qemu ..... -incoming x-rdma:host:port
+qemu ..... -incoming rdma:host:port
 
 Finally, perform the actual migration on the source machine:
 
 QEMU Monitor Command:
-$ migrate -d x-rdma:host:port
+$ migrate -d rdma:host:port
 
 PERFORMANCE
 ===========
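Taken together, the renamed flow now reads as follows; this just assembles the document's own snippets, with host and port as placeholders exactly as in the doc.

Destination command line:

qemu ..... -incoming rdma:host:port

Source QEMU monitor:

$ migrate_set_capability rdma-pin-all on   # optional, disabled by default
$ migrate_set_speed 40g                    # or the MAX of your RDMA device
$ migrate -d rdma:host:port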
@@ -120,8 +120,8 @@ For example, in the same 8GB RAM example with all 8GB of memory in
 active use and the VM itself is completely idle using the same 40 gbps
 infiniband link:
 
-1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
-2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
+1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
+2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
 
 These numbers would of course scale up to whatever size virtual machine
 you have to migrate using RDMA.
@@ -407,18 +407,14 @@ socket is broken during a non-RDMA based migration.
 
 TODO:
 =====
-1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be
-   renamed to 'rdma' after the experimental phase of this work has
-   completed upstream.
-2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
+1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
    are not compatible with infinband memory pinning and will result in
    an aborted migration (but with the source VM left unaffected).
-3. Use of the recent /proc/<pid>/pagemap would likely speed up
+2. Use of the recent /proc/<pid>/pagemap would likely speed up
    the use of KSM and ballooning while using RDMA.
-4. Also, some form of balloon-device usage tracking would also
+3. Also, some form of balloon-device usage tracking would also
    help alleviate some issues.
-5. Move UNREGISTER requests to a separate thread.
-6. Use LRU to provide more fine-grained direction of UNREGISTER
+4. Use LRU to provide more fine-grained direction of UNREGISTER
    requests for unpinning memory in an overcommitted environment.
-7. Expose UNREGISTER support to the user by way of workload-specific
+5. Expose UNREGISTER support to the user by way of workload-specific
    hints about application behavior.
include/migration/page_cache.h:

@@ -66,7 +66,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr);
  * @addr: page address
  * @pdata: pointer to the page
  */
-int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata);
+int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata);
 
 /**
  * cache_resize: resize the page cache. In case of size reduction the extra
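The added const is what lets arch_init.c pass ZERO_TARGET_PAGE, a const array, into cache_insert() without a cast. A stand-alone sketch of the qualifier issue; the stub function is invented, only the prototype shape mirrors the patch:

#include <stdint.h>

enum { TARGET_PAGE_SIZE = 4096 };

static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];

/* Post-patch shape: the cache only reads the page, so take const. */
static int cache_insert_stub(uint64_t addr, const uint8_t *pdata)
{
    (void)addr;
    (void)pdata;
    return 0;
}

int main(void)
{
    /* With the old "uint8_t *pdata" prototype this call would discard the
     * const qualifier and draw a compiler diagnostic; with const it is
     * accepted cleanly. */
    return cache_insert_stub(0, ZERO_TARGET_PAGE);
}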
migration-rdma.c:

@@ -3412,7 +3412,7 @@ void rdma_start_outgoing_migration(void *opaque,
     }
 
     ret = qemu_rdma_source_init(rdma, &local_err,
-        s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]);
+        s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]);
 
     if (ret) {
         goto err;
migration.c:

@@ -82,7 +82,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)
     if (strstart(uri, "tcp:", &p))
         tcp_start_incoming_migration(p, errp);
 #ifdef CONFIG_RDMA
-    else if (strstart(uri, "x-rdma:", &p))
+    else if (strstart(uri, "rdma:", &p))
         rdma_start_incoming_migration(p, errp);
 #endif
 #if !defined(WIN32)
@@ -438,7 +438,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
     if (strstart(uri, "tcp:", &p)) {
         tcp_start_outgoing_migration(s, p, &local_err);
 #ifdef CONFIG_RDMA
-    } else if (strstart(uri, "x-rdma:", &p)) {
+    } else if (strstart(uri, "rdma:", &p)) {
         rdma_start_outgoing_migration(s, p, &local_err);
 #endif
 #if !defined(WIN32)
@@ -532,7 +532,7 @@ bool migrate_rdma_pin_all(void)
 
     s = migrate_get_current();
 
-    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL];
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL];
 }
 
 bool migrate_auto_converge(void)
page_cache.c:

@@ -150,7 +150,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr)
     return cache_get_by_addr(cache, addr)->it_data;
 }
 
-int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata)
+int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata)
 {
 
     CacheItem *it = NULL;
qapi-schema.json:

@@ -751,10 +751,9 @@
 #          This feature allows us to minimize migration traffic for certain work
 #          loads, by sending compressed difference of the pages
 #
-# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is
+# @rdma-pin-all: Controls whether or not the entire VM memory footprint is
 #          mlock()'d on demand or all at once. Refer to docs/rdma.txt for usage.
-#          Disabled by default. Experimental: may (or may not) be renamed after
-#          further testing is complete. (since 1.6)
+#          Disabled by default. (since 2.0)
 #
 # @zero-blocks: During storage migration encode blocks of zeroes efficiently. This
 #          essentially saves 1MB of zeroes per block on the wire. Enabling requires
@@ -768,7 +767,7 @@
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
+  'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks'] }
 
 ##
 # @MigrationCapabilityStatus
qemu-file.c:

@@ -100,7 +100,14 @@ static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos,
                             int size)
 {
     QEMUFileStdio *s = opaque;
-    return fwrite(buf, 1, size, s->stdio_file);
+    int res;
+
+    res = fwrite(buf, 1, size, s->stdio_file);
+
+    if (res != size) {
+        return -EIO;	/* fake errno value */
+    }
+    return res;
 }
 
 static int stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
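fwrite() returns the number of items written, and with an item size of 1 that is a byte count, so any short count (including 0) means the write failed; the old code returned that count as-is, and since QEMUFile treats every non-negative return as success, errors such as a full disk were silently swallowed. A stand-alone sketch of the corrected pattern; /dev/full is a Linux-only device whose writes always fail with ENOSPC, used here purely to force the error path:

#include <errno.h>
#include <stdio.h>

/* Map a short fwrite() to a negative errno-style value, as the patched
 * stdio_put_buffer() does. */
static int put_buffer(FILE *fp, const unsigned char *buf, size_t size)
{
    size_t res = fwrite(buf, 1, size, fp);

    if (res != size) {
        return -EIO;                 /* fake errno value, as in the patch */
    }
    return (int)res;
}

int main(void)
{
    unsigned char buf[4] = "abc";
    FILE *fp = fopen("/dev/full", "w");  /* Linux: writes fail with ENOSPC */

    if (fp == NULL) {
        return 1;
    }
    setvbuf(fp, NULL, _IONBF, 0);        /* unbuffered: fwrite fails now */
    printf("put_buffer returned %d\n", put_buffer(fp, buf, sizeof(buf)));
    fclose(fp);
    return 0;                            /* prints -5 (-EIO) on Linux */
}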
vmstate.c (15 changed lines):
@@ -321,23 +321,24 @@ const VMStateInfo vmstate_info_int32_equal = {
     .put  = put_int32,
 };
 
-/* 32 bit int. See that the received value is the less or the same
-   than the one in the field */
+/* 32 bit int. Check that the received value is less than or equal to
+   the one in the field */
 
 static int get_int32_le(QEMUFile *f, void *pv, size_t size)
 {
-    int32_t *old = pv;
-    int32_t new;
-    qemu_get_sbe32s(f, &new);
+    int32_t *cur = pv;
+    int32_t loaded;
+    qemu_get_sbe32s(f, &loaded);
 
-    if (*old <= new) {
+    if (loaded <= *cur) {
+        *cur = loaded;
         return 0;
     }
     return -EINVAL;
 }
 
 const VMStateInfo vmstate_info_int32_le = {
-    .name = "int32 equal",
+    .name = "int32 le",
     .get  = get_int32_le,
     .put  = put_int32,
 };
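The old get_int32_le() had two bugs, both visible in the hunk above: the comparison was inverted (it accepted whenever the resident field was <= the loaded value, rather than checking the loaded value against the field), and the accepted value was never assigned, so even a "successful" load changed nothing. A stand-alone harness contrasting the two behaviors; the plain functions stand in for the VMStateInfo .get callback:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Pre-patch behavior: inverted test, loaded value never stored. */
static int old_int32_le(int32_t *field, int32_t loaded)
{
    if (*field <= loaded) {
        return 0;
    }
    return -EINVAL;
}

/* Post-patch behavior: accept only loaded <= field, and adopt the value. */
static int new_int32_le(int32_t *field, int32_t loaded)
{
    if (loaded <= *field) {
        *field = loaded;
        return 0;
    }
    return -EINVAL;
}

int main(void)
{
    int32_t a = 10, b = 10;

    /* Loading 20 into a field of 10 must fail; the old code accepted it. */
    printf("old: %d  new: %d\n", old_int32_le(&a, 20), new_int32_le(&b, 20));

    /* Loading 5 must succeed and update the field; the old code rejected
     * it and kept 10. */
    old_int32_le(&a, 5);
    new_int32_le(&b, 5);
    printf("old field: %d  new field: %d\n", a, b);
    return 0;
}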