migration/next for 20140225
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1

iQIcBAABCAAGBQJTDKLvAAoJEPSH7xhYctcjhA8QAKDslw9iovAHU4c0NgQxp3yE
08dAD6bznHPkc6ENZEbV4+Yx9AvtGwYeKE4IlVqxDaSCBQ1T/lGr6Di/X/Yuwjo9
80/av6cFpFsO9fw4fhFRNjU0n8xKeN2S/kjCQhz07Zky2mD2fEoLnTrhmjBRCsVN
tVCWOYzbkNbIFUCsJB0OBfC/qH0r5RuB2/SuNnwk4NwT5r7+UxMtfZ+BIE4Kez3n
l6G4L1XO3julErp/8BQmIChnHH7QtTfQzBahJIlBsiLiqHhX1f1v6Q0CRln+A9S1
jfAK/1zqpYVOAb59R2u0FCgB793sV0P+aa71ORRP1g57lFC5KsGJghQq0OoWr1YA
OHrOFPm2YHdTBsU7BG3ndMSbNgZspVAxns6mcSkcDWEH0JDv+FhK08+45tDqkAOu
9hWuYA5p6hodOEBLprNit7lK+7coAKDCkIM4hzPMVZxGCucDqRmtI0oHadjar1Wi
nTbxeDqsh67mr6+QXSR8PRQ3y0TDsuBS6Sm2+Bchv1Nt5GiAKaMySiPuXGQlMSS1
3ohy77Ltz42ci1+mFSp6aVaZO8hEkakaN8Hg53T57IVTSqy4B9t/R3bvi+SsysCt
BMaHONUnOuloKtA5dnOd6Q+hLE8tw3UNGFB71VZoj1tEbXj48WpIZ1IpQYbVAoyQ
DR2+Wccft0O3GVAgLAo0
=yrmU
-----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/juanquintela/tags/migration/20140225' into staging

migration/next for 20140225

# gpg: Signature made Tue 25 Feb 2014 14:04:31 GMT using RSA key ID 5872D723
# gpg: Can't check signature: public key not found

* remotes/juanquintela/tags/migration/20140225:
  rdma: rename 'x-rdma' => 'rdma'
  Fix two XBZRLE corruption issues
  Fix vmstate_info_int32_le comparison/assign
  qemu_file: use fwrite() correctly

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit 6f6831f61a

arch_init.c (62 changed lines):
@@ -122,7 +122,6 @@ static void check_guest_throttling(void);
 #define RAM_SAVE_FLAG_XBZRLE   0x40
 /* 0x80 is reserved in migration.h start with 0x100 next */
 
-
 static struct defconfig_file {
     const char *filename;
     /* Indicates it is an user config file (disabled by -no-user-config) */
@@ -133,6 +132,7 @@ static struct defconfig_file {
     { NULL }, /* end of list */
 };
 
+static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
 
 int qemu_read_default_config_files(bool userconfig)
 {
@@ -273,6 +273,34 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
     return size;
 }
 
+/* This is the last block that we have visited serching for dirty pages
+ */
+static RAMBlock *last_seen_block;
+/* This is the last block from where we have sent data */
+static RAMBlock *last_sent_block;
+static ram_addr_t last_offset;
+static unsigned long *migration_bitmap;
+static uint64_t migration_dirty_pages;
+static uint32_t last_version;
+static bool ram_bulk_stage;
+
+/* Update the xbzrle cache to reflect a page that's been sent as all 0.
+ * The important thing is that a stale (not-yet-0'd) page be replaced
+ * by the new data.
+ * As a bonus, if the page wasn't in the cache it gets added so that
+ * when a small write is made into the 0'd page it gets XBZRLE sent
+ */
+static void xbzrle_cache_zero_page(ram_addr_t current_addr)
+{
+    if (ram_bulk_stage || !migrate_use_xbzrle()) {
+        return;
+    }
+
+    /* We don't care if this fails to allocate a new cache page
+     * as long as it updated an old one */
+    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE);
+}
+
 #define ENCODING_FLAG_XBZRLE 0x1
 
 static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
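This hunk moves the RAM-save state (last_seen_block and friends) above the new helper so xbzrle_cache_zero_page() can test ram_bulk_stage, and the helper itself is the core of the first XBZRLE corruption fix: when a page is sent via the zero-page shortcut, the copy of that page in the sender's XBZRLE cache was left stale, so a later XBZRLE pass encoded against data the destination no longer held. The toy program below is a minimal sketch of that failure mode, not QEMU code: the 4-byte page, the names, and the byte-wise "delta" are all invented for illustration.

/* toy_xbzrle_stale.c: why a zero-page send must update the XBZRLE cache. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_PAGE 4

static uint8_t sender_cache[TOY_PAGE];  /* sender's reference copy */
static uint8_t receiver[TOY_PAGE];      /* what the destination holds */

/* XBZRLE-like delta: transmit only bytes that differ from the cache. */
static void send_delta(const uint8_t *guest)
{
    for (int i = 0; i < TOY_PAGE; i++) {
        if (guest[i] != sender_cache[i]) {
            receiver[i] = guest[i];      /* this byte goes on the wire */
        }
    }
    memcpy(sender_cache, guest, TOY_PAGE);
}

/* Zero-page shortcut: one flag on the wire, receiver zero-fills. */
static void send_zero(void)
{
    memset(receiver, 0, TOY_PAGE);
    /* BUG being fixed: sender_cache is not updated here; the patch adds
     * the equivalent of cache_insert(cache, addr, ZERO_TARGET_PAGE). */
}

int main(void)
{
    uint8_t guest[TOY_PAGE] = {1, 1, 1, 1};

    send_delta(guest);             /* initial sync: both sides hold 1111 */
    memset(guest, 0, TOY_PAGE);
    send_zero();                   /* page zeroed; sender cache now stale */
    guest[0] = 1;                  /* guest dirties one byte back to 1 */
    send_delta(guest);             /* byte 0 matches the stale cache, so it
                                    * is skipped -- but the receiver has 0 */

    printf("guest:    %d%d%d%d\n", guest[0], guest[1], guest[2], guest[3]);
    printf("receiver: %d%d%d%d\n", receiver[0], receiver[1],
           receiver[2], receiver[3]);   /* prints 1000 vs 0000 */
    return 0;
}

Updating the cache on the zero-page path, as xbzrle_cache_zero_page() does, makes the encoder see byte 0 as dirty and resend it.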
@@ -329,18 +357,6 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
     return bytes_sent;
 }
 
-
-/* This is the last block that we have visited serching for dirty pages
- */
-static RAMBlock *last_seen_block;
-/* This is the last block from where we have sent data */
-static RAMBlock *last_sent_block;
-static ram_addr_t last_offset;
-static unsigned long *migration_bitmap;
-static uint64_t migration_dirty_pages;
-static uint32_t last_version;
-static bool ram_bulk_stage;
-
 static inline
 ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
                                                  ram_addr_t start)
@@ -512,6 +528,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
         } else {
             int ret;
             uint8_t *p;
+            bool send_async = true;
             int cont = (block == last_sent_block) ?
                 RAM_SAVE_FLAG_CONTINUE : 0;
 
@@ -522,6 +539,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
             ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_sent);
 
+            current_addr = block->offset + offset;
             if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
                 if (ret != RAM_SAVE_CONTROL_DELAYED) {
                     if (bytes_sent > 0) {
@@ -536,19 +554,35 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
                                             RAM_SAVE_FLAG_COMPRESS);
                 qemu_put_byte(f, 0);
                 bytes_sent++;
+                /* Must let xbzrle know, otherwise a previous (now 0'd) cached
+                 * page would be stale
+                 */
+                xbzrle_cache_zero_page(current_addr);
             } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
-                current_addr = block->offset + offset;
                 bytes_sent = save_xbzrle_page(f, p, current_addr, block,
                                               offset, cont, last_stage);
+                if (!last_stage) {
+                    /* We must send exactly what's in the xbzrle cache
+                     * even if the page wasn't xbzrle compressed, so that
+                     * it's right next time.
+                     */
+                    p = get_cached_data(XBZRLE.cache, current_addr);
+
+                    /* Can't send this cached data async, since the cache page
+                     * might get updated before it gets to the wire
+                     */
+                    send_async = false;
+                }
             }
 
             /* XBZRLE overflow or normal page */
             if (bytes_sent == -1) {
                 bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
-                qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+                if (send_async) {
+                    qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+                } else {
+                    qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+                }
                 bytes_sent += TARGET_PAGE_SIZE;
                 acct_info.norm_pages++;
             }
 
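Two pieces of the same fix meet here. The previous hunk hoists current_addr so it is computed once, before the ram_control and zero-page paths that now need it. The send_async flag addresses the second corruption: qemu_put_buffer_async() queues a pointer to the data rather than copying it, so once p is redirected into the XBZRLE cache, a later cache update can overwrite the page before the queued buffer reaches the wire; such pages must go through the copying qemu_put_buffer(). A minimal stand-alone sketch of the pointer-queuing hazard (the two-function queue API is invented for illustration):

/* toy_async.c: queuing a pointer to a mutable buffer loses the old data. */
#include <stdio.h>
#include <string.h>

static const char *queued;               /* deferred write holds a pointer */

static void put_async(const char *buf) { queued = buf; }
static void flush(void) { printf("wire sees: %s\n", queued); }

int main(void)
{
    char cache_page[16];

    strcpy(cache_page, "old");
    put_async(cache_page);               /* queue the pointer, not a copy */
    strcpy(cache_page, "new");           /* cache entry updated meanwhile */
    flush();                             /* prints "new", not "old" */
    return 0;
}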
docs/rdma.txt:

@@ -66,7 +66,7 @@ bulk-phase round of the migration and can be enabled for extremely
 high-performance RDMA hardware using the following command:
 
 QEMU Monitor Command:
-$ migrate_set_capability x-rdma-pin-all on # disabled by default
+$ migrate_set_capability rdma-pin-all on # disabled by default
 
 Performing this action will cause all 8GB to be pinned, so if that's
 not what you want, then please ignore this step altogether.
@@ -93,12 +93,12 @@ $ migrate_set_speed 40g # or whatever is the MAX of your RDMA device
 
 Next, on the destination machine, add the following to the QEMU command line:
 
-qemu ..... -incoming x-rdma:host:port
+qemu ..... -incoming rdma:host:port
 
 Finally, perform the actual migration on the source machine:
 
 QEMU Monitor Command:
-$ migrate -d x-rdma:host:port
+$ migrate -d rdma:host:port
 
 PERFORMANCE
 ===========
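Taken together, the renamed flow now reads as follows; this just assembles the document's own snippets, with host and port as placeholders exactly as in the doc.

Destination command line:

qemu ..... -incoming rdma:host:port

Source QEMU monitor:

$ migrate_set_capability rdma-pin-all on   # optional, disabled by default
$ migrate_set_speed 40g                    # or the MAX of your RDMA device
$ migrate -d rdma:host:port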
@@ -120,8 +120,8 @@ For example, in the same 8GB RAM example with all 8GB of memory in
 active use and the VM itself is completely idle using the same 40 gbps
 infiniband link:
 
-1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
-2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
+1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
+2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
 
 These numbers would of course scale up to whatever size virtual machine
 you have to migrate using RDMA.
@@ -407,18 +407,14 @@ socket is broken during a non-RDMA based migration.
 
 TODO:
 =====
-1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be
-   renamed to 'rdma' after the experimental phase of this work has
-   completed upstream.
-2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
+1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
    are not compatible with infinband memory pinning and will result in
    an aborted migration (but with the source VM left unaffected).
-3. Use of the recent /proc/<pid>/pagemap would likely speed up
+2. Use of the recent /proc/<pid>/pagemap would likely speed up
    the use of KSM and ballooning while using RDMA.
-4. Also, some form of balloon-device usage tracking would also
+3. Also, some form of balloon-device usage tracking would also
    help alleviate some issues.
-5. Move UNREGISTER requests to a separate thread.
-6. Use LRU to provide more fine-grained direction of UNREGISTER
+4. Use LRU to provide more fine-grained direction of UNREGISTER
    requests for unpinning memory in an overcommitted environment.
-7. Expose UNREGISTER support to the user by way of workload-specific
+5. Expose UNREGISTER support to the user by way of workload-specific
    hints about application behavior.
include/migration/page_cache.h:

@@ -66,7 +66,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr);
  * @addr: page address
  * @pdata: pointer to the page
  */
-int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata);
+int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata);
 
 /**
  * cache_resize: resize the page cache. In case of size reduction the extra
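The added const is what lets arch_init.c pass ZERO_TARGET_PAGE, a const array, into cache_insert() without a cast. A stand-alone sketch of the qualifier issue; the stub function is invented, only the prototype shape mirrors the patch:

#include <stdint.h>

enum { TARGET_PAGE_SIZE = 4096 };

static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];

/* Post-patch shape: the cache only reads the page, so take const. */
static int cache_insert_stub(uint64_t addr, const uint8_t *pdata)
{
    (void)addr;
    (void)pdata;
    return 0;
}

int main(void)
{
    /* With the old "uint8_t *pdata" prototype this call would discard the
     * const qualifier and draw a compiler diagnostic; with const it is
     * accepted cleanly. */
    return cache_insert_stub(0, ZERO_TARGET_PAGE);
}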
migration-rdma.c:

@@ -3412,7 +3412,7 @@ void rdma_start_outgoing_migration(void *opaque,
     }
 
     ret = qemu_rdma_source_init(rdma, &local_err,
-        s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]);
+        s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]);
 
     if (ret) {
         goto err;
migration.c:

@@ -82,7 +82,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)
     if (strstart(uri, "tcp:", &p))
         tcp_start_incoming_migration(p, errp);
 #ifdef CONFIG_RDMA
-    else if (strstart(uri, "x-rdma:", &p))
+    else if (strstart(uri, "rdma:", &p))
         rdma_start_incoming_migration(p, errp);
 #endif
 #if !defined(WIN32)
@@ -438,7 +438,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
     if (strstart(uri, "tcp:", &p)) {
         tcp_start_outgoing_migration(s, p, &local_err);
 #ifdef CONFIG_RDMA
-    } else if (strstart(uri, "x-rdma:", &p)) {
+    } else if (strstart(uri, "rdma:", &p)) {
         rdma_start_outgoing_migration(s, p, &local_err);
 #endif
 #if !defined(WIN32)
@@ -532,7 +532,7 @@ bool migrate_rdma_pin_all(void)
 
     s = migrate_get_current();
 
-    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL];
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL];
 }
 
 bool migrate_auto_converge(void)
page_cache.c:

@@ -150,7 +150,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr)
     return cache_get_by_addr(cache, addr)->it_data;
 }
 
-int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata)
+int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata)
 {
 
     CacheItem *it = NULL;
qapi-schema.json:

@@ -751,10 +751,9 @@
 #          This feature allows us to minimize migration traffic for certain work
 #          loads, by sending compressed difference of the pages
 #
-# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is
+# @rdma-pin-all: Controls whether or not the entire VM memory footprint is
 #          mlock()'d on demand or all at once. Refer to docs/rdma.txt for usage.
-#          Disabled by default. Experimental: may (or may not) be renamed after
-#          further testing is complete. (since 1.6)
+#          Disabled by default. (since 2.0)
 #
 # @zero-blocks: During storage migration encode blocks of zeroes efficiently. This
 #          essentially saves 1MB of zeroes per block on the wire. Enabling requires
@@ -768,7 +767,7 @@
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
+  'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks'] }
 
 ##
 # @MigrationCapabilityStatus
qemu-file.c:

@@ -100,7 +100,14 @@ static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos,
                             int size)
 {
     QEMUFileStdio *s = opaque;
-    return fwrite(buf, 1, size, s->stdio_file);
+    int res;
+
+    res = fwrite(buf, 1, size, s->stdio_file);
+
+    if (res != size) {
+        return -EIO;	/* fake errno value */
+    }
+    return res;
 }
 
 static int stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
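fwrite() returns the number of items written, and with an item size of 1 that is a byte count, so any short count (including 0) means the write failed; the old code returned that count as-is, and since QEMUFile treats every non-negative return as success, errors such as a full disk were silently swallowed. A stand-alone sketch of the corrected pattern; /dev/full is a Linux-only device whose writes always fail with ENOSPC, used here purely to force the error path:

#include <errno.h>
#include <stdio.h>

/* Map a short fwrite() to a negative errno-style value, as the patched
 * stdio_put_buffer() does. */
static int put_buffer(FILE *fp, const unsigned char *buf, size_t size)
{
    size_t res = fwrite(buf, 1, size, fp);

    if (res != size) {
        return -EIO;                 /* fake errno value, as in the patch */
    }
    return (int)res;
}

int main(void)
{
    unsigned char buf[4] = "abc";
    FILE *fp = fopen("/dev/full", "w");  /* Linux: writes fail with ENOSPC */

    if (fp == NULL) {
        return 1;
    }
    setvbuf(fp, NULL, _IONBF, 0);        /* unbuffered: fwrite fails now */
    printf("put_buffer returned %d\n", put_buffer(fp, buf, sizeof(buf)));
    fclose(fp);
    return 0;                            /* prints -5 (-EIO) on Linux */
}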
vmstate.c (15 changed lines):
@@ -321,23 +321,24 @@ const VMStateInfo vmstate_info_int32_equal = {
     .put  = put_int32,
 };
 
-/* 32 bit int. See that the received value is the less or the same
-   than the one in the field */
+/* 32 bit int. Check that the received value is less than or equal to
+   the one in the field */
 
 static int get_int32_le(QEMUFile *f, void *pv, size_t size)
 {
-    int32_t *old = pv;
-    int32_t new;
-    qemu_get_sbe32s(f, &new);
+    int32_t *cur = pv;
+    int32_t loaded;
+    qemu_get_sbe32s(f, &loaded);
 
-    if (*old <= new) {
+    if (loaded <= *cur) {
+        *cur = loaded;
         return 0;
     }
     return -EINVAL;
 }
 
 const VMStateInfo vmstate_info_int32_le = {
-    .name = "int32 equal",
+    .name = "int32 le",
     .get  = get_int32_le,
     .put  = put_int32,
 };
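The old get_int32_le() had two bugs, both visible in the hunk above: the comparison was inverted (it accepted whenever the resident field was <= the loaded value, rather than checking the loaded value against the field), and the accepted value was never assigned, so even a "successful" load changed nothing. A stand-alone harness contrasting the two behaviors; the plain functions stand in for the VMStateInfo .get callback:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Pre-patch behavior: inverted test, loaded value never stored. */
static int old_int32_le(int32_t *field, int32_t loaded)
{
    if (*field <= loaded) {
        return 0;
    }
    return -EINVAL;
}

/* Post-patch behavior: accept only loaded <= field, and adopt the value. */
static int new_int32_le(int32_t *field, int32_t loaded)
{
    if (loaded <= *field) {
        *field = loaded;
        return 0;
    }
    return -EINVAL;
}

int main(void)
{
    int32_t a = 10, b = 10;

    /* Loading 20 into a field of 10 must fail; the old code accepted it. */
    printf("old: %d  new: %d\n", old_int32_le(&a, 20), new_int32_le(&b, 20));

    /* Loading 5 must succeed and update the field; the old code rejected
     * it and kept 10. */
    old_int32_le(&a, 5);
    new_int32_le(&b, 5);
    printf("old field: %d  new field: %d\n", a, b);
    return 0;
}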