memory: RCU ram_list.dirty_memory[] for safe RAM hotplug
Although accesses to ram_list.dirty_memory[] use atomics so multiple threads can safely dirty the bitmap, the data structure is not fully thread-safe yet.

This patch handles the RAM hotplug case where ram_list.dirty_memory[] is grown. ram_list.dirty_memory[] is changed from a regular bitmap to an RCU array of pointers to fixed-size bitmap blocks. Threads can continue accessing bitmap blocks while the array is being extended. See the comments in the code for an in-depth explanation of struct DirtyMemoryBlocks.

I have tested that live migration with virtio-blk dataplane works.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <1453728801-5398-2-git-send-email-stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
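Every accessor touched by this patch walks a page range with the same idx/offset/num pattern so that it never reads past the end of a fixed-size block. The following standalone C sketch shows just that walk; it is not QEMU code (BLOCK_SIZE, NUM_BLOCKS, dirty_blocks and set_dirty_range are illustrative names, the blocks are preallocated instead of grown, and the bit writes are plain rather than the atomic bitmap operations QEMU performs under rcu_read_lock()):

/*
 * Illustrative sketch of the idx/offset/num block walk used by the patch.
 * Not QEMU code: plain bit writes, fixed number of preallocated blocks.
 */
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE    4096UL                   /* pages (bits) per block */
#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define NUM_BLOCKS    2UL

static unsigned long *dirty_blocks[NUM_BLOCKS];

/* Mark pages [page, end) dirty, clamping each step to the current block. */
static void set_dirty_range(unsigned long page, unsigned long end)
{
    while (page < end) {
        unsigned long idx = page / BLOCK_SIZE;
        unsigned long offset = page % BLOCK_SIZE;
        unsigned long num = end - page;

        if (num > BLOCK_SIZE - offset) {
            num = BLOCK_SIZE - offset;         /* stop at the block edge */
        }
        for (unsigned long i = 0; i < num; i++) {
            unsigned long bit = offset + i;
            dirty_blocks[idx][bit / BITS_PER_LONG] |=
                1UL << (bit % BITS_PER_LONG);
        }
        page += num;
    }
}

int main(void)
{
    for (unsigned long i = 0; i < NUM_BLOCKS; i++) {
        dirty_blocks[i] = calloc(BLOCK_SIZE / BITS_PER_LONG,
                                 sizeof(unsigned long));
    }
    set_dirty_range(4090, 4110);               /* crosses the block 0/1 edge */
    printf("block 1, first word: 0x%lx\n", dirty_blocks[1][0]);
    return 0;
}

Clamping num to the distance from offset to the block edge is what the in-code comment means by checking for the end of the block when accessing a range of addresses.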
parent 8bafcb2164
commit 5b82b703b6

exec.c | 75
@@ -980,8 +980,9 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
                                              ram_addr_t length,
                                              unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long end, page;
    bool dirty;
    bool dirty = false;

    if (length == 0) {
        return false;
@@ -989,8 +990,22 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;
    dirty = bitmap_test_and_clear_atomic(ram_list.dirty_memory[client],
                                         page, end - page);

    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
                                              offset, num);
        page += num;
    }

    rcu_read_unlock();

    if (dirty && tcg_enabled()) {
        tlb_reset_dirty_range_all(start, length);
@@ -1504,6 +1519,47 @@ int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
    return 0;
}

/* Called with ram_list.mutex held */
static void dirty_memory_extend(ram_addr_t old_ram_size,
                                ram_addr_t new_ram_size)
{
    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    int i;

    /* Only need to extend if block count increased */
    if (new_num_blocks <= old_num_blocks) {
        return;
    }

    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
        DirtyMemoryBlocks *old_blocks;
        DirtyMemoryBlocks *new_blocks;
        int j;

        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
        new_blocks = g_malloc(sizeof(*new_blocks) +
                              sizeof(new_blocks->blocks[0]) * new_num_blocks);

        if (old_num_blocks) {
            memcpy(new_blocks->blocks, old_blocks->blocks,
                   old_num_blocks * sizeof(old_blocks->blocks[0]));
        }

        for (j = old_num_blocks; j < new_num_blocks; j++) {
            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
        }

        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);

        if (old_blocks) {
            g_free_rcu(old_blocks, rcu);
        }
    }
}

static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
{
    RAMBlock *block;
@@ -1543,6 +1599,7 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
    if (new_ram_size > old_ram_size) {
        migration_bitmap_extend(old_ram_size, new_ram_size);
        dirty_memory_extend(old_ram_size, new_ram_size);
    }
    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
@@ -1568,18 +1625,6 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;

    if (new_ram_size > old_ram_size) {
        int i;

        /* ram_list.dirty_memory[] is protected by the iothread lock.  */
        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
            ram_list.dirty_memory[i] =
                bitmap_zero_extend(ram_list.dirty_memory[i],
                                   old_ram_size, new_ram_size);
       }
    }
    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length,
                                        DIRTY_CLIENTS_ALL);

include/exec/ram_addr.h

@@ -49,13 +49,43 @@ static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
    return (char *)block->host + offset;
}

/* The dirty memory bitmap is split into fixed-size blocks to allow growth
 * under RCU.  The bitmap for a block can be accessed as follows:
 *
 *   rcu_read_lock();
 *
 *   DirtyMemoryBlocks *blocks =
 *       atomic_rcu_read(&ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION]);
 *
 *   ram_addr_t idx = (addr >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
 *   unsigned long *block = blocks.blocks[idx];
 *   ...access block bitmap...
 *
 *   rcu_read_unlock();
 *
 * Remember to check for the end of the block when accessing a range of
 * addresses.  Move on to the next block if you reach the end.
 *
 * Organization into blocks allows dirty memory to grow (but not shrink) under
 * RCU.  When adding new RAMBlocks requires the dirty memory to grow, a new
 * DirtyMemoryBlocks array is allocated with pointers to existing blocks kept
 * the same.  Other threads can safely access existing blocks while dirty
 * memory is being grown.  When no threads are using the old DirtyMemoryBlocks
 * anymore it is freed by RCU (but the underlying blocks stay because they are
 * pointed to from the new DirtyMemoryBlocks).
 */
#define DIRTY_MEMORY_BLOCK_SIZE ((ram_addr_t)256 * 1024 * 8)
typedef struct {
    struct rcu_head rcu;
    unsigned long *blocks[];
} DirtyMemoryBlocks;

typedef struct RAMList {
    QemuMutex mutex;
    /* Protected by the iothread lock.  */
    unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
    RAMBlock *mru_block;
    /* RCU-enabled, writes protected by the ramlist lock. */
    QLIST_HEAD(, RAMBlock) blocks;
    DirtyMemoryBlocks *dirty_memory[DIRTY_MEMORY_NUM];
    uint32_t version;
} RAMList;
extern RAMList ram_list;
@@ -89,30 +119,70 @@ static inline bool cpu_physical_memory_get_dirty(ram_addr_t start,
                                                 ram_addr_t length,
                                                 unsigned client)
{
    unsigned long end, page, next;
    DirtyMemoryBlocks *blocks;
    unsigned long end, page;
    bool dirty = false;

    assert(client < DIRTY_MEMORY_NUM);

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;
    next = find_next_bit(ram_list.dirty_memory[client], end, page);

    return next < end;
    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        if (find_next_bit(blocks->blocks[idx], offset, num) < num) {
            dirty = true;
            break;
        }

        page += num;
    }

    rcu_read_unlock();

    return dirty;
}

static inline bool cpu_physical_memory_all_dirty(ram_addr_t start,
                                                 ram_addr_t length,
                                                 unsigned client)
{
    unsigned long end, page, next;
    DirtyMemoryBlocks *blocks;
    unsigned long end, page;
    bool dirty = true;

    assert(client < DIRTY_MEMORY_NUM);

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;
    next = find_next_zero_bit(ram_list.dirty_memory[client], end, page);

    return next >= end;
    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        if (find_next_zero_bit(blocks->blocks[idx], offset, num) < num) {
            dirty = false;
            break;
        }

        page += num;
    }

    rcu_read_unlock();

    return dirty;
}

static inline bool cpu_physical_memory_get_dirty_flag(ram_addr_t addr,
@@ -154,16 +224,31 @@ static inline uint8_t cpu_physical_memory_range_includes_clean(ram_addr_t start,
static inline void cpu_physical_memory_set_dirty_flag(ram_addr_t addr,
                                                      unsigned client)
{
    unsigned long page, idx, offset;
    DirtyMemoryBlocks *blocks;

    assert(client < DIRTY_MEMORY_NUM);
    set_bit_atomic(addr >> TARGET_PAGE_BITS, ram_list.dirty_memory[client]);

    page = addr >> TARGET_PAGE_BITS;
    idx = page / DIRTY_MEMORY_BLOCK_SIZE;
    offset = page % DIRTY_MEMORY_BLOCK_SIZE;

    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    set_bit_atomic(offset, blocks->blocks[idx]);

    rcu_read_unlock();
}

static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
                                                       ram_addr_t length,
                                                       uint8_t mask)
{
    DirtyMemoryBlocks *blocks[DIRTY_MEMORY_NUM];
    unsigned long end, page;
    unsigned long **d = ram_list.dirty_memory;
    int i;

    if (!mask && !xen_enabled()) {
        return;
@@ -171,15 +256,36 @@ static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;
    if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
        bitmap_set_atomic(d[DIRTY_MEMORY_MIGRATION], page, end - page);

    rcu_read_lock();

    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
        blocks[i] = atomic_rcu_read(&ram_list.dirty_memory[i]);
    }
    if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
        bitmap_set_atomic(d[DIRTY_MEMORY_VGA], page, end - page);
    }
    if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
        bitmap_set_atomic(d[DIRTY_MEMORY_CODE], page, end - page);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
            bitmap_set_atomic(blocks[DIRTY_MEMORY_MIGRATION]->blocks[idx],
                              offset, num);
        }
        if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
            bitmap_set_atomic(blocks[DIRTY_MEMORY_VGA]->blocks[idx],
                              offset, num);
        }
        if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
            bitmap_set_atomic(blocks[DIRTY_MEMORY_CODE]->blocks[idx],
                              offset, num);
        }

        page += num;
    }

    rcu_read_unlock();

    xen_modified_memory(start, length);
}

@@ -199,21 +305,41 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
    /* start address is aligned at the start of a word? */
    if ((((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) &&
        (hpratio == 1)) {
        unsigned long **blocks[DIRTY_MEMORY_NUM];
        unsigned long idx;
        unsigned long offset;
        long k;
        long nr = BITS_TO_LONGS(pages);

        idx = (start >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
        offset = BIT_WORD((start >> TARGET_PAGE_BITS) %
                          DIRTY_MEMORY_BLOCK_SIZE);

        rcu_read_lock();

        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
            blocks[i] = atomic_rcu_read(&ram_list.dirty_memory[i])->blocks;
        }

        for (k = 0; k < nr; k++) {
            if (bitmap[k]) {
                unsigned long temp = leul_to_cpu(bitmap[k]);
                unsigned long **d = ram_list.dirty_memory;

                atomic_or(&d[DIRTY_MEMORY_MIGRATION][page + k], temp);
                atomic_or(&d[DIRTY_MEMORY_VGA][page + k], temp);
                atomic_or(&blocks[DIRTY_MEMORY_MIGRATION][idx][offset], temp);
                atomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
                if (tcg_enabled()) {
                    atomic_or(&d[DIRTY_MEMORY_CODE][page + k], temp);
                    atomic_or(&blocks[DIRTY_MEMORY_CODE][idx][offset], temp);
                }
            }

            if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
                offset = 0;
                idx++;
            }
        }

        rcu_read_unlock();

        xen_modified_memory(start, pages << TARGET_PAGE_BITS);
    } else {
        uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE;
@@ -265,18 +391,33 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(unsigned long *dest,
    if (((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) {
        int k;
        int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
        unsigned long *src = ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION];
        unsigned long * const *src;
        unsigned long idx = (page * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = BIT_WORD((page * BITS_PER_LONG) %
                                        DIRTY_MEMORY_BLOCK_SIZE);

        rcu_read_lock();

        src = atomic_rcu_read(
                &ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks;

        for (k = page; k < page + nr; k++) {
            if (src[k]) {
                unsigned long bits = atomic_xchg(&src[k], 0);
            if (src[idx][offset]) {
                unsigned long bits = atomic_xchg(&src[idx][offset], 0);
                unsigned long new_dirty;
                new_dirty = ~dest[k];
                dest[k] |= bits;
                new_dirty &= bits;
                num_dirty += ctpopl(new_dirty);
            }

            if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
                offset = 0;
                idx++;
            }
        }

        rcu_read_unlock();
    } else {
        for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
            if (cpu_physical_memory_test_and_clear_dirty(

migration/ram.c

@@ -609,7 +609,6 @@ static void migration_bitmap_sync_init(void)
    iterations_prev = 0;
}

/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
static void migration_bitmap_sync(void)
{
    RAMBlock *block;
@@ -1921,8 +1920,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
        acct_clear();
    }

    /* iothread lock needed for ram_list.dirty_memory[] */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    bytes_transferred = 0;
@@ -1947,7 +1944,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
    memory_global_dirty_log_start();
    migration_bitmap_sync();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

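The growth path itself, dirty_memory_extend() in the exec.c hunk above, allocates a larger array of block pointers, copies the old pointers across, allocates fresh zeroed blocks only for the new range, publishes the new array, and defers freeing the old one. Below is a rough standalone sketch of that grow-and-publish step, with C11 atomics standing in for atomic_rcu_read()/atomic_rcu_set() and the deferred g_free_rcu() reduced to a comment; DirtyBlocks, BLOCK_WORDS and publish_grown are illustrative names, not QEMU APIs:

/* Illustrative grow-and-publish sketch; not the QEMU implementation. */
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

#define BLOCK_WORDS 64                  /* words per fixed-size block (illustrative) */

typedef struct {
    size_t num_blocks;
    unsigned long *blocks[];            /* flexible array of block pointers */
} DirtyBlocks;

static _Atomic(DirtyBlocks *) dirty;    /* pointer that readers load */

static void publish_grown(size_t new_num_blocks)
{
    DirtyBlocks *old = atomic_load(&dirty);
    size_t old_num = old ? old->num_blocks : 0;
    DirtyBlocks *grown;

    if (new_num_blocks <= old_num) {
        return;                         /* the array only grows, never shrinks */
    }
    grown = malloc(sizeof(*grown) + new_num_blocks * sizeof(grown->blocks[0]));
    grown->num_blocks = new_num_blocks;
    if (old_num) {
        /* Existing blocks are carried over by pointer, not reallocated,
         * so readers already using them are unaffected. */
        memcpy(grown->blocks, old->blocks, old_num * sizeof(old->blocks[0]));
    }
    for (size_t i = old_num; i < new_num_blocks; i++) {
        grown->blocks[i] = calloc(BLOCK_WORDS, sizeof(unsigned long));
    }

    atomic_store(&dirty, grown);        /* publish; readers see old or new array */
    /* QEMU frees the old array with g_free_rcu() only after all readers are
     * done with it; freeing it right here would be unsafe. */
}

int main(void)
{
    publish_grown(2);                   /* initial RAM */
    publish_grown(4);                   /* RAM hotplug grows the pointer array */
    return 0;
}

Because the old and new arrays share the underlying block pointers, a reader that loaded the old array keeps seeing valid, up-to-date bitmaps; only the array container is replaced.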