migration: allow fault thread to pause
Allow the fault thread to stop handling page faults temporarily. When a network failure happens (and we expect a recovery afterwards), we should not allow the fault thread to continue sending page requests to the source; instead, it should halt for a while until the connection is rebuilt. When the dest main thread notices the failure, it kicks the fault thread to switch to the paused state. Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Signed-off-by: Peter Xu <peterx@redhat.com> Message-Id: <20180502104740.12123-7-peterx@redhat.com> Signed-off-by: Juan Quintela <quintela@redhat.com>
This commit is contained in:
		
							parent
							
								
									14b1742eaa
								
							
						
					
					
						commit
						3a7804c306
					
				| @ -160,6 +160,7 @@ MigrationIncomingState *migration_incoming_get_current(void) | ||||
|         qemu_mutex_init(&mis_current.rp_mutex); | ||||
|         qemu_event_init(&mis_current.main_thread_load_event, false); | ||||
|         qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0); | ||||
|         qemu_sem_init(&mis_current.postcopy_pause_sem_fault, 0); | ||||
| 
 | ||||
|         init_dirty_bitmap_incoming_migration(); | ||||
| 
 | ||||
|  | ||||
| @ -76,6 +76,7 @@ struct MigrationIncomingState { | ||||
| 
 | ||||
|     /* notify PAUSED postcopy incoming migrations to try to continue */ | ||||
|     QemuSemaphore postcopy_pause_sem_dst; | ||||
|     QemuSemaphore postcopy_pause_sem_fault; | ||||
| }; | ||||
| 
 | ||||
| MigrationIncomingState *migration_incoming_get_current(void); | ||||
|  | ||||
| @ -830,6 +830,17 @@ static void mark_postcopy_blocktime_end(uintptr_t addr) | ||||
|                                       affected_cpu); | ||||
| } | ||||
| 
 | ||||
/*
 * Park the postcopy fault thread until it is allowed to continue.
 *
 * Emits the postcopy_pause_fault_thread trace point, blocks in
 * qemu_sem_wait() on mis->postcopy_pause_sem_fault, and emits the
 * postcopy_pause_fault_thread_continued trace point once the wait returns.
 *
 * @mis: incoming migration state holding postcopy_pause_sem_fault
 *
 * Returns: always true as written. The callers also handle a false
 * return, but this version never produces one.
 *
 * NOTE(review): the code that posts postcopy_pause_sem_fault is not
 * visible here; presumably it is posted once the channel to the source
 * has been rebuilt -- confirm against the recovery path.
 */
| static bool postcopy_pause_fault_thread(MigrationIncomingState *mis) | ||||
| { | ||||
|     trace_postcopy_pause_fault_thread(); | ||||
| 
 | ||||
|     qemu_sem_wait(&mis->postcopy_pause_sem_fault); | ||||
| 
 | ||||
|     trace_postcopy_pause_fault_thread_continued(); | ||||
| 
 | ||||
|     return true; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Handle faults detected by the USERFAULT markings | ||||
|  */ | ||||
| @ -880,6 +891,22 @@ static void *postcopy_ram_fault_thread(void *opaque) | ||||
|             break; | ||||
|         } | ||||
| 
 | ||||
|         if (!mis->to_src_file) { | ||||
|             /*
 | ||||
|              * Possibly someone tells us that the return path is | ||||
|              * broken already using the event. We should hold until | ||||
|              * the channel is rebuilt. | ||||
|              */ | ||||
|             if (postcopy_pause_fault_thread(mis)) { | ||||
|                 mis->last_rb = NULL; | ||||
|                 /* Continue to read the userfaultfd */ | ||||
|             } else { | ||||
|                 error_report("%s: paused but don't allow to continue", | ||||
|                              __func__); | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         if (pfd[1].revents) { | ||||
|             uint64_t tmp64 = 0; | ||||
| 
 | ||||
| @ -942,18 +969,37 @@ static void *postcopy_ram_fault_thread(void *opaque) | ||||
|                     (uintptr_t)(msg.arg.pagefault.address), | ||||
|                                 msg.arg.pagefault.feat.ptid, rb); | ||||
| 
 | ||||
| retry: | ||||
|             /*
 | ||||
|              * Send the request to the source - we want to request one | ||||
|              * of our host page sizes (which is >= TPS) | ||||
|              */ | ||||
|             if (rb != mis->last_rb) { | ||||
|                 mis->last_rb = rb; | ||||
|                 migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), | ||||
|                                          rb_offset, qemu_ram_pagesize(rb)); | ||||
|                 ret = migrate_send_rp_req_pages(mis, | ||||
|                                                 qemu_ram_get_idstr(rb), | ||||
|                                                 rb_offset, | ||||
|                                                 qemu_ram_pagesize(rb)); | ||||
|             } else { | ||||
|                 /* Save some space */ | ||||
|                 migrate_send_rp_req_pages(mis, NULL, | ||||
|                                          rb_offset, qemu_ram_pagesize(rb)); | ||||
|                 ret = migrate_send_rp_req_pages(mis, | ||||
|                                                 NULL, | ||||
|                                                 rb_offset, | ||||
|                                                 qemu_ram_pagesize(rb)); | ||||
|             } | ||||
| 
 | ||||
|             if (ret) { | ||||
|                 /* May be network failure, try to wait for recovery */ | ||||
|                 if (ret == -EIO && postcopy_pause_fault_thread(mis)) { | ||||
|                     /* We got reconnected somehow, try to continue */ | ||||
|                     mis->last_rb = NULL; | ||||
|                     goto retry; | ||||
|                 } else { | ||||
|                     /* This is a unavoidable fault */ | ||||
|                     error_report("%s: migrate_send_rp_req_pages() get %d", | ||||
|                                  __func__, ret); | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|  | ||||
| @ -2083,6 +2083,9 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis) | ||||
|     mis->to_src_file = NULL; | ||||
|     qemu_mutex_unlock(&mis->rp_mutex); | ||||
| 
 | ||||
|     /* Notify the fault thread for the invalidated file handle */ | ||||
|     postcopy_fault_thread_notify(mis); | ||||
| 
 | ||||
|     error_report("Detected IO failure for postcopy. " | ||||
|                  "Migration paused."); | ||||
| 
 | ||||
|  | ||||
| @ -101,6 +101,8 @@ open_return_path_on_source_continue(void) "" | ||||
| postcopy_start(void) "" | ||||
| postcopy_pause_return_path(void) "" | ||||
| postcopy_pause_return_path_continued(void) "" | ||||
| postcopy_pause_fault_thread(void) "" | ||||
| postcopy_pause_fault_thread_continued(void) "" | ||||
| postcopy_pause_continued(void) "" | ||||
| postcopy_pause_incoming(void) "" | ||||
| postcopy_pause_incoming_continued(void) "" | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Peter Xu
						Peter Xu