ioapic: allow buggy guests mishandling level-triggered interrupts to make progress
It was found that Hyper-V 2016 on KVM in some configurations (q35 machine + piix4-usb-uhci) hangs on boot. Root-cause was that one of Hyper-V level-triggered interrupt handler performs EOI before fixing the cause of the interrupt. This results in IOAPIC keep re-raising the level-triggered interrupt after EOI because irq-line remains asserted. Gory details: https://www.spinics.net/lists/kvm/msg184484.html (the whole thread). Turns out we were dealing with similar issues before; in-kernel IOAPIC implementation has commit 184564efae4d ("kvm: ioapic: conditionally delay irq delivery duringeoi broadcast") which describes a very similar issue. Steal the idea from the above mentioned commit for IOAPIC implementation in QEMU. SUCCESSIVE_IRQ_MAX_COUNT, delay and the comment are borrowed as well. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Message-Id: <20190402080215.10747-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
		
							parent
							
								
									29de280401
								
							
						
					
					
						commit
						958a01dab8
					
				@ -139,6 +139,15 @@ static void ioapic_service(IOAPICCommonState *s)
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define SUCCESSIVE_IRQ_MAX_COUNT 10000
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void delayed_ioapic_service_cb(void *opaque)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    IOAPICCommonState *s = opaque;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ioapic_service(s);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void ioapic_set_irq(void *opaque, int vector, int level)
 | 
					static void ioapic_set_irq(void *opaque, int vector, int level)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    IOAPICCommonState *s = opaque;
 | 
					    IOAPICCommonState *s = opaque;
 | 
				
			||||||
@ -222,13 +231,39 @@ void ioapic_eoi_broadcast(int vector)
 | 
				
			|||||||
        }
 | 
					        }
 | 
				
			||||||
        for (n = 0; n < IOAPIC_NUM_PINS; n++) {
 | 
					        for (n = 0; n < IOAPIC_NUM_PINS; n++) {
 | 
				
			||||||
            entry = s->ioredtbl[n];
 | 
					            entry = s->ioredtbl[n];
 | 
				
			||||||
            if ((entry & IOAPIC_LVT_REMOTE_IRR)
 | 
					
 | 
				
			||||||
                && (entry & IOAPIC_VECTOR_MASK) == vector) {
 | 
					            if ((entry & IOAPIC_VECTOR_MASK) != vector ||
 | 
				
			||||||
 | 
					                ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
 | 
				
			||||||
 | 
					                continue;
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            if (!(entry & IOAPIC_LVT_REMOTE_IRR)) {
 | 
				
			||||||
 | 
					                continue;
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            trace_ioapic_clear_remote_irr(n, vector);
 | 
					            trace_ioapic_clear_remote_irr(n, vector);
 | 
				
			||||||
            s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR;
 | 
					            s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) {
 | 
					            if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) {
 | 
				
			||||||
 | 
					                ++s->irq_eoi[vector];
 | 
				
			||||||
 | 
					                if (s->irq_eoi[vector] >= SUCCESSIVE_IRQ_MAX_COUNT) {
 | 
				
			||||||
 | 
					                    /*
 | 
				
			||||||
 | 
					                     * Real hardware does not deliver the interrupt immediately
 | 
				
			||||||
 | 
					                     * during eoi broadcast, and this lets a buggy guest make
 | 
				
			||||||
 | 
					                     * slow progress even if it does not correctly handle a
 | 
				
			||||||
 | 
					                     * level-triggered interrupt. Emulate this behavior if we
 | 
				
			||||||
 | 
					                     * detect an interrupt storm.
 | 
				
			||||||
 | 
					                     */
 | 
				
			||||||
 | 
					                    s->irq_eoi[vector] = 0;
 | 
				
			||||||
 | 
					                    timer_mod_anticipate(s->delayed_ioapic_service_timer,
 | 
				
			||||||
 | 
					                                         qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 | 
				
			||||||
 | 
					                                         NANOSECONDS_PER_SECOND / 100);
 | 
				
			||||||
 | 
					                    trace_ioapic_eoi_delayed_reassert(vector);
 | 
				
			||||||
 | 
					                } else {
 | 
				
			||||||
                    ioapic_service(s);
 | 
					                    ioapic_service(s);
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
 | 
					            } else {
 | 
				
			||||||
 | 
					                s->irq_eoi[vector] = 0;
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@ -401,6 +436,9 @@ static void ioapic_realize(DeviceState *dev, Error **errp)
 | 
				
			|||||||
    memory_region_init_io(&s->io_memory, OBJECT(s), &ioapic_io_ops, s,
 | 
					    memory_region_init_io(&s->io_memory, OBJECT(s), &ioapic_io_ops, s,
 | 
				
			||||||
                          "ioapic", 0x1000);
 | 
					                          "ioapic", 0x1000);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    s->delayed_ioapic_service_timer =
 | 
				
			||||||
 | 
					        timer_new_ns(QEMU_CLOCK_VIRTUAL, delayed_ioapic_service_cb, s);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    qdev_init_gpio_in(dev, ioapic_set_irq, IOAPIC_NUM_PINS);
 | 
					    qdev_init_gpio_in(dev, ioapic_set_irq, IOAPIC_NUM_PINS);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ioapics[ioapic_no] = s;
 | 
					    ioapics[ioapic_no] = s;
 | 
				
			||||||
@ -408,6 +446,14 @@ static void ioapic_realize(DeviceState *dev, Error **errp)
 | 
				
			|||||||
    qemu_add_machine_init_done_notifier(&s->machine_done);
 | 
					    qemu_add_machine_init_done_notifier(&s->machine_done);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void ioapic_unrealize(DeviceState *dev, Error **errp)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    IOAPICCommonState *s = IOAPIC_COMMON(dev);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    timer_del(s->delayed_ioapic_service_timer);
 | 
				
			||||||
 | 
					    timer_free(s->delayed_ioapic_service_timer);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static Property ioapic_properties[] = {
 | 
					static Property ioapic_properties[] = {
 | 
				
			||||||
    DEFINE_PROP_UINT8("version", IOAPICCommonState, version, IOAPIC_VER_DEF),
 | 
					    DEFINE_PROP_UINT8("version", IOAPICCommonState, version, IOAPIC_VER_DEF),
 | 
				
			||||||
    DEFINE_PROP_END_OF_LIST(),
 | 
					    DEFINE_PROP_END_OF_LIST(),
 | 
				
			||||||
@ -419,6 +465,7 @@ static void ioapic_class_init(ObjectClass *klass, void *data)
 | 
				
			|||||||
    DeviceClass *dc = DEVICE_CLASS(klass);
 | 
					    DeviceClass *dc = DEVICE_CLASS(klass);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    k->realize = ioapic_realize;
 | 
					    k->realize = ioapic_realize;
 | 
				
			||||||
 | 
					    k->unrealize = ioapic_unrealize;
 | 
				
			||||||
    /*
 | 
					    /*
 | 
				
			||||||
     * If APIC is in kernel, we need to update the kernel cache after
 | 
					     * If APIC is in kernel, we need to update the kernel cache after
 | 
				
			||||||
     * migration, otherwise first 24 gsi routes will be invalid.
 | 
					     * migration, otherwise first 24 gsi routes will be invalid.
 | 
				
			||||||
 | 
				
			|||||||
@ -25,6 +25,7 @@ apic_mem_writel(uint64_t addr, uint32_t val) "0x%"PRIx64" = 0x%08x"
 | 
				
			|||||||
ioapic_set_remote_irr(int n) "set remote irr for pin %d"
 | 
					ioapic_set_remote_irr(int n) "set remote irr for pin %d"
 | 
				
			||||||
ioapic_clear_remote_irr(int n, int vector) "clear remote irr for pin %d vector %d"
 | 
					ioapic_clear_remote_irr(int n, int vector) "clear remote irr for pin %d vector %d"
 | 
				
			||||||
ioapic_eoi_broadcast(int vector) "EOI broadcast for vector %d"
 | 
					ioapic_eoi_broadcast(int vector) "EOI broadcast for vector %d"
 | 
				
			||||||
 | 
					ioapic_eoi_delayed_reassert(int vector) "delayed reassert on EOI broadcast for vector %d"
 | 
				
			||||||
ioapic_mem_read(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem read addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" retval 0x%"PRIx32
 | 
					ioapic_mem_read(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem read addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" retval 0x%"PRIx32
 | 
				
			||||||
ioapic_mem_write(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem write addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" val 0x%"PRIx32
 | 
					ioapic_mem_write(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem write addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" val 0x%"PRIx32
 | 
				
			||||||
ioapic_set_irq(int vector, int level) "vector: %d level: %d"
 | 
					ioapic_set_irq(int vector, int level) "vector: %d level: %d"
 | 
				
			||||||
 | 
				
			|||||||
@ -96,6 +96,7 @@ typedef struct IOAPICCommonClass {
 | 
				
			|||||||
    SysBusDeviceClass parent_class;
 | 
					    SysBusDeviceClass parent_class;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DeviceRealize realize;
 | 
					    DeviceRealize realize;
 | 
				
			||||||
 | 
					    DeviceUnrealize unrealize;
 | 
				
			||||||
    void (*pre_save)(IOAPICCommonState *s);
 | 
					    void (*pre_save)(IOAPICCommonState *s);
 | 
				
			||||||
    void (*post_load)(IOAPICCommonState *s);
 | 
					    void (*post_load)(IOAPICCommonState *s);
 | 
				
			||||||
} IOAPICCommonClass;
 | 
					} IOAPICCommonClass;
 | 
				
			||||||
@ -111,6 +112,8 @@ struct IOAPICCommonState {
 | 
				
			|||||||
    uint8_t version;
 | 
					    uint8_t version;
 | 
				
			||||||
    uint64_t irq_count[IOAPIC_NUM_PINS];
 | 
					    uint64_t irq_count[IOAPIC_NUM_PINS];
 | 
				
			||||||
    int irq_level[IOAPIC_NUM_PINS];
 | 
					    int irq_level[IOAPIC_NUM_PINS];
 | 
				
			||||||
 | 
					    int irq_eoi[IOAPIC_NUM_PINS];
 | 
				
			||||||
 | 
					    QEMUTimer *delayed_ioapic_service_timer;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void ioapic_reset_common(DeviceState *dev);
 | 
					void ioapic_reset_common(DeviceState *dev);
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user