 30499fdd98
			
		
	
	
		30499fdd98
		
	
	
	
	
		
			
			Running a guest with 128 NUMA nodes crashes QEMU:
../../util/error.c:59: error_setv: Assertion `*errp == NULL' failed.
The crash happens when setting the FWNMI migration blocker:
2861	    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI) == SPAPR_CAP_ON) {
2862	        /* Create the error string for live migration blocker */
2863	        error_setg(&spapr->fwnmi_migration_blocker,
2864	            "A machine check is being handled during migration. The handler"
2865	            "may run and log hardware error on the destination");
2866	    }
Inspection reveals that spapr->fwnmi_migration_blocker isn't NULL:
(gdb) p spapr->fwnmi_migration_blocker
$1 = (Error *) 0x8000000004000000
Since this is the only place where spapr->fwnmi_migration_blocker is
set, this means someone wrote there behind our back. Further analysis
points to spapr_numa_associativity_init(), especially the part
that initializes the associativity arrays for NVLink GPUs:
    max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM;
i.e. max_nodes_with_gpus = 128 + 6, but the array isn't sized to
accommodate the 6 extra nodes:
struct SpaprMachineState {
    .
    .
    .
    uint32_t numa_assoc_array[MAX_NODES][NUMA_ASSOC_SIZE];
    Error *fwnmi_migration_blocker;
};
and the following loops happily overwrite spapr->fwnmi_migration_blocker,
and probably more:
    for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) {
        spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
        for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) {
            uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ?
                                 SPAPR_GPU_NUMA_ID : cpu_to_be32(i);
            spapr->numa_assoc_array[i][j] = gpu_assoc;
        }
        spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
    }
Fix the size of the array. This requires "hw/ppc/spapr.h" to see
NVGPU_MAX_NUM. Including "hw/pci-host/spapr.h" introduces a
circular dependency that breaks the build, so this moves the
definition of NVGPU_MAX_NUM to "hw/ppc/spapr.h" instead.
Reported-by: Min Deng <mdeng@redhat.com>
BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1908693
Fixes: dd7e1d7ae431 ("spapr_numa: move NVLink2 associativity handling to spapr_numa.c")
Cc: danielhb413@gmail.com
Signed-off-by: Greg Kurz <groug@kaod.org>
Message-Id: <160829960428.734871.12634150161215429514.stgit@bahia.lan>
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
		
	
			
		
			
				
	
	
		
			214 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			214 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * QEMU SPAPR PCI BUS definitions
 | |
|  *
 | |
|  * Copyright (c) 2011 Alexey Kardashevskiy <aik@au1.ibm.com>
 | |
|  *
 | |
|  * This library is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU Lesser General Public
 | |
|  * License as published by the Free Software Foundation; either
 | |
|  * version 2.1 of the License, or (at your option) any later version.
 | |
|  *
 | |
|  * This library is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  * Lesser General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Lesser General Public
 | |
|  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 | |
|  */
 | |
| 
 | |
| #ifndef PCI_HOST_SPAPR_H
 | |
| #define PCI_HOST_SPAPR_H
 | |
| 
 | |
| #include "hw/ppc/spapr.h"
 | |
| #include "hw/pci/pci.h"
 | |
| #include "hw/pci/pci_host.h"
 | |
| #include "hw/ppc/xics.h"
 | |
| #include "qom/object.h"
 | |
| 
 | |
| #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
 | |
| 
 | |
| OBJECT_DECLARE_SIMPLE_TYPE(SpaprPhbState, SPAPR_PCI_HOST_BRIDGE)
 | |
| 
 | |
| #define SPAPR_PCI_DMA_MAX_WINDOWS    2
 | |
| 
 | |
| 
 | |
| typedef struct SpaprPciMsi { /* MSI record; used as the value part of SpaprPciMsiMig */
 | |
|     uint32_t first_irq; /* presumably the first IRQ number of the range - TODO confirm */
 | |
|     uint32_t num; /* presumably the number of IRQs in the range - TODO confirm */
 | |
| } SpaprPciMsi;
 | |
| 
 | |
| typedef struct SpaprPciMsiMig { /* key/value pair; element type of SpaprPhbState.msi_devs */
 | |
|     uint32_t key; /* NOTE(review): meaning of the key is not visible here - confirm */
 | |
|     SpaprPciMsi value; /* the MSI record associated with key */
 | |
| } SpaprPciMsiMig;
 | |
| 
 | |
| typedef struct SpaprPciLsi { /* element type of SpaprPhbState.lsi_table[PCI_NUM_PINS] */
 | |
|     uint32_t irq; /* presumably the IRQ number assigned to the pin - TODO confirm */
 | |
| } SpaprPciLsi;
 | |
| 
 | |
| typedef struct SpaprPhbPciNvGpuConfig SpaprPhbPciNvGpuConfig;
 | |
| 
 | |
| struct SpaprPhbState { /* instance struct declared for TYPE_SPAPR_PCI_HOST_BRIDGE above */
 | |
|     PCIHostState parent_obj; /* parent object, kept as the first member */
 | |
| 
 | |
|     uint32_t index;
 | |
|     uint64_t buid; /* same type as the lookup key taken by spapr_pci_find_phb() */
 | |
|     char *dtbusname;
 | |
|     bool dr_enabled;
 | |
| 
 | |
|     MemoryRegion memspace, iospace;
 | |
|     hwaddr mem_win_addr, mem_win_size, mem64_win_addr, mem64_win_size;
 | |
|     uint64_t mem64_win_pciaddr;
 | |
|     hwaddr io_win_addr, io_win_size;
 | |
|     MemoryRegion mem32window, mem64window, iowindow, msiwindow;
 | |
| 
 | |
|     uint32_t dma_liobn[SPAPR_PCI_DMA_MAX_WINDOWS]; /* one slot per possible DMA window */
 | |
|     hwaddr dma_win_addr, dma_win_size;
 | |
|     AddressSpace iommu_as;
 | |
|     MemoryRegion iommu_root;
 | |
| 
 | |
|     SpaprPciLsi lsi_table[PCI_NUM_PINS]; /* one slot per PCI interrupt pin */
 | |
| 
 | |
|     GHashTable *msi; /* NOTE(review): presumably holds SpaprPciMsi values - confirm */
 | |
|     /* Temporary cache for migration purposes */
 | |
|     int32_t msi_devs_num; /* presumably the element count of msi_devs - TODO confirm */
 | |
|     SpaprPciMsiMig *msi_devs;
 | |
| 
 | |
|     QLIST_ENTRY(SpaprPhbState) list; /* linkage for a QLIST of SpaprPhbState */
 | |
| 
 | |
|     bool ddw_enabled; /* read by spapr_phb_windows_supported() below */
 | |
|     uint64_t page_size_mask;
 | |
|     uint64_t dma64_win_addr;
 | |
| 
 | |
|     uint32_t numa_node;
 | |
| 
 | |
|     bool pcie_ecs; /* Allow access to PCIe extended config space? */
 | |
| 
 | |
|     /* Fields for migration compatibility hacks */
 | |
|     bool pre_2_8_migration;
 | |
|     uint32_t mig_liobn;
 | |
|     hwaddr mig_mem_win_addr, mig_mem_win_size;
 | |
|     hwaddr mig_io_win_addr, mig_io_win_size;
 | |
|     hwaddr nv2_gpa_win_addr; /* NOTE(review): nv2_* and nvgpus look NVLink2-related, not migration hacks - confirm */
 | |
|     hwaddr nv2_atsd_win_addr;
 | |
|     SpaprPhbPciNvGpuConfig *nvgpus; /* opaque here: only forward-declared in this header */
 | |
|     bool pre_5_1_assoc;
 | |
| };
 | |
| 
 | |
| #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
 | |
| #define SPAPR_PCI_MEM32_WIN_SIZE     \
 | |
|     ((1ULL << 32) - SPAPR_PCI_MEM_WIN_BUS_OFFSET)
 | |
| #define SPAPR_PCI_MEM64_WIN_SIZE     0x10000000000ULL /* 1 TiB */
 | |
| 
 | |
| /* All PCI outbound windows will be within this range */
 | |
| #define SPAPR_PCI_BASE               (1ULL << 45) /* 32 TiB */
 | |
| #define SPAPR_PCI_LIMIT              (1ULL << 46) /* 64 TiB */
 | |
| 
 | |
| #define SPAPR_MAX_PHBS ((SPAPR_PCI_LIMIT - SPAPR_PCI_BASE) / \
 | |
|                         SPAPR_PCI_MEM64_WIN_SIZE - 1)
 | |
| 
 | |
| #define SPAPR_PCI_IO_WIN_SIZE        0x10000
 | |
| 
 | |
| #define SPAPR_PCI_MSI_WINDOW         0x40000000000ULL
 | |
| 
 | |
| #define SPAPR_PCI_NV2RAM64_WIN_BASE  SPAPR_PCI_LIMIT
 | |
| #define SPAPR_PCI_NV2RAM64_WIN_SIZE  (2 * TiB) /* For up to 6 GPUs 256GB each */
 | |
| 
 | |
| /* Max number of NVLinks per GPU in any physical box */
 | |
| #define NVGPU_MAX_LINKS              3
 | |
| 
 | |
| /*
 | |
|  * GPU RAM starts at 64TiB so huge DMA window to cover it all ends at 128TiB
 | |
|  * which is enough. We do not need DMA for ATSD so we put them at 128TiB.
 | |
|  */
 | |
| #define SPAPR_PCI_NV2ATSD_WIN_BASE   (128 * TiB)
 | |
| #define SPAPR_PCI_NV2ATSD_WIN_SIZE   (NVGPU_MAX_NUM * NVGPU_MAX_LINKS * \
 | |
|                                       64 * KiB)
 | |
| 
 | |
| int spapr_dt_phb(SpaprMachineState *spapr, SpaprPhbState *phb,
 | |
|                  uint32_t intc_phandle, void *fdt, int *node_offset);
 | |
| 
 | |
| void spapr_pci_rtas_init(void);
 | |
| 
 | |
| SpaprPhbState *spapr_pci_find_phb(SpaprMachineState *spapr, uint64_t buid);
 | |
| PCIDevice *spapr_pci_find_dev(SpaprMachineState *spapr, uint64_t buid,
 | |
|                               uint32_t config_addr);
 | |
| 
 | |
| /* DRC callbacks */
 | |
| void spapr_phb_remove_pci_device_cb(DeviceState *dev);
 | |
| int spapr_pci_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
 | |
|                           void *fdt, int *fdt_start_offset, Error **errp);
 | |
| 
 | |
| /* VFIO EEH hooks */
 | |
| #ifdef CONFIG_LINUX
 | |
| bool spapr_phb_eeh_available(SpaprPhbState *sphb);
 | |
| int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb,
 | |
|                                   unsigned int addr, int option);
 | |
| int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb, int *state);
 | |
| int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option);
 | |
| int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb);
 | |
| void spapr_phb_vfio_reset(DeviceState *qdev);
 | |
| void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp);
 | |
| void spapr_phb_nvgpu_free(SpaprPhbState *sphb);
 | |
| void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
 | |
|                                  Error **errp);
 | |
| void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt);
 | |
| void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
 | |
|                                         SpaprPhbState *sphb);
 | |
| #else
 | |
| static inline bool spapr_phb_eeh_available(SpaprPhbState *sphb)
 | |
| { /* stub for builds without CONFIG_LINUX; sphb is not used */
 | |
|     return false; /* EEH is never reported as available in this configuration */
 | |
| }
 | |
| static inline int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb,
 | |
|                                                 unsigned int addr, int option)
 | |
| { /* stub for builds without CONFIG_LINUX; all arguments are ignored */
 | |
|     return RTAS_OUT_HW_ERROR; /* unconditional failure code */
 | |
| }
 | |
| static inline int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb,
 | |
|                                                int *state)
 | |
| { /* stub for builds without CONFIG_LINUX; *state is never written */
 | |
|     return RTAS_OUT_HW_ERROR; /* unconditional failure code */
 | |
| }
 | |
| static inline int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option)
 | |
| { /* stub for builds without CONFIG_LINUX; all arguments are ignored */
 | |
|     return RTAS_OUT_HW_ERROR; /* unconditional failure code */
 | |
| }
 | |
| static inline int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
 | |
| { /* stub for builds without CONFIG_LINUX; sphb is not used */
 | |
|     return RTAS_OUT_HW_ERROR; /* unconditional failure code */
 | |
| }
 | |
| static inline void spapr_phb_vfio_reset(DeviceState *qdev)
 | |
| { /* no-op stub for builds without CONFIG_LINUX */
 | |
| }
 | |
| static inline void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
 | |
| { /* no-op stub for builds without CONFIG_LINUX; errp is never set */
 | |
| }
 | |
| static inline void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
 | |
| { /* no-op stub for builds without CONFIG_LINUX */
 | |
| }
 | |
| static inline void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt,
 | |
|                                                int bus_off, Error **errp)
 | |
| { /* no-op stub for builds without CONFIG_LINUX; fdt is not modified, errp is never set */
 | |
| }
 | |
| static inline void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb,
 | |
|                                                    void *fdt)
 | |
| { /* no-op stub for builds without CONFIG_LINUX; fdt is not modified */
 | |
| }
 | |
| static inline void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt,
 | |
|                                                       int offset,
 | |
|                                                       SpaprPhbState *sphb)
 | |
| { /* no-op stub for builds without CONFIG_LINUX; fdt is not modified */
 | |
| }
 | |
| #endif
 | |
| 
 | |
| void spapr_phb_dma_reset(SpaprPhbState *sphb);
 | |
| 
 | |
| static inline unsigned spapr_phb_windows_supported(SpaprPhbState *sphb)
 | |
| { /* returns the number of DMA windows this PHB supports */
 | |
|     return sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1; /* a single window unless ddw_enabled is set */
 | |
| }
 | |
| 
 | |
| #endif /* PCI_HOST_SPAPR_H */
 |