 195801d700
			
		
	
	
		195801d700
		
	
	
	
	
		
			
			The Big QEMU Lock (BQL) has many names and they are confusing. The actual QemuMutex variable is called qemu_global_mutex but it's commonly referred to as the BQL in discussions and some code comments. The locking APIs, however, are called qemu_mutex_lock_iothread() and qemu_mutex_unlock_iothread(). The "iothread" name is historic and comes from when the main thread was split into KVM vcpu threads and the "iothread" (now called the main loop thread). I have contributed to the confusion myself by introducing a separate --object iothread, a separate concept unrelated to the BQL. The "iothread" name is no longer appropriate for the BQL. Rename the locking APIs to: - void bql_lock(void) - void bql_unlock(void) - bool bql_locked(void) There are more APIs with "iothread" in their names. Subsequent patches will rename them. There are also comments and documentation that will be updated in later patches. Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Reviewed-by: Paul Durrant <paul@xen.org> Acked-by: Fabiano Rosas <farosas@suse.de> Acked-by: David Woodhouse <dwmw@amazon.co.uk> Reviewed-by: Cédric Le Goater <clg@kaod.org> Acked-by: Peter Xu <peterx@redhat.com> Acked-by: Eric Farman <farman@linux.ibm.com> Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com> Acked-by: Hyman Huang <yong.huang@smartx.com> Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com> Message-id: 20240102153529.486531-2-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
		
			
				
	
	
		
			959 lines
		
	
	
		
			29 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			959 lines
		
	
	
		
			29 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /**
 | |
|  * QEMU vfio-user-server server object
 | |
|  *
 | |
|  * Copyright © 2022 Oracle and/or its affiliates.
 | |
|  *
 | |
|  * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
 | |
|  *
 | |
|  * See the COPYING file in the top-level directory.
 | |
|  *
 | |
|  */
 | |
| 
 | |
| /**
 | |
|  * Usage: add options:
 | |
|  *     -machine x-remote,vfio-user=on,auto-shutdown=on
 | |
|  *     -device <PCI-device>,id=<pci-dev-id>
 | |
|  *     -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>,
 | |
|  *             device=<pci-dev-id>
 | |
|  *
 | |
|  * Note that x-vfio-user-server object must be used with x-remote machine only.
 | |
|  * This server could only support PCI devices for now.
 | |
|  *
 | |
|  * type - SocketAddress type - presently "unix" alone is supported. Required
 | |
|  *        option
 | |
|  *
 | |
|  * path - named unix socket, it will be created by the server. It is
 | |
|  *        a required option
 | |
|  *
 | |
|  * device - id of a device on the server, a required option. PCI devices
 | |
|  *          alone are supported presently.
 | |
|  *
 | |
|  * notes - x-vfio-user-server could block IO and monitor during the
 | |
|  *         initialization phase.
 | |
|  *
 | |
|  *         When x-remote machine has the auto-shutdown property
 | |
|  *         enabled (default), x-vfio-user-server terminates after the last
 | |
|  *         client disconnects. Otherwise, it will continue running until
 | |
|  *         explicitly killed.
 | |
|  */
 | |
| 
 | |
| #include "qemu/osdep.h"
 | |
| 
 | |
| #include "qom/object.h"
 | |
| #include "qom/object_interfaces.h"
 | |
| #include "qemu/error-report.h"
 | |
| #include "trace.h"
 | |
| #include "sysemu/runstate.h"
 | |
| #include "hw/boards.h"
 | |
| #include "hw/remote/machine.h"
 | |
| #include "qapi/error.h"
 | |
| #include "qapi/qapi-visit-sockets.h"
 | |
| #include "qapi/qapi-events-misc.h"
 | |
| #include "qemu/notify.h"
 | |
| #include "qemu/thread.h"
 | |
| #include "qemu/main-loop.h"
 | |
| #include "sysemu/sysemu.h"
 | |
| #include "libvfio-user.h"
 | |
| #include "hw/qdev-core.h"
 | |
| #include "hw/pci/pci.h"
 | |
| #include "qemu/timer.h"
 | |
| #include "exec/memory.h"
 | |
| #include "hw/pci/msi.h"
 | |
| #include "hw/pci/msix.h"
 | |
| #include "hw/remote/vfio-user-obj.h"
 | |
| 
 | |
| #define TYPE_VFU_OBJECT "x-vfio-user-server"
 | |
| OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
 | |
| 
 | |
/**
 * VFU_OBJECT_ERROR - reports an error message.
 *
 * @o: the VfuObject the error relates to (not referenced by the expansion)
 * @fmt: printf-style format string, followed by its arguments
 *
 * If auto_shutdown is set, it aborts the machine on error. Otherwise,
 * it logs an error message without aborting. auto_shutdown is disabled
 * when the server serves clients from multiple VMs; as such, an error
 * from one VM shouldn't be able to disrupt other VM's services.
 *
 * The body is wrapped in do { } while (0) so that the expansion followed
 * by a semicolon is exactly one statement and can be used safely in an
 * unbraced if/else.
 */
#define VFU_OBJECT_ERROR(o, fmt, ...)                                     \
    do {                                                                  \
        if (vfu_object_auto_shutdown()) {                                 \
            error_setg(&error_abort, (fmt), ## __VA_ARGS__);              \
        } else {                                                          \
            error_report((fmt), ## __VA_ARGS__);                          \
        }                                                                 \
    } while (0)
 | |
| 
 | |
| struct VfuObjectClass {
 | |
|     ObjectClass parent_class;
 | |
| 
 | |
|     unsigned int nr_devs;
 | |
| };
 | |
| 
 | |
| struct VfuObject {
 | |
|     /* private */
 | |
|     Object parent;
 | |
| 
 | |
|     SocketAddress *socket;
 | |
| 
 | |
|     char *device;
 | |
| 
 | |
|     Error *err;
 | |
| 
 | |
|     Notifier machine_done;
 | |
| 
 | |
|     vfu_ctx_t *vfu_ctx;
 | |
| 
 | |
|     PCIDevice *pci_dev;
 | |
| 
 | |
|     Error *unplug_blocker;
 | |
| 
 | |
|     int vfu_poll_fd;
 | |
| 
 | |
|     MSITriggerFunc *default_msi_trigger;
 | |
|     MSIPrepareMessageFunc *default_msi_prepare_message;
 | |
|     MSIxPrepareMessageFunc *default_msix_prepare_message;
 | |
| };
 | |
| 
 | |
| static void vfu_object_init_ctx(VfuObject *o, Error **errp);
 | |
| 
 | |
| static bool vfu_object_auto_shutdown(void)
 | |
| {
 | |
|     bool auto_shutdown = true;
 | |
|     Error *local_err = NULL;
 | |
| 
 | |
|     if (!current_machine) {
 | |
|         return auto_shutdown;
 | |
|     }
 | |
| 
 | |
|     auto_shutdown = object_property_get_bool(OBJECT(current_machine),
 | |
|                                              "auto-shutdown",
 | |
|                                              &local_err);
 | |
| 
 | |
|     /*
 | |
|      * local_err would be set if no such property exists - safe to ignore.
 | |
|      * Unlikely scenario as auto-shutdown is always defined for
 | |
|      * TYPE_REMOTE_MACHINE, and  TYPE_VFU_OBJECT only works with
 | |
|      * TYPE_REMOTE_MACHINE
 | |
|      */
 | |
|     if (local_err) {
 | |
|         auto_shutdown = true;
 | |
|         error_free(local_err);
 | |
|     }
 | |
| 
 | |
|     return auto_shutdown;
 | |
| }
 | |
| 
 | |
| static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
 | |
|                                   void *opaque, Error **errp)
 | |
| {
 | |
|     VfuObject *o = VFU_OBJECT(obj);
 | |
| 
 | |
|     if (o->vfu_ctx) {
 | |
|         error_setg(errp, "vfu: Unable to set socket property - server busy");
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     qapi_free_SocketAddress(o->socket);
 | |
| 
 | |
|     o->socket = NULL;
 | |
| 
 | |
|     visit_type_SocketAddress(v, name, &o->socket, errp);
 | |
| 
 | |
|     if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
 | |
|         error_setg(errp, "vfu: Unsupported socket type - %s",
 | |
|                    SocketAddressType_str(o->socket->type));
 | |
|         qapi_free_SocketAddress(o->socket);
 | |
|         o->socket = NULL;
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     trace_vfu_prop("socket", o->socket->u.q_unix.path);
 | |
| 
 | |
|     vfu_object_init_ctx(o, errp);
 | |
| }
 | |
| 
 | |
| static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
 | |
| {
 | |
|     VfuObject *o = VFU_OBJECT(obj);
 | |
| 
 | |
|     if (o->vfu_ctx) {
 | |
|         error_setg(errp, "vfu: Unable to set device property - server busy");
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     g_free(o->device);
 | |
| 
 | |
|     o->device = g_strdup(str);
 | |
| 
 | |
|     trace_vfu_prop("device", str);
 | |
| 
 | |
|     vfu_object_init_ctx(o, errp);
 | |
| }
 | |
| 
 | |
| static void vfu_object_ctx_run(void *opaque)
 | |
| {
 | |
|     VfuObject *o = opaque;
 | |
|     const char *vfu_id;
 | |
|     char *vfu_path, *pci_dev_path;
 | |
|     int ret = -1;
 | |
| 
 | |
|     while (ret != 0) {
 | |
|         ret = vfu_run_ctx(o->vfu_ctx);
 | |
|         if (ret < 0) {
 | |
|             if (errno == EINTR) {
 | |
|                 continue;
 | |
|             } else if (errno == ENOTCONN) {
 | |
|                 vfu_id = object_get_canonical_path_component(OBJECT(o));
 | |
|                 vfu_path = object_get_canonical_path(OBJECT(o));
 | |
|                 g_assert(o->pci_dev);
 | |
|                 pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev));
 | |
|                  /* o->device is a required property and is non-NULL here */
 | |
|                 g_assert(o->device);
 | |
|                 qapi_event_send_vfu_client_hangup(vfu_id, vfu_path,
 | |
|                                                   o->device, pci_dev_path);
 | |
|                 qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
 | |
|                 o->vfu_poll_fd = -1;
 | |
|                 object_unparent(OBJECT(o));
 | |
|                 g_free(vfu_path);
 | |
|                 g_free(pci_dev_path);
 | |
|                 break;
 | |
|             } else {
 | |
|                 VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s",
 | |
|                                  o->device, strerror(errno));
 | |
|                 break;
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| static void vfu_object_attach_ctx(void *opaque)
 | |
| {
 | |
|     VfuObject *o = opaque;
 | |
|     GPollFD pfds[1];
 | |
|     int ret;
 | |
| 
 | |
|     qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
 | |
| 
 | |
|     pfds[0].fd = o->vfu_poll_fd;
 | |
|     pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
 | |
| 
 | |
| retry_attach:
 | |
|     ret = vfu_attach_ctx(o->vfu_ctx);
 | |
|     if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
 | |
|         /**
 | |
|          * vfu_object_attach_ctx can block QEMU's main loop
 | |
|          * during attach - the monitor and other IO
 | |
|          * could be unresponsive during this time.
 | |
|          */
 | |
|         (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
 | |
|         goto retry_attach;
 | |
|     } else if (ret < 0) {
 | |
|         VFU_OBJECT_ERROR(o, "vfu: Failed to attach device %s to context - %s",
 | |
|                          o->device, strerror(errno));
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
 | |
|     if (o->vfu_poll_fd < 0) {
 | |
|         VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device);
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
 | |
| }
 | |
| 
 | |
| static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
 | |
|                                      size_t count, loff_t offset,
 | |
|                                      const bool is_write)
 | |
| {
 | |
|     VfuObject *o = vfu_get_private(vfu_ctx);
 | |
|     uint32_t pci_access_width = sizeof(uint32_t);
 | |
|     size_t bytes = count;
 | |
|     uint32_t val = 0;
 | |
|     char *ptr = buf;
 | |
|     int len;
 | |
| 
 | |
|     /*
 | |
|      * Writes to the BAR registers would trigger an update to the
 | |
|      * global Memory and IO AddressSpaces. But the remote device
 | |
|      * never uses the global AddressSpaces, therefore overlapping
 | |
|      * memory regions are not a problem
 | |
|      */
 | |
|     while (bytes > 0) {
 | |
|         len = (bytes > pci_access_width) ? pci_access_width : bytes;
 | |
|         if (is_write) {
 | |
|             memcpy(&val, ptr, len);
 | |
|             pci_host_config_write_common(o->pci_dev, offset,
 | |
|                                          pci_config_size(o->pci_dev),
 | |
|                                          val, len);
 | |
|             trace_vfu_cfg_write(offset, val);
 | |
|         } else {
 | |
|             val = pci_host_config_read_common(o->pci_dev, offset,
 | |
|                                               pci_config_size(o->pci_dev), len);
 | |
|             memcpy(ptr, &val, len);
 | |
|             trace_vfu_cfg_read(offset, val);
 | |
|         }
 | |
|         offset += len;
 | |
|         ptr += len;
 | |
|         bytes -= len;
 | |
|     }
 | |
| 
 | |
|     return count;
 | |
| }
 | |
| 
 | |
| static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
 | |
| {
 | |
|     VfuObject *o = vfu_get_private(vfu_ctx);
 | |
|     AddressSpace *dma_as = NULL;
 | |
|     MemoryRegion *subregion = NULL;
 | |
|     g_autofree char *name = NULL;
 | |
|     struct iovec *iov = &info->iova;
 | |
| 
 | |
|     if (!info->vaddr) {
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
 | |
|                            (uint64_t)info->vaddr);
 | |
| 
 | |
|     subregion = g_new0(MemoryRegion, 1);
 | |
| 
 | |
|     memory_region_init_ram_ptr(subregion, NULL, name,
 | |
|                                iov->iov_len, info->vaddr);
 | |
| 
 | |
|     dma_as = pci_device_iommu_address_space(o->pci_dev);
 | |
| 
 | |
|     memory_region_add_subregion(dma_as->root, (hwaddr)iov->iov_base, subregion);
 | |
| 
 | |
|     trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len);
 | |
| }
 | |
| 
 | |
| static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
 | |
| {
 | |
|     VfuObject *o = vfu_get_private(vfu_ctx);
 | |
|     AddressSpace *dma_as = NULL;
 | |
|     MemoryRegion *mr = NULL;
 | |
|     ram_addr_t offset;
 | |
| 
 | |
|     mr = memory_region_from_host(info->vaddr, &offset);
 | |
|     if (!mr) {
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     dma_as = pci_device_iommu_address_space(o->pci_dev);
 | |
| 
 | |
|     memory_region_del_subregion(dma_as->root, mr);
 | |
| 
 | |
|     object_unparent((OBJECT(mr)));
 | |
| 
 | |
|     trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
 | |
| }
 | |
| 
 | |
| static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset,
 | |
|                             hwaddr size, const bool is_write)
 | |
| {
 | |
|     uint8_t *ptr = buf;
 | |
|     bool release_lock = false;
 | |
|     uint8_t *ram_ptr = NULL;
 | |
|     MemTxResult result;
 | |
|     int access_size;
 | |
|     uint64_t val;
 | |
| 
 | |
|     if (memory_access_is_direct(mr, is_write)) {
 | |
|         /**
 | |
|          * Some devices expose a PCI expansion ROM, which could be buffer
 | |
|          * based as compared to other regions which are primarily based on
 | |
|          * MemoryRegionOps. memory_region_find() would already check
 | |
|          * for buffer overflow, we don't need to repeat it here.
 | |
|          */
 | |
|         ram_ptr = memory_region_get_ram_ptr(mr);
 | |
| 
 | |
|         if (is_write) {
 | |
|             memcpy((ram_ptr + offset), buf, size);
 | |
|         } else {
 | |
|             memcpy(buf, (ram_ptr + offset), size);
 | |
|         }
 | |
| 
 | |
|         return 0;
 | |
|     }
 | |
| 
 | |
|     while (size) {
 | |
|         /**
 | |
|          * The read/write logic used below is similar to the ones in
 | |
|          * flatview_read/write_continue()
 | |
|          */
 | |
|         release_lock = prepare_mmio_access(mr);
 | |
| 
 | |
|         access_size = memory_access_size(mr, size, offset);
 | |
| 
 | |
|         if (is_write) {
 | |
|             val = ldn_he_p(ptr, access_size);
 | |
| 
 | |
|             result = memory_region_dispatch_write(mr, offset, val,
 | |
|                                                   size_memop(access_size),
 | |
|                                                   MEMTXATTRS_UNSPECIFIED);
 | |
|         } else {
 | |
|             result = memory_region_dispatch_read(mr, offset, &val,
 | |
|                                                  size_memop(access_size),
 | |
|                                                  MEMTXATTRS_UNSPECIFIED);
 | |
| 
 | |
|             stn_he_p(ptr, access_size, val);
 | |
|         }
 | |
| 
 | |
|         if (release_lock) {
 | |
|             bql_unlock();
 | |
|             release_lock = false;
 | |
|         }
 | |
| 
 | |
|         if (result != MEMTX_OK) {
 | |
|             return -1;
 | |
|         }
 | |
| 
 | |
|         size -= access_size;
 | |
|         ptr += access_size;
 | |
|         offset += access_size;
 | |
|     }
 | |
| 
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar,
 | |
|                                 hwaddr bar_offset, char * const buf,
 | |
|                                 hwaddr len, const bool is_write)
 | |
| {
 | |
|     MemoryRegionSection section = { 0 };
 | |
|     uint8_t *ptr = (uint8_t *)buf;
 | |
|     MemoryRegion *section_mr = NULL;
 | |
|     uint64_t section_size;
 | |
|     hwaddr section_offset;
 | |
|     hwaddr size = 0;
 | |
| 
 | |
|     while (len) {
 | |
|         section = memory_region_find(pci_dev->io_regions[pci_bar].memory,
 | |
|                                      bar_offset, len);
 | |
| 
 | |
|         if (!section.mr) {
 | |
|             warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset);
 | |
|             return size;
 | |
|         }
 | |
| 
 | |
|         section_mr = section.mr;
 | |
|         section_offset = section.offset_within_region;
 | |
|         section_size = int128_get64(section.size);
 | |
| 
 | |
|         if (is_write && section_mr->readonly) {
 | |
|             warn_report("vfu: attempting to write to readonly region in "
 | |
|                         "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]",
 | |
|                         pci_bar, bar_offset,
 | |
|                         (bar_offset + section_size));
 | |
|             memory_region_unref(section_mr);
 | |
|             return size;
 | |
|         }
 | |
| 
 | |
|         if (vfu_object_mr_rw(section_mr, ptr, section_offset,
 | |
|                              section_size, is_write)) {
 | |
|             warn_report("vfu: failed to %s "
 | |
|                         "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d",
 | |
|                         is_write ? "write to" : "read from", bar_offset,
 | |
|                         (bar_offset + section_size), pci_bar);
 | |
|             memory_region_unref(section_mr);
 | |
|             return size;
 | |
|         }
 | |
| 
 | |
|         size += section_size;
 | |
|         bar_offset += section_size;
 | |
|         ptr += section_size;
 | |
|         len -= section_size;
 | |
| 
 | |
|         memory_region_unref(section_mr);
 | |
|     }
 | |
| 
 | |
|     return size;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * VFU_OBJECT_BAR_HANDLER - macro for defining handlers for PCI BARs.
 | |
|  *
 | |
|  * To create handler for BAR number 2, VFU_OBJECT_BAR_HANDLER(2) would
 | |
|  * define vfu_object_bar2_handler
 | |
|  */
 | |
| #define VFU_OBJECT_BAR_HANDLER(BAR_NO)                                         \
 | |
|     static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx,        \
 | |
|                                         char * const buf, size_t count,        \
 | |
|                                         loff_t offset, const bool is_write)    \
 | |
|     {                                                                          \
 | |
|         VfuObject *o = vfu_get_private(vfu_ctx);                               \
 | |
|         PCIDevice *pci_dev = o->pci_dev;                                       \
 | |
|                                                                                \
 | |
|         return vfu_object_bar_rw(pci_dev, BAR_NO, offset,                      \
 | |
|                                  buf, count, is_write);                        \
 | |
|     }                                                                          \
 | |
| 
 | |
| VFU_OBJECT_BAR_HANDLER(0)
 | |
| VFU_OBJECT_BAR_HANDLER(1)
 | |
| VFU_OBJECT_BAR_HANDLER(2)
 | |
| VFU_OBJECT_BAR_HANDLER(3)
 | |
| VFU_OBJECT_BAR_HANDLER(4)
 | |
| VFU_OBJECT_BAR_HANDLER(5)
 | |
| VFU_OBJECT_BAR_HANDLER(6)
 | |
| 
 | |
| static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
 | |
|     &vfu_object_bar0_handler,
 | |
|     &vfu_object_bar1_handler,
 | |
|     &vfu_object_bar2_handler,
 | |
|     &vfu_object_bar3_handler,
 | |
|     &vfu_object_bar4_handler,
 | |
|     &vfu_object_bar5_handler,
 | |
|     &vfu_object_bar6_handler,
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * vfu_object_register_bars - Identify active BAR regions of pdev and setup
 | |
|  *                            callbacks to handle read/write accesses
 | |
|  */
 | |
| static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
 | |
| {
 | |
|     int flags = VFU_REGION_FLAG_RW;
 | |
|     int i;
 | |
| 
 | |
|     for (i = 0; i < PCI_NUM_REGIONS; i++) {
 | |
|         if (!pdev->io_regions[i].size) {
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         if ((i == VFU_PCI_DEV_ROM_REGION_IDX) ||
 | |
|             pdev->io_regions[i].memory->readonly) {
 | |
|             flags &= ~VFU_REGION_FLAG_WRITE;
 | |
|         }
 | |
| 
 | |
|         vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i,
 | |
|                          (size_t)pdev->io_regions[i].size,
 | |
|                          vfu_object_bar_handlers[i],
 | |
|                          flags, NULL, 0, -1, 0);
 | |
| 
 | |
|         trace_vfu_bar_register(i, pdev->io_regions[i].addr,
 | |
|                                pdev->io_regions[i].size);
 | |
|     }
 | |
| }
 | |
| 
 | |
| static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
 | |
| {
 | |
|     int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
 | |
|                                 pci_dev->devfn);
 | |
| 
 | |
|     return pci_bdf;
 | |
| }
 | |
| 
 | |
| static void vfu_object_set_irq(void *opaque, int pirq, int level)
 | |
| {
 | |
|     PCIBus *pci_bus = opaque;
 | |
|     PCIDevice *pci_dev = NULL;
 | |
|     vfu_ctx_t *vfu_ctx = NULL;
 | |
|     int pci_bus_num, devfn;
 | |
| 
 | |
|     if (level) {
 | |
|         pci_bus_num = PCI_BUS_NUM(pirq);
 | |
|         devfn = PCI_BDF_TO_DEVFN(pirq);
 | |
| 
 | |
|         /*
 | |
|          * pci_find_device() performs at O(1) if the device is attached
 | |
|          * to the root PCI bus. Whereas, if the device is attached to a
 | |
|          * secondary PCI bus (such as when a root port is involved),
 | |
|          * finding the parent PCI bus could take O(n)
 | |
|          */
 | |
|         pci_dev = pci_find_device(pci_bus, pci_bus_num, devfn);
 | |
| 
 | |
|         vfu_ctx = pci_dev->irq_opaque;
 | |
| 
 | |
|         g_assert(vfu_ctx);
 | |
| 
 | |
|         vfu_irq_trigger(vfu_ctx, 0);
 | |
|     }
 | |
| }
 | |
| 
 | |
| static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev,
 | |
|                                              unsigned int vector)
 | |
| {
 | |
|     MSIMessage msg;
 | |
| 
 | |
|     msg.address = 0;
 | |
|     msg.data = vector;
 | |
| 
 | |
|     return msg;
 | |
| }
 | |
| 
 | |
| static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg)
 | |
| {
 | |
|     vfu_ctx_t *vfu_ctx = pci_dev->irq_opaque;
 | |
| 
 | |
|     vfu_irq_trigger(vfu_ctx, msg.data);
 | |
| }
 | |
| 
 | |
| static void vfu_object_setup_msi_cbs(VfuObject *o)
 | |
| {
 | |
|     o->default_msi_trigger = o->pci_dev->msi_trigger;
 | |
|     o->default_msi_prepare_message = o->pci_dev->msi_prepare_message;
 | |
|     o->default_msix_prepare_message = o->pci_dev->msix_prepare_message;
 | |
| 
 | |
|     o->pci_dev->msi_trigger = vfu_object_msi_trigger;
 | |
|     o->pci_dev->msi_prepare_message = vfu_object_msi_prepare_msg;
 | |
|     o->pci_dev->msix_prepare_message = vfu_object_msi_prepare_msg;
 | |
| }
 | |
| 
 | |
| static void vfu_object_restore_msi_cbs(VfuObject *o)
 | |
| {
 | |
|     o->pci_dev->msi_trigger = o->default_msi_trigger;
 | |
|     o->pci_dev->msi_prepare_message = o->default_msi_prepare_message;
 | |
|     o->pci_dev->msix_prepare_message = o->default_msix_prepare_message;
 | |
| }
 | |
| 
 | |
| static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
 | |
|                                uint32_t count, bool mask)
 | |
| {
 | |
|     VfuObject *o = vfu_get_private(vfu_ctx);
 | |
|     uint32_t vector;
 | |
| 
 | |
|     for (vector = start; vector < count; vector++) {
 | |
|         msix_set_mask(o->pci_dev, vector, mask);
 | |
|     }
 | |
| }
 | |
| 
 | |
| static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
 | |
|                               uint32_t count, bool mask)
 | |
| {
 | |
|     VfuObject *o = vfu_get_private(vfu_ctx);
 | |
|     Error *err = NULL;
 | |
|     uint32_t vector;
 | |
| 
 | |
|     for (vector = start; vector < count; vector++) {
 | |
|         msi_set_mask(o->pci_dev, vector, mask, &err);
 | |
|         if (err) {
 | |
|             VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
 | |
|                              error_get_pretty(err));
 | |
|             error_free(err);
 | |
|             err = NULL;
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
 | |
| {
 | |
|     vfu_ctx_t *vfu_ctx = o->vfu_ctx;
 | |
|     int ret;
 | |
| 
 | |
|     ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
 | |
|     if (ret < 0) {
 | |
|         return ret;
 | |
|     }
 | |
| 
 | |
|     if (msix_nr_vectors_allocated(pci_dev)) {
 | |
|         ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
 | |
|                                        msix_nr_vectors_allocated(pci_dev));
 | |
|         vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ,
 | |
|                                      &vfu_msix_irq_state);
 | |
|     } else if (msi_nr_vectors_allocated(pci_dev)) {
 | |
|         ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
 | |
|                                        msi_nr_vectors_allocated(pci_dev));
 | |
|         vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSI_IRQ,
 | |
|                                      &vfu_msi_irq_state);
 | |
|     }
 | |
| 
 | |
|     if (ret < 0) {
 | |
|         return ret;
 | |
|     }
 | |
| 
 | |
|     vfu_object_setup_msi_cbs(o);
 | |
| 
 | |
|     pci_dev->irq_opaque = vfu_ctx;
 | |
| 
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| void vfu_object_set_bus_irq(PCIBus *pci_bus)
 | |
| {
 | |
|     int bus_num = pci_bus_num(pci_bus);
 | |
|     int max_bdf = PCI_BUILD_BDF(bus_num, PCI_DEVFN_MAX - 1);
 | |
| 
 | |
|     pci_bus_irqs(pci_bus, vfu_object_set_irq, pci_bus, max_bdf);
 | |
|     pci_bus_map_irqs(pci_bus, vfu_object_map_irq);
 | |
| }
 | |
| 
 | |
| static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type)
 | |
| {
 | |
|     VfuObject *o = vfu_get_private(vfu_ctx);
 | |
| 
 | |
|     /* vfu_object_ctx_run() handles lost connection */
 | |
|     if (type == VFU_RESET_LOST_CONN) {
 | |
|         return 0;
 | |
|     }
 | |
| 
 | |
|     device_cold_reset(DEVICE(o->pci_dev));
 | |
| 
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
 | |
|  * properties. It also depends on devices instantiated in QEMU. These
 | |
|  * dependencies are not available during the instance_init phase of this
 | |
|  * object's life-cycle. As such, the server is initialized after the
 | |
|  * machine is setup. machine_init_done_notifier notifies TYPE_VFU_OBJECT
 | |
|  * when the machine is setup, and the dependencies are available.
 | |
|  */
 | |
| static void vfu_object_machine_done(Notifier *notifier, void *data)
 | |
| {
 | |
|     VfuObject *o = container_of(notifier, VfuObject, machine_done);
 | |
|     Error *err = NULL;
 | |
| 
 | |
|     vfu_object_init_ctx(o, &err);
 | |
| 
 | |
|     if (err) {
 | |
|         error_propagate(&error_abort, err);
 | |
|     }
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * vfu_object_init_ctx: Create and initialize libvfio-user context. Add
 | |
|  *     an unplug blocker for the associated PCI device. Setup a FD handler
 | |
|  *     to process incoming messages in the context's socket.
 | |
|  *
 | |
|  *     The socket and device properties are mandatory, and this function
 | |
|  *     will not create the context without them - the setters for these
 | |
|  *     properties should call this function when the property is set. The
 | |
|  *     machine should also be ready when this function is invoked - it is
 | |
|  *     because QEMU objects are initialized before devices, and the
 | |
|  *     associated PCI device wouldn't be available at the object
 | |
|  *     initialization time. Until these conditions are satisfied, this
 | |
|  *     function would return early without performing any task.
 | |
|  */
 | |
| static void vfu_object_init_ctx(VfuObject *o, Error **errp)
 | |
| {
 | |
|     DeviceState *dev = NULL;
 | |
|     vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
 | |
|     int ret;
 | |
| 
 | |
|     if (o->vfu_ctx || !o->socket || !o->device ||
 | |
|             !phase_check(PHASE_MACHINE_READY)) {
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     if (o->err) {
 | |
|         error_propagate(errp, o->err);
 | |
|         o->err = NULL;
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
 | |
|                                 LIBVFIO_USER_FLAG_ATTACH_NB,
 | |
|                                 o, VFU_DEV_TYPE_PCI);
 | |
|     if (o->vfu_ctx == NULL) {
 | |
|         error_setg(errp, "vfu: Failed to create context - %s", strerror(errno));
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     dev = qdev_find_recursive(sysbus_get_default(), o->device);
 | |
|     if (dev == NULL) {
 | |
|         error_setg(errp, "vfu: Device %s not found", o->device);
 | |
|         goto fail;
 | |
|     }
 | |
| 
 | |
|     if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
 | |
|         error_setg(errp, "vfu: %s not a PCI device", o->device);
 | |
|         goto fail;
 | |
|     }
 | |
| 
 | |
|     o->pci_dev = PCI_DEVICE(dev);
 | |
| 
 | |
|     object_ref(OBJECT(o->pci_dev));
 | |
| 
 | |
|     if (pci_is_express(o->pci_dev)) {
 | |
|         pci_type = VFU_PCI_TYPE_EXPRESS;
 | |
|     }
 | |
| 
 | |
|     ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0);
 | |
|     if (ret < 0) {
 | |
|         error_setg(errp,
 | |
|                    "vfu: Failed to attach PCI device %s to context - %s",
 | |
|                    o->device, strerror(errno));
 | |
|         goto fail;
 | |
|     }
 | |
| 
 | |
|     error_setg(&o->unplug_blocker,
 | |
|                "vfu: %s for %s must be deleted before unplugging",
 | |
|                TYPE_VFU_OBJECT, o->device);
 | |
|     qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
 | |
| 
 | |
|     ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX,
 | |
|                            pci_config_size(o->pci_dev), &vfu_object_cfg_access,
 | |
|                            VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB,
 | |
|                            NULL, 0, -1, 0);
 | |
|     if (ret < 0) {
 | |
|         error_setg(errp,
 | |
|                    "vfu: Failed to setup config space handlers for %s- %s",
 | |
|                    o->device, strerror(errno));
 | |
|         goto fail;
 | |
|     }
 | |
| 
 | |
|     ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister);
 | |
|     if (ret < 0) {
 | |
|         error_setg(errp, "vfu: Failed to setup DMA handlers for %s",
 | |
|                    o->device);
 | |
|         goto fail;
 | |
|     }
 | |
| 
 | |
|     vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
 | |
| 
 | |
|     ret = vfu_object_setup_irqs(o, o->pci_dev);
 | |
|     if (ret < 0) {
 | |
|         error_setg(errp, "vfu: Failed to setup interrupts for %s",
 | |
|                    o->device);
 | |
|         goto fail;
 | |
|     }
 | |
| 
 | |
|     ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset);
 | |
|     if (ret < 0) {
 | |
|         error_setg(errp, "vfu: Failed to setup reset callback");
 | |
|         goto fail;
 | |
|     }
 | |
| 
 | |
|     ret = vfu_realize_ctx(o->vfu_ctx);
 | |
|     if (ret < 0) {
 | |
|         error_setg(errp, "vfu: Failed to realize device %s- %s",
 | |
|                    o->device, strerror(errno));
 | |
|         goto fail;
 | |
|     }
 | |
| 
 | |
|     o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
 | |
|     if (o->vfu_poll_fd < 0) {
 | |
|         error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
 | |
|         goto fail;
 | |
|     }
 | |
| 
 | |
|     qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);
 | |
| 
 | |
|     return;
 | |
| 
 | |
| fail:
 | |
|     vfu_destroy_ctx(o->vfu_ctx);
 | |
|     if (o->unplug_blocker && o->pci_dev) {
 | |
|         qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
 | |
|         error_free(o->unplug_blocker);
 | |
|         o->unplug_blocker = NULL;
 | |
|     }
 | |
|     if (o->pci_dev) {
 | |
|         vfu_object_restore_msi_cbs(o);
 | |
|         o->pci_dev->irq_opaque = NULL;
 | |
|         object_unref(OBJECT(o->pci_dev));
 | |
|         o->pci_dev = NULL;
 | |
|     }
 | |
|     o->vfu_ctx = NULL;
 | |
| }
 | |
| 
 | |
| static void vfu_object_init(Object *obj)
 | |
| {
 | |
|     VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
 | |
|     VfuObject *o = VFU_OBJECT(obj);
 | |
| 
 | |
|     k->nr_devs++;
 | |
| 
 | |
|     if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) {
 | |
|         error_setg(&o->err, "vfu: %s only compatible with %s machine",
 | |
|                    TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     if (!phase_check(PHASE_MACHINE_READY)) {
 | |
|         o->machine_done.notify = vfu_object_machine_done;
 | |
|         qemu_add_machine_init_done_notifier(&o->machine_done);
 | |
|     }
 | |
| 
 | |
|     o->vfu_poll_fd = -1;
 | |
| }
 | |
| 
 | |
| static void vfu_object_finalize(Object *obj)
 | |
| {
 | |
|     VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
 | |
|     VfuObject *o = VFU_OBJECT(obj);
 | |
| 
 | |
|     k->nr_devs--;
 | |
| 
 | |
|     qapi_free_SocketAddress(o->socket);
 | |
| 
 | |
|     o->socket = NULL;
 | |
| 
 | |
|     if (o->vfu_poll_fd != -1) {
 | |
|         qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
 | |
|         o->vfu_poll_fd = -1;
 | |
|     }
 | |
| 
 | |
|     if (o->vfu_ctx) {
 | |
|         vfu_destroy_ctx(o->vfu_ctx);
 | |
|         o->vfu_ctx = NULL;
 | |
|     }
 | |
| 
 | |
|     g_free(o->device);
 | |
| 
 | |
|     o->device = NULL;
 | |
| 
 | |
|     if (o->unplug_blocker && o->pci_dev) {
 | |
|         qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
 | |
|         error_free(o->unplug_blocker);
 | |
|         o->unplug_blocker = NULL;
 | |
|     }
 | |
| 
 | |
|     if (o->pci_dev) {
 | |
|         vfu_object_restore_msi_cbs(o);
 | |
|         o->pci_dev->irq_opaque = NULL;
 | |
|         object_unref(OBJECT(o->pci_dev));
 | |
|         o->pci_dev = NULL;
 | |
|     }
 | |
| 
 | |
|     if (!k->nr_devs && vfu_object_auto_shutdown()) {
 | |
|         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
 | |
|     }
 | |
| 
 | |
|     if (o->machine_done.notify) {
 | |
|         qemu_remove_machine_init_done_notifier(&o->machine_done);
 | |
|         o->machine_done.notify = NULL;
 | |
|     }
 | |
| }
 | |
| 
 | |
| static void vfu_object_class_init(ObjectClass *klass, void *data)
 | |
| {
 | |
|     VfuObjectClass *k = VFU_OBJECT_CLASS(klass);
 | |
| 
 | |
|     k->nr_devs = 0;
 | |
| 
 | |
|     object_class_property_add(klass, "socket", "SocketAddress", NULL,
 | |
|                               vfu_object_set_socket, NULL, NULL);
 | |
|     object_class_property_set_description(klass, "socket",
 | |
|                                           "SocketAddress "
 | |
|                                           "(ex: type=unix,path=/tmp/sock). "
 | |
|                                           "Only UNIX is presently supported");
 | |
|     object_class_property_add_str(klass, "device", NULL,
 | |
|                                   vfu_object_set_device);
 | |
|     object_class_property_set_description(klass, "device",
 | |
|                                           "device ID - only PCI devices "
 | |
|                                           "are presently supported");
 | |
| }
 | |
| 
 | |
| static const TypeInfo vfu_object_info = {
 | |
|     .name = TYPE_VFU_OBJECT,
 | |
|     .parent = TYPE_OBJECT,
 | |
|     .instance_size = sizeof(VfuObject),
 | |
|     .instance_init = vfu_object_init,
 | |
|     .instance_finalize = vfu_object_finalize,
 | |
|     .class_size = sizeof(VfuObjectClass),
 | |
|     .class_init = vfu_object_class_init,
 | |
|     .interfaces = (InterfaceInfo[]) {
 | |
|         { TYPE_USER_CREATABLE },
 | |
|         { }
 | |
|     }
 | |
| };
 | |
| 
 | |
| static void vfu_register_types(void)
 | |
| {
 | |
|     type_register_static(&vfu_object_info);
 | |
| }
 | |
| 
 | |
| type_init(vfu_register_types);
 |