Message-ID: <YuRaCCN9MqE5OjXH@dev-arch.thelio-3990X>
Date: Fri, 29 Jul 2022 15:07:04 -0700
From: Nathan Chancellor <nathan@...nel.org>
To: Jorgen Hansen <jhansen@...are.com>
Cc: linux-kernel@...r.kernel.org,
virtualization@...ts.linux-foundation.org,
gregkh@...uxfoundation.org, pv-drivers@...are.com,
Vishnu Dasa <vdasa@...are.com>
Subject: Re: [PATCH 8/8] VMCI: dma dg: add support for DMA datagrams receive

Hi Jorgen,

On Wed, Feb 02, 2022 at 06:49:10AM -0800, Jorgen Hansen wrote:
> Use the DMA based receive operation instead of the ioread8_rep
> based datagram receive when DMA datagrams are supported.
>
> In the receive operation, configure the header to point to the
> page aligned VMCI_MAX_DG_SIZE part of the receive buffer
> using s/g configuration for the header. This ensures that the
> existing dispatch routine can be used with little modification.
> Initiate the receive by writing the lower 32 bits of the buffer
> to the VMCI_DATA_IN_LOW_ADDR register, and wait for the busy
> flag to be changed by the device using a wait queue.
>
> The existing dispatch routine for received datagrams is reused
> for the DMA datagrams with a few modifications:
> - the receive buffer is always the maximum size for DMA datagrams
>   (the IO port path tries a shorter buffer first to reduce the
>   overhead of the ioread8_rep operation).
> - for DMA datagrams, datagrams are provided contiguously in the
>   buffer, as opposed to IO port datagrams, where they can start
>   on any page boundary.
>
> Reviewed-by: Vishnu Dasa <vdasa@...are.com>
> Signed-off-by: Jorgen Hansen <jhansen@...are.com>
> ---
> drivers/misc/vmw_vmci/vmci_guest.c | 103 ++++++++++++++++++++++-------
> 1 file changed, 79 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/misc/vmw_vmci/vmci_guest.c b/drivers/misc/vmw_vmci/vmci_guest.c
> index ca73a1913404..2bcfa292772d 100644
> --- a/drivers/misc/vmw_vmci/vmci_guest.c
> +++ b/drivers/misc/vmw_vmci/vmci_guest.c
> @@ -58,6 +58,7 @@ struct vmci_guest_device {
>
> struct tasklet_struct datagram_tasklet;
> struct tasklet_struct bm_tasklet;
> + struct wait_queue_head inout_wq;
>
> void *data_buffer;
> dma_addr_t data_buffer_base;
> @@ -115,6 +116,36 @@ void vmci_write_reg(struct vmci_guest_device *dev, u32 val, u32 reg)
> iowrite32(val, dev->iobase + reg);
> }
>
> +static void vmci_read_data(struct vmci_guest_device *vmci_dev,
> + void *dest, size_t size)
> +{
> + if (vmci_dev->mmio_base == NULL)
> + ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
> + dest, size);
> + else {
> + /*
> + * For DMA datagrams, the data_buffer will contain the header on the
> + * first page, followed by the incoming datagram(s) on the following
> + * pages. The header uses an S/G element immediately following the
> + * header on the first page to point to the data area.
> + */
> + struct vmci_data_in_out_header *buffer_header = vmci_dev->data_buffer;
> + struct vmci_sg_elem *sg_array = (struct vmci_sg_elem *)(buffer_header + 1);
> + size_t buffer_offset = dest - vmci_dev->data_buffer;
> +
> + buffer_header->opcode = 1;
> + buffer_header->size = 1;
> + buffer_header->busy = 0;
> + sg_array[0].addr = vmci_dev->data_buffer_base + buffer_offset;
> + sg_array[0].size = size;
> +
> + vmci_write_reg(vmci_dev, lower_32_bits(vmci_dev->data_buffer_base),
> + VMCI_DATA_IN_LOW_ADDR);
> +
> + wait_event(vmci_dev->inout_wq, buffer_header->busy == 1);

Apologies if this has already been reported somewhere, if this change is
not the root cause (I did not bother bisecting, I just inferred it from
the call chain), or if there is a patch for this issue somewhere that I
missed.

When CONFIG_DEBUG_ATOMIC_SLEEP is enabled, this call to wait_event()
triggers a "sleeping function called from invalid context" BUG in dmesg,
which I see when booting Fedora Rawhide in VMware Fusion on my MacBook
Air: wait_event() may sleep, but vmci_read_data() is reached here from
the datagram tasklet, i.e. softirq context, where sleeping is not
allowed. As far as I can tell, the kernel is vanilla.

[ 20.264639] BUG: sleeping function called from invalid context at drivers/misc/vmw_vmci/vmci_guest.c:145
[ 20.264643] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 762, name: vmtoolsd
[ 20.264645] preempt_count: 101, expected: 0
[ 20.264646] RCU nest depth: 0, expected: 0
[ 20.264647] 1 lock held by vmtoolsd/762:
[ 20.264648] #0: ffff0000874ae440 (sk_lock-AF_VSOCK){+.+.}-{0:0}, at: vsock_connect+0x60/0x330 [vsock]
[ 20.264658] Preemption disabled at:
[ 20.264659] [<ffff80000151d7d8>] vmci_send_datagram+0x44/0xa0 [vmw_vmci]
[ 20.264665] CPU: 0 PID: 762 Comm: vmtoolsd Not tainted 5.19.0-0.rc8.20220727git39c3c396f813.60.fc37.aarch64 #1
[ 20.264667] Hardware name: VMware, Inc. VBSA/VBSA, BIOS VEFI 12/31/2020
[ 20.264668] Call trace:
[ 20.264669] dump_backtrace+0xc4/0x130
[ 20.264672] show_stack+0x24/0x80
[ 20.264673] dump_stack_lvl+0x88/0xb4
[ 20.264676] dump_stack+0x18/0x34
[ 20.264677] __might_resched+0x1a0/0x280
[ 20.264679] __might_sleep+0x58/0x90
[ 20.264681] vmci_read_data+0x74/0x120 [vmw_vmci]
[ 20.264683] vmci_dispatch_dgs+0x64/0x204 [vmw_vmci]
[ 20.264686] tasklet_action_common.constprop.0+0x13c/0x150
[ 20.264688] tasklet_action+0x40/0x50
[ 20.264689] __do_softirq+0x23c/0x6b4
[ 20.264690] __irq_exit_rcu+0x104/0x214
[ 20.264691] irq_exit_rcu+0x1c/0x50
[ 20.264693] el1_interrupt+0x38/0x6c
[ 20.264695] el1h_64_irq_handler+0x18/0x24
[ 20.264696] el1h_64_irq+0x68/0x6c
[ 20.264697] preempt_count_sub+0xa4/0xe0
[ 20.264698] _raw_spin_unlock_irqrestore+0x64/0xb0
[ 20.264701] vmci_send_datagram+0x7c/0xa0 [vmw_vmci]
[ 20.264703] vmci_datagram_dispatch+0x84/0x100 [vmw_vmci]
[ 20.264706] vmci_datagram_send+0x2c/0x40 [vmw_vmci]
[ 20.264709] vmci_transport_send_control_pkt+0xb8/0x120 [vmw_vsock_vmci_transport]
[ 20.264711] vmci_transport_connect+0x40/0x7c [vmw_vsock_vmci_transport]
[ 20.264713] vsock_connect+0x278/0x330 [vsock]
[ 20.264715] __sys_connect_file+0x8c/0xc0
[ 20.264718] __sys_connect+0x84/0xb4
[ 20.264720] __arm64_sys_connect+0x2c/0x3c
[ 20.264721] invoke_syscall+0x78/0x100
[ 20.264723] el0_svc_common.constprop.0+0x68/0x124
[ 20.264724] do_el0_svc+0x38/0x4c
[ 20.264725] el0_svc+0x60/0x180
[ 20.264726] el0t_64_sync_handler+0x11c/0x150
[ 20.264728] el0t_64_sync+0x190/0x194

Attached is the entire dmesg in case it is useful, as I see several
messages with different "preemption disabled at" values.

If there is any additional information I can provide or patches I can
test, please let me know.
Cheers,
Nathan
> + }
> +}
> +
> int vmci_write_data(struct vmci_guest_device *dev, struct vmci_datagram *dg)
> {
> int result;
> @@ -260,15 +291,17 @@ static int vmci_check_host_caps(struct pci_dev *pdev)
> }
>
> /*
> - * Reads datagrams from the data in port and dispatches them. We
> - * always start reading datagrams into only the first page of the
> - * datagram buffer. If the datagrams don't fit into one page, we
> - * use the maximum datagram buffer size for the remainder of the
> - * invocation. This is a simple heuristic for not penalizing
> - * small datagrams.
> + * Reads datagrams from the device and dispatches them. For IO port
> + * based access to the device, we always start reading datagrams into
> + * only the first page of the datagram buffer. If the datagrams don't
> + * fit into one page, we use the maximum datagram buffer size for the
> + * remainder of the invocation. This is a simple heuristic for not
> + * penalizing small datagrams. For DMA-based datagrams, we always
> + * use the maximum datagram buffer size, since there is no performance
> + * penalty for doing so.
> *
> * This function assumes that it has exclusive access to the data
> - * in port for the duration of the call.
> + * in register(s) for the duration of the call.
> */
> static void vmci_dispatch_dgs(unsigned long data)
> {
> @@ -276,23 +309,41 @@ static void vmci_dispatch_dgs(unsigned long data)
> u8 *dg_in_buffer = vmci_dev->data_buffer;
> struct vmci_datagram *dg;
> size_t dg_in_buffer_size = VMCI_MAX_DG_SIZE;
> - size_t current_dg_in_buffer_size = PAGE_SIZE;
> + size_t current_dg_in_buffer_size;
> size_t remaining_bytes;
> + bool is_io_port = vmci_dev->mmio_base == NULL;
>
> BUILD_BUG_ON(VMCI_MAX_DG_SIZE < PAGE_SIZE);
>
> - ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
> - vmci_dev->data_buffer, current_dg_in_buffer_size);
> + if (!is_io_port) {
> + /* For mmio, the first page is used for the header. */
> + dg_in_buffer += PAGE_SIZE;
> +
> + /*
> + * For DMA-based datagram operations, there is no performance
> + * penalty for reading the maximum buffer size.
> + */
> + current_dg_in_buffer_size = VMCI_MAX_DG_SIZE;
> + } else {
> + current_dg_in_buffer_size = PAGE_SIZE;
> + }
> + vmci_read_data(vmci_dev, dg_in_buffer, current_dg_in_buffer_size);
> dg = (struct vmci_datagram *)dg_in_buffer;
> remaining_bytes = current_dg_in_buffer_size;
>
> + /*
> + * Read through the buffer until an invalid datagram header is
> + * encountered. The exit condition for datagrams read through
> + * VMCI_DATA_IN_ADDR is a bit more complicated, since a datagram
> + * can start on any page boundary in the buffer.
> + */
> while (dg->dst.resource != VMCI_INVALID_ID ||
> - remaining_bytes > PAGE_SIZE) {
> + (is_io_port && remaining_bytes > PAGE_SIZE)) {
> unsigned dg_in_size;
>
> /*
> - * When the input buffer spans multiple pages, a datagram can
> - * start on any page boundary in the buffer.
> + * If using VMCI_DATA_IN_ADDR, skip to the next page
> + * as a datagram can start on any page boundary.
> */
> if (dg->dst.resource == VMCI_INVALID_ID) {
> dg = (struct vmci_datagram *)roundup(
> @@ -342,11 +393,10 @@ static void vmci_dispatch_dgs(unsigned long data)
> current_dg_in_buffer_size =
> dg_in_buffer_size;
>
> - ioread8_rep(vmci_dev->iobase +
> - VMCI_DATA_IN_ADDR,
> - vmci_dev->data_buffer +
> + vmci_read_data(vmci_dev,
> + dg_in_buffer +
> remaining_bytes,
> - current_dg_in_buffer_size -
> + current_dg_in_buffer_size -
> remaining_bytes);
> }
>
> @@ -384,10 +434,8 @@ static void vmci_dispatch_dgs(unsigned long data)
> current_dg_in_buffer_size = dg_in_buffer_size;
>
> for (;;) {
> - ioread8_rep(vmci_dev->iobase +
> - VMCI_DATA_IN_ADDR,
> - vmci_dev->data_buffer,
> - current_dg_in_buffer_size);
> + vmci_read_data(vmci_dev, dg_in_buffer,
> + current_dg_in_buffer_size);
> if (bytes_to_skip <= current_dg_in_buffer_size)
> break;
>
> @@ -404,8 +452,7 @@ static void vmci_dispatch_dgs(unsigned long data)
> if (remaining_bytes < VMCI_DG_HEADERSIZE) {
> /* Get the next batch of datagrams. */
>
> - ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
> - vmci_dev->data_buffer,
> + vmci_read_data(vmci_dev, dg_in_buffer,
> current_dg_in_buffer_size);
> dg = (struct vmci_datagram *)dg_in_buffer;
> remaining_bytes = current_dg_in_buffer_size;
> @@ -463,8 +510,11 @@ static irqreturn_t vmci_interrupt(int irq, void *_dev)
> icr &= ~VMCI_ICR_NOTIFICATION;
> }
>
> - if (icr & VMCI_ICR_DMA_DATAGRAM)
> +
> + if (icr & VMCI_ICR_DMA_DATAGRAM) {
> + wake_up_all(&dev->inout_wq);
> icr &= ~VMCI_ICR_DMA_DATAGRAM;
> + }
>
> if (icr != 0)
> dev_warn(dev->dev,
> @@ -497,6 +547,10 @@ static irqreturn_t vmci_interrupt_bm(int irq, void *_dev)
> */
> static irqreturn_t vmci_interrupt_dma_datagram(int irq, void *_dev)
> {
> + struct vmci_guest_device *dev = _dev;
> +
> + wake_up_all(&dev->inout_wq);
> +
> return IRQ_HANDLED;
> }
>
> @@ -586,6 +640,7 @@ static int vmci_guest_probe_device(struct pci_dev *pdev,
> vmci_dispatch_dgs, (unsigned long)vmci_dev);
> tasklet_init(&vmci_dev->bm_tasklet,
> vmci_process_bitmap, (unsigned long)vmci_dev);
> + init_waitqueue_head(&vmci_dev->inout_wq);
>
> if (mmio_base != NULL) {
> vmci_dev->tx_buffer = dma_alloc_coherent(&pdev->dev, VMCI_DMA_DG_BUFFER_SIZE,
> --
> 2.25.1
>
>
View attachment "vmci_read_data.log" of type "text/plain" (113913 bytes)