Date:   Fri, 29 Jul 2022 15:07:04 -0700
From:   Nathan Chancellor <nathan@...nel.org>
To:     Jorgen Hansen <jhansen@...are.com>
Cc:     linux-kernel@...r.kernel.org,
        virtualization@...ts.linux-foundation.org,
        gregkh@...uxfoundation.org, pv-drivers@...are.com,
        Vishnu Dasa <vdasa@...are.com>
Subject: Re: [PATCH 8/8] VMCI: dma dg: add support for DMA datagrams receive

Hi Jorgen,

On Wed, Feb 02, 2022 at 06:49:10AM -0800, Jorgen Hansen wrote:
> Use the DMA based receive operation instead of the ioread8_rep
> based datagram receive when DMA datagrams are supported.
> 
> In the receive operation, configure the header to point to the
> page aligned VMCI_MAX_DG_SIZE part of the receive buffer
> using s/g configuration for the header. This ensures that the
> existing dispatch routine can be used with little modification.
> Initiate the receive by writing the lower 32 bits of the buffer
> to the VMCI_DATA_IN_LOW_ADDR register, and wait for the busy
> flag to be changed by the device using a wait queue.
> 
> The existing dispatch routine for received datagrams is reused
> for the DMA datagrams with a few modifications:
> - the receive buffer is always the maximum size for DMA datagrams
>   (IO ports would try with a shorter buffer first to reduce
>   overhead of the ioread8_rep operation).
> - for DMA datagrams, datagrams are provided contiguously in the
>   buffer as opposed to IO port datagrams, where they can start
>   on any page boundary
> 
> Reviewed-by: Vishnu Dasa <vdasa@...are.com>
> Signed-off-by: Jorgen Hansen <jhansen@...are.com>
> ---
>  drivers/misc/vmw_vmci/vmci_guest.c | 103 ++++++++++++++++++++++-------
>  1 file changed, 79 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/misc/vmw_vmci/vmci_guest.c b/drivers/misc/vmw_vmci/vmci_guest.c
> index ca73a1913404..2bcfa292772d 100644
> --- a/drivers/misc/vmw_vmci/vmci_guest.c
> +++ b/drivers/misc/vmw_vmci/vmci_guest.c
> @@ -58,6 +58,7 @@ struct vmci_guest_device {
>  
>  	struct tasklet_struct datagram_tasklet;
>  	struct tasklet_struct bm_tasklet;
> +	struct wait_queue_head inout_wq;
>  
>  	void *data_buffer;
>  	dma_addr_t data_buffer_base;
> @@ -115,6 +116,36 @@ void vmci_write_reg(struct vmci_guest_device *dev, u32 val, u32 reg)
>  		iowrite32(val, dev->iobase + reg);
>  }
>  
> +static void vmci_read_data(struct vmci_guest_device *vmci_dev,
> +			   void *dest, size_t size)
> +{
> +	if (vmci_dev->mmio_base == NULL)
> +		ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
> +			    dest, size);
> +	else {
> +		/*
> +		 * For DMA datagrams, the data_buffer will contain the header on the
> +		 * first page, followed by the incoming datagram(s) on the following
> +		 * pages. The header uses an S/G element immediately following the
> +		 * header on the first page to point to the data area.
> +		 */
> +		struct vmci_data_in_out_header *buffer_header = vmci_dev->data_buffer;
> +		struct vmci_sg_elem *sg_array = (struct vmci_sg_elem *)(buffer_header + 1);
> +		size_t buffer_offset = dest - vmci_dev->data_buffer;
> +
> +		buffer_header->opcode = 1;
> +		buffer_header->size = 1;
> +		buffer_header->busy = 0;
> +		sg_array[0].addr = vmci_dev->data_buffer_base + buffer_offset;
> +		sg_array[0].size = size;
> +
> +		vmci_write_reg(vmci_dev, lower_32_bits(vmci_dev->data_buffer_base),
> +			       VMCI_DATA_IN_LOW_ADDR);
> +
> +		wait_event(vmci_dev->inout_wq, buffer_header->busy == 1);

Apologies if this has already been reported somewhere, if this change is
not the root cause (I did not bisect; I just inferred it from the call
chain), or if there is already a patch for this issue that I did not
notice.

When CONFIG_DEBUG_ATOMIC_SLEEP is enabled, this call to wait_event()
causes a BUG in dmesg, which I see when booting Fedora Rawhide in VMware
Fusion on my MacBook Air. As far as I can tell, the kernel is vanilla.

[   20.264639] BUG: sleeping function called from invalid context at drivers/misc/vmw_vmci/vmci_guest.c:145
[   20.264643] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 762, name: vmtoolsd
[   20.264645] preempt_count: 101, expected: 0
[   20.264646] RCU nest depth: 0, expected: 0
[   20.264647] 1 lock held by vmtoolsd/762:
[   20.264648]  #0: ffff0000874ae440 (sk_lock-AF_VSOCK){+.+.}-{0:0}, at: vsock_connect+0x60/0x330 [vsock]
[   20.264658] Preemption disabled at:
[   20.264659] [<ffff80000151d7d8>] vmci_send_datagram+0x44/0xa0 [vmw_vmci]
[   20.264665] CPU: 0 PID: 762 Comm: vmtoolsd Not tainted 5.19.0-0.rc8.20220727git39c3c396f813.60.fc37.aarch64 #1
[   20.264667] Hardware name: VMware, Inc. VBSA/VBSA, BIOS VEFI 12/31/2020
[   20.264668] Call trace:
[   20.264669]  dump_backtrace+0xc4/0x130
[   20.264672]  show_stack+0x24/0x80
[   20.264673]  dump_stack_lvl+0x88/0xb4
[   20.264676]  dump_stack+0x18/0x34
[   20.264677]  __might_resched+0x1a0/0x280
[   20.264679]  __might_sleep+0x58/0x90
[   20.264681]  vmci_read_data+0x74/0x120 [vmw_vmci]
[   20.264683]  vmci_dispatch_dgs+0x64/0x204 [vmw_vmci]
[   20.264686]  tasklet_action_common.constprop.0+0x13c/0x150
[   20.264688]  tasklet_action+0x40/0x50
[   20.264689]  __do_softirq+0x23c/0x6b4
[   20.264690]  __irq_exit_rcu+0x104/0x214
[   20.264691]  irq_exit_rcu+0x1c/0x50
[   20.264693]  el1_interrupt+0x38/0x6c
[   20.264695]  el1h_64_irq_handler+0x18/0x24
[   20.264696]  el1h_64_irq+0x68/0x6c
[   20.264697]  preempt_count_sub+0xa4/0xe0
[   20.264698]  _raw_spin_unlock_irqrestore+0x64/0xb0
[   20.264701]  vmci_send_datagram+0x7c/0xa0 [vmw_vmci]
[   20.264703]  vmci_datagram_dispatch+0x84/0x100 [vmw_vmci]
[   20.264706]  vmci_datagram_send+0x2c/0x40 [vmw_vmci]
[   20.264709]  vmci_transport_send_control_pkt+0xb8/0x120 [vmw_vsock_vmci_transport]
[   20.264711]  vmci_transport_connect+0x40/0x7c [vmw_vsock_vmci_transport]
[   20.264713]  vsock_connect+0x278/0x330 [vsock]
[   20.264715]  __sys_connect_file+0x8c/0xc0
[   20.264718]  __sys_connect+0x84/0xb4
[   20.264720]  __arm64_sys_connect+0x2c/0x3c
[   20.264721]  invoke_syscall+0x78/0x100
[   20.264723]  el0_svc_common.constprop.0+0x68/0x124
[   20.264724]  do_el0_svc+0x38/0x4c
[   20.264725]  el0_svc+0x60/0x180
[   20.264726]  el0t_64_sync_handler+0x11c/0x150
[   20.264728]  el0t_64_sync+0x190/0x194

Attached is the entire dmesg in case it is useful, as I see several
messages with different "preemption disabled at" values.
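
For what it's worth, here is a minimal, self-contained sketch of the
pattern the trace above shows (the demo_* names are made up; this is not
the VMCI code itself): a tasklet runs in softirq context with preemption
disabled, and wait_event() unconditionally calls might_sleep() before it
even checks its condition, which is what CONFIG_DEBUG_ATOMIC_SLEEP flags.

/*
 * Illustrative only: hypothetical demo_* names, not the VMCI driver.
 * A tasklet runs in softirq (atomic) context, so the might_sleep()
 * inside wait_event() fires the "sleeping function called from invalid
 * context" splat, whether or not the wait actually sleeps.
 */
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static struct tasklet_struct demo_tasklet;
static bool demo_flag;

static void demo_tasklet_fn(unsigned long data)
{
	/*
	 * Softirq context: preemption is disabled here, so sleeping is
	 * not allowed. This reproduces the same splat that
	 * vmci_read_data() triggers above.
	 */
	wait_event(demo_wq, demo_flag);
}

static int __init demo_init(void)
{
	/*
	 * The condition is already true, so wait_event() never actually
	 * sleeps, but its unconditional might_sleep() still fires the
	 * warning when CONFIG_DEBUG_ATOMIC_SLEEP is enabled.
	 */
	demo_flag = true;
	tasklet_init(&demo_tasklet, demo_tasklet_fn, 0);
	tasklet_schedule(&demo_tasklet);
	return 0;
}

static void __exit demo_exit(void)
{
	tasklet_kill(&demo_tasklet);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

(The equivalent in the driver is the wait_event() on inout_wq in
vmci_read_data(), reached from the datagram tasklet.)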

If there is any additional information I can provide or patches I can
test, please let me know.

Cheers,
Nathan

> +	}
> +}
> +
>  int vmci_write_data(struct vmci_guest_device *dev, struct vmci_datagram *dg)
>  {
>  	int result;
> @@ -260,15 +291,17 @@ static int vmci_check_host_caps(struct pci_dev *pdev)
>  }
>  
>  /*
> - * Reads datagrams from the data in port and dispatches them. We
> - * always start reading datagrams into only the first page of the
> - * datagram buffer. If the datagrams don't fit into one page, we
> - * use the maximum datagram buffer size for the remainder of the
> - * invocation. This is a simple heuristic for not penalizing
> - * small datagrams.
> + * Reads datagrams from the device and dispatches them. For IO port
> + * based access to the device, we always start reading datagrams into
> + * only the first page of the datagram buffer. If the datagrams don't
> + * fit into one page, we use the maximum datagram buffer size for the
> + * remainder of the invocation. This is a simple heuristic for not
> + * penalizing small datagrams. For DMA-based datagrams, we always
> + * use the maximum datagram buffer size, since there is no performance
> + * penalty for doing so.
>   *
>   * This function assumes that it has exclusive access to the data
> - * in port for the duration of the call.
> + * in register(s) for the duration of the call.
>   */
>  static void vmci_dispatch_dgs(unsigned long data)
>  {
> @@ -276,23 +309,41 @@ static void vmci_dispatch_dgs(unsigned long data)
>  	u8 *dg_in_buffer = vmci_dev->data_buffer;
>  	struct vmci_datagram *dg;
>  	size_t dg_in_buffer_size = VMCI_MAX_DG_SIZE;
> -	size_t current_dg_in_buffer_size = PAGE_SIZE;
> +	size_t current_dg_in_buffer_size;
>  	size_t remaining_bytes;
> +	bool is_io_port = vmci_dev->mmio_base == NULL;
>  
>  	BUILD_BUG_ON(VMCI_MAX_DG_SIZE < PAGE_SIZE);
>  
> -	ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
> -		    vmci_dev->data_buffer, current_dg_in_buffer_size);
> +	if (!is_io_port) {
> +		/* For mmio, the first page is used for the header. */
> +		dg_in_buffer += PAGE_SIZE;
> +
> +		/*
> +		 * For DMA-based datagram operations, there is no performance
> +		 * penalty for reading the maximum buffer size.
> +		 */
> +		current_dg_in_buffer_size = VMCI_MAX_DG_SIZE;
> +	} else {
> +		current_dg_in_buffer_size = PAGE_SIZE;
> +	}
> +	vmci_read_data(vmci_dev, dg_in_buffer, current_dg_in_buffer_size);
>  	dg = (struct vmci_datagram *)dg_in_buffer;
>  	remaining_bytes = current_dg_in_buffer_size;
>  
> +	/*
> +	 * Read through the buffer until an invalid datagram header is
> +	 * encountered. The exit condition for datagrams read through
> +	 * VMCI_DATA_IN_ADDR is a bit more complicated, since a datagram
> +	 * can start on any page boundary in the buffer.
> +	 */
>  	while (dg->dst.resource != VMCI_INVALID_ID ||
> -	       remaining_bytes > PAGE_SIZE) {
> +	       (is_io_port && remaining_bytes > PAGE_SIZE)) {
>  		unsigned dg_in_size;
>  
>  		/*
> -		 * When the input buffer spans multiple pages, a datagram can
> -		 * start on any page boundary in the buffer.
> +		 * If using VMCI_DATA_IN_ADDR, skip to the next page
> +		 * as a datagram can start on any page boundary.
>  		 */
>  		if (dg->dst.resource == VMCI_INVALID_ID) {
>  			dg = (struct vmci_datagram *)roundup(
> @@ -342,11 +393,10 @@ static void vmci_dispatch_dgs(unsigned long data)
>  					current_dg_in_buffer_size =
>  					    dg_in_buffer_size;
>  
> -				ioread8_rep(vmci_dev->iobase +
> -						VMCI_DATA_IN_ADDR,
> -					vmci_dev->data_buffer +
> +				vmci_read_data(vmci_dev,
> +					       dg_in_buffer +
>  						remaining_bytes,
> -					current_dg_in_buffer_size -
> +					       current_dg_in_buffer_size -
>  						remaining_bytes);
>  			}
>  
> @@ -384,10 +434,8 @@ static void vmci_dispatch_dgs(unsigned long data)
>  				current_dg_in_buffer_size = dg_in_buffer_size;
>  
>  			for (;;) {
> -				ioread8_rep(vmci_dev->iobase +
> -						VMCI_DATA_IN_ADDR,
> -					vmci_dev->data_buffer,
> -					current_dg_in_buffer_size);
> +				vmci_read_data(vmci_dev, dg_in_buffer,
> +					       current_dg_in_buffer_size);
>  				if (bytes_to_skip <= current_dg_in_buffer_size)
>  					break;
>  
> @@ -404,8 +452,7 @@ static void vmci_dispatch_dgs(unsigned long data)
>  		if (remaining_bytes < VMCI_DG_HEADERSIZE) {
>  			/* Get the next batch of datagrams. */
>  
> -			ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
> -				    vmci_dev->data_buffer,
> +			vmci_read_data(vmci_dev, dg_in_buffer,
>  				    current_dg_in_buffer_size);
>  			dg = (struct vmci_datagram *)dg_in_buffer;
>  			remaining_bytes = current_dg_in_buffer_size;
> @@ -463,8 +510,11 @@ static irqreturn_t vmci_interrupt(int irq, void *_dev)
>  			icr &= ~VMCI_ICR_NOTIFICATION;
>  		}
>  
> -		if (icr & VMCI_ICR_DMA_DATAGRAM)
> +
> +		if (icr & VMCI_ICR_DMA_DATAGRAM) {
> +			wake_up_all(&dev->inout_wq);
>  			icr &= ~VMCI_ICR_DMA_DATAGRAM;
> +		}
>  
>  		if (icr != 0)
>  			dev_warn(dev->dev,
> @@ -497,6 +547,10 @@ static irqreturn_t vmci_interrupt_bm(int irq, void *_dev)
>   */
>  static irqreturn_t vmci_interrupt_dma_datagram(int irq, void *_dev)
>  {
> +	struct vmci_guest_device *dev = _dev;
> +
> +	wake_up_all(&dev->inout_wq);
> +
>  	return IRQ_HANDLED;
>  }
>  
> @@ -586,6 +640,7 @@ static int vmci_guest_probe_device(struct pci_dev *pdev,
>  		     vmci_dispatch_dgs, (unsigned long)vmci_dev);
>  	tasklet_init(&vmci_dev->bm_tasklet,
>  		     vmci_process_bitmap, (unsigned long)vmci_dev);
> +	init_waitqueue_head(&vmci_dev->inout_wq);
>  
>  	if (mmio_base != NULL) {
>  		vmci_dev->tx_buffer = dma_alloc_coherent(&pdev->dev, VMCI_DMA_DG_BUFFER_SIZE,
> -- 
> 2.25.1
> 
> 

View attachment "vmci_read_data.log" of type "text/plain" (113913 bytes)
