linux-kernel - RE: [PATCH v3 5/5] udmabuf: remove udmabuf

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <IA0PR11MB71858D28621C745C1663B593F8822@IA0PR11MB7185.namprd11.prod.outlook.com>
Date: Sat, 17 Aug 2024 01:05:29 +0000
From: "Kasireddy, Vivek" <vivek.kasireddy@...el.com>
To: Huan Yang <link@...o.com>, Gerd Hoffmann <kraxel@...hat.com>, Sumit Semwal
	<sumit.semwal@...aro.org>, Christian König
	<christian.koenig@....com>, "dri-devel@...ts.freedesktop.org"
	<dri-devel@...ts.freedesktop.org>, "linux-media@...r.kernel.org"
	<linux-media@...r.kernel.org>, "linaro-mm-sig@...ts.linaro.org"
	<linaro-mm-sig@...ts.linaro.org>, "linux-kernel@...r.kernel.org"
	<linux-kernel@...r.kernel.org>
CC: "opensource.kernel@...o.com" <opensource.kernel@...o.com>
Subject: RE: [PATCH v3 5/5] udmabuf: remove udmabuf_folio

Hi Huan,

> 
> Currently, udmabuf handles folio by creating an unpin list to record
> each folio obtained from the list and unpinning them when released. To
> maintain this approach, many data structures have been established.
> 
> However, maintaining this type of data structure requires a significant
> amount of memory and traversing the list is a substantial overhead,
> which is not friendly to the CPU cache.
> 
> Considering that during creation, we arranged the folio array in the
> order of pin and set the offset according to pgcnt.
> 
> We actually don't need to use unpin_list to unpin during release.
> Instead, we can iterate through the folios array during release and
> unpin any folio that is different from the ones previously accessed.
No, that won't work because iterating the folios array doesn't tell you
anything about how many times a folio was pinned (via memfd_pin_folios()),
as a folio could be part of multiple ranges.

For example, if userspace provides ranges 64..128 and 256..512 (assuming
these are 4k sized subpage offsets and we have a 2MB hugetlb folio), then
the same folio would cover both ranges and there would be 2 entries for
this folio in unpin_list. But, with your logic, we'd be erroneously unpinning
it only once.

Not sure if there are any great solutions available to address this situation,
but one option I can think of is to convert unpin_list to unpin array (dynamically
resized with krealloc?) and track its length separately. Or, as suggested earlier,
another way is to not use unpin_list for memfds backed by shmem, but I suspect
this may not work if THP is enabled.

Thanks,
Vivek

> 
> By this, not only saves the overhead of the udmabuf_folio data structure
> but also makes array access more cache-friendly.
> 
> Signed-off-by: Huan Yang <link@...o.com>
> ---
>  drivers/dma-buf/udmabuf.c | 68 +++++++++++++++++----------------------
>  1 file changed, 30 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
> index 8f9cb0e2e71a..1e7f46c33d1a 100644
> --- a/drivers/dma-buf/udmabuf.c
> +++ b/drivers/dma-buf/udmabuf.c
> @@ -26,16 +26,19 @@ MODULE_PARM_DESC(size_limit_mb, "Max size of a
> dmabuf, in megabytes. Default is
> 
>  struct udmabuf {
>  	pgoff_t pagecount;
> -	struct folio **folios;
>  	struct sg_table *sg;
>  	struct miscdevice *device;
> +	struct folio **folios;
> +	/**
> +	 * offset in folios array's folio, byte unit.
> +	 * udmabuf can use either shmem or hugetlb pages, an array based
> on
> +	 * pages may not be suitable.
> +	 * Especially when HVO is enabled, the tail page will be released,
> +	 * so our reference to the page will no longer be correct.
> +	 * Hence, it's necessary to record the offset in order to reference
> +	 * the correct PFN within the folio.
> +	 */
>  	pgoff_t *offsets;
> -	struct list_head unpin_list;
> -};
> -
> -struct udmabuf_folio {
> -	struct folio *folio;
> -	struct list_head list;
>  };
> 
>  static int mmap_udmabuf(struct dma_buf *buf, struct vm_area_struct
> *vma)
> @@ -160,32 +163,28 @@ static void unmap_udmabuf(struct
> dma_buf_attachment *at,
>  	return put_sg_table(at->dev, sg, direction);
>  }
> 
> -static void unpin_all_folios(struct list_head *unpin_list)
> +/**
> + * unpin_all_folios:		unpin each folio we pinned in create
> + * The udmabuf set all folio in folios and pinned it, but for large folio,
> + * We may have only used a small portion of the physical in the folio.
> + * we will repeatedly, sequentially set the folio into the array to ensure
> + * that the offset can index the correct folio at the corresponding index.
> + * Hence, we only need to unpin the first iterred folio.
> + */
> +static void unpin_all_folios(struct udmabuf *ubuf)
>  {
> -	struct udmabuf_folio *ubuf_folio;
> -
> -	while (!list_empty(unpin_list)) {
> -		ubuf_folio = list_first_entry(unpin_list,
> -					      struct udmabuf_folio, list);
> -		unpin_folio(ubuf_folio->folio);
> -
> -		list_del(&ubuf_folio->list);
> -		kfree(ubuf_folio);
> -	}
> -}
> +	pgoff_t pg;
> +	struct folio *last = NULL;
> 
> -static int add_to_unpin_list(struct list_head *unpin_list,
> -			     struct folio *folio)
> -{
> -	struct udmabuf_folio *ubuf_folio;
> +	for (pg = 0; pg < ubuf->pagecount; pg++) {
> +		struct folio *tmp = ubuf->folios[pg];
> 
> -	ubuf_folio = kzalloc(sizeof(*ubuf_folio), GFP_KERNEL);
> -	if (!ubuf_folio)
> -		return -ENOMEM;
> +		if (tmp == last)
> +			continue;
> 
> -	ubuf_folio->folio = folio;
> -	list_add_tail(&ubuf_folio->list, unpin_list);
> -	return 0;
> +		last = tmp;
> +		unpin_folio(tmp);
> +	}
>  }
> 
>  static void release_udmabuf(struct dma_buf *buf)
> @@ -196,7 +195,7 @@ static void release_udmabuf(struct dma_buf *buf)
>  	if (ubuf->sg)
>  		put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL);
> 
> -	unpin_all_folios(&ubuf->unpin_list);
> +	unpin_all_folios(ubuf);
>  	kvfree(ubuf->offsets);
>  	kvfree(ubuf->folios);
>  	kfree(ubuf);
> @@ -308,7 +307,6 @@ static long udmabuf_create(struct miscdevice
> *device,
>  	if (!ubuf)
>  		return -ENOMEM;
> 
> -	INIT_LIST_HEAD(&ubuf->unpin_list);
>  	pglimit = (size_limit_mb * 1024 * 1024) >> PAGE_SHIFT;
>  	for (i = 0; i < head->count; i++) {
>  		if (!IS_ALIGNED(list[i].offset, PAGE_SIZE))
> @@ -366,12 +364,6 @@ static long udmabuf_create(struct miscdevice
> *device,
>  			u32 k;
>  			long fsize = folio_size(folios[j]);
> 
> -			ret = add_to_unpin_list(&ubuf->unpin_list, folios[j]);
> -			if (ret < 0) {
> -				kfree(folios);
> -				goto err;
> -			}
> -
>  			for (k = pgoff; k < fsize; k += PAGE_SIZE) {
>  				ubuf->folios[pgbuf] = folios[j];
>  				ubuf->offsets[pgbuf] = k;
> @@ -399,7 +391,7 @@ static long udmabuf_create(struct miscdevice
> *device,
>  err:
>  	if (memfd)
>  		fput(memfd);
> -	unpin_all_folios(&ubuf->unpin_list);
> +	unpin_all_folios(ubuf);
>  	kvfree(ubuf->offsets);
>  	kvfree(ubuf->folios);
>  	kfree(ubuf);
> --
> 2.45.2