[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <IA0PR11MB7185115BF4B741E9229D0E68F8BB2@IA0PR11MB7185.namprd11.prod.outlook.com>
Date: Sat, 10 Aug 2024 02:52:22 +0000
From: "Kasireddy, Vivek" <vivek.kasireddy@...el.com>
To: Huan Yang <link@...o.com>, Gerd Hoffmann <kraxel@...hat.com>, Sumit Semwal
<sumit.semwal@...aro.org>, Christian König
<christian.koenig@....com>, "dri-devel@...ts.freedesktop.org"
<dri-devel@...ts.freedesktop.org>, "linux-media@...r.kernel.org"
<linux-media@...r.kernel.org>, "linaro-mm-sig@...ts.linaro.org"
<linaro-mm-sig@...ts.linaro.org>, "linux-kernel@...r.kernel.org"
<linux-kernel@...r.kernel.org>
CC: "opensource.kernel@...o.com" <opensource.kernel@...o.com>
Subject: RE: [PATCH v2 4/4] udmabuf: remove folio unpin list
Hi Huan,
>
> Currently, udmabuf handles folio by creating an unpin list to record
> each folio obtained from the list and unpinning them when released. To
> maintain this approach, many data structures have been established.
>
> However, maintaining this type of data structure requires a significant
> amount of memory and traversing the list is a substantial overhead,
Have you tried to quantify this overhead?
> which is not friendly to the CPU cache, TLB, and so on.
>
> Therefore, this patch removes the relationship between the folio and its
> offset in the linear address mapping.
>
> As an alternative, udmabuf both maintain the folio array and page array,
> folio array use to unpin, and the page array is used as before to handle
> the requirements for the page.
Using pages is a step backwards, given the trend towards embracing folios.
Moreover, the feedback from the former hugetlb maintainer (Mike Kravetz)
was to not use subpages (or tail pages) of a hugetlb folio directly in udmabuf
driver as it would cause problems, particularly when hugetlb vmemmap
optimization (HVO) is enabled. AFAIU, if HVO is enabled by default, a tail page's
struct page pointer may not be available (as it may very well be freed to
save memory). Given all of this, it made sense to convert the udmabuf driver
to only use the head pages of a folio along with the offsets of tail pages.
>
> So, udmabuf's folios only save the folio struct, foliocount point
> the size of array. pages save page in folios, number offset given by
> create list, pagecount point the size of array.
>
> Even if we restore the pages structure, its memory usage should be
> smaller than the combined memory usage of offsets(8 bytes in 64bit
> machine)
> and udmabuf_folio structures(24 bytes in 64bit machine).
>
> By doing this, we can accept the overhead of the udmabuf_folio structure
> and the performance loss of traversing the list during unpinning.
Does your use-case involve frequent pinning/unpinning operations? Note
that this would be considered a "shortterm" pin, which is different from
the way the folios are currently pinned in the udmabuf driver, which is
considered a "longterm" pin.
However, one optimization I can think of, for memfds backed by shmem, is
to not use unpin_list completely. This way you can probably avoid creating
udmabuf_folio objects and having to traverse the list. But this would require
differentiating udmabufs backed by shmem vs hugetlb folios, which is not
great in my opinion and may not work if THP is enabled.
Thanks,
Vivek
>
> Signed-off-by: Huan Yang <link@...o.com>
> ---
> drivers/dma-buf/udmabuf.c | 167 ++++++++++++++------------------------
> 1 file changed, 61 insertions(+), 106 deletions(-)
>
> diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
> index 9737f063b6b3..442ed99d8b33 100644
> --- a/drivers/dma-buf/udmabuf.c
> +++ b/drivers/dma-buf/udmabuf.c
> @@ -25,17 +25,24 @@ module_param(size_limit_mb, int, 0644);
> MODULE_PARM_DESC(size_limit_mb, "Max size of a dmabuf, in megabytes.
> Default is 64.");
>
> struct udmabuf {
> + /**
> + * Each page used by udmabuf in the folio. When obtaining a page
> from a
> + * folio, it does not necessarily begin from the head page. This is
> + * determined by the offset of the memfd when udmabuf created.
> + */
> pgoff_t pagecount;
> + struct page **pages;
> +
> + /**
> + * Each folio in memfd, when a udmabuf is created, it is pinned to
> + * ensure that the folio is not moved or reclaimed.
> + * folio array used to unpin all when releasing.
> + */
> + pgoff_t foliocount;
> struct folio **folios;
> +
> struct sg_table *sg;
> struct miscdevice *device;
> - pgoff_t *offsets;
> - struct list_head unpin_list;
> -};
> -
> -struct udmabuf_folio {
> - struct folio *folio;
> - struct list_head list;
> };
>
> static int mmap_udmabuf(struct dma_buf *buf, struct vm_area_struct
> *vma)
> @@ -51,9 +58,7 @@ static int mmap_udmabuf(struct dma_buf *buf, struct
> vm_area_struct *vma)
>
> for (pgoff = vma->vm_pgoff, end = vma->vm_end, addr = vma-
> >vm_start;
> addr < end; pgoff++, addr += PAGE_SIZE) {
> - struct page *page =
> - folio_page(ubuf->folios[pgoff],
> - ubuf->offsets[pgoff] >> PAGE_SHIFT);
> + struct page *page = ubuf->pages[pgoff];
>
> ret = remap_pfn_range(vma, addr, page_to_pfn(page),
> PAGE_SIZE,
> vma->vm_page_prot);
> @@ -67,22 +72,11 @@ static int mmap_udmabuf(struct dma_buf *buf,
> struct vm_area_struct *vma)
> static int vmap_udmabuf(struct dma_buf *buf, struct iosys_map *map)
> {
> struct udmabuf *ubuf = buf->priv;
> - struct page **pages;
> void *vaddr;
> - pgoff_t pg;
>
> dma_resv_assert_held(buf->resv);
>
> - pages = kvmalloc_array(ubuf->pagecount, sizeof(*pages),
> GFP_KERNEL);
> - if (!pages)
> - return -ENOMEM;
> -
> - for (pg = 0; pg < ubuf->pagecount; pg++)
> - pages[pg] = folio_page(ubuf->folios[pg],
> - ubuf->offsets[pg] >> PAGE_SHIFT);
> -
> - vaddr = vm_map_ram(pages, ubuf->pagecount, -1);
> - kvfree(pages);
> + vaddr = vm_map_ram(ubuf->pages, ubuf->pagecount, -1);
> if (!vaddr)
> return -EINVAL;
>
> @@ -104,30 +98,25 @@ static struct sg_table *get_sg_table(struct device
> *dev, struct dma_buf *buf,
> {
> struct udmabuf *ubuf = buf->priv;
> struct sg_table *sg;
> - struct scatterlist *sgl;
> - unsigned int i = 0;
> int ret;
>
> sg = kzalloc(sizeof(*sg), GFP_KERNEL);
> if (!sg)
> return ERR_PTR(-ENOMEM);
>
> - ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
> + ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount,
> + 0, ubuf->pagecount << PAGE_SHIFT,
> + GFP_KERNEL);
> if (ret < 0)
> - goto err_alloc;
> -
> - for_each_sg(sg->sgl, sgl, ubuf->pagecount, i)
> - sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE,
> - ubuf->offsets[i]);
> + goto err;
>
> ret = dma_map_sgtable(dev, sg, direction, 0);
> if (ret < 0)
> - goto err_map;
> + goto err;
> return sg;
>
> -err_map:
> +err:
> sg_free_table(sg);
> -err_alloc:
> kfree(sg);
> return ERR_PTR(ret);
> }
> @@ -153,34 +142,6 @@ static void unmap_udmabuf(struct
> dma_buf_attachment *at,
> return put_sg_table(at->dev, sg, direction);
> }
>
> -static void unpin_all_folios(struct list_head *unpin_list)
> -{
> - struct udmabuf_folio *ubuf_folio;
> -
> - while (!list_empty(unpin_list)) {
> - ubuf_folio = list_first_entry(unpin_list,
> - struct udmabuf_folio, list);
> - unpin_folio(ubuf_folio->folio);
> -
> - list_del(&ubuf_folio->list);
> - kfree(ubuf_folio);
> - }
> -}
> -
> -static int add_to_unpin_list(struct list_head *unpin_list,
> - struct folio *folio)
> -{
> - struct udmabuf_folio *ubuf_folio;
> -
> - ubuf_folio = kzalloc(sizeof(*ubuf_folio), GFP_KERNEL);
> - if (!ubuf_folio)
> - return -ENOMEM;
> -
> - ubuf_folio->folio = folio;
> - list_add_tail(&ubuf_folio->list, unpin_list);
> - return 0;
> -}
> -
> static void release_udmabuf(struct dma_buf *buf)
> {
> struct udmabuf *ubuf = buf->priv;
> @@ -189,9 +150,9 @@ static void release_udmabuf(struct dma_buf *buf)
> if (ubuf->sg)
> put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL);
>
> - unpin_all_folios(&ubuf->unpin_list);
> - kvfree(ubuf->offsets);
> + unpin_folios(ubuf->folios, ubuf->foliocount);
> kvfree(ubuf->folios);
> + kvfree(ubuf->pages);
> kfree(ubuf);
> }
>
> @@ -289,19 +250,18 @@ static long udmabuf_create(struct miscdevice
> *device,
> struct udmabuf_create_list *head,
> struct udmabuf_create_item *list)
> {
> - pgoff_t pgoff, pgcnt, pglimit, pgbuf = 0;
> - long nr_folios, ret = -EINVAL;
> + pgoff_t pgoff, pgcnt, pglimit, nr_pages;
> + long nr_folios = 0, ret = -EINVAL;
> struct file *memfd = NULL;
> struct folio **folios;
> struct udmabuf *ubuf;
> - u32 i, j, k, flags;
> + u32 i, flags;
> loff_t end;
>
> ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL);
> if (!ubuf)
> return -ENOMEM;
>
> - INIT_LIST_HEAD(&ubuf->unpin_list);
> pglimit = (size_limit_mb * 1024 * 1024) >> PAGE_SHIFT;
> for (i = 0; i < head->count; i++) {
> if (!IS_ALIGNED(list[i].offset, PAGE_SIZE))
> @@ -322,64 +282,58 @@ static long udmabuf_create(struct miscdevice
> *device,
> ret = -ENOMEM;
> goto err;
> }
> - ubuf->offsets =
> - kvcalloc(ubuf->pagecount, sizeof(*ubuf->offsets),
> GFP_KERNEL);
> - if (!ubuf->offsets) {
> + folios = ubuf->folios;
> +
> + ubuf->pages = kvmalloc_array(ubuf->pagecount, sizeof(*ubuf-
> >pages),
> + GFP_KERNEL);
> + if (!ubuf->pages) {
> ret = -ENOMEM;
> goto err;
> }
>
> - pgbuf = 0;
> - for (i = 0; i < head->count; i++) {
> + for (i = 0, nr_pages = 0; i < head->count; i++) {
> + u32 j, pg;
> +
> memfd = fget(list[i].memfd);
> ret = check_memfd_seals(memfd);
> if (ret < 0)
> goto err;
>
> pgcnt = list[i].size >> PAGE_SHIFT;
> - folios = kvmalloc_array(pgcnt, sizeof(*folios), GFP_KERNEL);
> - if (!folios) {
> - ret = -ENOMEM;
> - goto err;
> - }
>
> end = list[i].offset + (pgcnt << PAGE_SHIFT) - 1;
> - ret = memfd_pin_folios(memfd, list[i].offset, end,
> - folios, pgcnt, &pgoff);
> + ret = memfd_pin_folios(memfd, list[i].offset, end, folios,
> + pgcnt, &pgoff);
> if (ret <= 0) {
> - kvfree(folios);
> - if (!ret)
> - ret = -EINVAL;
> + ret = ret ?: -EINVAL;
> goto err;
> }
>
> - nr_folios = ret;
> - pgoff >>= PAGE_SHIFT;
> - for (j = 0, k = 0; j < pgcnt; j++) {
> - ubuf->folios[pgbuf] = folios[k];
> - ubuf->offsets[pgbuf] = pgoff << PAGE_SHIFT;
> -
> - if (j == 0 || ubuf->folios[pgbuf-1] != folios[k]) {
> - ret = add_to_unpin_list(&ubuf->unpin_list,
> - folios[k]);
> - if (ret < 0) {
> - kfree(folios);
> - goto err;
> - }
> - }
> -
> - pgbuf++;
> - if (++pgoff == folio_nr_pages(folios[k])) {
> - pgoff = 0;
> - if (++k == nr_folios)
> - break;
> + /**
> + * Iter the pinned folios and record them for later unpin
> + * when releasing.
> + * memfd may start from any offset, so we need check it
> + * carefully at first.
> + */
> + for (j = 0, pgoff >>= PAGE_SHIFT, pg = 0; j < ret;
> + ++j, pgoff = 0) {
> + pgoff_t k;
> + struct folio *folio = folios[j];
> +
> + for (k = pgoff; k < folio_nr_pages(folio); ++k) {
> + ubuf->pages[nr_pages++] = folio_page(folio,
> k);
> +
> + if (++pg >= pgcnt)
> + goto end;
> }
> }
> -
> - kvfree(folios);
> +end:
> + folios += ret;
> + nr_folios += ret;
> fput(memfd);
> memfd = NULL;
> }
> + ubuf->foliocount = nr_folios;
>
> flags = head->flags & UDMABUF_FLAGS_CLOEXEC ? O_CLOEXEC : 0;
> ret = export_udmabuf(ubuf, device, flags);
> @@ -391,8 +345,9 @@ static long udmabuf_create(struct miscdevice
> *device,
> err:
> if (memfd)
> fput(memfd);
> - unpin_all_folios(&ubuf->unpin_list);
> - kvfree(ubuf->offsets);
> + if (nr_folios)
> + unpin_folios(ubuf->folios, nr_folios);
> + kvfree(ubuf->pages);
> kvfree(ubuf->folios);
> kfree(ubuf);
> return ret;
> --
> 2.45.2
Powered by blists - more mailing lists