lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <507890da-4fa8-4936-b856-f90d75b5ddfa@gmail.com>
Date: Sun, 18 Aug 2024 22:17:30 +0800
From: Yunsheng Lin <yunshenglin0825@...il.com>
To: Alexander H Duyck <alexander.duyck@...il.com>,
 Yunsheng Lin <linyunsheng@...wei.com>, davem@...emloft.net, kuba@...nel.org,
 pabeni@...hat.com
Cc: netdev@...r.kernel.org, linux-kernel@...r.kernel.org,
 Mat Martineau <martineau@...nel.org>, Ayush Sawal <ayush.sawal@...lsio.com>,
 Eric Dumazet <edumazet@...gle.com>,
 Willem de Bruijn <willemdebruijn.kernel@...il.com>,
 Jason Wang <jasowang@...hat.com>, Ingo Molnar <mingo@...hat.com>,
 Peter Zijlstra <peterz@...radead.org>, Juri Lelli <juri.lelli@...hat.com>,
 Vincent Guittot <vincent.guittot@...aro.org>,
 Dietmar Eggemann <dietmar.eggemann@....com>,
 Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
 Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
 John Fastabend <john.fastabend@...il.com>,
 Jakub Sitnicki <jakub@...udflare.com>, David Ahern <dsahern@...nel.org>,
 Matthieu Baerts <matttbe@...nel.org>, Geliang Tang <geliang@...nel.org>,
 Jamal Hadi Salim <jhs@...atatu.com>, Cong Wang <xiyou.wangcong@...il.com>,
 Jiri Pirko <jiri@...nulli.us>, Boris Pismenny <borisp@...dia.com>,
 bpf@...r.kernel.org, mptcp@...ts.linux.dev
Subject: Re: [PATCH net-next v13 12/14] net: replace page_frag with
 page_frag_cache

On 8/15/2024 6:01 AM, Alexander H Duyck wrote:
> On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
>> Use the newly introduced prepare/probe/commit API to
>> replace page_frag with page_frag_cache for sk_page_frag().
>>
>> CC: Alexander Duyck <alexander.duyck@...il.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@...wei.com>
>> Acked-by: Mat Martineau <martineau@...nel.org>
>> ---
>>   .../chelsio/inline_crypto/chtls/chtls.h       |   3 -
>>   .../chelsio/inline_crypto/chtls/chtls_io.c    | 100 ++++---------
>>   .../chelsio/inline_crypto/chtls/chtls_main.c  |   3 -
>>   drivers/net/tun.c                             |  48 +++---
>>   include/linux/sched.h                         |   2 +-
>>   include/net/sock.h                            |  14 +-
>>   kernel/exit.c                                 |   3 +-
>>   kernel/fork.c                                 |   3 +-
>>   net/core/skbuff.c                             |  59 +++++---
>>   net/core/skmsg.c                              |  22 +--
>>   net/core/sock.c                               |  46 ++++--
>>   net/ipv4/ip_output.c                          |  33 +++--
>>   net/ipv4/tcp.c                                |  32 ++--
>>   net/ipv4/tcp_output.c                         |  28 ++--
>>   net/ipv6/ip6_output.c                         |  33 +++--
>>   net/kcm/kcmsock.c                             |  27 ++--
>>   net/mptcp/protocol.c                          |  67 +++++----
>>   net/sched/em_meta.c                           |   2 +-
>>   net/tls/tls_device.c                          | 137 ++++++++++--------
>>   19 files changed, 347 insertions(+), 315 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h
>> index 7ff82b6778ba..fe2b6a8ef718 100644
>> --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h
>> +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h
>> @@ -234,7 +234,6 @@ struct chtls_dev {
>>   	struct list_head list_node;
>>   	struct list_head rcu_node;
>>   	struct list_head na_node;
>> -	unsigned int send_page_order;
>>   	int max_host_sndbuf;
>>   	u32 round_robin_cnt;
>>   	struct key_map kmap;
>> @@ -453,8 +452,6 @@ enum {
>>   
>>   /* The ULP mode/submode of an skbuff */
>>   #define skb_ulp_mode(skb)  (ULP_SKB_CB(skb)->ulp_mode)
>> -#define TCP_PAGE(sk)   (sk->sk_frag.page)
>> -#define TCP_OFF(sk)    (sk->sk_frag.offset)
>>   
>>   static inline struct chtls_dev *to_chtls_dev(struct tls_toe_device *tlsdev)
>>   {
>> diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
>> index d567e42e1760..334381c1587f 100644
>> --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
>> +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
>> @@ -825,12 +825,6 @@ void skb_entail(struct sock *sk, struct sk_buff *skb, int flags)
>>   	ULP_SKB_CB(skb)->flags = flags;
>>   	__skb_queue_tail(&csk->txq, skb);
>>   	sk->sk_wmem_queued += skb->truesize;
>> -
>> -	if (TCP_PAGE(sk) && TCP_OFF(sk)) {
>> -		put_page(TCP_PAGE(sk));
>> -		TCP_PAGE(sk) = NULL;
>> -		TCP_OFF(sk) = 0;
>> -	}
>>   }
>>   
>>   static struct sk_buff *get_tx_skb(struct sock *sk, int size)
>> @@ -882,16 +876,12 @@ static void push_frames_if_head(struct sock *sk)
>>   		chtls_push_frames(csk, 1);
>>   }
>>   
>> -static int chtls_skb_copy_to_page_nocache(struct sock *sk,
>> -					  struct iov_iter *from,
>> -					  struct sk_buff *skb,
>> -					  struct page *page,
>> -					  int off, int copy)
>> +static int chtls_skb_copy_to_va_nocache(struct sock *sk, struct iov_iter *from,
>> +					struct sk_buff *skb, char *va, int copy)
>>   {
>>   	int err;
>>   
>> -	err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) +
>> -				       off, copy, skb->len);
>> +	err = skb_do_copy_data_nocache(sk, skb, from, va, copy, skb->len);
>>   	if (err)
>>   		return err;
>>   
>> @@ -1114,82 +1104,44 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
>>   			if (err)
>>   				goto do_fault;
>>   		} else {
>> +			struct page_frag_cache *pfrag = &sk->sk_frag;
> 
> Is this even valid? Shouldn't it be using sk_page_frag to get the

chtls_sendmsg() only use sk->sk_frag, see below.

> reference here? Seems like it might be trying to instantiate an unused
> cache.

I am not sure if I understand what you meant by "trying to instantiate
an unused cache". sk->sk_frag is supposed to be instantiated in
sock_init_data_uid() by calling page_frag_cache_init() in this patch.

> 
> As per my earlier suggestion this could be made very simple if we are
> just pulling a bio_vec out from the page cache at the start. With that
> we could essentially plug it into the TCP_PAGE/TCP_OFF block here and
> most of it would just function the same.

I am not sure if we had the same common understanding as why chtls had
more changing than other places when replacing page_frag with
page_frag_cache.

chtls_sendmsg() was duplicating its own implementation of page_frag
refilling instead of using skb_page_frag_refill(), we can remove that
implementation by using the new API, that is why there is more changing
for chtls than other places. Are you suggesting to keep chtls's own
implementation of page_frag refilling by 'plug it into the TCP_PAGE/
TCP_OFF block' here?

> 
>>   			int i = skb_shinfo(skb)->nr_frags;
>> -			struct page *page = TCP_PAGE(sk);

TCP_PAGE macro is defined as below, that is why sk->sk_frag is used
instead of sk_page_frag() for chtls case if I understand your question
correctly:

#define TCP_PAGE(sk)   (sk->sk_frag.page)
#define TCP_OFF(sk)    (sk->sk_frag.offset)

>> -			int pg_size = PAGE_SIZE;
>> -			int off = TCP_OFF(sk);
>> -			bool merge;
>> -
>> -			if (page)
>> -				pg_size = page_size(page);
>> -			if (off < pg_size &&
>> -			    skb_can_coalesce(skb, i, page, off)) {
>> +			unsigned int offset, fragsz;
>> +			bool merge = false;
>> +			struct page *page;
>> +			void *va;
>> +
>> +			fragsz = 32U;
>> +			page = page_frag_alloc_prepare(pfrag, &offset, &fragsz,
>> +						       &va, sk->sk_allocation);
>> +			if (unlikely(!page))
>> +				goto wait_for_memory;
>> +
>> +			if (skb_can_coalesce(skb, i, page, offset))
>>   				merge = true;
>> -				goto copy;
>> -			}
>> -			merge = false;
>> -			if (i == (is_tls_tx(csk) ? (MAX_SKB_FRAGS - 1) :
>> -			    MAX_SKB_FRAGS))
>> +			else if (i == (is_tls_tx(csk) ? (MAX_SKB_FRAGS - 1) :
>> +				       MAX_SKB_FRAGS))
>>   				goto new_buf;
>>   
>> -			if (page && off == pg_size) {
>> -				put_page(page);
>> -				TCP_PAGE(sk) = page = NULL;
>> -				pg_size = PAGE_SIZE;
>> -			}
>> -
>> -			if (!page) {
>> -				gfp_t gfp = sk->sk_allocation;
>> -				int order = cdev->send_page_order;
>> -
>> -				if (order) {
>> -					page = alloc_pages(gfp | __GFP_COMP |
>> -							   __GFP_NOWARN |
>> -							   __GFP_NORETRY,
>> -							   order);
>> -					if (page)
>> -						pg_size <<= order;
>> -				}
>> -				if (!page) {
>> -					page = alloc_page(gfp);
>> -					pg_size = PAGE_SIZE;
>> -				}
>> -				if (!page)
>> -					goto wait_for_memory;
>> -				off = 0;
>> -			}
>> -copy:
>> -			if (copy > pg_size - off)
>> -				copy = pg_size - off;
>> +			copy = min_t(int, copy, fragsz);
>>   			if (is_tls_tx(csk))
>>   				copy = min_t(int, copy, csk->tlshws.txleft);
>>   
>> -			err = chtls_skb_copy_to_page_nocache(sk, &msg->msg_iter,
>> -							     skb, page,
>> -							     off, copy);
>> -			if (unlikely(err)) {
>> -				if (!TCP_PAGE(sk)) {
>> -					TCP_PAGE(sk) = page;
>> -					TCP_OFF(sk) = 0;
>> -				}
>> +			err = chtls_skb_copy_to_va_nocache(sk, &msg->msg_iter,
>> +							   skb, va, copy);
>> +			if (unlikely(err))
>>   				goto do_fault;
>> -			}
>> +
>>   			/* Update the skb. */
>>   			if (merge) {
>>   				skb_frag_size_add(
>>   						&skb_shinfo(skb)->frags[i - 1],
>>   						copy);
>> +				page_frag_alloc_commit_noref(pfrag, copy);
>>   			} else {
>> -				skb_fill_page_desc(skb, i, page, off, copy);
>> -				if (off + copy < pg_size) {
>> -					/* space left keep page */
>> -					get_page(page);
>> -					TCP_PAGE(sk) = page;
>> -				} else {
>> -					TCP_PAGE(sk) = NULL;
>> -				}
>> +				skb_fill_page_desc(skb, i, page, offset, copy);
>> +				page_frag_alloc_commit(pfrag, copy);
>>   			}
>> -			TCP_OFF(sk) = off + copy;
>>   		}
>>   		if (unlikely(skb->len == mss))
>>   			tx_skb_finalize(skb);
> 
> Really there is so much refactor here it is hard to tell what is what.
> I would suggest just trying to plug in an intermediary value and you
> can save the refactor for later.

I am not sure if your above suggestion works, if it does work, I am not
sure if it is that simple enough to just plug in an intermediary value
as the the fields in 'struct page_frag_cache' is much different from the
fields in 'struct page_frag' as below when replacing page_frag with 
page_frag_cache for sk->sk_frag:

struct page_frag_cache {
	unsigned long encoded_va;

+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
  	__u16 remaining;
	__u16 pagecnt_bias;
  #else
  	__u32 remaining;
	__u32 pagecnt_bias;
  #endif
};

struct page_frag {
	struct page *page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
	__u32 offset;
	__u32 size;
#else
	__u16 offset;
	__u16 size;
#endif
};


> 
>> diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c
>> index 455a54708be4..ba88b2fc7cd8 100644
>> --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c
>> +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c
>> @@ -34,7 +34,6 @@ static DEFINE_MUTEX(notify_mutex);
>>   static RAW_NOTIFIER_HEAD(listen_notify_list);
>>   static struct proto chtls_cpl_prot, chtls_cpl_protv6;
>>   struct request_sock_ops chtls_rsk_ops, chtls_rsk_opsv6;
>> -static uint send_page_order = (14 - PAGE_SHIFT < 0) ? 0 : 14 - PAGE_SHIFT;
>>   
>>   static void register_listen_notifier(struct notifier_block *nb)
>>   {
>> @@ -273,8 +272,6 @@ static void *chtls_uld_add(const struct cxgb4_lld_info *info)
>>   	INIT_WORK(&cdev->deferq_task, process_deferq);
>>   	spin_lock_init(&cdev->listen_lock);
>>   	spin_lock_init(&cdev->idr_lock);
>> -	cdev->send_page_order = min_t(uint, get_order(32768),
>> -				      send_page_order);
>>   	cdev->max_host_sndbuf = 48 * 1024;
>>   
>>   	if (lldi->vr->key.size)
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 1d06c560c5e6..51df92fd60db 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -1598,21 +1598,19 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
>>   }
>>   
>>   static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
>> -				       struct page_frag *alloc_frag, char *buf,
>> -				       int buflen, int len, int pad)
>> +				       char *buf, int buflen, int len, int pad)
>>   {
>>   	struct sk_buff *skb = build_skb(buf, buflen);
>>   
>> -	if (!skb)
>> +	if (!skb) {
>> +		page_frag_free_va(buf);
>>   		return ERR_PTR(-ENOMEM);
>> +	}
>>   
>>   	skb_reserve(skb, pad);
>>   	skb_put(skb, len);
>>   	skb_set_owner_w(skb, tfile->socket.sk);
>>   
>> -	get_page(alloc_frag->page);
>> -	alloc_frag->offset += buflen;
>> -
> 
> Rather than freeing the buf it would be better if you were to just
> stick to the existing pattern and commit the alloc_frag at the end here
> instead of calling get_page.
> 
>>   	return skb;
>>   }
>>   
>> @@ -1660,7 +1658,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   				     struct virtio_net_hdr *hdr,
>>   				     int len, int *skb_xdp)
>>   {
>> -	struct page_frag *alloc_frag = &current->task_frag;
>> +	struct page_frag_cache *alloc_frag = &current->task_frag;
>>   	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
>>   	struct bpf_prog *xdp_prog;
>>   	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> @@ -1676,16 +1674,16 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	buflen += SKB_DATA_ALIGN(len + pad);
>>   	rcu_read_unlock();
>>   
>> -	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
>> -	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
>> +	buf = page_frag_alloc_va_align(alloc_frag, buflen, GFP_KERNEL,
>> +				       SMP_CACHE_BYTES);
>> +	if (unlikely(!buf))
>>   		return ERR_PTR(-ENOMEM);
>>   
>> -	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>> -	copied = copy_page_from_iter(alloc_frag->page,
>> -				     alloc_frag->offset + pad,
>> -				     len, from);
>> -	if (copied != len)
>> +	copied = copy_from_iter(buf + pad, len, from);
>> +	if (copied != len) {
>> +		page_frag_alloc_abort(alloc_frag, buflen);
>>   		return ERR_PTR(-EFAULT);
>> +	}
>>   
>>   	/* There's a small window that XDP may be set after the check
>>   	 * of xdp_prog above, this should be rare and for simplicity
>> @@ -1693,8 +1691,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	 */
>>   	if (hdr->gso_type || !xdp_prog) {
>>   		*skb_xdp = 1;
>> -		return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
>> -				       pad);
>> +		return __tun_build_skb(tfile, buf, buflen, len, pad);
>>   	}
>>   
>>   	*skb_xdp = 0;
>> @@ -1711,21 +1708,16 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   		xdp_prepare_buff(&xdp, buf, pad, len, false);
>>   
>>   		act = bpf_prog_run_xdp(xdp_prog, &xdp);
>> -		if (act == XDP_REDIRECT || act == XDP_TX) {
>> -			get_page(alloc_frag->page);
>> -			alloc_frag->offset += buflen;

the above is only executed for XDP_REDIRECT and XDP_TX cases.

>> -		}
>>   		err = tun_xdp_act(tun, xdp_prog, &xdp, act);
>> -		if (err < 0) {
>> -			if (act == XDP_REDIRECT || act == XDP_TX)
>> -				put_page(alloc_frag->page);

And there is a put_page() immediately when xdp_do_redirect() or
tun_xdp_tx() fails in tun_xdp_act(), so there is something else
might have taken a reference to the page and modified it in some way
even when tun_xdp_act() return error? Would you be more specific
about why above happens?

>> -			goto out;
>> -		}
>> -
>>   		if (err == XDP_REDIRECT)
>>   			xdp_do_flush();
>> -		if (err != XDP_PASS)
>> +
>> +		if (err == XDP_REDIRECT || err == XDP_TX) {
>> +			goto out;
>> +		} else if (err < 0 || err != XDP_PASS) {
>> +			page_frag_alloc_abort(alloc_frag, buflen);
>>   			goto out;
>> +		}
>>   
> 
> Your abort function here is not necessarily safe. It is assuming that
> nothing else might have taken a reference to the page or modified it in
> some way. Generally we shouldn't allow rewinding the pointer until we
> check the page count to guarantee nobody else is now working with a
> copy of the page.
> 
>>   		pad = xdp.data - xdp.data_hard_start;
>>   		len = xdp.data_end - xdp.data;
>> @@ -1734,7 +1726,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	rcu_read_unlock();
>>   	local_bh_enable();
>>   
>> -	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
>> +	return __tun_build_skb(tfile, buf, buflen, len, pad);
>>   
>>   out:
>>   	bpf_net_ctx_clear(bpf_net_ctx);
>

...


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ