[<prev] [next>] [day] [month] [year] [list]
Date: Wed, 6 Jun 2018 15:52:04 -0700
From: "Nambiar, Amritha" <amritha.nambiar@...el.com>
To: Tom Herbert <tom@...bertland.com>
Cc: Linux Kernel Network Developers <netdev@...r.kernel.org>,
"David S. Miller" <davem@...emloft.net>,
Alexander Duyck <alexander.h.duyck@...el.com>,
Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Sridhar Samudrala <sridhar.samudrala@...el.com>,
Eric Dumazet <edumazet@...gle.com>,
Hannes Frederic Sowa <hannes@...essinduktion.org>
Subject: Re: [net-next PATCH v3 3/5] net: Enable Tx queue selection based on
Rx queues
On 6/5/2018 10:57 AM, Tom Herbert wrote:
>
>
> On Tue, Jun 5, 2018 at 1:38 AM, Amritha Nambiar
> <amritha.nambiar@...el.com <mailto:amritha.nambiar@...el.com>> wrote:
>
> This patch adds support to pick Tx queue based on the Rx queue(s) map
> configuration set by the admin through the sysfs attribute
> for each Tx queue. If the user configuration for receive queue(s) map
> does not apply, then the Tx queue selection falls back to CPU(s) map
> based selection and finally to hashing.
>
> Signed-off-by: Amritha Nambiar <amritha.nambiar@...el.com
> <mailto:amritha.nambiar@...el.com>>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@...el.com
> <mailto:sridhar.samudrala@...el.com>>
> ---
> include/net/busy_poll.h | 3 ++
> include/net/sock.h | 14 +++++++++++
> net/core/dev.c | 60
> ++++++++++++++++++++++++++++++++---------------
> net/core/sock.c | 4 +++
> net/ipv4/tcp_input.c | 3 ++
> 5 files changed, 65 insertions(+), 19 deletions(-)
>
> diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
> index 71c72a9..fc4fb68 100644
> --- a/include/net/busy_poll.h
> +++ b/include/net/busy_poll.h
> @@ -136,6 +136,9 @@ static inline void sk_mark_napi_id(struct sock
> *sk, const struct sk_buff *skb)
> #ifdef CONFIG_NET_RX_BUSY_POLL
> sk->sk_napi_id = skb->napi_id;
> #endif
> +#ifdef CONFIG_XPS
> + sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
> +#endif
> }
>
> /* variant used for unconnected sockets */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 4f7c584..12313653 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair;
> * @skc_node: main hash linkage for various protocol lookup tables
> * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
> * @skc_tx_queue_mapping: tx queue number for this connection
> + * @skc_rx_queue_mapping: rx queue number for this connection
> * @skc_flags: place holder for sk_flags
> * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
> * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
> @@ -215,6 +216,9 @@ struct sock_common {
> struct hlist_nulls_node skc_nulls_node;
> };
> int skc_tx_queue_mapping;
> +#ifdef CONFIG_XPS
> + int skc_rx_queue_mapping;
>
>
> This is still an expensive cost: adding an int field to sock_common
> for a relatively rare use case. Maybe there should be a CONFIG_XPS_RQS?
> Or maybe skc_tx_queue_mapping and skc_rx_queue_mapping could be shorts
> (so the maximum queue mapping would then be 2^16-2).
Thanks for the review, Tom. I will fix up the code, incorporating all
your feedback, in the next version (v4). I could add a new config option,
CONFIG_XPS_RXQS, that would be off by default, in addition to the CONFIG_XPS
option that's already there. As for changing 'skc_tx_queue_mapping' to
short, my concern is that the change would become extensive, since there are
a lot of places where this gets filled with int or u32 values.
>
> +#endif
> union {
> int skc_incoming_cpu;
> u32 skc_rcv_wnd;
> @@ -326,6 +330,9 @@ struct sock {
> #define sk_nulls_node __sk_common.skc_nulls_node
> #define sk_refcnt __sk_common.skc_refcnt
> #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
> +#ifdef CONFIG_XPS
> +#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping
> +#endif
>
> #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
> #define sk_dontcopy_end __sk_common.skc_dontcopy_end
> @@ -1696,6 +1703,13 @@ static inline int sk_tx_queue_get(const
> struct sock *sk)
> return sk ? sk->sk_tx_queue_mapping : -1;
> }
>
> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff
> *skb)
> +{
> +#ifdef CONFIG_XPS
> + sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
> +#endif
> +}
> +
> static inline void sk_set_socket(struct sock *sk, struct socket *sock)
> {
> sk_tx_queue_clear(sk);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index bba755f..1880e6c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3479,36 +3479,58 @@ sch_handle_egress(struct sk_buff *skb, int
> *ret, struct net_device *dev)
> }
> #endif /* CONFIG_NET_EGRESS */
>
> -static inline int get_xps_queue(struct net_device *dev, struct
> sk_buff *skb)
> +#ifdef CONFIG_XPS
> +static int __get_xps_queue_idx(struct net_device *dev, struct
> sk_buff *skb,
> + struct xps_dev_maps *dev_maps,
> unsigned int tci)
> +{
> + struct xps_map *map;
> + int queue_index = -1;
> +
> + if (dev->num_tc) {
> + tci *= dev->num_tc;
> + tci += netdev_get_prio_tc_map(dev, skb->priority);
> + }
> +
> + map = rcu_dereference(dev_maps->attr_map[tci]);
> + if (map) {
> + if (map->len == 1)
> + queue_index = map->queues[0];
> + else
> + queue_index = map->queues[reciprocal_scale(
> + skb_get_hash(skb),
> map->len)];
> + if (unlikely(queue_index >= dev->real_num_tx_queues))
> + queue_index = -1;
> + }
> + return queue_index;
> +}
> +#endif
> +
> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
> {
> #ifdef CONFIG_XPS
> struct xps_dev_maps *dev_maps;
> - struct xps_map *map;
> + struct sock *sk = skb->sk;
> int queue_index = -1;
> + unsigned int tci = 0;
>
> if (!static_key_false(&xps_needed))
> return -1;
>
> + if (sk && sk->sk_rx_queue_mapping <= dev->num_rx_queues)
> + tci = sk->sk_rx_queue_mapping;
>
>
> This is only needed if xps_rxqs_map is not null, so it should be in
> the block below.
>
>
> +
> rcu_read_lock();
> - dev_maps = rcu_dereference(dev->xps_cpus_map);
> - if (dev_maps) {
> - unsigned int tci = skb->sender_cpu - 1;
> + dev_maps = rcu_dereference(dev->xps_rxqs_map);
> + if (dev_maps)
> + queue_index = __get_xps_queue_idx(dev, skb,
> dev_maps, tci);
>
> - if (dev->num_tc) {
> - tci *= dev->num_tc;
> - tci += netdev_get_prio_tc_map(dev,
> skb->priority);
> - }
>
> - map = rcu_dereference(dev_maps->attr_map[tci]);
> - if (map) {
> - if (map->len == 1)
> - queue_index = map->queues[0];
> - else
> - queue_index =
> map->queues[reciprocal_scale(skb_get_hash(skb),
> -
> map->len)];
> - if (unlikely(queue_index >=
> dev->real_num_tx_queues))
> - queue_index = -1;
> - }
> + if (queue_index < 0) {
> + tci = skb->sender_cpu - 1;
> + dev_maps = rcu_dereference(dev->xps_cpus_map);
> + if (dev_maps)
> + queue_index = __get_xps_queue_idx(dev, skb,
> dev_maps,
> + tci);
> }
> rcu_read_unlock();
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 435a0ba..3c10d31 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2824,6 +2824,10 @@ void sock_init_data(struct socket *sock,
> struct sock *sk)
> sk->sk_pacing_rate = ~0U;
> sk->sk_pacing_shift = 10;
> sk->sk_incoming_cpu = -1;
> +
> +#ifdef CONFIG_XPS
> + sk->sk_rx_queue_mapping = -1;
> +#endif
> /*
> * Before updating sk_refcnt, we must commit prior changes
> to memory
> * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index d5ffb57..cc69f75 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -78,6 +78,7 @@
> #include <linux/errqueue.h>
> #include <trace/events/tcp.h>
> #include <linux/static_key.h>
> +#include <net/busy_poll.h>
>
> int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
>
> @@ -5574,6 +5575,7 @@ void tcp_finish_connect(struct sock *sk,
> struct sk_buff *skb)
> if (skb) {
> icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
> security_inet_conn_established(sk, skb);
> + sk_mark_napi_id(sk, skb);
> }
>
> tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
> @@ -6402,6 +6404,7 @@ int tcp_conn_request(struct request_sock_ops
> *rsk_ops,
> tcp_rsk(req)->snt_isn = isn;
> tcp_rsk(req)->txhash = net_tx_rndhash();
> tcp_openreq_init_rwin(req, sk, dst);
> + sk_mark_rx_queue(req_to_sk(req), skb);
> if (!want_cookie) {
> tcp_reqsk_record_syn(sk, req, skb);
> fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
>
>
Powered by blists - more mailing lists