[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKgT0Uc_-GVJgg3q6N4orJufW1iU9VZ5ZMaGo=SNUeCZr+k7HA@mail.gmail.com>
Date: Mon, 11 Sep 2017 10:48:55 -0700
From: Alexander Duyck <alexander.duyck@...il.com>
To: "Samudrala, Sridhar" <sridhar.samudrala@...el.com>
Cc: "Duyck, Alexander H" <alexander.h.duyck@...el.com>,
Netdev <netdev@...r.kernel.org>
Subject: Re: [RFC PATCH] net: Introduce a socket option to enable picking tx
queue based on rx queue.
On Mon, Sep 11, 2017 at 9:48 AM, Samudrala, Sridhar
<sridhar.samudrala@...el.com> wrote:
>
>
> On 9/9/2017 6:28 PM, Alexander Duyck wrote:
>>
>> On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
>> <sridhar.samudrala@...el.com> wrote:
>>>
>>> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be
>>> used
>>> to enable symmetric tx and rx queues on a socket.
>>>
>>> This option is specifically useful for epoll based multi threaded
>>> workloads
>>> where each thread handles packets received on a single RX queue . In this
>>> model,
>>> we have noticed that it helps to send the packets on the same TX queue
>>> corresponding to the queue-pair associated with the RX queue specifically
>>> when
>>> busy poll is enabled with epoll().
>>>
>>> Two new fields are added to struct sock_common to cache the last rx
>>> ifindex and
>>> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns
>>> the cached
>>> rx queue when this option is enabled and the TX is happening on the same
>>> device.
>>>
>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@...el.com>
>>> ---
>>> include/net/request_sock.h | 1 +
>>> include/net/sock.h | 17 +++++++++++++++++
>>> include/uapi/asm-generic/socket.h | 2 ++
>>> net/core/dev.c | 8 +++++++-
>>> net/core/sock.c | 10 ++++++++++
>>> net/ipv4/tcp_input.c | 1 +
>>> net/ipv4/tcp_ipv4.c | 1 +
>>> net/ipv4/tcp_minisocks.c | 1 +
>>> 8 files changed, 40 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
>>> index 23e2205..c3bc12e 100644
>>> --- a/include/net/request_sock.h
>>> +++ b/include/net/request_sock.h
>>> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct
>>> request_sock *req)
>>> req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>>> sk_node_init(&req_to_sk(req)->sk_node);
>>> sk_tx_queue_clear(req_to_sk(req));
>>> + req_to_sk(req)->sk_symmetric_queues =
>>> sk_listener->sk_symmetric_queues;
>>> req->saved_syn = NULL;
>>> refcount_set(&req->rsk_refcnt, 0);
>>>
>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>> index 03a3625..3421809 100644
>>> --- a/include/net/sock.h
>>> +++ b/include/net/sock.h
>>> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char
>>> *msg, ...)
>>> * @skc_node: main hash linkage for various protocol lookup tables
>>> * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>>> * @skc_tx_queue_mapping: tx queue number for this connection
>>> + * @skc_rx_queue_mapping: rx queue number for this connection
>>> + * @skc_rx_ifindex: rx ifindex for this connection
>>> * @skc_flags: place holder for sk_flags
>>> * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>>> * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>>> * @skc_incoming_cpu: record/match cpu processing incoming packets
>>> * @skc_refcnt: reference count
>>> + * @skc_symmetric_queues: symmetric tx/rx queues
>>> *
>>> * This is the minimal network layer representation of sockets, the
>>> header
>>> * for struct sock and struct inet_timewait_sock.
>>> @@ -177,6 +180,7 @@ struct sock_common {
>>> unsigned char skc_reuseport:1;
>>> unsigned char skc_ipv6only:1;
>>> unsigned char skc_net_refcnt:1;
>>> + unsigned char skc_symmetric_queues:1;
>>> int skc_bound_dev_if;
>>> union {
>>> struct hlist_node skc_bind_node;
>>> @@ -214,6 +218,8 @@ struct sock_common {
>>> struct hlist_nulls_node skc_nulls_node;
>>> };
>>> int skc_tx_queue_mapping;
>>> + int skc_rx_queue_mapping;
>>> + int skc_rx_ifindex;
>>> union {
>>> int skc_incoming_cpu;
>>> u32 skc_rcv_wnd;
>>> @@ -324,6 +330,8 @@ struct sock {
>>> #define sk_nulls_node __sk_common.skc_nulls_node
>>> #define sk_refcnt __sk_common.skc_refcnt
>>> #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
>>> +#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping
>>> +#define sk_rx_ifindex __sk_common.skc_rx_ifindex
>>>
>>> #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
>>> #define sk_dontcopy_end __sk_common.skc_dontcopy_end
>>> @@ -340,6 +348,7 @@ struct sock {
>>> #define sk_reuseport __sk_common.skc_reuseport
>>> #define sk_ipv6only __sk_common.skc_ipv6only
>>> #define sk_net_refcnt __sk_common.skc_net_refcnt
>>> +#define sk_symmetric_queues __sk_common.skc_symmetric_queues
>>> #define sk_bound_dev_if __sk_common.skc_bound_dev_if
>>> #define sk_bind_node __sk_common.skc_bind_node
>>> #define sk_prot __sk_common.skc_prot
>>> @@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct
>>> sock *sk)
>>> return sk ? sk->sk_tx_queue_mapping : -1;
>>> }
>>>
>>> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff
>>> *skb)
>>> +{
>>> + if (sk->sk_symmetric_queues) {
>>> + sk->sk_rx_ifindex = skb->skb_iif;
>>> + sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>>> + }
>>> +}
>>> +
>>> static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>> {
>>> sk_tx_queue_clear(sk);
>>> diff --git a/include/uapi/asm-generic/socket.h
>>> b/include/uapi/asm-generic/socket.h
>>> index e47c9e4..f6b416e 100644
>>> --- a/include/uapi/asm-generic/socket.h
>>> +++ b/include/uapi/asm-generic/socket.h
>>> @@ -106,4 +106,6 @@
>>>
>>> #define SO_ZEROCOPY 60
>>>
>>> +#define SO_SYMMETRIC_QUEUES 61
>>> +
>>> #endif /* __ASM_GENERIC_SOCKET_H */
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index 270b547..d96cda8 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device
>>> *dev, struct sk_buff *skb)
>>>
>>> if (queue_index < 0 || skb->ooo_okay ||
>>> queue_index >= dev->real_num_tx_queues) {
>>> - int new_index = get_xps_queue(dev, skb);
>>> + int new_index = -1;
>>> +
>>> + if (sk && sk->sk_symmetric_queues && dev->ifindex ==
>>> sk->sk_rx_ifindex)
>>> + new_index = sk->sk_rx_queue_mapping;
>>> +
>>> + if (new_index < 0 || new_index >=
>>> dev->real_num_tx_queues)
>>> + new_index = get_xps_queue(dev, skb);
>>>
>>> if (new_index < 0)
>>> new_index = skb_tx_hash(dev, skb);
>>
>> So one thing I am not sure about is if we should be overriding XPS. It
>> might make sense to instead place this after XPS so that if the root
>> user configures it then it applies, otherwise if the socket is
>> requesting symmetric queues you could fall back to that, and then
>> finally just use hashing as the final solution for distributing the
>> workload.
>
> Isn't XPS on by default and all the devices that support XPS setup XPS maps
> as part of
> the initialization?
> Are you suggesting that the root user needs to disable XPS on the specific
> queues before an
> application can use this option to enable symmetric queues?
>
XPS and this symmetric queue logic won't really play well together
since they are both attempting to do the same thing but from different
ends. Some sort of priority needs to be defined, and I would place the
request of the kernel/root user above the request of a socket and/or
application.
I guess it comes down to if we let the Tx CPU pick the queue or Rx
queue, and I would argue with an XPS configuration in place the kernel
is requesting that the Tx CPU sets the Tx queue, not the Rx/Tx hash or
Rx queue. This is similar to how we handle this for routing today as
XPS will also override the Rx to Tx mapping used there.
>>
>> That way if somebody decides to reserve queues for some sort of
>> specific traffic like AF_PACKET then they can configure the Tx via
>> XPS, configure the Rx via RSS redirection table reprogramming, and
>> then setup a filters on the hardware to direct the traffic they want
>> to the queues that are running AF_PACKET.
>>
>>> diff --git a/net/core/sock.c b/net/core/sock.c
>>> index 9b7b6bb..3876cce 100644
>>> --- a/net/core/sock.c
>>> +++ b/net/core/sock.c
>>> @@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int
>>> level, int optname,
>>> sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
>>> break;
>>>
>>> + case SO_SYMMETRIC_QUEUES:
>>> + sk->sk_symmetric_queues = valbool;
>>> + break;
>>> +
>>> default:
>>> ret = -ENOPROTOOPT;
>>> break;
>>> @@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int
>>> level, int optname,
>>> v.val = sock_flag(sk, SOCK_ZEROCOPY);
>>> break;
>>>
>>> + case SO_SYMMETRIC_QUEUES:
>>> + v.val = sk->sk_symmetric_queues;
>>> + break;
>>> +
>>> default:
>>> /* We implement the SO_SNDLOWAT etc to not be settable
>>> * (1003.1g 7).
>>> @@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct
>>> sock *sk)
>>> sk->sk_max_pacing_rate = ~0U;
>>> sk->sk_pacing_rate = ~0U;
>>> sk->sk_incoming_cpu = -1;
>>> + sk->sk_rx_ifindex = -1;
>>> + sk->sk_rx_queue_mapping = -1;
>>> /*
>>> * Before updating sk_refcnt, we must commit prior changes to
>>> memory
>>> * (Documentation/RCU/rculist_nulls.txt for details)
>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>> index c5d7656..12381e0 100644
>>> --- a/net/ipv4/tcp_input.c
>>> +++ b/net/ipv4/tcp_input.c
>>> @@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops
>>> *rsk_ops,
>>> tcp_rsk(req)->snt_isn = isn;
>>> tcp_rsk(req)->txhash = net_tx_rndhash();
>>> tcp_openreq_init_rwin(req, sk, dst);
>>> + sk_mark_rx_queue(req_to_sk(req), skb);
>>> if (!want_cookie) {
>>> tcp_reqsk_record_syn(sk, req, skb);
>>> fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>>> index a63486a..82f9af4 100644
>>> --- a/net/ipv4/tcp_ipv4.c
>>> +++ b/net/ipv4/tcp_ipv4.c
>>> @@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff
>>> *skb)
>>>
>>> sock_rps_save_rxhash(sk, skb);
>>> sk_mark_napi_id(sk, skb);
>>> + sk_mark_rx_queue(sk, skb);
>>> if (dst) {
>>> if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif
>>> ||
>>> !dst->ops->check(dst, 0)) {
>>> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
>>> index 188a6f3..2b5efd5 100644
>>> --- a/net/ipv4/tcp_minisocks.c
>>> +++ b/net/ipv4/tcp_minisocks.c
>>> @@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct
>>> sock *child,
>>>
>>> /* record NAPI ID of child */
>>> sk_mark_napi_id(child, skb);
>>> + sk_mark_rx_queue(child, skb);
>>>
>>> tcp_segs_in(tcp_sk(child), skb);
>>> if (!sock_owned_by_user(child)) {
>>> --
>>> 1.8.3.1
>>>
>
Powered by blists - more mailing lists