[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <AANLkTikd6xej_qA7IHVucmDdjR4Wo-7AoGsQiSi-sdWQ@mail.gmail.com>
Date: Thu, 6 May 2010 07:45:40 -0700
From: Tom Herbert <therbert@...gle.com>
To: David Miller <davem@...emloft.net>
Cc: eric.dumazet@...il.com, franco@...tsummer.de, xiaosuo@...il.com,
netdev@...r.kernel.org
Subject: Re: [PATCH net-next-2.6] rps: consistent rxhash
On Thu, May 6, 2010 at 1:06 AM, David Miller <davem@...emloft.net> wrote:
> From: Tom Herbert <therbert@...gle.com>
> Date: Wed, 21 Apr 2010 12:12:41 -0700
>
>> On Tue, Apr 20, 2010 at 2:41 PM, David Miller <davem@...emloft.net> wrote:
>>> From: Eric Dumazet <eric.dumazet@...il.com>
>>> Date: Tue, 20 Apr 2010 16:57:01 +0200
>>>
>>>> I know many applications using TCP on loopback, they are real :)
>>>
>>> This is all true and I support your hashing patch and all of that.
>>>
>>> But if we really want TCP over loopback to go fast, there are much
>>> better ways to do this.
>>>
>>> Eric, do you remember that "TCP friends" rough patch I sent you last
>>> year that essentially made TCP sockets over loopback behave like
>>> AF_UNIX ones and just queue the SKBs directly to the destination
>>> socket without doing any protocol work?
>>>
>> This sounds very interesting! Could you post a patch? :-)
>
> I was finally able to unearth a copy, it's completely raw, it's at least
> a year old, and it's not fully implemented at all.
>
> But you asked for it :-)
>
Thanks! We'll take a look... I've always thought sockets should have
friends :-)
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 299ec4b..7f855d3 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -206,6 +206,7 @@ typedef unsigned char *sk_buff_data_t;
> * @mac_header: Link layer header
> * @dst: destination entry
> * @sp: the security path, used for xfrm
> + * @friend: loopback friend socket
> * @cb: Control buffer. Free for use by every layer. Put private vars here
> * @len: Length of actual data
> * @data_len: Data length
> @@ -262,6 +263,7 @@ struct sk_buff {
> struct rtable *rtable;
> };
> struct sec_path *sp;
> + struct sock *friend;
>
> /*
> * This is the control buffer. It is free to use for every
> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
> index b220b5f..52b2f7a 100644
> --- a/include/net/request_sock.h
> +++ b/include/net/request_sock.h
> @@ -53,6 +53,7 @@ struct request_sock {
> unsigned long expires;
> const struct request_sock_ops *rsk_ops;
> struct sock *sk;
> + struct sock *friend;
> u32 secid;
> u32 peer_secid;
> };
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dc42b44..3e86190 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -137,6 +137,7 @@ struct sock_common {
> * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
> * @sk_lock: synchronizer
> * @sk_rcvbuf: size of receive buffer in bytes
> + * @sk_friend: loopback friend socket
> * @sk_sleep: sock wait queue
> * @sk_dst_cache: destination cache
> * @sk_dst_lock: destination cache lock
> @@ -227,6 +228,7 @@ struct sock {
> struct sk_buff *head;
> struct sk_buff *tail;
> } sk_backlog;
> + struct sock *sk_friend;
> wait_queue_head_t *sk_sleep;
> struct dst_entry *sk_dst_cache;
> struct xfrm_policy *sk_policy[2];
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 4fe605f..0eef90a 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -435,6 +435,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
> #ifdef CONFIG_INET
> new->sp = secpath_get(old->sp);
> #endif
> + new->friend = old->friend;
> memcpy(new->cb, old->cb, sizeof(old->cb));
> new->csum_start = old->csum_start;
> new->csum_offset = old->csum_offset;
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 828ea21..375dc2e 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -503,6 +503,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
> if (newsk != NULL) {
> struct inet_connection_sock *newicsk = inet_csk(newsk);
>
> + newsk->sk_friend = req->friend;
> +
> newsk->sk_state = TCP_SYN_RECV;
> newicsk->icsk_bind_hash = NULL;
>
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 58ac838..042ee1d 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -474,7 +474,8 @@ static inline int forced_push(struct tcp_sock *tp)
> return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
> }
>
> -static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
> +static inline void skb_entail(struct sock *sk, struct sk_buff *skb,
> + struct sk_buff_head *friend_queue)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
> @@ -484,7 +485,10 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
> tcb->flags = TCPCB_FLAG_ACK;
> tcb->sacked = 0;
> skb_header_release(skb);
> - tcp_add_write_queue_tail(sk, skb);
> + if (sk->sk_friend)
> + __skb_queue_tail(friend_queue, skb);
> + else
> + tcp_add_write_queue_tail(sk, skb);
> sk->sk_wmem_queued += skb->truesize;
> sk_mem_charge(sk, skb->truesize);
> if (tp->nonagle & TCP_NAGLE_PUSH)
> @@ -501,7 +505,7 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
> }
>
> static inline void tcp_push(struct sock *sk, int flags, int mss_now,
> - int nonagle)
> + int nonagle, struct sk_buff_head *friend_queue)
> {
> struct tcp_sock *tp = tcp_sk(sk);
>
> @@ -512,6 +516,19 @@ static inline void tcp_push(struct sock *sk, int flags, int mss_now,
> tcp_mark_urg(tp, flags, skb);
> __tcp_push_pending_frames(sk, mss_now,
> (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
> + } else if (sk->sk_friend) {
> + struct sock *friend = sk->sk_friend;
> + struct sk_buff *skb;
> + unsigned int len;
> +
> + spin_lock_bh(&friend->sk_lock.slock);
> + len = 0;
> + while ((skb = __skb_dequeue(friend_queue)) != NULL) {
> + len += skb->len;
> + __skb_queue_tail(&sk->sk_receive_queue, skb);
> + }
> + sk->sk_data_ready(friend, len);
> + spin_unlock_bh(&friend->sk_lock.slock);
> }
> }
>
> @@ -658,6 +675,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
> size_t psize, int flags)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> + struct sk_buff_head friend_queue;
> int mss_now, size_goal;
> int err;
> ssize_t copied;
> @@ -674,6 +692,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
> size_goal = tp->xmit_size_goal;
> copied = 0;
>
> + skb_queue_head_init(&friend_queue);
> +
> err = -EPIPE;
> if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
> goto do_error;
> @@ -694,7 +714,7 @@ new_segment:
> if (!skb)
> goto wait_for_memory;
>
> - skb_entail(sk, skb);
> + skb_entail(sk, skb, &friend_queue);
> copy = size_goal;
> }
>
> @@ -749,7 +769,8 @@ wait_for_sndbuf:
> set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
> wait_for_memory:
> if (copied)
> - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
> + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH,
> + &friend_queue);
>
> if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
> goto do_error;
> @@ -760,7 +781,7 @@ wait_for_memory:
>
> out:
> if (copied)
> - tcp_push(sk, flags, mss_now, tp->nonagle);
> + tcp_push(sk, flags, mss_now, tp->nonagle, &friend_queue);
> return copied;
>
> do_error:
> @@ -817,6 +838,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
> struct sock *sk = sock->sk;
> struct iovec *iov;
> struct tcp_sock *tp = tcp_sk(sk);
> + struct sk_buff_head friend_queue;
> struct sk_buff *skb;
> int iovlen, flags;
> int mss_now, size_goal;
> @@ -849,6 +871,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
> if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
> goto do_error;
>
> + skb_queue_head_init(&friend_queue);
> while (--iovlen >= 0) {
> int seglen = iov->iov_len;
> unsigned char __user *from = iov->iov_base;
> @@ -881,7 +904,7 @@ new_segment:
> if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
> skb->ip_summed = CHECKSUM_PARTIAL;
>
> - skb_entail(sk, skb);
> + skb_entail(sk, skb, &friend_queue);
> copy = size_goal;
> }
>
> @@ -995,7 +1018,8 @@ wait_for_sndbuf:
> set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
> wait_for_memory:
> if (copied)
> - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
> + tcp_push(sk, flags & ~MSG_MORE, mss_now,
> + TCP_NAGLE_PUSH, &friend_queue);
>
> if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
> goto do_error;
> @@ -1007,7 +1031,7 @@ wait_for_memory:
>
> out:
> if (copied)
> - tcp_push(sk, flags, mss_now, tp->nonagle);
> + tcp_push(sk, flags, mss_now, tp->nonagle, &friend_queue);
> TCP_CHECK_TIMER(sk);
> release_sock(sk);
> return copied;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index cdc051b..eb6f914 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4998,6 +4998,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
> * state to ESTABLISHED..."
> */
>
> + sk->sk_friend = skb->friend;
> TCP_ECN_rcv_synack(tp, th);
>
> tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 7766151..4d91ff4 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1289,6 +1289,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
> if (!req)
> goto drop;
>
> + req->friend = skb->friend;
> #ifdef CONFIG_TCP_MD5SIG
> tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
> #endif
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index debf235..a4d4c14 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -577,6 +577,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
> }
>
> if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
> + skb->friend = sk;
> tcp_syn_build_options((__be32 *)(th + 1),
> tcp_advertise_mss(sk),
> (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
> @@ -1006,6 +1007,8 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
> xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
> xmit_size_goal -= (xmit_size_goal % mss_now);
> }
> + if (sk->sk_friend)
> + xmit_size_goal = ~(u16)0;
> tp->xmit_size_goal = xmit_size_goal;
>
> return mss_now;
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 715965f..c79d3ea 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1280,6 +1280,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
> if (req == NULL)
> goto drop;
>
> + req->friend = skb->friend;
> #ifdef CONFIG_TCP_MD5SIG
> tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
> #endif
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists