Message-ID: <AANLkTikd6xej_qA7IHVucmDdjR4Wo-7AoGsQiSi-sdWQ@mail.gmail.com>
Date:	Thu, 6 May 2010 07:45:40 -0700
From:	Tom Herbert <therbert@...gle.com>
To:	David Miller <davem@...emloft.net>
Cc:	eric.dumazet@...il.com, franco@...tsummer.de, xiaosuo@...il.com,
	netdev@...r.kernel.org
Subject: Re: [PATCH net-next-2.6] rps: consistent rxhash

On Thu, May 6, 2010 at 1:06 AM, David Miller <davem@...emloft.net> wrote:
> From: Tom Herbert <therbert@...gle.com>
> Date: Wed, 21 Apr 2010 12:12:41 -0700
>
>> On Tue, Apr 20, 2010 at 2:41 PM, David Miller <davem@...emloft.net> wrote:
>>> From: Eric Dumazet <eric.dumazet@...il.com>
>>> Date: Tue, 20 Apr 2010 16:57:01 +0200
>>>
>>>> I know many applications using TCP on loopback; they are real :)
>>>
>>> This is all true and I support your hashing patch and all of that.
>>>
>>> But if we really want TCP over loopback to go fast, there are much
>>> better ways to do this.
>>>
>>> Eric, do you remember that "TCP friends" rough patch I sent you last
>>> year that essentially made TCP sockets over loopback behave like
>>> AF_UNIX ones and just queue the SKBs directly to the destination
>>> socket without doing any protocol work?
>>>
>> This sounds very interesting!  Could you post a patch? :-)
>
> I was finally able to unearth a copy; it's completely raw, at least a
> year old, and not fully implemented at all.
>
> But you asked for it :-)
>
Thanks!  We'll take a look... I've always thought sockets should have
friends :-)
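
To make the comparison concrete, here is a purely hypothetical userspace
sketch (not part of the patch below, names and sizes are illustrative
only).  It sets up an AF_UNIX socketpair, where the kernel queues written
data directly to the peer socket, next to a TCP connection over
127.0.0.1, which runs the full TCP transmit and receive paths even though
both endpoints are local.  That per-segment protocol work is what the
"friends" idea would skip.  Error handling is mostly omitted.

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	char buf[4096] = "ping";
	int uds[2];

	/* AF_UNIX: data written on one end is queued directly to the
	 * peer socket's receive queue; no checksums, segmentation,
	 * ACKs or congestion control are involved. */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, uds) < 0) {
		perror("socketpair");
		return 1;
	}
	write(uds[0], buf, sizeof(buf));
	read(uds[1], buf, sizeof(buf));

	/* TCP over 127.0.0.1: the same payload goes through the full
	 * TCP transmit path on one socket and the full receive path on
	 * the other, even though both ends live in the same kernel. */
	int lsn = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET };
	socklen_t alen = sizeof(addr);

	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	bind(lsn, (struct sockaddr *)&addr, sizeof(addr));  /* port 0: any free port */
	listen(lsn, 1);
	getsockname(lsn, (struct sockaddr *)&addr, &alen);  /* learn the chosen port */

	int cli = socket(AF_INET, SOCK_STREAM, 0);
	connect(cli, (struct sockaddr *)&addr, sizeof(addr));
	int srv = accept(lsn, NULL, NULL);

	write(cli, buf, sizeof(buf));
	read(srv, buf, sizeof(buf));

	close(uds[0]); close(uds[1]);
	close(cli); close(srv); close(lsn);
	return 0;
}

With something like the patch below in place, the idea as David describes
it is that the established TCP pair could hand skbs straight to the
friend socket's receive queue, much as the AF_UNIX pair already does.
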

> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 299ec4b..7f855d3 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -206,6 +206,7 @@ typedef unsigned char *sk_buff_data_t;
>  *     @mac_header: Link layer header
>  *     @dst: destination entry
>  *     @sp: the security path, used for xfrm
> + *     @friend: loopback friend socket
>  *     @cb: Control buffer. Free for use by every layer. Put private vars here
>  *     @len: Length of actual data
>  *     @data_len: Data length
> @@ -262,6 +263,7 @@ struct sk_buff {
>                struct  rtable          *rtable;
>        };
>        struct  sec_path        *sp;
> +       struct sock             *friend;
>
>        /*
>         * This is the control buffer. It is free to use for every
> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
> index b220b5f..52b2f7a 100644
> --- a/include/net/request_sock.h
> +++ b/include/net/request_sock.h
> @@ -53,6 +53,7 @@ struct request_sock {
>        unsigned long                   expires;
>        const struct request_sock_ops   *rsk_ops;
>        struct sock                     *sk;
> +       struct sock                     *friend;
>        u32                             secid;
>        u32                             peer_secid;
>  };
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dc42b44..3e86190 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -137,6 +137,7 @@ struct sock_common {
>   *    @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
>   *    @sk_lock:       synchronizer
>   *    @sk_rcvbuf: size of receive buffer in bytes
> +  *    @sk_friend: loopback friend socket
>   *    @sk_sleep: sock wait queue
>   *    @sk_dst_cache: destination cache
>   *    @sk_dst_lock: destination cache lock
> @@ -227,6 +228,7 @@ struct sock {
>                struct sk_buff *head;
>                struct sk_buff *tail;
>        } sk_backlog;
> +       struct sock             *sk_friend;
>        wait_queue_head_t       *sk_sleep;
>        struct dst_entry        *sk_dst_cache;
>        struct xfrm_policy      *sk_policy[2];
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 4fe605f..0eef90a 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -435,6 +435,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
>  #ifdef CONFIG_INET
>        new->sp                 = secpath_get(old->sp);
>  #endif
> +       new->friend             = old->friend;
>        memcpy(new->cb, old->cb, sizeof(old->cb));
>        new->csum_start         = old->csum_start;
>        new->csum_offset        = old->csum_offset;
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 828ea21..375dc2e 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -503,6 +503,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
>        if (newsk != NULL) {
>                struct inet_connection_sock *newicsk = inet_csk(newsk);
>
> +               newsk->sk_friend = req->friend;
> +
>                newsk->sk_state = TCP_SYN_RECV;
>                newicsk->icsk_bind_hash = NULL;
>
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 58ac838..042ee1d 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -474,7 +474,8 @@ static inline int forced_push(struct tcp_sock *tp)
>        return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
>  }
>
> -static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
> +static inline void skb_entail(struct sock *sk, struct sk_buff *skb,
> +                             struct sk_buff_head *friend_queue)
>  {
>        struct tcp_sock *tp = tcp_sk(sk);
>        struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
> @@ -484,7 +485,10 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
>        tcb->flags   = TCPCB_FLAG_ACK;
>        tcb->sacked  = 0;
>        skb_header_release(skb);
> -       tcp_add_write_queue_tail(sk, skb);
> +       if (sk->sk_friend)
> +               __skb_queue_tail(friend_queue, skb);
> +       else
> +               tcp_add_write_queue_tail(sk, skb);
>        sk->sk_wmem_queued += skb->truesize;
>        sk_mem_charge(sk, skb->truesize);
>        if (tp->nonagle & TCP_NAGLE_PUSH)
> @@ -501,7 +505,7 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
>  }
>
>  static inline void tcp_push(struct sock *sk, int flags, int mss_now,
> -                           int nonagle)
> +                           int nonagle, struct sk_buff_head *friend_queue)
>  {
>        struct tcp_sock *tp = tcp_sk(sk);
>
> @@ -512,6 +516,19 @@ static inline void tcp_push(struct sock *sk, int flags, int mss_now,
>                tcp_mark_urg(tp, flags, skb);
>                __tcp_push_pending_frames(sk, mss_now,
>                                          (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
> +       } else if (sk->sk_friend) {
> +               struct sock *friend = sk->sk_friend;
> +               struct sk_buff *skb;
> +               unsigned int len;
> +
> +               spin_lock_bh(&friend->sk_lock.slock);
> +               len = 0;
> +               while ((skb = __skb_dequeue(friend_queue)) != NULL) {
> +                       len += skb->len;
> +                       __skb_queue_tail(&sk->sk_receive_queue, skb);
> +               }
> +               sk->sk_data_ready(friend, len);
> +               spin_unlock_bh(&friend->sk_lock.slock);
>        }
>  }
>
> @@ -658,6 +675,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
>                         size_t psize, int flags)
>  {
>        struct tcp_sock *tp = tcp_sk(sk);
> +       struct sk_buff_head friend_queue;
>        int mss_now, size_goal;
>        int err;
>        ssize_t copied;
> @@ -674,6 +692,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
>        size_goal = tp->xmit_size_goal;
>        copied = 0;
>
> +       skb_queue_head_init(&friend_queue);
> +
>        err = -EPIPE;
>        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
>                goto do_error;
> @@ -694,7 +714,7 @@ new_segment:
>                        if (!skb)
>                                goto wait_for_memory;
>
> -                       skb_entail(sk, skb);
> +                       skb_entail(sk, skb, &friend_queue);
>                        copy = size_goal;
>                }
>
> @@ -749,7 +769,8 @@ wait_for_sndbuf:
>                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
>  wait_for_memory:
>                if (copied)
> -                       tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
> +                       tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH,
> +                                &friend_queue);
>
>                if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
>                        goto do_error;
> @@ -760,7 +781,7 @@ wait_for_memory:
>
>  out:
>        if (copied)
> -               tcp_push(sk, flags, mss_now, tp->nonagle);
> +               tcp_push(sk, flags, mss_now, tp->nonagle, &friend_queue);
>        return copied;
>
>  do_error:
> @@ -817,6 +838,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
>        struct sock *sk = sock->sk;
>        struct iovec *iov;
>        struct tcp_sock *tp = tcp_sk(sk);
> +       struct sk_buff_head friend_queue;
>        struct sk_buff *skb;
>        int iovlen, flags;
>        int mss_now, size_goal;
> @@ -849,6 +871,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
>        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
>                goto do_error;
>
> +       skb_queue_head_init(&friend_queue);
>        while (--iovlen >= 0) {
>                int seglen = iov->iov_len;
>                unsigned char __user *from = iov->iov_base;
> @@ -881,7 +904,7 @@ new_segment:
>                                if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
>                                        skb->ip_summed = CHECKSUM_PARTIAL;
>
> -                               skb_entail(sk, skb);
> +                               skb_entail(sk, skb, &friend_queue);
>                                copy = size_goal;
>                        }
>
> @@ -995,7 +1018,8 @@ wait_for_sndbuf:
>                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
>  wait_for_memory:
>                        if (copied)
> -                               tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
> +                               tcp_push(sk, flags & ~MSG_MORE, mss_now,
> +                                        TCP_NAGLE_PUSH, &friend_queue);
>
>                        if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
>                                goto do_error;
> @@ -1007,7 +1031,7 @@ wait_for_memory:
>
>  out:
>        if (copied)
> -               tcp_push(sk, flags, mss_now, tp->nonagle);
> +               tcp_push(sk, flags, mss_now, tp->nonagle, &friend_queue);
>        TCP_CHECK_TIMER(sk);
>        release_sock(sk);
>        return copied;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index cdc051b..eb6f914 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4998,6 +4998,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
>                 *    state to ESTABLISHED..."
>                 */
>
> +               sk->sk_friend = skb->friend;
>                TCP_ECN_rcv_synack(tp, th);
>
>                tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 7766151..4d91ff4 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1289,6 +1289,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
>        if (!req)
>                goto drop;
>
> +       req->friend = skb->friend;
>  #ifdef CONFIG_TCP_MD5SIG
>        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
>  #endif
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index debf235..a4d4c14 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -577,6 +577,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>        }
>
>        if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
> +               skb->friend = sk;
>                tcp_syn_build_options((__be32 *)(th + 1),
>                                      tcp_advertise_mss(sk),
>                                      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
> @@ -1006,6 +1007,8 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
>                xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
>                xmit_size_goal -= (xmit_size_goal % mss_now);
>        }
> +       if (sk->sk_friend)
> +               xmit_size_goal = ~(u16)0;
>        tp->xmit_size_goal = xmit_size_goal;
>
>        return mss_now;
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 715965f..c79d3ea 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1280,6 +1280,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
>        if (req == NULL)
>                goto drop;
>
> +       req->friend = skb->friend;
>  #ifdef CONFIG_TCP_MD5SIG
>        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
>  #endif
>