diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 8f670da..14e3f01 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -16,6 +16,7 @@ comment "DMA Clients"
 config NET_DMA
 	bool "Network: TCP receive copy offload"
 	depends on DMA_ENGINE && NET
+	select TCP_PREQUEUE
 	default y
 	---help---
 	  This enables the use of DMA engines in the network stack to
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c6b9f92..844a05e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -268,11 +268,13 @@ struct tcp_sock {
 
 	/* Data for direct copy to user */
 	struct {
+#ifdef CONFIG_TCP_PREQUEUE
 		struct sk_buff_head	prequeue;
 		struct task_struct	*task;
 		struct iovec		*iov;
 		int			memory;
 		int			len;
+#endif
 #ifdef CONFIG_NET_DMA
 		/* members for async copy */
 		struct dma_chan		*dma_chan;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 185c7ec..3430d8e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -835,10 +835,12 @@ static inline int tcp_checksum_complete(struct sk_buff *skb)
 
 static inline void tcp_prequeue_init(struct tcp_sock *tp)
 {
+#ifdef CONFIG_TCP_PREQUEUE
 	tp->ucopy.task = NULL;
 	tp->ucopy.len = 0;
 	tp->ucopy.memory = 0;
 	skb_queue_head_init(&tp->ucopy.prequeue);
+#endif
 #ifdef CONFIG_NET_DMA
 	tp->ucopy.dma_chan = NULL;
 	tp->ucopy.wakeup = 0;
@@ -857,6 +859,7 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp)
  */
 static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 {
+#ifdef CONFIG_TCP_PREQUEUE
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!sysctl_tcp_low_latency && tp->ucopy.task) {
@@ -882,6 +885,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 		}
 		return 1;
 	}
+#endif
 	return 0;
 }
 
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index fb79097..b770829 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -616,5 +616,20 @@ config TCP_MD5SIG
 
 	  If unsure, say N.
 
+config TCP_PREQUEUE
+	bool "Enable TCP prequeue"
+	default n
+	---help---
+	  TCP prequeue is an 'optimization' loosely based on Van
+	  Jacobson's famous "30 instruction TCP receive" mail.
+	  Van's trick is to deposit buffers into the socket queue
+	  from the device interrupt and then checksum and copy them
+	  to user space from the TCP receive path, in the context
+	  of the receiving process.  Smart...
+
+	  Some people believe this 'optimization' is not really needed
+	  except for a few benchmarks.  Also, taking potential page
+	  faults in a softirq handler seems a high price to pay.
+
 source "net/ipv4/ipvs/Kconfig"
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7e74011..8659533 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -994,6 +994,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		tcp_send_ack(sk);
 }
 
+#ifdef CONFIG_TCP_PREQUEUE
 static void tcp_prequeue_process(struct sock *sk)
 {
 	struct sk_buff *skb;
@@ -1011,6 +1012,7 @@ static void tcp_prequeue_process(struct sock *sk)
 	/* Clear memory counter. */
 	tp->ucopy.memory = 0;
 }
+#endif
 
 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
@@ -1251,6 +1253,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 		tcp_cleanup_rbuf(sk, copied);
 
+#ifdef CONFIG_TCP_PREQUEUE
 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
 			/* Install new reader */
 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
@@ -1295,7 +1298,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 			/* __ Set realtime policy in scheduler __ */
 		}
-
+#endif
 		if (copied >= target) {
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
@@ -1307,6 +1310,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			tp->ucopy.wakeup = 0;
 #endif
 
+#ifdef CONFIG_TCP_PREQUEUE
 		if (user_recv) {
 			int chunk;
 
@@ -1330,6 +1334,7 @@ do_prequeue:
 				}
 			}
 		}
+#endif
 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
 			if (net_ratelimit())
 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
@@ -1430,6 +1435,7 @@ skip_copy:
 		break;
 	} while (len > 0);
 
+#ifdef CONFIG_TCP_PREQUEUE
 	if (user_recv) {
 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 			int chunk;
@@ -1448,6 +1454,7 @@ skip_copy:
 		tp->ucopy.task = NULL;
 		tp->ucopy.len = 0;
 	}
+#endif
 
 #ifdef CONFIG_NET_DMA
 	if (tp->ucopy.dma_chan) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bbad2cd..85d3a5c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3467,6 +3467,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			goto out_of_window;
 
 		/* Ok. In sequence. In window. */
+#ifdef CONFIG_TCP_PREQUEUE
 		if (tp->ucopy.task == current &&
 		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
 		    sock_owned_by_user(sk) && !tp->urg_data) {
@@ -3484,7 +3485,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			}
 			local_bh_disable();
 		}
-
+#endif
 		if (eaten <= 0) {
 queue_and_out:
 			if (eaten < 0 &&
@@ -4078,6 +4079,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
 	}
 }
 
+#ifdef CONFIG_TCP_PREQUEUE
 static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4100,6 +4102,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 	local_bh_disable();
 	return err;
 }
+#endif
 
 static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
 {
@@ -4279,8 +4282,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 		}
 	} else {
 		int eaten = 0;
-		int copied_early = 0;
 
+#ifdef CONFIG_TCP_PREQUEUE
+		int copied_early = 0;
 		if (tp->copied_seq == tp->rcv_nxt &&
 		    len - tcp_header_len <= tp->ucopy.len) {
 #ifdef CONFIG_NET_DMA
@@ -4315,6 +4319,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			if (copied_early)
 				tcp_cleanup_rbuf(sk, skb->len);
 		}
+#endif
 		if (!eaten) {
 			if (tcp_checksum_complete_user(sk, skb))
 				goto csum_error;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9c94627..7ac5bc1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1916,8 +1916,10 @@ int tcp_v4_destroy_sock(struct sock *sk)
 	__skb_queue_purge(&sk->sk_async_wait_queue);
 #endif
 
+#ifdef CONFIG_TCP_PREQUEUE
 	/* Clean prequeue, it must be empty really */
 	__skb_queue_purge(&tp->ucopy.prequeue);
+#endif
 
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e9b151b..5f3b38c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -167,7 +167,9 @@ static int tcp_write_timeout(struct sock *sk)
 static void tcp_delack_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock*)data;
+#ifdef CONFIG_TCP_PREQUEUE
 	struct tcp_sock *tp = tcp_sk(sk);
+#endif
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	bh_lock_sock(sk);
@@ -190,6 +192,7 @@ static void tcp_delack_timer(unsigned long data)
 	}
 	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 
+#ifdef CONFIG_TCP_PREQUEUE
 	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 		struct sk_buff *skb;
 
@@ -200,6 +203,7 @@ static void tcp_delack_timer(unsigned long data)
 
 		tp->ucopy.memory = 0;
 	}
+#endif
 
 	if (inet_csk_ack_scheduled(sk)) {
 		if (!icsk->icsk_ack.pingpong) {
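
Not part of the patch: for readers who have not met the prequeue before, below is a minimal user-space sketch of the control flow that CONFIG_TCP_PREQUEUE gates. It is not kernel code, and the identifiers (pseudo_sock, prequeue_segment, drain_prequeue, PREQUEUE_SLOTS) are invented for illustration. The idea, as in tcp_prequeue() above, is that when a reader is blocked in recvmsg() and tcp_low_latency is not set, the softirq path only queues the in-sequence segment on a per-socket prequeue; the checksum and copy to user space happen later in the context of the receiving process.

/*
 * Illustrative model only -- NOT kernel code.  All identifiers here are
 * hypothetical.  A real prequeue holds sk_buffs and accounts truesize
 * against sk_rcvbuf; this sketch only shows where the work happens.
 */
#include <stdbool.h>
#include <stdio.h>

#define PREQUEUE_SLOTS 8

struct pseudo_sock {
	bool reader_waiting;          /* models tp->ucopy.task != NULL  */
	bool low_latency;             /* models sysctl_tcp_low_latency  */
	int  prequeue[PREQUEUE_SLOTS];
	int  prequeue_len;            /* models tp->ucopy.memory budget */
};

/* "Softirq" side: returns true if the segment was deferred to the prequeue. */
static bool prequeue_segment(struct pseudo_sock *sk, int seg)
{
	if (sk->low_latency || !sk->reader_waiting)
		return false;                 /* no waiting reader: process directly */
	if (sk->prequeue_len == PREQUEUE_SLOTS)
		return false;                 /* over budget: process directly */
	sk->prequeue[sk->prequeue_len++] = seg;
	return true;
}

/* "Process context" side: what recvmsg() does when the prequeue is non-empty. */
static void drain_prequeue(struct pseudo_sock *sk)
{
	for (int i = 0; i < sk->prequeue_len; i++)
		printf("checksum + copy segment %d in process context\n",
		       sk->prequeue[i]);
	sk->prequeue_len = 0;                 /* "clear memory counter" */
}

int main(void)
{
	struct pseudo_sock sk = { .reader_waiting = true };

	for (int seg = 1; seg <= 3; seg++)
		if (!prequeue_segment(&sk, seg))
			printf("segment %d handled directly in softirq\n", seg);
	drain_prequeue(&sk);
	return 0;
}

With the patch applied and CONFIG_TCP_PREQUEUE=n, the real tcp_prequeue() in include/net/tcp.h compiles down to a plain "return 0", so every segment takes the direct path.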