[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALx6S35f+DFTXCQgHPrMgdkAxEZKepWgZN2BowOp_Nt4nTpLgg@mail.gmail.com>
Date: Tue, 15 Mar 2016 10:47:12 -0700
From: Tom Herbert <tom@...bertland.com>
To: Sowmini Varadhan <sowmini.varadhan@...cle.com>
Cc: Linux Kernel Network Developers <netdev@...r.kernel.org>,
santosh.shilimkar@...cle.com,
"David S. Miller" <davem@...emloft.net>
Subject: Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for
sndbuf/rcvbuf on rds-tcp socket
On Tue, Mar 15, 2016 at 10:34 AM, Sowmini Varadhan
<sowmini.varadhan@...cle.com> wrote:
> On (03/15/16 10:30), Tom Herbert wrote:
>> I still don't understand why you won't consider using netlink. sndbuf
>> and rcvbuf seem like just two of many arbitrary parameters that a user
>> might want to configure.
>
>> I would think that being able to configure
>> the listener port (currently a fixed value), an address to bind to
>
> the listener port is properly registered with IANA. I dont think
> we will need to change that, any more than ssh/nfs ports will need
> to change.
>
Both sshd and nfsd and allow configurable listener port numbers. Any
listener service will allow a configurable port number. An IANA port
number is good as a default, but there are many reasons why people
want or need to use a different port number. I don't see what makes
RDS special in this regard.
>> (instead of always inaddr-any), or not automatically creating an RDS
>> listener in every namespace are at least as important configuration
>> parameters as these two.
>
> Those parameters can be added via sysctl if needed- we have not
> seen the need for that yet.
>
> And if something comes up that needs the netlink etc. we can
> add it down the road. It's just not needed now.
>
> Thanks
> --Sowmini
>>
>> Tom
>>
>> > net/rds/tcp.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
>> > 1 files changed, 129 insertions(+), 10 deletions(-)
>> >
>> > diff --git a/net/rds/tcp.c b/net/rds/tcp.c
>> > index ad60299..5d62fd2 100644
>> > --- a/net/rds/tcp.c
>> > +++ b/net/rds/tcp.c
>> > @@ -52,7 +52,27 @@ static LIST_HEAD(rds_tcp_conn_list);
>> >
>> > static struct kmem_cache *rds_tcp_conn_slab;
>> >
>> > -#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
>> > +static int sndbuf_handler(struct ctl_table *ctl, int write,
>> > + void __user *buffer, size_t *lenp, loff_t *fpos);
>> > +static int rcvbuf_handler(struct ctl_table *ctl, int write,
>> > + void __user *buffer, size_t *lenp, loff_t *fpos);
>> > +static struct ctl_table rds_tcp_sysctl_table[] = {
>> > + {
>> > + .procname = "rds_tcp_sndbuf",
>> > + /* data is per-net pointer */
>> > + .maxlen = sizeof(int),
>> > + .mode = 0644,
>> > + .proc_handler = sndbuf_handler,
>> > + },
>> > + {
>> > + .procname = "rds_tcp_rcvbuf",
>> > + /* data is per-net pointer */
>> > + .maxlen = sizeof(int),
>> > + .mode = 0644,
>> > + .proc_handler = rcvbuf_handler,
>> > + },
>> > + { }
>> > +};
>> >
>> > /* doing it this way avoids calling tcp_sk() */
>> > void rds_tcp_nonagle(struct socket *sock)
>> > @@ -66,15 +86,6 @@ void rds_tcp_nonagle(struct socket *sock)
>> > set_fs(oldfs);
>> > }
>> >
>> > -/* All module specific customizations to the RDS-TCP socket should be done in
>> > - * rds_tcp_tune() and applied after socket creation. In general these
>> > - * customizations should be tunable via module_param()
>> > - */
>> > -void rds_tcp_tune(struct socket *sock)
>> > -{
>> > - rds_tcp_nonagle(sock);
>> > -}
>> > -
>> > u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
>> > {
>> > return tcp_sk(tc->t_sock->sk)->snd_nxt;
>> > @@ -272,8 +283,33 @@ static int rds_tcp_netid;
>> > struct rds_tcp_net {
>> > struct socket *rds_tcp_listen_sock;
>> > struct work_struct rds_tcp_accept_w;
>> > + struct ctl_table_header *rds_tcp_sysctl;
>> > + int sndbuf_size;
>> > + int rcvbuf_size;
>> > };
>> >
>> > +/* All module specific customizations to the RDS-TCP socket should be done in
>> > + * rds_tcp_tune() and applied after socket creation.
>> > + */
>> > +void rds_tcp_tune(struct socket *sock)
>> > +{
>> > + struct sock *sk = sock->sk;
>> > + struct net *net = sock_net(sk);
>> > + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> > +
>> > + rds_tcp_nonagle(sock);
>> > + lock_sock(sk);
>> > + if (rtn->sndbuf_size > 0) {
>> > + sk->sk_sndbuf = rtn->sndbuf_size;
>> > + sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
>> > + }
>> > + if (rtn->rcvbuf_size > 0) {
>> > + sk->sk_rcvbuf = rtn->rcvbuf_size;
>> > + sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
>> > + }
>> > + release_sock(sk);
>> > +}
>> > +
>> > static void rds_tcp_accept_worker(struct work_struct *work)
>> > {
>> > struct rds_tcp_net *rtn = container_of(work,
>> > @@ -296,6 +332,13 @@ static __net_init int rds_tcp_init_net(struct net *net)
>> > {
>> > struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> >
>> > + /* 0 value for {snd, rcv}buf_size implies we let the stack
>> > + * pick the default, and permit auto-tuning of buffer size.
>> > + */
>> > + rtn->sndbuf_size = 0;
>> > + rtn->rcvbuf_size = 0;
>> > + rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp",
>> > + rds_tcp_sysctl_table);
>> > rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
>> > if (!rtn->rds_tcp_listen_sock) {
>> > pr_warn("could not set up listen sock\n");
>> > @@ -309,6 +352,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
>> > {
>> > struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> >
>> > + unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
>> > /* If rds_tcp_exit_net() is called as a result of netns deletion,
>> > * the rds_tcp_kill_sock() device notifier would already have cleaned
>> > * up the listen socket, thus there is no work to do in this function.
>> > @@ -383,6 +427,81 @@ static struct notifier_block rds_tcp_dev_notifier = {
>> > .priority = -10, /* must be called after other network notifiers */
>> > };
>> >
>> > +static int user_atoi(char __user *ubuf, size_t len)
>> > +{
>> > + char buf[16];
>> > + unsigned long ret;
>> > + int err;
>> > +
>> > + if (len > 15)
>> > + return -EINVAL;
>> > +
>> > + if (copy_from_user(buf, ubuf, len))
>> > + return -EFAULT;
>> > +
>> > + buf[len] = 0;
>> > + err = kstrtoul(buf, 0, &ret);
>> > + if (err != 0)
>> > + return -ERANGE;
>> > + return ret;
>> > +}
>> > +
>> > +/* when sysctl is used to modify some kernel socket parameters,this
>> > + * function resets the RDS connections in that netns so that we can
>> > + * restart with new parameters. The assumption is that such reset
>> > + * events are few and far-between.
>> > + */
>> > +static void rds_tcp_sysctl_reset(struct net *net)
>> > +{
>> > + struct rds_tcp_connection *tc, *_tc;
>> > +
>> > + spin_lock_irq(&rds_tcp_conn_lock);
>> > + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
>> > + struct net *c_net = read_pnet(&tc->conn->c_net);
>> > +
>> > + if (net != c_net || !tc->t_sock)
>> > + continue;
>> > + rds_conn_drop(tc->conn); /* reconnect with new parameters */
>> > + }
>> > + spin_unlock_irq(&rds_tcp_conn_lock);
>> > +}
>> > +
>> > +static int sndbuf_handler(struct ctl_table *ctl, int write,
>> > + void __user *buffer, size_t *lenp, loff_t *fpos)
>> > +{
>> > + struct net *net = current->nsproxy->net_ns;
>> > + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> > + struct ctl_table tmp;
>> > +
>> > + tmp = *ctl;
>> > + tmp.data = &rtn->sndbuf_size;
>> > + if (!write)
>> > + return proc_dointvec(&tmp, write, buffer, lenp, fpos);
>> > +
>> > + rtn->sndbuf_size = user_atoi(buffer, *lenp);
>> > + rds_tcp_sysctl_reset(net);
>> > +
>> > + return 0;
>> > +}
>> > +
>> > +static int rcvbuf_handler(struct ctl_table *ctl, int write,
>> > + void __user *buffer, size_t *lenp, loff_t *fpos)
>> > +{
>> > + struct net *net = current->nsproxy->net_ns;
>> > + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> > + struct ctl_table tmp;
>> > +
>> > + tmp = *ctl;
>> > + tmp.data = &rtn->rcvbuf_size;
>> > + if (!write)
>> > + return proc_dointvec(&tmp, write, buffer, lenp, fpos);
>> > +
>> > + rtn->rcvbuf_size = user_atoi(buffer, *lenp);
>> > + rds_tcp_sysctl_reset(net);
>> > +
>> > + return 0;
>> > +}
>> > +
>> > static void rds_tcp_exit(void)
>> > {
>> > rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
>> > --
>> > 1.7.1
>> >
Powered by blists - more mailing lists