diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ac663c1..363b8e8 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -35,6 +35,10 @@ SUBSYS(cpuacct)
 SUBSYS(mem_cgroup)
 #endif
 
+#ifdef CONFIG_CGROUP_KMEM
+SUBSYS(kmem)
+#endif
+
 /* */
 
 #ifdef CONFIG_CGROUP_DEVICE
diff --git a/include/linux/kmem_cgroup.h b/include/linux/kmem_cgroup.h
new file mode 100644
index 0000000..9c62718
--- /dev/null
+++ b/include/linux/kmem_cgroup.h
@@ -0,0 +1,68 @@
+/* kmem_cgroup.h - Kernel Memory Controller
+ *
+ * Copyright Parallels Inc., 2011
+ * Author: Glauber Costa
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_KMEM_CGROUP_H
+#define _LINUX_KMEM_CGROUP_H
+#include <linux/cgroup.h>
+#include <linux/atomic.h>
+#include <linux/percpu_counter.h>
+
+struct kmem_cgroup {
+	struct cgroup_subsys_state css;
+	struct kmem_cgroup *parent;
+
+	int tcp_memory_pressure;
+	int tcp_max_memory;
+	atomic_long_t tcp_memory_allocated;
+	struct percpu_counter tcp_sockets_allocated;
+	long tcp_prot_mem[3];
+
+	atomic_long_t udp_memory_allocated;
+};
+
+
+#ifdef CONFIG_CGROUP_KMEM
+static inline struct kmem_cgroup *cgroup_sk(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, kmem_subsys_id),
+			    struct kmem_cgroup, css);
+}
+
+static inline struct kmem_cgroup *task_sk(struct task_struct *tsk)
+{
+	return container_of(task_subsys_state(tsk, kmem_subsys_id),
+			    struct kmem_cgroup, css);
+}
+
+static inline bool kmem_cgroup_disabled(void)
+{
+	if (kmem_subsys.disabled)
+		return true;
+	return false;
+}
+#else
+static inline struct kmem_cgroup *cgroup_sk(struct cgroup *cgrp)
+{
+	return NULL;
+}
+
+static inline struct kmem_cgroup *task_sk(struct task_struct *tsk)
+{
+	return NULL;
+}
+#endif /* CONFIG_CGROUP_KMEM */
+#endif /* _LINUX_KMEM_CGROUP_H */
+
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d786b4f..bbd023a 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -55,6 +55,7 @@ struct netns_ipv4 {
 	int current_rt_cache_rebuild_count;
 
 	unsigned int sysctl_ping_group_range[2];
+	long sysctl_tcp_mem[3];
 
 	atomic_t rt_genid;
 	atomic_t dev_addr_genid;
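The cgroup_sk() and task_sk() helpers above are how the rest of the stack resolves a cgroup or task to its kmem_cgroup. For reference only (not part of the patch), a minimal kernel-context sketch of a consumer reading the per-cgroup TCP thresholds through task_sk(); report_tcp_limits() is a hypothetical name:

/* Illustrative sketch, not in the patch: resolve the current task's
 * kmem_cgroup and dump its TCP limits.  With CONFIG_CGROUP_KMEM=n the
 * helper returns NULL, so the check below covers both configurations. */
#include <linux/kmem_cgroup.h>
#include <linux/sched.h>
#include <linux/printk.h>

static void report_tcp_limits(void)
{
	struct kmem_cgroup *sg;

	rcu_read_lock();
	sg = task_sk(current);
	if (sg)
		pr_info("tcp_mem: min=%ld pressure=%ld max=%ld (tcp_max_memory=%d)\n",
			sg->tcp_prot_mem[0], sg->tcp_prot_mem[1],
			sg->tcp_prot_mem[2], sg->tcp_max_memory);
	rcu_read_unlock();
}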
diff --git a/include/net/sock.h b/include/net/sock.h
index 8e4062f..b68e6ea 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -62,7 +62,9 @@
 #include <linux/atomic.h>
 #include <net/dst.h>
 #include <net/checksum.h>
+#include <linux/kmem_cgroup.h>
 
+int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
 /*
  * This structure really needs to be cleaned up.
  * Most of it is for TCP, and not used by any of
@@ -339,6 +341,7 @@ struct sock {
 #endif
 	__u32			sk_mark;
 	u32			sk_classid;
+	struct kmem_cgroup	*sk_cgrp;
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk, int bytes);
 	void			(*sk_write_space)(struct sock *sk);
@@ -786,16 +789,18 @@ struct proto {
 
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
-	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
-	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
+	atomic_long_t		*(*memory_allocated)(struct kmem_cgroup *sg);	/* Current allocated memory. */
+	struct percpu_counter	*(*sockets_allocated)(struct kmem_cgroup *sg);	/* Current number of sockets. */
+
+	int			(*init_cgroup)(struct cgroup *cgrp, struct cgroup_subsys *ss);
 	/*
 	 * Pressure flag: try to collapse.
	 * Technical note: it is used by multiple contexts non atomically.
	 * All the __sk_mem_schedule() is of this nature: accounting
	 * is strict, actions are advisory and have some latency.
	 */
-	int			*memory_pressure;
-	long			*sysctl_mem;
+	int			*(*memory_pressure)(struct kmem_cgroup *sg);
+	long			*(*prot_mem)(struct kmem_cgroup *sg);
 	int			*sysctl_wmem;
 	int			*sysctl_rmem;
 	int			max_header;
@@ -826,6 +831,56 @@
 #endif
 };
 
+#define sk_memory_pressure(sk)						\
+({									\
+	int *__ret = NULL;						\
+	if (sk->sk_prot->memory_pressure)				\
+		__ret = sk->sk_prot->memory_pressure(sk->sk_cgrp);	\
+	__ret;								\
+})
+
+#define sk_sockets_allocated(sk)					\
+({									\
+	struct percpu_counter *__p;					\
+	__p = sk->sk_prot->sockets_allocated(sk->sk_cgrp);		\
+	__p;								\
+})
+
+#define sk_memory_allocated(sk)						\
+({									\
+	atomic_long_t *__mem;						\
+	__mem = sk->sk_prot->memory_allocated(sk->sk_cgrp);		\
+	__mem;								\
+})
+
+#define sk_prot_mem(sk)							\
+({									\
+	long *__mem = sk->sk_prot->prot_mem(sk->sk_cgrp);		\
+	__mem;								\
+})
+
+#define sg_memory_pressure(prot, sg)					\
+({									\
+	int *__ret = NULL;						\
+	if (prot->memory_pressure)					\
+		__ret = prot->memory_pressure(sg);			\
+	__ret;								\
+})
+
+#define sg_memory_allocated(prot, sg)					\
+({									\
+	atomic_long_t *__mem;						\
+	__mem = prot->memory_allocated(sg);				\
+	__mem;								\
+})
+
+#define sg_sockets_allocated(prot, sg)					\
+({									\
+	struct percpu_counter *__p;					\
+	__p = prot->sockets_allocated(sg);				\
+	__p;								\
+})
+
 extern int proto_register(struct proto *prot, int alloc_slab);
 extern void proto_unregister(struct proto *prot);
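The struct proto callbacks above replace the old global pointers: each accessor receives the socket's kmem_cgroup and hands back the counter or limit array to charge against. For illustration only (not part of the patch), a sketch of what a hypothetical protocol "foo" would provide if it kept plain global accounting; TCP and UDP are converted to this scheme later in this patch:

/* Illustrative sketch of the new struct proto contract; all "foo_*"
 * names are hypothetical.  A per-cgroup implementation would return
 * fields of the kmem_cgroup argument instead of these globals. */
#include <net/sock.h>
#include <linux/kmem_cgroup.h>

static long foo_mem[3];
static atomic_long_t foo_memory_allocated;
static int foo_memory_pressure;
static struct percpu_counter foo_sockets_allocated;

static long *foo_prot_mem(struct kmem_cgroup *sg)
{
	return foo_mem;
}

static atomic_long_t *foo_memory_allocated_fn(struct kmem_cgroup *sg)
{
	return &foo_memory_allocated;
}

static int *foo_memory_pressure_fn(struct kmem_cgroup *sg)
{
	return &foo_memory_pressure;
}

static struct percpu_counter *foo_sockets_allocated_fn(struct kmem_cgroup *sg)
{
	return &foo_sockets_allocated;
}

static struct proto foo_prot = {
	.name			= "FOO",
	.prot_mem		= foo_prot_mem,
	.memory_allocated	= foo_memory_allocated_fn,
	.memory_pressure	= foo_memory_pressure_fn,
	.sockets_allocated	= foo_sockets_allocated_fn,
	/* remaining fields as usual */
};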
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 149a415..97405ed 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
 extern int sysctl_tcp_ecn;
 extern int sysctl_tcp_dsack;
-extern long sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
@@ -255,7 +254,13 @@ extern int sysctl_tcp_thin_dupack;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
-extern int tcp_memory_pressure;
+
+struct kmem_cgroup;
+extern long *tcp_sysctl_mem(struct kmem_cgroup *sg);
+struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg);
+int *memory_pressure_tcp(struct kmem_cgroup *sg);
+int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
+atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg);
 
 /*
  * The next routines deal with comparing 32 bit unsigned ints
@@ -286,7 +291,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 	}
 
 	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
-	    atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
+	    atomic_long_read(sk_memory_allocated(sk)) > sk_prot_mem(sk)[2])
 		return true;
 	return false;
 }
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 779abb9..44d2191 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -31,13 +31,14 @@ TRACE_EVENT(sock_rcvqueue_full,
 
 TRACE_EVENT(sock_exceed_buf_limit,
 
-	TP_PROTO(struct sock *sk, struct proto *prot, long allocated),
+	TP_PROTO(struct sock *sk, struct proto *prot, long allocated,
+		 long *prot_mem),
 
-	TP_ARGS(sk, prot, allocated),
+	TP_ARGS(sk, prot, allocated, prot_mem),
 
 	TP_STRUCT__entry(
 		__array(char, name, 32)
-		__field(long *, sysctl_mem)
+		__field(long *, prot_mem)
 		__field(long, allocated)
 		__field(int, sysctl_rmem)
 		__field(int, rmem_alloc)
@@ -45,7 +46,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
 
 	TP_fast_assign(
 		strncpy(__entry->name, prot->name, 32);
-		__entry->sysctl_mem = prot->sysctl_mem;
+		__entry->prot_mem = prot_mem;
 		__entry->allocated = allocated;
 		__entry->sysctl_rmem = prot->sysctl_rmem[0];
 		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
@@ -54,9 +55,9 @@
 	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
 		  "sysctl_rmem=%d rmem_alloc=%d",
 		  __entry->name,
-		  __entry->sysctl_mem[0],
-		  __entry->sysctl_mem[1],
-		  __entry->sysctl_mem[2],
+		  __entry->prot_mem[0],
+		  __entry->prot_mem[1],
+		  __entry->prot_mem[2],
		  __entry->allocated,
		  __entry->sysctl_rmem,
		  __entry->rmem_alloc)
diff --git a/init/Kconfig b/init/Kconfig
index d627783..ed3019c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -690,6 +690,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
 	  select this option (if, for some reason, they need to disable it
 	  then swapaccount=0 does the trick).
 
+config CGROUP_KMEM
+	bool "Kernel Memory Resource Controller for Control Groups"
+	depends on CGROUPS
+	help
+	  The Kernel Memory cgroup can limit the amount of memory used by
+	  certain kernel objects in the system. Those are fundamentally
+	  different from the entities handled by the Memory Controller,
+	  which are page-based, and can be swapped. Users of the kmem
+	  cgroup can use it to guarantee that no group of processes will
+	  ever exhaust kernel resources alone.
+
 config CGROUP_PERF
 	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
 	depends on PERF_EVENTS && CGROUPS
diff --git a/mm/Makefile b/mm/Makefile
index 836e416..1b1aa24 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_KMEM) += kmem_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
diff --git a/mm/kmem_cgroup.c b/mm/kmem_cgroup.c
new file mode 100644
index 0000000..d2a86dd
--- /dev/null
+++ b/mm/kmem_cgroup.c
@@ -0,0 +1,53 @@
+/* kmem_cgroup.c - Kernel Memory Controller
+ *
+ * Copyright Parallels Inc, 2011
+ * Author: Glauber Costa
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kmem_cgroup.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+
+static int kmem_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	return sockets_populate(ss, cgrp);
+}
+
+static void
+kmem_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct kmem_cgroup *sk = cgroup_sk(cgrp);
+	kfree(sk);
+}
+
+static struct cgroup_subsys_state *kmem_create(
+	struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct kmem_cgroup *sk = kzalloc(sizeof(*sk), GFP_KERNEL);
+
+	if (!sk)
+		return ERR_PTR(-ENOMEM);
+
+	if (cgrp->parent)
+		sk->parent = cgroup_sk(cgrp->parent);
+
+	return &sk->css;
+}
+
+struct cgroup_subsys kmem_subsys = {
+	.name = "kmem",
+	.create = kmem_create,
+	.destroy = kmem_destroy,
+	.populate = kmem_populate,
+	.subsys_id = kmem_subsys_id,
+};
diff --git a/net/core/sock.c b/net/core/sock.c
index bc745d0..2b748d5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -134,6 +134,25 @@
 #include <net/tcp.h>
 #endif
 
+static DEFINE_RWLOCK(proto_list_lock);
+static LIST_HEAD(proto_list);
+
+int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct proto *proto;
+	int ret = 0;
+
+	read_lock(&proto_list_lock);
+	list_for_each_entry(proto, &proto_list, node) {
+		if (proto->init_cgroup) {
+			ret |= proto->init_cgroup(cgrp, ss);
+		}
+	}
+	read_unlock(&proto_list_lock);
+
+	return ret;
+}
+
 /*
  * Each address family might have different locking rules, so we have
  * one slock key per address family:
@@ -1114,6 +1133,16 @@ void sock_update_classid(struct sock *sk)
 		sk->sk_classid = classid;
 }
 EXPORT_SYMBOL(sock_update_classid);
+
+void sock_update_cgrp(struct sock *sk)
+{
+#ifdef CONFIG_CGROUP_KMEM
+	rcu_read_lock();
+	sk->sk_cgrp = task_sk(current);
+	rcu_read_unlock();
+#endif
+}
+
 #endif
 
 /**
@@ -1141,6 +1170,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
 		sock_update_classid(sk);
+		sock_update_cgrp(sk);
 	}
 
 	return sk;
@@ -1289,8 +1319,8 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		sk_set_socket(newsk, NULL);
 		newsk->sk_wq = NULL;
 
-		if (newsk->sk_prot->sockets_allocated)
-			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+		if (sk_sockets_allocated(sk))
+			percpu_counter_inc(sk_sockets_allocated(sk));
 
 		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
 		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -1678,29 +1708,50 @@ EXPORT_SYMBOL(sk_wait_data);
  */
 int __sk_mem_schedule(struct sock *sk, int size, int kind)
 {
-	struct proto *prot = sk->sk_prot;
 	int amt = sk_mem_pages(size);
+	struct proto *prot = sk->sk_prot;
 	long allocated;
+	int *memory_pressure;
+	long *prot_mem;
+	int parent_failure = 0;
+	struct kmem_cgroup *sg = sk->sk_cgrp;
 
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-	allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+	memory_pressure = sk_memory_pressure(sk);
+	prot_mem = sk_prot_mem(sk);
+
+	allocated = atomic_long_add_return(amt, sk_memory_allocated(sk));
+
+#ifdef CONFIG_CGROUP_KMEM
+	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
+		long alloc;
+		/*
+		 * Large nestings are not the common case, and stopping in the
+		 * middle would be complicated enough, that we bill it all the
+		 * way through the root, and if needed, unbill everything later
+		 */
+		alloc = atomic_long_add_return(amt, sg_memory_allocated(prot, sg));
+		parent_failure |= (alloc > sk_prot_mem(sk)[2]);
+	}
+#endif
+
+	/* Over hard limit (we, or our parents) */
+	if (parent_failure || (allocated > prot_mem[2]))
+		goto suppress_allocation;
 
 	/* Under limit. */
-	if (allocated <= prot->sysctl_mem[0]) {
-		if (prot->memory_pressure && *prot->memory_pressure)
-			*prot->memory_pressure = 0;
+	if (allocated <= prot_mem[0]) {
+		if (memory_pressure && *memory_pressure)
+			*memory_pressure = 0;
 		return 1;
 	}
 
 	/* Under pressure. */
-	if (allocated > prot->sysctl_mem[1])
+	if (allocated > prot_mem[1])
 		if (prot->enter_memory_pressure)
 			prot->enter_memory_pressure(sk);
 
-	/* Over hard limit. */
-	if (allocated > prot->sysctl_mem[2])
-		goto suppress_allocation;
-
 	/* guarantee minimum buffer size under pressure */
 	if (kind == SK_MEM_RECV) {
 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
@@ -1714,13 +1765,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 			return 1;
 	}
 
-	if (prot->memory_pressure) {
+	if (memory_pressure) {
 		int alloc;
 
-		if (!*prot->memory_pressure)
+		if (!*memory_pressure)
 			return 1;
-		alloc = percpu_counter_read_positive(prot->sockets_allocated);
-		if (prot->sysctl_mem[2] > alloc *
+		alloc = percpu_counter_read_positive(sk_sockets_allocated(sk));
+		if (prot_mem[2] > alloc *
 		    sk_mem_pages(sk->sk_wmem_queued +
 				 atomic_read(&sk->sk_rmem_alloc) +
 				 sk->sk_forward_alloc))
@@ -1739,11 +1790,19 @@ suppress_allocation:
 		return 1;
 	}
 
-	trace_sock_exceed_buf_limit(sk, prot, allocated);
+	trace_sock_exceed_buf_limit(sk, prot, allocated, prot_mem);
 
 	/* Alas. Undo changes. */
 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-	atomic_long_sub(amt, prot->memory_allocated);
+
+	atomic_long_sub(amt, sk_memory_allocated(sk));
+
+#ifdef CONFIG_CGROUP_KMEM
+	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
+		atomic_long_sub(amt, sg_memory_allocated(prot, sg));
+	}
+#endif
+
 	return 0;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1755,14 +1814,25 @@ EXPORT_SYMBOL(__sk_mem_schedule);
 void __sk_mem_reclaim(struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
+	struct kmem_cgroup *sg = sk->sk_cgrp;
+	int *memory_pressure = sk_memory_pressure(sk);
+
 	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
-		   prot->memory_allocated);
+		   sk_memory_allocated(sk));
+
+#ifdef CONFIG_CGROUP_KMEM
+	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
+		atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+				sg_memory_allocated(prot, sg));
+	}
+#endif
+
 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
 
-	if (prot->memory_pressure && *prot->memory_pressure &&
-	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
-		*prot->memory_pressure = 0;
+	if (memory_pressure && *memory_pressure &&
+	    (atomic_long_read(sk_memory_allocated(sk)) < sk_prot_mem(sk)[0]))
+		*memory_pressure = 0;
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
@@ -2254,9 +2324,6 @@ void sk_common_release(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_common_release);
 
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
-
 #ifdef CONFIG_PROC_FS
 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
 struct prot_inuse {
@@ -2481,13 +2548,15 @@ static char proto_method_implemented(const void *method)
 
 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
 {
+	struct kmem_cgroup *sg = task_sk(current);
+
 	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
 		   proto->name,
 		   proto->obj_size,
 		   sock_prot_inuse_get(seq_file_net(seq), proto),
-		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
-		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+		   proto->memory_allocated != NULL ? atomic_long_read(sg_memory_allocated(proto, sg)) : -1L,
+		   proto->memory_pressure != NULL ? *sg_memory_pressure(proto, sg) ? "yes" : "no" : "NI",
 		   proto->max_header,
 		   proto->slab == NULL ? "no" : "yes",
 		   module_name(proto->owner),
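The charge and uncharge paths above bill the socket's own group and then every ancestor, and unbill the whole chain if any level goes over its hard limit. A stand-alone toy model of that strategy (plain userspace C, heavily simplified, not part of the patch):

/* Toy illustration of bill-all-parents-then-unbill accounting. */
#include <stdio.h>
#include <stdbool.h>

struct group {
	const char *name;
	long allocated;		/* pages currently billed */
	long hard_limit;	/* analogue of prot_mem[2] */
	struct group *parent;
};

static bool charge(struct group *sg, long amt)
{
	struct group *g;
	bool failed = false;

	/* Bill all the way through the root, remembering any failure. */
	for (g = sg; g; g = g->parent) {
		g->allocated += amt;
		failed |= g->allocated > g->hard_limit;
	}
	if (!failed)
		return true;
	/* Someone exceeded its limit: unbill everything. */
	for (g = sg; g; g = g->parent)
		g->allocated -= amt;
	return false;
}

int main(void)
{
	struct group root  = { "root",  0, 1000, NULL  };
	struct group child = { "child", 0,  100, &root };

	printf("charge 80: %s\n", charge(&child, 80) ? "ok" : "rejected");
	printf("charge 50: %s\n", charge(&child, 50) ? "ok" : "rejected");
	printf("child=%ld root=%ld\n", child.allocated, root.allocated);
	return 0;
}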
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b14ec7d..e8e8889 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -53,19 +53,21 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	struct net *net = seq->private;
 	int orphans, sockets;
 
+	struct kmem_cgroup *sg = task_sk(current);
+
 	local_bh_disable();
 	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
-	sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
+	sockets = percpu_counter_sum_positive(sg_sockets_allocated((&tcp_prot), sg));
 	local_bh_enable();
 
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
 		   tcp_death_row.tw_count, sockets,
-		   atomic_long_read(&tcp_memory_allocated));
+		   atomic_long_read(sg_memory_allocated((&tcp_prot), sg)));
 	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
-		   atomic_long_read(&udp_memory_allocated));
+		   atomic_long_read(sg_memory_allocated((&udp_prot), sg)));
 	seq_printf(seq, "UDPLITE: inuse %d\n",
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 69fd720..9ce7e75 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -14,6 +14,8 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/nsproxy.h>
+#include <linux/swap.h>
+#include <linux/kmem_cgroup.h>
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
@@ -174,6 +176,43 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
 	return ret;
 }
 
+static int ipv4_tcp_mem(ctl_table *ctl, int write,
+			void __user *buffer, size_t *lenp,
+			loff_t *ppos)
+{
+	int ret;
+	unsigned long vec[3];
+	struct kmem_cgroup *kmem = task_sk(current);
+	struct net *net = current->nsproxy->net_ns;
+	int i;
+
+	ctl_table tmp = {
+		.data = &vec,
+		.maxlen = sizeof(vec),
+		.mode = ctl->mode,
+	};
+
+	if (!write) {
+		ctl->data = &net->ipv4.sysctl_tcp_mem;
+		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
+	}
+
+	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < 3; i++)
+		if (vec[i] > kmem->tcp_max_memory)
+			return -EINVAL;
+
+	for (i = 0; i < 3; i++) {
+		net->ipv4.sysctl_tcp_mem[i] = vec[i];
+		kmem->tcp_prot_mem[i] = net->ipv4.sysctl_tcp_mem[i];
+	}
+
+	return 0;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname	= "tcp_timestamps",
@@ -433,13 +472,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.procname	= "tcp_mem",
-		.data		= &sysctl_tcp_mem,
-		.maxlen		= sizeof(sysctl_tcp_mem),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax
-	},
-	{
 		.procname	= "tcp_wmem",
 		.data		= &sysctl_tcp_wmem,
 		.maxlen		= sizeof(sysctl_tcp_wmem),
@@ -721,6 +753,12 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_ping_group_range,
 	},
+	{
+		.procname	= "tcp_mem",
+		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
+		.mode		= 0644,
+		.proc_handler	= ipv4_tcp_mem,
+	},
 	{ }
 };
 
@@ -734,6 +772,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
 static __net_init int ipv4_sysctl_init_net(struct net *net)
 {
 	struct ctl_table *table;
+	unsigned long limit;
 
 	table = ipv4_net_table;
 	if (!net_eq(net, &init_net)) {
@@ -769,6 +808,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 
 	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
 
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
+	net->ipv4.sysctl_tcp_mem[1] = limit;
+	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+
 	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
 			net_ipv4_ctl_path, table);
 	if (net->ipv4.ipv4_hdr == NULL)
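The per-netns defaults above reproduce what tcp_init() used to compute globally: the pressure threshold is one eighth of the free buffer pages (at least 128), the minimum is three quarters of that, and the hard limit is twice the minimum. A stand-alone check of the arithmetic (userspace C; the page count is an arbitrary example):

/* Worked example of the default tcp_mem computation, not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned long free_pages = 262144;	/* example: 1 GiB of 4 KiB pages */
	unsigned long limit = free_pages / 8;
	unsigned long tcp_mem[3];

	if (limit < 128)
		limit = 128;

	tcp_mem[0] = limit / 4 * 3;	/* min: 24576      */
	tcp_mem[1] = limit;		/* pressure: 32768 */
	tcp_mem[2] = tcp_mem[0] * 2;	/* max: 49152      */

	printf("tcp_mem = %lu %lu %lu pages\n", tcp_mem[0], tcp_mem[1], tcp_mem[2]);
	return 0;
}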
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 46febca..beec487 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -266,6 +266,7 @@
 #include <linux/crypto.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/kmem_cgroup.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -282,23 +283,12 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
-long sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
-EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
-atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
-EXPORT_SYMBOL(tcp_memory_allocated);
-
-/*
- * Current number of TCP sockets.
- */
-struct percpu_counter tcp_sockets_allocated;
-EXPORT_SYMBOL(tcp_sockets_allocated);
-
 /*
  * TCP splice context
  */
@@ -308,17 +298,141 @@ struct tcp_splice_state {
 	unsigned int flags;
 };
 
+#ifdef CONFIG_CGROUP_KMEM
 /*
  * Pressure flag: try to collapse.
  * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
-int tcp_memory_pressure __read_mostly;
-EXPORT_SYMBOL(tcp_memory_pressure);
-
 void tcp_enter_memory_pressure(struct sock *sk)
 {
+	struct kmem_cgroup *sg = sk->sk_cgrp;
+	if (!sg->tcp_memory_pressure) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
+		sg->tcp_memory_pressure = 1;
+	}
+}
+EXPORT_SYMBOL(tcp_enter_memory_pressure);
+
+long *tcp_sysctl_mem(struct kmem_cgroup *sg)
+{
+	return sg->tcp_prot_mem;
+}
+EXPORT_SYMBOL(tcp_sysctl_mem);
+
+atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &(sg->tcp_memory_allocated);
+}
+EXPORT_SYMBOL(memory_allocated_tcp);
+
+static int tcp_write_maxmem(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct kmem_cgroup *sg = cgroup_sk(cgrp);
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
+	/*
+	 * We can't allow more memory than our parents. Since this
+	 * will be tested for all calls, by induction, there is no need
+	 * to test any parent other than our own
+	 * */
+	if (sg->parent && (val > sg->parent->tcp_max_memory))
+		val = sg->parent->tcp_max_memory;
+
+	sg->tcp_max_memory = val;
+
+	sg->tcp_prot_mem[0] = val / 2;
+	sg->tcp_prot_mem[1] = (val * 2) / 3;
+	sg->tcp_prot_mem[2] = val;
+
+	cgroup_unlock();
+
+	return 0;
+}
+
+static u64 tcp_read_maxmem(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct kmem_cgroup *sg = cgroup_sk(cgrp);
+	u64 ret;
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	ret = sg->tcp_max_memory;
+
+	cgroup_unlock();
+	return ret;
+}
+
+static struct cftype tcp_files[] = {
+	{
+		.name = "tcp_maxmem",
+		.write_u64 = tcp_write_maxmem,
+		.read_u64 = tcp_read_maxmem,
+	},
+};
+
+int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct kmem_cgroup *sg = cgroup_sk(cgrp);
+	unsigned long limit;
+	struct net *net = current->nsproxy->net_ns;
+
+	sg->tcp_memory_pressure = 0;
+
+	percpu_counter_init(&sg->tcp_sockets_allocated, 0);
+	atomic_long_set(&sg->tcp_memory_allocated, 0);
+
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+
+	if (sg->parent)
+		sg->tcp_max_memory = sg->parent->tcp_max_memory;
+	else
+		sg->tcp_max_memory = limit * 2;
+
+	sg->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
+	sg->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
+	sg->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
+
+	return cgroup_add_files(cgrp, ss, tcp_files, ARRAY_SIZE(tcp_files));
+}
+EXPORT_SYMBOL(tcp_init_cgroup);
+
+int *memory_pressure_tcp(struct kmem_cgroup *sg)
+{
+	return &sg->tcp_memory_pressure;
+}
+EXPORT_SYMBOL(memory_pressure_tcp);
+
+struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &sg->tcp_sockets_allocated;
+}
+EXPORT_SYMBOL(sockets_allocated_tcp);
+#else
+
+/* Current number of TCP sockets. */
+struct percpu_counter tcp_sockets_allocated;
+atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
+int tcp_memory_pressure;
+
+int *memory_pressure_tcp(struct kmem_cgroup *sg)
+{
+	return &tcp_memory_pressure;
+}
+EXPORT_SYMBOL(memory_pressure_tcp);
+
+struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &tcp_sockets_allocated;
+}
+EXPORT_SYMBOL(sockets_allocated_tcp);
+
+void tcp_enter_memory_pressure(struct sock *sk)
+{
 	if (!tcp_memory_pressure) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
 		tcp_memory_pressure = 1;
@@ -326,6 +440,19 @@ void tcp_enter_memory_pressure(struct sock *sk)
 	}
 }
 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 
+long *tcp_sysctl_mem(struct kmem_cgroup *sg)
+{
+	return init_net.ipv4.sysctl_tcp_mem;
+}
+EXPORT_SYMBOL(tcp_sysctl_mem);
+
+atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &tcp_memory_allocated;
+}
+EXPORT_SYMBOL(memory_allocated_tcp);
+#endif /* CONFIG_CGROUP_KMEM */
+
 /* Convert seconds to retransmits based on initial and max timeout */
 static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
 {
@@ -3226,7 +3353,9 @@ void __init tcp_init(void)
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
+#ifndef CONFIG_CGROUP_KMEM
 	percpu_counter_init(&tcp_sockets_allocated, 0);
+#endif
 	percpu_counter_init(&tcp_orphan_count, 0);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
@@ -3277,14 +3406,8 @@ void __init tcp_init(void)
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
-	limit = nr_free_buffer_pages() / 8;
-	limit = max(limit, 128UL);
-	sysctl_tcp_mem[0] = limit / 4 * 3;
-	sysctl_tcp_mem[1] = limit;
-	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
-
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
-	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
+	limit = ((unsigned long)init_net.ipv4.sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
 	max_share = min(4UL*1024*1024, limit);
 
 	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
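A write to kmem.tcp_maxmem above derives the three per-cgroup thresholds from a single value: half of it, two thirds of it, and the value itself, after clamping to the parent's limit. A stand-alone illustration of that arithmetic (userspace C, not part of the patch; the figures are arbitrary):

/* Worked example of the tcp_maxmem -> tcp_prot_mem[] derivation. */
#include <stdio.h>

int main(void)
{
	unsigned long long val = 12288;		/* pages written to kmem.tcp_maxmem */
	unsigned long long parent_max = 8192;	/* parent's tcp_max_memory */

	if (val > parent_max)			/* a child never exceeds its parent */
		val = parent_max;

	printf("tcp_prot_mem = %llu %llu %llu\n",
	       val / 2, (val * 2) / 3, val);	/* prints: 4096 5461 8192 */
	return 0;
}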
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ea0d218..c44e830 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -316,7 +316,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
 	/* Check #1 */
 	if (tp->rcv_ssthresh < tp->window_clamp &&
 	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
-	    !tcp_memory_pressure) {
+	    !sk_memory_pressure(sk)) {
 		int incr;
 
 		/* Check #2. Increase window, if skb with such overhead
@@ -393,15 +393,16 @@ static void tcp_clamp_window(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct proto *prot = sk->sk_prot;
 
 	icsk->icsk_ack.quick = 0;
 
-	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+	if (sk->sk_rcvbuf < prot->sysctl_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-	    !tcp_memory_pressure &&
-	    atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+	    !sk_memory_pressure(sk) &&
+	    atomic_long_read(sk_memory_allocated(sk)) < sk_prot_mem(sk)[0]) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
-				    sysctl_tcp_rmem[2]);
+				    prot->sysctl_rmem[2]);
 	}
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -4806,7 +4807,7 @@ static int tcp_prune_queue(struct sock *sk)
 
 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		tcp_clamp_window(sk);
-	else if (tcp_memory_pressure)
+	else if (sk_memory_pressure(sk))
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
 	tcp_collapse_ofo_queue(sk);
@@ -4872,11 +4873,11 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
 		return 0;
 
 	/* If we are under global TCP memory pressure, do not expand.  */
-	if (tcp_memory_pressure)
+	if (sk_memory_pressure(sk))
 		return 0;
 
 	/* If we are under soft global TCP memory pressure, do not expand. */
-	if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+	if (atomic_long_read(sk_memory_allocated(sk)) >= sk_prot_mem(sk)[0])
 		return 0;
 
 	/* If we filled the congestion window, do not expand.  */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1c12b8e..88034a3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1901,7 +1901,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
 	local_bh_disable();
-	percpu_counter_inc(&tcp_sockets_allocated);
+	percpu_counter_inc(sk_sockets_allocated(sk));
 	local_bh_enable();
 
 	return 0;
@@ -1957,7 +1957,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		tp->cookie_values = NULL;
 	}
 
-	percpu_counter_dec(&tcp_sockets_allocated);
+	percpu_counter_dec(sk_sockets_allocated(sk));
 }
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
@@ -2598,11 +2598,14 @@ struct proto tcp_prot = {
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
-	.sockets_allocated	= &tcp_sockets_allocated,
+	.memory_pressure	= memory_pressure_tcp,
+	.sockets_allocated	= sockets_allocated_tcp,
 	.orphan_count		= &tcp_orphan_count,
-	.memory_allocated	= &tcp_memory_allocated,
-	.memory_pressure	= &tcp_memory_pressure,
-	.sysctl_mem		= sysctl_tcp_mem,
+	.memory_allocated	= memory_allocated_tcp,
+#ifdef CONFIG_CGROUP_KMEM
+	.init_cgroup		= tcp_init_cgroup,
+#endif
+	.prot_mem		= tcp_sysctl_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 882e0b0..06aeb31 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1912,7 +1912,7 @@ u32 __tcp_select_window(struct sock *sk)
 	if (free_space < (full_space >> 1)) {
 		icsk->icsk_ack.quick = 0;
 
-		if (tcp_memory_pressure)
+		if (sk_memory_pressure(sk))
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
 					       4U * tp->advmss);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ecd44b0..2c67617 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -261,7 +261,7 @@ static void tcp_delack_timer(unsigned long data)
 	}
 
 out:
-	if (tcp_memory_pressure)
+	if (sk_memory_pressure(sk))
 		sk_mem_reclaim(sk);
 out_unlock:
 	bh_unlock_sock(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1b5a193..258f137 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -120,9 +120,6 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
 int sysctl_udp_wmem_min __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_wmem_min);
 
-atomic_long_t udp_memory_allocated;
-EXPORT_SYMBOL(udp_memory_allocated);
-
 #define MAX_UDP_PORTS		65536
 #define PORTS_PER_CHAIN		(MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
@@ -1918,6 +1915,24 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 }
 EXPORT_SYMBOL(udp_poll);
 
+#ifdef CONFIG_CGROUP_KMEM
+static atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg)
+{
+	return &sg->udp_memory_allocated;
+}
+#else
+atomic_long_t udp_memory_allocated;
+static atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg)
+{
+	return &udp_memory_allocated;
+}
+#endif
+
+static long *udp_sysctl_mem(struct kmem_cgroup *sg)
+{
+	return sysctl_udp_mem;
+}
+
 struct proto udp_prot = {
 	.name		   = "UDP",
 	.owner		   = THIS_MODULE,
@@ -1936,8 +1951,8 @@ struct proto udp_prot = {
 	.unhash		   = udp_lib_unhash,
 	.rehash		   = udp_v4_rehash,
 	.get_port	   = udp_v4_get_port,
-	.memory_allocated  = &udp_memory_allocated,
-	.sysctl_mem	   = sysctl_udp_mem,
+	.memory_allocated  = memory_allocated_udp,
+	.prot_mem	   = udp_sysctl_mem,
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),