[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Tue, 7 Jul 2009 12:33:25 -0600
From: "Tantilov, Emil S" <emil.s.tantilov@...el.com>
To: Eric Dumazet <eric.dumazet@...il.com>,
"David S. Miller" <davem@...emloft.net>
CC: Emil S Tantilov <emils.tantilov@...il.com>,
NetDev <netdev@...r.kernel.org>,
"Brandeburg, Jesse" <jesse.brandeburg@...el.com>,
"Kirsher, Jeffrey T" <jeffrey.t.kirsher@...el.com>,
Jiri Olsa <jolsa@...hat.com>
Subject: RE: [PATCH] net: sk_alloc() should not blindly overwrite memory
Eric Dumazet wrote:
> Eric Dumazet a écrit :
>>
>>
>> Hold on, I found the problem and will submit a patch (after testing
>> it) in following hours
>>
>
> David, while looking at Emil case, I found following problem.
>
> I am not sure this bug is responsible for the BUG hit by Emil,
> but it seems a candidate for stable as well...
>
> Thanks
>
> [PATCH] net: sk_alloc() should not blindly overwrite memory
>
> Some sockets use SLAB_DESTROY_BY_RCU, and our RCU code rely that some
> fields should not be blindly overwritten, even with null.
>
> These fields are sk->sk_refcnt and sk->sk_nulls_node.next
>
> Current sk_alloc() implementation doesnt respect this hypothesis,
> calling kmem_alloc with __GFP_ZERO and setting sk_refcnt to 1 instead
> of atomically increment it.
>
> Reported-by: Emil S Tantilov <emils.tantilov@...il.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@...il.com>
> ---
> include/net/sock.h | 24 +++++++++++-------
> net/core/sock.c | 41 ++++++++++++++++++++++++++++----
> net/ipv4/inet_timewait_sock.c | 2 -
> 3 files changed, 52 insertions(+), 15 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 352f06b..3f6284e 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -103,15 +103,15 @@ struct net;
>
> /**
> * struct sock_common - minimal network layer representation of
> sockets + * @skc_node: main hash linkage for various protocol lookup
> tables + * @skc_nulls_node: main hash linkage for UDP/UDP-Lite
> protocol + * @skc_refcnt: reference count
> + * @skc_hash: hash value used with various protocol lookup tables
> * @skc_family: network address family
> * @skc_state: Connection state
> * @skc_reuse: %SO_REUSEADDR setting
> * @skc_bound_dev_if: bound device index if != 0
> - * @skc_node: main hash linkage for various protocol lookup tables
> - * @skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
> * @skc_bind_node: bind hash linkage for various protocol lookup
> tables
> - * @skc_refcnt: reference count
> - * @skc_hash: hash value used with various protocol lookup tables
> * @skc_prot: protocol handlers inside a network family
> * @skc_net: reference to the network namespace of this socket
> *
> @@ -119,17 +119,23 @@ struct net;
> * for struct sock and struct inet_timewait_sock.
> */
> struct sock_common {
> - unsigned short skc_family;
> - volatile unsigned char skc_state;
> - unsigned char skc_reuse;
> - int skc_bound_dev_if;
> + /*
> + * WARNING : skc_node, skc_refcnt must be first elements of this
> struct + * (sk_prot_alloc() should not clear skc_node.next &
> skc_refcnt because + * of RCU lookups)
> + */
> union {
> struct hlist_node skc_node;
> struct hlist_nulls_node skc_nulls_node;
> };
> - struct hlist_node skc_bind_node;
> atomic_t skc_refcnt;
> +
> unsigned int skc_hash;
> + unsigned short skc_family;
> + volatile unsigned char skc_state;
> + unsigned char skc_reuse;
> + int skc_bound_dev_if;
> + struct hlist_node skc_bind_node;
> struct proto *skc_prot;
> #ifdef CONFIG_NET_NS
> struct net *skc_net;
> diff --git a/net/core/sock.c b/net/core/sock.c
> index b0ba569..daadcc5 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -994,8 +994,25 @@ struct sock *sk_alloc(struct net *net, int
> family, gfp_t priority, {
> struct sock *sk;
>
> - sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
> + sk = sk_prot_alloc(prot, priority, family);
> if (sk) {
> + /*
> + * caches using SLAB_DESTROY_BY_RCU should let sk_node
> + * and sk_refcnt untouched.
> + *
> + * Following makes sense only if sk first fields are
> + * in following order : sk_node, refcnt, sk_hash
> + */
> + BUILD_BUG_ON(offsetof(struct sock, sk_node) != 0);
> + BUILD_BUG_ON(offsetof(struct sock, sk_refcnt) !=
> + sizeof(sk->sk_node));
> + BUILD_BUG_ON(offsetof(struct sock, sk_hash) !=
> + sizeof(sk->sk_node) + sizeof(sk->sk_refcnt));
> + if (prot->slab_flags & SLAB_DESTROY_BY_RCU)
> + memset(&sk->sk_hash, 0,
> + prot->obj_size - offsetof(struct sock, sk_hash));
> + else
> + memset(sk, 0, prot->obj_size);
> sk->sk_family = family;
> /*
> * See comment in struct sock definition to understand
> @@ -1125,7 +1142,7 @@ struct sock *sk_clone(const struct sock *sk,
> const gfp_t priority)
>
> newsk->sk_err = 0;
> newsk->sk_priority = 0;
> - atomic_set(&newsk->sk_refcnt, 2);
> + atomic_add(2, &newsk->sk_refcnt);
>
> /*
> * Increment the counter in the same struct proto as the master
> @@ -1840,7 +1857,7 @@ void sock_init_data(struct socket *sock, struct
> sock *sk)
>
> sk->sk_stamp = ktime_set(-1L, 0);
>
> - atomic_set(&sk->sk_refcnt, 1);
> + atomic_inc(&sk->sk_refcnt);
> atomic_set(&sk->sk_wmem_alloc, 1);
> atomic_set(&sk->sk_drops, 0);
> }
> @@ -2140,12 +2157,25 @@ static inline void release_proto_idx(struct
> proto *prot) }
> #endif
>
> +/*
> + * We need to initialize skc->skc_refcnt to 0, and
> skc->skc_node.pprev to NULL + * but only on newly allocated objects
> (check sk_prot_alloc()) + */
> +static void sk_init_once(void *arg)
> +{
> + struct sock_common *skc = arg;
> +
> + atomic_set(&skc->skc_refcnt, 0);
> + skc->skc_node.pprev = NULL;
> +}
> +
> int proto_register(struct proto *prot, int alloc_slab)
> {
> if (alloc_slab) {
> prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
> SLAB_HWCACHE_ALIGN | prot->slab_flags,
> - NULL);
> + prot->slab_flags & SLAB_DESTROY_BY_RCU ?
> + sk_init_once : NULL);
>
> if (prot->slab == NULL) {
> printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
> @@ -2187,7 +2217,8 @@ int proto_register(struct proto *prot, int
> alloc_slab) 0,
> SLAB_HWCACHE_ALIGN |
> prot->slab_flags,
> - NULL);
> + prot->slab_flags & SLAB_DESTROY_BY_RCU ?
> + sk_init_once : NULL);
> if (prot->twsk_prot->twsk_slab == NULL)
> goto out_free_timewait_sock_slab_name;
> }
> diff --git a/net/ipv4/inet_timewait_sock.c
> b/net/ipv4/inet_timewait_sock.c
> index 61283f9..a2997df 100644
> --- a/net/ipv4/inet_timewait_sock.c
> +++ b/net/ipv4/inet_timewait_sock.c
> @@ -139,7 +139,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const
> struct sock *sk, const int stat tw->tw_transparent =
> inet->transparent; tw->tw_prot = sk->sk_prot_creator;
> twsk_net_set(tw, hold_net(sock_net(sk)));
> - atomic_set(&tw->tw_refcnt, 1);
> + atomic_inc(&tw->tw_refcnt);
> inet_twsk_dead_node_init(tw);
> __module_get(tw->tw_prot->owner);
> }
Eric,
With this patch applied, I get panic on boot:
[ 5.334653] BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
[ 5.344225] IP: [<ffffffff811ed364>] selinux_socket_post_create+0x78/0x95
[ 5.352263] PGD 0
[ 5.354952] Oops: 0002 [#1] SMP DEBUG_PAGEALLOC
[ 5.360599] last sysfs file:
[ 5.364292] CPU 0
[ 5.366985] Modules linked in:
[ 5.370836] Pid: 1, comm: swapper Not tainted 2.6.31-rc1-net-2.6-igb-ed-patch-07071123 #2 S5520HC
[ 5.381465] RIP: 0010:[<ffffffff811ed364>] [<ffffffff811ed364>] selinux_socket_post_create+0x78/0x95
[ 5.392551] RSP: 0018:ffff8801ef0a5d60 EFLAGS: 00010286
[ 5.398871] RAX: 0000000000000001 RBX: ffff88036e5ace80 RCX: 0000000000000006
[ 5.407227] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000002
[ 5.415587] RBP: ffff8801ef0a5d80 R08: 0000000000000001 R09: 0000000000000000
[ 5.423945] R10: ffff88036e000000 R11: ffff8801ef0a5c20 R12: ffff88036ec09c80
[ 5.432300] R13: 0000000000000002 R14: 0000000000000002 R15: 0000000000000003
[ 5.440662] FS: 0000000000000000(0000) GS:ffffc90000000000(0000) knlGS:0000000000000000
[ 5.450386] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[ 5.457189] CR2: 0000000000000010 CR3: 0000000001001000 CR4: 00000000000006b0
[ 5.465545] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 5.473908] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 5.482267] Process swapper (pid: 1, threadinfo ffff8801ef0a4000, task ffff88036f040000)
[ 5.491994] Stack:
[ 5.494606] 01ffffff816d0040 00000000ffffff9f ffffffff817244a0 ffff88036ec09c80
[ 5.503262] <0> ffff8801ef0a5d90 ffffffff811e5ef0 ffff8801ef0a5df0 ffffffff8138c041
[ 5.512853] <0> ffffffff8138bf93 00000001ef0a5de0 ffff8801ef0a5e18 0000000681478fb3
[ 5.523046] Call Trace:
[ 5.526158] [<ffffffff811e5ef0>] security_socket_post_create+0x11/0x13
[ 5.533932] [<ffffffff8138c041>] __sock_create+0x1a4/0x1ff
[ 5.540541] [<ffffffff8138bf93>] ? __sock_create+0xf6/0x1ff
[ 5.547243] [<ffffffff8184028c>] ? inet_init+0x0/0x204
[ 5.553463] [<ffffffff8138c0bb>] sock_create_kern+0x1f/0x21
[ 5.560162] [<ffffffff813edd5a>] inet_ctl_sock_create+0x29/0x5d
[ 5.567255] [<ffffffff8184028c>] ? inet_init+0x0/0x204
[ 5.573473] [<ffffffff8183fb60>] tcp_sk_init+0x25/0x27
[ 5.579687] [<ffffffff81396716>] register_pernet_operations+0x18/0x1a
[ 5.587365] [<ffffffff81396824>] register_pernet_subsys+0x29/0x3d
[ 5.594657] [<ffffffff8184028c>] ? inet_init+0x0/0x204
[ 5.600882] [<ffffffff8183fb27>] tcp_v4_init+0x1c/0x30
[ 5.607106] [<ffffffff818403cd>] inet_init+0x141/0x204
[ 5.613333] [<ffffffff8100905c>] do_one_initcall+0x56/0x130
[ 5.620038] [<ffffffff810a2a72>] ? register_irq_proc+0xae/0xca
[ 5.627041] [<ffffffff81130000>] ? d_add+0x16/0x1d
[ 5.632875] [<ffffffff81811915>] kernel_init+0x15e/0x1b4
[ 5.639294] [<ffffffff81028b9a>] child_rip+0xa/0x20
[ 5.645214] [<ffffffff8102857c>] ? restore_args+0x0/0x30
[ 5.651622] [<ffffffff818117b7>] ? kernel_init+0x0/0x1b4
[ 5.658033] [<ffffffff81028b90>] ? child_rip+0x0/0x20
[ 5.664158] Code: ca 44 89 ef e8 d8 b6 ff ff c6 43 22 01 66 89 43 20 31 c0 49 8b 54 24 60 48 85 d2 74 22 8b 43 1c 48 8b 92 a8 03 00 00 41 0f b7 f5 <89> 42 10 8b 43 20 66 89 42 18 49 8b 7c 24 60 e8 de 4a 00 00 41
[ 5.690779] RIP [<ffffffff811ed364>] selinux_socket_post_create+0x78/0x95
[ 5.698921] RSP <ffff8801ef0a5d60>
[ 5.703202] CR2: 0000000000000010
[ 5.707298] ---[ end trace a7919e7f17c0a725 ]---
[ 5.712848] swapper used greatest stack depth: 4680 bytes left
[ 5.727678] Kernel panic - not syncing: Attempted to kill init!
[ 5.734672] Pid: 1, comm: swapper Tainted: G D 2.6.31-rc1-net-2.6-igb-ed-patch-07071123 #2
[ 5.745375] Call Trace:
[ 5.748485] [<ffffffff81057cfd>] panic+0xdb/0x190
[ 5.754217] [<ffffffff8105ae3b>] ? do_exit+0x35f/0x6d8
[ 5.760439] [<ffffffff8105adeb>] ? do_exit+0x30f/0x6d8
[ 5.766659] [<ffffffff8105adeb>] ? do_exit+0x30f/0x6d8
[ 5.772875] [<ffffffff8107bdbb>] ? trace_hardirqs_on+0xd/0xf
[ 5.779688] [<ffffffff8147a295>] ? _write_unlock_irq+0x2b/0x31
[ 5.786690] [<ffffffff8105ab55>] do_exit+0x79/0x6d8
[ 5.792622] [<ffffffff8147b07d>] oops_end+0xb2/0xba
[ 5.798555] [<ffffffff810407b4>] no_context+0x1ef/0x1fe
[ 5.804875] [<ffffffff8107ba3b>] ? mark_lock+0x22/0x1fb
[ 5.811196] [<ffffffff8107af82>] ? save_trace+0x3f/0x96
[ 5.817516] [<ffffffff81040a03>] __bad_area_nosemaphore+0x186/0x1a9
[ 5.825009] [<ffffffff813e5bbf>] ? raw_hash_sk+0x2a/0x73
[ 5.831421] [<ffffffff8147c354>] ? do_page_fault+0xbd/0x22e
[ 5.838128] [<ffffffff81040a9c>] bad_area_nosemaphore+0xe/0x10
[ 5.845132] [<ffffffff8147c3b6>] do_page_fault+0x11f/0x22e
[ 5.851747] [<ffffffff8147a4ef>] page_fault+0x1f/0x30
[ 5.857871] [<ffffffff811ed364>] ? selinux_socket_post_create+0x78/0x95
[ 5.865754] [<ffffffff811e5ef0>] security_socket_post_create+0x11/0x13
[ 5.873536] [<ffffffff8138c041>] __sock_create+0x1a4/0x1ff
[ 5.880149] [<ffffffff8138bf93>] ? __sock_create+0xf6/0x1ff
[ 5.886859] [<ffffffff8184028c>] ? inet_init+0x0/0x204
[ 5.893072] [<ffffffff8138c0bb>] sock_create_kern+0x1f/0x21
[ 5.899774] [<ffffffff813edd5a>] inet_ctl_sock_create+0x29/0x5d
[ 5.906863] [<ffffffff8184028c>] ? inet_init+0x0/0x204
[ 5.913080] [<ffffffff8183fb60>] tcp_sk_init+0x25/0x27
[ 5.919299] [<ffffffff81396716>] register_pernet_operations+0x18/0x1a
[ 5.926986] [<ffffffff81396824>] register_pernet_subsys+0x29/0x3d
[ 5.934274] [<ffffffff8184028c>] ? inet_init+0x0/0x204
[ 5.940504] [<ffffffff8183fb27>] tcp_v4_init+0x1c/0x30
[ 5.946730] [<ffffffff818403cd>] inet_init+0x141/0x204
[ 5.952957] [<ffffffff8100905c>] do_one_initcall+0x56/0x130
[ 5.959668] [<ffffffff810a2a72>] ? register_irq_proc+0xae/0xca
[ 5.966666] [<ffffffff81130000>] ? d_add+0x16/0x1d
[ 5.972495] [<ffffffff81811915>] kernel_init+0x15e/0x1b4
[ 5.978909] [<ffffffff81028b9a>] child_rip+0xa/0x20
[ 5.984845] [<ffffffff8102857c>] ? restore_args+0x0/0x30
[ 5.991266] [<ffffffff818117b7>] ? kernel_init+0x0/0x1b4
[ 5.997686] [<ffffffff81028b90>] ? child_rip+0x0/0x20
Emil
Powered by blists - more mailing lists