[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20161005065244.GA2245@templeofstupid.com>
Date: Tue, 4 Oct 2016 23:52:44 -0700
From: Krister Johansen <kjlx@...pleofstupid.com>
To: Cong Wang <xiyou.wangcong@...il.com>
Cc: Krister Johansen <kjlx@...pleofstupid.com>,
Jamal Hadi Salim <jhs@...atatu.com>,
Linux Kernel Network Developers <netdev@...r.kernel.org>
Subject: Re: [PATCH net] Panic when tc_lookup_action_n finds a partially
initialized action.
On Mon, Oct 03, 2016 at 11:22:33AM -0700, Cong Wang wrote:
> Please try the attached patch. I also convert the read path to RCU
> to avoid a possible deadlock. A quick test shows no lockdep splat.
I tried this patch, but it doesn't solve the problem. I got a panic on
my very first try:
SYSTEM MAP: /boot/System.map-4.7.5-1.el7.x86_64
DEBUG KERNEL: /usr/lib/debug/lib/modules/4.7.5-1.el7.x86_64/vmlinux (4.7.5-1.el7.x86_64)
DUMPFILE: ./vmcore [PARTIAL DUMP]
CPUS: 4
DATE: Tue Oct 4 22:43:45 2016
UPTIME: 00:02:32
LOAD AVERAGE: 0.20, 0.14, 0.06
TASKS: 1637
RELEASE: 4.7.5-1.el7.x86_64
VERSION: #1 SMP Tue Oct 4 21:58:08 PDT 2016
MACHINE: x86_64 (3392 Mhz)
MEMORY: 4 GB
PANIC: "BUG: unable to handle kernel NULL pointer dereference at (null)"
PID: 20151
COMMAND: "test"
TASK: ffff880138f89480 [THREAD_INFO: ffff88009f450000]
CPU: 2
STATE: TASK_RUNNING (PANIC)
PID: 20151 TASK: ffff880138f89480 CPU: 2 COMMAND: "test"
#0 [ffff88009f453368] machine_kexec at ffffffff8105c2db
#1 [ffff88009f4533c8] __crash_kexec at ffffffff81111ca2
#2 [ffff88009f453498] crash_kexec at ffffffff81111d9c
#3 [ffff88009f4534b8] oops_end at ffffffff810309d1
#4 [ffff88009f4534e0] no_context at ffffffff8106a5de
#5 [ffff88009f453540] __bad_area_nosemaphore at ffffffff8106a92e
#6 [ffff88009f453590] bad_area at ffffffff81081b45
#7 [ffff88009f4535b8] __do_page_fault at ffffffff8106b42d
#8 [ffff88009f453620] trace_do_page_fault at ffffffff8106b5a3
#9 [ffff88009f453658] do_async_page_fault at ffffffff81063dda
#10 [ffff88009f453670] async_page_fault at ffffffff81735af8
[exception RIP: tcf_hash_check+18]
RIP: ffffffff81649772 RSP: ffff88009f453720 RFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001
RDX: ffff8801297ee440 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffff88009f453738 R8: ffffffffa04020f0 R9: ffff88009f453770
R10: ffffffff8164a6a1 R11: ffff880129621f00 R12: ffff8801297ee440
R13: ffff8800ba0f9940 R14: ffff880093cd3e6c R15: 0000000000000000
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#11 [ffff88009f453740] tcf_mirred_init at ffffffffa04011cc [act_mirred]
#12 [ffff88009f4537c8] tcf_action_init_1 at ffffffff8164a7c9
#13 [ffff88009f453868] tcf_action_init at ffffffff8164a881
#14 [ffff88009f4539c0] tcf_exts_validate at ffffffff8164868a
#15 [ffff88009f4539e8] u32_set_parms at ffffffffa04153f2 [cls_u32]
#16 [ffff88009f453a60] u32_change at ffffffffa0416bc4 [cls_u32]
#17 [ffff88009f453b50] tc_ctl_tfilter at ffffffff8164900d
#18 [ffff88009f453c48] rtnetlink_rcv_msg at ffffffff8162b8f4
#19 [ffff88009f453cc0] netlink_rcv_skb at ffffffff81650b77
#20 [ffff88009f453ce8] rtnetlink_rcv at ffffffff8162b848
#21 [ffff88009f453d00] netlink_unicast at ffffffff81650531
#22 [ffff88009f453d48] netlink_sendmsg at ffffffff8165091e
#23 [ffff88009f453dc8] sock_sendmsg at ffffffff815fcae8
#24 [ffff88009f453de8] SYSC_sendto at ffffffff815fcfa2
#25 [ffff88009f453f18] sys_sendto at ffffffff815fdb4e
#26 [ffff88009f453f28] do_syscall_64 at ffffffff81003b12
RIP: 00000000004cbfba RSP: 000000c8200a4eb8 RFLAGS: 00000212
RAX: ffffffffffffffda RBX: 000000c820000180 RCX: 00000000004cbfba
RDX: 000000000000008c RSI: 000000c82007a900 RDI: 0000000000000007
RBP: 0000000000000000 R8: 000000c8200fd844 R9: 000000000000000c
R10: 0000000000000000 R11: 0000000000000212 R12: 000000c82007a900
R13: 0000000000000003 R14: 0012492492492492 R15: 0000000000000039
ORIG_RAX: 000000000000002c CS: 0033 SS: 002b
The problem here is the same as before: by using RCU the race isn't
fixed because the module is still discoverable from act_base before the
pernet initialization is completed.
You can see from the trap frame that the first two arguments to
tcf_hash_check were 0. It couldn't look up the correct per-subsystem
pointer because the id hadn't yet been registered.
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index d09d068..4aac846 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> int tcf_register_action(struct tc_action_ops *act,
> struct pernet_operations *ops)
> @@ -341,22 +356,23 @@ int tcf_register_action(struct tc_action_ops *act,
> if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
> return -EINVAL;
>
> - write_lock(&act_mod_lock);
> + mutex_lock(&act_mod_lock);
> list_for_each_entry(a, &act_base, head) {
> if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
> - write_unlock(&act_mod_lock);
> + mutex_unlock(&act_mod_lock);
> return -EEXIST;
> }
> }
> - list_add_tail(&act->head, &act_base);
> - write_unlock(&act_mod_lock);
> + list_add_tail_rcu(&act->head, &act_base);
>
> ret = register_pernet_subsys(ops);
> if (ret) {
> - tcf_unregister_action(act, ops);
> + __tcf_unregister_action(act);
> + mutex_unlock(&act_mod_lock);
> return ret;
> }
>
> + mutex_unlock(&act_mod_lock);
> return 0;
> }
If lookups use rcu, then the list_add_tail_rcu() has to come after
register_pernet_subsys() returns success.
> EXPORT_SYMBOL(tcf_register_action);
> @@ -364,20 +380,13 @@ EXPORT_SYMBOL(tcf_register_action);
> int tcf_unregister_action(struct tc_action_ops *act,
> struct pernet_operations *ops)
> {
> - struct tc_action_ops *a;
> int err = -ENOENT;
>
> + mutex_lock(&act_mod_lock);
> unregister_pernet_subsys(ops);
> + err = __tcf_unregister_action(act);
> + mutex_unlock(&act_mod_lock);
>
> - write_lock(&act_mod_lock);
> - list_for_each_entry(a, &act_base, head) {
> - if (a == act) {
> - list_del(&act->head);
> - err = 0;
> - break;
> - }
> - }
> - write_unlock(&act_mod_lock);
> return err;
> }
Similarly, unregistering the pernet_subsystem before ensuring that the
action has been removed from act_base leads to another version of this
race on module unload. Presumably you need to wait for the rcu grace
period to elapse upon removal before it's actually safe to unregister
the subsystem, since any callers with references to the action will have
the potential for a similar NULL deference if their per-net id suddenly
expires.
> EXPORT_SYMBOL(tcf_unregister_action);
> @@ -388,15 +397,15 @@ static struct tc_action_ops *tc_lookup_action_n(char *kind)
> struct tc_action_ops *a, *res = NULL;
>
> if (kind) {
> - read_lock(&act_mod_lock);
> - list_for_each_entry(a, &act_base, head) {
> + rcu_read_lock();
> + list_for_each_entry_rcu(a, &act_base, head) {
> if (strcmp(kind, a->kind) == 0) {
> if (try_module_get(a->owner))
> res = a;
> break;
> }
> }
> - read_unlock(&act_mod_lock);
> + rcu_read_unlock();
> }
> return res;
> }
> @@ -407,15 +416,15 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
> struct tc_action_ops *a, *res = NULL;
>
> if (kind) {
> - read_lock(&act_mod_lock);
> - list_for_each_entry(a, &act_base, head) {
> + rcu_read_lock();
> + list_for_each_entry_rcu(a, &act_base, head) {
> if (nla_strcmp(kind, a->kind) == 0) {
> if (try_module_get(a->owner))
> res = a;
> break;
> }
> }
> - read_unlock(&act_mod_lock);
> + rcu_read_unlock();
> }
> return res;
> }
Given the fact that RCU tolerates inherent races, but another trip
around the modprobe loop blocks the caller, and forks an unnecessary
process, it's not clear to me that converting this to RCU makes it
simpler, or is the right tool for this particular job.
After testing it, I'm not convinced that this is any less complicated
than the approach that I proposed.
Thanks,
-K
Powered by blists - more mailing lists