Date:	Wed, 21 Sep 2011 20:50:37 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Mike Galbraith <efault@....de>
Cc:	linux-rt-users <linux-rt-users@...r.kernel.org>,
	Thomas Gleixner <tglx@...utronix.de>,
	LKML <linux-kernel@...r.kernel.org>,
	Oleg Nesterov <oleg@...hat.com>,
	Miklos Szeredi <miklos@...redi.hu>, mingo <mingo@...hat.com>
Subject: Re: rt14: strace ->  migrate_disable_atomic imbalance

On Wed, 2011-09-21 at 19:01 +0200, Peter Zijlstra wrote:
> On Wed, 2011-09-21 at 12:17 +0200, Mike Galbraith wrote:
> > [  144.212272] ------------[ cut here ]------------
> > [  144.212280] WARNING: at kernel/sched.c:6152 migrate_disable+0x1b6/0x200()
> > [  144.212282] Hardware name: MS-7502
> > [  144.212283] Modules linked in: snd_pcm_oss snd_mixer_oss snd_seq snd_seq_device edd nfsd lockd parport_pc parport nfs_acl auth_rpcgss sunrpc bridge ipv6 stp cpufreq_conservative microcode cpufreq_ondemand cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf nls_iso8859_1 nls_cp437 vfat fat fuse ext3 jbd dm_mod usbmouse usb_storage usbhid snd_hda_codec_realtek usb_libusual uas sr_mod cdrom hid snd_hda_intel e1000e snd_hda_codec kvm_intel snd_hwdep sg snd_pcm kvm i2c_i801 snd_timer snd firewire_ohci firewire_core soundcore snd_page_alloc crc_itu_t button ext4 mbcache jbd2 crc16 uhci_hcd sd_mod ehci_hcd usbcore rtc_cmos ahci libahci libata scsi_mod fan processor thermal
> > [  144.212317] Pid: 6215, comm: strace Not tainted 3.0.4-rt14 #2052
> > [  144.212319] Call Trace:
> > [  144.212323]  [<ffffffff8104662f>] warn_slowpath_common+0x7f/0xc0
> > [  144.212326]  [<ffffffff8104668a>] warn_slowpath_null+0x1a/0x20
> > [  144.212328]  [<ffffffff8103f606>] migrate_disable+0x1b6/0x200
> > [  144.212331]  [<ffffffff8105a2a8>] ptrace_stop+0x128/0x240
> > [  144.212334]  [<ffffffff81057b9b>] ? recalc_sigpending+0x1b/0x50
> > [  144.212337]  [<ffffffff8105b6f1>] get_signal_to_deliver+0x211/0x530
> > [  144.212340]  [<ffffffff81001835>] do_signal+0x75/0x7a0
> > [  144.212342]  [<ffffffff8105ae68>] ? kill_pid_info+0x58/0x80
> > [  144.212344]  [<ffffffff8105c34c>] ? sys_kill+0xac/0x1e0
> > [  144.212347]  [<ffffffff81001fe5>] do_notify_resume+0x65/0x80
> > [  144.212350]  [<ffffffff8135978b>] int_signal+0x12/0x17
> > [  144.212352] ---[ end trace 0000000000000002 ]---
> 
> 
> Right, that's because of
> 53da1d9456fe7f87a920a78fdbdcf1225d197cb7; I think we simply want a full
> revert of that for -rt.

This also made me stare at the trainwreck called wait_task_inactive();
how about something like the below? It survives a boot and a simple
strace.

I'm not particularly keen on always enabling preempt notifiers, but
seeing that pretty much world+dog already has them enabled...

Also, less LOC is always better, right ;-)
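
For anyone who hasn't met preempt notifiers: they are per-task hooks
fired from the context switch path. ->sched_out runs in the context of
the task being switched away from (under the rq lock, from
prepare_task_switch()), ->sched_in when it gets scheduled back in. A
minimal sketch of the API, with made-up my_* names for illustration;
note that preempt_notifier_register() only ever attaches to current,
which is why the patch below adds to p->preempt_notifiers by hand while
holding p's rq lock:

#include <linux/preempt.h>
#include <linux/sched.h>

static void my_sched_in(struct preempt_notifier *n, int cpu)
{
	/* current was just scheduled back in on @cpu */
}

static void my_sched_out(struct preempt_notifier *n,
			 struct task_struct *next)
{
	/* current is being scheduled out in favour of @next */
}

static struct preempt_ops my_ops = {
	.sched_in	= my_sched_in,
	.sched_out	= my_sched_out,
};

static struct preempt_notifier my_notifier;

static void my_attach(void)
{
	preempt_notifier_init(&my_notifier, &my_ops);
	preempt_notifier_register(&my_notifier); /* hooks current only */
}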

---
 arch/ia64/kvm/Kconfig    |    1 -
 arch/powerpc/kvm/Kconfig |    1 -
 arch/s390/kvm/Kconfig    |    1 -
 arch/tile/kvm/Kconfig    |    1 -
 arch/x86/kvm/Kconfig     |    1 -
 include/linux/kvm_host.h |    2 -
 include/linux/preempt.h  |    4 -
 include/linux/sched.h    |    2 -
 init/Kconfig             |    3 -
 kernel/sched.c           |  163 ++++++++++++++++++----------------------------
 10 files changed, 64 insertions(+), 115 deletions(-)

diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 9806e55..02b36ca 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -22,7 +22,6 @@ config KVM
 	depends on HAVE_KVM && MODULES && EXPERIMENTAL
 	# for device assignment:
 	depends on PCI
-	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	select HAVE_KVM_IRQCHIP
 	select KVM_APIC_ARCHITECTURE
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 78133de..0bcd5a8 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -18,7 +18,6 @@ if VIRTUALIZATION
 
 config KVM
 	bool
-	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 
 config KVM_BOOK3S_HANDLER
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index a216341..7ff8d54 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -19,7 +19,6 @@ config KVM
 	def_tristate y
 	prompt "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM && EXPERIMENTAL
-	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
 	  Support hosting paravirtualized guest machines using the SIE
diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
index 669fcdb..6a936d1 100644
--- a/arch/tile/kvm/Kconfig
+++ b/arch/tile/kvm/Kconfig
@@ -19,7 +19,6 @@ if VIRTUALIZATION
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM && MODULES && EXPERIMENTAL
-	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
 	  Support hosting paravirtualized guest machines.
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ff5790d..d82150a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -24,7 +24,6 @@ config KVM
 	depends on PCI
 	# for TASKSTATS/TASK_DELAY_ACCT:
 	depends on NET
-	select PREEMPT_NOTIFIERS
 	select MMU_NOTIFIER
 	select ANON_INODES
 	select HAVE_KVM_IRQCHIP
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index eabb21a..a9343b8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -111,9 +111,7 @@ enum {
 
 struct kvm_vcpu {
 	struct kvm *kvm;
-#ifdef CONFIG_PREEMPT_NOTIFIERS
 	struct preempt_notifier preempt_notifier;
-#endif
 	int cpu;
 	int vcpu_id;
 	int srcu_idx;
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 58969b2..7ca8968 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -101,8 +101,6 @@ do { \
 
 #endif /* CONFIG_PREEMPT_COUNT */
 
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-
 struct preempt_notifier;
 
 /**
@@ -147,6 +145,4 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 	notifier->ops = ops;
 }
 
-#endif
-
 #endif /* __LINUX_PREEMPT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e54c890..64fc7c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1236,10 +1236,8 @@ struct task_struct {
 	struct sched_entity se;
 	struct sched_rt_entity rt;
 
-#ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* list of struct preempt_notifier: */
 	struct hlist_head preempt_notifiers;
-#endif
 
 	/*
 	 * fpu_counter contains the number of consecutive context switches
diff --git a/init/Kconfig b/init/Kconfig
index d19b3a7..c1c411c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1403,9 +1403,6 @@ config STOP_MACHINE
 
 source "block/Kconfig"
 
-config PREEMPT_NOTIFIERS
-	bool
-
 config PADATA
 	depends on SMP
 	bool
diff --git a/kernel/sched.c b/kernel/sched.c
index db143fd..b38ab2e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2387,6 +2387,38 @@ struct migration_arg {
 
 static int migration_cpu_stop(void *data);
 
+struct wait_task_inactive_blocked {
+	struct preempt_notifier notifier;
+	struct task_struct *waiter;
+};
+
+static void wait_task_inactive_sched_in(struct preempt_notifier *n, int cpu)
+{
+	/* Dummy; we may be preempted and rescheduled before actually sleeping. */
+}
+
+static void wait_task_inactive_sched_out(struct preempt_notifier *n,
+		struct task_struct *next)
+{
+	struct task_struct *p;
+	struct wait_task_inactive_blocked *blocked =
+		container_of(n, struct wait_task_inactive_blocked, notifier);
+
+	if (current->on_rq) /* we're not inactive yet */
+		return;
+
+	hlist_del(&n->link);
+
+	p = ACCESS_ONCE(blocked->waiter);
+	blocked->waiter = NULL;
+	wake_up_process(p);
+}
+
+static struct preempt_ops wait_task_inactive_ops = {
+	.sched_in = wait_task_inactive_sched_in,
+	.sched_out = wait_task_inactive_sched_out,
+};
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -2405,93 +2437,45 @@ static int migration_cpu_stop(void *data);
  */
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
+	unsigned long ncsw = 0;
 	unsigned long flags;
-	int running, on_rq;
-	unsigned long ncsw;
 	struct rq *rq;
 
-	for (;;) {
-		/*
-		 * We do the initial early heuristics without holding
-		 * any task-queue locks at all. We'll only try to get
-		 * the runqueue lock when things look like they will
-		 * work out!
-		 */
-		rq = task_rq(p);
-
-		/*
-		 * If the task is actively running on another CPU
-		 * still, just relax and busy-wait without holding
-		 * any locks.
-		 *
-		 * NOTE! Since we don't hold any locks, it's not
-		 * even sure that "rq" stays as the right runqueue!
-		 * But we don't care, since "task_running()" will
-		 * return false if the runqueue has changed and p
-		 * is actually now running somewhere else!
-		 */
-		while (task_running(rq, p)) {
-			if (match_state && unlikely(p->state != match_state))
-				return 0;
-			cpu_relax();
-		}
-
-		/*
-		 * Ok, time to look more closely! We need the rq
-		 * lock now, to be *sure*. If we're wrong, we'll
-		 * just go back and repeat.
-		 */
-		rq = task_rq_lock(p, &flags);
-		trace_sched_wait_task(p);
-		running = task_running(rq, p);
-		on_rq = p->on_rq;
-		ncsw = 0;
-		if (!match_state || p->state == match_state)
-			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-		task_rq_unlock(rq, p, &flags);
-
-		/*
-		 * If it changed from the expected state, bail out now.
-		 */
-		if (unlikely(!ncsw))
-			break;
+	struct wait_task_inactive_blocked blocked = {
+		.notifier = {
+			.ops = &wait_task_inactive_ops,
+		},
+		.waiter = current,
+	};
 
-		/*
-		 * Was it really running after all now that we
-		 * checked with the proper locks actually held?
-		 *
-		 * Oops. Go back and try again..
-		 */
-		if (unlikely(running)) {
-			cpu_relax();
-			continue;
-		}
+	rq = task_rq_lock(p, &flags);
+	if (!task_running(rq, p))
+		goto done;
 
-		/*
-		 * It's not enough that it's not actively running,
-		 * it must be off the runqueue _entirely_, and not
-		 * preempted!
-		 *
-		 * So if it was still runnable (but just not actively
-		 * running right now), it's preempted, and we should
-		 * yield - it could be a while.
-		 */
-		if (unlikely(on_rq)) {
-			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+	if (match_state && unlikely(p->state != match_state))
+		goto unlock;
 
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
-			continue;
-		}
+	hlist_add_head(&blocked.notifier.link, &p->preempt_notifiers);
+	task_rq_unlock(rq, p, &flags);
 
-		/*
-		 * Ahh, all good. It wasn't running, and it wasn't
-		 * runnable, which means that it will never become
-		 * running in the future either. We're all done!
-		 */
-		break;
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!blocked.waiter)
+			break;
+		schedule();
 	}
+	__set_current_state(TASK_RUNNING);
 
+	/*
+	 * Serializes against the completion of the previously observed context
+	 * switch.
+	 */
+	rq = task_rq_lock(p, &flags);
+done:
+	if (!match_state || p->state == match_state)
+		ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+unlock:
+	task_rq_unlock(rq, p, &flags);
 	return ncsw;
 }
 
@@ -2967,10 +2951,7 @@ static void __sched_fork(struct task_struct *p)
 #endif
 
 	INIT_LIST_HEAD(&p->rt.run_list);
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
-#endif
 }
 
 /*
@@ -3084,8 +3065,6 @@ void wake_up_new_task(struct task_struct *p)
 	task_rq_unlock(rq, p, &flags);
 }
 
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-
 /**
  * preempt_notifier_register - tell me when current is being preempted & rescheduled
  * @notifier: notifier struct to register
@@ -3122,26 +3101,12 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
 	struct preempt_notifier *notifier;
-	struct hlist_node *node;
+	struct hlist_node *node, *n;
 
-	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+	hlist_for_each_entry_safe(notifier, node, n, &curr->preempt_notifiers, link)
 		notifier->ops->sched_out(notifier, next);
 }
 
-#else /* !CONFIG_PREEMPT_NOTIFIERS */
-
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
-{
-}
-
-static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
-				 struct task_struct *next)
-{
-}
-
-#endif /* CONFIG_PREEMPT_NOTIFIERS */
-
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch

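For context on the calling convention, which this patch keeps: the
return value is 0 if the task wasn't in the expected state, otherwise
p->nvcsw with the MSB set, so a genuine switch count is never mistaken
for the 0 failure case. Roughly how the ptrace attach path consumes it
(an illustrative sketch, not verbatim from the tree):

	/*
	 * Wait until the child is really off the CPU in __TASK_TRACED;
	 * 0 means it left that state before fully descheduling.
	 */
	if (!wait_task_inactive(child, __TASK_TRACED))
		return -ESRCH;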