Date:	Sun, 1 Jul 2012 21:27:33 +0800
From:	Hillf Danton <dhillf@...il.com>
To:	LKML <linux-kernel@...r.kernel.org>,
	Hillf Danton <dhillf@...il.com>
Subject: [RFC patch] BFS: 421-1

With 15 patches collected, 421-1 is ready, with the ISO (SCHED_ISO) code left
untouched. Note that the diff below is based on BFS 421, not on 420.
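
Not part of the diff, just for illustration: with the patch applied on an SMP
kernel, the new knobs should appear under /proc/sys/kernel/ like the other
kern_table entries (edl_mode plus the cpu_csw, tsk_csw, grab_rq_lock and
wait_rq_lock counters). A minimal user-space sketch for dumping them, assuming
those paths; the names mirror the .procname fields added in kernel/sysctl.c:

/* edl_knobs.c - hypothetical demo, not part of the patch */
#include <stdio.h>

static long read_knob(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	/* 0 = EDL_CK (default), 1 = EDL_MS, 2 = EDL_NONE (strict edl) */
	static const char *knobs[] = { "edl_mode", "cpu_csw", "tsk_csw",
				       "grab_rq_lock", "wait_rq_lock" };
	unsigned int i;

	for (i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++)
		printf("%-14s %ld\n", knobs[i], read_knob(knobs[i]));
	return 0;
}

The four counters are cumulative; the wait_rq_lock / grab_rq_lock ratio gives
a rough idea of how often schedule() finds the grq lock already taken.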


--- a/kernel/sched/bfs.c	Sun Jul  1 20:39:30 2012
+++ b/kernel/sched/bfs.c	Fri Jun 15 20:00:52 2012
@@ -113,7 +113,6 @@
 #define USER_PRIO(p)		((p) - MAX_RT_PRIO)
 #define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
-#define SCHED_PRIO(p)		((p) + MAX_RT_PRIO)
 #define STOP_PRIO		(MAX_RT_PRIO - 1)

 /*
@@ -150,6 +149,19 @@ int rr_interval __read_mostly = 6;
  */
 int sched_iso_cpu __read_mostly = 70;

+#ifdef CONFIG_SMP
+enum {
+	EDL_CK,		//default
+	EDL_MS,		//map cache distance to milliseconds
+	EDL_NONE,	//strict edl
+};
+int edl_mode = EDL_CK;
+
+unsigned long	grab_rq_lock = 0,
+		wait_rq_lock = 0,
+		tsk_csw = 0,
+		cpu_csw = 0;
+#endif
 /*
  * The relative length of deadline for each priority(nice) level.
  */
@@ -247,7 +259,6 @@ struct rq {
 	int rq_time_slice;
 	u64 rq_last_ran;
 	int rq_prio;
-	bool rq_running; /* There is a task running */

 	/* Accurate timekeeping data */
 	u64 timekeep_clock;
@@ -313,7 +324,6 @@ struct rq {
 };

 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static DEFINE_MUTEX(sched_hotcpu_mutex);

 #ifdef CONFIG_SMP
 /*
@@ -327,7 +337,6 @@ int __weak arch_sd_sibling_asym_packing(
 {
        return 0*SD_ASYM_PACKING;
 }
-#endif

 #define rcu_dereference_check_sched_domain(p) \
 	rcu_dereference_check((p), \
@@ -342,6 +351,9 @@ int __weak arch_sd_sibling_asym_packing(
  */
 #define for_each_domain(cpu, __sd) \
 	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+#else
+#define for_each_domain(cpu, __sd)	BUILD_BUG()
+#endif

 static inline void update_rq_clock(struct rq *rq);

@@ -523,12 +535,6 @@ static inline struct rq *task_grq_lock_i
 	return task_rq(p);
 }

-static inline void time_task_grq_lock_irq(struct task_struct *p)
-	__acquires(grq.lock)
-{
-	struct rq *rq = task_grq_lock_irq(p);
-	update_clocks(rq);
-}

 static inline void task_grq_unlock_irq(void)
 	__releases(grq.lock)
@@ -986,15 +992,11 @@ static void activate_task(struct task_st
 {
 	update_clocks(rq);

-	/*
-	 * Sleep time is in units of nanosecs, so shift by 20 to get a
-	 * milliseconds-range estimation of the amount of time that the task
-	 * spent sleeping:
-	 */
+	/* Sleep time is tracked in units of nanosecs, but reported in ms */
 	if (unlikely(prof_on == SLEEP_PROFILING)) {
 		if (p->state == TASK_UNINTERRUPTIBLE)
 			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
-				     (rq->clock - p->last_ran) >> 20);
+					NS_TO_MS(rq->clock - p->last_ran));
 	}

 	p->prio = effective_prio(p);
@@ -1029,16 +1031,10 @@ void set_task_cpu(struct task_struct *p,
 	WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
 #endif
 	trace_sched_migrate_task(p, cpu);
-	if (task_cpu(p) != cpu)
+	if (task_cpu(p) != cpu) {
+		task_thread_info(p)->cpu = cpu;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
-
-	/*
-	 * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
-	 * successfully executed on another CPU. We must ensure that updates of
-	 * per-task data have been completed by this moment.
-	 */
-	smp_wmb();
-	task_thread_info(p)->cpu = cpu;
+	}
 }

 static inline void clear_sticky(struct task_struct *p)
@@ -1057,6 +1053,8 @@ resched_closest_idle(struct rq *rq, int
 {
 	cpumask_t tmpmask;

+	if (!grq.idle_cpus)
+		return;
 	cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
 	cpu_clear(cpu, tmpmask);
 	if (cpus_empty(tmpmask))
@@ -1125,29 +1123,12 @@ static inline void unstick_task(struct r
  */
 static inline void take_task(int cpu, struct task_struct *p)
 {
-#ifdef CONFIG_SCHEDSTATS
 #ifdef CONFIG_SMP
-	if (p->wakeup_cpu == -1)
-		goto skip;
-
-	if (cpu == p->wakeup_cpu) {
-		schedstat_inc(cpu_rq(cpu), ttwu_local);
+	if (p != current) {
+		tsk_csw++;
+		if (cpu != task_cpu(p))
+			cpu_csw++;
 	}
-	else if (cpu_online(p->wakeup_cpu)) {
-		struct sched_domain *sd;
-
-		rcu_read_lock();
-		for_each_domain(p->wakeup_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
-		}
-		rcu_read_unlock();
-	}
-	p->wakeup_cpu = -1;
-skip:
-#endif
 #endif
 	set_task_cpu(p, cpu);
 	dequeue_task(p);
@@ -1221,11 +1202,6 @@ inline int task_curr(const struct task_s
 }

 #ifdef CONFIG_SMP
-struct migration_req {
-	struct task_struct *task;
-	int dest_cpu;
-};
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -1456,10 +1432,11 @@ static void try_preempt(struct task_stru
 		if (rq_prio < highest_prio)
 			continue;

-		if (rq_prio > highest_prio ||
-		    deadline_after(rq->rq_deadline, latest_deadline)) {
+		if (rq_prio > highest_prio)
+			goto set;
+		if (deadline_after(rq->rq_deadline, latest_deadline)) {
 			latest_deadline = rq->rq_deadline;
-			highest_prio = rq_prio;
+set:			highest_prio = rq_prio;
 			highest_prio_rq = rq;
 		}
 	}
@@ -1660,7 +1637,6 @@ void sched_fork(struct task_struct *p)
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
-	set_task_cpu(p, cpu);

 	/* Should be reset in fork.c but done here for ease of bfs patching */
 	p->sched_time = p->stime_pc = p->utime_pc = 0;
@@ -1710,8 +1686,6 @@ void sched_fork(struct task_struct *p)
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
-	if (unlikely(p->policy == SCHED_FIFO))
-		goto out;
 	/*
 	 * Share the timeslice between parent and child, thus the
 	 * total amount of pending timeslices in the system doesn't change,
@@ -1722,6 +1696,9 @@ void sched_fork(struct task_struct *p)
 	 * is always equal to current->deadline.
 	 */
 	rq = task_grq_lock_irq(curr);
+	set_task_cpu(p, cpu);
+	if (unlikely(p->policy == SCHED_FIFO))
+		goto out;
 	if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
 		rq->rq_time_slice /= 2;
 		p->time_slice = rq->rq_time_slice;
@@ -1737,8 +1714,8 @@ void sched_fork(struct task_struct *p)
 		time_slice_expired(p);
 	}
 	p->last_ran = rq->rq_last_ran;
-	task_grq_unlock_irq();
 out:
+	task_grq_unlock_irq();
 	put_cpu();
 }

@@ -2047,7 +2024,8 @@ unsigned long nr_active(void)
 /* Beyond a task running on this CPU, load is equal everywhere on BFS */
 unsigned long this_cpu_load(void)
 {
-	return this_rq()->rq_running +
+	struct rq *rq = this_rq();
+	return (rq->curr != rq->idle) +
 		((queued_notrunning() + nr_uninterruptible()) / grq.noc);
 }

@@ -2582,28 +2560,6 @@ static void account_guest_time(struct ta
 	}
 }

-/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
- */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
-			cputime_t cputime_scaled, cputime64_t *target_cputime64)
-{
-	/* Add system time to process. */
-	p->stime += (__force u64)cputime;
-	p->stimescaled += (__force u64)cputime_scaled;
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	*target_cputime64 += (__force u64)cputime;
-
-	/* Account for system time used */
-	acct_update_integrals(p);
-}

 /*
  * Account system cpu time to a process.
@@ -2785,11 +2741,13 @@ static void task_running_tick(struct rq
 	} else if (rq->rq_time_slice >= RESCHED_US)
 			return;

-	/* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
+	/*
+	 * With irqs disabled here, the resched flag can be set on the
+	 * current task without holding the global lock and without an IPI.
+	 */
 	p = rq->curr;
-	grq_lock();
-	set_tsk_need_resched(p);
-	grq_unlock();
+	if (!test_tsk_need_resched(p))
+		set_tsk_need_resched(p);
 }


@@ -3048,7 +3006,17 @@ task_struct *earliest_deadline_task(stru
 			 */
 			dl = p->deadline;
 #ifdef CONFIG_SMP
-			dl <<= locality_diff(p, rq) + scaling_rq(rq);
+			switch (edl_mode) {
+			default:
+			case EDL_CK:
+				dl <<= locality_diff(p, rq) + scaling_rq(rq);
+				break;
+			case EDL_MS:
+				dl += MS_TO_NS(locality_diff(p, rq) +
+						4* scaling_rq(rq));
+			case EDL_NONE:
+				break;
+			}
 #endif

 			if (deadline_before(dl, earliest_deadline)) {
@@ -3117,10 +3085,6 @@ static inline void set_rq_task(struct rq
 	rq->rq_last_ran = p->last_ran = rq->clock;
 	rq->rq_policy = p->policy;
 	rq->rq_prio = p->prio;
-	if (p != rq->idle)
-		rq->rq_running = true;
-	else
-		rq->rq_running = false;
 }

 static void reset_rq_task(struct rq *rq, struct task_struct *p)
@@ -3151,6 +3115,11 @@ need_resched:
 	deactivate = false;
 	schedule_debug(prev);

+#ifdef CONFIG_SMP
+	grab_rq_lock++;
+	if (grunqueue_is_locked())
+		wait_rq_lock++;
+#endif
 	grq_lock_irq();

 	switch_count = &prev->nivcsw;
@@ -3260,6 +3229,8 @@ need_resched:
 		++*switch_count;

 		context_switch(rq, prev, next); /* unlocks the grq */
+#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SMP
 		/*
 		 * The context switch have flipped the stack from under us
 		 * and restored the local variables which were saved when
@@ -3269,6 +3240,29 @@ need_resched:
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
 		idle = rq->idle;
+		next = rq->curr;
+		if (next == idle || next->wakeup_cpu < 0)
+			goto skip;
+
+		if (cpu == next->wakeup_cpu)
+			schedstat_inc(rq, ttwu_local);
+
+		else if (cpu_online(next->wakeup_cpu)) {
+			struct sched_domain *sd;
+
+			rcu_read_lock();
+			for_each_domain(next->wakeup_cpu, sd) {
+				if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+					schedstat_inc(sd, ttwu_wake_remote);
+					break;
+				}
+			}
+			rcu_read_unlock();
+		}
+		next->wakeup_cpu = -1;
+skip:
+#endif
+#endif
 	} else
 		grq_unlock_irq();

@@ -5352,7 +5346,7 @@ migration_call(struct notifier_block *nf
 		/* Update our root-domain */
 		grq_lock_irqsave(&flags);
 		if (rq->rd) {
-			BUG_ON(cpumask_test_cpu(cpu, rq->rd->span));
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

 			set_rq_online(rq);
 		}
--- a/kernel/sysctl.c	Sun Jul  1 21:06:54 2012
+++ b/kernel/sysctl.c	Tue Jun 12 20:04:02 2012
@@ -125,6 +125,13 @@ static int __maybe_unused one_hundred =
 #ifdef CONFIG_SCHED_BFS
 extern int rr_interval;
 extern int sched_iso_cpu;
+#ifdef CONFIG_SMP
+extern int edl_mode;
+extern unsigned long grab_rq_lock,
+			wait_rq_lock,
+			tsk_csw,
+			cpu_csw;
+#endif
 static int __read_mostly one_thousand = 1000;
 #endif
 #ifdef CONFIG_PRINTK
@@ -876,6 +883,43 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+#ifdef CONFIG_SMP
+	{
+		.procname	= "edl_mode",
+		.data		= &edl_mode,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "cpu_csw",
+		.data		= &cpu_csw,
+		.maxlen		= sizeof (unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "tsk_csw",
+		.data		= &tsk_csw,
+		.maxlen		= sizeof (unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "grab_rq_lock",
+		.data		= &grab_rq_lock,
+		.maxlen		= sizeof (unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "wait_rq_lock",
+		.data		= &wait_rq_lock,
+		.maxlen		= sizeof (unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+#endif
 #endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
--