[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20130714111638.GA12546@redhat.com>
Date: Sun, 14 Jul 2013 14:16:38 +0300
From: Gleb Natapov <gleb@...hat.com>
To: Marcelo Tosatti <mtosatti@...hat.com>
Cc: Peter Zijlstra <peterz@...radead.org>, mingo@...hat.com,
glommer@...allels.com, tglx@...utronix.de, jeremy@...p.org,
rostedt@...dmis.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] remove sched notifier for cross-cpu migrations
On Wed, Jul 10, 2013 at 10:21:57PM -0300, Marcelo Tosatti wrote:
>
> Linux as a guest on KVM hypervisor, the only user of the pvclock
> vsyscall interface, does not require notification on task migration
> because:
>
> 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
> 2. per-CPU pvclock time info is updated if the
> underlying CPU changes.
> 3. that version is increased whenever underlying CPU
> changes.
>
> Which is sufficient to guarantee nanoseconds counter
> is calculated properly.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@...hat.com>
>
Applied, thanks.
> diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
> index 109a9dd..be8269b 100644
> --- a/arch/x86/include/asm/pvclock.h
> +++ b/arch/x86/include/asm/pvclock.h
> @@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
>
> struct pvclock_vsyscall_time_info {
> struct pvclock_vcpu_time_info pvti;
> - u32 migrate_count;
> } __attribute__((__aligned__(SMP_CACHE_BYTES)));
>
> #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
> index 2cb9470..a16bae3 100644
> --- a/arch/x86/kernel/pvclock.c
> +++ b/arch/x86/kernel/pvclock.c
> @@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
> set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
> }
>
> -static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
> -
> -static struct pvclock_vsyscall_time_info *
> -pvclock_get_vsyscall_user_time_info(int cpu)
> -{
> - if (!pvclock_vdso_info) {
> - BUG();
> - return NULL;
> - }
> -
> - return &pvclock_vdso_info[cpu];
> -}
> -
> -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
> -{
> - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
> -}
> -
> #ifdef CONFIG_X86_64
> -static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
> - void *v)
> -{
> - struct task_migration_notifier *mn = v;
> - struct pvclock_vsyscall_time_info *pvti;
> -
> - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
> -
> - /* this is NULL when pvclock vsyscall is not initialized */
> - if (unlikely(pvti == NULL))
> - return NOTIFY_DONE;
> -
> - pvti->migrate_count++;
> -
> - return NOTIFY_DONE;
> -}
> -
> -static struct notifier_block pvclock_migrate = {
> - .notifier_call = pvclock_task_migrate,
> -};
> -
> /*
> * Initialize the generic pvclock vsyscall state. This will allocate
> * a/some page(s) for the per-vcpu pvclock information, set up a
> @@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
>
> WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
>
> - pvclock_vdso_info = i;
> -
> for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
> __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
> __pa(i) + (idx*PAGE_SIZE),
> PAGE_KERNEL_VVAR);
> }
>
> -
> - register_task_migration_notifier(&pvclock_migrate);
> -
> return 0;
> }
> #endif
> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> index c74436e..72074d5 100644
> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
> cycle_t ret;
> u64 last;
> u32 version;
> - u32 migrate_count;
> u8 flags;
> unsigned cpu, cpu1;
>
>
> /*
> - * When looping to get a consistent (time-info, tsc) pair, we
> - * also need to deal with the possibility we can switch vcpus,
> - * so make sure we always re-fetch time-info for the current vcpu.
> + * Note: hypervisor must guarantee that:
> + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
> + * 2. that per-CPU pvclock time info is updated if the
> + * underlying CPU changes.
> + * 3. that version is increased whenever underlying CPU
> + * changes.
> + *
> */
> do {
> cpu = __getcpu() & VGETCPU_CPU_MASK;
> @@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)
>
> pvti = get_pvti(cpu);
>
> - migrate_count = pvti->migrate_count;
> -
> version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
>
> /*
> @@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
> cpu1 = __getcpu() & VGETCPU_CPU_MASK;
> } while (unlikely(cpu != cpu1 ||
> (pvti->pvti.version & 1) ||
> - pvti->pvti.version != version ||
> - pvti->migrate_count != migrate_count));
> + pvti->pvti.version != version));
>
> if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
> *mode = VCLOCK_NONE;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 178a8d9..217ce4b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -107,14 +107,6 @@ extern unsigned long this_cpu_load(void);
> extern void calc_global_load(unsigned long ticks);
> extern void update_cpu_load_nohz(void);
>
> -/* Notifier for when a task gets migrated to a new CPU */
> -struct task_migration_notifier {
> - struct task_struct *task;
> - int from_cpu;
> - int to_cpu;
> -};
> -extern void register_task_migration_notifier(struct notifier_block *n);
> -
> extern unsigned long get_parent_ip(unsigned long addr);
>
> extern void dump_cpu_task(int cpu);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e8b3350..122e499 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -974,13 +974,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
> rq->skip_clock_update = 1;
> }
>
> -static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
> -
> -void register_task_migration_notifier(struct notifier_block *n)
> -{
> - atomic_notifier_chain_register(&task_migration_notifier, n);
> -}
> -
> #ifdef CONFIG_SMP
> void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
> {
> @@ -1011,18 +1004,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
> trace_sched_migrate_task(p, new_cpu);
>
> if (task_cpu(p) != new_cpu) {
> - struct task_migration_notifier tmn;
> -
> if (p->sched_class->migrate_task_rq)
> p->sched_class->migrate_task_rq(p, new_cpu);
> p->se.nr_migrations++;
> perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
> -
> - tmn.task = p;
> - tmn.from_cpu = task_cpu(p);
> - tmn.to_cpu = new_cpu;
> -
> - atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
> }
>
> __set_task_cpu(p, new_cpu);
>
--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists